Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 343
-rw-r--r--  arch/x86/Kconfig.cpu | 65
-rw-r--r--  arch/x86/Kconfig.debug | 129
-rw-r--r--  arch/x86/Makefile | 248
-rw-r--r--  arch/x86/Makefile_32 | 175
-rw-r--r--  arch/x86/Makefile_64 | 144
-rw-r--r--  arch/x86/boot/Makefile | 28
-rw-r--r--  arch/x86/boot/apm.c | 3
-rw-r--r--  arch/x86/boot/boot.h | 17
-rw-r--r--  arch/x86/boot/cmdline.c | 65
-rw-r--r--  arch/x86/boot/compressed/Makefile | 63
-rw-r--r--  arch/x86/boot/compressed/Makefile_32 | 50
-rw-r--r--  arch/x86/boot/compressed/Makefile_64 | 30
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 8
-rw-r--r--  arch/x86/boot/compressed/misc.c (renamed from arch/x86/boot/compressed/misc_32.c) | 77
-rw-r--r--  arch/x86/boot/compressed/misc_64.c | 371
-rw-r--r--  arch/x86/boot/compressed/relocs.c | 7
-rw-r--r--  arch/x86/boot/compressed/vmlinux.scr (renamed from arch/x86/boot/compressed/vmlinux_64.scr) | 2
-rw-r--r--  arch/x86/boot/compressed/vmlinux_32.lds | 10
-rw-r--r--  arch/x86/boot/compressed/vmlinux_32.scr | 10
-rw-r--r--  arch/x86/boot/compressed/vmlinux_64.lds | 12
-rw-r--r--  arch/x86/boot/cpu.c | 26
-rw-r--r--  arch/x86/boot/edd.c | 13
-rw-r--r--  arch/x86/boot/header.S | 5
-rw-r--r--  arch/x86/boot/main.c | 31
-rw-r--r--  arch/x86/boot/mkcpustr.c | 49
-rw-r--r--  arch/x86/boot/pm.c | 6
-rw-r--r--  arch/x86/boot/pmjump.S | 54
-rw-r--r--  arch/x86/boot/video-bios.c | 3
-rw-r--r--  arch/x86/boot/video-vesa.c | 26
-rw-r--r--  arch/x86/boot/video-vga.c | 20
-rw-r--r--  arch/x86/boot/video.c | 33
-rw-r--r--  arch/x86/boot/video.h | 3
-rw-r--r--  arch/x86/boot/voyager.c | 4
-rw-r--r--  arch/x86/configs/i386_defconfig | 4
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 9
-rw-r--r--  arch/x86/crypto/Makefile | 12
-rw-r--r--  arch/x86/crypto/aes-i586-asm_32.S | 89
-rw-r--r--  arch/x86/crypto/aes-x86_64-asm_64.S | 68
-rw-r--r--  arch/x86/crypto/aes_32.c | 515
-rw-r--r--  arch/x86/crypto/aes_64.c | 336
-rw-r--r--  arch/x86/crypto/aes_glue.c | 57
-rw-r--r--  arch/x86/crypto/salsa20-i586-asm_32.S | 1114
-rw-r--r--  arch/x86/crypto/salsa20-x86_64-asm_64.S | 920
-rw-r--r--  arch/x86/crypto/salsa20_glue.c | 129
-rw-r--r--  arch/x86/crypto/twofish_64.c | 97
-rw-r--r--  arch/x86/crypto/twofish_glue.c (renamed from arch/x86/crypto/twofish_32.c) | 8
-rw-r--r--  arch/x86/ia32/Makefile | 41
-rw-r--r--  arch/x86/ia32/audit.c | 2
-rw-r--r--  arch/x86/ia32/fpu32.c | 183
-rw-r--r--  arch/x86/ia32/ia32_aout.c | 246
-rw-r--r--  arch/x86/ia32/ia32_binfmt.c | 285
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 472
-rw-r--r--  arch/x86/ia32/ia32entry.S | 15
-rw-r--r--  arch/x86/ia32/ipc32.c | 30
-rw-r--r--  arch/x86/ia32/mmap32.c | 79
-rw-r--r--  arch/x86/ia32/ptrace32.c | 404
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 504
-rw-r--r--  arch/x86/ia32/syscall32.c | 83
-rw-r--r--  arch/x86/ia32/syscall32_syscall.S | 17
-rw-r--r--  arch/x86/ia32/tls32.c | 163
-rw-r--r--  arch/x86/ia32/vsyscall-sigreturn.S | 143
-rw-r--r--  arch/x86/ia32/vsyscall-sysenter.S | 95
-rw-r--r--  arch/x86/ia32/vsyscall.lds | 80
-rw-r--r--  arch/x86/kernel/Makefile | 98
-rw-r--r--  arch/x86/kernel/Makefile_32 | 88
-rw-r--r--  arch/x86/kernel/Makefile_64 | 45
-rw-r--r--  arch/x86/kernel/acpi/Makefile | 2
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 48
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 87
-rw-r--r--  arch/x86/kernel/acpi/sleep_32.c | 70
-rw-r--r--  arch/x86/kernel/acpi/sleep_64.c | 117
-rw-r--r--  arch/x86/kernel/acpi/wakeup_32.S | 2
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 32
-rw-r--r--  arch/x86/kernel/alternative.c | 40
-rw-r--r--  arch/x86/kernel/aperture_64.c | 374
-rw-r--r--  arch/x86/kernel/apic_32.c | 158
-rw-r--r--  arch/x86/kernel/apic_64.c | 1259
-rw-r--r--  arch/x86/kernel/apm_32.c | 389
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 65
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 56
-rw-r--r--  arch/x86/kernel/bootflag.c | 50
-rw-r--r--  arch/x86/kernel/bugs_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 2
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 23
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 5
-rw-r--r--  arch/x86/kernel/cpu/common.c | 214
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 14
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 25
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 13
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 5
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 8
-rw-r--r--  arch/x86/kernel/cpu/feature_names.c | 83
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 40
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 30
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 25
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.h | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c | 47
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 49
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c | 35
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c | 23
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c | 110
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 27
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 23
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 170
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 13
-rw-r--r--  arch/x86/kernel/cpu/mtrr/state.c | 3
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 1
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 76
-rw-r--r--  arch/x86/kernel/cpuid.c | 60
-rw-r--r--  arch/x86/kernel/doublefault_32.c | 19
-rw-r--r--  arch/x86/kernel/ds.c | 464
-rw-r--r--  arch/x86/kernel/e820_32.c | 241
-rw-r--r--  arch/x86/kernel/e820_64.c | 437
-rw-r--r--  arch/x86/kernel/early-quirks.c | 127
-rw-r--r--  arch/x86/kernel/early_printk.c | 2
-rw-r--r--  arch/x86/kernel/efi.c | 515
-rw-r--r--  arch/x86/kernel/efi_32.c | 618
-rw-r--r--  arch/x86/kernel/efi_64.c | 134
-rw-r--r--  arch/x86/kernel/efi_stub_64.S | 109
-rw-r--r--  arch/x86/kernel/entry_32.S | 26
-rw-r--r--  arch/x86/kernel/entry_64.S | 127
-rw-r--r--  arch/x86/kernel/genapic_64.c | 15
-rw-r--r--  arch/x86/kernel/geode_32.c | 48
-rw-r--r--  arch/x86/kernel/head64.c | 63
-rw-r--r--  arch/x86/kernel/head_32.S | 17
-rw-r--r--  arch/x86/kernel/head_64.S | 67
-rw-r--r--  arch/x86/kernel/hpet.c | 62
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c | 7
-rw-r--r--  arch/x86/kernel/i387.c | 479
-rw-r--r--  arch/x86/kernel/i387_32.c | 544
-rw-r--r--  arch/x86/kernel/i387_64.c | 150
-rw-r--r--  arch/x86/kernel/i8237.c | 2
-rw-r--r--  arch/x86/kernel/i8253.c | 72
-rw-r--r--  arch/x86/kernel/i8259_32.c | 26
-rw-r--r--  arch/x86/kernel/i8259_64.c | 162
-rw-r--r--  arch/x86/kernel/init_task.c | 1
-rw-r--r--  arch/x86/kernel/io_apic_32.c | 27
-rw-r--r--  arch/x86/kernel/io_apic_64.c | 114
-rw-r--r--  arch/x86/kernel/io_delay.c | 114
-rw-r--r--  arch/x86/kernel/ioport.c (renamed from arch/x86/kernel/ioport_32.c) | 85
-rw-r--r--  arch/x86/kernel/ioport_64.c | 117
-rw-r--r--  arch/x86/kernel/irq_32.c | 22
-rw-r--r--  arch/x86/kernel/irq_64.c | 30
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 65
-rw-r--r--  arch/x86/kernel/kprobes.c | 1066
-rw-r--r--  arch/x86/kernel/kprobes_32.c | 756
-rw-r--r--  arch/x86/kernel/kprobes_64.c | 749
-rw-r--r--  arch/x86/kernel/ldt.c (renamed from arch/x86/kernel/ldt_32.c) | 113
-rw-r--r--  arch/x86/kernel/ldt_64.c | 250
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 4
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 5
-rw-r--r--  arch/x86/kernel/mfgpt_32.c | 30
-rw-r--r--  arch/x86/kernel/microcode.c | 30
-rw-r--r--  arch/x86/kernel/mpparse_32.c | 39
-rw-r--r--  arch/x86/kernel/mpparse_64.c | 28
-rw-r--r--  arch/x86/kernel/msr.c | 22
-rw-r--r--  arch/x86/kernel/nmi_32.c | 25
-rw-r--r--  arch/x86/kernel/nmi_64.c | 101
-rw-r--r--  arch/x86/kernel/numaq_32.c | 2
-rw-r--r--  arch/x86/kernel/paravirt.c (renamed from arch/x86/kernel/paravirt_32.c) | 96
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 49
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c | 57
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 43
-rw-r--r--  arch/x86/kernel/pci-dma_64.c | 3
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 549
-rw-r--r--  arch/x86/kernel/pci-swiotlb_64.c | 1
-rw-r--r--  arch/x86/kernel/pmtimer_64.c | 4
-rw-r--r--  arch/x86/kernel/process_32.c | 430
-rw-r--r--  arch/x86/kernel/process_64.c | 331
-rw-r--r--  arch/x86/kernel/ptrace.c | 1566
-rw-r--r--  arch/x86/kernel/ptrace_32.c | 717
-rw-r--r--  arch/x86/kernel/ptrace_64.c | 621
-rw-r--r--  arch/x86/kernel/quirks.c | 71
-rw-r--r--  arch/x86/kernel/reboot.c (renamed from arch/x86/kernel/reboot_32.c) | 284
-rw-r--r--  arch/x86/kernel/reboot_64.c | 176
-rw-r--r--  arch/x86/kernel/reboot_fixups_32.c | 14
-rw-r--r--  arch/x86/kernel/rtc.c | 204
-rw-r--r--  arch/x86/kernel/scx200_32.c | 2
-rw-r--r--  arch/x86/kernel/setup64.c | 59
-rw-r--r--  arch/x86/kernel/setup_32.c | 285
-rw-r--r--  arch/x86/kernel/setup_64.c | 624
-rw-r--r--  arch/x86/kernel/signal_32.c | 228
-rw-r--r--  arch/x86/kernel/signal_64.c | 136
-rw-r--r--  arch/x86/kernel/smp_32.c | 15
-rw-r--r--  arch/x86/kernel/smp_64.c | 91
-rw-r--r--  arch/x86/kernel/smpboot_32.c | 63
-rw-r--r--  arch/x86/kernel/smpboot_64.c | 81
-rw-r--r--  arch/x86/kernel/smpcommon_32.c | 7
-rw-r--r--  arch/x86/kernel/srat_32.c | 10
-rw-r--r--  arch/x86/kernel/stacktrace.c | 33
-rw-r--r--  arch/x86/kernel/step.c | 203
-rw-r--r--  arch/x86/kernel/suspend_64.c | 38
-rw-r--r--  arch/x86/kernel/suspend_asm_64.S | 32
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 98
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 4
-rw-r--r--  arch/x86/kernel/test_nx.c | 173
-rw-r--r--  arch/x86/kernel/test_rodata.c | 86
-rw-r--r--  arch/x86/kernel/time_32.c | 114
-rw-r--r--  arch/x86/kernel/time_64.c | 187
-rw-r--r--  arch/x86/kernel/tls.c | 213
-rw-r--r--  arch/x86/kernel/tls.h | 21
-rw-r--r--  arch/x86/kernel/topology.c | 22
-rw-r--r--  arch/x86/kernel/trampoline_32.S | 7
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 3
-rw-r--r--  arch/x86/kernel/traps_32.c | 357
-rw-r--r--  arch/x86/kernel/traps_64.c | 368
-rw-r--r--  arch/x86/kernel/tsc_32.c | 62
-rw-r--r--  arch/x86/kernel/tsc_64.c | 100
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 30
-rw-r--r--  arch/x86/kernel/vm86_32.c | 115
-rw-r--r--  arch/x86/kernel/vmi_32.c | 126
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 3
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S | 22
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S | 42
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 11
-rw-r--r--  arch/x86/kernel/vsyscall_32.S | 15
-rw-r--r--  arch/x86/kernel/vsyscall_32.lds.S | 67
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 11
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 13
-rw-r--r--  arch/x86/kvm/Kconfig | 58
-rw-r--r--  arch/x86/kvm/Makefile | 14
-rw-r--r--  arch/x86/kvm/i8259.c | 450
-rw-r--r--  arch/x86/kvm/irq.c | 78
-rw-r--r--  arch/x86/kvm/irq.h | 88
-rw-r--r--  arch/x86/kvm/kvm_svm.h | 45
-rw-r--r--  arch/x86/kvm/lapic.c | 1154
-rw-r--r--  arch/x86/kvm/lapic.h | 50
-rw-r--r--  arch/x86/kvm/mmu.c | 1885
-rw-r--r--  arch/x86/kvm/mmu.h | 44
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 484
-rw-r--r--  arch/x86/kvm/segment_descriptor.h | 29
-rw-r--r--  arch/x86/kvm/svm.c | 1731
-rw-r--r--  arch/x86/kvm/svm.h | 325
-rw-r--r--  arch/x86/kvm/vmx.c | 2679
-rw-r--r--  arch/x86/kvm/vmx.h | 324
-rw-r--r--  arch/x86/kvm/x86.c | 3287
-rw-r--r--  arch/x86/kvm/x86_emulate.c | 1912
-rw-r--r--  arch/x86/lguest/Kconfig | 1
-rw-r--r--  arch/x86/lguest/boot.c | 62
-rw-r--r--  arch/x86/lib/Makefile | 26
-rw-r--r--  arch/x86/lib/Makefile_32 | 11
-rw-r--r--  arch/x86/lib/Makefile_64 | 13
-rw-r--r--  arch/x86/lib/bitops_32.c | 2
-rw-r--r--  arch/x86/lib/bitops_64.c | 2
-rw-r--r--  arch/x86/lib/bitstr_64.c | 28
-rw-r--r--  arch/x86/lib/delay_32.c | 4
-rw-r--r--  arch/x86/lib/delay_64.c | 4
-rw-r--r--  arch/x86/lib/memcpy_32.c | 4
-rw-r--r--  arch/x86/lib/memmove_64.c | 4
-rw-r--r--  arch/x86/lib/mmx_32.c | 31
-rw-r--r--  arch/x86/lib/semaphore_32.S | 22
-rw-r--r--  arch/x86/lib/thunk_64.S | 2
-rw-r--r--  arch/x86/lib/usercopy_32.c | 14
-rw-r--r--  arch/x86/lib/usercopy_64.c | 12
-rw-r--r--  arch/x86/mach-rdc321x/Makefile | 5
-rw-r--r--  arch/x86/mach-rdc321x/gpio.c | 91
-rw-r--r--  arch/x86/mach-rdc321x/platform.c | 68
-rw-r--r--  arch/x86/mach-rdc321x/wdt.c | 275
-rw-r--r--  arch/x86/mach-visws/mpparse.c | 16
-rw-r--r--  arch/x86/mach-voyager/setup.c | 34
-rw-r--r--  arch/x86/mach-voyager/voyager_basic.c | 132
-rw-r--r--  arch/x86/mach-voyager/voyager_cat.c | 601
-rw-r--r--  arch/x86/mach-voyager/voyager_smp.c | 692
-rw-r--r--  arch/x86/mach-voyager/voyager_thread.c | 52
-rw-r--r--  arch/x86/math-emu/errors.c | 882
-rw-r--r--  arch/x86/math-emu/exception.h | 9
-rw-r--r--  arch/x86/math-emu/fpu_arith.c | 150
-rw-r--r--  arch/x86/math-emu/fpu_asm.h | 1
-rw-r--r--  arch/x86/math-emu/fpu_aux.c | 211
-rw-r--r--  arch/x86/math-emu/fpu_emu.h | 67
-rw-r--r--  arch/x86/math-emu/fpu_entry.c | 1230
-rw-r--r--  arch/x86/math-emu/fpu_etc.c | 185
-rw-r--r--  arch/x86/math-emu/fpu_proto.h | 28
-rw-r--r--  arch/x86/math-emu/fpu_tags.c | 94
-rw-r--r--  arch/x86/math-emu/fpu_trig.c | 2884
-rw-r--r--  arch/x86/math-emu/get_address.c | 650
-rw-r--r--  arch/x86/math-emu/load_store.c | 448
-rw-r--r--  arch/x86/math-emu/poly.h | 69
-rw-r--r--  arch/x86/math-emu/poly_2xm1.c | 199
-rw-r--r--  arch/x86/math-emu/poly_atan.c | 353
-rw-r--r--  arch/x86/math-emu/poly_l2.c | 376
-rw-r--r--  arch/x86/math-emu/poly_sin.c | 599
-rw-r--r--  arch/x86/math-emu/poly_tan.c | 338
-rw-r--r--  arch/x86/math-emu/reg_add_sub.c | 563
-rw-r--r--  arch/x86/math-emu/reg_compare.c | 567
-rw-r--r--  arch/x86/math-emu/reg_constant.c | 73
-rw-r--r--  arch/x86/math-emu/reg_convert.c | 57
-rw-r--r--  arch/x86/math-emu/reg_divide.c | 301
-rw-r--r--  arch/x86/math-emu/reg_ld_str.c | 2147
-rw-r--r--  arch/x86/math-emu/reg_mul.c | 163
-rw-r--r--  arch/x86/math-emu/status_w.h | 8
-rw-r--r--  arch/x86/mm/Makefile_32 | 3
-rw-r--r--  arch/x86/mm/Makefile_64 | 3
-rw-r--r--  arch/x86/mm/boot_ioremap_32.c | 100
-rw-r--r--  arch/x86/mm/discontig_32.c | 110
-rw-r--r--  arch/x86/mm/extable.c | 62
-rw-r--r--  arch/x86/mm/extable_32.c | 35
-rw-r--r--  arch/x86/mm/extable_64.c | 34
-rw-r--r--  arch/x86/mm/fault.c | 986
-rw-r--r--  arch/x86/mm/fault_32.c | 659
-rw-r--r--  arch/x86/mm/fault_64.c | 623
-rw-r--r--  arch/x86/mm/highmem_32.c | 47
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 3
-rw-r--r--  arch/x86/mm/init_32.c | 446
-rw-r--r--  arch/x86/mm/init_64.c | 411
-rw-r--r--  arch/x86/mm/ioremap.c | 485
-rw-r--r--  arch/x86/mm/ioremap_32.c | 274
-rw-r--r--  arch/x86/mm/ioremap_64.c | 210
-rw-r--r--  arch/x86/mm/k8topology_64.c | 173
-rw-r--r--  arch/x86/mm/mmap.c (renamed from arch/x86/mm/mmap_32.c) | 86
-rw-r--r--  arch/x86/mm/mmap_64.c | 29
-rw-r--r--  arch/x86/mm/numa_64.c | 307
-rw-r--r--  arch/x86/mm/pageattr-test.c | 262
-rw-r--r--  arch/x86/mm/pageattr.c | 782
-rw-r--r--  arch/x86/mm/pageattr_32.c | 278
-rw-r--r--  arch/x86/mm/pageattr_64.c | 255
-rw-r--r--  arch/x86/mm/pgtable_32.c | 213
-rw-r--r--  arch/x86/mm/srat_64.c | 95
-rw-r--r--  arch/x86/oprofile/backtrace.c | 12
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 216
-rw-r--r--  arch/x86/pci/common.c | 17
-rw-r--r--  arch/x86/pci/fixup.c | 52
-rw-r--r--  arch/x86/pci/i386.c | 2
-rw-r--r--  arch/x86/pci/irq.c | 20
-rw-r--r--  arch/x86/pci/numa.c | 52
-rw-r--r--  arch/x86/power/cpu.c | 18
-rw-r--r--  arch/x86/vdso/.gitignore | 5
-rw-r--r--  arch/x86/vdso/Makefile | 132
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 1
-rw-r--r--  arch/x86/vdso/vdso-layout.lds.S | 64
-rw-r--r--  arch/x86/vdso/vdso-start.S | 2
-rw-r--r--  arch/x86/vdso/vdso.lds.S | 94
-rw-r--r--  arch/x86/vdso/vdso32-setup.c (renamed from arch/x86/kernel/sysenter_32.c) | 164
-rw-r--r--  arch/x86/vdso/vdso32.S | 19
-rw-r--r--  arch/x86/vdso/vdso32/.gitignore | 1
-rw-r--r--  arch/x86/vdso/vdso32/int80.S (renamed from arch/x86/kernel/vsyscall-int80_32.S) | 21
-rw-r--r--  arch/x86/vdso/vdso32/note.S (renamed from arch/x86/kernel/vsyscall-note_32.S) | 5
-rw-r--r--  arch/x86/vdso/vdso32/sigreturn.S (renamed from arch/x86/kernel/vsyscall-sigreturn_32.S) | 87
-rw-r--r--  arch/x86/vdso/vdso32/syscall.S (renamed from arch/x86/ia32/vsyscall-syscall.S) | 22
-rw-r--r--  arch/x86/vdso/vdso32/sysenter.S (renamed from arch/x86/kernel/vsyscall-sysenter_32.S) | 42
-rw-r--r--  arch/x86/vdso/vdso32/vdso32.lds.S | 37
-rw-r--r--  arch/x86/vdso/vgetcpu.c | 4
-rw-r--r--  arch/x86/vdso/vma.c | 18
-rw-r--r--  arch/x86/vdso/voffset.h | 1
-rw-r--r--  arch/x86/xen/Kconfig | 1
-rw-r--r--  arch/x86/xen/enlighten.c | 104
-rw-r--r--  arch/x86/xen/events.c | 4
-rw-r--r--  arch/x86/xen/mmu.c | 23
-rw-r--r--  arch/x86/xen/setup.c | 9
-rw-r--r--  arch/x86/xen/smp.c | 8
-rw-r--r--  arch/x86/xen/time.c | 2
-rw-r--r--  arch/x86/xen/xen-head.S | 6
359 files changed, 41811 insertions, 26093 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 80b7ba4056d..e6728bd61cc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -17,81 +17,74 @@ config X86_64
17 17
18### Arch settings 18### Arch settings
19config X86 19config X86
20 bool 20 def_bool y
21 default y 21 select HAVE_OPROFILE
22 select HAVE_KPROBES
23
24config GENERIC_LOCKBREAK
25 def_bool n
22 26
23config GENERIC_TIME 27config GENERIC_TIME
24 bool 28 def_bool y
25 default y
26 29
27config GENERIC_CMOS_UPDATE 30config GENERIC_CMOS_UPDATE
28 bool 31 def_bool y
29 default y
30 32
31config CLOCKSOURCE_WATCHDOG 33config CLOCKSOURCE_WATCHDOG
32 bool 34 def_bool y
33 default y
34 35
35config GENERIC_CLOCKEVENTS 36config GENERIC_CLOCKEVENTS
36 bool 37 def_bool y
37 default y
38 38
39config GENERIC_CLOCKEVENTS_BROADCAST 39config GENERIC_CLOCKEVENTS_BROADCAST
40 bool 40 def_bool y
41 default y
42 depends on X86_64 || (X86_32 && X86_LOCAL_APIC) 41 depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
43 42
44config LOCKDEP_SUPPORT 43config LOCKDEP_SUPPORT
45 bool 44 def_bool y
46 default y
47 45
48config STACKTRACE_SUPPORT 46config STACKTRACE_SUPPORT
49 bool 47 def_bool y
50 default y 48
49config HAVE_LATENCYTOP_SUPPORT
50 def_bool y
51 51
52config SEMAPHORE_SLEEPERS 52config SEMAPHORE_SLEEPERS
53 bool 53 def_bool y
54 default y
55 54
56config MMU 55config MMU
57 bool 56 def_bool y
58 default y
59 57
60config ZONE_DMA 58config ZONE_DMA
61 bool 59 def_bool y
62 default y
63 60
64config QUICKLIST 61config QUICKLIST
65 bool 62 def_bool X86_32
66 default X86_32
67 63
68config SBUS 64config SBUS
69 bool 65 bool
70 66
71config GENERIC_ISA_DMA 67config GENERIC_ISA_DMA
72 bool 68 def_bool y
73 default y
74 69
75config GENERIC_IOMAP 70config GENERIC_IOMAP
76 bool 71 def_bool y
77 default y
78 72
79config GENERIC_BUG 73config GENERIC_BUG
80 bool 74 def_bool y
81 default y
82 depends on BUG 75 depends on BUG
83 76
84config GENERIC_HWEIGHT 77config GENERIC_HWEIGHT
85 bool 78 def_bool y
86 default y 79
80config GENERIC_GPIO
81 def_bool n
87 82
88config ARCH_MAY_HAVE_PC_FDC 83config ARCH_MAY_HAVE_PC_FDC
89 bool 84 def_bool y
90 default y
91 85
92config DMI 86config DMI
93 bool 87 def_bool y
94 default y
95 88
96config RWSEM_GENERIC_SPINLOCK 89config RWSEM_GENERIC_SPINLOCK
97 def_bool !X86_XADD 90 def_bool !X86_XADD
@@ -112,10 +105,18 @@ config GENERIC_TIME_VSYSCALL
112 bool 105 bool
113 default X86_64 106 default X86_64
114 107
115config ARCH_SUPPORTS_OPROFILE 108config HAVE_SETUP_PER_CPU_AREA
116 bool 109 def_bool X86_64
117 default y 110
111select HAVE_KVM
118 112
113config ARCH_HIBERNATION_POSSIBLE
114 def_bool y
115 depends on !SMP || !X86_VOYAGER
116
117config ARCH_SUSPEND_POSSIBLE
118 def_bool y
119 depends on !X86_VOYAGER
119 120
120config ZONE_DMA32 121config ZONE_DMA32
121 bool 122 bool
@@ -144,9 +145,17 @@ config GENERIC_PENDING_IRQ
144 145
145config X86_SMP 146config X86_SMP
146 bool 147 bool
147 depends on X86_32 && SMP && !X86_VOYAGER 148 depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
148 default y 149 default y
149 150
151config X86_32_SMP
152 def_bool y
153 depends on X86_32 && SMP
154
155config X86_64_SMP
156 def_bool y
157 depends on X86_64 && SMP
158
150config X86_HT 159config X86_HT
151 bool 160 bool
152 depends on SMP 161 depends on SMP
@@ -193,8 +202,7 @@ config SMP
193 Y to "Enhanced Real Time Clock Support", below. The "Advanced Power 202 Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
194 Management" code will be disabled if you say Y here. 203 Management" code will be disabled if you say Y here.
195 204
196 See also the <file:Documentation/smp.txt>, 205 See also <file:Documentation/i386/IO-APIC.txt>,
197 <file:Documentation/i386/IO-APIC.txt>,
198 <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at 206 <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
199 <http://www.tldp.org/docs.html#howto>. 207 <http://www.tldp.org/docs.html#howto>.
200 208
@@ -292,6 +300,19 @@ config X86_ES7000
292 Only choose this option if you have such a system, otherwise you 300 Only choose this option if you have such a system, otherwise you
293 should say N here. 301 should say N here.
294 302
303config X86_RDC321X
304 bool "RDC R-321x SoC"
305 depends on X86_32
306 select M486
307 select X86_REBOOTFIXUPS
308 select GENERIC_GPIO
309 select LEDS_CLASS
310 select LEDS_GPIO
311 help
312 This option is needed for RDC R-321x system-on-chip, also known
313 as R-8610-(G).
314 If you don't have one of these chips, you should say N here.
315
295config X86_VSMP 316config X86_VSMP
296 bool "Support for ScaleMP vSMP" 317 bool "Support for ScaleMP vSMP"
297 depends on X86_64 && PCI 318 depends on X86_64 && PCI
@@ -303,8 +324,8 @@ config X86_VSMP
303endchoice 324endchoice
304 325
305config SCHED_NO_NO_OMIT_FRAME_POINTER 326config SCHED_NO_NO_OMIT_FRAME_POINTER
306 bool "Single-depth WCHAN output" 327 def_bool y
307 default y 328 prompt "Single-depth WCHAN output"
308 depends on X86_32 329 depends on X86_32
309 help 330 help
310 Calculate simpler /proc/<PID>/wchan values. If this option 331 Calculate simpler /proc/<PID>/wchan values. If this option
@@ -314,18 +335,8 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER
314 335
315 If in doubt, say "Y". 336 If in doubt, say "Y".
316 337
317config PARAVIRT
318 bool
319 depends on X86_32 && !(X86_VISWS || X86_VOYAGER)
320 help
321 This changes the kernel so it can modify itself when it is run
322 under a hypervisor, potentially improving performance significantly
323 over full virtualization. However, when run without a hypervisor
324 the kernel is theoretically slower and slightly larger.
325
326menuconfig PARAVIRT_GUEST 338menuconfig PARAVIRT_GUEST
327 bool "Paravirtualized guest support" 339 bool "Paravirtualized guest support"
328 depends on X86_32
329 help 340 help
330 Say Y here to get to see options related to running Linux under 341 Say Y here to get to see options related to running Linux under
331 various hypervisors. This option alone does not add any kernel code. 342 various hypervisors. This option alone does not add any kernel code.
@@ -339,6 +350,7 @@ source "arch/x86/xen/Kconfig"
339config VMI 350config VMI
340 bool "VMI Guest support" 351 bool "VMI Guest support"
341 select PARAVIRT 352 select PARAVIRT
353 depends on X86_32
342 depends on !(X86_VISWS || X86_VOYAGER) 354 depends on !(X86_VISWS || X86_VOYAGER)
343 help 355 help
344 VMI provides a paravirtualized interface to the VMware ESX server 356 VMI provides a paravirtualized interface to the VMware ESX server
@@ -348,40 +360,43 @@ config VMI
348 360
349source "arch/x86/lguest/Kconfig" 361source "arch/x86/lguest/Kconfig"
350 362
363config PARAVIRT
364 bool "Enable paravirtualization code"
365 depends on !(X86_VISWS || X86_VOYAGER)
366 help
367 This changes the kernel so it can modify itself when it is run
368 under a hypervisor, potentially improving performance significantly
369 over full virtualization. However, when run without a hypervisor
370 the kernel is theoretically slower and slightly larger.
371
351endif 372endif
352 373
353config ACPI_SRAT 374config ACPI_SRAT
354 bool 375 def_bool y
355 default y
356 depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) 376 depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH)
357 select ACPI_NUMA 377 select ACPI_NUMA
358 378
359config HAVE_ARCH_PARSE_SRAT 379config HAVE_ARCH_PARSE_SRAT
360 bool 380 def_bool y
361 default y 381 depends on ACPI_SRAT
362 depends on ACPI_SRAT
363 382
364config X86_SUMMIT_NUMA 383config X86_SUMMIT_NUMA
365 bool 384 def_bool y
366 default y
367 depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH) 385 depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH)
368 386
369config X86_CYCLONE_TIMER 387config X86_CYCLONE_TIMER
370 bool 388 def_bool y
371 default y
372 depends on X86_32 && X86_SUMMIT || X86_GENERICARCH 389 depends on X86_32 && X86_SUMMIT || X86_GENERICARCH
373 390
374config ES7000_CLUSTERED_APIC 391config ES7000_CLUSTERED_APIC
375 bool 392 def_bool y
376 default y
377 depends on SMP && X86_ES7000 && MPENTIUMIII 393 depends on SMP && X86_ES7000 && MPENTIUMIII
378 394
379source "arch/x86/Kconfig.cpu" 395source "arch/x86/Kconfig.cpu"
380 396
381config HPET_TIMER 397config HPET_TIMER
382 bool 398 def_bool X86_64
383 prompt "HPET Timer Support" if X86_32 399 prompt "HPET Timer Support" if X86_32
384 default X86_64
385 help 400 help
386 Use the IA-PC HPET (High Precision Event Timer) to manage 401 Use the IA-PC HPET (High Precision Event Timer) to manage
387 time in preference to the PIT and RTC, if a HPET is 402 time in preference to the PIT and RTC, if a HPET is
@@ -399,9 +414,8 @@ config HPET_TIMER
399 Choose N to continue using the legacy 8254 timer. 414 Choose N to continue using the legacy 8254 timer.
400 415
401config HPET_EMULATE_RTC 416config HPET_EMULATE_RTC
402 bool 417 def_bool y
403 depends on HPET_TIMER && RTC=y 418 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
404 default y
405 419
406# Mark as embedded because too many people got it wrong. 420# Mark as embedded because too many people got it wrong.
407# The code disables itself when not needed. 421# The code disables itself when not needed.
@@ -441,8 +455,8 @@ config CALGARY_IOMMU
441 If unsure, say Y. 455 If unsure, say Y.
442 456
443config CALGARY_IOMMU_ENABLED_BY_DEFAULT 457config CALGARY_IOMMU_ENABLED_BY_DEFAULT
444 bool "Should Calgary be enabled by default?" 458 def_bool y
445 default y 459 prompt "Should Calgary be enabled by default?"
446 depends on CALGARY_IOMMU 460 depends on CALGARY_IOMMU
447 help 461 help
448 Should Calgary be enabled by default? if you choose 'y', Calgary 462 Should Calgary be enabled by default? if you choose 'y', Calgary
@@ -451,6 +465,9 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
451 Calgary anyway, pass 'iommu=calgary' on the kernel command line. 465 Calgary anyway, pass 'iommu=calgary' on the kernel command line.
452 If unsure, say Y. 466 If unsure, say Y.
453 467
468config IOMMU_HELPER
469 def_bool (CALGARY_IOMMU || GART_IOMMU)
470
454# need this always selected by IOMMU for the VIA workaround 471# need this always selected by IOMMU for the VIA workaround
455config SWIOTLB 472config SWIOTLB
456 bool 473 bool
@@ -486,9 +503,9 @@ config SCHED_SMT
486 N here. 503 N here.
487 504
488config SCHED_MC 505config SCHED_MC
489 bool "Multi-core scheduler support" 506 def_bool y
507 prompt "Multi-core scheduler support"
490 depends on (X86_64 && SMP) || (X86_32 && X86_HT) 508 depends on (X86_64 && SMP) || (X86_32 && X86_HT)
491 default y
492 help 509 help
493 Multi-core scheduler support improves the CPU scheduler's decision 510 Multi-core scheduler support improves the CPU scheduler's decision
494 making when dealing with multi-core CPU chips at a cost of slightly 511 making when dealing with multi-core CPU chips at a cost of slightly
@@ -522,19 +539,16 @@ config X86_UP_IOAPIC
522 an IO-APIC, then the kernel will still run with no slowdown at all. 539 an IO-APIC, then the kernel will still run with no slowdown at all.
523 540
524config X86_LOCAL_APIC 541config X86_LOCAL_APIC
525 bool 542 def_bool y
526 depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) 543 depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH))
527 default y
528 544
529config X86_IO_APIC 545config X86_IO_APIC
530 bool 546 def_bool y
531 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) 547 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH))
532 default y
533 548
534config X86_VISWS_APIC 549config X86_VISWS_APIC
535 bool 550 def_bool y
536 depends on X86_32 && X86_VISWS 551 depends on X86_32 && X86_VISWS
537 default y
538 552
539config X86_MCE 553config X86_MCE
540 bool "Machine Check Exception" 554 bool "Machine Check Exception"
@@ -554,17 +568,17 @@ config X86_MCE
554 the 386 and 486, so nearly everyone can say Y here. 568 the 386 and 486, so nearly everyone can say Y here.
555 569
556config X86_MCE_INTEL 570config X86_MCE_INTEL
557 bool "Intel MCE features" 571 def_bool y
572 prompt "Intel MCE features"
558 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 573 depends on X86_64 && X86_MCE && X86_LOCAL_APIC
559 default y
560 help 574 help
561 Additional support for intel specific MCE features such as 575 Additional support for intel specific MCE features such as
562 the thermal monitor. 576 the thermal monitor.
563 577
564config X86_MCE_AMD 578config X86_MCE_AMD
565 bool "AMD MCE features" 579 def_bool y
580 prompt "AMD MCE features"
566 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 581 depends on X86_64 && X86_MCE && X86_LOCAL_APIC
567 default y
568 help 582 help
569 Additional support for AMD specific MCE features such as 583 Additional support for AMD specific MCE features such as
570 the DRAM Error Threshold. 584 the DRAM Error Threshold.
@@ -637,9 +651,9 @@ config I8K
637 Say N otherwise. 651 Say N otherwise.
638 652
639config X86_REBOOTFIXUPS 653config X86_REBOOTFIXUPS
640 bool "Enable X86 board specific fixups for reboot" 654 def_bool n
655 prompt "Enable X86 board specific fixups for reboot"
641 depends on X86_32 && X86 656 depends on X86_32 && X86
642 default n
643 ---help--- 657 ---help---
644 This enables chipset and/or board specific fixups to be done 658 This enables chipset and/or board specific fixups to be done
645 in order to get reboot to work correctly. This is only needed on 659 in order to get reboot to work correctly. This is only needed on
@@ -648,7 +662,7 @@ config X86_REBOOTFIXUPS
648 system. 662 system.
649 663
650 Currently, the only fixup is for the Geode machines using 664 Currently, the only fixup is for the Geode machines using
651 CS5530A and CS5536 chipsets. 665 CS5530A and CS5536 chipsets and the RDC R-321x SoC.
652 666
653 Say Y if you want to enable the fixup. Currently, it's safe to 667 Say Y if you want to enable the fixup. Currently, it's safe to
654 enable this option even if you don't need it. 668 enable this option even if you don't need it.
@@ -672,9 +686,8 @@ config MICROCODE
672 module will be called microcode. 686 module will be called microcode.
673 687
674config MICROCODE_OLD_INTERFACE 688config MICROCODE_OLD_INTERFACE
675 bool 689 def_bool y
676 depends on MICROCODE 690 depends on MICROCODE
677 default y
678 691
679config X86_MSR 692config X86_MSR
680 tristate "/dev/cpu/*/msr - Model-specific register support" 693 tristate "/dev/cpu/*/msr - Model-specific register support"
@@ -798,13 +811,12 @@ config PAGE_OFFSET
798 depends on X86_32 811 depends on X86_32
799 812
800config HIGHMEM 813config HIGHMEM
801 bool 814 def_bool y
802 depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) 815 depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
803 default y
804 816
805config X86_PAE 817config X86_PAE
806 bool "PAE (Physical Address Extension) Support" 818 def_bool n
807 default n 819 prompt "PAE (Physical Address Extension) Support"
808 depends on X86_32 && !HIGHMEM4G 820 depends on X86_32 && !HIGHMEM4G
809 select RESOURCES_64BIT 821 select RESOURCES_64BIT
810 help 822 help
@@ -836,10 +848,10 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
836 depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) 848 depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
837 849
838config K8_NUMA 850config K8_NUMA
839 bool "Old style AMD Opteron NUMA detection" 851 def_bool y
840 depends on X86_64 && NUMA && PCI 852 prompt "Old style AMD Opteron NUMA detection"
841 default y 853 depends on X86_64 && NUMA && PCI
842 help 854 help
843 Enable K8 NUMA node topology detection. You should say Y here if 855 Enable K8 NUMA node topology detection. You should say Y here if
844 you have a multi processor AMD K8 system. This uses an old 856 you have a multi processor AMD K8 system. This uses an old
845 method to read the NUMA configuration directly from the builtin 857 method to read the NUMA configuration directly from the builtin
@@ -847,10 +859,10 @@ config K8_NUMA
847 instead, which also takes priority if both are compiled in. 859 instead, which also takes priority if both are compiled in.
848 860
849config X86_64_ACPI_NUMA 861config X86_64_ACPI_NUMA
850 bool "ACPI NUMA detection" 862 def_bool y
863 prompt "ACPI NUMA detection"
851 depends on X86_64 && NUMA && ACPI && PCI 864 depends on X86_64 && NUMA && ACPI && PCI
852 select ACPI_NUMA 865 select ACPI_NUMA
853 default y
854 help 866 help
855 Enable ACPI SRAT based node topology detection. 867 Enable ACPI SRAT based node topology detection.
856 868
@@ -864,52 +876,53 @@ config NUMA_EMU
864 876
865config NODES_SHIFT 877config NODES_SHIFT
866 int 878 int
879 range 1 15 if X86_64
867 default "6" if X86_64 880 default "6" if X86_64
868 default "4" if X86_NUMAQ 881 default "4" if X86_NUMAQ
869 default "3" 882 default "3"
870 depends on NEED_MULTIPLE_NODES 883 depends on NEED_MULTIPLE_NODES
871 884
872config HAVE_ARCH_BOOTMEM_NODE 885config HAVE_ARCH_BOOTMEM_NODE
873 bool 886 def_bool y
874 depends on X86_32 && NUMA 887 depends on X86_32 && NUMA
875 default y
876 888
877config ARCH_HAVE_MEMORY_PRESENT 889config ARCH_HAVE_MEMORY_PRESENT
878 bool 890 def_bool y
879 depends on X86_32 && DISCONTIGMEM 891 depends on X86_32 && DISCONTIGMEM
880 default y
881 892
882config NEED_NODE_MEMMAP_SIZE 893config NEED_NODE_MEMMAP_SIZE
883 bool 894 def_bool y
884 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) 895 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
885 default y
886 896
887config HAVE_ARCH_ALLOC_REMAP 897config HAVE_ARCH_ALLOC_REMAP
888 bool 898 def_bool y
889 depends on X86_32 && NUMA 899 depends on X86_32 && NUMA
890 default y
891 900
892config ARCH_FLATMEM_ENABLE 901config ARCH_FLATMEM_ENABLE
893 def_bool y 902 def_bool y
894 depends on (X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC) || (X86_64 && !NUMA) 903 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA
895 904
896config ARCH_DISCONTIGMEM_ENABLE 905config ARCH_DISCONTIGMEM_ENABLE
897 def_bool y 906 def_bool y
898 depends on NUMA 907 depends on NUMA && X86_32
899 908
900config ARCH_DISCONTIGMEM_DEFAULT 909config ARCH_DISCONTIGMEM_DEFAULT
901 def_bool y 910 def_bool y
902 depends on NUMA 911 depends on NUMA && X86_32
912
913config ARCH_SPARSEMEM_DEFAULT
914 def_bool y
915 depends on X86_64
903 916
904config ARCH_SPARSEMEM_ENABLE 917config ARCH_SPARSEMEM_ENABLE
905 def_bool y 918 def_bool y
906 depends on NUMA || (EXPERIMENTAL && (X86_PC || X86_64)) 919 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
907 select SPARSEMEM_STATIC if X86_32 920 select SPARSEMEM_STATIC if X86_32
908 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 921 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
909 922
910config ARCH_SELECT_MEMORY_MODEL 923config ARCH_SELECT_MEMORY_MODEL
911 def_bool y 924 def_bool y
912 depends on X86_32 && ARCH_SPARSEMEM_ENABLE 925 depends on ARCH_SPARSEMEM_ENABLE
913 926
914config ARCH_MEMORY_PROBE 927config ARCH_MEMORY_PROBE
915 def_bool X86_64 928 def_bool X86_64
@@ -987,42 +1000,32 @@ config MTRR
987 See <file:Documentation/mtrr.txt> for more information. 1000 See <file:Documentation/mtrr.txt> for more information.
988 1001
989config EFI 1002config EFI
990 bool "Boot from EFI support" 1003 def_bool n
991 depends on X86_32 && ACPI 1004 prompt "EFI runtime service support"
992 default n 1005 depends on ACPI
993 ---help--- 1006 ---help---
994 This enables the kernel to boot on EFI platforms using 1007 This enables the kernel to use EFI runtime services that are
995 system configuration information passed to it from the firmware.
996 This also enables the kernel to use any EFI runtime services that are
997 available (such as the EFI variable services). 1008 available (such as the EFI variable services).
998 1009
999 This option is only useful on systems that have EFI firmware 1010 This option is only useful on systems that have EFI firmware.
1000 and will result in a kernel image that is ~8k larger. In addition, 1011 In addition, you should use the latest ELILO loader available
1001 you must use the latest ELILO loader available at 1012 at <http://elilo.sourceforge.net> in order to take advantage
1002 <http://elilo.sourceforge.net> in order to take advantage of 1013 of EFI runtime services. However, even with this option, the
1003 kernel initialization using EFI information (neither GRUB nor LILO know 1014 resultant kernel should continue to boot on existing non-EFI
1004 anything about EFI). However, even with this option, the resultant 1015 platforms.
1005 kernel should continue to boot on existing non-EFI platforms.
1006 1016
1007config IRQBALANCE 1017config IRQBALANCE
1008 bool "Enable kernel irq balancing" 1018 def_bool y
1019 prompt "Enable kernel irq balancing"
1009 depends on X86_32 && SMP && X86_IO_APIC 1020 depends on X86_32 && SMP && X86_IO_APIC
1010 default y
1011 help 1021 help
1012 The default yes will allow the kernel to do irq load balancing. 1022 The default yes will allow the kernel to do irq load balancing.
1013 Saying no will keep the kernel from doing irq load balancing. 1023 Saying no will keep the kernel from doing irq load balancing.
1014 1024
1015# turning this on wastes a bunch of space.
1016# Summit needs it only when NUMA is on
1017config BOOT_IOREMAP
1018 bool
1019 depends on X86_32 && (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
1020 default y
1021
1022config SECCOMP 1025config SECCOMP
1023 bool "Enable seccomp to safely compute untrusted bytecode" 1026 def_bool y
1027 prompt "Enable seccomp to safely compute untrusted bytecode"
1024 depends on PROC_FS 1028 depends on PROC_FS
1025 default y
1026 help 1029 help
1027 This kernel feature is useful for number crunching applications 1030 This kernel feature is useful for number crunching applications
1028 that may need to compute untrusted bytecode during their 1031 that may need to compute untrusted bytecode during their
@@ -1189,11 +1192,11 @@ config HOTPLUG_CPU
1189 suspend. 1192 suspend.
1190 1193
1191config COMPAT_VDSO 1194config COMPAT_VDSO
1192 bool "Compat VDSO support" 1195 def_bool y
1193 default y 1196 prompt "Compat VDSO support"
1194 depends on X86_32 1197 depends on X86_32 || IA32_EMULATION
1195 help 1198 help
1196 Map the VDSO to the predictable old-style address too. 1199 Map the 32-bit VDSO to the predictable old-style address too.
1197 ---help--- 1200 ---help---
1198 Say N here if you are running a sufficiently recent glibc 1201 Say N here if you are running a sufficiently recent glibc
1199 version (2.3.3 or later), to remove the high-mapped 1202 version (2.3.3 or later), to remove the high-mapped
@@ -1207,30 +1210,26 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
1207 def_bool y 1210 def_bool y
1208 depends on X86_64 || (X86_32 && HIGHMEM) 1211 depends on X86_64 || (X86_32 && HIGHMEM)
1209 1212
1210config MEMORY_HOTPLUG_RESERVE
1211 def_bool X86_64
1212 depends on (MEMORY_HOTPLUG && DISCONTIGMEM)
1213
1214config HAVE_ARCH_EARLY_PFN_TO_NID 1213config HAVE_ARCH_EARLY_PFN_TO_NID
1215 def_bool X86_64 1214 def_bool X86_64
1216 depends on NUMA 1215 depends on NUMA
1217 1216
1218config OUT_OF_LINE_PFN_TO_PAGE
1219 def_bool X86_64
1220 depends on DISCONTIGMEM
1221
1222menu "Power management options" 1217menu "Power management options"
1223 depends on !X86_VOYAGER 1218 depends on !X86_VOYAGER
1224 1219
1225config ARCH_HIBERNATION_HEADER 1220config ARCH_HIBERNATION_HEADER
1226 bool 1221 def_bool y
1227 depends on X86_64 && HIBERNATION 1222 depends on X86_64 && HIBERNATION
1228 default y
1229 1223
1230source "kernel/power/Kconfig" 1224source "kernel/power/Kconfig"
1231 1225
1232source "drivers/acpi/Kconfig" 1226source "drivers/acpi/Kconfig"
1233 1227
1228config X86_APM_BOOT
1229 bool
1230 default y
1231 depends on APM || APM_MODULE
1232
1234menuconfig APM 1233menuconfig APM
1235 tristate "APM (Advanced Power Management) BIOS support" 1234 tristate "APM (Advanced Power Management) BIOS support"
1236 depends on X86_32 && PM_SLEEP && !X86_VISWS 1235 depends on X86_32 && PM_SLEEP && !X86_VISWS
@@ -1371,7 +1370,7 @@ menu "Bus options (PCI etc.)"
1371config PCI 1370config PCI
1372 bool "PCI support" if !X86_VISWS 1371 bool "PCI support" if !X86_VISWS
1373 depends on !X86_VOYAGER 1372 depends on !X86_VOYAGER
1374 default y if X86_VISWS 1373 default y
1375 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) 1374 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
1376 help 1375 help
1377 Find out whether you have a PCI motherboard. PCI is the name of a 1376 Find out whether you have a PCI motherboard. PCI is the name of a
@@ -1379,11 +1378,6 @@ config PCI
1379 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or 1378 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
1380 VESA. If you have PCI, say Y, otherwise N. 1379 VESA. If you have PCI, say Y, otherwise N.
1381 1380
1382 The PCI-HOWTO, available from
1383 <http://www.tldp.org/docs.html#howto>, contains valuable
1384 information about which PCI hardware does work under Linux and which
1385 doesn't.
1386
1387choice 1381choice
1388 prompt "PCI access mode" 1382 prompt "PCI access mode"
1389 depends on X86_32 && PCI && !X86_VISWS 1383 depends on X86_32 && PCI && !X86_VISWS
@@ -1418,25 +1412,21 @@ config PCI_GOANY
1418endchoice 1412endchoice
1419 1413
1420config PCI_BIOS 1414config PCI_BIOS
1421 bool 1415 def_bool y
1422 depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) 1416 depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
1423 default y
1424 1417
1425# x86-64 doesn't support PCI BIOS access from long mode so always go direct. 1418# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
1426config PCI_DIRECT 1419config PCI_DIRECT
1427 bool 1420 def_bool y
1428 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS) 1421 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS)
1429 default y
1430 1422
1431config PCI_MMCONFIG 1423config PCI_MMCONFIG
1432 bool 1424 def_bool y
1433 depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) 1425 depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
1434 default y
1435 1426
1436config PCI_DOMAINS 1427config PCI_DOMAINS
1437 bool 1428 def_bool y
1438 depends on PCI 1429 depends on PCI
1439 default y
1440 1430
1441config PCI_MMCONFIG 1431config PCI_MMCONFIG
1442 bool "Support mmconfig PCI config space access" 1432 bool "Support mmconfig PCI config space access"
@@ -1453,9 +1443,9 @@ config DMAR
1453 remapping devices. 1443 remapping devices.
1454 1444
1455config DMAR_GFX_WA 1445config DMAR_GFX_WA
1456 bool "Support for Graphics workaround" 1446 def_bool y
1447 prompt "Support for Graphics workaround"
1457 depends on DMAR 1448 depends on DMAR
1458 default y
1459 help 1449 help
1460 Current Graphics drivers tend to use physical address 1450 Current Graphics drivers tend to use physical address
1461 for DMA and avoid using DMA APIs. Setting this config 1451 for DMA and avoid using DMA APIs. Setting this config
@@ -1464,9 +1454,8 @@ config DMAR_GFX_WA
1464 to use physical addresses for DMA. 1454 to use physical addresses for DMA.
1465 1455
1466config DMAR_FLOPPY_WA 1456config DMAR_FLOPPY_WA
1467 bool 1457 def_bool y
1468 depends on DMAR 1458 depends on DMAR
1469 default y
1470 help 1459 help
1471 Floppy disk drivers are know to bypass DMA API calls 1460 Floppy disk drivers are know to bypass DMA API calls
1472 thereby failing to work when IOMMU is enabled. This 1461 thereby failing to work when IOMMU is enabled. This
@@ -1479,8 +1468,7 @@ source "drivers/pci/Kconfig"
1479 1468
1480# x86_64 have no ISA slots, but do have ISA-style DMA. 1469# x86_64 have no ISA slots, but do have ISA-style DMA.
1481config ISA_DMA_API 1470config ISA_DMA_API
1482 bool 1471 def_bool y
1483 default y
1484 1472
1485if X86_32 1473if X86_32
1486 1474
@@ -1546,9 +1534,9 @@ config SCx200HR_TIMER
1546 other workaround is idle=poll boot option. 1534 other workaround is idle=poll boot option.
1547 1535
1548config GEODE_MFGPT_TIMER 1536config GEODE_MFGPT_TIMER
1549 bool "Geode Multi-Function General Purpose Timer (MFGPT) events" 1537 def_bool y
1538 prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
1550 depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS 1539 depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
1551 default y
1552 help 1540 help
1553 This driver provides a clock event source based on the MFGPT 1541 This driver provides a clock event source based on the MFGPT
1554 timer(s) in the CS5535 and CS5536 companion chip for the geode. 1542 timer(s) in the CS5535 and CS5536 companion chip for the geode.
@@ -1575,6 +1563,7 @@ source "fs/Kconfig.binfmt"
1575config IA32_EMULATION 1563config IA32_EMULATION
1576 bool "IA32 Emulation" 1564 bool "IA32 Emulation"
1577 depends on X86_64 1565 depends on X86_64
1566 select COMPAT_BINFMT_ELF
1578 help 1567 help
1579 Include code to run 32-bit programs under a 64-bit kernel. You should 1568 Include code to run 32-bit programs under a 64-bit kernel. You should
1580 likely turn this on, unless you're 100% sure that you don't have any 1569 likely turn this on, unless you're 100% sure that you don't have any
@@ -1587,18 +1576,16 @@ config IA32_AOUT
1587 Support old a.out binaries in the 32bit emulation. 1576 Support old a.out binaries in the 32bit emulation.
1588 1577
1589config COMPAT 1578config COMPAT
1590 bool 1579 def_bool y
1591 depends on IA32_EMULATION 1580 depends on IA32_EMULATION
1592 default y
1593 1581
1594config COMPAT_FOR_U64_ALIGNMENT 1582config COMPAT_FOR_U64_ALIGNMENT
1595 def_bool COMPAT 1583 def_bool COMPAT
1596 depends on X86_64 1584 depends on X86_64
1597 1585
1598config SYSVIPC_COMPAT 1586config SYSVIPC_COMPAT
1599 bool 1587 def_bool y
1600 depends on X86_64 && COMPAT && SYSVIPC 1588 depends on X86_64 && COMPAT && SYSVIPC
1601 default y
1602 1589
1603endmenu 1590endmenu
1604 1591
@@ -1611,12 +1598,12 @@ source "drivers/firmware/Kconfig"
1611 1598
1612source "fs/Kconfig" 1599source "fs/Kconfig"
1613 1600
1614source "kernel/Kconfig.instrumentation"
1615
1616source "arch/x86/Kconfig.debug" 1601source "arch/x86/Kconfig.debug"
1617 1602
1618source "security/Kconfig" 1603source "security/Kconfig"
1619 1604
1620source "crypto/Kconfig" 1605source "crypto/Kconfig"
1621 1606
1607source "arch/x86/kvm/Kconfig"
1608
1622source "lib/Kconfig" 1609source "lib/Kconfig"
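Most of the arch/x86/Kconfig hunks above apply one mechanical conversion: a hidden bool symbol that was forced on with "default y" collapses to a single "def_bool y", and symbols that keep a user prompt move that prompt onto its own "prompt" line. A minimal sketch of the two forms, using symbols taken from the hunks above (illustrative only, not literal patch content):

    # before: three lines for an always-enabled hidden symbol
    config GENERIC_TIME
            bool
            default y

    # after: one line, identical semantics
    config GENERIC_TIME
            def_bool y

    # options that stay user-visible keep the prompt on a separate line
    config SCHED_MC
            def_bool y
            prompt "Multi-core scheduler support"
            depends on (X86_64 && SMP) || (X86_32 && X86_HT)

Because "def_bool y" is shorthand for "bool" plus "default y", these conversions by themselves do not change the resulting .config; the hunks that also touch "depends on" or "select" lines are the ones that alter behaviour.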
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index c30162202dc..e09a6b73a1a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -219,10 +219,10 @@ config MGEODEGX1
219 Select this for a Geode GX1 (Cyrix MediaGX) chip. 219 Select this for a Geode GX1 (Cyrix MediaGX) chip.
220 220
221config MGEODE_LX 221config MGEODE_LX
222 bool "Geode GX/LX" 222 bool "Geode GX/LX"
223 depends on X86_32 223 depends on X86_32
224 help 224 help
225 Select this for AMD Geode GX and LX processors. 225 Select this for AMD Geode GX and LX processors.
226 226
227config MCYRIXIII 227config MCYRIXIII
228 bool "CyrixIII/VIA-C3" 228 bool "CyrixIII/VIA-C3"
@@ -258,7 +258,7 @@ config MPSC
258 Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey 258 Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey
259 Xeon CPUs with Intel 64bit which is compatible with x86-64. 259 Xeon CPUs with Intel 64bit which is compatible with x86-64.
260 Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the 260 Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
261 Netburst core and shouldn't use this option. You can distinguish them 261 Netburst core and shouldn't use this option. You can distinguish them
262 using the cpu family field 262 using the cpu family field
263 in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. 263 in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
264 264
@@ -317,81 +317,75 @@ config X86_L1_CACHE_SHIFT
317 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 317 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
318 318
319config X86_XADD 319config X86_XADD
320 bool 320 def_bool y
321 depends on X86_32 && !M386 321 depends on X86_32 && !M386
322 default y
323 322
324config X86_PPRO_FENCE 323config X86_PPRO_FENCE
325 bool 324 bool "PentiumPro memory ordering errata workaround"
326 depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 325 depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1
327 default y 326 help
327 Old PentiumPro multiprocessor systems had errata that could cause memory
328 operations to violate the x86 ordering standard in rare cases. Enabling this
329 option will attempt to work around some (but not all) occurances of
330 this problem, at the cost of much heavier spinlock and memory barrier
331 operations.
332
333 If unsure, say n here. Even distro kernels should think twice before enabling
334 this: there are few systems, and an unlikely bug.
328 335
329config X86_F00F_BUG 336config X86_F00F_BUG
330 bool 337 def_bool y
331 depends on M586MMX || M586TSC || M586 || M486 || M386 338 depends on M586MMX || M586TSC || M586 || M486 || M386
332 default y
333 339
334config X86_WP_WORKS_OK 340config X86_WP_WORKS_OK
335 bool 341 def_bool y
336 depends on X86_32 && !M386 342 depends on X86_32 && !M386
337 default y
338 343
339config X86_INVLPG 344config X86_INVLPG
340 bool 345 def_bool y
341 depends on X86_32 && !M386 346 depends on X86_32 && !M386
342 default y
343 347
344config X86_BSWAP 348config X86_BSWAP
345 bool 349 def_bool y
346 depends on X86_32 && !M386 350 depends on X86_32 && !M386
347 default y
348 351
349config X86_POPAD_OK 352config X86_POPAD_OK
350 bool 353 def_bool y
351 depends on X86_32 && !M386 354 depends on X86_32 && !M386
352 default y
353 355
354config X86_ALIGNMENT_16 356config X86_ALIGNMENT_16
355 bool 357 def_bool y
356 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 358 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
357 default y
358 359
359config X86_GOOD_APIC 360config X86_GOOD_APIC
360 bool 361 def_bool y
361 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64 362 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
362 default y
363 363
364config X86_INTEL_USERCOPY 364config X86_INTEL_USERCOPY
365 bool 365 def_bool y
366 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 366 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
367 default y
368 367
369config X86_USE_PPRO_CHECKSUM 368config X86_USE_PPRO_CHECKSUM
370 bool 369 def_bool y
371 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 370 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
372 default y
373 371
374config X86_USE_3DNOW 372config X86_USE_3DNOW
375 bool 373 def_bool y
376 depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML 374 depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
377 default y
378 375
379config X86_OOSTORE 376config X86_OOSTORE
380 bool 377 def_bool y
381 depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR 378 depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
382 default y
383 379
384config X86_TSC 380config X86_TSC
385 bool 381 def_bool y
386 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 382 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
387 default y
388 383
389# this should be set for all -march=.. options where the compiler 384# this should be set for all -march=.. options where the compiler
390# generates cmov. 385# generates cmov.
391config X86_CMOV 386config X86_CMOV
392 bool 387 def_bool y
393 depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7) 388 depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
394 default y
395 389
396config X86_MINIMUM_CPU_FAMILY 390config X86_MINIMUM_CPU_FAMILY
397 int 391 int
@@ -399,3 +393,6 @@ config X86_MINIMUM_CPU_FAMILY
399 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) 393 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
400 default "3" 394 default "3"
401 395
396config X86_DEBUGCTLMSR
397 def_bool y
398 depends on !(M586MMX || M586TSC || M586 || M486 || M386)
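One Kconfig.cpu change goes in the opposite direction: X86_PPRO_FENCE stops being a hidden symbol that was silently enabled for the affected CPU choices and becomes an explicit, default-off prompt with help text. Condensed from the hunk above into plain before/after form (a sketch, not literal patch text):

    # before: the workaround is always built in when one of the listed CPUs is selected
    config X86_PPRO_FENCE
            bool
            depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1
            default y

    # after: the heavier barriers are only compiled in if the user asks for them
    config X86_PPRO_FENCE
            bool "PentiumPro memory ordering errata workaround"
            depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1

The other entries in this file (X86_XADD, X86_F00F_BUG, X86_TSC and so on) are straight def_bool conversions of the kind sketched after the main Kconfig diff, plus the new hidden X86_DEBUGCTLMSR symbol.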
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 761ca7b5f12..fa555148823 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -6,7 +6,7 @@ config TRACE_IRQFLAGS_SUPPORT
6source "lib/Kconfig.debug" 6source "lib/Kconfig.debug"
7 7
8config EARLY_PRINTK 8config EARLY_PRINTK
9 bool "Early printk" if EMBEDDED && DEBUG_KERNEL && X86_32 9 bool "Early printk" if EMBEDDED
10 default y 10 default y
11 help 11 help
12 Write kernel log output directly into the VGA buffer or to a serial 12 Write kernel log output directly into the VGA buffer or to a serial
@@ -40,22 +40,49 @@ comment "Page alloc debug is incompatible with Software Suspend on i386"
40 40
41config DEBUG_PAGEALLOC 41config DEBUG_PAGEALLOC
42 bool "Debug page memory allocations" 42 bool "Debug page memory allocations"
43 depends on DEBUG_KERNEL && !HIBERNATION && !HUGETLBFS 43 depends on DEBUG_KERNEL && X86_32
44 depends on X86_32
45 help 44 help
46 Unmap pages from the kernel linear mapping after free_pages(). 45 Unmap pages from the kernel linear mapping after free_pages().
47 This results in a large slowdown, but helps to find certain types 46 This results in a large slowdown, but helps to find certain types
48 of memory corruptions. 47 of memory corruptions.
49 48
49config DEBUG_PER_CPU_MAPS
50 bool "Debug access to per_cpu maps"
51 depends on DEBUG_KERNEL
52 depends on X86_64_SMP
53 default n
54 help
55 Say Y to verify that the per_cpu map being accessed has
56 been setup. Adds a fair amount of code to kernel memory
57 and decreases performance.
58
59 Say N if unsure.
60
50config DEBUG_RODATA 61config DEBUG_RODATA
51 bool "Write protect kernel read-only data structures" 62 bool "Write protect kernel read-only data structures"
63 default y
52 depends on DEBUG_KERNEL 64 depends on DEBUG_KERNEL
53 help 65 help
54 Mark the kernel read-only data as write-protected in the pagetables, 66 Mark the kernel read-only data as write-protected in the pagetables,
55 in order to catch accidental (and incorrect) writes to such const 67 in order to catch accidental (and incorrect) writes to such const
56 data. This option may have a slight performance impact because a 68 data. This is recommended so that we can catch kernel bugs sooner.
57 portion of the kernel code won't be covered by a 2MB TLB anymore. 69 If in doubt, say "Y".
58 If in doubt, say "N". 70
71config DEBUG_RODATA_TEST
72 bool "Testcase for the DEBUG_RODATA feature"
73 depends on DEBUG_RODATA
74 help
75 This option enables a testcase for the DEBUG_RODATA
76 feature as well as for the change_page_attr() infrastructure.
77 If in doubt, say "N"
78
79config DEBUG_NX_TEST
80 tristate "Testcase for the NX non-executable stack feature"
81 depends on DEBUG_KERNEL && m
82 help
83 This option enables a testcase for the CPU NX capability
84 and the software setup of this feature.
85 If in doubt, say "N"
59 86
60config 4KSTACKS 87config 4KSTACKS
61 bool "Use 4Kb for kernel stacks instead of 8Kb" 88 bool "Use 4Kb for kernel stacks instead of 8Kb"
@@ -75,8 +102,7 @@ config X86_FIND_SMP_CONFIG
75 102
76config X86_MPPARSE 103config X86_MPPARSE
77 def_bool y 104 def_bool y
78 depends on X86_LOCAL_APIC && !X86_VISWS 105 depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64
79 depends on X86_32
80 106
81config DOUBLEFAULT 107config DOUBLEFAULT
82 default y 108 default y
@@ -112,4 +138,91 @@ config IOMMU_LEAK
112 Add a simple leak tracer to the IOMMU code. This is useful when you 138 Add a simple leak tracer to the IOMMU code. This is useful when you
113 are debugging a buggy device driver that leaks IOMMU mappings. 139 are debugging a buggy device driver that leaks IOMMU mappings.
114 140
141#
142# IO delay types:
143#
144
145config IO_DELAY_TYPE_0X80
146 int
147 default "0"
148
149config IO_DELAY_TYPE_0XED
150 int
151 default "1"
152
153config IO_DELAY_TYPE_UDELAY
154 int
155 default "2"
156
157config IO_DELAY_TYPE_NONE
158 int
159 default "3"
160
161choice
162 prompt "IO delay type"
163 default IO_DELAY_0XED
164
165config IO_DELAY_0X80
166 bool "port 0x80 based port-IO delay [recommended]"
167 help
168 This is the traditional Linux IO delay used for in/out_p.
169 It is the most tested hence safest selection here.
170
171config IO_DELAY_0XED
172 bool "port 0xed based port-IO delay"
173 help
174 Use port 0xed as the IO delay. This frees up port 0x80 which is
175 often used as a hardware-debug port.
176
177config IO_DELAY_UDELAY
178 bool "udelay based port-IO delay"
179 help
180 Use udelay(2) as the IO delay method. This provides the delay
181 while not having any side-effect on the IO port space.
182
183config IO_DELAY_NONE
184 bool "no port-IO delay"
185 help
186 No port-IO delay. Will break on old boxes that require port-IO
187 delay for certain operations. Should work on most new machines.
188
189endchoice
190
191if IO_DELAY_0X80
192config DEFAULT_IO_DELAY_TYPE
193 int
194 default IO_DELAY_TYPE_0X80
195endif
196
197if IO_DELAY_0XED
198config DEFAULT_IO_DELAY_TYPE
199 int
200 default IO_DELAY_TYPE_0XED
201endif
202
203if IO_DELAY_UDELAY
204config DEFAULT_IO_DELAY_TYPE
205 int
206 default IO_DELAY_TYPE_UDELAY
207endif
208
209if IO_DELAY_NONE
210config DEFAULT_IO_DELAY_TYPE
211 int
212 default IO_DELAY_TYPE_NONE
213endif
214
215config DEBUG_BOOT_PARAMS
216 bool "Debug boot parameters"
217 depends on DEBUG_KERNEL
218 depends on DEBUG_FS
219 help
220 This option will cause struct boot_params to be exported via debugfs.
221
222config CPA_DEBUG
223 bool "CPA self-test code"
224 depends on DEBUG_KERNEL
225 help
226 Do change_page_attr() self-tests every 30 seconds.
227
115endmenu 228endmenu
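The IO_DELAY_TYPE_* symbols above only assign a stable numeric ID to each delay strategy, and DEFAULT_IO_DELAY_TYPE mirrors whichever entry was picked in the choice block, so C code (and an "io_delay="-style boot parameter) can select the method at run time. A minimal sketch of the kind of dispatch this enables; the variable and function names here are illustrative, not necessarily the in-tree ones:

/* Illustrative sketch: consuming the IO_DELAY_TYPE_* constants from Kconfig. */
#include <linux/delay.h>

static int io_delay_type = CONFIG_DEFAULT_IO_DELAY_TYPE;

void sketch_io_delay(void)
{
	switch (io_delay_type) {
	case CONFIG_IO_DELAY_TYPE_0X80:
		asm volatile ("outb %al, $0x80");	/* traditional dummy port write */
		break;
	case CONFIG_IO_DELAY_TYPE_0XED:
		asm volatile ("outb %al, $0xed");	/* keeps port 0x80 free for debug cards */
		break;
	case CONFIG_IO_DELAY_TYPE_UDELAY:
		udelay(2);				/* no side effects in I/O port space */
		break;
	case CONFIG_IO_DELAY_TYPE_NONE:
	default:
		break;					/* no delay at all */
	}
}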
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 7aa1dc6d67c..364865b1b08 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,13 +7,253 @@ else
7 KBUILD_DEFCONFIG := $(ARCH)_defconfig 7 KBUILD_DEFCONFIG := $(ARCH)_defconfig
8endif 8endif
9 9
10# No need to remake these files 10core-$(CONFIG_KVM) += arch/x86/kvm/
11$(srctree)/arch/x86/Makefile%: ; 11
12# BITS is used as extension for files which are available in a 32 bit
13# and a 64 bit version to simplify shared Makefiles.
14# e.g.: obj-y += foo_$(BITS).o
15export BITS
12 16
13ifeq ($(CONFIG_X86_32),y) 17ifeq ($(CONFIG_X86_32),y)
18 BITS := 32
14 UTS_MACHINE := i386 19 UTS_MACHINE := i386
15 include $(srctree)/arch/x86/Makefile_32 20 CHECKFLAGS += -D__i386__
21
22 biarch := $(call cc-option,-m32)
23 KBUILD_AFLAGS += $(biarch)
24 KBUILD_CFLAGS += $(biarch)
25
26 ifdef CONFIG_RELOCATABLE
27 LDFLAGS_vmlinux := --emit-relocs
28 endif
29
30 KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
31
32 # prevent gcc from keeping the stack 16 byte aligned
33 KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
34
35 # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
36 # a lot more stack due to the lack of sharing of stacklots:
37 KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \
38 echo $(call cc-option,-fno-unit-at-a-time); fi ;)
39
40 # CPU-specific tuning. Anything which can be shared with UML should go here.
41 include $(srctree)/arch/x86/Makefile_32.cpu
42 KBUILD_CFLAGS += $(cflags-y)
43
44 # temporary until string.h is fixed
45 KBUILD_CFLAGS += -ffreestanding
16else 46else
47 BITS := 64
17 UTS_MACHINE := x86_64 48 UTS_MACHINE := x86_64
18 include $(srctree)/arch/x86/Makefile_64 49 CHECKFLAGS += -D__x86_64__ -m64
50
51 KBUILD_AFLAGS += -m64
52 KBUILD_CFLAGS += -m64
53
54 # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
55 cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
56 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
57
58 cflags-$(CONFIG_MCORE2) += \
59 $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
60 cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
61 KBUILD_CFLAGS += $(cflags-y)
62
63 KBUILD_CFLAGS += -mno-red-zone
64 KBUILD_CFLAGS += -mcmodel=kernel
65
66 # -funit-at-a-time shrinks the kernel .text considerably
67 # unfortunately it makes reading oopses harder.
68 KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time)
69
70 # this works around some issues with generating unwind tables in older gccs
71 # newer gccs do it by default
72 KBUILD_CFLAGS += -maccumulate-outgoing-args
73
74 stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
75 stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
76 "$(CC)" -fstack-protector )
77 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
78 "$(CC)" -fstack-protector-all )
79
80 KBUILD_CFLAGS += $(stackp-y)
19endif 81endif
82
83# Stackpointer is addressed different for 32 bit and 64 bit x86
84sp-$(CONFIG_X86_32) := esp
85sp-$(CONFIG_X86_64) := rsp
86
87# do binutils support CFI?
88cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
89# is .cfi_signal_frame supported too?
90cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
91KBUILD_AFLAGS += $(cfi) $(cfi-sigframe)
92KBUILD_CFLAGS += $(cfi) $(cfi-sigframe)
93
94LDFLAGS := -m elf_$(UTS_MACHINE)
95
96# Speed up the build
97KBUILD_CFLAGS += -pipe
98# Workaround for a gcc prelease that unfortunately was shipped in a suse release
99KBUILD_CFLAGS += -Wno-sign-compare
100#
101KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
102# prevent gcc from generating any FP code by mistake
103KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
104
105###
106# Sub architecture support
107# fcore-y is linked before mcore-y files.
108
109# Default subarch .c files
110mcore-y := arch/x86/mach-default/
111
112# Voyager subarch support
113mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager
114mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
115
116# VISWS subarch support
117mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws
118mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws/
119
120# NUMAQ subarch support
121mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq
122mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default/
123
124# BIGSMP subarch support
125mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp
126mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default/
127
128#Summit subarch support
129mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit
130mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default/
131
132# generic subarchitecture
133mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
134fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
135mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
136
137
138# ES7000 subarch support
139mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000
140fcore-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/
141mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default/
142
143# RDC R-321x subarch support
144mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x
145mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/
146core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/
147
148# default subarch .h files
149mflags-y += -Iinclude/asm-x86/mach-default
150
151# 64 bit does not support subarch support - clear sub arch variables
152fcore-$(CONFIG_X86_64) :=
153mcore-$(CONFIG_X86_64) :=
154mflags-$(CONFIG_X86_64) :=
155
156KBUILD_CFLAGS += $(mflags-y)
157KBUILD_AFLAGS += $(mflags-y)
158
159###
160# Kernel objects
161
162head-y := arch/x86/kernel/head_$(BITS).o
163head-$(CONFIG_X86_64) += arch/x86/kernel/head64.o
164head-y += arch/x86/kernel/init_task.o
165
166libs-y += arch/x86/lib/
167
168# Sub architecture files that needs linking first
169core-y += $(fcore-y)
170
171# Xen paravirtualization support
172core-$(CONFIG_XEN) += arch/x86/xen/
173
174# lguest paravirtualization support
175core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
176
177core-y += arch/x86/kernel/
178core-y += arch/x86/mm/
179
180# Remaining sub architecture files
181core-y += $(mcore-y)
182
183core-y += arch/x86/crypto/
184core-y += arch/x86/vdso/
185core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
186
187# drivers-y are linked after core-y
188drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
189drivers-$(CONFIG_PCI) += arch/x86/pci/
190
191# must be linked after kernel/
192drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
193
194ifeq ($(CONFIG_X86_32),y)
195drivers-$(CONFIG_PM) += arch/x86/power/
196drivers-$(CONFIG_FB) += arch/x86/video/
197endif
198
199####
200# boot loader support. Several targets are kept for legacy purposes
201
202boot := arch/x86/boot
203
204PHONY += zImage bzImage compressed zlilo bzlilo \
205 zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
206
207# Default kernel to build
208all: bzImage
209
210# KBUILD_IMAGE specify target image being built
211 KBUILD_IMAGE := $(boot)/bzImage
212zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage
213
214zImage bzImage: vmlinux
215 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
216 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
217 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/bzImage
218
219compressed: zImage
220
221zlilo bzlilo: vmlinux
222 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
223
224zdisk bzdisk: vmlinux
225 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
226
227fdimage fdimage144 fdimage288 isoimage: vmlinux
228 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
229
230install: vdso_install
231 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
232
233PHONY += vdso_install
234vdso_install:
235 $(Q)$(MAKE) $(build)=arch/x86/vdso $@
236
237archclean:
238 $(Q)rm -rf $(objtree)/arch/i386
239 $(Q)rm -rf $(objtree)/arch/x86_64
240 $(Q)$(MAKE) $(clean)=$(boot)
241
242define archhelp
243 echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
244 echo ' install - Install kernel using'
245 echo ' (your) ~/bin/installkernel or'
246 echo ' (distribution) /sbin/installkernel or'
247 echo ' install to $$(INSTALL_PATH) and run lilo'
248 echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
249 echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
250 echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
251 echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
252 echo ' bzdisk/fdimage*/isoimage also accept:'
253 echo ' FDARGS="..." arguments for the booted kernel'
254 echo ' FDINITRD=file initrd for the booted kernel'
255endef
256
257CLEAN_FILES += arch/x86/boot/fdimage \
258 arch/x86/boot/image.iso \
259 arch/x86/boot/mtools.conf
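The as-instr probes in the unified Makefile compile-test the assembler and, when it understands CFI directives, add -DCONFIG_AS_CFI (and -DCONFIG_AS_CFI_SIGNAL_FRAME) to both CFLAGS and AFLAGS, so annotation macros can degrade to no-ops on old binutils. A rough C-preprocessor illustration of how such a probed define is typically consumed; the macro names are placeholders, not the exact in-tree ones:

/* Illustration only: make .cfi_* annotations vanish when the assembler lacks them. */
#ifdef CONFIG_AS_CFI
# define CFI_STARTPROC	"\t.cfi_startproc\n"
# define CFI_ENDPROC	"\t.cfi_endproc\n"
#else
# define CFI_STARTPROC	""
# define CFI_ENDPROC	""
#endif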
diff --git a/arch/x86/Makefile_32 b/arch/x86/Makefile_32
deleted file mode 100644
index 50394da2f6c..00000000000
--- a/arch/x86/Makefile_32
+++ /dev/null
@@ -1,175 +0,0 @@
1#
2# i386 Makefile
3#
4# This file is included by the global makefile so that you can add your own
5# architecture-specific flags and dependencies. Remember to do have actions
6# for "archclean" cleaning up for this architecture.
7#
8# This file is subject to the terms and conditions of the GNU General Public
9# License. See the file "COPYING" in the main directory of this archive
10# for more details.
11#
12# Copyright (C) 1994 by Linus Torvalds
13#
14# 19990713 Artur Skawina <skawina@geocities.com>
15# Added '-march' and '-mpreferred-stack-boundary' support
16#
17# 20050320 Kianusch Sayah Karadji <kianusch@sk-tech.net>
18# Added support for GEODE CPU
19
20# BITS is used as extension for files which are available in a 32 bit
21# and a 64 bit version to simplify shared Makefiles.
22# e.g.: obj-y += foo_$(BITS).o
23BITS := 32
24export BITS
25
26HAS_BIARCH := $(call cc-option-yn, -m32)
27ifeq ($(HAS_BIARCH),y)
28AS := $(AS) --32
29LD := $(LD) -m elf_i386
30CC := $(CC) -m32
31endif
32
33LDFLAGS := -m elf_i386
34OBJCOPYFLAGS := -O binary -R .note -R .comment -S
35ifdef CONFIG_RELOCATABLE
36LDFLAGS_vmlinux := --emit-relocs
37endif
38CHECKFLAGS += -D__i386__
39
40KBUILD_CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return
41
42# prevent gcc from keeping the stack 16 byte aligned
43KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
44
45# CPU-specific tuning. Anything which can be shared with UML should go here.
46include $(srctree)/arch/x86/Makefile_32.cpu
47
48# temporary until string.h is fixed
49cflags-y += -ffreestanding
50
51# this works around some issues with generating unwind tables in older gccs
52# newer gccs do it by default
53cflags-y += -maccumulate-outgoing-args
54
55# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
56# a lot more stack due to the lack of sharing of stacklots:
57KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;)
58
59# do binutils support CFI?
60cflags-y += $(call as-instr,.cfi_startproc\n.cfi_rel_offset esp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
61KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_rel_offset esp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
62
63# is .cfi_signal_frame supported too?
64cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
65KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
66
67KBUILD_CFLAGS += $(cflags-y)
68
69# Default subarch .c files
70mcore-y := arch/x86/mach-default
71
72# Voyager subarch support
73mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager
74mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager
75
76# VISWS subarch support
77mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws
78mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws
79
80# NUMAQ subarch support
81mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq
82mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default
83
84# BIGSMP subarch support
85mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp
86mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default
87
88#Summit subarch support
89mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit
90mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default
91
92# generic subarchitecture
93mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-x86/mach-generic
94mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default
95core-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
96
97# ES7000 subarch support
98mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000
99mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default
100core-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/
101
102# Xen paravirtualization support
103core-$(CONFIG_XEN) += arch/x86/xen/
104
105# lguest paravirtualization support
106core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
107
108# default subarch .h files
109mflags-y += -Iinclude/asm-x86/mach-default
110
111head-y := arch/x86/kernel/head_32.o arch/x86/kernel/init_task.o
112
113libs-y += arch/x86/lib/
114core-y += arch/x86/kernel/ \
115 arch/x86/mm/ \
116 $(mcore-y)/ \
117 arch/x86/crypto/
118drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
119drivers-$(CONFIG_PCI) += arch/x86/pci/
120# must be linked after kernel/
121drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
122drivers-$(CONFIG_PM) += arch/x86/power/
123drivers-$(CONFIG_FB) += arch/x86/video/
124
125KBUILD_CFLAGS += $(mflags-y)
126KBUILD_AFLAGS += $(mflags-y)
127
128boot := arch/x86/boot
129
130PHONY += zImage bzImage compressed zlilo bzlilo \
131 zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
132
133all: bzImage
134
135# KBUILD_IMAGE specify target image being built
136 KBUILD_IMAGE := $(boot)/bzImage
137zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage
138
139zImage bzImage: vmlinux
140 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
141 $(Q)mkdir -p $(objtree)/arch/i386/boot
142 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/i386/boot/bzImage
143
144compressed: zImage
145
146zlilo bzlilo: vmlinux
147 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
148
149zdisk bzdisk: vmlinux
150 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
151
152fdimage fdimage144 fdimage288 isoimage: vmlinux
153 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
154
155install:
156 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
157
158archclean:
159 $(Q)rm -rf $(objtree)/arch/i386/boot
160 $(Q)$(MAKE) $(clean)=arch/x86/boot
161
162define archhelp
163 echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
164 echo ' install - Install kernel using'
165 echo ' (your) ~/bin/installkernel or'
166 echo ' (distribution) /sbin/installkernel or'
167 echo ' install to $$(INSTALL_PATH) and run lilo'
168 echo ' bzdisk - Create a boot floppy in /dev/fd0'
169 echo ' fdimage - Create a boot floppy image'
170 echo ' isoimage - Create a boot CD-ROM image'
171endef
172
173CLEAN_FILES += arch/x86/boot/fdimage \
174 arch/x86/boot/image.iso \
175 arch/x86/boot/mtools.conf
diff --git a/arch/x86/Makefile_64 b/arch/x86/Makefile_64
deleted file mode 100644
index a804860022e..00000000000
--- a/arch/x86/Makefile_64
+++ /dev/null
@@ -1,144 +0,0 @@
1#
2# x86_64 Makefile
3#
4# This file is included by the global makefile so that you can add your own
5# architecture-specific flags and dependencies. Remember to do have actions
6# for "archclean" and "archdep" for cleaning up and making dependencies for
7# this architecture
8#
9# This file is subject to the terms and conditions of the GNU General Public
10# License. See the file "COPYING" in the main directory of this archive
11# for more details.
12#
13# Copyright (C) 1994 by Linus Torvalds
14#
15# 19990713 Artur Skawina <skawina@geocities.com>
16# Added '-march' and '-mpreferred-stack-boundary' support
17# 20000913 Pavel Machek <pavel@suse.cz>
18# Converted for x86_64 architecture
19# 20010105 Andi Kleen, add IA32 compiler.
20# ....and later removed it again....
21#
22# $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $
23
24# BITS is used as extension for files which are available in a 32 bit
25# and a 64 bit version to simplify shared Makefiles.
26# e.g.: obj-y += foo_$(BITS).o
27BITS := 64
28export BITS
29
30LDFLAGS := -m elf_x86_64
31OBJCOPYFLAGS := -O binary -R .note -R .comment -S
32LDFLAGS_vmlinux :=
33CHECKFLAGS += -D__x86_64__ -m64
34
35cflags-y :=
36cflags-kernel-y :=
37cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
38cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
39# gcc doesn't support -march=core2 yet as of gcc 4.3, but I hope it
40# will eventually. Use -mtune=generic as fallback
41cflags-$(CONFIG_MCORE2) += \
42 $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
43cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
44
45cflags-y += -m64
46cflags-y += -mno-red-zone
47cflags-y += -mcmodel=kernel
48cflags-y += -pipe
49cflags-y += -Wno-sign-compare
50cflags-y += -fno-asynchronous-unwind-tables
51ifneq ($(CONFIG_DEBUG_INFO),y)
52# -fweb shrinks the kernel a bit, but the difference is very small
53# it also messes up debugging, so don't use it for now.
54#cflags-y += $(call cc-option,-fweb)
55endif
56# -funit-at-a-time shrinks the kernel .text considerably
57# unfortunately it makes reading oopses harder.
58cflags-y += $(call cc-option,-funit-at-a-time)
59# prevent gcc from generating any FP code by mistake
60cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
61# this works around some issues with generating unwind tables in older gccs
62# newer gccs do it by default
63cflags-y += -maccumulate-outgoing-args
64
65# do binutils support CFI?
66cflags-y += $(call as-instr,.cfi_startproc\n.cfi_rel_offset rsp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
67KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_rel_offset rsp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
68
69# is .cfi_signal_frame supported too?
70cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
71KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
72
73cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector )
74cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector-all )
75
76KBUILD_CFLAGS += $(cflags-y)
77CFLAGS_KERNEL += $(cflags-kernel-y)
78KBUILD_AFLAGS += -m64
79
80head-y := arch/x86/kernel/head_64.o arch/x86/kernel/head64.o arch/x86/kernel/init_task.o
81
82libs-y += arch/x86/lib/
83core-y += arch/x86/kernel/ \
84 arch/x86/mm/ \
85 arch/x86/crypto/ \
86 arch/x86/vdso/
87core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
88drivers-$(CONFIG_PCI) += arch/x86/pci/
89drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
90
91boot := arch/x86/boot
92
93PHONY += bzImage bzlilo install archmrproper \
94 fdimage fdimage144 fdimage288 isoimage archclean
95
96#Default target when executing "make"
97all: bzImage
98
99BOOTIMAGE := arch/x86/boot/bzImage
100KBUILD_IMAGE := $(BOOTIMAGE)
101
102bzImage: vmlinux
103 $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE)
104 $(Q)mkdir -p $(objtree)/arch/x86_64/boot
105 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/x86_64/boot/bzImage
106
107bzlilo: vmlinux
108 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zlilo
109
110bzdisk: vmlinux
111 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk
112
113fdimage fdimage144 fdimage288 isoimage: vmlinux
114 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
115
116install: vdso_install
117 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
118
119vdso_install:
120ifeq ($(CONFIG_IA32_EMULATION),y)
121 $(Q)$(MAKE) $(build)=arch/x86/ia32 $@
122endif
123 $(Q)$(MAKE) $(build)=arch/x86/vdso $@
124
125archclean:
126 $(Q)rm -rf $(objtree)/arch/x86_64/boot
127 $(Q)$(MAKE) $(clean)=$(boot)
128
129define archhelp
130 echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
131 echo ' install - Install kernel using'
132 echo ' (your) ~/bin/installkernel or'
133 echo ' (distribution) /sbin/installkernel or'
134 echo ' install to $$(INSTALL_PATH) and run lilo'
135 echo ' bzdisk - Create a boot floppy in /dev/fd0'
136 echo ' fdimage - Create a boot floppy image'
137 echo ' isoimage - Create a boot CD-ROM image'
138endef
139
140CLEAN_FILES += arch/x86/boot/fdimage \
141 arch/x86/boot/image.iso \
142 arch/x86/boot/mtools.conf
143
144
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 7a3116ccf38..f88458e83ef 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,11 +26,13 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
26#RAMDISK := -DRAMDISK=512 26#RAMDISK := -DRAMDISK=512
27 27
28targets := vmlinux.bin setup.bin setup.elf zImage bzImage 28targets := vmlinux.bin setup.bin setup.elf zImage bzImage
29subdir- := compressed 29subdir- := compressed
30 30
31setup-y += a20.o apm.o cmdline.o copy.o cpu.o cpucheck.o edd.o 31setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
32setup-y += header.o main.o mca.o memory.o pm.o pmjump.o 32setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
33setup-y += printf.o string.o tty.o video.o version.o voyager.o 33setup-y += printf.o string.o tty.o video.o version.o
34setup-$(CONFIG_X86_APM_BOOT) += apm.o
35setup-$(CONFIG_X86_VOYAGER) += voyager.o
34 36
35# The link order of the video-*.o modules can matter. In particular, 37# The link order of the video-*.o modules can matter. In particular,
36# video-vga.o *must* be listed first, followed by video-vesa.o. 38# video-vga.o *must* be listed first, followed by video-vesa.o.
@@ -41,18 +43,23 @@ setup-y += video-vesa.o
41setup-y += video-bios.o 43setup-y += video-bios.o
42 44
43targets += $(setup-y) 45targets += $(setup-y)
44hostprogs-y := tools/build 46hostprogs-y := mkcpustr tools/build
45 47
46HOSTCFLAGS_build.o := $(LINUXINCLUDE) 48HOST_EXTRACFLAGS += $(LINUXINCLUDE)
49
50$(obj)/cpu.o: $(obj)/cpustr.h
51
52quiet_cmd_cpustr = CPUSTR $@
53 cmd_cpustr = $(obj)/mkcpustr > $@
54targets += cpustr.h
55$(obj)/cpustr.h: $(obj)/mkcpustr FORCE
56 $(call if_changed,cpustr)
47 57
48# --------------------------------------------------------------------------- 58# ---------------------------------------------------------------------------
49 59
50# How to compile the 16-bit code. Note we always compile for -march=i386, 60# How to compile the 16-bit code. Note we always compile for -march=i386,
51# that way we can complain to the user if the CPU is insufficient. 61# that way we can complain to the user if the CPU is insufficient.
52cflags-$(CONFIG_X86_32) :=
53cflags-$(CONFIG_X86_64) := -m32
54KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ 62KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
55 $(cflags-y) \
56 -Wall -Wstrict-prototypes \ 63 -Wall -Wstrict-prototypes \
57 -march=i386 -mregparm=3 \ 64 -march=i386 -mregparm=3 \
58 -include $(srctree)/$(src)/code16gcc.h \ 65 -include $(srctree)/$(src)/code16gcc.h \
@@ -62,6 +69,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
62 $(call cc-option, -fno-unit-at-a-time)) \ 69 $(call cc-option, -fno-unit-at-a-time)) \
63 $(call cc-option, -fno-stack-protector) \ 70 $(call cc-option, -fno-stack-protector) \
64 $(call cc-option, -mpreferred-stack-boundary=2) 71 $(call cc-option, -mpreferred-stack-boundary=2)
72KBUILD_CFLAGS += $(call cc-option,-m32)
65KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
66 74
67$(obj)/zImage: IMAGE_OFFSET := 0x1000 75$(obj)/zImage: IMAGE_OFFSET := 0x1000
@@ -80,6 +88,7 @@ $(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \
80 $(call if_changed,image) 88 $(call if_changed,image)
81 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' 89 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
82 90
91OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
83$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE 92$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
84 $(call if_changed,objcopy) 93 $(call if_changed,objcopy)
85 94
@@ -90,7 +99,6 @@ $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
90 $(call if_changed,ld) 99 $(call if_changed,ld)
91 100
92OBJCOPYFLAGS_setup.bin := -O binary 101OBJCOPYFLAGS_setup.bin := -O binary
93
94$(obj)/setup.bin: $(obj)/setup.elf FORCE 102$(obj)/setup.bin: $(obj)/setup.elf FORCE
95 $(call if_changed,objcopy) 103 $(call if_changed,objcopy)
96 104
@@ -98,7 +106,7 @@ $(obj)/compressed/vmlinux: FORCE
98 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ 106 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
99 107
100# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel 108# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
101FDARGS = 109FDARGS =
102# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel 110# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel
103FDINITRD = 111FDINITRD =
104 112
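mkcpustr is a new host program: it runs on the build machine, writes a generated header to stdout, and the if_changed rule captures that output into cpustr.h, which is why cpu.o now depends on it. Purely as an illustration of that generate-then-include pattern (this is not mkcpustr's actual output or logic, and the flag names are just examples):

/* Host-side sketch: emit a C string table as a header on stdout. */
#include <stdio.h>

int main(void)
{
	static const char *names[] = { "fpu", "vme", "de", "pse" };
	unsigned int i;

	printf("static const char example_cap_strs[] =\n");
	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("\t\"%s\\0\"\n", names[i]);
	printf("\t;\n");
	return 0;
}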
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index eab50c55a3a..c117c7fb859 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -19,8 +19,6 @@
19 19
20#include "boot.h" 20#include "boot.h"
21 21
22#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
23
24int query_apm_bios(void) 22int query_apm_bios(void)
25{ 23{
26 u16 ax, bx, cx, dx, di; 24 u16 ax, bx, cx, dx, di;
@@ -95,4 +93,3 @@ int query_apm_bios(void)
95 return 0; 93 return 0;
96} 94}
97 95
98#endif
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index d2b5adf4651..7822a4983da 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -109,7 +109,7 @@ typedef unsigned int addr_t;
109static inline u8 rdfs8(addr_t addr) 109static inline u8 rdfs8(addr_t addr)
110{ 110{
111 u8 v; 111 u8 v;
112 asm volatile("movb %%fs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr)); 112 asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
113 return v; 113 return v;
114} 114}
115static inline u16 rdfs16(addr_t addr) 115static inline u16 rdfs16(addr_t addr)
@@ -127,21 +127,21 @@ static inline u32 rdfs32(addr_t addr)
127 127
128static inline void wrfs8(u8 v, addr_t addr) 128static inline void wrfs8(u8 v, addr_t addr)
129{ 129{
130 asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "r" (v)); 130 asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
131} 131}
132static inline void wrfs16(u16 v, addr_t addr) 132static inline void wrfs16(u16 v, addr_t addr)
133{ 133{
134 asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "r" (v)); 134 asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
135} 135}
136static inline void wrfs32(u32 v, addr_t addr) 136static inline void wrfs32(u32 v, addr_t addr)
137{ 137{
138 asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "r" (v)); 138 asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
139} 139}
140 140
141static inline u8 rdgs8(addr_t addr) 141static inline u8 rdgs8(addr_t addr)
142{ 142{
143 u8 v; 143 u8 v;
144 asm volatile("movb %%gs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr)); 144 asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
145 return v; 145 return v;
146} 146}
147static inline u16 rdgs16(addr_t addr) 147static inline u16 rdgs16(addr_t addr)
@@ -159,15 +159,15 @@ static inline u32 rdgs32(addr_t addr)
159 159
160static inline void wrgs8(u8 v, addr_t addr) 160static inline void wrgs8(u8 v, addr_t addr)
161{ 161{
162 asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "r" (v)); 162 asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
163} 163}
164static inline void wrgs16(u16 v, addr_t addr) 164static inline void wrgs16(u16 v, addr_t addr)
165{ 165{
166 asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "r" (v)); 166 asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
167} 167}
168static inline void wrgs32(u32 v, addr_t addr) 168static inline void wrgs32(u32 v, addr_t addr)
169{ 169{
170 asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "r" (v)); 170 asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
171} 171}
172 172
173/* Note: these only return true/false, not a signed return value! */ 173/* Note: these only return true/false, not a signed return value! */
@@ -241,6 +241,7 @@ int query_apm_bios(void);
241 241
242/* cmdline.c */ 242/* cmdline.c */
243int cmdline_find_option(const char *option, char *buffer, int bufsize); 243int cmdline_find_option(const char *option, char *buffer, int bufsize);
244int cmdline_find_option_bool(const char *option);
244 245
245/* cpu.c, cpucheck.c */ 246/* cpu.c, cpucheck.c */
246int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); 247int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
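The constraint changes in the rdfs/wrfs/rdgs/wrgs helpers matter on 32-bit x86: "=q"/"q" limits an 8-bit operand to registers that actually have byte forms (eax/ebx/ecx/edx), so gcc can no longer attempt a movb through %esi or %edi, and the added "i" lets stores of constants be emitted as immediates. A stand-alone illustration of the same constraints outside the boot code:

/* Stand-alone illustration of the "q" and "qi" constraints used above. */
static inline unsigned char load_byte(const unsigned char *p)
{
	unsigned char v;

	/* "=q": only byte-addressable registers (al/bl/cl/dl on 32-bit x86) */
	asm volatile("movb %1,%0" : "=q" (v) : "m" (*p));
	return v;
}

static inline void store_byte(unsigned char *p, unsigned char v)
{
	/* "qi": a byte register or an immediate constant */
	asm volatile("movb %1,%0" : "+m" (*p) : "qi" (v));
}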
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index 34bb778c435..680408a0f46 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -95,3 +95,68 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
95 95
96 return len; 96 return len;
97} 97}
98
99/*
100 * Find a boolean option (like quiet,noapic,nosmp....)
101 *
102 * Returns the position of that option (starts counting with 1)
103 * or 0 on not found
104 */
105int cmdline_find_option_bool(const char *option)
106{
107 u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
108 addr_t cptr;
109 char c;
110 int pos = 0, wstart = 0;
111 const char *opptr = NULL;
112 enum {
113 st_wordstart, /* Start of word/after whitespace */
114 st_wordcmp, /* Comparing this word */
115 st_wordskip, /* Miscompare, skip */
116 } state = st_wordstart;
117
118 if (!cmdline_ptr || cmdline_ptr >= 0x100000)
119 return -1; /* No command line, or inaccessible */
120
121 cptr = cmdline_ptr & 0xf;
122 set_fs(cmdline_ptr >> 4);
123
124 while (cptr < 0x10000) {
125 c = rdfs8(cptr++);
126 pos++;
127
128 switch (state) {
129 case st_wordstart:
130 if (!c)
131 return 0;
132 else if (myisspace(c))
133 break;
134
135 state = st_wordcmp;
136 opptr = option;
137 wstart = pos;
138 /* fall through */
139
140 case st_wordcmp:
141 if (!*opptr)
142 if (!c || myisspace(c))
143 return wstart;
144 else
145 state = st_wordskip;
146 else if (!c)
147 return 0;
148 else if (c != *opptr++)
149 state = st_wordskip;
150 break;
151
152 case st_wordskip:
153 if (!c)
154 return 0;
155 else if (myisspace(c))
156 state = st_wordstart;
157 break;
158 }
159 }
160
161 return 0; /* Buffer overrun */
162}
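cmdline_find_option_bool() complements cmdline_find_option(): instead of copying an option's value into a buffer, it looks for a bare word on the kernel command line and returns its 1-based position, 0 when the word is absent, or -1 when there is no accessible command line. A small usage sketch (the "quiet" flag and the wrapper around it are only examples):

/* Usage sketch for the new boolean lookup. */
static int quiet_requested(void)
{
	return cmdline_find_option_bool("quiet") > 0;
}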
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 52c1db85452..d2b9f3bb87c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -1,5 +1,64 @@
1#
2# linux/arch/x86/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_$(BITS).o misc.o piggy.o
8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
11cflags-$(CONFIG_X86_64) := -mcmodel=small
12KBUILD_CFLAGS += $(cflags-y)
13KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
14KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
15
16KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
17
18LDFLAGS := -m elf_$(UTS_MACHINE)
19LDFLAGS_vmlinux := -T
20
21$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
22 $(call if_changed,ld)
23 @:
24
25OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
26$(obj)/vmlinux.bin: vmlinux FORCE
27 $(call if_changed,objcopy)
28
29
1ifeq ($(CONFIG_X86_32),y) 30ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/boot/compressed/Makefile_32 31targets += vmlinux.bin.all vmlinux.relocs
32hostprogs-y := relocs
33
34quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
36$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
37 $(call if_changed,relocs)
38
39vmlinux.bin.all-y := $(obj)/vmlinux.bin
40vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
41quiet_cmd_relocbin = BUILD $@
42 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin)
45
46ifdef CONFIG_RELOCATABLE
47$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
48 $(call if_changed,gzip)
3else 49else
4include ${srctree}/arch/x86/boot/compressed/Makefile_64 50$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
51 $(call if_changed,gzip)
5endif 52endif
53LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
54
55else
56$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
57 $(call if_changed,gzip)
58
59LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
60endif
61
62
63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
64 $(call if_changed,ld)
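The 32- and 64-bit compressed-boot Makefiles are folded into one: piggy.o is produced by linking the gzip'd image through the shared vmlinux.scr linker script, which places it in .rodata.compressed and defines the input_data/input_data_end/input_len symbols bracketing it (output_len points at the last four bytes of the gzip stream, which in the gzip format hold the uncompressed length). On the C side the decompressor simply declares those symbols, as the misc.c code further down shows:

/* How the decompressor refers to the blob wrapped by vmlinux.scr via piggy.o. */
extern unsigned char input_data[];	/* start of the compressed kernel image */
extern int input_len;			/* its length, emitted by the script    */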
diff --git a/arch/x86/boot/compressed/Makefile_32 b/arch/x86/boot/compressed/Makefile_32
deleted file mode 100644
index e43ff7c56e6..00000000000
--- a/arch/x86/boot/compressed/Makefile_32
+++ /dev/null
@@ -1,50 +0,0 @@
1#
2# linux/arch/x86/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_32.o misc_32.o piggy.o \
8 vmlinux.bin.all vmlinux.relocs
9EXTRA_AFLAGS := -traditional
10
11LDFLAGS_vmlinux := -T
12hostprogs-y := relocs
13
14KBUILD_CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \
15 -fno-strict-aliasing -fPIC \
16 $(call cc-option,-ffreestanding) \
17 $(call cc-option,-fno-stack-protector)
18LDFLAGS := -m elf_i386
19
20$(obj)/vmlinux: $(src)/vmlinux_32.lds $(obj)/head_32.o $(obj)/misc_32.o $(obj)/piggy.o FORCE
21 $(call if_changed,ld)
22 @:
23
24$(obj)/vmlinux.bin: vmlinux FORCE
25 $(call if_changed,objcopy)
26
27quiet_cmd_relocs = RELOCS $@
28 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
29$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
30 $(call if_changed,relocs)
31
32vmlinux.bin.all-y := $(obj)/vmlinux.bin
33vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
34quiet_cmd_relocbin = BUILD $@
35 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
36$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
37 $(call if_changed,relocbin)
38
39ifdef CONFIG_RELOCATABLE
40$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
41 $(call if_changed,gzip)
42else
43$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
44 $(call if_changed,gzip)
45endif
46
47LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
48
49$(obj)/piggy.o: $(src)/vmlinux_32.scr $(obj)/vmlinux.bin.gz FORCE
50 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/Makefile_64 b/arch/x86/boot/compressed/Makefile_64
deleted file mode 100644
index 7801e8dd90b..00000000000
--- a/arch/x86/boot/compressed/Makefile_64
+++ /dev/null
@@ -1,30 +0,0 @@
1#
2# linux/arch/x86/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_64.o misc_64.o piggy.o
8
9KBUILD_CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \
10 -fno-strict-aliasing -fPIC -mcmodel=small \
11 $(call cc-option, -ffreestanding) \
12 $(call cc-option, -fno-stack-protector)
13KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
14LDFLAGS := -m elf_x86_64
15
16LDFLAGS_vmlinux := -T
17$(obj)/vmlinux: $(src)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o $(obj)/piggy.o FORCE
18 $(call if_changed,ld)
19 @:
20
21$(obj)/vmlinux.bin: vmlinux FORCE
22 $(call if_changed,objcopy)
23
24$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
25 $(call if_changed,gzip)
26
27LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
28
29$(obj)/piggy.o: $(obj)/vmlinux_64.scr $(obj)/vmlinux.bin.gz FORCE
30 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 1ccb38a7f0d..e8657b98c90 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -80,8 +80,8 @@ startup_32:
80 80
81#ifdef CONFIG_RELOCATABLE 81#ifdef CONFIG_RELOCATABLE
82 movl %ebp, %ebx 82 movl %ebp, %ebx
83 addl $(LARGE_PAGE_SIZE -1), %ebx 83 addl $(PMD_PAGE_SIZE -1), %ebx
84 andl $LARGE_PAGE_MASK, %ebx 84 andl $PMD_PAGE_MASK, %ebx
85#else 85#else
86 movl $CONFIG_PHYSICAL_START, %ebx 86 movl $CONFIG_PHYSICAL_START, %ebx
87#endif 87#endif
@@ -220,8 +220,8 @@ ENTRY(startup_64)
220 /* Start with the delta to where the kernel will run at. */ 220 /* Start with the delta to where the kernel will run at. */
221#ifdef CONFIG_RELOCATABLE 221#ifdef CONFIG_RELOCATABLE
222 leaq startup_32(%rip) /* - $startup_32 */, %rbp 222 leaq startup_32(%rip) /* - $startup_32 */, %rbp
223 addq $(LARGE_PAGE_SIZE - 1), %rbp 223 addq $(PMD_PAGE_SIZE - 1), %rbp
224 andq $LARGE_PAGE_MASK, %rbp 224 andq $PMD_PAGE_MASK, %rbp
225 movq %rbp, %rbx 225 movq %rbp, %rbx
226#else 226#else
227 movq $CONFIG_PHYSICAL_START, %rbp 227 movq $CONFIG_PHYSICAL_START, %rbp
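LARGE_PAGE_SIZE/LARGE_PAGE_MASK are renamed to PMD_PAGE_SIZE/PMD_PAGE_MASK without changing behaviour: the code still rounds the load address up to a 2 MiB boundary, which on x86_64 is one PMD entry. Roughly what those macros expand to (the exact in-tree definitions live in the asm page headers and may be spelled differently):

/* Rough equivalents for x86_64 (2 MiB PMD pages); illustrative, not verbatim. */
#define PMD_SHIFT	21
#define PMD_PAGE_SIZE	(1UL << PMD_SHIFT)		/* 0x200000 */
#define PMD_PAGE_MASK	(~(PMD_PAGE_SIZE - 1))

/* Rounding up to the next PMD boundary, as the assembly above does: */
#define PMD_ALIGN(x)	(((x) + PMD_PAGE_SIZE - 1) & PMD_PAGE_MASK)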
diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc.c
index b74d60d1b2f..8182e32c1b4 100644
--- a/arch/x86/boot/compressed/misc_32.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * misc.c 2 * misc.c
3 * 3 *
4 * This is a collection of several routines from gzip-1.0.3 4 * This is a collection of several routines from gzip-1.0.3
5 * adapted for Linux. 5 * adapted for Linux.
6 * 6 *
7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
@@ -9,9 +9,18 @@
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */ 10 */
11 11
12/*
13 * we have to be careful, because no indirections are allowed here, and
14 * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
15 * we just keep it from happening
16 */
12#undef CONFIG_PARAVIRT 17#undef CONFIG_PARAVIRT
18#ifdef CONFIG_X86_64
19#define _LINUX_STRING_H_ 1
20#define __LINUX_BITMAP_H 1
21#endif
22
13#include <linux/linkage.h> 23#include <linux/linkage.h>
14#include <linux/vmalloc.h>
15#include <linux/screen_info.h> 24#include <linux/screen_info.h>
16#include <asm/io.h> 25#include <asm/io.h>
17#include <asm/page.h> 26#include <asm/page.h>
@@ -186,10 +195,20 @@ static void *memcpy(void *dest, const void *src, unsigned n);
186 195
187static void putstr(const char *); 196static void putstr(const char *);
188 197
189static unsigned long free_mem_ptr; 198#ifdef CONFIG_X86_64
190static unsigned long free_mem_end_ptr; 199#define memptr long
200#else
201#define memptr unsigned
202#endif
203
204static memptr free_mem_ptr;
205static memptr free_mem_end_ptr;
191 206
207#ifdef CONFIG_X86_64
208#define HEAP_SIZE 0x7000
209#else
192#define HEAP_SIZE 0x4000 210#define HEAP_SIZE 0x4000
211#endif
193 212
194static char *vidmem = (char *)0xb8000; 213static char *vidmem = (char *)0xb8000;
195static int vidport; 214static int vidport;
@@ -230,7 +249,7 @@ static void gzip_mark(void **ptr)
230 249
231static void gzip_release(void **ptr) 250static void gzip_release(void **ptr)
232{ 251{
233 free_mem_ptr = (unsigned long) *ptr; 252 free_mem_ptr = (memptr) *ptr;
234} 253}
235 254
236static void scroll(void) 255static void scroll(void)
@@ -247,8 +266,10 @@ static void putstr(const char *s)
247 int x,y,pos; 266 int x,y,pos;
248 char c; 267 char c;
249 268
269#ifdef CONFIG_X86_32
250 if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0) 270 if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0)
251 return; 271 return;
272#endif
252 273
253 x = RM_SCREEN_INFO.orig_x; 274 x = RM_SCREEN_INFO.orig_x;
254 y = RM_SCREEN_INFO.orig_y; 275 y = RM_SCREEN_INFO.orig_y;
@@ -261,7 +282,7 @@ static void putstr(const char *s)
261 y--; 282 y--;
262 } 283 }
263 } else { 284 } else {
264 vidmem [ ( x + cols * y ) * 2 ] = c; 285 vidmem [(x + cols * y) * 2] = c;
265 if ( ++x >= cols ) { 286 if ( ++x >= cols ) {
266 x = 0; 287 x = 0;
267 if ( ++y >= lines ) { 288 if ( ++y >= lines ) {
@@ -276,16 +297,16 @@ static void putstr(const char *s)
276 RM_SCREEN_INFO.orig_y = y; 297 RM_SCREEN_INFO.orig_y = y;
277 298
278 pos = (x + cols * y) * 2; /* Update cursor position */ 299 pos = (x + cols * y) * 2; /* Update cursor position */
279 outb_p(14, vidport); 300 outb(14, vidport);
280 outb_p(0xff & (pos >> 9), vidport+1); 301 outb(0xff & (pos >> 9), vidport+1);
281 outb_p(15, vidport); 302 outb(15, vidport);
282 outb_p(0xff & (pos >> 1), vidport+1); 303 outb(0xff & (pos >> 1), vidport+1);
283} 304}
284 305
285static void* memset(void* s, int c, unsigned n) 306static void* memset(void* s, int c, unsigned n)
286{ 307{
287 int i; 308 int i;
288 char *ss = (char*)s; 309 char *ss = s;
289 310
290 for (i=0;i<n;i++) ss[i] = c; 311 for (i=0;i<n;i++) ss[i] = c;
291 return s; 312 return s;
@@ -294,7 +315,8 @@ static void* memset(void* s, int c, unsigned n)
294static void* memcpy(void* dest, const void* src, unsigned n) 315static void* memcpy(void* dest, const void* src, unsigned n)
295{ 316{
296 int i; 317 int i;
297 char *d = (char *)dest, *s = (char *)src; 318 const char *s = src;
319 char *d = dest;
298 320
299 for (i=0;i<n;i++) d[i] = s[i]; 321 for (i=0;i<n;i++) d[i] = s[i];
300 return dest; 322 return dest;
@@ -339,11 +361,13 @@ static void error(char *x)
339 putstr(x); 361 putstr(x);
340 putstr("\n\n -- System halted"); 362 putstr("\n\n -- System halted");
341 363
342 while(1); /* Halt */ 364 while (1)
365 asm("hlt");
343} 366}
344 367
345asmlinkage void decompress_kernel(void *rmode, unsigned long end, 368asmlinkage void decompress_kernel(void *rmode, memptr heap,
346 uch *input_data, unsigned long input_len, uch *output) 369 uch *input_data, unsigned long input_len,
370 uch *output)
347{ 371{
348 real_mode = rmode; 372 real_mode = rmode;
349 373
@@ -358,25 +382,32 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long end,
358 lines = RM_SCREEN_INFO.orig_video_lines; 382 lines = RM_SCREEN_INFO.orig_video_lines;
359 cols = RM_SCREEN_INFO.orig_video_cols; 383 cols = RM_SCREEN_INFO.orig_video_cols;
360 384
361 window = output; /* Output buffer (Normally at 1M) */ 385 window = output; /* Output buffer (Normally at 1M) */
362 free_mem_ptr = end; /* Heap */ 386 free_mem_ptr = heap; /* Heap */
363 free_mem_end_ptr = end + HEAP_SIZE; 387 free_mem_end_ptr = heap + HEAP_SIZE;
364 inbuf = input_data; /* Input buffer */ 388 inbuf = input_data; /* Input buffer */
365 insize = input_len; 389 insize = input_len;
366 inptr = 0; 390 inptr = 0;
367 391
392#ifdef CONFIG_X86_64
393 if ((ulg)output & (__KERNEL_ALIGN - 1))
394 error("Destination address not 2M aligned");
395 if ((ulg)output >= 0xffffffffffUL)
396 error("Destination address too large");
397#else
368 if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1)) 398 if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1))
369 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); 399 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
370 if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff)) 400 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
371 error("Destination address too large"); 401 error("Destination address too large");
372#ifndef CONFIG_RELOCATABLE 402#ifndef CONFIG_RELOCATABLE
373 if ((u32)output != LOAD_PHYSICAL_ADDR) 403 if ((u32)output != LOAD_PHYSICAL_ADDR)
374 error("Wrong destination address"); 404 error("Wrong destination address");
375#endif 405#endif
406#endif
376 407
377 makecrc(); 408 makecrc();
378 putstr("Uncompressing Linux... "); 409 putstr("\nDecompressing Linux... ");
379 gunzip(); 410 gunzip();
380 putstr("Ok, booting the kernel.\n"); 411 putstr("done.\nBooting the kernel.\n");
381 return; 412 return;
382} 413}
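With the 32-bit and 64-bit decompressors merged, decompress_kernel() now takes the heap start explicitly (as a memptr, which is 64-bit wide on x86_64) and the 64-bit build keeps its larger 0x7000 heap. The heap only feeds the decompressor's trivial bump allocator between free_mem_ptr and free_mem_end_ptr; a simplified version of that allocator, condensed from the malloc() visible in the removed misc_64.c later in this patch:

/* Simplified bump allocator over the [heap, heap + HEAP_SIZE) window. */
static void *bump_alloc(int size)
{
	void *p;

	free_mem_ptr = (free_mem_ptr + 3) & ~3;	/* keep 4-byte alignment */
	p = (void *)free_mem_ptr;
	free_mem_ptr += size;
	if (free_mem_ptr >= free_mem_end_ptr)
		error("Out of memory");
	return p;
}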
diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c
deleted file mode 100644
index 6ea015aa65e..00000000000
--- a/arch/x86/boot/compressed/misc_64.c
+++ /dev/null
@@ -1,371 +0,0 @@
1/*
2 * misc.c
3 *
4 * This is a collection of several routines from gzip-1.0.3
5 * adapted for Linux.
6 *
7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
8 * puts by Nick Holloway 1993, better puts by Martin Mares 1995
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */
11
12#define _LINUX_STRING_H_ 1
13#define __LINUX_BITMAP_H 1
14
15#include <linux/linkage.h>
16#include <linux/screen_info.h>
17#include <asm/io.h>
18#include <asm/page.h>
19
20/* WARNING!!
21 * This code is compiled with -fPIC and it is relocated dynamically
22 * at run time, but no relocation processing is performed.
23 * This means that it is not safe to place pointers in static structures.
24 */
25
26/*
27 * Getting to provable safe in place decompression is hard.
28 * Worst case behaviours need to be analyzed.
29 * Background information:
30 *
31 * The file layout is:
32 * magic[2]
33 * method[1]
34 * flags[1]
35 * timestamp[4]
36 * extraflags[1]
37 * os[1]
38 * compressed data blocks[N]
39 * crc[4] orig_len[4]
40 *
41 * resulting in 18 bytes of non compressed data overhead.
42 *
43 * Files divided into blocks
44 * 1 bit (last block flag)
45 * 2 bits (block type)
46 *
47 * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved.
48 * The smallest block type encoding is always used.
49 *
50 * stored:
51 * 32 bits length in bytes.
52 *
53 * fixed:
54 * magic fixed tree.
55 * symbols.
56 *
57 * dynamic:
58 * dynamic tree encoding.
59 * symbols.
60 *
61 *
62 * The buffer for decompression in place is the length of the
63 * uncompressed data, plus a small amount extra to keep the algorithm safe.
64 * The compressed data is placed at the end of the buffer. The output
65 * pointer is placed at the start of the buffer and the input pointer
66 * is placed where the compressed data starts. Problems will occur
67 * when the output pointer overruns the input pointer.
68 *
69 * The output pointer can only overrun the input pointer if the input
70 * pointer is moving faster than the output pointer. A condition only
71 * triggered by data whose compressed form is larger than the uncompressed
72 * form.
73 *
74 * The worst case at the block level is a growth of the compressed data
75 * of 5 bytes per 32767 bytes.
76 *
77 * The worst case internal to a compressed block is very hard to figure.
78 * The worst case can at least be boundined by having one bit that represents
79 * 32764 bytes and then all of the rest of the bytes representing the very
80 * very last byte.
81 *
82 * All of which is enough to compute an amount of extra data that is required
83 * to be safe. To avoid problems at the block level allocating 5 extra bytes
84 * per 32767 bytes of data is sufficient. To avoind problems internal to a block
85 * adding an extra 32767 bytes (the worst case uncompressed block size) is
86 * sufficient, to ensure that in the worst case the decompressed data for
87 * block will stop the byte before the compressed data for a block begins.
88 * To avoid problems with the compressed data's meta information an extra 18
89 * bytes are needed. Leading to the formula:
90 *
91 * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
92 *
93 * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
94 * Adding 32768 instead of 32767 just makes for round numbers.
95 * Adding the decompressor_size is necessary as it musht live after all
96 * of the data as well. Last I measured the decompressor is about 14K.
97 * 10K of actual data and 4K of bss.
98 *
99 */
100
101/*
102 * gzip declarations
103 */
104
105#define OF(args) args
106#define STATIC static
107
108#undef memset
109#undef memcpy
110#define memzero(s, n) memset ((s), 0, (n))
111
112typedef unsigned char uch;
113typedef unsigned short ush;
114typedef unsigned long ulg;
115
116#define WSIZE 0x80000000 /* Window size must be at least 32k,
117 * and a power of two
118 * We don't actually have a window just
119 * a huge output buffer so I report
120 * a 2G windows size, as that should
121 * always be larger than our output buffer.
122 */
123
124static uch *inbuf; /* input buffer */
125static uch *window; /* Sliding window buffer, (and final output buffer) */
126
127static unsigned insize; /* valid bytes in inbuf */
128static unsigned inptr; /* index of next byte to be processed in inbuf */
129static unsigned outcnt; /* bytes in output buffer */
130
131/* gzip flag byte */
132#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
133#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
134#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
135#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
136#define COMMENT 0x10 /* bit 4 set: file comment present */
137#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
138#define RESERVED 0xC0 /* bit 6,7: reserved */
139
140#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
141
142/* Diagnostic functions */
143#ifdef DEBUG
144# define Assert(cond,msg) {if(!(cond)) error(msg);}
145# define Trace(x) fprintf x
146# define Tracev(x) {if (verbose) fprintf x ;}
147# define Tracevv(x) {if (verbose>1) fprintf x ;}
148# define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
149# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
150#else
151# define Assert(cond,msg)
152# define Trace(x)
153# define Tracev(x)
154# define Tracevv(x)
155# define Tracec(c,x)
156# define Tracecv(c,x)
157#endif
158
159static int fill_inbuf(void);
160static void flush_window(void);
161static void error(char *m);
162static void gzip_mark(void **);
163static void gzip_release(void **);
164
165/*
166 * This is set up by the setup-routine at boot-time
167 */
168static unsigned char *real_mode; /* Pointer to real-mode data */
169
170#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
171#ifndef STANDARD_MEMORY_BIOS_CALL
172#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
173#endif
174#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))
175
176extern unsigned char input_data[];
177extern int input_len;
178
179static long bytes_out = 0;
180
181static void *malloc(int size);
182static void free(void *where);
183
184static void *memset(void *s, int c, unsigned n);
185static void *memcpy(void *dest, const void *src, unsigned n);
186
187static void putstr(const char *);
188
189static long free_mem_ptr;
190static long free_mem_end_ptr;
191
192#define HEAP_SIZE 0x7000
193
194static char *vidmem = (char *)0xb8000;
195static int vidport;
196static int lines, cols;
197
198#include "../../../../lib/inflate.c"
199
200static void *malloc(int size)
201{
202 void *p;
203
204 if (size <0) error("Malloc error");
205 if (free_mem_ptr <= 0) error("Memory error");
206
207 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
208
209 p = (void *)free_mem_ptr;
210 free_mem_ptr += size;
211
212 if (free_mem_ptr >= free_mem_end_ptr)
213 error("Out of memory");
214
215 return p;
216}
217
218static void free(void *where)
219{ /* Don't care */
220}
221
222static void gzip_mark(void **ptr)
223{
224 *ptr = (void *) free_mem_ptr;
225}
226
227static void gzip_release(void **ptr)
228{
229 free_mem_ptr = (long) *ptr;
230}
231
232static void scroll(void)
233{
234 int i;
235
236 memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
237 for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
238 vidmem[i] = ' ';
239}
240
241static void putstr(const char *s)
242{
243 int x,y,pos;
244 char c;
245
246 x = RM_SCREEN_INFO.orig_x;
247 y = RM_SCREEN_INFO.orig_y;
248
249 while ( ( c = *s++ ) != '\0' ) {
250 if ( c == '\n' ) {
251 x = 0;
252 if ( ++y >= lines ) {
253 scroll();
254 y--;
255 }
256 } else {
257 vidmem [ ( x + cols * y ) * 2 ] = c;
258 if ( ++x >= cols ) {
259 x = 0;
260 if ( ++y >= lines ) {
261 scroll();
262 y--;
263 }
264 }
265 }
266 }
267
268 RM_SCREEN_INFO.orig_x = x;
269 RM_SCREEN_INFO.orig_y = y;
270
271 pos = (x + cols * y) * 2; /* Update cursor position */
272 outb_p(14, vidport);
273 outb_p(0xff & (pos >> 9), vidport+1);
274 outb_p(15, vidport);
275 outb_p(0xff & (pos >> 1), vidport+1);
276}
277
278static void* memset(void* s, int c, unsigned n)
279{
280 int i;
281 char *ss = (char*)s;
282
283 for (i=0;i<n;i++) ss[i] = c;
284 return s;
285}
286
287static void* memcpy(void* dest, const void* src, unsigned n)
288{
289 int i;
290 char *d = (char *)dest, *s = (char *)src;
291
292 for (i=0;i<n;i++) d[i] = s[i];
293 return dest;
294}
295
296/* ===========================================================================
297 * Fill the input buffer. This is called only when the buffer is empty
298 * and at least one byte is really needed.
299 */
300static int fill_inbuf(void)
301{
302 error("ran out of input data");
303 return 0;
304}
305
306/* ===========================================================================
307 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
308 * (Used for the decompressed data only.)
309 */
310static void flush_window(void)
311{
312 /* With my window equal to my output buffer
313 * I only need to compute the crc here.
314 */
315 ulg c = crc; /* temporary variable */
316 unsigned n;
317 uch *in, ch;
318
319 in = window;
320 for (n = 0; n < outcnt; n++) {
321 ch = *in++;
322 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
323 }
324 crc = c;
325 bytes_out += (ulg)outcnt;
326 outcnt = 0;
327}
328
329static void error(char *x)
330{
331 putstr("\n\n");
332 putstr(x);
333 putstr("\n\n -- System halted");
334
335 while(1); /* Halt */
336}
337
338asmlinkage void decompress_kernel(void *rmode, unsigned long heap,
339 uch *input_data, unsigned long input_len, uch *output)
340{
341 real_mode = rmode;
342
343 if (RM_SCREEN_INFO.orig_video_mode == 7) {
344 vidmem = (char *) 0xb0000;
345 vidport = 0x3b4;
346 } else {
347 vidmem = (char *) 0xb8000;
348 vidport = 0x3d4;
349 }
350
351 lines = RM_SCREEN_INFO.orig_video_lines;
352 cols = RM_SCREEN_INFO.orig_video_cols;
353
354 window = output; /* Output buffer (Normally at 1M) */
355 free_mem_ptr = heap; /* Heap */
356 free_mem_end_ptr = heap + HEAP_SIZE;
357 inbuf = input_data; /* Input buffer */
358 insize = input_len;
359 inptr = 0;
360
361 if ((ulg)output & (__KERNEL_ALIGN - 1))
362 error("Destination address not 2M aligned");
363 if ((ulg)output >= 0xffffffffffUL)
364 error("Destination address too large");
365
366 makecrc();
367 putstr(".\nDecompressing Linux...");
368 gunzip();
369 putstr("done.\nBooting the kernel.\n");
370 return;
371}
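The long comment in this removed file boils the in-place decompression safety argument down to one formula: extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. A quick worked example with illustrative numbers:

/* Worked example of the slack formula quoted in the comment above. */
unsigned long uncompressed_size = 4 << 20;	/* say, a 4 MiB uncompressed kernel        */
unsigned long decompressor_size = 14 << 10;	/* ~14 KiB, the figure the comment cites   */
unsigned long extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size;
/* = 1024 + 32768 + 18 + 14336 = 48146 bytes of headroom past the output buffer */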
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index 7a0d00b2cf2..d01ea42187e 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -27,11 +27,6 @@ static unsigned long *relocs;
27 * absolute relocations present w.r.t these symbols. 27 * absolute relocations present w.r.t these symbols.
28 */ 28 */
29static const char* safe_abs_relocs[] = { 29static const char* safe_abs_relocs[] = {
30 "__kernel_vsyscall",
31 "__kernel_rt_sigreturn",
32 "__kernel_sigreturn",
33 "SYSENTER_RETURN",
34 "VDSO_NOTE_MASK",
35 "xen_irq_disable_direct_reloc", 30 "xen_irq_disable_direct_reloc",
36 "xen_save_fl_direct_reloc", 31 "xen_save_fl_direct_reloc",
37}; 32};
@@ -45,6 +40,8 @@ static int is_safe_abs_reloc(const char* sym_name)
45 /* Match found */ 40 /* Match found */
46 return 1; 41 return 1;
47 } 42 }
43 if (strncmp(sym_name, "VDSO", 4) == 0)
44 return 1;
48 if (strncmp(sym_name, "__crc_", 6) == 0) 45 if (strncmp(sym_name, "__crc_", 6) == 0)
49 return 1; 46 return 1;
50 return 0; 47 return 0;
diff --git a/arch/x86/boot/compressed/vmlinux_64.scr b/arch/x86/boot/compressed/vmlinux.scr
index bd1429ce193..f02382ae5c4 100644
--- a/arch/x86/boot/compressed/vmlinux_64.scr
+++ b/arch/x86/boot/compressed/vmlinux.scr
@@ -1,6 +1,6 @@
1SECTIONS 1SECTIONS
2{ 2{
3 .text.compressed : { 3 .rodata.compressed : {
4 input_len = .; 4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .; 5 LONG(input_data_end - input_data) input_data = .;
6 *(.data) 6 *(.data)
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
index cc4854f6c6c..bb3c48379c4 100644
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ b/arch/x86/boot/compressed/vmlinux_32.lds
@@ -3,17 +3,17 @@ OUTPUT_ARCH(i386)
3ENTRY(startup_32) 3ENTRY(startup_32)
4SECTIONS 4SECTIONS
5{ 5{
6 /* Be careful parts of head.S assume startup_32 is at 6 /* Be careful parts of head_32.S assume startup_32 is at
7 * address 0. 7 * address 0.
8 */ 8 */
9 . = 0 ; 9 . = 0;
10 .text.head : { 10 .text.head : {
11 _head = . ; 11 _head = . ;
12 *(.text.head) 12 *(.text.head)
13 _ehead = . ; 13 _ehead = . ;
14 } 14 }
15 .data.compressed : { 15 .rodata.compressed : {
16 *(.data.compressed) 16 *(.rodata.compressed)
17 } 17 }
18 .text : { 18 .text : {
19 _text = .; /* Text */ 19 _text = .; /* Text */
diff --git a/arch/x86/boot/compressed/vmlinux_32.scr b/arch/x86/boot/compressed/vmlinux_32.scr
deleted file mode 100644
index 707a88f7f29..00000000000
--- a/arch/x86/boot/compressed/vmlinux_32.scr
+++ /dev/null
@@ -1,10 +0,0 @@
1SECTIONS
2{
3 .data.compressed : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 output_len = . - 4;
8 input_data_end = .;
9 }
10}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux_64.lds
index 94c13e557fb..7e5c7209f6c 100644
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux_64.lds
@@ -3,15 +3,19 @@ OUTPUT_ARCH(i386:x86-64)
3ENTRY(startup_64) 3ENTRY(startup_64)
4SECTIONS 4SECTIONS
5{ 5{
6 /* Be careful parts of head.S assume startup_32 is at 6 /* Be careful parts of head_64.S assume startup_32 is at
7 * address 0. 7 * address 0.
8 */ 8 */
9 . = 0; 9 . = 0;
10 .text : { 10 .text.head : {
11 _head = . ; 11 _head = . ;
12 *(.text.head) 12 *(.text.head)
13 _ehead = . ; 13 _ehead = . ;
14 *(.text.compressed) 14 }
15 .rodata.compressed : {
16 *(.rodata.compressed)
17 }
18 .text : {
15 _text = .; /* Text */ 19 _text = .; /* Text */
16 *(.text) 20 *(.text)
17 *(.text.*) 21 *(.text.*)
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 2a5c32da585..00e19edd852 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -1,7 +1,7 @@
1/* -*- linux-c -*- ------------------------------------------------------- * 1/* -*- linux-c -*- ------------------------------------------------------- *
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5 * 5 *
6 * This file is part of the Linux kernel, and is made available under 6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 7 * the terms of the GNU General Public License version 2.
@@ -9,7 +9,7 @@
9 * ----------------------------------------------------------------------- */ 9 * ----------------------------------------------------------------------- */
10 10
11/* 11/*
12 * arch/i386/boot/cpu.c 12 * arch/x86/boot/cpu.c
13 * 13 *
14 * Check for obligatory CPU features and abort if the features are not 14 * Check for obligatory CPU features and abort if the features are not
15 * present. 15 * present.
@@ -19,6 +19,8 @@
19#include "bitops.h" 19#include "bitops.h"
20#include <asm/cpufeature.h> 20#include <asm/cpufeature.h>
21 21
22#include "cpustr.h"
23
22static char *cpu_name(int level) 24static char *cpu_name(int level)
23{ 25{
24 static char buf[6]; 26 static char buf[6];
@@ -35,6 +37,7 @@ int validate_cpu(void)
35{ 37{
36 u32 *err_flags; 38 u32 *err_flags;
37 int cpu_level, req_level; 39 int cpu_level, req_level;
40 const unsigned char *msg_strs;
38 41
39 check_cpu(&cpu_level, &req_level, &err_flags); 42 check_cpu(&cpu_level, &req_level, &err_flags);
40 43
@@ -51,13 +54,26 @@ int validate_cpu(void)
51 puts("This kernel requires the following features " 54 puts("This kernel requires the following features "
52 "not present on the CPU:\n"); 55 "not present on the CPU:\n");
53 56
57 msg_strs = (const unsigned char *)x86_cap_strs;
58
54 for (i = 0; i < NCAPINTS; i++) { 59 for (i = 0; i < NCAPINTS; i++) {
55 u32 e = err_flags[i]; 60 u32 e = err_flags[i];
56 61
57 for (j = 0; j < 32; j++) { 62 for (j = 0; j < 32; j++) {
58 if (e & 1) 63 int n = (i << 5)+j;
59 printf("%d:%d ", i, j); 64 if (*msg_strs < n) {
60 65 /* Skip to the next string */
66 do {
67 msg_strs++;
68 } while (*msg_strs);
69 msg_strs++;
70 }
71 if (e & 1) {
72 if (*msg_strs == n && msg_strs[1])
73 printf("%s ", msg_strs+1);
74 else
75 printf("%d:%d ", i, j);
76 }
61 e >>= 1; 77 e >>= 1;
62 } 78 }
63 } 79 }
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index bd138e442ec..8721dc46a0b 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -129,6 +129,7 @@ void query_edd(void)
129 char eddarg[8]; 129 char eddarg[8];
130 int do_mbr = 1; 130 int do_mbr = 1;
131 int do_edd = 1; 131 int do_edd = 1;
132 int be_quiet;
132 int devno; 133 int devno;
133 struct edd_info ei, *edp; 134 struct edd_info ei, *edp;
134 u32 *mbrptr; 135 u32 *mbrptr;
@@ -140,12 +141,21 @@ void query_edd(void)
140 do_edd = 0; 141 do_edd = 0;
141 } 142 }
142 143
144 be_quiet = cmdline_find_option_bool("quiet");
145
143 edp = boot_params.eddbuf; 146 edp = boot_params.eddbuf;
144 mbrptr = boot_params.edd_mbr_sig_buffer; 147 mbrptr = boot_params.edd_mbr_sig_buffer;
145 148
146 if (!do_edd) 149 if (!do_edd)
147 return; 150 return;
148 151
152 /* Bugs in OnBoard or AddOnCards Bios may hang the EDD probe,
153 * so give a hint if this happens.
154 */
155
156 if (!be_quiet)
157 printf("Probing EDD (edd=off to disable)... ");
158
149 for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) { 159 for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) {
150 /* 160 /*
151 * Scan the BIOS-supported hard disks and query EDD 161 * Scan the BIOS-supported hard disks and query EDD
@@ -162,6 +172,9 @@ void query_edd(void)
162 if (do_mbr && !read_mbr_sig(devno, &ei, mbrptr++)) 172 if (do_mbr && !read_mbr_sig(devno, &ei, mbrptr++))
163 boot_params.edd_mbr_sig_buf_entries = devno-0x80+1; 173 boot_params.edd_mbr_sig_buf_entries = devno-0x80+1;
164 } 174 }
175
176 if (!be_quiet)
177 printf("ok\n");
165} 178}
166 179
167#endif 180#endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 4cc5b0411db..64ad9016585 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -195,10 +195,13 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
195 # can be located anywhere in 195 # can be located anywhere in
196 # low memory 0x10000 or higher. 196 # low memory 0x10000 or higher.
197 197
198ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff 198ramdisk_max: .long 0x7fffffff
199 # (Header version 0x0203 or later) 199 # (Header version 0x0203 or later)
200 # The highest safe address for 200 # The highest safe address for
201 # the contents of an initrd 201 # the contents of an initrd
202 # The current kernel allows up to 4 GB,
203 # but leave it at 2 GB to avoid
204 # possible bootloader bugs.
202 205
203kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment 206kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
204 #required for protected mode 207 #required for protected mode
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 1f95750ede2..7828da5cfd0 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -100,20 +100,32 @@ static void set_bios_mode(void)
100#endif 100#endif
101} 101}
102 102
103void main(void) 103static void init_heap(void)
104{ 104{
105 /* First, copy the boot header into the "zeropage" */ 105 char *stack_end;
106 copy_boot_params();
107 106
108 /* End of heap check */
109 if (boot_params.hdr.loadflags & CAN_USE_HEAP) { 107 if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
110 heap_end = (char *)(boot_params.hdr.heap_end_ptr 108 asm("leal %P1(%%esp),%0"
111 +0x200-STACK_SIZE); 109 : "=r" (stack_end) : "i" (-STACK_SIZE));
110
111 heap_end = (char *)
112 ((size_t)boot_params.hdr.heap_end_ptr + 0x200);
113 if (heap_end > stack_end)
114 heap_end = stack_end;
112 } else { 115 } else {
113 /* Boot protocol 2.00 only, no heap available */ 116 /* Boot protocol 2.00 only, no heap available */
114 puts("WARNING: Ancient bootloader, some functionality " 117 puts("WARNING: Ancient bootloader, some functionality "
115 "may be limited!\n"); 118 "may be limited!\n");
116 } 119 }
120}
121
122void main(void)
123{
124 /* First, copy the boot header into the "zeropage" */
125 copy_boot_params();
126
127 /* End of heap check */
128 init_heap();
117 129
118 /* Make sure we have all the proper CPU support */ 130 /* Make sure we have all the proper CPU support */
119 if (validate_cpu()) { 131 if (validate_cpu()) {
@@ -131,9 +143,6 @@ void main(void)
131 /* Set keyboard repeat rate (why?) */ 143 /* Set keyboard repeat rate (why?) */
132 keyboard_set_repeat(); 144 keyboard_set_repeat();
133 145
134 /* Set the video mode */
135 set_video();
136
137 /* Query MCA information */ 146 /* Query MCA information */
138 query_mca(); 147 query_mca();
139 148
@@ -154,6 +163,10 @@ void main(void)
154#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 163#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
155 query_edd(); 164 query_edd();
156#endif 165#endif
166
167 /* Set the video mode */
168 set_video();
169
157 /* Do the last things and invoke protected mode */ 170 /* Do the last things and invoke protected mode */
158 go_to_protected_mode(); 171 go_to_protected_mode();
159} 172}
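Note on the main.c hunk above: the new init_heap() clamps the boot-protocol heap so it can never grow into the setup stack; the inline "leal -STACK_SIZE(%esp)" computes the bottom of that stack. The same clamping written out in plain C, a sketch only (the STACK_SIZE value here is illustrative, not taken from boot.h):

	#include <stddef.h>
	#include <stdint.h>

	#define STACK_SIZE 512			/* illustrative value only */

	/* The heap ends at the boot loader's heap_end_ptr + 0x200, but never
	 * past the bottom of our own stack. */
	static char *clamp_heap_end(char *esp, uint16_t heap_end_ptr)
	{
		char *stack_end = esp - STACK_SIZE;	/* what the leal computes */
		char *heap_end  = (char *)((size_t)heap_end_ptr + 0x200);

		return heap_end > stack_end ? stack_end : heap_end;
	}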
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
new file mode 100644
index 00000000000..bbe76953bae
--- /dev/null
+++ b/arch/x86/boot/mkcpustr.c
@@ -0,0 +1,49 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2008 rPath, Inc. - All Rights Reserved
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * This is a host program to preprocess the CPU strings into a
13 * compact format suitable for the setup code.
14 */
15
16#include <stdio.h>
17
18#include "../kernel/cpu/feature_names.c"
19
20#if NCAPFLAGS > 8
21# error "Need to adjust the boot code handling of CPUID strings"
22#endif
23
24int main(void)
25{
26 int i;
27 const char *str;
28
29 printf("static const char x86_cap_strs[] = \n");
30
31 for (i = 0; i < NCAPINTS*32; i++) {
32 str = x86_cap_flags[i];
33
34 if (i == NCAPINTS*32-1) {
35 /* The last entry must be unconditional; this
36 also consumes the compiler-added null character */
37 if (!str)
38 str = "";
39 printf("\t\"\\x%02x\"\"%s\"\n", i, str);
40 } else if (str) {
41 printf("#if REQUIRED_MASK%d & (1 << %d)\n"
42 "\t\"\\x%02x\"\"%s\\0\"\n"
43 "#endif\n",
44 i >> 5, i & 31, i, str);
45 }
46 }
47 printf("\t;\n");
48 return 0;
49}
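Note on mkcpustr.c above: the table it emits is the one validate_cpu() walks in the cpu.c hunk earlier — a sorted sequence of entries, each an index byte followed by a NUL-terminated feature name, with the final entry always emitted unconditionally. A self-contained sketch of a lookup over that format, using a small hypothetical table (the indices shown are the real ones for fpu, pae and xmm):

	#include <stdio.h>

	/* Hypothetical table in the mkcpustr format: "\x<index>" "name\0" ... */
	static const unsigned char example_cap_strs[] =
		"\x00" "fpu\0"
		"\x06" "pae\0"
		"\x19" "xmm\0";

	/* Advance the cursor past entries whose index byte is below n, the
	 * same way the loop in validate_cpu() does; assumes n never exceeds
	 * the index of the table's (unconditional) last entry. */
	static const char *cap_name(const unsigned char **cursor, int n)
	{
		const unsigned char *p = *cursor;

		while (*p < n) {
			do {
				p++;
			} while (*p);	/* skip past the name */
			p++;		/* and its terminating NUL */
		}
		*cursor = p;
		return (*p == n && p[1]) ? (const char *)(p + 1) : NULL;
	}

	int main(void)
	{
		const unsigned char *cur = example_cap_strs;

		printf("%s\n", cap_name(&cur, 0x06));	/* prints "pae" */
		return 0;
	}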
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 09fb342cc62..1a0f936c160 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -104,7 +104,7 @@ static void reset_coprocessor(void)
104 (((u64)(base & 0xff000000) << 32) | \ 104 (((u64)(base & 0xff000000) << 32) | \
105 ((u64)flags << 40) | \ 105 ((u64)flags << 40) | \
106 ((u64)(limit & 0x00ff0000) << 32) | \ 106 ((u64)(limit & 0x00ff0000) << 32) | \
107 ((u64)(base & 0x00ffff00) << 16) | \ 107 ((u64)(base & 0x00ffffff) << 16) | \
108 ((u64)(limit & 0x0000ffff))) 108 ((u64)(limit & 0x0000ffff)))
109 109
110struct gdt_ptr { 110struct gdt_ptr {
@@ -121,6 +121,10 @@ static void setup_gdt(void)
121 [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff), 121 [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
122 /* DS: data, read/write, 4 GB, base 0 */ 122 /* DS: data, read/write, 4 GB, base 0 */
123 [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff), 123 [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
124 /* TSS: 32-bit tss, 104 bytes, base 4096 */
125 /* We only have a TSS here to keep Intel VT happy;
126 we don't actually use it for anything. */
127 [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
124 }; 128 };
125 /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead 129 /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
126 of the gdt_ptr contents. Thus, make it static so it will 130 of the gdt_ptr contents. Thus, make it static so it will
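Note on the pm.c hunk above: the mask fix ((base & 0x00ffffff) instead of 0x00ffff00) matters because the new boot TSS is the first boot GDT entry with a non-zero base, so base bits 0-7 may no longer be dropped. A standalone sketch of the same packing, showing the descriptor the TSS entry produces:

	#include <stdint.h>
	#include <stdio.h>

	typedef uint64_t u64;

	/* Same packing as the boot GDT_ENTRY() macro after the mask fix:
	 * all 24 low base bits now land in descriptor bits 16..39. */
	#define GDT_ENTRY(flags, base, limit)			\
		(((u64)((base) & 0xff000000) << 32) |		\
		 ((u64)(flags) << 40) |				\
		 ((u64)((limit) & 0x00ff0000) << 32) |		\
		 ((u64)((base) & 0x00ffffff) << 16) |		\
		 ((u64)((limit) & 0x0000ffff)))

	int main(void)
	{
		/* The boot TSS added above: type 0x0089, base 4096, limit 103. */
		printf("%#018llx\n",
		       (unsigned long long)GDT_ENTRY(0x0089, 4096, 103));
		/* Prints 0x0000890010000067: limit 0x67 in bits 0..15, base
		 * 0x001000 in bits 16..39, access byte 0x89 in bits 40..47. */
		return 0;
	}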
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index fa6bed1fac1..f5402d51f7c 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <asm/boot.h> 17#include <asm/boot.h>
18#include <asm/processor-flags.h>
18#include <asm/segment.h> 19#include <asm/segment.h>
19 20
20 .text 21 .text
@@ -29,28 +30,55 @@
29 */ 30 */
30protected_mode_jump: 31protected_mode_jump:
31 movl %edx, %esi # Pointer to boot_params table 32 movl %edx, %esi # Pointer to boot_params table
32 movl %eax, 2f # Patch ljmpl instruction 33
34 xorl %ebx, %ebx
35 movw %cs, %bx
36 shll $4, %ebx
37 addl %ebx, 2f
33 38
34 movw $__BOOT_DS, %cx 39 movw $__BOOT_DS, %cx
35 xorl %ebx, %ebx # Per the 32-bit boot protocol 40 movw $__BOOT_TSS, %di
36 xorl %ebp, %ebp # Per the 32-bit boot protocol
37 xorl %edi, %edi # Per the 32-bit boot protocol
38 41
39 movl %cr0, %edx 42 movl %cr0, %edx
40 orb $1, %dl # Protected mode (PE) bit 43 orb $X86_CR0_PE, %dl # Protected mode
41 movl %edx, %cr0 44 movl %edx, %cr0
42 jmp 1f # Short jump to serialize on 386/486 45 jmp 1f # Short jump to serialize on 386/486
431: 461:
44 47
45 movw %cx, %ds 48 # Transition to 32-bit mode
46 movw %cx, %es
47 movw %cx, %fs
48 movw %cx, %gs
49 movw %cx, %ss
50
51 # Jump to the 32-bit entrypoint
52 .byte 0x66, 0xea # ljmpl opcode 49 .byte 0x66, 0xea # ljmpl opcode
532: .long 0 # offset 502: .long in_pm32 # offset
54 .word __BOOT_CS # segment 51 .word __BOOT_CS # segment
55 52
56 .size protected_mode_jump, .-protected_mode_jump 53 .size protected_mode_jump, .-protected_mode_jump
54
55 .code32
56 .type in_pm32, @function
57in_pm32:
58 # Set up data segments for flat 32-bit mode
59 movl %ecx, %ds
60 movl %ecx, %es
61 movl %ecx, %fs
62 movl %ecx, %gs
63 movl %ecx, %ss
64 # The 32-bit code sets up its own stack, but this way we do have
65 # a valid stack if some debugging hack wants to use it.
66 addl %ebx, %esp
67
68 # Set up TR to make Intel VT happy
69 ltr %di
70
71 # Clear registers to allow for future extensions to the
72 # 32-bit boot protocol
73 xorl %ecx, %ecx
74 xorl %edx, %edx
75 xorl %ebx, %ebx
76 xorl %ebp, %ebp
77 xorl %edi, %edi
78
79 # Set up LDTR to make Intel VT happy
80 lldt %cx
81
82 jmpl *%eax # Jump to the 32-bit entrypoint
83
84 .size in_pm32, .-in_pm32
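Note on the pmjump.S hunk above: instead of patching the ljmpl target with a 32-bit address handed in by the caller, the rewritten protected_mode_jump relocates the link-time offset of in_pm32 itself — %ebx holds the real-mode segment base (%cs << 4), and adding it to the stored offset yields the linear address the far jump needs. The same computation as a one-line sketch:

	#include <stdint.h>

	/* Real-mode linear address of seg:off -- what "shll $4, %ebx;
	 * addl %ebx, 2f" effectively adds to the in_pm32 offset. */
	static inline uint32_t real_mode_linear(uint16_t seg, uint32_t off)
	{
		return ((uint32_t)seg << 4) + off;
	}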
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index ed0672a8187..ff664a11709 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -104,6 +104,7 @@ static int bios_probe(void)
104 104
105 mi = GET_HEAP(struct mode_info, 1); 105 mi = GET_HEAP(struct mode_info, 1);
106 mi->mode = VIDEO_FIRST_BIOS+mode; 106 mi->mode = VIDEO_FIRST_BIOS+mode;
107 mi->depth = 0; /* text */
107 mi->x = rdfs16(0x44a); 108 mi->x = rdfs16(0x44a);
108 mi->y = rdfs8(0x484)+1; 109 mi->y = rdfs8(0x484)+1;
109 nmodes++; 110 nmodes++;
@@ -116,7 +117,7 @@ static int bios_probe(void)
116 117
117__videocard video_bios = 118__videocard video_bios =
118{ 119{
119 .card_name = "BIOS (scanned)", 120 .card_name = "BIOS",
120 .probe = bios_probe, 121 .probe = bios_probe,
121 .set_mode = bios_set_mode, 122 .set_mode = bios_set_mode,
122 .unsafe = 1, 123 .unsafe = 1,
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 4716b9a9635..662dd2f1306 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -79,20 +79,28 @@ static int vesa_probe(void)
79 /* Text Mode, TTY BIOS supported, 79 /* Text Mode, TTY BIOS supported,
80 supported by hardware */ 80 supported by hardware */
81 mi = GET_HEAP(struct mode_info, 1); 81 mi = GET_HEAP(struct mode_info, 1);
82 mi->mode = mode + VIDEO_FIRST_VESA; 82 mi->mode = mode + VIDEO_FIRST_VESA;
83 mi->x = vminfo.h_res; 83 mi->depth = 0; /* text */
84 mi->y = vminfo.v_res; 84 mi->x = vminfo.h_res;
85 mi->y = vminfo.v_res;
85 nmodes++; 86 nmodes++;
86 } else if ((vminfo.mode_attr & 0x99) == 0x99) { 87 } else if ((vminfo.mode_attr & 0x99) == 0x99 &&
88 (vminfo.memory_layout == 4 ||
89 vminfo.memory_layout == 6) &&
90 vminfo.memory_planes == 1) {
87#ifdef CONFIG_FB 91#ifdef CONFIG_FB
88 /* Graphics mode, color, linear frame buffer 92 /* Graphics mode, color, linear frame buffer
89 supported -- register the mode but hide from 93 supported. Only register the mode if
90 the menu. Only do this if framebuffer is 94 if framebuffer is configured, however,
91 configured, however, otherwise the user will 95 otherwise the user will be left without a screen.
92 be left without a screen. */ 96 We don't require CONFIG_FB_VESA, however, since
97 some of the other framebuffer drivers can use
98 this mode-setting, too. */
93 mi = GET_HEAP(struct mode_info, 1); 99 mi = GET_HEAP(struct mode_info, 1);
94 mi->mode = mode + VIDEO_FIRST_VESA; 100 mi->mode = mode + VIDEO_FIRST_VESA;
95 mi->x = mi->y = 0; 101 mi->depth = vminfo.bpp;
102 mi->x = vminfo.h_res;
103 mi->y = vminfo.v_res;
96 nmodes++; 104 nmodes++;
97#endif 105#endif
98 } 106 }
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index aef02f9ec0c..7259387b7d1 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -18,22 +18,22 @@
18#include "video.h" 18#include "video.h"
19 19
20static struct mode_info vga_modes[] = { 20static struct mode_info vga_modes[] = {
21 { VIDEO_80x25, 80, 25 }, 21 { VIDEO_80x25, 80, 25, 0 },
22 { VIDEO_8POINT, 80, 50 }, 22 { VIDEO_8POINT, 80, 50, 0 },
23 { VIDEO_80x43, 80, 43 }, 23 { VIDEO_80x43, 80, 43, 0 },
24 { VIDEO_80x28, 80, 28 }, 24 { VIDEO_80x28, 80, 28, 0 },
25 { VIDEO_80x30, 80, 30 }, 25 { VIDEO_80x30, 80, 30, 0 },
26 { VIDEO_80x34, 80, 34 }, 26 { VIDEO_80x34, 80, 34, 0 },
27 { VIDEO_80x60, 80, 60 }, 27 { VIDEO_80x60, 80, 60, 0 },
28}; 28};
29 29
30static struct mode_info ega_modes[] = { 30static struct mode_info ega_modes[] = {
31 { VIDEO_80x25, 80, 25 }, 31 { VIDEO_80x25, 80, 25, 0 },
32 { VIDEO_8POINT, 80, 43 }, 32 { VIDEO_8POINT, 80, 43, 0 },
33}; 33};
34 34
35static struct mode_info cga_modes[] = { 35static struct mode_info cga_modes[] = {
36 { VIDEO_80x25, 80, 25 }, 36 { VIDEO_80x25, 80, 25, 0 },
37}; 37};
38 38
39__videocard video_vga; 39__videocard video_vga;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index ad9712f0173..696d08f3843 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -293,13 +293,28 @@ static void display_menu(void)
293 struct mode_info *mi; 293 struct mode_info *mi;
294 char ch; 294 char ch;
295 int i; 295 int i;
296 int nmodes;
297 int modes_per_line;
298 int col;
296 299
297 puts("Mode: COLSxROWS:\n"); 300 nmodes = 0;
301 for (card = video_cards; card < video_cards_end; card++)
302 nmodes += card->nmodes;
298 303
304 modes_per_line = 1;
305 if (nmodes >= 20)
306 modes_per_line = 3;
307
308 for (col = 0; col < modes_per_line; col++)
309 puts("Mode: Resolution: Type: ");
310 putchar('\n');
311
312 col = 0;
299 ch = '0'; 313 ch = '0';
300 for (card = video_cards; card < video_cards_end; card++) { 314 for (card = video_cards; card < video_cards_end; card++) {
301 mi = card->modes; 315 mi = card->modes;
302 for (i = 0; i < card->nmodes; i++, mi++) { 316 for (i = 0; i < card->nmodes; i++, mi++) {
317 char resbuf[32];
303 int visible = mi->x && mi->y; 318 int visible = mi->x && mi->y;
304 u16 mode_id = mi->mode ? mi->mode : 319 u16 mode_id = mi->mode ? mi->mode :
305 (mi->y << 8)+mi->x; 320 (mi->y << 8)+mi->x;
@@ -307,8 +322,18 @@ static void display_menu(void)
307 if (!visible) 322 if (!visible)
308 continue; /* Hidden mode */ 323 continue; /* Hidden mode */
309 324
310 printf("%c %04X %3dx%-3d %s\n", 325 if (mi->depth)
311 ch, mode_id, mi->x, mi->y, card->card_name); 326 sprintf(resbuf, "%dx%d", mi->y, mi->depth);
327 else
328 sprintf(resbuf, "%d", mi->y);
329
330 printf("%c %03X %4dx%-7s %-6s",
331 ch, mode_id, mi->x, resbuf, card->card_name);
332 col++;
333 if (col >= modes_per_line) {
334 putchar('\n');
335 col = 0;
336 }
312 337
313 if (ch == '9') 338 if (ch == '9')
314 ch = 'a'; 339 ch = 'a';
@@ -318,6 +343,8 @@ static void display_menu(void)
318 ch++; 343 ch++;
319 } 344 }
320 } 345 }
346 if (col)
347 putchar('\n');
321} 348}
322 349
323#define H(x) ((x)-'a'+10) 350#define H(x) ((x)-'a'+10)
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index b92447d5121..d69347f79e8 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -83,7 +83,8 @@ void store_screen(void);
83 83
84struct mode_info { 84struct mode_info {
85 u16 mode; /* Mode number (vga= style) */ 85 u16 mode; /* Mode number (vga= style) */
86 u8 x, y; /* Width, height */ 86 u16 x, y; /* Width, height */
87 u16 depth; /* Bits per pixel, 0 for text mode */
87}; 88};
88 89
89struct card_info { 90struct card_info {
diff --git a/arch/x86/boot/voyager.c b/arch/x86/boot/voyager.c
index 61c8fe0453b..6499e3239b4 100644
--- a/arch/x86/boot/voyager.c
+++ b/arch/x86/boot/voyager.c
@@ -16,8 +16,6 @@
16 16
17#include "boot.h" 17#include "boot.h"
18 18
19#ifdef CONFIG_X86_VOYAGER
20
21int query_voyager(void) 19int query_voyager(void)
22{ 20{
23 u8 err; 21 u8 err;
@@ -42,5 +40,3 @@ int query_voyager(void)
42 copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */ 40 copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */
43 return 0; 41 return 0;
44} 42}
45
46#endif /* CONFIG_X86_VOYAGER */
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 54ee1764fda..77562e7cdab 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -99,9 +99,9 @@ CONFIG_IOSCHED_NOOP=y
99CONFIG_IOSCHED_AS=y 99CONFIG_IOSCHED_AS=y
100CONFIG_IOSCHED_DEADLINE=y 100CONFIG_IOSCHED_DEADLINE=y
101CONFIG_IOSCHED_CFQ=y 101CONFIG_IOSCHED_CFQ=y
102CONFIG_DEFAULT_AS=y 102# CONFIG_DEFAULT_AS is not set
103# CONFIG_DEFAULT_DEADLINE is not set 103# CONFIG_DEFAULT_DEADLINE is not set
104# CONFIG_DEFAULT_CFQ is not set 104CONFIG_DEFAULT_CFQ=y
105# CONFIG_DEFAULT_NOOP is not set 105# CONFIG_DEFAULT_NOOP is not set
106CONFIG_DEFAULT_IOSCHED="anticipatory" 106CONFIG_DEFAULT_IOSCHED="anticipatory"
107 107
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 38a83f9c966..9e2b0ef851d 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -145,15 +145,6 @@ CONFIG_K8_NUMA=y
145CONFIG_NODES_SHIFT=6 145CONFIG_NODES_SHIFT=6
146CONFIG_X86_64_ACPI_NUMA=y 146CONFIG_X86_64_ACPI_NUMA=y
147CONFIG_NUMA_EMU=y 147CONFIG_NUMA_EMU=y
148CONFIG_ARCH_DISCONTIGMEM_ENABLE=y
149CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y
150CONFIG_ARCH_SPARSEMEM_ENABLE=y
151CONFIG_SELECT_MEMORY_MODEL=y
152# CONFIG_FLATMEM_MANUAL is not set
153CONFIG_DISCONTIGMEM_MANUAL=y
154# CONFIG_SPARSEMEM_MANUAL is not set
155CONFIG_DISCONTIGMEM=y
156CONFIG_FLAT_NODE_MEM_MAP=y
157CONFIG_NEED_MULTIPLE_NODES=y 148CONFIG_NEED_MULTIPLE_NODES=y
158# CONFIG_SPARSEMEM_STATIC is not set 149# CONFIG_SPARSEMEM_STATIC is not set
159CONFIG_SPLIT_PTLOCK_CPUS=4 150CONFIG_SPLIT_PTLOCK_CPUS=4
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 46bb609e244..3874c2de540 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -4,12 +4,16 @@
4 4
5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
7 8
8obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o 9obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
9obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
10 12
11aes-i586-y := aes-i586-asm_32.o aes_32.o 13aes-i586-y := aes-i586-asm_32.o aes_glue.o
12twofish-i586-y := twofish-i586-asm_32.o twofish_32.o 14twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
15salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
13 16
14aes-x86_64-y := aes-x86_64-asm_64.o aes_64.o 17aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
15twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o 18twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
19salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
index f942f0c8f63..1093bede3e0 100644
--- a/arch/x86/crypto/aes-i586-asm_32.S
+++ b/arch/x86/crypto/aes-i586-asm_32.S
@@ -46,9 +46,9 @@
46#define in_blk 16 46#define in_blk 16
47 47
48/* offsets in crypto_tfm structure */ 48/* offsets in crypto_tfm structure */
49#define ekey (crypto_tfm_ctx_offset + 0) 49#define klen (crypto_tfm_ctx_offset + 0)
50#define nrnd (crypto_tfm_ctx_offset + 256) 50#define ekey (crypto_tfm_ctx_offset + 4)
51#define dkey (crypto_tfm_ctx_offset + 260) 51#define dkey (crypto_tfm_ctx_offset + 244)
52 52
53// register mapping for encrypt and decrypt subroutines 53// register mapping for encrypt and decrypt subroutines
54 54
@@ -221,8 +221,8 @@
221 221
222.global aes_enc_blk 222.global aes_enc_blk
223 223
224.extern ft_tab 224.extern crypto_ft_tab
225.extern fl_tab 225.extern crypto_fl_tab
226 226
227.align 4 227.align 4
228 228
@@ -236,7 +236,7 @@ aes_enc_blk:
2361: push %ebx 2361: push %ebx
237 mov in_blk+4(%esp),%r2 237 mov in_blk+4(%esp),%r2
238 push %esi 238 push %esi
239 mov nrnd(%ebp),%r3 // number of rounds 239 mov klen(%ebp),%r3 // key size
240 push %edi 240 push %edi
241#if ekey != 0 241#if ekey != 0
242 lea ekey(%ebp),%ebp // key pointer 242 lea ekey(%ebp),%ebp // key pointer
@@ -255,26 +255,26 @@ aes_enc_blk:
255 255
256 sub $8,%esp // space for register saves on stack 256 sub $8,%esp // space for register saves on stack
257 add $16,%ebp // increment to next round key 257 add $16,%ebp // increment to next round key
258 cmp $12,%r3 258 cmp $24,%r3
259 jb 4f // 10 rounds for 128-bit key 259 jb 4f // 10 rounds for 128-bit key
260 lea 32(%ebp),%ebp 260 lea 32(%ebp),%ebp
261 je 3f // 12 rounds for 192-bit key 261 je 3f // 12 rounds for 192-bit key
262 lea 32(%ebp),%ebp 262 lea 32(%ebp),%ebp
263 263
2642: fwd_rnd1( -64(%ebp) ,ft_tab) // 14 rounds for 256-bit key 2642: fwd_rnd1( -64(%ebp), crypto_ft_tab) // 14 rounds for 256-bit key
265 fwd_rnd2( -48(%ebp) ,ft_tab) 265 fwd_rnd2( -48(%ebp), crypto_ft_tab)
2663: fwd_rnd1( -32(%ebp) ,ft_tab) // 12 rounds for 192-bit key 2663: fwd_rnd1( -32(%ebp), crypto_ft_tab) // 12 rounds for 192-bit key
267 fwd_rnd2( -16(%ebp) ,ft_tab) 267 fwd_rnd2( -16(%ebp), crypto_ft_tab)
2684: fwd_rnd1( (%ebp) ,ft_tab) // 10 rounds for 128-bit key 2684: fwd_rnd1( (%ebp), crypto_ft_tab) // 10 rounds for 128-bit key
269 fwd_rnd2( +16(%ebp) ,ft_tab) 269 fwd_rnd2( +16(%ebp), crypto_ft_tab)
270 fwd_rnd1( +32(%ebp) ,ft_tab) 270 fwd_rnd1( +32(%ebp), crypto_ft_tab)
271 fwd_rnd2( +48(%ebp) ,ft_tab) 271 fwd_rnd2( +48(%ebp), crypto_ft_tab)
272 fwd_rnd1( +64(%ebp) ,ft_tab) 272 fwd_rnd1( +64(%ebp), crypto_ft_tab)
273 fwd_rnd2( +80(%ebp) ,ft_tab) 273 fwd_rnd2( +80(%ebp), crypto_ft_tab)
274 fwd_rnd1( +96(%ebp) ,ft_tab) 274 fwd_rnd1( +96(%ebp), crypto_ft_tab)
275 fwd_rnd2(+112(%ebp) ,ft_tab) 275 fwd_rnd2(+112(%ebp), crypto_ft_tab)
276 fwd_rnd1(+128(%ebp) ,ft_tab) 276 fwd_rnd1(+128(%ebp), crypto_ft_tab)
277 fwd_rnd2(+144(%ebp) ,fl_tab) // last round uses a different table 277 fwd_rnd2(+144(%ebp), crypto_fl_tab) // last round uses a different table
278 278
279// move final values to the output array. CAUTION: the 279// move final values to the output array. CAUTION: the
280// order of these assigns rely on the register mappings 280// order of these assigns rely on the register mappings
@@ -297,8 +297,8 @@ aes_enc_blk:
297 297
298.global aes_dec_blk 298.global aes_dec_blk
299 299
300.extern it_tab 300.extern crypto_it_tab
301.extern il_tab 301.extern crypto_il_tab
302 302
303.align 4 303.align 4
304 304
@@ -312,14 +312,11 @@ aes_dec_blk:
3121: push %ebx 3121: push %ebx
313 mov in_blk+4(%esp),%r2 313 mov in_blk+4(%esp),%r2
314 push %esi 314 push %esi
315 mov nrnd(%ebp),%r3 // number of rounds 315 mov klen(%ebp),%r3 // key size
316 push %edi 316 push %edi
317#if dkey != 0 317#if dkey != 0
318 lea dkey(%ebp),%ebp // key pointer 318 lea dkey(%ebp),%ebp // key pointer
319#endif 319#endif
320 mov %r3,%r0
321 shl $4,%r0
322 add %r0,%ebp
323 320
324// input four columns and xor in first round key 321// input four columns and xor in first round key
325 322
@@ -333,27 +330,27 @@ aes_dec_blk:
333 xor 12(%ebp),%r5 330 xor 12(%ebp),%r5
334 331
335 sub $8,%esp // space for register saves on stack 332 sub $8,%esp // space for register saves on stack
336 sub $16,%ebp // increment to next round key 333 add $16,%ebp // increment to next round key
337 cmp $12,%r3 334 cmp $24,%r3
338 jb 4f // 10 rounds for 128-bit key 335 jb 4f // 10 rounds for 128-bit key
339 lea -32(%ebp),%ebp 336 lea 32(%ebp),%ebp
340 je 3f // 12 rounds for 192-bit key 337 je 3f // 12 rounds for 192-bit key
341 lea -32(%ebp),%ebp 338 lea 32(%ebp),%ebp
342 339
3432: inv_rnd1( +64(%ebp), it_tab) // 14 rounds for 256-bit key 3402: inv_rnd1( -64(%ebp), crypto_it_tab) // 14 rounds for 256-bit key
344 inv_rnd2( +48(%ebp), it_tab) 341 inv_rnd2( -48(%ebp), crypto_it_tab)
3453: inv_rnd1( +32(%ebp), it_tab) // 12 rounds for 192-bit key 3423: inv_rnd1( -32(%ebp), crypto_it_tab) // 12 rounds for 192-bit key
346 inv_rnd2( +16(%ebp), it_tab) 343 inv_rnd2( -16(%ebp), crypto_it_tab)
3474: inv_rnd1( (%ebp), it_tab) // 10 rounds for 128-bit key 3444: inv_rnd1( (%ebp), crypto_it_tab) // 10 rounds for 128-bit key
348 inv_rnd2( -16(%ebp), it_tab) 345 inv_rnd2( +16(%ebp), crypto_it_tab)
349 inv_rnd1( -32(%ebp), it_tab) 346 inv_rnd1( +32(%ebp), crypto_it_tab)
350 inv_rnd2( -48(%ebp), it_tab) 347 inv_rnd2( +48(%ebp), crypto_it_tab)
351 inv_rnd1( -64(%ebp), it_tab) 348 inv_rnd1( +64(%ebp), crypto_it_tab)
352 inv_rnd2( -80(%ebp), it_tab) 349 inv_rnd2( +80(%ebp), crypto_it_tab)
353 inv_rnd1( -96(%ebp), it_tab) 350 inv_rnd1( +96(%ebp), crypto_it_tab)
354 inv_rnd2(-112(%ebp), it_tab) 351 inv_rnd2(+112(%ebp), crypto_it_tab)
355 inv_rnd1(-128(%ebp), it_tab) 352 inv_rnd1(+128(%ebp), crypto_it_tab)
356 inv_rnd2(-144(%ebp), il_tab) // last round uses a different table 353 inv_rnd2(+144(%ebp), crypto_il_tab) // last round uses a different table
357 354
358// move final values to the output array. CAUTION: the 355// move final values to the output array. CAUTION: the
359// order of these assigns rely on the register mappings 356// order of these assigns rely on the register mappings
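Note on the aes-i586-asm_32.S hunk above: the new klen/ekey/dkey offsets correspond to the shared AES context the glue code now uses (crypto_aes_ctx from <crypto/aes.h>). A sketch of that assumed layout and the resulting context-relative offsets; the struct shown here is an illustration, not copied from the header:

	#include <stddef.h>
	#include <stdio.h>

	typedef unsigned int u32;

	/* Assumed layout matching the offsets used by the assembler:
	 * key_length at +0 (klen), key_enc at +4 (ekey), key_dec at +244 (dkey). */
	struct crypto_aes_ctx_sketch {
		u32 key_length;		/* 16, 24 or 32 -- the klen the asm
					 * compares with $24 to pick the rounds */
		u32 key_enc[60];	/* expanded encryption key schedule */
		u32 key_dec[60];	/* expanded decryption key schedule */
	};

	int main(void)
	{
		printf("klen=%zu ekey=%zu dkey=%zu\n",
		       offsetof(struct crypto_aes_ctx_sketch, key_length),
		       offsetof(struct crypto_aes_ctx_sketch, key_enc),
		       offsetof(struct crypto_aes_ctx_sketch, key_dec));
		/* Prints klen=0 ekey=4 dkey=244 (u32 members, no padding). */
		return 0;
	}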
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
index 26b40de4d0b..a120f526c3d 100644
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -8,10 +8,10 @@
8 * including this sentence is retained in full. 8 * including this sentence is retained in full.
9 */ 9 */
10 10
11.extern aes_ft_tab 11.extern crypto_ft_tab
12.extern aes_it_tab 12.extern crypto_it_tab
13.extern aes_fl_tab 13.extern crypto_fl_tab
14.extern aes_il_tab 14.extern crypto_il_tab
15 15
16.text 16.text
17 17
@@ -56,13 +56,13 @@
56 .align 8; \ 56 .align 8; \
57FUNC: movq r1,r2; \ 57FUNC: movq r1,r2; \
58 movq r3,r4; \ 58 movq r3,r4; \
59 leaq BASE+KEY+52(r8),r9; \ 59 leaq BASE+KEY+48+4(r8),r9; \
60 movq r10,r11; \ 60 movq r10,r11; \
61 movl (r7),r5 ## E; \ 61 movl (r7),r5 ## E; \
62 movl 4(r7),r1 ## E; \ 62 movl 4(r7),r1 ## E; \
63 movl 8(r7),r6 ## E; \ 63 movl 8(r7),r6 ## E; \
64 movl 12(r7),r7 ## E; \ 64 movl 12(r7),r7 ## E; \
65 movl BASE(r8),r10 ## E; \ 65 movl BASE+0(r8),r10 ## E; \
66 xorl -48(r9),r5 ## E; \ 66 xorl -48(r9),r5 ## E; \
67 xorl -44(r9),r1 ## E; \ 67 xorl -44(r9),r1 ## E; \
68 xorl -40(r9),r6 ## E; \ 68 xorl -40(r9),r6 ## E; \
@@ -154,37 +154,37 @@ FUNC: movq r1,r2; \
154/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ 154/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
155 155
156 entry(aes_enc_blk,0,enc128,enc192) 156 entry(aes_enc_blk,0,enc128,enc192)
157 encrypt_round(aes_ft_tab,-96) 157 encrypt_round(crypto_ft_tab,-96)
158 encrypt_round(aes_ft_tab,-80) 158 encrypt_round(crypto_ft_tab,-80)
159enc192: encrypt_round(aes_ft_tab,-64) 159enc192: encrypt_round(crypto_ft_tab,-64)
160 encrypt_round(aes_ft_tab,-48) 160 encrypt_round(crypto_ft_tab,-48)
161enc128: encrypt_round(aes_ft_tab,-32) 161enc128: encrypt_round(crypto_ft_tab,-32)
162 encrypt_round(aes_ft_tab,-16) 162 encrypt_round(crypto_ft_tab,-16)
163 encrypt_round(aes_ft_tab, 0) 163 encrypt_round(crypto_ft_tab, 0)
164 encrypt_round(aes_ft_tab, 16) 164 encrypt_round(crypto_ft_tab, 16)
165 encrypt_round(aes_ft_tab, 32) 165 encrypt_round(crypto_ft_tab, 32)
166 encrypt_round(aes_ft_tab, 48) 166 encrypt_round(crypto_ft_tab, 48)
167 encrypt_round(aes_ft_tab, 64) 167 encrypt_round(crypto_ft_tab, 64)
168 encrypt_round(aes_ft_tab, 80) 168 encrypt_round(crypto_ft_tab, 80)
169 encrypt_round(aes_ft_tab, 96) 169 encrypt_round(crypto_ft_tab, 96)
170 encrypt_final(aes_fl_tab,112) 170 encrypt_final(crypto_fl_tab,112)
171 return 171 return
172 172
173/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ 173/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
174 174
175 entry(aes_dec_blk,240,dec128,dec192) 175 entry(aes_dec_blk,240,dec128,dec192)
176 decrypt_round(aes_it_tab,-96) 176 decrypt_round(crypto_it_tab,-96)
177 decrypt_round(aes_it_tab,-80) 177 decrypt_round(crypto_it_tab,-80)
178dec192: decrypt_round(aes_it_tab,-64) 178dec192: decrypt_round(crypto_it_tab,-64)
179 decrypt_round(aes_it_tab,-48) 179 decrypt_round(crypto_it_tab,-48)
180dec128: decrypt_round(aes_it_tab,-32) 180dec128: decrypt_round(crypto_it_tab,-32)
181 decrypt_round(aes_it_tab,-16) 181 decrypt_round(crypto_it_tab,-16)
182 decrypt_round(aes_it_tab, 0) 182 decrypt_round(crypto_it_tab, 0)
183 decrypt_round(aes_it_tab, 16) 183 decrypt_round(crypto_it_tab, 16)
184 decrypt_round(aes_it_tab, 32) 184 decrypt_round(crypto_it_tab, 32)
185 decrypt_round(aes_it_tab, 48) 185 decrypt_round(crypto_it_tab, 48)
186 decrypt_round(aes_it_tab, 64) 186 decrypt_round(crypto_it_tab, 64)
187 decrypt_round(aes_it_tab, 80) 187 decrypt_round(crypto_it_tab, 80)
188 decrypt_round(aes_it_tab, 96) 188 decrypt_round(crypto_it_tab, 96)
189 decrypt_final(aes_il_tab,112) 189 decrypt_final(crypto_il_tab,112)
190 return 190 return
diff --git a/arch/x86/crypto/aes_32.c b/arch/x86/crypto/aes_32.c
deleted file mode 100644
index 49aad9397f1..00000000000
--- a/arch/x86/crypto/aes_32.c
+++ /dev/null
@@ -1,515 +0,0 @@
1/*
2 *
3 * Glue Code for optimized 586 assembler version of AES
4 *
5 * Copyright (c) 2002, Dr Brian Gladman <>, Worcester, UK.
6 * All rights reserved.
7 *
8 * LICENSE TERMS
9 *
10 * The free distribution and use of this software in both source and binary
11 * form is allowed (with or without changes) provided that:
12 *
13 * 1. distributions of this source code include the above copyright
14 * notice, this list of conditions and the following disclaimer;
15 *
16 * 2. distributions in binary form include the above copyright
17 * notice, this list of conditions and the following disclaimer
18 * in the documentation and/or other associated materials;
19 *
20 * 3. the copyright holder's name is not used to endorse products
21 * built using this software without specific written permission.
22 *
23 * ALTERNATIVELY, provided that this notice is retained in full, this product
24 * may be distributed under the terms of the GNU General Public License (GPL),
25 * in which case the provisions of the GPL apply INSTEAD OF those given above.
26 *
27 * DISCLAIMER
28 *
29 * This software is provided 'as is' with no explicit or implied warranties
30 * in respect of its properties, including, but not limited to, correctness
31 * and/or fitness for purpose.
32 *
33 * Copyright (c) 2003, Adam J. Richter <adam@yggdrasil.com> (conversion to
34 * 2.5 API).
35 * Copyright (c) 2003, 2004 Fruhwirth Clemens <clemens@endorphin.org>
36 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
37 *
38 */
39
40#include <asm/byteorder.h>
41#include <linux/kernel.h>
42#include <linux/module.h>
43#include <linux/init.h>
44#include <linux/types.h>
45#include <linux/crypto.h>
46#include <linux/linkage.h>
47
48asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
49asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
50
51#define AES_MIN_KEY_SIZE 16
52#define AES_MAX_KEY_SIZE 32
53#define AES_BLOCK_SIZE 16
54#define AES_KS_LENGTH 4 * AES_BLOCK_SIZE
55#define RC_LENGTH 29
56
57struct aes_ctx {
58 u32 ekey[AES_KS_LENGTH];
59 u32 rounds;
60 u32 dkey[AES_KS_LENGTH];
61};
62
63#define WPOLY 0x011b
64#define bytes2word(b0, b1, b2, b3) \
65 (((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0))
66
67/* define the finite field multiplies required for Rijndael */
68#define f2(x) ((x) ? pow[log[x] + 0x19] : 0)
69#define f3(x) ((x) ? pow[log[x] + 0x01] : 0)
70#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0)
71#define fb(x) ((x) ? pow[log[x] + 0x68] : 0)
72#define fd(x) ((x) ? pow[log[x] + 0xee] : 0)
73#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0)
74#define fi(x) ((x) ? pow[255 - log[x]]: 0)
75
76static inline u32 upr(u32 x, int n)
77{
78 return (x << 8 * n) | (x >> (32 - 8 * n));
79}
80
81static inline u8 bval(u32 x, int n)
82{
83 return x >> 8 * n;
84}
85
86/* The forward and inverse affine transformations used in the S-box */
87#define fwd_affine(x) \
88 (w = (u32)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(u8)(w^(w>>8)))
89
90#define inv_affine(x) \
91 (w = (u32)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(u8)(w^(w>>8)))
92
93static u32 rcon_tab[RC_LENGTH];
94
95u32 ft_tab[4][256];
96u32 fl_tab[4][256];
97static u32 im_tab[4][256];
98u32 il_tab[4][256];
99u32 it_tab[4][256];
100
101static void gen_tabs(void)
102{
103 u32 i, w;
104 u8 pow[512], log[256];
105
106 /*
107 * log and power tables for GF(2^8) finite field with
108 * WPOLY as modular polynomial - the simplest primitive
109 * root is 0x03, used here to generate the tables.
110 */
111 i = 0; w = 1;
112
113 do {
114 pow[i] = (u8)w;
115 pow[i + 255] = (u8)w;
116 log[w] = (u8)i++;
117 w ^= (w << 1) ^ (w & 0x80 ? WPOLY : 0);
118 } while (w != 1);
119
120 for(i = 0, w = 1; i < RC_LENGTH; ++i) {
121 rcon_tab[i] = bytes2word(w, 0, 0, 0);
122 w = f2(w);
123 }
124
125 for(i = 0; i < 256; ++i) {
126 u8 b;
127
128 b = fwd_affine(fi((u8)i));
129 w = bytes2word(f2(b), b, b, f3(b));
130
131 /* tables for a normal encryption round */
132 ft_tab[0][i] = w;
133 ft_tab[1][i] = upr(w, 1);
134 ft_tab[2][i] = upr(w, 2);
135 ft_tab[3][i] = upr(w, 3);
136 w = bytes2word(b, 0, 0, 0);
137
138 /*
139 * tables for last encryption round
140 * (may also be used in the key schedule)
141 */
142 fl_tab[0][i] = w;
143 fl_tab[1][i] = upr(w, 1);
144 fl_tab[2][i] = upr(w, 2);
145 fl_tab[3][i] = upr(w, 3);
146
147 b = fi(inv_affine((u8)i));
148 w = bytes2word(fe(b), f9(b), fd(b), fb(b));
149
150 /* tables for the inverse mix column operation */
151 im_tab[0][b] = w;
152 im_tab[1][b] = upr(w, 1);
153 im_tab[2][b] = upr(w, 2);
154 im_tab[3][b] = upr(w, 3);
155
156 /* tables for a normal decryption round */
157 it_tab[0][i] = w;
158 it_tab[1][i] = upr(w,1);
159 it_tab[2][i] = upr(w,2);
160 it_tab[3][i] = upr(w,3);
161
162 w = bytes2word(b, 0, 0, 0);
163
164 /* tables for last decryption round */
165 il_tab[0][i] = w;
166 il_tab[1][i] = upr(w,1);
167 il_tab[2][i] = upr(w,2);
168 il_tab[3][i] = upr(w,3);
169 }
170}
171
172#define four_tables(x,tab,vf,rf,c) \
173( tab[0][bval(vf(x,0,c),rf(0,c))] ^ \
174 tab[1][bval(vf(x,1,c),rf(1,c))] ^ \
175 tab[2][bval(vf(x,2,c),rf(2,c))] ^ \
176 tab[3][bval(vf(x,3,c),rf(3,c))] \
177)
178
179#define vf1(x,r,c) (x)
180#define rf1(r,c) (r)
181#define rf2(r,c) ((r-c)&3)
182
183#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0)
184#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c)
185
186#define ff(x) inv_mcol(x)
187
188#define ke4(k,i) \
189{ \
190 k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \
191 k[4*(i)+5] = ss[1] ^= ss[0]; \
192 k[4*(i)+6] = ss[2] ^= ss[1]; \
193 k[4*(i)+7] = ss[3] ^= ss[2]; \
194}
195
196#define kel4(k,i) \
197{ \
198 k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \
199 k[4*(i)+5] = ss[1] ^= ss[0]; \
200 k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \
201}
202
203#define ke6(k,i) \
204{ \
205 k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
206 k[6*(i)+ 7] = ss[1] ^= ss[0]; \
207 k[6*(i)+ 8] = ss[2] ^= ss[1]; \
208 k[6*(i)+ 9] = ss[3] ^= ss[2]; \
209 k[6*(i)+10] = ss[4] ^= ss[3]; \
210 k[6*(i)+11] = ss[5] ^= ss[4]; \
211}
212
213#define kel6(k,i) \
214{ \
215 k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
216 k[6*(i)+ 7] = ss[1] ^= ss[0]; \
217 k[6*(i)+ 8] = ss[2] ^= ss[1]; \
218 k[6*(i)+ 9] = ss[3] ^= ss[2]; \
219}
220
221#define ke8(k,i) \
222{ \
223 k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
224 k[8*(i)+ 9] = ss[1] ^= ss[0]; \
225 k[8*(i)+10] = ss[2] ^= ss[1]; \
226 k[8*(i)+11] = ss[3] ^= ss[2]; \
227 k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); \
228 k[8*(i)+13] = ss[5] ^= ss[4]; \
229 k[8*(i)+14] = ss[6] ^= ss[5]; \
230 k[8*(i)+15] = ss[7] ^= ss[6]; \
231}
232
233#define kel8(k,i) \
234{ \
235 k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
236 k[8*(i)+ 9] = ss[1] ^= ss[0]; \
237 k[8*(i)+10] = ss[2] ^= ss[1]; \
238 k[8*(i)+11] = ss[3] ^= ss[2]; \
239}
240
241#define kdf4(k,i) \
242{ \
243 ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \
244 ss[1] = ss[1] ^ ss[3]; \
245 ss[2] = ss[2] ^ ss[3]; \
246 ss[3] = ss[3]; \
247 ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \
248 ss[i % 4] ^= ss[4]; \
249 ss[4] ^= k[4*(i)]; \
250 k[4*(i)+4] = ff(ss[4]); \
251 ss[4] ^= k[4*(i)+1]; \
252 k[4*(i)+5] = ff(ss[4]); \
253 ss[4] ^= k[4*(i)+2]; \
254 k[4*(i)+6] = ff(ss[4]); \
255 ss[4] ^= k[4*(i)+3]; \
256 k[4*(i)+7] = ff(ss[4]); \
257}
258
259#define kd4(k,i) \
260{ \
261 ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \
262 ss[i % 4] ^= ss[4]; \
263 ss[4] = ff(ss[4]); \
264 k[4*(i)+4] = ss[4] ^= k[4*(i)]; \
265 k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \
266 k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; \
267 k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \
268}
269
270#define kdl4(k,i) \
271{ \
272 ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \
273 ss[i % 4] ^= ss[4]; \
274 k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \
275 k[4*(i)+5] = ss[1] ^ ss[3]; \
276 k[4*(i)+6] = ss[0]; \
277 k[4*(i)+7] = ss[1]; \
278}
279
280#define kdf6(k,i) \
281{ \
282 ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
283 k[6*(i)+ 6] = ff(ss[0]); \
284 ss[1] ^= ss[0]; \
285 k[6*(i)+ 7] = ff(ss[1]); \
286 ss[2] ^= ss[1]; \
287 k[6*(i)+ 8] = ff(ss[2]); \
288 ss[3] ^= ss[2]; \
289 k[6*(i)+ 9] = ff(ss[3]); \
290 ss[4] ^= ss[3]; \
291 k[6*(i)+10] = ff(ss[4]); \
292 ss[5] ^= ss[4]; \
293 k[6*(i)+11] = ff(ss[5]); \
294}
295
296#define kd6(k,i) \
297{ \
298 ss[6] = ls_box(ss[5],3) ^ rcon_tab[i]; \
299 ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \
300 k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \
301 ss[1] ^= ss[0]; \
302 k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \
303 ss[2] ^= ss[1]; \
304 k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \
305 ss[3] ^= ss[2]; \
306 k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \
307 ss[4] ^= ss[3]; \
308 k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \
309 ss[5] ^= ss[4]; \
310 k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \
311}
312
313#define kdl6(k,i) \
314{ \
315 ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
316 k[6*(i)+ 6] = ss[0]; \
317 ss[1] ^= ss[0]; \
318 k[6*(i)+ 7] = ss[1]; \
319 ss[2] ^= ss[1]; \
320 k[6*(i)+ 8] = ss[2]; \
321 ss[3] ^= ss[2]; \
322 k[6*(i)+ 9] = ss[3]; \
323}
324
325#define kdf8(k,i) \
326{ \
327 ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
328 k[8*(i)+ 8] = ff(ss[0]); \
329 ss[1] ^= ss[0]; \
330 k[8*(i)+ 9] = ff(ss[1]); \
331 ss[2] ^= ss[1]; \
332 k[8*(i)+10] = ff(ss[2]); \
333 ss[3] ^= ss[2]; \
334 k[8*(i)+11] = ff(ss[3]); \
335 ss[4] ^= ls_box(ss[3],0); \
336 k[8*(i)+12] = ff(ss[4]); \
337 ss[5] ^= ss[4]; \
338 k[8*(i)+13] = ff(ss[5]); \
339 ss[6] ^= ss[5]; \
340 k[8*(i)+14] = ff(ss[6]); \
341 ss[7] ^= ss[6]; \
342 k[8*(i)+15] = ff(ss[7]); \
343}
344
345#define kd8(k,i) \
346{ \
347 u32 __g = ls_box(ss[7],3) ^ rcon_tab[i]; \
348 ss[0] ^= __g; \
349 __g = ff(__g); \
350 k[8*(i)+ 8] = __g ^= k[8*(i)]; \
351 ss[1] ^= ss[0]; \
352 k[8*(i)+ 9] = __g ^= k[8*(i)+ 1]; \
353 ss[2] ^= ss[1]; \
354 k[8*(i)+10] = __g ^= k[8*(i)+ 2]; \
355 ss[3] ^= ss[2]; \
356 k[8*(i)+11] = __g ^= k[8*(i)+ 3]; \
357 __g = ls_box(ss[3],0); \
358 ss[4] ^= __g; \
359 __g = ff(__g); \
360 k[8*(i)+12] = __g ^= k[8*(i)+ 4]; \
361 ss[5] ^= ss[4]; \
362 k[8*(i)+13] = __g ^= k[8*(i)+ 5]; \
363 ss[6] ^= ss[5]; \
364 k[8*(i)+14] = __g ^= k[8*(i)+ 6]; \
365 ss[7] ^= ss[6]; \
366 k[8*(i)+15] = __g ^= k[8*(i)+ 7]; \
367}
368
369#define kdl8(k,i) \
370{ \
371 ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
372 k[8*(i)+ 8] = ss[0]; \
373 ss[1] ^= ss[0]; \
374 k[8*(i)+ 9] = ss[1]; \
375 ss[2] ^= ss[1]; \
376 k[8*(i)+10] = ss[2]; \
377 ss[3] ^= ss[2]; \
378 k[8*(i)+11] = ss[3]; \
379}
380
381static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
382 unsigned int key_len)
383{
384 int i;
385 u32 ss[8];
386 struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
387 const __le32 *key = (const __le32 *)in_key;
388 u32 *flags = &tfm->crt_flags;
389
390 /* encryption schedule */
391
392 ctx->ekey[0] = ss[0] = le32_to_cpu(key[0]);
393 ctx->ekey[1] = ss[1] = le32_to_cpu(key[1]);
394 ctx->ekey[2] = ss[2] = le32_to_cpu(key[2]);
395 ctx->ekey[3] = ss[3] = le32_to_cpu(key[3]);
396
397 switch(key_len) {
398 case 16:
399 for (i = 0; i < 9; i++)
400 ke4(ctx->ekey, i);
401 kel4(ctx->ekey, 9);
402 ctx->rounds = 10;
403 break;
404
405 case 24:
406 ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
407 ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
408 for (i = 0; i < 7; i++)
409 ke6(ctx->ekey, i);
410 kel6(ctx->ekey, 7);
411 ctx->rounds = 12;
412 break;
413
414 case 32:
415 ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
416 ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
417 ctx->ekey[6] = ss[6] = le32_to_cpu(key[6]);
418 ctx->ekey[7] = ss[7] = le32_to_cpu(key[7]);
419 for (i = 0; i < 6; i++)
420 ke8(ctx->ekey, i);
421 kel8(ctx->ekey, 6);
422 ctx->rounds = 14;
423 break;
424
425 default:
426 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
427 return -EINVAL;
428 }
429
430 /* decryption schedule */
431
432 ctx->dkey[0] = ss[0] = le32_to_cpu(key[0]);
433 ctx->dkey[1] = ss[1] = le32_to_cpu(key[1]);
434 ctx->dkey[2] = ss[2] = le32_to_cpu(key[2]);
435 ctx->dkey[3] = ss[3] = le32_to_cpu(key[3]);
436
437 switch (key_len) {
438 case 16:
439 kdf4(ctx->dkey, 0);
440 for (i = 1; i < 9; i++)
441 kd4(ctx->dkey, i);
442 kdl4(ctx->dkey, 9);
443 break;
444
445 case 24:
446 ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
447 ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
448 kdf6(ctx->dkey, 0);
449 for (i = 1; i < 7; i++)
450 kd6(ctx->dkey, i);
451 kdl6(ctx->dkey, 7);
452 break;
453
454 case 32:
455 ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
456 ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
457 ctx->dkey[6] = ff(ss[6] = le32_to_cpu(key[6]));
458 ctx->dkey[7] = ff(ss[7] = le32_to_cpu(key[7]));
459 kdf8(ctx->dkey, 0);
460 for (i = 1; i < 6; i++)
461 kd8(ctx->dkey, i);
462 kdl8(ctx->dkey, 6);
463 break;
464 }
465 return 0;
466}
467
468static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
469{
470 aes_enc_blk(tfm, dst, src);
471}
472
473static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
474{
475 aes_dec_blk(tfm, dst, src);
476}
477
478static struct crypto_alg aes_alg = {
479 .cra_name = "aes",
480 .cra_driver_name = "aes-i586",
481 .cra_priority = 200,
482 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
483 .cra_blocksize = AES_BLOCK_SIZE,
484 .cra_ctxsize = sizeof(struct aes_ctx),
485 .cra_module = THIS_MODULE,
486 .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
487 .cra_u = {
488 .cipher = {
489 .cia_min_keysize = AES_MIN_KEY_SIZE,
490 .cia_max_keysize = AES_MAX_KEY_SIZE,
491 .cia_setkey = aes_set_key,
492 .cia_encrypt = aes_encrypt,
493 .cia_decrypt = aes_decrypt
494 }
495 }
496};
497
498static int __init aes_init(void)
499{
500 gen_tabs();
501 return crypto_register_alg(&aes_alg);
502}
503
504static void __exit aes_fini(void)
505{
506 crypto_unregister_alg(&aes_alg);
507}
508
509module_init(aes_init);
510module_exit(aes_fini);
511
512MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, i586 asm optimized");
513MODULE_LICENSE("Dual BSD/GPL");
514MODULE_AUTHOR("Fruhwirth Clemens, James Morris, Brian Gladman, Adam Richter");
515MODULE_ALIAS("aes");
diff --git a/arch/x86/crypto/aes_64.c b/arch/x86/crypto/aes_64.c
deleted file mode 100644
index 5cdb13ea5cc..00000000000
--- a/arch/x86/crypto/aes_64.c
+++ /dev/null
@@ -1,336 +0,0 @@
1/*
2 * Cryptographic API.
3 *
4 * AES Cipher Algorithm.
5 *
6 * Based on Brian Gladman's code.
7 *
8 * Linux developers:
9 * Alexander Kjeldaas <astor@fast.no>
10 * Herbert Valerio Riedel <hvr@hvrlab.org>
11 * Kyle McMartin <kyle@debian.org>
12 * Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API).
13 * Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler)
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * ---------------------------------------------------------------------------
21 * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK.
22 * All rights reserved.
23 *
24 * LICENSE TERMS
25 *
26 * The free distribution and use of this software in both source and binary
27 * form is allowed (with or without changes) provided that:
28 *
29 * 1. distributions of this source code include the above copyright
30 * notice, this list of conditions and the following disclaimer;
31 *
32 * 2. distributions in binary form include the above copyright
33 * notice, this list of conditions and the following disclaimer
34 * in the documentation and/or other associated materials;
35 *
36 * 3. the copyright holder's name is not used to endorse products
37 * built using this software without specific written permission.
38 *
39 * ALTERNATIVELY, provided that this notice is retained in full, this product
40 * may be distributed under the terms of the GNU General Public License (GPL),
41 * in which case the provisions of the GPL apply INSTEAD OF those given above.
42 *
43 * DISCLAIMER
44 *
45 * This software is provided 'as is' with no explicit or implied warranties
46 * in respect of its properties, including, but not limited to, correctness
47 * and/or fitness for purpose.
48 * ---------------------------------------------------------------------------
49 */
50
51/* Some changes from the Gladman version:
52 s/RIJNDAEL(e_key)/E_KEY/g
53 s/RIJNDAEL(d_key)/D_KEY/g
54*/
55
56#include <asm/byteorder.h>
57#include <linux/bitops.h>
58#include <linux/crypto.h>
59#include <linux/errno.h>
60#include <linux/init.h>
61#include <linux/module.h>
62#include <linux/types.h>
63
64#define AES_MIN_KEY_SIZE 16
65#define AES_MAX_KEY_SIZE 32
66
67#define AES_BLOCK_SIZE 16
68
69/*
70 * #define byte(x, nr) ((unsigned char)((x) >> (nr*8)))
71 */
72static inline u8 byte(const u32 x, const unsigned n)
73{
74 return x >> (n << 3);
75}
76
77struct aes_ctx
78{
79 u32 key_length;
80 u32 buf[120];
81};
82
83#define E_KEY (&ctx->buf[0])
84#define D_KEY (&ctx->buf[60])
85
86static u8 pow_tab[256] __initdata;
87static u8 log_tab[256] __initdata;
88static u8 sbx_tab[256] __initdata;
89static u8 isb_tab[256] __initdata;
90static u32 rco_tab[10];
91u32 aes_ft_tab[4][256];
92u32 aes_it_tab[4][256];
93
94u32 aes_fl_tab[4][256];
95u32 aes_il_tab[4][256];
96
97static inline u8 f_mult(u8 a, u8 b)
98{
99 u8 aa = log_tab[a], cc = aa + log_tab[b];
100
101 return pow_tab[cc + (cc < aa ? 1 : 0)];
102}
103
104#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0)
105
106#define ls_box(x) \
107 (aes_fl_tab[0][byte(x, 0)] ^ \
108 aes_fl_tab[1][byte(x, 1)] ^ \
109 aes_fl_tab[2][byte(x, 2)] ^ \
110 aes_fl_tab[3][byte(x, 3)])
111
112static void __init gen_tabs(void)
113{
114 u32 i, t;
115 u8 p, q;
116
117 /* log and power tables for GF(2**8) finite field with
118 0x011b as modular polynomial - the simplest primitive
119 root is 0x03, used here to generate the tables */
120
121 for (i = 0, p = 1; i < 256; ++i) {
122 pow_tab[i] = (u8)p;
123 log_tab[p] = (u8)i;
124
125 p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0);
126 }
127
128 log_tab[1] = 0;
129
130 for (i = 0, p = 1; i < 10; ++i) {
131 rco_tab[i] = p;
132
133 p = (p << 1) ^ (p & 0x80 ? 0x01b : 0);
134 }
135
136 for (i = 0; i < 256; ++i) {
137 p = (i ? pow_tab[255 - log_tab[i]] : 0);
138 q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2));
139 p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2));
140 sbx_tab[i] = p;
141 isb_tab[p] = (u8)i;
142 }
143
144 for (i = 0; i < 256; ++i) {
145 p = sbx_tab[i];
146
147 t = p;
148 aes_fl_tab[0][i] = t;
149 aes_fl_tab[1][i] = rol32(t, 8);
150 aes_fl_tab[2][i] = rol32(t, 16);
151 aes_fl_tab[3][i] = rol32(t, 24);
152
153 t = ((u32)ff_mult(2, p)) |
154 ((u32)p << 8) |
155 ((u32)p << 16) | ((u32)ff_mult(3, p) << 24);
156
157 aes_ft_tab[0][i] = t;
158 aes_ft_tab[1][i] = rol32(t, 8);
159 aes_ft_tab[2][i] = rol32(t, 16);
160 aes_ft_tab[3][i] = rol32(t, 24);
161
162 p = isb_tab[i];
163
164 t = p;
165 aes_il_tab[0][i] = t;
166 aes_il_tab[1][i] = rol32(t, 8);
167 aes_il_tab[2][i] = rol32(t, 16);
168 aes_il_tab[3][i] = rol32(t, 24);
169
170 t = ((u32)ff_mult(14, p)) |
171 ((u32)ff_mult(9, p) << 8) |
172 ((u32)ff_mult(13, p) << 16) |
173 ((u32)ff_mult(11, p) << 24);
174
175 aes_it_tab[0][i] = t;
176 aes_it_tab[1][i] = rol32(t, 8);
177 aes_it_tab[2][i] = rol32(t, 16);
178 aes_it_tab[3][i] = rol32(t, 24);
179 }
180}
181
182#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b)
183
184#define imix_col(y, x) \
185 u = star_x(x); \
186 v = star_x(u); \
187 w = star_x(v); \
188 t = w ^ (x); \
189 (y) = u ^ v ^ w; \
190 (y) ^= ror32(u ^ t, 8) ^ \
191 ror32(v ^ t, 16) ^ \
192 ror32(t, 24)
193
194/* initialise the key schedule from the user supplied key */
195
196#define loop4(i) \
197{ \
198 t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
199 t ^= E_KEY[4 * i]; E_KEY[4 * i + 4] = t; \
200 t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t; \
201 t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t; \
202 t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t; \
203}
204
205#define loop6(i) \
206{ \
207 t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
208 t ^= E_KEY[6 * i]; E_KEY[6 * i + 6] = t; \
209 t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t; \
210 t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t; \
211 t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t; \
212 t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t; \
213 t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t; \
214}
215
216#define loop8(i) \
217{ \
218 t = ror32(t, 8); ; t = ls_box(t) ^ rco_tab[i]; \
219 t ^= E_KEY[8 * i]; E_KEY[8 * i + 8] = t; \
220 t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t; \
221 t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t; \
222 t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t; \
223 t = E_KEY[8 * i + 4] ^ ls_box(t); \
224 E_KEY[8 * i + 12] = t; \
225 t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t; \
226 t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t; \
227 t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \
228}
229
230static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
231 unsigned int key_len)
232{
233 struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
234 const __le32 *key = (const __le32 *)in_key;
235 u32 *flags = &tfm->crt_flags;
236 u32 i, j, t, u, v, w;
237
238 if (key_len % 8) {
239 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
240 return -EINVAL;
241 }
242
243 ctx->key_length = key_len;
244
245 D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]);
246 D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]);
247 D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]);
248 D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]);
249
250 switch (key_len) {
251 case 16:
252 t = E_KEY[3];
253 for (i = 0; i < 10; ++i)
254 loop4(i);
255 break;
256
257 case 24:
258 E_KEY[4] = le32_to_cpu(key[4]);
259 t = E_KEY[5] = le32_to_cpu(key[5]);
260 for (i = 0; i < 8; ++i)
261 loop6 (i);
262 break;
263
264 case 32:
265 E_KEY[4] = le32_to_cpu(key[4]);
266 E_KEY[5] = le32_to_cpu(key[5]);
267 E_KEY[6] = le32_to_cpu(key[6]);
268 t = E_KEY[7] = le32_to_cpu(key[7]);
269 for (i = 0; i < 7; ++i)
270 loop8(i);
271 break;
272 }
273
274 D_KEY[0] = E_KEY[key_len + 24];
275 D_KEY[1] = E_KEY[key_len + 25];
276 D_KEY[2] = E_KEY[key_len + 26];
277 D_KEY[3] = E_KEY[key_len + 27];
278
279 for (i = 4; i < key_len + 24; ++i) {
280 j = key_len + 24 - (i & ~3) + (i & 3);
281 imix_col(D_KEY[j], E_KEY[i]);
282 }
283
284 return 0;
285}
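
For the 128-bit case, the loop4() macro together with aes_set_key() above is the textbook key expansion: four initial words copied from the user key, then ten rounds that each derive four new schedule words. A rough plain-loop sketch of the same expansion, assuming the sbx_tab[] S-box filled by gen_tabs() and the kernel's ror32() are in scope, and that rco_tab[] holds the standard round constants 0x01, 0x02, ..., 0x1b, 0x36:

/* sketch only: what loop4() produces for a 16-byte key, written as a loop */
static void aes128_expand_sketch(u32 ekey[44], const u32 key[4])
{
	/* the ten AES round constants, same values as rco_tab[] */
	static const u32 rco_sketch[10] = {
		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
	};
	u32 t;
	int i;

	for (i = 0; i < 4; i++)
		ekey[i] = key[i];

	t = ekey[3];
	for (i = 0; i < 10; i++) {
		/* rotate the previous word, push each byte through the
		 * S-box, then add the round constant, as loop4() does */
		t = ror32(t, 8);
		t = ((u32)sbx_tab[t & 0xff]) |
		    ((u32)sbx_tab[(t >> 8) & 0xff] << 8) |
		    ((u32)sbx_tab[(t >> 16) & 0xff] << 16) |
		    ((u32)sbx_tab[(t >> 24) & 0xff] << 24);
		t ^= rco_sketch[i];

		t ^= ekey[4 * i];     ekey[4 * i + 4] = t;
		t ^= ekey[4 * i + 1]; ekey[4 * i + 5] = t;
		t ^= ekey[4 * i + 2]; ekey[4 * i + 6] = t;
		t ^= ekey[4 * i + 3]; ekey[4 * i + 7] = t;
	}
}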
286
287asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
288asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
289
290static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
291{
292 aes_enc_blk(tfm, dst, src);
293}
294
295static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
296{
297 aes_dec_blk(tfm, dst, src);
298}
299
300static struct crypto_alg aes_alg = {
301 .cra_name = "aes",
302 .cra_driver_name = "aes-x86_64",
303 .cra_priority = 200,
304 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
305 .cra_blocksize = AES_BLOCK_SIZE,
306 .cra_ctxsize = sizeof(struct aes_ctx),
307 .cra_module = THIS_MODULE,
308 .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
309 .cra_u = {
310 .cipher = {
311 .cia_min_keysize = AES_MIN_KEY_SIZE,
312 .cia_max_keysize = AES_MAX_KEY_SIZE,
313 .cia_setkey = aes_set_key,
314 .cia_encrypt = aes_encrypt,
315 .cia_decrypt = aes_decrypt
316 }
317 }
318};
319
320static int __init aes_init(void)
321{
322 gen_tabs();
323 return crypto_register_alg(&aes_alg);
324}
325
326static void __exit aes_fini(void)
327{
328 crypto_unregister_alg(&aes_alg);
329}
330
331module_init(aes_init);
332module_exit(aes_fini);
333
334MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
335MODULE_LICENSE("GPL");
336MODULE_ALIAS("aes");
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
new file mode 100644
index 00000000000..71f45782711
--- /dev/null
+++ b/arch/x86/crypto/aes_glue.c
@@ -0,0 +1,57 @@
1/*
2 * Glue Code for the asm optimized version of the AES Cipher Algorithm
3 *
4 */
5
6#include <crypto/aes.h>
7
8asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
9asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
10
11static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
12{
13 aes_enc_blk(tfm, dst, src);
14}
15
16static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
17{
18 aes_dec_blk(tfm, dst, src);
19}
20
21static struct crypto_alg aes_alg = {
22 .cra_name = "aes",
23 .cra_driver_name = "aes-asm",
24 .cra_priority = 200,
25 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
26 .cra_blocksize = AES_BLOCK_SIZE,
27 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
28 .cra_module = THIS_MODULE,
29 .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
30 .cra_u = {
31 .cipher = {
32 .cia_min_keysize = AES_MIN_KEY_SIZE,
33 .cia_max_keysize = AES_MAX_KEY_SIZE,
34 .cia_setkey = crypto_aes_set_key,
35 .cia_encrypt = aes_encrypt,
36 .cia_decrypt = aes_decrypt
37 }
38 }
39};
40
41static int __init aes_init(void)
42{
43 return crypto_register_alg(&aes_alg);
44}
45
46static void __exit aes_fini(void)
47{
48 crypto_unregister_alg(&aes_alg);
49}
50
51module_init(aes_init);
52module_exit(aes_fini);
53
54MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized");
55MODULE_LICENSE("GPL");
56MODULE_ALIAS("aes");
57MODULE_ALIAS("aes-asm");
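
Once aes_glue.c has registered the "aes" cipher above, other kernel code reaches it through the generic single-block cipher interface rather than by calling aes_enc_blk() directly. A minimal usage sketch, assuming some "aes" provider is built in or loaded and the caller supplies a 16-byte key and block:

#include <linux/crypto.h>
#include <linux/err.h>

/* encrypt one 16-byte block with whatever "aes" implementation won the
 * priority selection (the asm glue above registers at priority 200) */
static int aes_one_block_sketch(const u8 key[16], const u8 in[16], u8 out[16])
{
	struct crypto_cipher *tfm;
	int err;

	tfm = crypto_alloc_cipher("aes", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_cipher_setkey(tfm, key, 16);
	if (!err)
		crypto_cipher_encrypt_one(tfm, out, in);

	crypto_free_cipher(tfm);
	return err;
}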
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
new file mode 100644
index 00000000000..72eb306680b
--- /dev/null
+++ b/arch/x86/crypto/salsa20-i586-asm_32.S
@@ -0,0 +1,1114 @@
1# salsa20_pm.s version 20051229
2# D. J. Bernstein
3# Public domain.
4
5# enter ECRYPT_encrypt_bytes
6.text
7.p2align 5
8.globl ECRYPT_encrypt_bytes
9ECRYPT_encrypt_bytes:
10 mov %esp,%eax
11 and $31,%eax
12 add $256,%eax
13 sub %eax,%esp
14 # eax_stack = eax
15 movl %eax,80(%esp)
16 # ebx_stack = ebx
17 movl %ebx,84(%esp)
18 # esi_stack = esi
19 movl %esi,88(%esp)
20 # edi_stack = edi
21 movl %edi,92(%esp)
22 # ebp_stack = ebp
23 movl %ebp,96(%esp)
24 # x = arg1
25 movl 4(%esp,%eax),%edx
26 # m = arg2
27 movl 8(%esp,%eax),%esi
28 # out = arg3
29 movl 12(%esp,%eax),%edi
30 # bytes = arg4
31 movl 16(%esp,%eax),%ebx
32 # bytes -= 0
33 sub $0,%ebx
34 # goto done if unsigned<=
35 jbe ._done
36._start:
37 # in0 = *(uint32 *) (x + 0)
38 movl 0(%edx),%eax
39 # in1 = *(uint32 *) (x + 4)
40 movl 4(%edx),%ecx
41 # in2 = *(uint32 *) (x + 8)
42 movl 8(%edx),%ebp
43 # j0 = in0
44 movl %eax,164(%esp)
45 # in3 = *(uint32 *) (x + 12)
46 movl 12(%edx),%eax
47 # j1 = in1
48 movl %ecx,168(%esp)
49 # in4 = *(uint32 *) (x + 16)
50 movl 16(%edx),%ecx
51 # j2 = in2
52 movl %ebp,172(%esp)
53 # in5 = *(uint32 *) (x + 20)
54 movl 20(%edx),%ebp
55 # j3 = in3
56 movl %eax,176(%esp)
57 # in6 = *(uint32 *) (x + 24)
58 movl 24(%edx),%eax
59 # j4 = in4
60 movl %ecx,180(%esp)
61 # in7 = *(uint32 *) (x + 28)
62 movl 28(%edx),%ecx
63 # j5 = in5
64 movl %ebp,184(%esp)
65 # in8 = *(uint32 *) (x + 32)
66 movl 32(%edx),%ebp
67 # j6 = in6
68 movl %eax,188(%esp)
69 # in9 = *(uint32 *) (x + 36)
70 movl 36(%edx),%eax
71 # j7 = in7
72 movl %ecx,192(%esp)
73 # in10 = *(uint32 *) (x + 40)
74 movl 40(%edx),%ecx
75 # j8 = in8
76 movl %ebp,196(%esp)
77 # in11 = *(uint32 *) (x + 44)
78 movl 44(%edx),%ebp
79 # j9 = in9
80 movl %eax,200(%esp)
81 # in12 = *(uint32 *) (x + 48)
82 movl 48(%edx),%eax
83 # j10 = in10
84 movl %ecx,204(%esp)
85 # in13 = *(uint32 *) (x + 52)
86 movl 52(%edx),%ecx
87 # j11 = in11
88 movl %ebp,208(%esp)
89 # in14 = *(uint32 *) (x + 56)
90 movl 56(%edx),%ebp
91 # j12 = in12
92 movl %eax,212(%esp)
93 # in15 = *(uint32 *) (x + 60)
94 movl 60(%edx),%eax
95 # j13 = in13
96 movl %ecx,216(%esp)
97 # j14 = in14
98 movl %ebp,220(%esp)
99 # j15 = in15
100 movl %eax,224(%esp)
101 # x_backup = x
102 movl %edx,64(%esp)
103._bytesatleast1:
104 # bytes - 64
105 cmp $64,%ebx
106 # goto nocopy if unsigned>=
107 jae ._nocopy
108 # ctarget = out
109 movl %edi,228(%esp)
110 # out = &tmp
111 leal 0(%esp),%edi
112 # i = bytes
113 mov %ebx,%ecx
114 # while (i) { *out++ = *m++; --i }
115 rep movsb
116 # out = &tmp
117 leal 0(%esp),%edi
118 # m = &tmp
119 leal 0(%esp),%esi
120._nocopy:
121 # out_backup = out
122 movl %edi,72(%esp)
123 # m_backup = m
124 movl %esi,68(%esp)
125 # bytes_backup = bytes
126 movl %ebx,76(%esp)
127 # in0 = j0
128 movl 164(%esp),%eax
129 # in1 = j1
130 movl 168(%esp),%ecx
131 # in2 = j2
132 movl 172(%esp),%edx
133 # in3 = j3
134 movl 176(%esp),%ebx
135 # x0 = in0
136 movl %eax,100(%esp)
137 # x1 = in1
138 movl %ecx,104(%esp)
139 # x2 = in2
140 movl %edx,108(%esp)
141 # x3 = in3
142 movl %ebx,112(%esp)
143 # in4 = j4
144 movl 180(%esp),%eax
145 # in5 = j5
146 movl 184(%esp),%ecx
147 # in6 = j6
148 movl 188(%esp),%edx
149 # in7 = j7
150 movl 192(%esp),%ebx
151 # x4 = in4
152 movl %eax,116(%esp)
153 # x5 = in5
154 movl %ecx,120(%esp)
155 # x6 = in6
156 movl %edx,124(%esp)
157 # x7 = in7
158 movl %ebx,128(%esp)
159 # in8 = j8
160 movl 196(%esp),%eax
161 # in9 = j9
162 movl 200(%esp),%ecx
163 # in10 = j10
164 movl 204(%esp),%edx
165 # in11 = j11
166 movl 208(%esp),%ebx
167 # x8 = in8
168 movl %eax,132(%esp)
169 # x9 = in9
170 movl %ecx,136(%esp)
171 # x10 = in10
172 movl %edx,140(%esp)
173 # x11 = in11
174 movl %ebx,144(%esp)
175 # in12 = j12
176 movl 212(%esp),%eax
177 # in13 = j13
178 movl 216(%esp),%ecx
179 # in14 = j14
180 movl 220(%esp),%edx
181 # in15 = j15
182 movl 224(%esp),%ebx
183 # x12 = in12
184 movl %eax,148(%esp)
185 # x13 = in13
186 movl %ecx,152(%esp)
187 # x14 = in14
188 movl %edx,156(%esp)
189 # x15 = in15
190 movl %ebx,160(%esp)
191 # i = 20
192 mov $20,%ebp
193 # p = x0
194 movl 100(%esp),%eax
195 # s = x5
196 movl 120(%esp),%ecx
197 # t = x10
198 movl 140(%esp),%edx
199 # w = x15
200 movl 160(%esp),%ebx
201._mainloop:
202 # x0 = p
203 movl %eax,100(%esp)
204 # x10 = t
205 movl %edx,140(%esp)
206 # p += x12
207 addl 148(%esp),%eax
208 # x5 = s
209 movl %ecx,120(%esp)
210 # t += x6
211 addl 124(%esp),%edx
212 # x15 = w
213 movl %ebx,160(%esp)
214 # r = x1
215 movl 104(%esp),%esi
216 # r += s
217 add %ecx,%esi
218 # v = x11
219 movl 144(%esp),%edi
220 # v += w
221 add %ebx,%edi
222 # p <<<= 7
223 rol $7,%eax
224 # p ^= x4
225 xorl 116(%esp),%eax
226 # t <<<= 7
227 rol $7,%edx
228 # t ^= x14
229 xorl 156(%esp),%edx
230 # r <<<= 7
231 rol $7,%esi
232 # r ^= x9
233 xorl 136(%esp),%esi
234 # v <<<= 7
235 rol $7,%edi
236 # v ^= x3
237 xorl 112(%esp),%edi
238 # x4 = p
239 movl %eax,116(%esp)
240 # x14 = t
241 movl %edx,156(%esp)
242 # p += x0
243 addl 100(%esp),%eax
244 # x9 = r
245 movl %esi,136(%esp)
246 # t += x10
247 addl 140(%esp),%edx
248 # x3 = v
249 movl %edi,112(%esp)
250 # p <<<= 9
251 rol $9,%eax
252 # p ^= x8
253 xorl 132(%esp),%eax
254 # t <<<= 9
255 rol $9,%edx
256 # t ^= x2
257 xorl 108(%esp),%edx
258 # s += r
259 add %esi,%ecx
260 # s <<<= 9
261 rol $9,%ecx
262 # s ^= x13
263 xorl 152(%esp),%ecx
264 # w += v
265 add %edi,%ebx
266 # w <<<= 9
267 rol $9,%ebx
268 # w ^= x7
269 xorl 128(%esp),%ebx
270 # x8 = p
271 movl %eax,132(%esp)
272 # x2 = t
273 movl %edx,108(%esp)
274 # p += x4
275 addl 116(%esp),%eax
276 # x13 = s
277 movl %ecx,152(%esp)
278 # t += x14
279 addl 156(%esp),%edx
280 # x7 = w
281 movl %ebx,128(%esp)
282 # p <<<= 13
283 rol $13,%eax
284 # p ^= x12
285 xorl 148(%esp),%eax
286 # t <<<= 13
287 rol $13,%edx
288 # t ^= x6
289 xorl 124(%esp),%edx
290 # r += s
291 add %ecx,%esi
292 # r <<<= 13
293 rol $13,%esi
294 # r ^= x1
295 xorl 104(%esp),%esi
296 # v += w
297 add %ebx,%edi
298 # v <<<= 13
299 rol $13,%edi
300 # v ^= x11
301 xorl 144(%esp),%edi
302 # x12 = p
303 movl %eax,148(%esp)
304 # x6 = t
305 movl %edx,124(%esp)
306 # p += x8
307 addl 132(%esp),%eax
308 # x1 = r
309 movl %esi,104(%esp)
310 # t += x2
311 addl 108(%esp),%edx
312 # x11 = v
313 movl %edi,144(%esp)
314 # p <<<= 18
315 rol $18,%eax
316 # p ^= x0
317 xorl 100(%esp),%eax
318 # t <<<= 18
319 rol $18,%edx
320 # t ^= x10
321 xorl 140(%esp),%edx
322 # s += r
323 add %esi,%ecx
324 # s <<<= 18
325 rol $18,%ecx
326 # s ^= x5
327 xorl 120(%esp),%ecx
328 # w += v
329 add %edi,%ebx
330 # w <<<= 18
331 rol $18,%ebx
332 # w ^= x15
333 xorl 160(%esp),%ebx
334 # x0 = p
335 movl %eax,100(%esp)
336 # x10 = t
337 movl %edx,140(%esp)
338 # p += x3
339 addl 112(%esp),%eax
340 # p <<<= 7
341 rol $7,%eax
342 # x5 = s
343 movl %ecx,120(%esp)
344 # t += x9
345 addl 136(%esp),%edx
346 # x15 = w
347 movl %ebx,160(%esp)
348 # r = x4
349 movl 116(%esp),%esi
350 # r += s
351 add %ecx,%esi
352 # v = x14
353 movl 156(%esp),%edi
354 # v += w
355 add %ebx,%edi
356 # p ^= x1
357 xorl 104(%esp),%eax
358 # t <<<= 7
359 rol $7,%edx
360 # t ^= x11
361 xorl 144(%esp),%edx
362 # r <<<= 7
363 rol $7,%esi
364 # r ^= x6
365 xorl 124(%esp),%esi
366 # v <<<= 7
367 rol $7,%edi
368 # v ^= x12
369 xorl 148(%esp),%edi
370 # x1 = p
371 movl %eax,104(%esp)
372 # x11 = t
373 movl %edx,144(%esp)
374 # p += x0
375 addl 100(%esp),%eax
376 # x6 = r
377 movl %esi,124(%esp)
378 # t += x10
379 addl 140(%esp),%edx
380 # x12 = v
381 movl %edi,148(%esp)
382 # p <<<= 9
383 rol $9,%eax
384 # p ^= x2
385 xorl 108(%esp),%eax
386 # t <<<= 9
387 rol $9,%edx
388 # t ^= x8
389 xorl 132(%esp),%edx
390 # s += r
391 add %esi,%ecx
392 # s <<<= 9
393 rol $9,%ecx
394 # s ^= x7
395 xorl 128(%esp),%ecx
396 # w += v
397 add %edi,%ebx
398 # w <<<= 9
399 rol $9,%ebx
400 # w ^= x13
401 xorl 152(%esp),%ebx
402 # x2 = p
403 movl %eax,108(%esp)
404 # x8 = t
405 movl %edx,132(%esp)
406 # p += x1
407 addl 104(%esp),%eax
408 # x7 = s
409 movl %ecx,128(%esp)
410 # t += x11
411 addl 144(%esp),%edx
412 # x13 = w
413 movl %ebx,152(%esp)
414 # p <<<= 13
415 rol $13,%eax
416 # p ^= x3
417 xorl 112(%esp),%eax
418 # t <<<= 13
419 rol $13,%edx
420 # t ^= x9
421 xorl 136(%esp),%edx
422 # r += s
423 add %ecx,%esi
424 # r <<<= 13
425 rol $13,%esi
426 # r ^= x4
427 xorl 116(%esp),%esi
428 # v += w
429 add %ebx,%edi
430 # v <<<= 13
431 rol $13,%edi
432 # v ^= x14
433 xorl 156(%esp),%edi
434 # x3 = p
435 movl %eax,112(%esp)
436 # x9 = t
437 movl %edx,136(%esp)
438 # p += x2
439 addl 108(%esp),%eax
440 # x4 = r
441 movl %esi,116(%esp)
442 # t += x8
443 addl 132(%esp),%edx
444 # x14 = v
445 movl %edi,156(%esp)
446 # p <<<= 18
447 rol $18,%eax
448 # p ^= x0
449 xorl 100(%esp),%eax
450 # t <<<= 18
451 rol $18,%edx
452 # t ^= x10
453 xorl 140(%esp),%edx
454 # s += r
455 add %esi,%ecx
456 # s <<<= 18
457 rol $18,%ecx
458 # s ^= x5
459 xorl 120(%esp),%ecx
460 # w += v
461 add %edi,%ebx
462 # w <<<= 18
463 rol $18,%ebx
464 # w ^= x15
465 xorl 160(%esp),%ebx
466 # x0 = p
467 movl %eax,100(%esp)
468 # x10 = t
469 movl %edx,140(%esp)
470 # p += x12
471 addl 148(%esp),%eax
472 # x5 = s
473 movl %ecx,120(%esp)
474 # t += x6
475 addl 124(%esp),%edx
476 # x15 = w
477 movl %ebx,160(%esp)
478 # r = x1
479 movl 104(%esp),%esi
480 # r += s
481 add %ecx,%esi
482 # v = x11
483 movl 144(%esp),%edi
484 # v += w
485 add %ebx,%edi
486 # p <<<= 7
487 rol $7,%eax
488 # p ^= x4
489 xorl 116(%esp),%eax
490 # t <<<= 7
491 rol $7,%edx
492 # t ^= x14
493 xorl 156(%esp),%edx
494 # r <<<= 7
495 rol $7,%esi
496 # r ^= x9
497 xorl 136(%esp),%esi
498 # v <<<= 7
499 rol $7,%edi
500 # v ^= x3
501 xorl 112(%esp),%edi
502 # x4 = p
503 movl %eax,116(%esp)
504 # x14 = t
505 movl %edx,156(%esp)
506 # p += x0
507 addl 100(%esp),%eax
508 # x9 = r
509 movl %esi,136(%esp)
510 # t += x10
511 addl 140(%esp),%edx
512 # x3 = v
513 movl %edi,112(%esp)
514 # p <<<= 9
515 rol $9,%eax
516 # p ^= x8
517 xorl 132(%esp),%eax
518 # t <<<= 9
519 rol $9,%edx
520 # t ^= x2
521 xorl 108(%esp),%edx
522 # s += r
523 add %esi,%ecx
524 # s <<<= 9
525 rol $9,%ecx
526 # s ^= x13
527 xorl 152(%esp),%ecx
528 # w += v
529 add %edi,%ebx
530 # w <<<= 9
531 rol $9,%ebx
532 # w ^= x7
533 xorl 128(%esp),%ebx
534 # x8 = p
535 movl %eax,132(%esp)
536 # x2 = t
537 movl %edx,108(%esp)
538 # p += x4
539 addl 116(%esp),%eax
540 # x13 = s
541 movl %ecx,152(%esp)
542 # t += x14
543 addl 156(%esp),%edx
544 # x7 = w
545 movl %ebx,128(%esp)
546 # p <<<= 13
547 rol $13,%eax
548 # p ^= x12
549 xorl 148(%esp),%eax
550 # t <<<= 13
551 rol $13,%edx
552 # t ^= x6
553 xorl 124(%esp),%edx
554 # r += s
555 add %ecx,%esi
556 # r <<<= 13
557 rol $13,%esi
558 # r ^= x1
559 xorl 104(%esp),%esi
560 # v += w
561 add %ebx,%edi
562 # v <<<= 13
563 rol $13,%edi
564 # v ^= x11
565 xorl 144(%esp),%edi
566 # x12 = p
567 movl %eax,148(%esp)
568 # x6 = t
569 movl %edx,124(%esp)
570 # p += x8
571 addl 132(%esp),%eax
572 # x1 = r
573 movl %esi,104(%esp)
574 # t += x2
575 addl 108(%esp),%edx
576 # x11 = v
577 movl %edi,144(%esp)
578 # p <<<= 18
579 rol $18,%eax
580 # p ^= x0
581 xorl 100(%esp),%eax
582 # t <<<= 18
583 rol $18,%edx
584 # t ^= x10
585 xorl 140(%esp),%edx
586 # s += r
587 add %esi,%ecx
588 # s <<<= 18
589 rol $18,%ecx
590 # s ^= x5
591 xorl 120(%esp),%ecx
592 # w += v
593 add %edi,%ebx
594 # w <<<= 18
595 rol $18,%ebx
596 # w ^= x15
597 xorl 160(%esp),%ebx
598 # x0 = p
599 movl %eax,100(%esp)
600 # x10 = t
601 movl %edx,140(%esp)
602 # p += x3
603 addl 112(%esp),%eax
604 # p <<<= 7
605 rol $7,%eax
606 # x5 = s
607 movl %ecx,120(%esp)
608 # t += x9
609 addl 136(%esp),%edx
610 # x15 = w
611 movl %ebx,160(%esp)
612 # r = x4
613 movl 116(%esp),%esi
614 # r += s
615 add %ecx,%esi
616 # v = x14
617 movl 156(%esp),%edi
618 # v += w
619 add %ebx,%edi
620 # p ^= x1
621 xorl 104(%esp),%eax
622 # t <<<= 7
623 rol $7,%edx
624 # t ^= x11
625 xorl 144(%esp),%edx
626 # r <<<= 7
627 rol $7,%esi
628 # r ^= x6
629 xorl 124(%esp),%esi
630 # v <<<= 7
631 rol $7,%edi
632 # v ^= x12
633 xorl 148(%esp),%edi
634 # x1 = p
635 movl %eax,104(%esp)
636 # x11 = t
637 movl %edx,144(%esp)
638 # p += x0
639 addl 100(%esp),%eax
640 # x6 = r
641 movl %esi,124(%esp)
642 # t += x10
643 addl 140(%esp),%edx
644 # x12 = v
645 movl %edi,148(%esp)
646 # p <<<= 9
647 rol $9,%eax
648 # p ^= x2
649 xorl 108(%esp),%eax
650 # t <<<= 9
651 rol $9,%edx
652 # t ^= x8
653 xorl 132(%esp),%edx
654 # s += r
655 add %esi,%ecx
656 # s <<<= 9
657 rol $9,%ecx
658 # s ^= x7
659 xorl 128(%esp),%ecx
660 # w += v
661 add %edi,%ebx
662 # w <<<= 9
663 rol $9,%ebx
664 # w ^= x13
665 xorl 152(%esp),%ebx
666 # x2 = p
667 movl %eax,108(%esp)
668 # x8 = t
669 movl %edx,132(%esp)
670 # p += x1
671 addl 104(%esp),%eax
672 # x7 = s
673 movl %ecx,128(%esp)
674 # t += x11
675 addl 144(%esp),%edx
676 # x13 = w
677 movl %ebx,152(%esp)
678 # p <<<= 13
679 rol $13,%eax
680 # p ^= x3
681 xorl 112(%esp),%eax
682 # t <<<= 13
683 rol $13,%edx
684 # t ^= x9
685 xorl 136(%esp),%edx
686 # r += s
687 add %ecx,%esi
688 # r <<<= 13
689 rol $13,%esi
690 # r ^= x4
691 xorl 116(%esp),%esi
692 # v += w
693 add %ebx,%edi
694 # v <<<= 13
695 rol $13,%edi
696 # v ^= x14
697 xorl 156(%esp),%edi
698 # x3 = p
699 movl %eax,112(%esp)
700 # x9 = t
701 movl %edx,136(%esp)
702 # p += x2
703 addl 108(%esp),%eax
704 # x4 = r
705 movl %esi,116(%esp)
706 # t += x8
707 addl 132(%esp),%edx
708 # x14 = v
709 movl %edi,156(%esp)
710 # p <<<= 18
711 rol $18,%eax
712 # p ^= x0
713 xorl 100(%esp),%eax
714 # t <<<= 18
715 rol $18,%edx
716 # t ^= x10
717 xorl 140(%esp),%edx
718 # s += r
719 add %esi,%ecx
720 # s <<<= 18
721 rol $18,%ecx
722 # s ^= x5
723 xorl 120(%esp),%ecx
724 # w += v
725 add %edi,%ebx
726 # w <<<= 18
727 rol $18,%ebx
728 # w ^= x15
729 xorl 160(%esp),%ebx
730 # i -= 4
731 sub $4,%ebp
732 # goto mainloop if unsigned >
733 ja ._mainloop
734 # x0 = p
735 movl %eax,100(%esp)
736 # x5 = s
737 movl %ecx,120(%esp)
738 # x10 = t
739 movl %edx,140(%esp)
740 # x15 = w
741 movl %ebx,160(%esp)
742 # out = out_backup
743 movl 72(%esp),%edi
744 # m = m_backup
745 movl 68(%esp),%esi
746 # in0 = x0
747 movl 100(%esp),%eax
748 # in1 = x1
749 movl 104(%esp),%ecx
750 # in0 += j0
751 addl 164(%esp),%eax
752 # in1 += j1
753 addl 168(%esp),%ecx
754 # in0 ^= *(uint32 *) (m + 0)
755 xorl 0(%esi),%eax
756 # in1 ^= *(uint32 *) (m + 4)
757 xorl 4(%esi),%ecx
758 # *(uint32 *) (out + 0) = in0
759 movl %eax,0(%edi)
760 # *(uint32 *) (out + 4) = in1
761 movl %ecx,4(%edi)
762 # in2 = x2
763 movl 108(%esp),%eax
764 # in3 = x3
765 movl 112(%esp),%ecx
766 # in2 += j2
767 addl 172(%esp),%eax
768 # in3 += j3
769 addl 176(%esp),%ecx
770 # in2 ^= *(uint32 *) (m + 8)
771 xorl 8(%esi),%eax
772 # in3 ^= *(uint32 *) (m + 12)
773 xorl 12(%esi),%ecx
774 # *(uint32 *) (out + 8) = in2
775 movl %eax,8(%edi)
776 # *(uint32 *) (out + 12) = in3
777 movl %ecx,12(%edi)
778 # in4 = x4
779 movl 116(%esp),%eax
780 # in5 = x5
781 movl 120(%esp),%ecx
782 # in4 += j4
783 addl 180(%esp),%eax
784 # in5 += j5
785 addl 184(%esp),%ecx
786 # in4 ^= *(uint32 *) (m + 16)
787 xorl 16(%esi),%eax
788 # in5 ^= *(uint32 *) (m + 20)
789 xorl 20(%esi),%ecx
790 # *(uint32 *) (out + 16) = in4
791 movl %eax,16(%edi)
792 # *(uint32 *) (out + 20) = in5
793 movl %ecx,20(%edi)
794 # in6 = x6
795 movl 124(%esp),%eax
796 # in7 = x7
797 movl 128(%esp),%ecx
798 # in6 += j6
799 addl 188(%esp),%eax
800 # in7 += j7
801 addl 192(%esp),%ecx
802 # in6 ^= *(uint32 *) (m + 24)
803 xorl 24(%esi),%eax
804 # in7 ^= *(uint32 *) (m + 28)
805 xorl 28(%esi),%ecx
806 # *(uint32 *) (out + 24) = in6
807 movl %eax,24(%edi)
808 # *(uint32 *) (out + 28) = in7
809 movl %ecx,28(%edi)
810 # in8 = x8
811 movl 132(%esp),%eax
812 # in9 = x9
813 movl 136(%esp),%ecx
814 # in8 += j8
815 addl 196(%esp),%eax
816 # in9 += j9
817 addl 200(%esp),%ecx
818 # in8 ^= *(uint32 *) (m + 32)
819 xorl 32(%esi),%eax
820 # in9 ^= *(uint32 *) (m + 36)
821 xorl 36(%esi),%ecx
822 # *(uint32 *) (out + 32) = in8
823 movl %eax,32(%edi)
824 # *(uint32 *) (out + 36) = in9
825 movl %ecx,36(%edi)
826 # in10 = x10
827 movl 140(%esp),%eax
828 # in11 = x11
829 movl 144(%esp),%ecx
830 # in10 += j10
831 addl 204(%esp),%eax
832 # in11 += j11
833 addl 208(%esp),%ecx
834 # in10 ^= *(uint32 *) (m + 40)
835 xorl 40(%esi),%eax
836 # in11 ^= *(uint32 *) (m + 44)
837 xorl 44(%esi),%ecx
838 # *(uint32 *) (out + 40) = in10
839 movl %eax,40(%edi)
840 # *(uint32 *) (out + 44) = in11
841 movl %ecx,44(%edi)
842 # in12 = x12
843 movl 148(%esp),%eax
844 # in13 = x13
845 movl 152(%esp),%ecx
846 # in12 += j12
847 addl 212(%esp),%eax
848 # in13 += j13
849 addl 216(%esp),%ecx
850 # in12 ^= *(uint32 *) (m + 48)
851 xorl 48(%esi),%eax
852 # in13 ^= *(uint32 *) (m + 52)
853 xorl 52(%esi),%ecx
854 # *(uint32 *) (out + 48) = in12
855 movl %eax,48(%edi)
856 # *(uint32 *) (out + 52) = in13
857 movl %ecx,52(%edi)
858 # in14 = x14
859 movl 156(%esp),%eax
860 # in15 = x15
861 movl 160(%esp),%ecx
862 # in14 += j14
863 addl 220(%esp),%eax
864 # in15 += j15
865 addl 224(%esp),%ecx
866 # in14 ^= *(uint32 *) (m + 56)
867 xorl 56(%esi),%eax
868 # in15 ^= *(uint32 *) (m + 60)
869 xorl 60(%esi),%ecx
870 # *(uint32 *) (out + 56) = in14
871 movl %eax,56(%edi)
872 # *(uint32 *) (out + 60) = in15
873 movl %ecx,60(%edi)
874 # bytes = bytes_backup
875 movl 76(%esp),%ebx
876 # in8 = j8
877 movl 196(%esp),%eax
878 # in9 = j9
879 movl 200(%esp),%ecx
880 # in8 += 1
881 add $1,%eax
882 # in9 += 0 + carry
883 adc $0,%ecx
884 # j8 = in8
885 movl %eax,196(%esp)
886 # j9 = in9
887 movl %ecx,200(%esp)
888 # bytes - 64
889 cmp $64,%ebx
890 # goto bytesatleast65 if unsigned>
891 ja ._bytesatleast65
892 # goto bytesatleast64 if unsigned>=
893 jae ._bytesatleast64
894 # m = out
895 mov %edi,%esi
896 # out = ctarget
897 movl 228(%esp),%edi
898 # i = bytes
899 mov %ebx,%ecx
900 # while (i) { *out++ = *m++; --i }
901 rep movsb
902._bytesatleast64:
903 # x = x_backup
904 movl 64(%esp),%eax
905 # in8 = j8
906 movl 196(%esp),%ecx
907 # in9 = j9
908 movl 200(%esp),%edx
909 # *(uint32 *) (x + 32) = in8
910 movl %ecx,32(%eax)
911 # *(uint32 *) (x + 36) = in9
912 movl %edx,36(%eax)
913._done:
914 # eax = eax_stack
915 movl 80(%esp),%eax
916 # ebx = ebx_stack
917 movl 84(%esp),%ebx
918 # esi = esi_stack
919 movl 88(%esp),%esi
920 # edi = edi_stack
921 movl 92(%esp),%edi
922 # ebp = ebp_stack
923 movl 96(%esp),%ebp
924 # leave
925 add %eax,%esp
926 ret
927._bytesatleast65:
928 # bytes -= 64
929 sub $64,%ebx
930 # out += 64
931 add $64,%edi
932 # m += 64
933 add $64,%esi
934 # goto bytesatleast1
935 jmp ._bytesatleast1
936# enter ECRYPT_keysetup
937.text
938.p2align 5
939.globl ECRYPT_keysetup
940ECRYPT_keysetup:
941 mov %esp,%eax
942 and $31,%eax
943 add $256,%eax
944 sub %eax,%esp
945 # eax_stack = eax
946 movl %eax,64(%esp)
947 # ebx_stack = ebx
948 movl %ebx,68(%esp)
949 # esi_stack = esi
950 movl %esi,72(%esp)
951 # edi_stack = edi
952 movl %edi,76(%esp)
953 # ebp_stack = ebp
954 movl %ebp,80(%esp)
955 # k = arg2
956 movl 8(%esp,%eax),%ecx
957 # kbits = arg3
958 movl 12(%esp,%eax),%edx
959 # x = arg1
960 movl 4(%esp,%eax),%eax
961 # in1 = *(uint32 *) (k + 0)
962 movl 0(%ecx),%ebx
963 # in2 = *(uint32 *) (k + 4)
964 movl 4(%ecx),%esi
965 # in3 = *(uint32 *) (k + 8)
966 movl 8(%ecx),%edi
967 # in4 = *(uint32 *) (k + 12)
968 movl 12(%ecx),%ebp
969 # *(uint32 *) (x + 4) = in1
970 movl %ebx,4(%eax)
971 # *(uint32 *) (x + 8) = in2
972 movl %esi,8(%eax)
973 # *(uint32 *) (x + 12) = in3
974 movl %edi,12(%eax)
975 # *(uint32 *) (x + 16) = in4
976 movl %ebp,16(%eax)
977 # kbits - 256
978 cmp $256,%edx
979 # goto kbits128 if unsigned<
980 jb ._kbits128
981._kbits256:
982 # in11 = *(uint32 *) (k + 16)
983 movl 16(%ecx),%edx
984 # in12 = *(uint32 *) (k + 20)
985 movl 20(%ecx),%ebx
986 # in13 = *(uint32 *) (k + 24)
987 movl 24(%ecx),%esi
988 # in14 = *(uint32 *) (k + 28)
989 movl 28(%ecx),%ecx
990 # *(uint32 *) (x + 44) = in11
991 movl %edx,44(%eax)
992 # *(uint32 *) (x + 48) = in12
993 movl %ebx,48(%eax)
994 # *(uint32 *) (x + 52) = in13
995 movl %esi,52(%eax)
996 # *(uint32 *) (x + 56) = in14
997 movl %ecx,56(%eax)
998 # in0 = 1634760805
999 mov $1634760805,%ecx
1000 # in5 = 857760878
1001 mov $857760878,%edx
1002 # in10 = 2036477234
1003 mov $2036477234,%ebx
1004 # in15 = 1797285236
1005 mov $1797285236,%esi
1006 # *(uint32 *) (x + 0) = in0
1007 movl %ecx,0(%eax)
1008 # *(uint32 *) (x + 20) = in5
1009 movl %edx,20(%eax)
1010 # *(uint32 *) (x + 40) = in10
1011 movl %ebx,40(%eax)
1012 # *(uint32 *) (x + 60) = in15
1013 movl %esi,60(%eax)
1014 # goto keysetupdone
1015 jmp ._keysetupdone
1016._kbits128:
1017 # in11 = *(uint32 *) (k + 0)
1018 movl 0(%ecx),%edx
1019 # in12 = *(uint32 *) (k + 4)
1020 movl 4(%ecx),%ebx
1021 # in13 = *(uint32 *) (k + 8)
1022 movl 8(%ecx),%esi
1023 # in14 = *(uint32 *) (k + 12)
1024 movl 12(%ecx),%ecx
1025 # *(uint32 *) (x + 44) = in11
1026 movl %edx,44(%eax)
1027 # *(uint32 *) (x + 48) = in12
1028 movl %ebx,48(%eax)
1029 # *(uint32 *) (x + 52) = in13
1030 movl %esi,52(%eax)
1031 # *(uint32 *) (x + 56) = in14
1032 movl %ecx,56(%eax)
1033 # in0 = 1634760805
1034 mov $1634760805,%ecx
1035 # in5 = 824206446
1036 mov $824206446,%edx
1037 # in10 = 2036477238
1038 mov $2036477238,%ebx
1039 # in15 = 1797285236
1040 mov $1797285236,%esi
1041 # *(uint32 *) (x + 0) = in0
1042 movl %ecx,0(%eax)
1043 # *(uint32 *) (x + 20) = in5
1044 movl %edx,20(%eax)
1045 # *(uint32 *) (x + 40) = in10
1046 movl %ebx,40(%eax)
1047 # *(uint32 *) (x + 60) = in15
1048 movl %esi,60(%eax)
1049._keysetupdone:
1050 # eax = eax_stack
1051 movl 64(%esp),%eax
1052 # ebx = ebx_stack
1053 movl 68(%esp),%ebx
1054 # esi = esi_stack
1055 movl 72(%esp),%esi
1056 # edi = edi_stack
1057 movl 76(%esp),%edi
1058 # ebp = ebp_stack
1059 movl 80(%esp),%ebp
1060 # leave
1061 add %eax,%esp
1062 ret
1063# enter ECRYPT_ivsetup
1064.text
1065.p2align 5
1066.globl ECRYPT_ivsetup
1067ECRYPT_ivsetup:
1068 mov %esp,%eax
1069 and $31,%eax
1070 add $256,%eax
1071 sub %eax,%esp
1072 # eax_stack = eax
1073 movl %eax,64(%esp)
1074 # ebx_stack = ebx
1075 movl %ebx,68(%esp)
1076 # esi_stack = esi
1077 movl %esi,72(%esp)
1078 # edi_stack = edi
1079 movl %edi,76(%esp)
1080 # ebp_stack = ebp
1081 movl %ebp,80(%esp)
1082 # iv = arg2
1083 movl 8(%esp,%eax),%ecx
1084 # x = arg1
1085 movl 4(%esp,%eax),%eax
1086 # in6 = *(uint32 *) (iv + 0)
1087 movl 0(%ecx),%edx
1088 # in7 = *(uint32 *) (iv + 4)
1089 movl 4(%ecx),%ecx
1090 # in8 = 0
1091 mov $0,%ebx
1092 # in9 = 0
1093 mov $0,%esi
1094 # *(uint32 *) (x + 24) = in6
1095 movl %edx,24(%eax)
1096 # *(uint32 *) (x + 28) = in7
1097 movl %ecx,28(%eax)
1098 # *(uint32 *) (x + 32) = in8
1099 movl %ebx,32(%eax)
1100 # *(uint32 *) (x + 36) = in9
1101 movl %esi,36(%eax)
1102 # eax = eax_stack
1103 movl 64(%esp),%eax
1104 # ebx = ebx_stack
1105 movl 68(%esp),%ebx
1106 # esi = esi_stack
1107 movl 72(%esp),%esi
1108 # edi = edi_stack
1109 movl 76(%esp),%edi
1110 # ebp = ebp_stack
1111 movl 80(%esp),%ebp
1112 # leave
1113 add %eax,%esp
1114 ret
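
The long run of add/rotate/xor instructions above is the Salsa20 quarter-round, interleaved four ways across the p/r/s/t and v/w registers; the comments (p += x12; p <<<= 7; p ^= x4; ...) spell one copy out. A rough C reference for what each group of comments computes, assuming 32-bit words:

#include <stdint.h>

static uint32_t rol32_sketch(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* one Salsa20 quarter-round, e.g. on the column (x0, x4, x8, x12):
 *   x4  ^= (x0  + x12) <<< 7
 *   x8  ^= (x4  + x0 ) <<< 9
 *   x12 ^= (x8  + x4 ) <<< 13
 *   x0  ^= (x12 + x8 ) <<< 18
 */
static void quarterround_sketch(uint32_t *a, uint32_t *b,
				uint32_t *c, uint32_t *d)
{
	*b ^= rol32_sketch(*a + *d, 7);
	*c ^= rol32_sketch(*b + *a, 9);
	*d ^= rol32_sketch(*c + *b, 13);
	*a ^= rol32_sketch(*d + *c, 18);
}

The round counter starts at 20 and drops by 4 on each pass through ._mainloop (sub $4,%ebp), so one pass through the unrolled body above covers four of the twenty rounds.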
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
new file mode 100644
index 00000000000..6214a9b0970
--- /dev/null
+++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S
@@ -0,0 +1,920 @@
1# enter ECRYPT_encrypt_bytes
2.text
3.p2align 5
4.globl ECRYPT_encrypt_bytes
5ECRYPT_encrypt_bytes:
6 mov %rsp,%r11
7 and $31,%r11
8 add $256,%r11
9 sub %r11,%rsp
10 # x = arg1
11 mov %rdi,%r8
12 # m = arg2
13 mov %rsi,%rsi
14 # out = arg3
15 mov %rdx,%rdi
16 # bytes = arg4
17 mov %rcx,%rdx
18 # unsigned>? bytes - 0
19 cmp $0,%rdx
20 # comment:fp stack unchanged by jump
21 # goto done if !unsigned>
22 jbe ._done
23 # comment:fp stack unchanged by fallthrough
24# start:
25._start:
26 # r11_stack = r11
27 movq %r11,0(%rsp)
28 # r12_stack = r12
29 movq %r12,8(%rsp)
30 # r13_stack = r13
31 movq %r13,16(%rsp)
32 # r14_stack = r14
33 movq %r14,24(%rsp)
34 # r15_stack = r15
35 movq %r15,32(%rsp)
36 # rbx_stack = rbx
37 movq %rbx,40(%rsp)
38 # rbp_stack = rbp
39 movq %rbp,48(%rsp)
40 # in0 = *(uint64 *) (x + 0)
41 movq 0(%r8),%rcx
42 # in2 = *(uint64 *) (x + 8)
43 movq 8(%r8),%r9
44 # in4 = *(uint64 *) (x + 16)
45 movq 16(%r8),%rax
46 # in6 = *(uint64 *) (x + 24)
47 movq 24(%r8),%r10
48 # in8 = *(uint64 *) (x + 32)
49 movq 32(%r8),%r11
50 # in10 = *(uint64 *) (x + 40)
51 movq 40(%r8),%r12
52 # in12 = *(uint64 *) (x + 48)
53 movq 48(%r8),%r13
54 # in14 = *(uint64 *) (x + 56)
55 movq 56(%r8),%r14
56 # j0 = in0
57 movq %rcx,56(%rsp)
58 # j2 = in2
59 movq %r9,64(%rsp)
60 # j4 = in4
61 movq %rax,72(%rsp)
62 # j6 = in6
63 movq %r10,80(%rsp)
64 # j8 = in8
65 movq %r11,88(%rsp)
66 # j10 = in10
67 movq %r12,96(%rsp)
68 # j12 = in12
69 movq %r13,104(%rsp)
70 # j14 = in14
71 movq %r14,112(%rsp)
72 # x_backup = x
73 movq %r8,120(%rsp)
74# bytesatleast1:
75._bytesatleast1:
76 # unsigned<? bytes - 64
77 cmp $64,%rdx
78 # comment:fp stack unchanged by jump
79 # goto nocopy if !unsigned<
80 jae ._nocopy
81 # ctarget = out
82 movq %rdi,128(%rsp)
83 # out = &tmp
84 leaq 192(%rsp),%rdi
85 # i = bytes
86 mov %rdx,%rcx
87 # while (i) { *out++ = *m++; --i }
88 rep movsb
89 # out = &tmp
90 leaq 192(%rsp),%rdi
91 # m = &tmp
92 leaq 192(%rsp),%rsi
93 # comment:fp stack unchanged by fallthrough
94# nocopy:
95._nocopy:
96 # out_backup = out
97 movq %rdi,136(%rsp)
98 # m_backup = m
99 movq %rsi,144(%rsp)
100 # bytes_backup = bytes
101 movq %rdx,152(%rsp)
102 # x1 = j0
103 movq 56(%rsp),%rdi
104 # x0 = x1
105 mov %rdi,%rdx
106 # (uint64) x1 >>= 32
107 shr $32,%rdi
108 # x3 = j2
109 movq 64(%rsp),%rsi
110 # x2 = x3
111 mov %rsi,%rcx
112 # (uint64) x3 >>= 32
113 shr $32,%rsi
114 # x5 = j4
115 movq 72(%rsp),%r8
116 # x4 = x5
117 mov %r8,%r9
118 # (uint64) x5 >>= 32
119 shr $32,%r8
120 # x5_stack = x5
121 movq %r8,160(%rsp)
122 # x7 = j6
123 movq 80(%rsp),%r8
124 # x6 = x7
125 mov %r8,%rax
126 # (uint64) x7 >>= 32
127 shr $32,%r8
128 # x9 = j8
129 movq 88(%rsp),%r10
130 # x8 = x9
131 mov %r10,%r11
132 # (uint64) x9 >>= 32
133 shr $32,%r10
134 # x11 = j10
135 movq 96(%rsp),%r12
136 # x10 = x11
137 mov %r12,%r13
138 # x10_stack = x10
139 movq %r13,168(%rsp)
140 # (uint64) x11 >>= 32
141 shr $32,%r12
142 # x13 = j12
143 movq 104(%rsp),%r13
144 # x12 = x13
145 mov %r13,%r14
146 # (uint64) x13 >>= 32
147 shr $32,%r13
148 # x15 = j14
149 movq 112(%rsp),%r15
150 # x14 = x15
151 mov %r15,%rbx
152 # (uint64) x15 >>= 32
153 shr $32,%r15
154 # x15_stack = x15
155 movq %r15,176(%rsp)
156 # i = 20
157 mov $20,%r15
158# mainloop:
159._mainloop:
160 # i_backup = i
161 movq %r15,184(%rsp)
162 # x5 = x5_stack
163 movq 160(%rsp),%r15
164 # a = x12 + x0
165 lea (%r14,%rdx),%rbp
166 # (uint32) a <<<= 7
167 rol $7,%ebp
168 # x4 ^= a
169 xor %rbp,%r9
170 # b = x1 + x5
171 lea (%rdi,%r15),%rbp
172 # (uint32) b <<<= 7
173 rol $7,%ebp
174 # x9 ^= b
175 xor %rbp,%r10
176 # a = x0 + x4
177 lea (%rdx,%r9),%rbp
178 # (uint32) a <<<= 9
179 rol $9,%ebp
180 # x8 ^= a
181 xor %rbp,%r11
182 # b = x5 + x9
183 lea (%r15,%r10),%rbp
184 # (uint32) b <<<= 9
185 rol $9,%ebp
186 # x13 ^= b
187 xor %rbp,%r13
188 # a = x4 + x8
189 lea (%r9,%r11),%rbp
190 # (uint32) a <<<= 13
191 rol $13,%ebp
192 # x12 ^= a
193 xor %rbp,%r14
194 # b = x9 + x13
195 lea (%r10,%r13),%rbp
196 # (uint32) b <<<= 13
197 rol $13,%ebp
198 # x1 ^= b
199 xor %rbp,%rdi
200 # a = x8 + x12
201 lea (%r11,%r14),%rbp
202 # (uint32) a <<<= 18
203 rol $18,%ebp
204 # x0 ^= a
205 xor %rbp,%rdx
206 # b = x13 + x1
207 lea (%r13,%rdi),%rbp
208 # (uint32) b <<<= 18
209 rol $18,%ebp
210 # x5 ^= b
211 xor %rbp,%r15
212 # x10 = x10_stack
213 movq 168(%rsp),%rbp
214 # x5_stack = x5
215 movq %r15,160(%rsp)
216 # c = x6 + x10
217 lea (%rax,%rbp),%r15
218 # (uint32) c <<<= 7
219 rol $7,%r15d
220 # x14 ^= c
221 xor %r15,%rbx
222 # c = x10 + x14
223 lea (%rbp,%rbx),%r15
224 # (uint32) c <<<= 9
225 rol $9,%r15d
226 # x2 ^= c
227 xor %r15,%rcx
228 # c = x14 + x2
229 lea (%rbx,%rcx),%r15
230 # (uint32) c <<<= 13
231 rol $13,%r15d
232 # x6 ^= c
233 xor %r15,%rax
234 # c = x2 + x6
235 lea (%rcx,%rax),%r15
236 # (uint32) c <<<= 18
237 rol $18,%r15d
238 # x10 ^= c
239 xor %r15,%rbp
240 # x15 = x15_stack
241 movq 176(%rsp),%r15
242 # x10_stack = x10
243 movq %rbp,168(%rsp)
244 # d = x11 + x15
245 lea (%r12,%r15),%rbp
246 # (uint32) d <<<= 7
247 rol $7,%ebp
248 # x3 ^= d
249 xor %rbp,%rsi
250 # d = x15 + x3
251 lea (%r15,%rsi),%rbp
252 # (uint32) d <<<= 9
253 rol $9,%ebp
254 # x7 ^= d
255 xor %rbp,%r8
256 # d = x3 + x7
257 lea (%rsi,%r8),%rbp
258 # (uint32) d <<<= 13
259 rol $13,%ebp
260 # x11 ^= d
261 xor %rbp,%r12
262 # d = x7 + x11
263 lea (%r8,%r12),%rbp
264 # (uint32) d <<<= 18
265 rol $18,%ebp
266 # x15 ^= d
267 xor %rbp,%r15
268 # x15_stack = x15
269 movq %r15,176(%rsp)
270 # x5 = x5_stack
271 movq 160(%rsp),%r15
272 # a = x3 + x0
273 lea (%rsi,%rdx),%rbp
274 # (uint32) a <<<= 7
275 rol $7,%ebp
276 # x1 ^= a
277 xor %rbp,%rdi
278 # b = x4 + x5
279 lea (%r9,%r15),%rbp
280 # (uint32) b <<<= 7
281 rol $7,%ebp
282 # x6 ^= b
283 xor %rbp,%rax
284 # a = x0 + x1
285 lea (%rdx,%rdi),%rbp
286 # (uint32) a <<<= 9
287 rol $9,%ebp
288 # x2 ^= a
289 xor %rbp,%rcx
290 # b = x5 + x6
291 lea (%r15,%rax),%rbp
292 # (uint32) b <<<= 9
293 rol $9,%ebp
294 # x7 ^= b
295 xor %rbp,%r8
296 # a = x1 + x2
297 lea (%rdi,%rcx),%rbp
298 # (uint32) a <<<= 13
299 rol $13,%ebp
300 # x3 ^= a
301 xor %rbp,%rsi
302 # b = x6 + x7
303 lea (%rax,%r8),%rbp
304 # (uint32) b <<<= 13
305 rol $13,%ebp
306 # x4 ^= b
307 xor %rbp,%r9
308 # a = x2 + x3
309 lea (%rcx,%rsi),%rbp
310 # (uint32) a <<<= 18
311 rol $18,%ebp
312 # x0 ^= a
313 xor %rbp,%rdx
314 # b = x7 + x4
315 lea (%r8,%r9),%rbp
316 # (uint32) b <<<= 18
317 rol $18,%ebp
318 # x5 ^= b
319 xor %rbp,%r15
320 # x10 = x10_stack
321 movq 168(%rsp),%rbp
322 # x5_stack = x5
323 movq %r15,160(%rsp)
324 # c = x9 + x10
325 lea (%r10,%rbp),%r15
326 # (uint32) c <<<= 7
327 rol $7,%r15d
328 # x11 ^= c
329 xor %r15,%r12
330 # c = x10 + x11
331 lea (%rbp,%r12),%r15
332 # (uint32) c <<<= 9
333 rol $9,%r15d
334 # x8 ^= c
335 xor %r15,%r11
336 # c = x11 + x8
337 lea (%r12,%r11),%r15
338 # (uint32) c <<<= 13
339 rol $13,%r15d
340 # x9 ^= c
341 xor %r15,%r10
342 # c = x8 + x9
343 lea (%r11,%r10),%r15
344 # (uint32) c <<<= 18
345 rol $18,%r15d
346 # x10 ^= c
347 xor %r15,%rbp
348 # x15 = x15_stack
349 movq 176(%rsp),%r15
350 # x10_stack = x10
351 movq %rbp,168(%rsp)
352 # d = x14 + x15
353 lea (%rbx,%r15),%rbp
354 # (uint32) d <<<= 7
355 rol $7,%ebp
356 # x12 ^= d
357 xor %rbp,%r14
358 # d = x15 + x12
359 lea (%r15,%r14),%rbp
360 # (uint32) d <<<= 9
361 rol $9,%ebp
362 # x13 ^= d
363 xor %rbp,%r13
364 # d = x12 + x13
365 lea (%r14,%r13),%rbp
366 # (uint32) d <<<= 13
367 rol $13,%ebp
368 # x14 ^= d
369 xor %rbp,%rbx
370 # d = x13 + x14
371 lea (%r13,%rbx),%rbp
372 # (uint32) d <<<= 18
373 rol $18,%ebp
374 # x15 ^= d
375 xor %rbp,%r15
376 # x15_stack = x15
377 movq %r15,176(%rsp)
378 # x5 = x5_stack
379 movq 160(%rsp),%r15
380 # a = x12 + x0
381 lea (%r14,%rdx),%rbp
382 # (uint32) a <<<= 7
383 rol $7,%ebp
384 # x4 ^= a
385 xor %rbp,%r9
386 # b = x1 + x5
387 lea (%rdi,%r15),%rbp
388 # (uint32) b <<<= 7
389 rol $7,%ebp
390 # x9 ^= b
391 xor %rbp,%r10
392 # a = x0 + x4
393 lea (%rdx,%r9),%rbp
394 # (uint32) a <<<= 9
395 rol $9,%ebp
396 # x8 ^= a
397 xor %rbp,%r11
398 # b = x5 + x9
399 lea (%r15,%r10),%rbp
400 # (uint32) b <<<= 9
401 rol $9,%ebp
402 # x13 ^= b
403 xor %rbp,%r13
404 # a = x4 + x8
405 lea (%r9,%r11),%rbp
406 # (uint32) a <<<= 13
407 rol $13,%ebp
408 # x12 ^= a
409 xor %rbp,%r14
410 # b = x9 + x13
411 lea (%r10,%r13),%rbp
412 # (uint32) b <<<= 13
413 rol $13,%ebp
414 # x1 ^= b
415 xor %rbp,%rdi
416 # a = x8 + x12
417 lea (%r11,%r14),%rbp
418 # (uint32) a <<<= 18
419 rol $18,%ebp
420 # x0 ^= a
421 xor %rbp,%rdx
422 # b = x13 + x1
423 lea (%r13,%rdi),%rbp
424 # (uint32) b <<<= 18
425 rol $18,%ebp
426 # x5 ^= b
427 xor %rbp,%r15
428 # x10 = x10_stack
429 movq 168(%rsp),%rbp
430 # x5_stack = x5
431 movq %r15,160(%rsp)
432 # c = x6 + x10
433 lea (%rax,%rbp),%r15
434 # (uint32) c <<<= 7
435 rol $7,%r15d
436 # x14 ^= c
437 xor %r15,%rbx
438 # c = x10 + x14
439 lea (%rbp,%rbx),%r15
440 # (uint32) c <<<= 9
441 rol $9,%r15d
442 # x2 ^= c
443 xor %r15,%rcx
444 # c = x14 + x2
445 lea (%rbx,%rcx),%r15
446 # (uint32) c <<<= 13
447 rol $13,%r15d
448 # x6 ^= c
449 xor %r15,%rax
450 # c = x2 + x6
451 lea (%rcx,%rax),%r15
452 # (uint32) c <<<= 18
453 rol $18,%r15d
454 # x10 ^= c
455 xor %r15,%rbp
456 # x15 = x15_stack
457 movq 176(%rsp),%r15
458 # x10_stack = x10
459 movq %rbp,168(%rsp)
460 # d = x11 + x15
461 lea (%r12,%r15),%rbp
462 # (uint32) d <<<= 7
463 rol $7,%ebp
464 # x3 ^= d
465 xor %rbp,%rsi
466 # d = x15 + x3
467 lea (%r15,%rsi),%rbp
468 # (uint32) d <<<= 9
469 rol $9,%ebp
470 # x7 ^= d
471 xor %rbp,%r8
472 # d = x3 + x7
473 lea (%rsi,%r8),%rbp
474 # (uint32) d <<<= 13
475 rol $13,%ebp
476 # x11 ^= d
477 xor %rbp,%r12
478 # d = x7 + x11
479 lea (%r8,%r12),%rbp
480 # (uint32) d <<<= 18
481 rol $18,%ebp
482 # x15 ^= d
483 xor %rbp,%r15
484 # x15_stack = x15
485 movq %r15,176(%rsp)
486 # x5 = x5_stack
487 movq 160(%rsp),%r15
488 # a = x3 + x0
489 lea (%rsi,%rdx),%rbp
490 # (uint32) a <<<= 7
491 rol $7,%ebp
492 # x1 ^= a
493 xor %rbp,%rdi
494 # b = x4 + x5
495 lea (%r9,%r15),%rbp
496 # (uint32) b <<<= 7
497 rol $7,%ebp
498 # x6 ^= b
499 xor %rbp,%rax
500 # a = x0 + x1
501 lea (%rdx,%rdi),%rbp
502 # (uint32) a <<<= 9
503 rol $9,%ebp
504 # x2 ^= a
505 xor %rbp,%rcx
506 # b = x5 + x6
507 lea (%r15,%rax),%rbp
508 # (uint32) b <<<= 9
509 rol $9,%ebp
510 # x7 ^= b
511 xor %rbp,%r8
512 # a = x1 + x2
513 lea (%rdi,%rcx),%rbp
514 # (uint32) a <<<= 13
515 rol $13,%ebp
516 # x3 ^= a
517 xor %rbp,%rsi
518 # b = x6 + x7
519 lea (%rax,%r8),%rbp
520 # (uint32) b <<<= 13
521 rol $13,%ebp
522 # x4 ^= b
523 xor %rbp,%r9
524 # a = x2 + x3
525 lea (%rcx,%rsi),%rbp
526 # (uint32) a <<<= 18
527 rol $18,%ebp
528 # x0 ^= a
529 xor %rbp,%rdx
530 # b = x7 + x4
531 lea (%r8,%r9),%rbp
532 # (uint32) b <<<= 18
533 rol $18,%ebp
534 # x5 ^= b
535 xor %rbp,%r15
536 # x10 = x10_stack
537 movq 168(%rsp),%rbp
538 # x5_stack = x5
539 movq %r15,160(%rsp)
540 # c = x9 + x10
541 lea (%r10,%rbp),%r15
542 # (uint32) c <<<= 7
543 rol $7,%r15d
544 # x11 ^= c
545 xor %r15,%r12
546 # c = x10 + x11
547 lea (%rbp,%r12),%r15
548 # (uint32) c <<<= 9
549 rol $9,%r15d
550 # x8 ^= c
551 xor %r15,%r11
552 # c = x11 + x8
553 lea (%r12,%r11),%r15
554 # (uint32) c <<<= 13
555 rol $13,%r15d
556 # x9 ^= c
557 xor %r15,%r10
558 # c = x8 + x9
559 lea (%r11,%r10),%r15
560 # (uint32) c <<<= 18
561 rol $18,%r15d
562 # x10 ^= c
563 xor %r15,%rbp
564 # x15 = x15_stack
565 movq 176(%rsp),%r15
566 # x10_stack = x10
567 movq %rbp,168(%rsp)
568 # d = x14 + x15
569 lea (%rbx,%r15),%rbp
570 # (uint32) d <<<= 7
571 rol $7,%ebp
572 # x12 ^= d
573 xor %rbp,%r14
574 # d = x15 + x12
575 lea (%r15,%r14),%rbp
576 # (uint32) d <<<= 9
577 rol $9,%ebp
578 # x13 ^= d
579 xor %rbp,%r13
580 # d = x12 + x13
581 lea (%r14,%r13),%rbp
582 # (uint32) d <<<= 13
583 rol $13,%ebp
584 # x14 ^= d
585 xor %rbp,%rbx
586 # d = x13 + x14
587 lea (%r13,%rbx),%rbp
588 # (uint32) d <<<= 18
589 rol $18,%ebp
590 # x15 ^= d
591 xor %rbp,%r15
592 # x15_stack = x15
593 movq %r15,176(%rsp)
594 # i = i_backup
595 movq 184(%rsp),%r15
596 # unsigned>? i -= 4
597 sub $4,%r15
598 # comment:fp stack unchanged by jump
599 # goto mainloop if unsigned>
600 ja ._mainloop
601 # (uint32) x2 += j2
602 addl 64(%rsp),%ecx
603 # x3 <<= 32
604 shl $32,%rsi
605 # x3 += j2
606 addq 64(%rsp),%rsi
607 # (uint64) x3 >>= 32
608 shr $32,%rsi
609 # x3 <<= 32
610 shl $32,%rsi
611 # x2 += x3
612 add %rsi,%rcx
613 # (uint32) x6 += j6
614 addl 80(%rsp),%eax
615 # x7 <<= 32
616 shl $32,%r8
617 # x7 += j6
618 addq 80(%rsp),%r8
619 # (uint64) x7 >>= 32
620 shr $32,%r8
621 # x7 <<= 32
622 shl $32,%r8
623 # x6 += x7
624 add %r8,%rax
625 # (uint32) x8 += j8
626 addl 88(%rsp),%r11d
627 # x9 <<= 32
628 shl $32,%r10
629 # x9 += j8
630 addq 88(%rsp),%r10
631 # (uint64) x9 >>= 32
632 shr $32,%r10
633 # x9 <<= 32
634 shl $32,%r10
635 # x8 += x9
636 add %r10,%r11
637 # (uint32) x12 += j12
638 addl 104(%rsp),%r14d
639 # x13 <<= 32
640 shl $32,%r13
641 # x13 += j12
642 addq 104(%rsp),%r13
643 # (uint64) x13 >>= 32
644 shr $32,%r13
645 # x13 <<= 32
646 shl $32,%r13
647 # x12 += x13
648 add %r13,%r14
649 # (uint32) x0 += j0
650 addl 56(%rsp),%edx
651 # x1 <<= 32
652 shl $32,%rdi
653 # x1 += j0
654 addq 56(%rsp),%rdi
655 # (uint64) x1 >>= 32
656 shr $32,%rdi
657 # x1 <<= 32
658 shl $32,%rdi
659 # x0 += x1
660 add %rdi,%rdx
661 # x5 = x5_stack
662 movq 160(%rsp),%rdi
663 # (uint32) x4 += j4
664 addl 72(%rsp),%r9d
665 # x5 <<= 32
666 shl $32,%rdi
667 # x5 += j4
668 addq 72(%rsp),%rdi
669 # (uint64) x5 >>= 32
670 shr $32,%rdi
671 # x5 <<= 32
672 shl $32,%rdi
673 # x4 += x5
674 add %rdi,%r9
675 # x10 = x10_stack
676 movq 168(%rsp),%r8
677 # (uint32) x10 += j10
678 addl 96(%rsp),%r8d
679 # x11 <<= 32
680 shl $32,%r12
681 # x11 += j10
682 addq 96(%rsp),%r12
683 # (uint64) x11 >>= 32
684 shr $32,%r12
685 # x11 <<= 32
686 shl $32,%r12
687 # x10 += x11
688 add %r12,%r8
689 # x15 = x15_stack
690 movq 176(%rsp),%rdi
691 # (uint32) x14 += j14
692 addl 112(%rsp),%ebx
693 # x15 <<= 32
694 shl $32,%rdi
695 # x15 += j14
696 addq 112(%rsp),%rdi
697 # (uint64) x15 >>= 32
698 shr $32,%rdi
699 # x15 <<= 32
700 shl $32,%rdi
701 # x14 += x15
702 add %rdi,%rbx
703 # out = out_backup
704 movq 136(%rsp),%rdi
705 # m = m_backup
706 movq 144(%rsp),%rsi
707 # x0 ^= *(uint64 *) (m + 0)
708 xorq 0(%rsi),%rdx
709 # *(uint64 *) (out + 0) = x0
710 movq %rdx,0(%rdi)
711 # x2 ^= *(uint64 *) (m + 8)
712 xorq 8(%rsi),%rcx
713 # *(uint64 *) (out + 8) = x2
714 movq %rcx,8(%rdi)
715 # x4 ^= *(uint64 *) (m + 16)
716 xorq 16(%rsi),%r9
717 # *(uint64 *) (out + 16) = x4
718 movq %r9,16(%rdi)
719 # x6 ^= *(uint64 *) (m + 24)
720 xorq 24(%rsi),%rax
721 # *(uint64 *) (out + 24) = x6
722 movq %rax,24(%rdi)
723 # x8 ^= *(uint64 *) (m + 32)
724 xorq 32(%rsi),%r11
725 # *(uint64 *) (out + 32) = x8
726 movq %r11,32(%rdi)
727 # x10 ^= *(uint64 *) (m + 40)
728 xorq 40(%rsi),%r8
729 # *(uint64 *) (out + 40) = x10
730 movq %r8,40(%rdi)
731 # x12 ^= *(uint64 *) (m + 48)
732 xorq 48(%rsi),%r14
733 # *(uint64 *) (out + 48) = x12
734 movq %r14,48(%rdi)
735 # x14 ^= *(uint64 *) (m + 56)
736 xorq 56(%rsi),%rbx
737 # *(uint64 *) (out + 56) = x14
738 movq %rbx,56(%rdi)
739 # bytes = bytes_backup
740 movq 152(%rsp),%rdx
741 # in8 = j8
742 movq 88(%rsp),%rcx
743 # in8 += 1
744 add $1,%rcx
745 # j8 = in8
746 movq %rcx,88(%rsp)
747 # unsigned>? unsigned<? bytes - 64
748 cmp $64,%rdx
749 # comment:fp stack unchanged by jump
750 # goto bytesatleast65 if unsigned>
751 ja ._bytesatleast65
752 # comment:fp stack unchanged by jump
753 # goto bytesatleast64 if !unsigned<
754 jae ._bytesatleast64
755 # m = out
756 mov %rdi,%rsi
757 # out = ctarget
758 movq 128(%rsp),%rdi
759 # i = bytes
760 mov %rdx,%rcx
761 # while (i) { *out++ = *m++; --i }
762 rep movsb
763 # comment:fp stack unchanged by fallthrough
764# bytesatleast64:
765._bytesatleast64:
766 # x = x_backup
767 movq 120(%rsp),%rdi
768 # in8 = j8
769 movq 88(%rsp),%rsi
770 # *(uint64 *) (x + 32) = in8
771 movq %rsi,32(%rdi)
772 # r11 = r11_stack
773 movq 0(%rsp),%r11
774 # r12 = r12_stack
775 movq 8(%rsp),%r12
776 # r13 = r13_stack
777 movq 16(%rsp),%r13
778 # r14 = r14_stack
779 movq 24(%rsp),%r14
780 # r15 = r15_stack
781 movq 32(%rsp),%r15
782 # rbx = rbx_stack
783 movq 40(%rsp),%rbx
784 # rbp = rbp_stack
785 movq 48(%rsp),%rbp
786 # comment:fp stack unchanged by fallthrough
787# done:
788._done:
789 # leave
790 add %r11,%rsp
791 mov %rdi,%rax
792 mov %rsi,%rdx
793 ret
794# bytesatleast65:
795._bytesatleast65:
796 # bytes -= 64
797 sub $64,%rdx
798 # out += 64
799 add $64,%rdi
800 # m += 64
801 add $64,%rsi
802 # comment:fp stack unchanged by jump
803 # goto bytesatleast1
804 jmp ._bytesatleast1
805# enter ECRYPT_keysetup
806.text
807.p2align 5
808.globl ECRYPT_keysetup
809ECRYPT_keysetup:
810 mov %rsp,%r11
811 and $31,%r11
812 add $256,%r11
813 sub %r11,%rsp
814 # k = arg2
815 mov %rsi,%rsi
816 # kbits = arg3
817 mov %rdx,%rdx
818 # x = arg1
819 mov %rdi,%rdi
820 # in0 = *(uint64 *) (k + 0)
821 movq 0(%rsi),%r8
822 # in2 = *(uint64 *) (k + 8)
823 movq 8(%rsi),%r9
824 # *(uint64 *) (x + 4) = in0
825 movq %r8,4(%rdi)
826 # *(uint64 *) (x + 12) = in2
827 movq %r9,12(%rdi)
828 # unsigned<? kbits - 256
829 cmp $256,%rdx
830 # comment:fp stack unchanged by jump
831 # goto kbits128 if unsigned<
832 jb ._kbits128
833# kbits256:
834._kbits256:
835 # in10 = *(uint64 *) (k + 16)
836 movq 16(%rsi),%rdx
837 # in12 = *(uint64 *) (k + 24)
838 movq 24(%rsi),%rsi
839 # *(uint64 *) (x + 44) = in10
840 movq %rdx,44(%rdi)
841 # *(uint64 *) (x + 52) = in12
842 movq %rsi,52(%rdi)
843 # in0 = 1634760805
844 mov $1634760805,%rsi
845 # in4 = 857760878
846 mov $857760878,%rdx
847 # in10 = 2036477234
848 mov $2036477234,%rcx
849 # in14 = 1797285236
850 mov $1797285236,%r8
851 # *(uint32 *) (x + 0) = in0
852 movl %esi,0(%rdi)
853 # *(uint32 *) (x + 20) = in4
854 movl %edx,20(%rdi)
855 # *(uint32 *) (x + 40) = in10
856 movl %ecx,40(%rdi)
857 # *(uint32 *) (x + 60) = in14
858 movl %r8d,60(%rdi)
859 # comment:fp stack unchanged by jump
860 # goto keysetupdone
861 jmp ._keysetupdone
862# kbits128:
863._kbits128:
864 # in10 = *(uint64 *) (k + 0)
865 movq 0(%rsi),%rdx
866 # in12 = *(uint64 *) (k + 8)
867 movq 8(%rsi),%rsi
868 # *(uint64 *) (x + 44) = in10
869 movq %rdx,44(%rdi)
870 # *(uint64 *) (x + 52) = in12
871 movq %rsi,52(%rdi)
872 # in0 = 1634760805
873 mov $1634760805,%rsi
874 # in4 = 824206446
875 mov $824206446,%rdx
876 # in10 = 2036477238
877 mov $2036477238,%rcx
878 # in14 = 1797285236
879 mov $1797285236,%r8
880 # *(uint32 *) (x + 0) = in0
881 movl %esi,0(%rdi)
882 # *(uint32 *) (x + 20) = in4
883 movl %edx,20(%rdi)
884 # *(uint32 *) (x + 40) = in10
885 movl %ecx,40(%rdi)
886 # *(uint32 *) (x + 60) = in14
887 movl %r8d,60(%rdi)
888# keysetupdone:
889._keysetupdone:
890 # leave
891 add %r11,%rsp
892 mov %rdi,%rax
893 mov %rsi,%rdx
894 ret
895# enter ECRYPT_ivsetup
896.text
897.p2align 5
898.globl ECRYPT_ivsetup
899ECRYPT_ivsetup:
900 mov %rsp,%r11
901 and $31,%r11
902 add $256,%r11
903 sub %r11,%rsp
904 # iv = arg2
905 mov %rsi,%rsi
906 # x = arg1
907 mov %rdi,%rdi
908 # in6 = *(uint64 *) (iv + 0)
909 movq 0(%rsi),%rsi
910 # in8 = 0
911 mov $0,%r8
912 # *(uint64 *) (x + 24) = in6
913 movq %rsi,24(%rdi)
914 # *(uint64 *) (x + 32) = in8
915 movq %r8,32(%rdi)
916 # leave
917 add %r11,%rsp
918 mov %rdi,%rax
919 mov %rsi,%rdx
920 ret
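
The immediates loaded by ECRYPT_keysetup above are the usual Salsa20 diagonal constants: 1634760805, 857760878, 2036477234 and 1797285236 are "expa", "nd 3", "2-by" and "te k" read as little-endian ASCII ("expand 32-byte k"), while the 128-bit path substitutes 824206446 and 2036477238 ("nd 1", "6-by") and reuses the same 16 key bytes for both key halves. A short sketch of the resulting 16-word state, matching the byte offsets the assembly writes (constants at 0/20/40/60, key at 4..16 and 44..56, nonce at 24/28, block counter at 32/36) and assuming a little-endian machine such as x86:

#include <stdint.h>
#include <string.h>

/* build the initial Salsa20 state the way ECRYPT_keysetup/ECRYPT_ivsetup
 * above do for the 256-bit key case */
static void salsa20_init_state_sketch(uint32_t x[16],
				      const uint8_t k[32], const uint8_t iv[8])
{
	static const char sigma[16] = "expand 32-byte k";

	memcpy(&x[0],  &sigma[0],  4);	/* x0  = "expa" */
	memcpy(&x[1],  &k[0],     16);	/* x1..x4   = key words 0..3  */
	memcpy(&x[5],  &sigma[4],  4);	/* x5  = "nd 3" */
	memcpy(&x[6],  &iv[0],     8);	/* x6, x7   = nonce           */
	x[8] = 0;			/* x8, x9   = 64-bit counter  */
	x[9] = 0;
	memcpy(&x[10], &sigma[8],  4);	/* x10 = "2-by" */
	memcpy(&x[11], &k[16],    16);	/* x11..x14 = key words 4..7  */
	memcpy(&x[15], &sigma[12], 4);	/* x15 = "te k" */
}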
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
new file mode 100644
index 00000000000..bccb76d8098
--- /dev/null
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -0,0 +1,129 @@
1/*
2 * Glue code for optimized assembly version of Salsa20.
3 *
4 * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
5 *
6 * The assembly code is public domain code written by Daniel J.
7 * Bernstein <djb@cr.yp.to>. The code has been modified to add indentation
8 * and to remove extraneous comments and functions that are not needed.
9 * - i586 version, renamed as salsa20-i586-asm_32.S
10 * available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
11 * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
12 * available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option)
17 * any later version.
18 *
19 */
20
21#include <crypto/algapi.h>
22#include <linux/module.h>
23#include <linux/crypto.h>
24
25#define SALSA20_IV_SIZE 8U
26#define SALSA20_MIN_KEY_SIZE 16U
27#define SALSA20_MAX_KEY_SIZE 32U
28
29// use the ECRYPT_* function names
30#define salsa20_keysetup ECRYPT_keysetup
31#define salsa20_ivsetup ECRYPT_ivsetup
32#define salsa20_encrypt_bytes ECRYPT_encrypt_bytes
33
34struct salsa20_ctx
35{
36 u32 input[16];
37};
38
39asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k,
40 u32 keysize, u32 ivsize);
41asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv);
42asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx,
43 const u8 *src, u8 *dst, u32 bytes);
44
45static int setkey(struct crypto_tfm *tfm, const u8 *key,
46 unsigned int keysize)
47{
48 struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm);
49 salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8);
50 return 0;
51}
52
53static int encrypt(struct blkcipher_desc *desc,
54 struct scatterlist *dst, struct scatterlist *src,
55 unsigned int nbytes)
56{
57 struct blkcipher_walk walk;
58 struct crypto_blkcipher *tfm = desc->tfm;
59 struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm);
60 int err;
61
62 blkcipher_walk_init(&walk, dst, src, nbytes);
63 err = blkcipher_walk_virt_block(desc, &walk, 64);
64
65 salsa20_ivsetup(ctx, walk.iv);
66
67 if (likely(walk.nbytes == nbytes))
68 {
69 salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
70 walk.dst.virt.addr, nbytes);
71 return blkcipher_walk_done(desc, &walk, 0);
72 }
73
74 while (walk.nbytes >= 64) {
75 salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
76 walk.dst.virt.addr,
77 walk.nbytes - (walk.nbytes % 64));
78 err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64);
79 }
80
81 if (walk.nbytes) {
82 salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
83 walk.dst.virt.addr, walk.nbytes);
84 err = blkcipher_walk_done(desc, &walk, 0);
85 }
86
87 return err;
88}
89
90static struct crypto_alg alg = {
91 .cra_name = "salsa20",
92 .cra_driver_name = "salsa20-asm",
93 .cra_priority = 200,
94 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
95 .cra_type = &crypto_blkcipher_type,
96 .cra_blocksize = 1,
97 .cra_ctxsize = sizeof(struct salsa20_ctx),
98 .cra_alignmask = 3,
99 .cra_module = THIS_MODULE,
100 .cra_list = LIST_HEAD_INIT(alg.cra_list),
101 .cra_u = {
102 .blkcipher = {
103 .setkey = setkey,
104 .encrypt = encrypt,
105 .decrypt = encrypt,
106 .min_keysize = SALSA20_MIN_KEY_SIZE,
107 .max_keysize = SALSA20_MAX_KEY_SIZE,
108 .ivsize = SALSA20_IV_SIZE,
109 }
110 }
111};
112
113static int __init init(void)
114{
115 return crypto_register_alg(&alg);
116}
117
118static void __exit fini(void)
119{
120 crypto_unregister_alg(&alg);
121}
122
123module_init(init);
124module_exit(fini);
125
126MODULE_LICENSE("GPL");
127MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
128MODULE_ALIAS("salsa20");
129MODULE_ALIAS("salsa20-asm");
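
Because the registration above exposes Salsa20 through the blkcipher interface (block size 1, IV size 8), callers go through crypto_blkcipher rather than the raw ECRYPT_* entry points. A rough usage sketch, assuming the data sits in a single scatterlist-mappable buffer and the key is 16 or 32 bytes:

#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

/* encrypt len bytes in place with an 8-byte nonce */
static int salsa20_encrypt_sketch(const u8 *key, unsigned int keylen,
				  const u8 iv[8], u8 *buf, unsigned int len)
{
	struct crypto_blkcipher *tfm;
	struct blkcipher_desc desc;
	struct scatterlist sg;
	int err;

	tfm = crypto_alloc_blkcipher("salsa20", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	desc.tfm = tfm;
	desc.flags = 0;

	err = crypto_blkcipher_setkey(tfm, key, keylen);
	if (!err) {
		crypto_blkcipher_set_iv(tfm, iv, 8);
		sg_init_one(&sg, buf, len);
		err = crypto_blkcipher_encrypt(&desc, &sg, &sg, len);
	}

	crypto_free_blkcipher(tfm);
	return err;
}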
diff --git a/arch/x86/crypto/twofish_64.c b/arch/x86/crypto/twofish_64.c
deleted file mode 100644
index 182d91d5cfb..00000000000
--- a/arch/x86/crypto/twofish_64.c
+++ /dev/null
@@ -1,97 +0,0 @@
1/*
2 * Glue Code for optimized x86_64 assembler version of TWOFISH
3 *
4 * Originally Twofish for GPG
5 * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
6 * 256-bit key length added March 20, 1999
7 * Some modifications to reduce the text size by Werner Koch, April, 1998
8 * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
9 * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
10 *
11 * The original author has disclaimed all copyright interest in this
12 * code and thus put it in the public domain. The subsequent authors
13 * have put this under the GNU General Public License.
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
28 * USA
29 *
30 * This code is a "clean room" implementation, written from the paper
31 * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
32 * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
33 * through http://www.counterpane.com/twofish.html
34 *
35 * For background information on multiplication in finite fields, used for
36 * the matrix operations in the key schedule, see the book _Contemporary
37 * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
38 * Third Edition.
39 */
40
41#include <crypto/twofish.h>
42#include <linux/crypto.h>
43#include <linux/init.h>
44#include <linux/kernel.h>
45#include <linux/module.h>
46#include <linux/types.h>
47
48asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
49asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
50
51static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
52{
53 twofish_enc_blk(tfm, dst, src);
54}
55
56static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
57{
58 twofish_dec_blk(tfm, dst, src);
59}
60
61static struct crypto_alg alg = {
62 .cra_name = "twofish",
63 .cra_driver_name = "twofish-x86_64",
64 .cra_priority = 200,
65 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
66 .cra_blocksize = TF_BLOCK_SIZE,
67 .cra_ctxsize = sizeof(struct twofish_ctx),
68 .cra_alignmask = 3,
69 .cra_module = THIS_MODULE,
70 .cra_list = LIST_HEAD_INIT(alg.cra_list),
71 .cra_u = {
72 .cipher = {
73 .cia_min_keysize = TF_MIN_KEY_SIZE,
74 .cia_max_keysize = TF_MAX_KEY_SIZE,
75 .cia_setkey = twofish_setkey,
76 .cia_encrypt = twofish_encrypt,
77 .cia_decrypt = twofish_decrypt
78 }
79 }
80};
81
82static int __init init(void)
83{
84 return crypto_register_alg(&alg);
85}
86
87static void __exit fini(void)
88{
89 crypto_unregister_alg(&alg);
90}
91
92module_init(init);
93module_exit(fini);
94
95MODULE_LICENSE("GPL");
96MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
97MODULE_ALIAS("twofish");
diff --git a/arch/x86/crypto/twofish_32.c b/arch/x86/crypto/twofish_glue.c
index e3004dfe9c7..cefaf8b9aa1 100644
--- a/arch/x86/crypto/twofish_32.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Glue Code for optimized 586 assembler version of TWOFISH 2 * Glue Code for assembler optimized version of TWOFISH
3 * 3 *
4 * Originally Twofish for GPG 4 * Originally Twofish for GPG
5 * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998 5 * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
@@ -44,7 +44,6 @@
44#include <linux/module.h> 44#include <linux/module.h>
45#include <linux/types.h> 45#include <linux/types.h>
46 46
47
48asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); 47asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
49asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); 48asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
50 49
@@ -60,7 +59,7 @@ static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
60 59
61static struct crypto_alg alg = { 60static struct crypto_alg alg = {
62 .cra_name = "twofish", 61 .cra_name = "twofish",
63 .cra_driver_name = "twofish-i586", 62 .cra_driver_name = "twofish-asm",
64 .cra_priority = 200, 63 .cra_priority = 200,
65 .cra_flags = CRYPTO_ALG_TYPE_CIPHER, 64 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
66 .cra_blocksize = TF_BLOCK_SIZE, 65 .cra_blocksize = TF_BLOCK_SIZE,
@@ -93,5 +92,6 @@ module_init(init);
93module_exit(fini); 92module_exit(fini);
94 93
95MODULE_LICENSE("GPL"); 94MODULE_LICENSE("GPL");
96MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized"); 95MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized");
97MODULE_ALIAS("twofish"); 96MODULE_ALIAS("twofish");
97MODULE_ALIAS("twofish-asm");
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e2edda255a8..52d0ccfcf6e 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -2,9 +2,7 @@
 # Makefile for the ia32 kernel emulation subsystem.
 #
 
-obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \
-	ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o \
-	mmap32.o
+obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
 
 sysv-$(CONFIG_SYSVIPC) := ipc32.o
 obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
@@ -13,40 +11,3 @@ obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
 
 audit-class-$(CONFIG_AUDIT) := audit.o
 obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
-
-$(obj)/syscall32_syscall.o: \
-	$(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so)
-
-# Teach kbuild about targets
-targets := $(foreach F,$(addprefix vsyscall-,sysenter syscall),\
-	$F.o $F.so $F.so.dbg)
-
-# The DSO images are built using a special linker script
-quiet_cmd_syscall = SYSCALL $@
-      cmd_syscall = $(CC) -m32 -nostdlib -shared \
-			  $(call ld-option, -Wl$(comma)--hash-style=sysv) \
-			   -Wl,-soname=linux-gate.so.1 -o $@ \
-			   -Wl,-T,$(filter-out FORCE,$^)
-
-$(obj)/%.so: OBJCOPYFLAGS := -S
-$(obj)/%.so: $(obj)/%.so.dbg FORCE
-	$(call if_changed,objcopy)
-
-$(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \
-$(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
-	$(call if_changed,syscall)
-
-AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
-AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
-
-vdsos := vdso32-sysenter.so vdso32-syscall.so
-
-quiet_cmd_vdso_install = INSTALL $@
-      cmd_vdso_install = cp $(@:vdso32-%.so=$(obj)/vsyscall-%.so.dbg) \
-			 $(MODLIB)/vdso/$@
-
-$(vdsos):
-	@mkdir -p $(MODLIB)/vdso
-	$(call cmd,vdso_install)
-
-vdso_install: $(vdsos)
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 91b7b5922df..5d7b381da69 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -27,7 +27,7 @@ unsigned ia32_signal_class[] = {
 
 int ia32_classify_syscall(unsigned syscall)
 {
-	switch(syscall) {
+	switch (syscall) {
 	case __NR_open:
 		return 2;
 	case __NR_openat:
diff --git a/arch/x86/ia32/fpu32.c b/arch/x86/ia32/fpu32.c
deleted file mode 100644
index 2c8209a3605..00000000000
--- a/arch/x86/ia32/fpu32.c
+++ /dev/null
@@ -1,183 +0,0 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.
4 * This is used for ptrace, signals and coredumps in 32bit emulation.
5 */
6
7#include <linux/sched.h>
8#include <asm/sigcontext32.h>
9#include <asm/processor.h>
10#include <asm/uaccess.h>
11#include <asm/i387.h>
12
13static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
14{
15 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
16
17 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
18 tmp = ~twd;
19 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
20 /* and move the valid bits to the lower byte. */
21 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
22 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
23 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
24 return tmp;
25}
26
27static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
28{
29 struct _fpxreg *st = NULL;
30 unsigned long tos = (fxsave->swd >> 11) & 7;
31 unsigned long twd = (unsigned long) fxsave->twd;
32 unsigned long tag;
33 unsigned long ret = 0xffff0000;
34 int i;
35
36#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
37
38 for (i = 0 ; i < 8 ; i++) {
39 if (twd & 0x1) {
40 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
41
42 switch (st->exponent & 0x7fff) {
43 case 0x7fff:
44 tag = 2; /* Special */
45 break;
46 case 0x0000:
47 if ( !st->significand[0] &&
48 !st->significand[1] &&
49 !st->significand[2] &&
50 !st->significand[3] ) {
51 tag = 1; /* Zero */
52 } else {
53 tag = 2; /* Special */
54 }
55 break;
56 default:
57 if (st->significand[3] & 0x8000) {
58 tag = 0; /* Valid */
59 } else {
60 tag = 2; /* Special */
61 }
62 break;
63 }
64 } else {
65 tag = 3; /* Empty */
66 }
67 ret |= (tag << (2 * i));
68 twd = twd >> 1;
69 }
70 return ret;
71}
72
73
74static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave,
75 struct _fpstate_ia32 __user *buf)
76{
77 struct _fpxreg *to;
78 struct _fpreg __user *from;
79 int i;
80 u32 v;
81 int err = 0;
82
83#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf)
84 G(0, fxsave->cwd);
85 G(1, fxsave->swd);
86 G(2, fxsave->twd);
87 fxsave->twd = twd_i387_to_fxsr(fxsave->twd);
88 G(3, fxsave->rip);
89 G(4, v);
90 fxsave->fop = v>>16; /* cs ignored */
91 G(5, fxsave->rdp);
92 /* 6: ds ignored */
93#undef G
94 if (err)
95 return -1;
96
97 to = (struct _fpxreg *)&fxsave->st_space[0];
98 from = &buf->_st[0];
99 for (i = 0 ; i < 8 ; i++, to++, from++) {
100 if (__copy_from_user(to, from, sizeof(*from)))
101 return -1;
102 }
103 return 0;
104}
105
106
107static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf,
108 struct i387_fxsave_struct *fxsave,
109 struct pt_regs *regs,
110 struct task_struct *tsk)
111{
112 struct _fpreg __user *to;
113 struct _fpxreg *from;
114 int i;
115 u16 cs,ds;
116 int err = 0;
117
118 if (tsk == current) {
119 /* should be actually ds/cs at fpu exception time,
120 but that information is not available in 64bit mode. */
121 asm("movw %%ds,%0 " : "=r" (ds));
122 asm("movw %%cs,%0 " : "=r" (cs));
123 } else { /* ptrace. task has stopped. */
124 ds = tsk->thread.ds;
125 cs = regs->cs;
126 }
127
128#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf)
129 P(0, (u32)fxsave->cwd | 0xffff0000);
130 P(1, (u32)fxsave->swd | 0xffff0000);
131 P(2, twd_fxsr_to_i387(fxsave));
132 P(3, (u32)fxsave->rip);
133 P(4, cs | ((u32)fxsave->fop) << 16);
134 P(5, fxsave->rdp);
135 P(6, 0xffff0000 | ds);
136#undef P
137
138 if (err)
139 return -1;
140
141 to = &buf->_st[0];
142 from = (struct _fpxreg *) &fxsave->st_space[0];
143 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
144 if (__copy_to_user(to, from, sizeof(*to)))
145 return -1;
146 }
147 return 0;
148}
149
150int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave)
151{
152 clear_fpu(tsk);
153 if (!fsave) {
154 if (__copy_from_user(&tsk->thread.i387.fxsave,
155 &buf->_fxsr_env[0],
156 sizeof(struct i387_fxsave_struct)))
157 return -1;
158 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
159 set_stopped_child_used_math(tsk);
160 }
161 return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
162}
163
164int save_i387_ia32(struct task_struct *tsk,
165 struct _fpstate_ia32 __user *buf,
166 struct pt_regs *regs,
167 int fsave)
168{
169 int err = 0;
170
171 init_fpu(tsk);
172 if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk))
173 return -1;
174 if (fsave)
175 return 0;
176 err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
177 if (fsave)
178 return err ? -1 : 1;
179 err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
180 err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
181 sizeof(struct i387_fxsave_struct));
182 return err ? -1 : 1;
183}
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index f82e1a94fcb..e4c12079171 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -25,6 +25,7 @@
25#include <linux/binfmts.h> 25#include <linux/binfmts.h>
26#include <linux/personality.h> 26#include <linux/personality.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/jiffies.h>
28 29
29#include <asm/system.h> 30#include <asm/system.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
@@ -36,61 +37,67 @@
36#undef WARN_OLD 37#undef WARN_OLD
37#undef CORE_DUMP /* probably broken */ 38#undef CORE_DUMP /* probably broken */
38 39
39static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); 40static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs);
40static int load_aout_library(struct file*); 41static int load_aout_library(struct file *);
41 42
42#ifdef CORE_DUMP 43#ifdef CORE_DUMP
43static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); 44static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
45 unsigned long limit);
44 46
45/* 47/*
46 * fill in the user structure for a core dump.. 48 * fill in the user structure for a core dump..
47 */ 49 */
48static void dump_thread32(struct pt_regs * regs, struct user32 * dump) 50static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
49{ 51{
50 u32 fs,gs; 52 u32 fs, gs;
51 53
52/* changed the size calculations - should hopefully work better. lbt */ 54/* changed the size calculations - should hopefully work better. lbt */
53 dump->magic = CMAGIC; 55 dump->magic = CMAGIC;
54 dump->start_code = 0; 56 dump->start_code = 0;
55 dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1); 57 dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
56 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; 58 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
57 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; 59 dump->u_dsize = ((unsigned long)
60 (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
58 dump->u_dsize -= dump->u_tsize; 61 dump->u_dsize -= dump->u_tsize;
59 dump->u_ssize = 0; 62 dump->u_ssize = 0;
60 dump->u_debugreg[0] = current->thread.debugreg0; 63 dump->u_debugreg[0] = current->thread.debugreg0;
61 dump->u_debugreg[1] = current->thread.debugreg1; 64 dump->u_debugreg[1] = current->thread.debugreg1;
62 dump->u_debugreg[2] = current->thread.debugreg2; 65 dump->u_debugreg[2] = current->thread.debugreg2;
63 dump->u_debugreg[3] = current->thread.debugreg3; 66 dump->u_debugreg[3] = current->thread.debugreg3;
64 dump->u_debugreg[4] = 0; 67 dump->u_debugreg[4] = 0;
65 dump->u_debugreg[5] = 0; 68 dump->u_debugreg[5] = 0;
66 dump->u_debugreg[6] = current->thread.debugreg6; 69 dump->u_debugreg[6] = current->thread.debugreg6;
67 dump->u_debugreg[7] = current->thread.debugreg7; 70 dump->u_debugreg[7] = current->thread.debugreg7;
68 71
69 if (dump->start_stack < 0xc0000000) 72 if (dump->start_stack < 0xc0000000) {
70 dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT; 73 unsigned long tmp;
71 74
72 dump->regs.ebx = regs->rbx; 75 tmp = (unsigned long) (0xc0000000 - dump->start_stack);
73 dump->regs.ecx = regs->rcx; 76 dump->u_ssize = tmp >> PAGE_SHIFT;
74 dump->regs.edx = regs->rdx; 77 }
75 dump->regs.esi = regs->rsi; 78
76 dump->regs.edi = regs->rdi; 79 dump->regs.bx = regs->bx;
77 dump->regs.ebp = regs->rbp; 80 dump->regs.cx = regs->cx;
78 dump->regs.eax = regs->rax; 81 dump->regs.dx = regs->dx;
82 dump->regs.si = regs->si;
83 dump->regs.di = regs->di;
84 dump->regs.bp = regs->bp;
85 dump->regs.ax = regs->ax;
79 dump->regs.ds = current->thread.ds; 86 dump->regs.ds = current->thread.ds;
80 dump->regs.es = current->thread.es; 87 dump->regs.es = current->thread.es;
81 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; 88 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
82 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; 89 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
83 dump->regs.orig_eax = regs->orig_rax; 90 dump->regs.orig_ax = regs->orig_ax;
84 dump->regs.eip = regs->rip; 91 dump->regs.ip = regs->ip;
85 dump->regs.cs = regs->cs; 92 dump->regs.cs = regs->cs;
86 dump->regs.eflags = regs->eflags; 93 dump->regs.flags = regs->flags;
87 dump->regs.esp = regs->rsp; 94 dump->regs.sp = regs->sp;
88 dump->regs.ss = regs->ss; 95 dump->regs.ss = regs->ss;
89 96
90#if 1 /* FIXME */ 97#if 1 /* FIXME */
91 dump->u_fpvalid = 0; 98 dump->u_fpvalid = 0;
92#else 99#else
93 dump->u_fpvalid = dump_fpu (regs, &dump->i387); 100 dump->u_fpvalid = dump_fpu(regs, &dump->i387);
94#endif 101#endif
95} 102}
96 103
@@ -128,15 +135,19 @@ static int dump_write(struct file *file, const void *addr, int nr)
128 return file->f_op->write(file, addr, nr, &file->f_pos) == nr; 135 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
129} 136}
130 137
131#define DUMP_WRITE(addr, nr) \ 138#define DUMP_WRITE(addr, nr) \
132 if (!dump_write(file, (void *)(addr), (nr))) \ 139 if (!dump_write(file, (void *)(addr), (nr))) \
133 goto end_coredump; 140 goto end_coredump;
134 141
135#define DUMP_SEEK(offset) \ 142#define DUMP_SEEK(offset) \
136if (file->f_op->llseek) { \ 143 if (file->f_op->llseek) { \
137 if (file->f_op->llseek(file,(offset),0) != (offset)) \ 144 if (file->f_op->llseek(file, (offset), 0) != (offset)) \
138 goto end_coredump; \ 145 goto end_coredump; \
139} else file->f_pos = (offset) 146 } else \
147 file->f_pos = (offset)
148
149#define START_DATA() (u.u_tsize << PAGE_SHIFT)
150#define START_STACK(u) (u.start_stack)
140 151
141/* 152/*
142 * Routine writes a core dump image in the current directory. 153 * Routine writes a core dump image in the current directory.
@@ -148,62 +159,70 @@ if (file->f_op->llseek) { \
148 * dumping of the process results in another error.. 159 * dumping of the process results in another error..
149 */ 160 */
150 161
151static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) 162static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
163 unsigned long limit)
152{ 164{
153 mm_segment_t fs; 165 mm_segment_t fs;
154 int has_dumped = 0; 166 int has_dumped = 0;
155 unsigned long dump_start, dump_size; 167 unsigned long dump_start, dump_size;
156 struct user32 dump; 168 struct user32 dump;
157# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
158# define START_STACK(u) (u.start_stack)
159 169
160 fs = get_fs(); 170 fs = get_fs();
161 set_fs(KERNEL_DS); 171 set_fs(KERNEL_DS);
162 has_dumped = 1; 172 has_dumped = 1;
163 current->flags |= PF_DUMPCORE; 173 current->flags |= PF_DUMPCORE;
164 strncpy(dump.u_comm, current->comm, sizeof(current->comm)); 174 strncpy(dump.u_comm, current->comm, sizeof(current->comm));
165 dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump))); 175 dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) -
176 ((unsigned long)(&dump)));
166 dump.signal = signr; 177 dump.signal = signr;
167 dump_thread32(regs, &dump); 178 dump_thread32(regs, &dump);
168 179
169/* If the size of the dump file exceeds the rlimit, then see what would happen 180 /*
170 if we wrote the stack, but not the data area. */ 181 * If the size of the dump file exceeds the rlimit, then see
182 * what would happen if we wrote the stack, but not the data
183 * area.
184 */
171 if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > limit) 185 if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > limit)
172 dump.u_dsize = 0; 186 dump.u_dsize = 0;
173 187
174/* Make sure we have enough room to write the stack and data areas. */ 188 /* Make sure we have enough room to write the stack and data areas. */
175 if ((dump.u_ssize + 1) * PAGE_SIZE > limit) 189 if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
176 dump.u_ssize = 0; 190 dump.u_ssize = 0;
177 191
178/* make sure we actually have a data and stack area to dump */ 192 /* make sure we actually have a data and stack area to dump */
179 set_fs(USER_DS); 193 set_fs(USER_DS);
180 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 194 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump),
195 dump.u_dsize << PAGE_SHIFT))
181 dump.u_dsize = 0; 196 dump.u_dsize = 0;
182 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 197 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump),
198 dump.u_ssize << PAGE_SHIFT))
183 dump.u_ssize = 0; 199 dump.u_ssize = 0;
184 200
185 set_fs(KERNEL_DS); 201 set_fs(KERNEL_DS);
186/* struct user */ 202 /* struct user */
187 DUMP_WRITE(&dump,sizeof(dump)); 203 DUMP_WRITE(&dump, sizeof(dump));
188/* Now dump all of the user data. Include malloced stuff as well */ 204 /* Now dump all of the user data. Include malloced stuff as well */
189 DUMP_SEEK(PAGE_SIZE); 205 DUMP_SEEK(PAGE_SIZE);
190/* now we start writing out the user space info */ 206 /* now we start writing out the user space info */
191 set_fs(USER_DS); 207 set_fs(USER_DS);
192/* Dump the data area */ 208 /* Dump the data area */
193 if (dump.u_dsize != 0) { 209 if (dump.u_dsize != 0) {
194 dump_start = START_DATA(dump); 210 dump_start = START_DATA(dump);
195 dump_size = dump.u_dsize << PAGE_SHIFT; 211 dump_size = dump.u_dsize << PAGE_SHIFT;
196 DUMP_WRITE(dump_start,dump_size); 212 DUMP_WRITE(dump_start, dump_size);
197 } 213 }
198/* Now prepare to dump the stack area */ 214 /* Now prepare to dump the stack area */
199 if (dump.u_ssize != 0) { 215 if (dump.u_ssize != 0) {
200 dump_start = START_STACK(dump); 216 dump_start = START_STACK(dump);
201 dump_size = dump.u_ssize << PAGE_SHIFT; 217 dump_size = dump.u_ssize << PAGE_SHIFT;
202 DUMP_WRITE(dump_start,dump_size); 218 DUMP_WRITE(dump_start, dump_size);
203 } 219 }
204/* Finally dump the task struct. Not be used by gdb, but could be useful */ 220 /*
221 * Finally dump the task struct. Not be used by gdb, but
222 * could be useful
223 */
205 set_fs(KERNEL_DS); 224 set_fs(KERNEL_DS);
206 DUMP_WRITE(current,sizeof(*current)); 225 DUMP_WRITE(current, sizeof(*current));
207end_coredump: 226end_coredump:
208 set_fs(fs); 227 set_fs(fs);
209 return has_dumped; 228 return has_dumped;
@@ -217,35 +236,34 @@ end_coredump:
217 */ 236 */
218static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) 237static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
219{ 238{
220 u32 __user *argv; 239 u32 __user *argv, *envp, *sp;
221 u32 __user *envp; 240 int argc = bprm->argc, envc = bprm->envc;
222 u32 __user *sp;
223 int argc = bprm->argc;
224 int envc = bprm->envc;
225 241
226 sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); 242 sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
227 sp -= envc+1; 243 sp -= envc+1;
228 envp = sp; 244 envp = sp;
229 sp -= argc+1; 245 sp -= argc+1;
230 argv = sp; 246 argv = sp;
231 put_user((unsigned long) envp,--sp); 247 put_user((unsigned long) envp, --sp);
232 put_user((unsigned long) argv,--sp); 248 put_user((unsigned long) argv, --sp);
233 put_user(argc,--sp); 249 put_user(argc, --sp);
234 current->mm->arg_start = (unsigned long) p; 250 current->mm->arg_start = (unsigned long) p;
235 while (argc-->0) { 251 while (argc-- > 0) {
236 char c; 252 char c;
237 put_user((u32)(unsigned long)p,argv++); 253
254 put_user((u32)(unsigned long)p, argv++);
238 do { 255 do {
239 get_user(c,p++); 256 get_user(c, p++);
240 } while (c); 257 } while (c);
241 } 258 }
242 put_user(0, argv); 259 put_user(0, argv);
243 current->mm->arg_end = current->mm->env_start = (unsigned long) p; 260 current->mm->arg_end = current->mm->env_start = (unsigned long) p;
244 while (envc-->0) { 261 while (envc-- > 0) {
245 char c; 262 char c;
246 put_user((u32)(unsigned long)p,envp++); 263
264 put_user((u32)(unsigned long)p, envp++);
247 do { 265 do {
248 get_user(c,p++); 266 get_user(c, p++);
249 } while (c); 267 } while (c);
250 } 268 }
251 put_user(0, envp); 269 put_user(0, envp);
@@ -257,20 +275,18 @@ static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
257 * These are the functions used to load a.out style executables and shared 275 * These are the functions used to load a.out style executables and shared
258 * libraries. There is no binary dependent code anywhere else. 276 * libraries. There is no binary dependent code anywhere else.
259 */ 277 */
260 278static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
261static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
262{ 279{
280 unsigned long error, fd_offset, rlim;
263 struct exec ex; 281 struct exec ex;
264 unsigned long error;
265 unsigned long fd_offset;
266 unsigned long rlim;
267 int retval; 282 int retval;
268 283
269 ex = *((struct exec *) bprm->buf); /* exec-header */ 284 ex = *((struct exec *) bprm->buf); /* exec-header */
270 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && 285 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
271 N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || 286 N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
272 N_TRSIZE(ex) || N_DRSIZE(ex) || 287 N_TRSIZE(ex) || N_DRSIZE(ex) ||
273 i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { 288 i_size_read(bprm->file->f_path.dentry->d_inode) <
289 ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
274 return -ENOEXEC; 290 return -ENOEXEC;
275 } 291 }
276 292
@@ -291,13 +307,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
291 if (retval) 307 if (retval)
292 return retval; 308 return retval;
293 309
294 regs->cs = __USER32_CS; 310 regs->cs = __USER32_CS;
295 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = 311 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
296 regs->r13 = regs->r14 = regs->r15 = 0; 312 regs->r13 = regs->r14 = regs->r15 = 0;
297 313
298 /* OK, This is the point of no return */ 314 /* OK, This is the point of no return */
299 set_personality(PER_LINUX); 315 set_personality(PER_LINUX);
300 set_thread_flag(TIF_IA32); 316 set_thread_flag(TIF_IA32);
301 clear_thread_flag(TIF_ABI_PENDING); 317 clear_thread_flag(TIF_ABI_PENDING);
302 318
303 current->mm->end_code = ex.a_text + 319 current->mm->end_code = ex.a_text +
@@ -311,7 +327,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
311 327
312 current->mm->mmap = NULL; 328 current->mm->mmap = NULL;
313 compute_creds(bprm); 329 compute_creds(bprm);
314 current->flags &= ~PF_FORKNOEXEC; 330 current->flags &= ~PF_FORKNOEXEC;
315 331
316 if (N_MAGIC(ex) == OMAGIC) { 332 if (N_MAGIC(ex) == OMAGIC) {
317 unsigned long text_addr, map_size; 333 unsigned long text_addr, map_size;
@@ -338,30 +354,31 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
338 send_sig(SIGKILL, current, 0); 354 send_sig(SIGKILL, current, 0);
339 return error; 355 return error;
340 } 356 }
341 357
342 flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); 358 flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
343 } else { 359 } else {
344#ifdef WARN_OLD 360#ifdef WARN_OLD
345 static unsigned long error_time, error_time2; 361 static unsigned long error_time, error_time2;
346 if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && 362 if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
347 (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) 363 (N_MAGIC(ex) != NMAGIC) &&
348 { 364 time_after(jiffies, error_time2 + 5*HZ)) {
349 printk(KERN_NOTICE "executable not page aligned\n"); 365 printk(KERN_NOTICE "executable not page aligned\n");
350 error_time2 = jiffies; 366 error_time2 = jiffies;
351 } 367 }
352 368
353 if ((fd_offset & ~PAGE_MASK) != 0 && 369 if ((fd_offset & ~PAGE_MASK) != 0 &&
354 (jiffies-error_time) > 5*HZ) 370 time_after(jiffies, error_time + 5*HZ)) {
355 { 371 printk(KERN_WARNING
356 printk(KERN_WARNING 372 "fd_offset is not page aligned. Please convert "
357 "fd_offset is not page aligned. Please convert program: %s\n", 373 "program: %s\n",
358 bprm->file->f_path.dentry->d_name.name); 374 bprm->file->f_path.dentry->d_name.name);
359 error_time = jiffies; 375 error_time = jiffies;
360 } 376 }
361#endif 377#endif
362 378
363 if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { 379 if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
364 loff_t pos = fd_offset; 380 loff_t pos = fd_offset;
381
365 down_write(&current->mm->mmap_sem); 382 down_write(&current->mm->mmap_sem);
366 do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); 383 do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
367 up_write(&current->mm->mmap_sem); 384 up_write(&current->mm->mmap_sem);
@@ -376,9 +393,10 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
376 393
377 down_write(&current->mm->mmap_sem); 394 down_write(&current->mm->mmap_sem);
378 error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, 395 error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
379 PROT_READ | PROT_EXEC, 396 PROT_READ | PROT_EXEC,
380 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, 397 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
381 fd_offset); 398 MAP_EXECUTABLE | MAP_32BIT,
399 fd_offset);
382 up_write(&current->mm->mmap_sem); 400 up_write(&current->mm->mmap_sem);
383 401
384 if (error != N_TXTADDR(ex)) { 402 if (error != N_TXTADDR(ex)) {
@@ -387,9 +405,10 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
387 } 405 }
388 406
389 down_write(&current->mm->mmap_sem); 407 down_write(&current->mm->mmap_sem);
390 error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, 408 error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
391 PROT_READ | PROT_WRITE | PROT_EXEC, 409 PROT_READ | PROT_WRITE | PROT_EXEC,
392 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, 410 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
411 MAP_EXECUTABLE | MAP_32BIT,
393 fd_offset + ex.a_text); 412 fd_offset + ex.a_text);
394 up_write(&current->mm->mmap_sem); 413 up_write(&current->mm->mmap_sem);
395 if (error != N_DATADDR(ex)) { 414 if (error != N_DATADDR(ex)) {
@@ -403,9 +422,9 @@ beyond_if:
403 set_brk(current->mm->start_brk, current->mm->brk); 422 set_brk(current->mm->start_brk, current->mm->brk);
404 423
405 retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); 424 retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
406 if (retval < 0) { 425 if (retval < 0) {
407 /* Someone check-me: is this error path enough? */ 426 /* Someone check-me: is this error path enough? */
408 send_sig(SIGKILL, current, 0); 427 send_sig(SIGKILL, current, 0);
409 return retval; 428 return retval;
410 } 429 }
411 430
@@ -414,10 +433,10 @@ beyond_if:
414 /* start thread */ 433 /* start thread */
415 asm volatile("movl %0,%%fs" :: "r" (0)); \ 434 asm volatile("movl %0,%%fs" :: "r" (0)); \
416 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); 435 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
417 load_gs_index(0); 436 load_gs_index(0);
418 (regs)->rip = ex.a_entry; 437 (regs)->ip = ex.a_entry;
419 (regs)->rsp = current->mm->start_stack; 438 (regs)->sp = current->mm->start_stack;
420 (regs)->eflags = 0x200; 439 (regs)->flags = 0x200;
421 (regs)->cs = __USER32_CS; 440 (regs)->cs = __USER32_CS;
422 (regs)->ss = __USER32_DS; 441 (regs)->ss = __USER32_DS;
423 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 442 regs->r8 = regs->r9 = regs->r10 = regs->r11 =
@@ -425,7 +444,7 @@ beyond_if:
425 set_fs(USER_DS); 444 set_fs(USER_DS);
426 if (unlikely(current->ptrace & PT_PTRACED)) { 445 if (unlikely(current->ptrace & PT_PTRACED)) {
427 if (current->ptrace & PT_TRACE_EXEC) 446 if (current->ptrace & PT_TRACE_EXEC)
428 ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); 447 ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
429 else 448 else
430 send_sig(SIGTRAP, current, 0); 449 send_sig(SIGTRAP, current, 0);
431 } 450 }
@@ -434,9 +453,8 @@ beyond_if:
434 453
435static int load_aout_library(struct file *file) 454static int load_aout_library(struct file *file)
436{ 455{
437 struct inode * inode; 456 struct inode *inode;
438 unsigned long bss, start_addr, len; 457 unsigned long bss, start_addr, len, error;
439 unsigned long error;
440 int retval; 458 int retval;
441 struct exec ex; 459 struct exec ex;
442 460
@@ -450,7 +468,8 @@ static int load_aout_library(struct file *file)
450 /* We come in here for the regular a.out style of shared libraries */ 468 /* We come in here for the regular a.out style of shared libraries */
451 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || 469 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
452 N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || 470 N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
453 i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { 471 i_size_read(inode) <
472 ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
454 goto out; 473 goto out;
455 } 474 }
456 475
@@ -467,10 +486,10 @@ static int load_aout_library(struct file *file)
467 486
468#ifdef WARN_OLD 487#ifdef WARN_OLD
469 static unsigned long error_time; 488 static unsigned long error_time;
470 if ((jiffies-error_time) > 5*HZ) 489 if (time_after(jiffies, error_time + 5*HZ)) {
471 { 490 printk(KERN_WARNING
472 printk(KERN_WARNING 491 "N_TXTOFF is not page aligned. Please convert "
473 "N_TXTOFF is not page aligned. Please convert library: %s\n", 492 "library: %s\n",
474 file->f_path.dentry->d_name.name); 493 file->f_path.dentry->d_name.name);
475 error_time = jiffies; 494 error_time = jiffies;
476 } 495 }
@@ -478,11 +497,12 @@ static int load_aout_library(struct file *file)
478 down_write(&current->mm->mmap_sem); 497 down_write(&current->mm->mmap_sem);
479 do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); 498 do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
480 up_write(&current->mm->mmap_sem); 499 up_write(&current->mm->mmap_sem);
481 500
482 file->f_op->read(file, (char __user *)start_addr, 501 file->f_op->read(file, (char __user *)start_addr,
483 ex.a_text + ex.a_data, &pos); 502 ex.a_text + ex.a_data, &pos);
484 flush_icache_range((unsigned long) start_addr, 503 flush_icache_range((unsigned long) start_addr,
485 (unsigned long) start_addr + ex.a_text + ex.a_data); 504 (unsigned long) start_addr + ex.a_text +
505 ex.a_data);
486 506
487 retval = 0; 507 retval = 0;
488 goto out; 508 goto out;
diff --git a/arch/x86/ia32/ia32_binfmt.c b/arch/x86/ia32/ia32_binfmt.c
deleted file mode 100644
index 55822d2cf05..00000000000
--- a/arch/x86/ia32/ia32_binfmt.c
+++ /dev/null
@@ -1,285 +0,0 @@
1/*
2 * Written 2000,2002 by Andi Kleen.
3 *
4 * Loosely based on the sparc64 and IA64 32bit emulation loaders.
5 * This tricks binfmt_elf.c into loading 32bit binaries using lots
6 * of ugly preprocessor tricks. Talk about very very poor man's inheritance.
7 */
8
9#include <linux/types.h>
10#include <linux/stddef.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/compat.h>
14#include <linux/string.h>
15#include <linux/binfmts.h>
16#include <linux/mm.h>
17#include <linux/security.h>
18#include <linux/elfcore-compat.h>
19
20#include <asm/segment.h>
21#include <asm/ptrace.h>
22#include <asm/processor.h>
23#include <asm/user32.h>
24#include <asm/sigcontext32.h>
25#include <asm/fpu32.h>
26#include <asm/i387.h>
27#include <asm/uaccess.h>
28#include <asm/ia32.h>
29#include <asm/vsyscall32.h>
30
31#undef ELF_ARCH
32#undef ELF_CLASS
33#define ELF_CLASS ELFCLASS32
34#define ELF_ARCH EM_386
35
36#undef elfhdr
37#undef elf_phdr
38#undef elf_note
39#undef elf_addr_t
40#define elfhdr elf32_hdr
41#define elf_phdr elf32_phdr
42#define elf_note elf32_note
43#define elf_addr_t Elf32_Off
44
45#define ELF_NAME "elf/i386"
46
47#define AT_SYSINFO 32
48#define AT_SYSINFO_EHDR 33
49
50int sysctl_vsyscall32 = 1;
51
52#undef ARCH_DLINFO
53#define ARCH_DLINFO do { \
54 if (sysctl_vsyscall32) { \
55 current->mm->context.vdso = (void *)VSYSCALL32_BASE; \
56 NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
57 NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \
58 } \
59} while(0)
60
61struct file;
62
63#define IA32_EMULATOR 1
64
65#undef ELF_ET_DYN_BASE
66
67#define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
68
69#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0)
70
71#define _GET_SEG(x) \
72 ({ __u32 seg; asm("movl %%" __stringify(x) ",%0" : "=r"(seg)); seg; })
73
74/* Assumes current==process to be dumped */
75#undef ELF_CORE_COPY_REGS
76#define ELF_CORE_COPY_REGS(pr_reg, regs) \
77 pr_reg[0] = regs->rbx; \
78 pr_reg[1] = regs->rcx; \
79 pr_reg[2] = regs->rdx; \
80 pr_reg[3] = regs->rsi; \
81 pr_reg[4] = regs->rdi; \
82 pr_reg[5] = regs->rbp; \
83 pr_reg[6] = regs->rax; \
84 pr_reg[7] = _GET_SEG(ds); \
85 pr_reg[8] = _GET_SEG(es); \
86 pr_reg[9] = _GET_SEG(fs); \
87 pr_reg[10] = _GET_SEG(gs); \
88 pr_reg[11] = regs->orig_rax; \
89 pr_reg[12] = regs->rip; \
90 pr_reg[13] = regs->cs; \
91 pr_reg[14] = regs->eflags; \
92 pr_reg[15] = regs->rsp; \
93 pr_reg[16] = regs->ss;
94
95
96#define elf_prstatus compat_elf_prstatus
97#define elf_prpsinfo compat_elf_prpsinfo
98#define elf_fpregset_t struct user_i387_ia32_struct
99#define elf_fpxregset_t struct user32_fxsr_struct
100#define user user32
101
102#undef elf_read_implies_exec
103#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X)
104
105#define elf_core_copy_regs elf32_core_copy_regs
106static inline void elf32_core_copy_regs(compat_elf_gregset_t *elfregs,
107 struct pt_regs *regs)
108{
109 ELF_CORE_COPY_REGS((&elfregs->ebx), regs)
110}
111
112#define elf_core_copy_task_regs elf32_core_copy_task_regs
113static inline int elf32_core_copy_task_regs(struct task_struct *t,
114 compat_elf_gregset_t* elfregs)
115{
116 struct pt_regs *pp = task_pt_regs(t);
117 ELF_CORE_COPY_REGS((&elfregs->ebx), pp);
118 /* fix wrong segments */
119 elfregs->ds = t->thread.ds;
120 elfregs->fs = t->thread.fsindex;
121 elfregs->gs = t->thread.gsindex;
122 elfregs->es = t->thread.es;
123 return 1;
124}
125
126#define elf_core_copy_task_fpregs elf32_core_copy_task_fpregs
127static inline int
128elf32_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs,
129 elf_fpregset_t *fpu)
130{
131 struct _fpstate_ia32 *fpstate = (void*)fpu;
132 mm_segment_t oldfs = get_fs();
133
134 if (!tsk_used_math(tsk))
135 return 0;
136 if (!regs)
137 regs = task_pt_regs(tsk);
138 if (tsk == current)
139 unlazy_fpu(tsk);
140 set_fs(KERNEL_DS);
141 save_i387_ia32(tsk, fpstate, regs, 1);
142 /* Correct for i386 bug. It puts the fop into the upper 16bits of
143 the tag word (like FXSAVE), not into the fcs*/
144 fpstate->cssel |= fpstate->tag & 0xffff0000;
145 set_fs(oldfs);
146 return 1;
147}
148
149#define ELF_CORE_COPY_XFPREGS 1
150#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
151#define elf_core_copy_task_xfpregs elf32_core_copy_task_xfpregs
152static inline int
153elf32_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
154{
155 struct pt_regs *regs = task_pt_regs(t);
156 if (!tsk_used_math(t))
157 return 0;
158 if (t == current)
159 unlazy_fpu(t);
160 memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t));
161 xfpu->fcs = regs->cs;
162 xfpu->fos = t->thread.ds; /* right? */
163 return 1;
164}
165
166#undef elf_check_arch
167#define elf_check_arch(x) \
168 ((x)->e_machine == EM_386)
169
170extern int force_personality32;
171
172#undef ELF_EXEC_PAGESIZE
173#undef ELF_HWCAP
174#undef ELF_PLATFORM
175#undef SET_PERSONALITY
176#define ELF_EXEC_PAGESIZE PAGE_SIZE
177#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
178#define ELF_PLATFORM ("i686")
179#define SET_PERSONALITY(ex, ibcs2) \
180do { \
181 unsigned long new_flags = 0; \
182 if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \
183 new_flags = _TIF_IA32; \
184 if ((current_thread_info()->flags & _TIF_IA32) \
185 != new_flags) \
186 set_thread_flag(TIF_ABI_PENDING); \
187 else \
188 clear_thread_flag(TIF_ABI_PENDING); \
189 /* XXX This overwrites the user set personality */ \
190 current->personality |= force_personality32; \
191} while (0)
192
193/* Override some function names */
194#define elf_format elf32_format
195
196#define init_elf_binfmt init_elf32_binfmt
197#define exit_elf_binfmt exit_elf32_binfmt
198
199#define load_elf_binary load_elf32_binary
200
201#undef ELF_PLAT_INIT
202#define ELF_PLAT_INIT(r, load_addr) elf32_init(r)
203
204#undef start_thread
205#define start_thread(regs,new_rip,new_rsp) do { \
206 asm volatile("movl %0,%%fs" :: "r" (0)); \
207 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \
208 load_gs_index(0); \
209 (regs)->rip = (new_rip); \
210 (regs)->rsp = (new_rsp); \
211 (regs)->eflags = 0x200; \
212 (regs)->cs = __USER32_CS; \
213 (regs)->ss = __USER32_DS; \
214 set_fs(USER_DS); \
215} while(0)
216
217
218#include <linux/module.h>
219
220MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries.");
221MODULE_AUTHOR("Eric Youngdale, Andi Kleen");
222
223#undef MODULE_DESCRIPTION
224#undef MODULE_AUTHOR
225
226static void elf32_init(struct pt_regs *);
227
228#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
229#define arch_setup_additional_pages syscall32_setup_pages
230extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
231
232#include "../../../fs/binfmt_elf.c"
233
234static void elf32_init(struct pt_regs *regs)
235{
236 struct task_struct *me = current;
237 regs->rdi = 0;
238 regs->rsi = 0;
239 regs->rdx = 0;
240 regs->rcx = 0;
241 regs->rax = 0;
242 regs->rbx = 0;
243 regs->rbp = 0;
244 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
245 regs->r13 = regs->r14 = regs->r15 = 0;
246 me->thread.fs = 0;
247 me->thread.gs = 0;
248 me->thread.fsindex = 0;
249 me->thread.gsindex = 0;
250 me->thread.ds = __USER_DS;
251 me->thread.es = __USER_DS;
252}
253
254#ifdef CONFIG_SYSCTL
255/* Register vsyscall32 into the ABI table */
256#include <linux/sysctl.h>
257
258static ctl_table abi_table2[] = {
259 {
260 .procname = "vsyscall32",
261 .data = &sysctl_vsyscall32,
262 .maxlen = sizeof(int),
263 .mode = 0644,
264 .proc_handler = proc_dointvec
265 },
266 {}
267};
268
269static ctl_table abi_root_table2[] = {
270 {
271 .ctl_name = CTL_ABI,
272 .procname = "abi",
273 .mode = 0555,
274 .child = abi_table2
275 },
276 {}
277};
278
279static __init int ia32_binfmt_init(void)
280{
281 register_sysctl_table(abi_root_table2);
282 return 0;
283}
284__initcall(ia32_binfmt_init);
285#endif
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 6ea19c25f90..1c0503bdfb1 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -29,9 +29,8 @@
29#include <asm/ia32_unistd.h> 29#include <asm/ia32_unistd.h>
30#include <asm/user32.h> 30#include <asm/user32.h>
31#include <asm/sigcontext32.h> 31#include <asm/sigcontext32.h>
32#include <asm/fpu32.h>
33#include <asm/proto.h> 32#include <asm/proto.h>
34#include <asm/vsyscall32.h> 33#include <asm/vdso.h>
35 34
36#define DEBUG_SIG 0 35#define DEBUG_SIG 0
37 36
@@ -43,7 +42,8 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
43int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) 42int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
44{ 43{
45 int err; 44 int err;
46 if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t))) 45
46 if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
47 return -EFAULT; 47 return -EFAULT;
48 48
49 /* If you change siginfo_t structure, please make sure that 49 /* If you change siginfo_t structure, please make sure that
@@ -53,16 +53,19 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
53 3 ints plus the relevant union member. */ 53 3 ints plus the relevant union member. */
54 err = __put_user(from->si_signo, &to->si_signo); 54 err = __put_user(from->si_signo, &to->si_signo);
55 err |= __put_user(from->si_errno, &to->si_errno); 55 err |= __put_user(from->si_errno, &to->si_errno);
56 err |= __put_user((short)from->si_code, &to->si_code); 56 err |= __put_user((short)from->si_code, &to->si_code);
57 57
58 if (from->si_code < 0) { 58 if (from->si_code < 0) {
59 err |= __put_user(from->si_pid, &to->si_pid); 59 err |= __put_user(from->si_pid, &to->si_pid);
60 err |= __put_user(from->si_uid, &to->si_uid); 60 err |= __put_user(from->si_uid, &to->si_uid);
61 err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); 61 err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr);
62 } else { 62 } else {
63 /* First 32bits of unions are always present: 63 /*
64 * si_pid === si_band === si_tid === si_addr(LS half) */ 64 * First 32bits of unions are always present:
65 err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]); 65 * si_pid === si_band === si_tid === si_addr(LS half)
66 */
67 err |= __put_user(from->_sifields._pad[0],
68 &to->_sifields._pad[0]);
66 switch (from->si_code >> 16) { 69 switch (from->si_code >> 16) {
67 case __SI_FAULT >> 16: 70 case __SI_FAULT >> 16:
68 break; 71 break;
@@ -76,14 +79,15 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
76 err |= __put_user(from->si_uid, &to->si_uid); 79 err |= __put_user(from->si_uid, &to->si_uid);
77 break; 80 break;
78 case __SI_POLL >> 16: 81 case __SI_POLL >> 16:
79 err |= __put_user(from->si_fd, &to->si_fd); 82 err |= __put_user(from->si_fd, &to->si_fd);
80 break; 83 break;
81 case __SI_TIMER >> 16: 84 case __SI_TIMER >> 16:
82 err |= __put_user(from->si_overrun, &to->si_overrun); 85 err |= __put_user(from->si_overrun, &to->si_overrun);
83 err |= __put_user(ptr_to_compat(from->si_ptr), 86 err |= __put_user(ptr_to_compat(from->si_ptr),
84 &to->si_ptr); 87 &to->si_ptr);
85 break; 88 break;
86 case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ 89 /* This is not generated by the kernel as of now. */
90 case __SI_RT >> 16:
87 case __SI_MESGQ >> 16: 91 case __SI_MESGQ >> 16:
88 err |= __put_user(from->si_uid, &to->si_uid); 92 err |= __put_user(from->si_uid, &to->si_uid);
89 err |= __put_user(from->si_int, &to->si_int); 93 err |= __put_user(from->si_int, &to->si_int);
@@ -97,7 +101,8 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
97{ 101{
98 int err; 102 int err;
99 u32 ptr32; 103 u32 ptr32;
100 if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t))) 104
105 if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
101 return -EFAULT; 106 return -EFAULT;
102 107
103 err = __get_user(to->si_signo, &from->si_signo); 108 err = __get_user(to->si_signo, &from->si_signo);
@@ -112,8 +117,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
112 return err; 117 return err;
113} 118}
114 119
115asmlinkage long 120asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
116sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
117{ 121{
118 mask &= _BLOCKABLE; 122 mask &= _BLOCKABLE;
119 spin_lock_irq(&current->sighand->siglock); 123 spin_lock_irq(&current->sighand->siglock);
@@ -128,36 +132,37 @@ sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
128 return -ERESTARTNOHAND; 132 return -ERESTARTNOHAND;
129} 133}
130 134
131asmlinkage long 135asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
132sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, 136 stack_ia32_t __user *uoss_ptr,
133 stack_ia32_t __user *uoss_ptr, 137 struct pt_regs *regs)
134 struct pt_regs *regs)
135{ 138{
136 stack_t uss,uoss; 139 stack_t uss, uoss;
137 int ret; 140 int ret;
138 mm_segment_t seg; 141 mm_segment_t seg;
139 if (uss_ptr) { 142
143 if (uss_ptr) {
140 u32 ptr; 144 u32 ptr;
141 memset(&uss,0,sizeof(stack_t)); 145
142 if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) || 146 memset(&uss, 0, sizeof(stack_t));
147 if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)) ||
143 __get_user(ptr, &uss_ptr->ss_sp) || 148 __get_user(ptr, &uss_ptr->ss_sp) ||
144 __get_user(uss.ss_flags, &uss_ptr->ss_flags) || 149 __get_user(uss.ss_flags, &uss_ptr->ss_flags) ||
145 __get_user(uss.ss_size, &uss_ptr->ss_size)) 150 __get_user(uss.ss_size, &uss_ptr->ss_size))
146 return -EFAULT; 151 return -EFAULT;
147 uss.ss_sp = compat_ptr(ptr); 152 uss.ss_sp = compat_ptr(ptr);
148 } 153 }
149 seg = get_fs(); 154 seg = get_fs();
150 set_fs(KERNEL_DS); 155 set_fs(KERNEL_DS);
151 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp); 156 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp);
152 set_fs(seg); 157 set_fs(seg);
153 if (ret >= 0 && uoss_ptr) { 158 if (ret >= 0 && uoss_ptr) {
154 if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) || 159 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)) ||
155 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || 160 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
156 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || 161 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
157 __put_user(uoss.ss_size, &uoss_ptr->ss_size)) 162 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
158 ret = -EFAULT; 163 ret = -EFAULT;
159 } 164 }
160 return ret; 165 return ret;
161} 166}
162 167
163/* 168/*
@@ -186,87 +191,85 @@ struct rt_sigframe
186 char retcode[8]; 191 char retcode[8];
187}; 192};
188 193
189static int 194#define COPY(x) { \
190ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax) 195 unsigned int reg; \
196 err |= __get_user(reg, &sc->x); \
197 regs->x = reg; \
198}
199
200#define RELOAD_SEG(seg,mask) \
201 { unsigned int cur; \
202 unsigned short pre; \
203 err |= __get_user(pre, &sc->seg); \
204 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \
205 pre |= mask; \
206 if (pre != cur) loadsegment(seg, pre); }
207
208static int ia32_restore_sigcontext(struct pt_regs *regs,
209 struct sigcontext_ia32 __user *sc,
210 unsigned int *peax)
191{ 211{
192 unsigned int err = 0; 212 unsigned int tmpflags, gs, oldgs, err = 0;
193 213 struct _fpstate_ia32 __user *buf;
214 u32 tmp;
215
194 /* Always make any pending restarted system calls return -EINTR */ 216 /* Always make any pending restarted system calls return -EINTR */
195 current_thread_info()->restart_block.fn = do_no_restart_syscall; 217 current_thread_info()->restart_block.fn = do_no_restart_syscall;
196 218
197#if DEBUG_SIG 219#if DEBUG_SIG
198 printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", 220 printk(KERN_DEBUG "SIG restore_sigcontext: "
199 sc, sc->err, sc->eip, sc->cs, sc->eflags); 221 "sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n",
222 sc, sc->err, sc->ip, sc->cs, sc->flags);
200#endif 223#endif
201#define COPY(x) { \
202 unsigned int reg; \
203 err |= __get_user(reg, &sc->e ##x); \
204 regs->r ## x = reg; \
205}
206 224
207#define RELOAD_SEG(seg,mask) \ 225 /*
208 { unsigned int cur; \ 226 * Reload fs and gs if they have changed in the signal
209 unsigned short pre; \ 227 * handler. This does not handle long fs/gs base changes in
210 err |= __get_user(pre, &sc->seg); \ 228 * the handler, but does not clobber them at least in the
211 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ 229 * normal case.
212 pre |= mask; \ 230 */
213 if (pre != cur) loadsegment(seg,pre); } 231 err |= __get_user(gs, &sc->gs);
214 232 gs |= 3;
215 /* Reload fs and gs if they have changed in the signal handler. 233 asm("movl %%gs,%0" : "=r" (oldgs));
216 This does not handle long fs/gs base changes in the handler, but 234 if (gs != oldgs)
217 does not clobber them at least in the normal case. */ 235 load_gs_index(gs);
218 236
219 { 237 RELOAD_SEG(fs, 3);
220 unsigned gs, oldgs; 238 RELOAD_SEG(ds, 3);
221 err |= __get_user(gs, &sc->gs); 239 RELOAD_SEG(es, 3);
222 gs |= 3;
223 asm("movl %%gs,%0" : "=r" (oldgs));
224 if (gs != oldgs)
225 load_gs_index(gs);
226 }
227 RELOAD_SEG(fs,3);
228 RELOAD_SEG(ds,3);
229 RELOAD_SEG(es,3);
230 240
231 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 241 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
232 COPY(dx); COPY(cx); COPY(ip); 242 COPY(dx); COPY(cx); COPY(ip);
233 /* Don't touch extended registers */ 243 /* Don't touch extended registers */
234 244
235 err |= __get_user(regs->cs, &sc->cs); 245 err |= __get_user(regs->cs, &sc->cs);
236 regs->cs |= 3; 246 regs->cs |= 3;
237 err |= __get_user(regs->ss, &sc->ss); 247 err |= __get_user(regs->ss, &sc->ss);
238 regs->ss |= 3; 248 regs->ss |= 3;
239 249
240 { 250 err |= __get_user(tmpflags, &sc->flags);
241 unsigned int tmpflags; 251 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
242 err |= __get_user(tmpflags, &sc->eflags); 252 /* disable syscall checks */
243 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); 253 regs->orig_ax = -1;
244 regs->orig_rax = -1; /* disable syscall checks */ 254
245 } 255 err |= __get_user(tmp, &sc->fpstate);
256 buf = compat_ptr(tmp);
257 if (buf) {
258 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
259 goto badframe;
260 err |= restore_i387_ia32(buf);
261 } else {
262 struct task_struct *me = current;
246 263
247 { 264 if (used_math()) {
248 u32 tmp; 265 clear_fpu(me);
249 struct _fpstate_ia32 __user * buf; 266 clear_used_math();
250 err |= __get_user(tmp, &sc->fpstate);
251 buf = compat_ptr(tmp);
252 if (buf) {
253 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
254 goto badframe;
255 err |= restore_i387_ia32(current, buf, 0);
256 } else {
257 struct task_struct *me = current;
258 if (used_math()) {
259 clear_fpu(me);
260 clear_used_math();
261 }
262 } 267 }
263 } 268 }
264 269
265 { 270 err |= __get_user(tmp, &sc->ax);
266 u32 tmp; 271 *peax = tmp;
267 err |= __get_user(tmp, &sc->eax); 272
268 *peax = tmp;
269 }
270 return err; 273 return err;
271 274
272badframe: 275badframe:
@@ -275,15 +278,16 @@ badframe:
275 278
276asmlinkage long sys32_sigreturn(struct pt_regs *regs) 279asmlinkage long sys32_sigreturn(struct pt_regs *regs)
277{ 280{
278 struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8); 281 struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8);
279 sigset_t set; 282 sigset_t set;
280 unsigned int eax; 283 unsigned int ax;
281 284
282 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 285 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
283 goto badframe; 286 goto badframe;
284 if (__get_user(set.sig[0], &frame->sc.oldmask) 287 if (__get_user(set.sig[0], &frame->sc.oldmask)
285 || (_COMPAT_NSIG_WORDS > 1 288 || (_COMPAT_NSIG_WORDS > 1
286 && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask, 289 && __copy_from_user((((char *) &set.sig) + 4),
290 &frame->extramask,
287 sizeof(frame->extramask)))) 291 sizeof(frame->extramask))))
288 goto badframe; 292 goto badframe;
289 293
@@ -292,24 +296,24 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs)
292 current->blocked = set; 296 current->blocked = set;
293 recalc_sigpending(); 297 recalc_sigpending();
294 spin_unlock_irq(&current->sighand->siglock); 298 spin_unlock_irq(&current->sighand->siglock);
295 299
296 if (ia32_restore_sigcontext(regs, &frame->sc, &eax)) 300 if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
297 goto badframe; 301 goto badframe;
298 return eax; 302 return ax;
299 303
300badframe: 304badframe:
301 signal_fault(regs, frame, "32bit sigreturn"); 305 signal_fault(regs, frame, "32bit sigreturn");
302 return 0; 306 return 0;
303} 307}
304 308
305asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) 309asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
306{ 310{
307 struct rt_sigframe __user *frame; 311 struct rt_sigframe __user *frame;
308 sigset_t set; 312 sigset_t set;
309 unsigned int eax; 313 unsigned int ax;
310 struct pt_regs tregs; 314 struct pt_regs tregs;
311 315
312 frame = (struct rt_sigframe __user *)(regs->rsp - 4); 316 frame = (struct rt_sigframe __user *)(regs->sp - 4);
313 317
314 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 318 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
315 goto badframe; 319 goto badframe;
@@ -321,28 +325,28 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
321 current->blocked = set; 325 current->blocked = set;
322 recalc_sigpending(); 326 recalc_sigpending();
323 spin_unlock_irq(&current->sighand->siglock); 327 spin_unlock_irq(&current->sighand->siglock);
324 328
325 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) 329 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
326 goto badframe; 330 goto badframe;
327 331
328 tregs = *regs; 332 tregs = *regs;
329 if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) 333 if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
330 goto badframe; 334 goto badframe;
331 335
332 return eax; 336 return ax;
333 337
334badframe: 338badframe:
335 signal_fault(regs,frame,"32bit rt sigreturn"); 339 signal_fault(regs, frame, "32bit rt sigreturn");
336 return 0; 340 return 0;
337} 341}
338 342
339/* 343/*
340 * Set up a signal frame. 344 * Set up a signal frame.
341 */ 345 */
342 346
343static int 347static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
344ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate, 348 struct _fpstate_ia32 __user *fpstate,
345 struct pt_regs *regs, unsigned int mask) 349 struct pt_regs *regs, unsigned int mask)
346{ 350{
347 int tmp, err = 0; 351 int tmp, err = 0;
348 352
@@ -356,26 +360,26 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __
356 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); 360 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
357 err |= __put_user(tmp, (unsigned int __user *)&sc->es); 361 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
358 362
359 err |= __put_user((u32)regs->rdi, &sc->edi); 363 err |= __put_user((u32)regs->di, &sc->di);
360 err |= __put_user((u32)regs->rsi, &sc->esi); 364 err |= __put_user((u32)regs->si, &sc->si);
361 err |= __put_user((u32)regs->rbp, &sc->ebp); 365 err |= __put_user((u32)regs->bp, &sc->bp);
362 err |= __put_user((u32)regs->rsp, &sc->esp); 366 err |= __put_user((u32)regs->sp, &sc->sp);
363 err |= __put_user((u32)regs->rbx, &sc->ebx); 367 err |= __put_user((u32)regs->bx, &sc->bx);
364 err |= __put_user((u32)regs->rdx, &sc->edx); 368 err |= __put_user((u32)regs->dx, &sc->dx);
365 err |= __put_user((u32)regs->rcx, &sc->ecx); 369 err |= __put_user((u32)regs->cx, &sc->cx);
366 err |= __put_user((u32)regs->rax, &sc->eax); 370 err |= __put_user((u32)regs->ax, &sc->ax);
367 err |= __put_user((u32)regs->cs, &sc->cs); 371 err |= __put_user((u32)regs->cs, &sc->cs);
368 err |= __put_user((u32)regs->ss, &sc->ss); 372 err |= __put_user((u32)regs->ss, &sc->ss);
369 err |= __put_user(current->thread.trap_no, &sc->trapno); 373 err |= __put_user(current->thread.trap_no, &sc->trapno);
370 err |= __put_user(current->thread.error_code, &sc->err); 374 err |= __put_user(current->thread.error_code, &sc->err);
371 err |= __put_user((u32)regs->rip, &sc->eip); 375 err |= __put_user((u32)regs->ip, &sc->ip);
372 err |= __put_user((u32)regs->eflags, &sc->eflags); 376 err |= __put_user((u32)regs->flags, &sc->flags);
373 err |= __put_user((u32)regs->rsp, &sc->esp_at_signal); 377 err |= __put_user((u32)regs->sp, &sc->sp_at_signal);
374 378
375 tmp = save_i387_ia32(current, fpstate, regs, 0); 379 tmp = save_i387_ia32(fpstate);
376 if (tmp < 0) 380 if (tmp < 0)
377 err = -EFAULT; 381 err = -EFAULT;
378 else { 382 else {
379 clear_used_math(); 383 clear_used_math();
380 stts(); 384 stts();
381 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), 385 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
@@ -392,40 +396,53 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __
392/* 396/*
393 * Determine which stack to use.. 397 * Determine which stack to use..
394 */ 398 */
395static void __user * 399static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
396get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) 400 size_t frame_size)
397{ 401{
398 unsigned long rsp; 402 unsigned long sp;
399 403
400 /* Default to using normal stack */ 404 /* Default to using normal stack */
401 rsp = regs->rsp; 405 sp = regs->sp;
402 406
403 /* This is the X/Open sanctioned signal stack switching. */ 407 /* This is the X/Open sanctioned signal stack switching. */
404 if (ka->sa.sa_flags & SA_ONSTACK) { 408 if (ka->sa.sa_flags & SA_ONSTACK) {
405 if (sas_ss_flags(rsp) == 0) 409 if (sas_ss_flags(sp) == 0)
406 rsp = current->sas_ss_sp + current->sas_ss_size; 410 sp = current->sas_ss_sp + current->sas_ss_size;
407 } 411 }
408 412
409 /* This is the legacy signal stack switching. */ 413 /* This is the legacy signal stack switching. */
410 else if ((regs->ss & 0xffff) != __USER_DS && 414 else if ((regs->ss & 0xffff) != __USER_DS &&
411 !(ka->sa.sa_flags & SA_RESTORER) && 415 !(ka->sa.sa_flags & SA_RESTORER) &&
412 ka->sa.sa_restorer) { 416 ka->sa.sa_restorer)
413 rsp = (unsigned long) ka->sa.sa_restorer; 417 sp = (unsigned long) ka->sa.sa_restorer;
414 }
415 418
416 rsp -= frame_size; 419 sp -= frame_size;
417 /* Align the stack pointer according to the i386 ABI, 420 /* Align the stack pointer according to the i386 ABI,
418 * i.e. so that on function entry ((sp + 4) & 15) == 0. */ 421 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
419 rsp = ((rsp + 4) & -16ul) - 4; 422 sp = ((sp + 4) & -16ul) - 4;
420 return (void __user *) rsp; 423 return (void __user *) sp;
421} 424}
422 425
423int ia32_setup_frame(int sig, struct k_sigaction *ka, 426int ia32_setup_frame(int sig, struct k_sigaction *ka,
424 compat_sigset_t *set, struct pt_regs * regs) 427 compat_sigset_t *set, struct pt_regs *regs)
425{ 428{
426 struct sigframe __user *frame; 429 struct sigframe __user *frame;
430 void __user *restorer;
427 int err = 0; 431 int err = 0;
428 432
433 /* copy_to_user optimizes that into a single 8 byte store */
434 static const struct {
435 u16 poplmovl;
436 u32 val;
437 u16 int80;
438 u16 pad;
439 } __attribute__((packed)) code = {
440 0xb858, /* popl %eax ; movl $...,%eax */
441 __NR_ia32_sigreturn,
442 0x80cd, /* int $0x80 */
443 0,
444 };
445
429 frame = get_sigframe(ka, regs, sizeof(*frame)); 446 frame = get_sigframe(ka, regs, sizeof(*frame));
430 447
431 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 448 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
@@ -443,64 +460,53 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
443 if (_COMPAT_NSIG_WORDS > 1) { 460 if (_COMPAT_NSIG_WORDS > 1) {
444 err |= __copy_to_user(frame->extramask, &set->sig[1], 461 err |= __copy_to_user(frame->extramask, &set->sig[1],
445 sizeof(frame->extramask)); 462 sizeof(frame->extramask));
463 if (err)
464 goto give_sigsegv;
446 } 465 }
447 if (err)
448 goto give_sigsegv;
449 466
450 /* Return stub is in 32bit vsyscall page */ 467 if (ka->sa.sa_flags & SA_RESTORER) {
451 { 468 restorer = ka->sa.sa_restorer;
452 void __user *restorer; 469 } else {
470 /* Return stub is in 32bit vsyscall page */
453 if (current->binfmt->hasvdso) 471 if (current->binfmt->hasvdso)
454 restorer = VSYSCALL32_SIGRETURN; 472 restorer = VDSO32_SYMBOL(current->mm->context.vdso,
473 sigreturn);
455 else 474 else
456 restorer = (void *)&frame->retcode; 475 restorer = &frame->retcode;
457 if (ka->sa.sa_flags & SA_RESTORER)
458 restorer = ka->sa.sa_restorer;
459 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
460 }
461 /* These are actually not used anymore, but left because some
462 gdb versions depend on them as a marker. */
463 {
464 /* copy_to_user optimizes that into a single 8 byte store */
465 static const struct {
466 u16 poplmovl;
467 u32 val;
468 u16 int80;
469 u16 pad;
470 } __attribute__((packed)) code = {
471 0xb858, /* popl %eax ; movl $...,%eax */
472 __NR_ia32_sigreturn,
473 0x80cd, /* int $0x80 */
474 0,
475 };
476 err |= __copy_to_user(frame->retcode, &code, 8);
477 } 476 }
477 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
478
479 /*
480 * These are actually not used anymore, but left because some
481 * gdb versions depend on them as a marker.
482 */
483 err |= __copy_to_user(frame->retcode, &code, 8);
478 if (err) 484 if (err)
479 goto give_sigsegv; 485 goto give_sigsegv;
480 486
481 /* Set up registers for signal handler */ 487 /* Set up registers for signal handler */
482 regs->rsp = (unsigned long) frame; 488 regs->sp = (unsigned long) frame;
483 regs->rip = (unsigned long) ka->sa.sa_handler; 489 regs->ip = (unsigned long) ka->sa.sa_handler;
484 490
485 /* Make -mregparm=3 work */ 491 /* Make -mregparm=3 work */
486 regs->rax = sig; 492 regs->ax = sig;
487 regs->rdx = 0; 493 regs->dx = 0;
488 regs->rcx = 0; 494 regs->cx = 0;
489 495
490 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 496 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
491 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 497 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
492 498
493 regs->cs = __USER32_CS; 499 regs->cs = __USER32_CS;
494 regs->ss = __USER32_DS; 500 regs->ss = __USER32_DS;
495 501
496 set_fs(USER_DS); 502 set_fs(USER_DS);
497 regs->eflags &= ~TF_MASK; 503 regs->flags &= ~X86_EFLAGS_TF;
498 if (test_thread_flag(TIF_SINGLESTEP)) 504 if (test_thread_flag(TIF_SINGLESTEP))
499 ptrace_notify(SIGTRAP); 505 ptrace_notify(SIGTRAP);
500 506
501#if DEBUG_SIG 507#if DEBUG_SIG
502 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", 508 printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
503 current->comm, current->pid, frame, regs->rip, frame->pretcode); 509 current->comm, current->pid, frame, regs->ip, frame->pretcode);
504#endif 510#endif
505 511
506 return 0; 512 return 0;
@@ -511,25 +517,34 @@ give_sigsegv:
511} 517}
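A standalone little-endian sketch (not part of the patch) of what the packed retcode struct in ia32_setup_frame() lays out in memory: popl %eax; movl $__NR_ia32_sigreturn,%eax; int $0x80, kept on the signal frame only as a marker for old gdb versions while the real return now goes through the vDSO sigreturn stub. The value 119 for __NR_ia32_sigreturn is an assumption taken from the i386 syscall table.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const struct {
		uint16_t poplmovl;
		uint32_t val;
		uint16_t int80;
		uint16_t pad;
	} __attribute__((packed)) code = {
		0xb858,		/* popl %eax ; movl $...,%eax */
		119,		/* assumed value of __NR_ia32_sigreturn */
		0x80cd,		/* int $0x80 */
		0,
	};
	const unsigned char *p = (const unsigned char *)&code;
	int i;

	for (i = 0; i < 8; i++)		/* only the first 8 bytes reach the frame */
		printf("%02x ", p[i]);
	printf("\n");			/* prints: 58 b8 77 00 00 00 cd 80 */
	return 0;
}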
512 518
513int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 519int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
514 compat_sigset_t *set, struct pt_regs * regs) 520 compat_sigset_t *set, struct pt_regs *regs)
515{ 521{
516 struct rt_sigframe __user *frame; 522 struct rt_sigframe __user *frame;
523 struct exec_domain *ed = current_thread_info()->exec_domain;
524 void __user *restorer;
517 int err = 0; 525 int err = 0;
518 526
527 /* __copy_to_user optimizes that into a single 8 byte store */
528 static const struct {
529 u8 movl;
530 u32 val;
531 u16 int80;
532 u16 pad;
533 u8 pad2;
534 } __attribute__((packed)) code = {
535 0xb8,
536 __NR_ia32_rt_sigreturn,
537 0x80cd,
538 0,
539 };
540
519 frame = get_sigframe(ka, regs, sizeof(*frame)); 541 frame = get_sigframe(ka, regs, sizeof(*frame));
520 542
521 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 543 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
522 goto give_sigsegv; 544 goto give_sigsegv;
523 545
524 { 546 err |= __put_user((ed && ed->signal_invmap && sig < 32
525 struct exec_domain *ed = current_thread_info()->exec_domain; 547 ? ed->signal_invmap[sig] : sig), &frame->sig);
526 err |= __put_user((ed
527 && ed->signal_invmap
528 && sig < 32
529 ? ed->signal_invmap[sig]
530 : sig),
531 &frame->sig);
532 }
533 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 548 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
534 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 549 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
535 err |= copy_siginfo_to_user32(&frame->info, info); 550 err |= copy_siginfo_to_user32(&frame->info, info);
@@ -540,73 +555,58 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
540 err |= __put_user(0, &frame->uc.uc_flags); 555 err |= __put_user(0, &frame->uc.uc_flags);
541 err |= __put_user(0, &frame->uc.uc_link); 556 err |= __put_user(0, &frame->uc.uc_link);
542 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 557 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
543 err |= __put_user(sas_ss_flags(regs->rsp), 558 err |= __put_user(sas_ss_flags(regs->sp),
544 &frame->uc.uc_stack.ss_flags); 559 &frame->uc.uc_stack.ss_flags);
545 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 560 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
546 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 561 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
547 regs, set->sig[0]); 562 regs, set->sig[0]);
548 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 563 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
549 if (err) 564 if (err)
550 goto give_sigsegv; 565 goto give_sigsegv;
551 566
552 567 if (ka->sa.sa_flags & SA_RESTORER)
553 { 568 restorer = ka->sa.sa_restorer;
554 void __user *restorer = VSYSCALL32_RTSIGRETURN; 569 else
555 if (ka->sa.sa_flags & SA_RESTORER) 570 restorer = VDSO32_SYMBOL(current->mm->context.vdso,
556 restorer = ka->sa.sa_restorer; 571 rt_sigreturn);
557 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); 572 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
558 } 573
559 574 /*
560 /* This is movl $,%eax ; int $0x80 */ 575 * Not actually used anymore, but left because some gdb
561 /* Not actually used anymore, but left because some gdb versions 576 * versions need it.
562 need it. */ 577 */
563 { 578 err |= __copy_to_user(frame->retcode, &code, 8);
564 /* __copy_to_user optimizes that into a single 8 byte store */
565 static const struct {
566 u8 movl;
567 u32 val;
568 u16 int80;
569 u16 pad;
570 u8 pad2;
571 } __attribute__((packed)) code = {
572 0xb8,
573 __NR_ia32_rt_sigreturn,
574 0x80cd,
575 0,
576 };
577 err |= __copy_to_user(frame->retcode, &code, 8);
578 }
579 if (err) 579 if (err)
580 goto give_sigsegv; 580 goto give_sigsegv;
581 581
582 /* Set up registers for signal handler */ 582 /* Set up registers for signal handler */
583 regs->rsp = (unsigned long) frame; 583 regs->sp = (unsigned long) frame;
584 regs->rip = (unsigned long) ka->sa.sa_handler; 584 regs->ip = (unsigned long) ka->sa.sa_handler;
585 585
586 /* Make -mregparm=3 work */ 586 /* Make -mregparm=3 work */
587 regs->rax = sig; 587 regs->ax = sig;
588 regs->rdx = (unsigned long) &frame->info; 588 regs->dx = (unsigned long) &frame->info;
589 regs->rcx = (unsigned long) &frame->uc; 589 regs->cx = (unsigned long) &frame->uc;
590 590
591 /* Make -mregparm=3 work */ 591 /* Make -mregparm=3 work */
592 regs->rax = sig; 592 regs->ax = sig;
593 regs->rdx = (unsigned long) &frame->info; 593 regs->dx = (unsigned long) &frame->info;
594 regs->rcx = (unsigned long) &frame->uc; 594 regs->cx = (unsigned long) &frame->uc;
595
596 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
597 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
595 598
596 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 599 regs->cs = __USER32_CS;
597 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 600 regs->ss = __USER32_DS;
598
599 regs->cs = __USER32_CS;
600 regs->ss = __USER32_DS;
601 601
602 set_fs(USER_DS); 602 set_fs(USER_DS);
603 regs->eflags &= ~TF_MASK; 603 regs->flags &= ~X86_EFLAGS_TF;
604 if (test_thread_flag(TIF_SINGLESTEP)) 604 if (test_thread_flag(TIF_SINGLESTEP))
605 ptrace_notify(SIGTRAP); 605 ptrace_notify(SIGTRAP);
606 606
607#if DEBUG_SIG 607#if DEBUG_SIG
608 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", 608 printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
609 current->comm, current->pid, frame, regs->rip, frame->pretcode); 609 current->comm, current->pid, frame, regs->ip, frame->pretcode);
610#endif 610#endif
611 611
612 return 0; 612 return 0;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index df588f0f76e..8022d3c695c 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -12,7 +12,6 @@
12#include <asm/ia32_unistd.h> 12#include <asm/ia32_unistd.h>
13#include <asm/thread_info.h> 13#include <asm/thread_info.h>
14#include <asm/segment.h> 14#include <asm/segment.h>
15#include <asm/vsyscall32.h>
16#include <asm/irqflags.h> 15#include <asm/irqflags.h>
17#include <linux/linkage.h> 16#include <linux/linkage.h>
18 17
@@ -104,7 +103,7 @@ ENTRY(ia32_sysenter_target)
104 pushfq 103 pushfq
105 CFI_ADJUST_CFA_OFFSET 8 104 CFI_ADJUST_CFA_OFFSET 8
106 /*CFI_REL_OFFSET rflags,0*/ 105 /*CFI_REL_OFFSET rflags,0*/
107 movl $VSYSCALL32_SYSEXIT, %r10d 106 movl 8*3-THREAD_SIZE+threadinfo_sysenter_return(%rsp), %r10d
108 CFI_REGISTER rip,r10 107 CFI_REGISTER rip,r10
109 pushq $__USER32_CS 108 pushq $__USER32_CS
110 CFI_ADJUST_CFA_OFFSET 8 109 CFI_ADJUST_CFA_OFFSET 8
@@ -142,6 +141,8 @@ sysenter_do_call:
142 andl $~TS_COMPAT,threadinfo_status(%r10) 141 andl $~TS_COMPAT,threadinfo_status(%r10)
143 /* clear IF, that popfq doesn't enable interrupts early */ 142 /* clear IF, that popfq doesn't enable interrupts early */
144 andl $~0x200,EFLAGS-R11(%rsp) 143 andl $~0x200,EFLAGS-R11(%rsp)
144 movl RIP-R11(%rsp),%edx /* User %eip */
145 CFI_REGISTER rip,rdx
145 RESTORE_ARGS 1,24,1,1,1,1 146 RESTORE_ARGS 1,24,1,1,1,1
146 popfq 147 popfq
147 CFI_ADJUST_CFA_OFFSET -8 148 CFI_ADJUST_CFA_OFFSET -8
@@ -149,8 +150,6 @@ sysenter_do_call:
149 popq %rcx /* User %esp */ 150 popq %rcx /* User %esp */
150 CFI_ADJUST_CFA_OFFSET -8 151 CFI_ADJUST_CFA_OFFSET -8
151 CFI_REGISTER rsp,rcx 152 CFI_REGISTER rsp,rcx
152 movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */
153 CFI_REGISTER rip,rdx
154 TRACE_IRQS_ON 153 TRACE_IRQS_ON
155 swapgs 154 swapgs
156 sti /* sti only takes effect after the next instruction */ 155 sti /* sti only takes effect after the next instruction */
@@ -644,8 +643,8 @@ ia32_sys_call_table:
644 .quad compat_sys_futex /* 240 */ 643 .quad compat_sys_futex /* 240 */
645 .quad compat_sys_sched_setaffinity 644 .quad compat_sys_sched_setaffinity
646 .quad compat_sys_sched_getaffinity 645 .quad compat_sys_sched_getaffinity
647 .quad sys32_set_thread_area 646 .quad sys_set_thread_area
648 .quad sys32_get_thread_area 647 .quad sys_get_thread_area
649 .quad compat_sys_io_setup /* 245 */ 648 .quad compat_sys_io_setup /* 245 */
650 .quad sys_io_destroy 649 .quad sys_io_destroy
651 .quad compat_sys_io_getevents 650 .quad compat_sys_io_getevents
@@ -723,7 +722,9 @@ ia32_sys_call_table:
723 .quad sys_epoll_pwait 722 .quad sys_epoll_pwait
724 .quad compat_sys_utimensat /* 320 */ 723 .quad compat_sys_utimensat /* 320 */
725 .quad compat_sys_signalfd 724 .quad compat_sys_signalfd
726 .quad compat_sys_timerfd 725 .quad sys_timerfd_create
727 .quad sys_eventfd 726 .quad sys_eventfd
728 .quad sys32_fallocate 727 .quad sys32_fallocate
728 .quad compat_sys_timerfd_settime /* 325 */
729 .quad compat_sys_timerfd_gettime
729ia32_syscall_end: 730ia32_syscall_end:
diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c
index 7b3342e5aab..d21991ce606 100644
--- a/arch/x86/ia32/ipc32.c
+++ b/arch/x86/ia32/ipc32.c
@@ -9,9 +9,8 @@
9#include <linux/ipc.h> 9#include <linux/ipc.h>
10#include <linux/compat.h> 10#include <linux/compat.h>
11 11
12asmlinkage long 12asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
13sys32_ipc(u32 call, int first, int second, int third, 13 compat_uptr_t ptr, u32 fifth)
14 compat_uptr_t ptr, u32 fifth)
15{ 14{
16 int version; 15 int version;
17 16
@@ -19,36 +18,35 @@ sys32_ipc(u32 call, int first, int second, int third,
19 call &= 0xffff; 18 call &= 0xffff;
20 19
21 switch (call) { 20 switch (call) {
22 case SEMOP: 21 case SEMOP:
23 /* struct sembuf is the same on 32 and 64bit :)) */ 22 /* struct sembuf is the same on 32 and 64bit :)) */
24 return sys_semtimedop(first, compat_ptr(ptr), second, NULL); 23 return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
25 case SEMTIMEDOP: 24 case SEMTIMEDOP:
26 return compat_sys_semtimedop(first, compat_ptr(ptr), second, 25 return compat_sys_semtimedop(first, compat_ptr(ptr), second,
27 compat_ptr(fifth)); 26 compat_ptr(fifth));
28 case SEMGET: 27 case SEMGET:
29 return sys_semget(first, second, third); 28 return sys_semget(first, second, third);
30 case SEMCTL: 29 case SEMCTL:
31 return compat_sys_semctl(first, second, third, compat_ptr(ptr)); 30 return compat_sys_semctl(first, second, third, compat_ptr(ptr));
32 31
33 case MSGSND: 32 case MSGSND:
34 return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); 33 return compat_sys_msgsnd(first, second, third, compat_ptr(ptr));
35 case MSGRCV: 34 case MSGRCV:
36 return compat_sys_msgrcv(first, second, fifth, third, 35 return compat_sys_msgrcv(first, second, fifth, third,
37 version, compat_ptr(ptr)); 36 version, compat_ptr(ptr));
38 case MSGGET: 37 case MSGGET:
39 return sys_msgget((key_t) first, second); 38 return sys_msgget((key_t) first, second);
40 case MSGCTL: 39 case MSGCTL:
41 return compat_sys_msgctl(first, second, compat_ptr(ptr)); 40 return compat_sys_msgctl(first, second, compat_ptr(ptr));
42 41
43 case SHMAT: 42 case SHMAT:
44 return compat_sys_shmat(first, second, third, version, 43 return compat_sys_shmat(first, second, third, version,
45 compat_ptr(ptr)); 44 compat_ptr(ptr));
46 break; 45 case SHMDT:
47 case SHMDT:
48 return sys_shmdt(compat_ptr(ptr)); 46 return sys_shmdt(compat_ptr(ptr));
49 case SHMGET: 47 case SHMGET:
50 return sys_shmget(first, (unsigned)second, third); 48 return sys_shmget(first, (unsigned)second, third);
51 case SHMCTL: 49 case SHMCTL:
52 return compat_sys_shmctl(first, second, compat_ptr(ptr)); 50 return compat_sys_shmctl(first, second, compat_ptr(ptr));
53 } 51 }
54 return -ENOSYS; 52 return -ENOSYS;
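For illustration, a minimal sketch (not from the patch) of the argument decoding sys32_ipc() performs before the switch above: the IPC ABI version sits in the upper bits of the call word and the operation in the low 16 bits. The SEMOP value of 1 is assumed from <linux/ipc.h>.

#include <stdio.h>

int main(void)
{
	unsigned int call = (1u << 16) | 1;	/* hypothetical: version 1, SEMOP (== 1) */
	unsigned int version = call >> 16;	/* same split sys32_ipc() performs */
	unsigned int op = call & 0xffff;

	printf("version=%u op=%u\n", version, op);
	return 0;
}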
diff --git a/arch/x86/ia32/mmap32.c b/arch/x86/ia32/mmap32.c
deleted file mode 100644
index e4b84b4a417..00000000000
--- a/arch/x86/ia32/mmap32.c
+++ /dev/null
@@ -1,79 +0,0 @@
1/*
2 * linux/arch/x86_64/ia32/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Based on the i386 version which was
7 *
8 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
9 * All Rights Reserved.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 *
25 *
26 * Started by Ingo Molnar <mingo@elte.hu>
27 */
28
29#include <linux/personality.h>
30#include <linux/mm.h>
31#include <linux/random.h>
32#include <linux/sched.h>
33
34/*
35 * Top of mmap area (just below the process stack).
36 *
37 * Leave an at least ~128 MB hole.
38 */
39#define MIN_GAP (128*1024*1024)
40#define MAX_GAP (TASK_SIZE/6*5)
41
42static inline unsigned long mmap_base(struct mm_struct *mm)
43{
44 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
45 unsigned long random_factor = 0;
46
47 if (current->flags & PF_RANDOMIZE)
48 random_factor = get_random_int() % (1024*1024);
49
50 if (gap < MIN_GAP)
51 gap = MIN_GAP;
52 else if (gap > MAX_GAP)
53 gap = MAX_GAP;
54
55 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
56}
57
58/*
59 * This function, called very early during the creation of a new
60 * process VM image, sets up which VM layout function to use:
61 */
62void ia32_pick_mmap_layout(struct mm_struct *mm)
63{
64 /*
65 * Fall back to the standard layout if the personality
66 * bit is set, or if the expected stack growth is unlimited:
67 */
68 if (sysctl_legacy_va_layout ||
69 (current->personality & ADDR_COMPAT_LAYOUT) ||
70 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
71 mm->mmap_base = TASK_UNMAPPED_BASE;
72 mm->get_unmapped_area = arch_get_unmapped_area;
73 mm->unmap_area = arch_unmap_area;
74 } else {
75 mm->mmap_base = mmap_base(mm);
76 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
77 mm->unmap_area = arch_unmap_area_topdown;
78 }
79}
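A user-space sketch (not from the patch) of the mmap_base() calculation in the removed mmap32.c: the gap reserved below the stack is clamped to between 128 MB and five sixths of TASK_SIZE, optionally lowered further by a sub-1 MB random offset, and the result is page-aligned. TASK_SIZE, the stack limit and the random factor below are made-up numbers.

#include <stdio.h>

#define TASK_SIZE	0xffffe000UL		/* made-up 32-bit task size */
#define PAGE_ALIGN(x)	(((x) + 4095UL) & ~4095UL)
#define MIN_GAP		(128UL * 1024 * 1024)
#define MAX_GAP		(TASK_SIZE / 6 * 5)

int main(void)
{
	unsigned long gap = 8UL * 1024 * 1024;		/* pretend RLIMIT_STACK is 8 MB */
	unsigned long random_factor = 0x12345UL;	/* stand-in for get_random_int() % (1024*1024) */

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	printf("mmap_base = %#lx\n", PAGE_ALIGN(TASK_SIZE - gap - random_factor));
	return 0;
}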
diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c
deleted file mode 100644
index 4a233ad6269..00000000000
--- a/arch/x86/ia32/ptrace32.c
+++ /dev/null
@@ -1,404 +0,0 @@
1/*
2 * 32bit ptrace for x86-64.
3 *
4 * Copyright 2001,2002 Andi Kleen, SuSE Labs.
5 * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier
6 * copyright.
7 *
8 * This allows to access 64bit processes too; but there is no way to see the extended
9 * register contents.
10 */
11
12#include <linux/kernel.h>
13#include <linux/stddef.h>
14#include <linux/sched.h>
15#include <linux/syscalls.h>
16#include <linux/unistd.h>
17#include <linux/mm.h>
18#include <linux/err.h>
19#include <linux/ptrace.h>
20#include <asm/ptrace.h>
21#include <asm/compat.h>
22#include <asm/uaccess.h>
23#include <asm/user32.h>
24#include <asm/user.h>
25#include <asm/errno.h>
26#include <asm/debugreg.h>
27#include <asm/i387.h>
28#include <asm/fpu32.h>
29#include <asm/ia32.h>
30
31/*
32 * Determines which flags the user has access to [1 = access, 0 = no access].
33 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
34 * Also masks reserved bits (31-22, 15, 5, 3, 1).
35 */
36#define FLAG_MASK 0x54dd5UL
37
38#define R32(l,q) \
39 case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break
40
41static int putreg32(struct task_struct *child, unsigned regno, u32 val)
42{
43 int i;
44 __u64 *stack = (__u64 *)task_pt_regs(child);
45
46 switch (regno) {
47 case offsetof(struct user32, regs.fs):
48 if (val && (val & 3) != 3) return -EIO;
49 child->thread.fsindex = val & 0xffff;
50 break;
51 case offsetof(struct user32, regs.gs):
52 if (val && (val & 3) != 3) return -EIO;
53 child->thread.gsindex = val & 0xffff;
54 break;
55 case offsetof(struct user32, regs.ds):
56 if (val && (val & 3) != 3) return -EIO;
57 child->thread.ds = val & 0xffff;
58 break;
59 case offsetof(struct user32, regs.es):
60 child->thread.es = val & 0xffff;
61 break;
62 case offsetof(struct user32, regs.ss):
63 if ((val & 3) != 3) return -EIO;
64 stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff;
65 break;
66 case offsetof(struct user32, regs.cs):
67 if ((val & 3) != 3) return -EIO;
68 stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff;
69 break;
70
71 R32(ebx, rbx);
72 R32(ecx, rcx);
73 R32(edx, rdx);
74 R32(edi, rdi);
75 R32(esi, rsi);
76 R32(ebp, rbp);
77 R32(eax, rax);
78 R32(orig_eax, orig_rax);
79 R32(eip, rip);
80 R32(esp, rsp);
81
82 case offsetof(struct user32, regs.eflags): {
83 __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8];
84 val &= FLAG_MASK;
85 *flags = val | (*flags & ~FLAG_MASK);
86 break;
87 }
88
89 case offsetof(struct user32, u_debugreg[4]):
90 case offsetof(struct user32, u_debugreg[5]):
91 return -EIO;
92
93 case offsetof(struct user32, u_debugreg[0]):
94 child->thread.debugreg0 = val;
95 break;
96
97 case offsetof(struct user32, u_debugreg[1]):
98 child->thread.debugreg1 = val;
99 break;
100
101 case offsetof(struct user32, u_debugreg[2]):
102 child->thread.debugreg2 = val;
103 break;
104
105 case offsetof(struct user32, u_debugreg[3]):
106 child->thread.debugreg3 = val;
107 break;
108
109 case offsetof(struct user32, u_debugreg[6]):
110 child->thread.debugreg6 = val;
111 break;
112
113 case offsetof(struct user32, u_debugreg[7]):
114 val &= ~DR_CONTROL_RESERVED;
115 /* See arch/i386/kernel/ptrace.c for an explanation of
116 * this awkward check.*/
117 for(i=0; i<4; i++)
118 if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
119 return -EIO;
120 child->thread.debugreg7 = val;
121 if (val)
122 set_tsk_thread_flag(child, TIF_DEBUG);
123 else
124 clear_tsk_thread_flag(child, TIF_DEBUG);
125 break;
126
127 default:
128 if (regno > sizeof(struct user32) || (regno & 3))
129 return -EIO;
130
131 /* Other dummy fields in the virtual user structure are ignored */
132 break;
133 }
134 return 0;
135}
136
137#undef R32
138
139#define R32(l,q) \
140 case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break
141
142static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
143{
144 __u64 *stack = (__u64 *)task_pt_regs(child);
145
146 switch (regno) {
147 case offsetof(struct user32, regs.fs):
148 *val = child->thread.fsindex;
149 break;
150 case offsetof(struct user32, regs.gs):
151 *val = child->thread.gsindex;
152 break;
153 case offsetof(struct user32, regs.ds):
154 *val = child->thread.ds;
155 break;
156 case offsetof(struct user32, regs.es):
157 *val = child->thread.es;
158 break;
159
160 R32(cs, cs);
161 R32(ss, ss);
162 R32(ebx, rbx);
163 R32(ecx, rcx);
164 R32(edx, rdx);
165 R32(edi, rdi);
166 R32(esi, rsi);
167 R32(ebp, rbp);
168 R32(eax, rax);
169 R32(orig_eax, orig_rax);
170 R32(eip, rip);
171 R32(eflags, eflags);
172 R32(esp, rsp);
173
174 case offsetof(struct user32, u_debugreg[0]):
175 *val = child->thread.debugreg0;
176 break;
177 case offsetof(struct user32, u_debugreg[1]):
178 *val = child->thread.debugreg1;
179 break;
180 case offsetof(struct user32, u_debugreg[2]):
181 *val = child->thread.debugreg2;
182 break;
183 case offsetof(struct user32, u_debugreg[3]):
184 *val = child->thread.debugreg3;
185 break;
186 case offsetof(struct user32, u_debugreg[6]):
187 *val = child->thread.debugreg6;
188 break;
189 case offsetof(struct user32, u_debugreg[7]):
190 *val = child->thread.debugreg7;
191 break;
192
193 default:
194 if (regno > sizeof(struct user32) || (regno & 3))
195 return -EIO;
196
197 /* Other dummy fields in the virtual user structure are ignored */
198 *val = 0;
199 break;
200 }
201 return 0;
202}
203
204#undef R32
205
206static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
207{
208 int ret;
209 compat_siginfo_t __user *si32 = compat_ptr(data);
210 siginfo_t ssi;
211 siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
212 if (request == PTRACE_SETSIGINFO) {
213 memset(&ssi, 0, sizeof(siginfo_t));
214 ret = copy_siginfo_from_user32(&ssi, si32);
215 if (ret)
216 return ret;
217 if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
218 return -EFAULT;
219 }
220 ret = sys_ptrace(request, pid, addr, (unsigned long)si);
221 if (ret)
222 return ret;
223 if (request == PTRACE_GETSIGINFO) {
224 if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
225 return -EFAULT;
226 ret = copy_siginfo_to_user32(si32, &ssi);
227 }
228 return ret;
229}
230
231asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
232{
233 struct task_struct *child;
234 struct pt_regs *childregs;
235 void __user *datap = compat_ptr(data);
236 int ret;
237 __u32 val;
238
239 switch (request) {
240 case PTRACE_TRACEME:
241 case PTRACE_ATTACH:
242 case PTRACE_KILL:
243 case PTRACE_CONT:
244 case PTRACE_SINGLESTEP:
245 case PTRACE_DETACH:
246 case PTRACE_SYSCALL:
247 case PTRACE_OLDSETOPTIONS:
248 case PTRACE_SETOPTIONS:
249 case PTRACE_SET_THREAD_AREA:
250 case PTRACE_GET_THREAD_AREA:
251 return sys_ptrace(request, pid, addr, data);
252
253 default:
254 return -EINVAL;
255
256 case PTRACE_PEEKTEXT:
257 case PTRACE_PEEKDATA:
258 case PTRACE_POKEDATA:
259 case PTRACE_POKETEXT:
260 case PTRACE_POKEUSR:
261 case PTRACE_PEEKUSR:
262 case PTRACE_GETREGS:
263 case PTRACE_SETREGS:
264 case PTRACE_SETFPREGS:
265 case PTRACE_GETFPREGS:
266 case PTRACE_SETFPXREGS:
267 case PTRACE_GETFPXREGS:
268 case PTRACE_GETEVENTMSG:
269 break;
270
271 case PTRACE_SETSIGINFO:
272 case PTRACE_GETSIGINFO:
273 return ptrace32_siginfo(request, pid, addr, data);
274 }
275
276 child = ptrace_get_task_struct(pid);
277 if (IS_ERR(child))
278 return PTR_ERR(child);
279
280 ret = ptrace_check_attach(child, request == PTRACE_KILL);
281 if (ret < 0)
282 goto out;
283
284 childregs = task_pt_regs(child);
285
286 switch (request) {
287 case PTRACE_PEEKDATA:
288 case PTRACE_PEEKTEXT:
289 ret = 0;
290 if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32))
291 ret = -EIO;
292 else
293 ret = put_user(val, (unsigned int __user *)datap);
294 break;
295
296 case PTRACE_POKEDATA:
297 case PTRACE_POKETEXT:
298 ret = 0;
299 if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32))
300 ret = -EIO;
301 break;
302
303 case PTRACE_PEEKUSR:
304 ret = getreg32(child, addr, &val);
305 if (ret == 0)
306 ret = put_user(val, (__u32 __user *)datap);
307 break;
308
309 case PTRACE_POKEUSR:
310 ret = putreg32(child, addr, data);
311 break;
312
313 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
314 int i;
315 if (!access_ok(VERIFY_WRITE, datap, 16*4)) {
316 ret = -EIO;
317 break;
318 }
319 ret = 0;
320 for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) {
321 getreg32(child, i, &val);
322 ret |= __put_user(val,(u32 __user *)datap);
323 datap += sizeof(u32);
324 }
325 break;
326 }
327
328 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
329 unsigned long tmp;
330 int i;
331 if (!access_ok(VERIFY_READ, datap, 16*4)) {
332 ret = -EIO;
333 break;
334 }
335 ret = 0;
336 for ( i = 0; i <= 16*4; i += sizeof(u32) ) {
337 ret |= __get_user(tmp, (u32 __user *)datap);
338 putreg32(child, i, tmp);
339 datap += sizeof(u32);
340 }
341 break;
342 }
343
344 case PTRACE_GETFPREGS:
345 ret = -EIO;
346 if (!access_ok(VERIFY_READ, compat_ptr(data),
347 sizeof(struct user_i387_struct)))
348 break;
349 save_i387_ia32(child, datap, childregs, 1);
350 ret = 0;
351 break;
352
353 case PTRACE_SETFPREGS:
354 ret = -EIO;
355 if (!access_ok(VERIFY_WRITE, datap,
356 sizeof(struct user_i387_struct)))
357 break;
358 ret = 0;
359 /* don't check EFAULT to be bug-to-bug compatible to i386 */
360 restore_i387_ia32(child, datap, 1);
361 break;
362
363 case PTRACE_GETFPXREGS: {
364 struct user32_fxsr_struct __user *u = datap;
365 init_fpu(child);
366 ret = -EIO;
367 if (!access_ok(VERIFY_WRITE, u, sizeof(*u)))
368 break;
369 ret = -EFAULT;
370 if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u)))
371 break;
372 ret = __put_user(childregs->cs, &u->fcs);
373 ret |= __put_user(child->thread.ds, &u->fos);
374 break;
375 }
376 case PTRACE_SETFPXREGS: {
377 struct user32_fxsr_struct __user *u = datap;
378 unlazy_fpu(child);
379 ret = -EIO;
380 if (!access_ok(VERIFY_READ, u, sizeof(*u)))
381 break;
382 /* no checking to be bug-to-bug compatible with i386. */
383 /* but silence warning */
384 if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)))
385 ;
386 set_stopped_child_used_math(child);
387 child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
388 ret = 0;
389 break;
390 }
391
392 case PTRACE_GETEVENTMSG:
393 ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data));
394 break;
395
396 default:
397 BUG();
398 }
399
400 out:
401 put_task_struct(child);
402 return ret;
403}
404
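A standalone sketch (not taken from the removed file) of the eflags filtering that putreg32() in the deleted ptrace32.c performed: a tracer's write is masked with FLAG_MASK so that only the permitted flag bits change, while protected bits such as IF and IOPL are kept from the tracee's saved flags. The example values are arbitrary.

#include <stdio.h>

#define FLAG_MASK 0x54dd5UL	/* same mask the removed putreg32() used */

int main(void)
{
	unsigned long flags = 0x246UL;		/* made-up saved eflags of the tracee */
	unsigned long val = 0xffffffffUL;	/* tracer tries to set every bit */

	val &= FLAG_MASK;			/* drop bits the tracer may not touch */
	flags = val | (flags & ~FLAG_MASK);	/* keep protected bits from the tracee */

	printf("resulting eflags = %#lx\n", flags);
	return 0;
}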
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index bee96d61443..abf71d26fc2 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -1,29 +1,29 @@
1/* 1/*
2 * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on 2 * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
3 * sys_sparc32 3 * sys_sparc32
4 * 4 *
5 * Copyright (C) 2000 VA Linux Co 5 * Copyright (C) 2000 VA Linux Co
6 * Copyright (C) 2000 Don Dugger <n0ano@valinux.com> 6 * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
7 * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> 7 * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
8 * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 8 * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
9 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) 9 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
10 * Copyright (C) 2000 Hewlett-Packard Co. 10 * Copyright (C) 2000 Hewlett-Packard Co.
11 * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> 11 * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
12 * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) 12 * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port)
13 * 13 *
14 * These routines maintain argument size conversion between 32bit and 64bit 14 * These routines maintain argument size conversion between 32bit and 64bit
15 * environment. In 2.5 most of this should be moved to a generic directory. 15 * environment. In 2.5 most of this should be moved to a generic directory.
16 * 16 *
17 * This file assumes that there is a hole at the end of user address space. 17 * This file assumes that there is a hole at the end of user address space.
18 * 18 *
19 * Some of the functions are LE specific currently. These are hopefully all marked. 19 * Some of the functions are LE specific currently. These are
20 * This should be fixed. 20 * hopefully all marked. This should be fixed.
21 */ 21 */
22 22
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/signal.h> 27#include <linux/signal.h>
28#include <linux/syscalls.h> 28#include <linux/syscalls.h>
29#include <linux/resource.h> 29#include <linux/resource.h>
@@ -90,43 +90,44 @@ int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
90 if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) 90 if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
91 return -EOVERFLOW; 91 return -EOVERFLOW;
92 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || 92 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
93 __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) || 93 __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
94 __put_user (ino, &ubuf->st_ino) || 94 __put_user(ino, &ubuf->st_ino) ||
95 __put_user (kbuf->mode, &ubuf->st_mode) || 95 __put_user(kbuf->mode, &ubuf->st_mode) ||
96 __put_user (kbuf->nlink, &ubuf->st_nlink) || 96 __put_user(kbuf->nlink, &ubuf->st_nlink) ||
97 __put_user (uid, &ubuf->st_uid) || 97 __put_user(uid, &ubuf->st_uid) ||
98 __put_user (gid, &ubuf->st_gid) || 98 __put_user(gid, &ubuf->st_gid) ||
99 __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || 99 __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
100 __put_user (kbuf->size, &ubuf->st_size) || 100 __put_user(kbuf->size, &ubuf->st_size) ||
101 __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) || 101 __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) ||
102 __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || 102 __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
103 __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) || 103 __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
104 __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || 104 __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
105 __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) || 105 __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
106 __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || 106 __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
107 __put_user (kbuf->blksize, &ubuf->st_blksize) || 107 __put_user(kbuf->blksize, &ubuf->st_blksize) ||
108 __put_user (kbuf->blocks, &ubuf->st_blocks)) 108 __put_user(kbuf->blocks, &ubuf->st_blocks))
109 return -EFAULT; 109 return -EFAULT;
110 return 0; 110 return 0;
111} 111}
112 112
113asmlinkage long 113asmlinkage long sys32_truncate64(char __user *filename,
114sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high) 114 unsigned long offset_low,
115 unsigned long offset_high)
115{ 116{
116 return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); 117 return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low);
117} 118}
118 119
119asmlinkage long 120asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long offset_low,
120sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high) 121 unsigned long offset_high)
121{ 122{
122 return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); 123 return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
123} 124}
124 125
125/* Another set for IA32/LFS -- x86_64 struct stat is different due to 126/*
126 support for 64bit inode numbers. */ 127 * Another set for IA32/LFS -- x86_64 struct stat is different due to
127 128 * support for 64bit inode numbers.
128static int 129 */
129cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) 130static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
130{ 131{
131 typeof(ubuf->st_uid) uid = 0; 132 typeof(ubuf->st_uid) uid = 0;
132 typeof(ubuf->st_gid) gid = 0; 133 typeof(ubuf->st_gid) gid = 0;
@@ -134,38 +135,39 @@ cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
134 SET_GID(gid, stat->gid); 135 SET_GID(gid, stat->gid);
135 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || 136 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
136 __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || 137 __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
137 __put_user (stat->ino, &ubuf->__st_ino) || 138 __put_user(stat->ino, &ubuf->__st_ino) ||
138 __put_user (stat->ino, &ubuf->st_ino) || 139 __put_user(stat->ino, &ubuf->st_ino) ||
139 __put_user (stat->mode, &ubuf->st_mode) || 140 __put_user(stat->mode, &ubuf->st_mode) ||
140 __put_user (stat->nlink, &ubuf->st_nlink) || 141 __put_user(stat->nlink, &ubuf->st_nlink) ||
141 __put_user (uid, &ubuf->st_uid) || 142 __put_user(uid, &ubuf->st_uid) ||
142 __put_user (gid, &ubuf->st_gid) || 143 __put_user(gid, &ubuf->st_gid) ||
143 __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) || 144 __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
144 __put_user (stat->size, &ubuf->st_size) || 145 __put_user(stat->size, &ubuf->st_size) ||
145 __put_user (stat->atime.tv_sec, &ubuf->st_atime) || 146 __put_user(stat->atime.tv_sec, &ubuf->st_atime) ||
146 __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) || 147 __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
147 __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) || 148 __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) ||
148 __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || 149 __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
149 __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) || 150 __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) ||
150 __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || 151 __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
151 __put_user (stat->blksize, &ubuf->st_blksize) || 152 __put_user(stat->blksize, &ubuf->st_blksize) ||
152 __put_user (stat->blocks, &ubuf->st_blocks)) 153 __put_user(stat->blocks, &ubuf->st_blocks))
153 return -EFAULT; 154 return -EFAULT;
154 return 0; 155 return 0;
155} 156}
156 157
157asmlinkage long 158asmlinkage long sys32_stat64(char __user *filename,
158sys32_stat64(char __user * filename, struct stat64 __user *statbuf) 159 struct stat64 __user *statbuf)
159{ 160{
160 struct kstat stat; 161 struct kstat stat;
161 int ret = vfs_stat(filename, &stat); 162 int ret = vfs_stat(filename, &stat);
163
162 if (!ret) 164 if (!ret)
163 ret = cp_stat64(statbuf, &stat); 165 ret = cp_stat64(statbuf, &stat);
164 return ret; 166 return ret;
165} 167}
166 168
167asmlinkage long 169asmlinkage long sys32_lstat64(char __user *filename,
168sys32_lstat64(char __user * filename, struct stat64 __user *statbuf) 170 struct stat64 __user *statbuf)
169{ 171{
170 struct kstat stat; 172 struct kstat stat;
171 int ret = vfs_lstat(filename, &stat); 173 int ret = vfs_lstat(filename, &stat);
@@ -174,8 +176,7 @@ sys32_lstat64(char __user * filename, struct stat64 __user *statbuf)
174 return ret; 176 return ret;
175} 177}
176 178
177asmlinkage long 179asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
178sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
179{ 180{
180 struct kstat stat; 181 struct kstat stat;
181 int ret = vfs_fstat(fd, &stat); 182 int ret = vfs_fstat(fd, &stat);
@@ -184,9 +185,8 @@ sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
184 return ret; 185 return ret;
185} 186}
186 187
187asmlinkage long 188asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
188sys32_fstatat(unsigned int dfd, char __user *filename, 189 struct stat64 __user *statbuf, int flag)
189 struct stat64 __user* statbuf, int flag)
190{ 190{
191 struct kstat stat; 191 struct kstat stat;
192 int error = -EINVAL; 192 int error = -EINVAL;
@@ -221,8 +221,7 @@ struct mmap_arg_struct {
221 unsigned int offset; 221 unsigned int offset;
222}; 222};
223 223
224asmlinkage long 224asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
225sys32_mmap(struct mmap_arg_struct __user *arg)
226{ 225{
227 struct mmap_arg_struct a; 226 struct mmap_arg_struct a;
228 struct file *file = NULL; 227 struct file *file = NULL;
@@ -233,33 +232,33 @@ sys32_mmap(struct mmap_arg_struct __user *arg)
233 return -EFAULT; 232 return -EFAULT;
234 233
235 if (a.offset & ~PAGE_MASK) 234 if (a.offset & ~PAGE_MASK)
236 return -EINVAL; 235 return -EINVAL;
237 236
238 if (!(a.flags & MAP_ANONYMOUS)) { 237 if (!(a.flags & MAP_ANONYMOUS)) {
239 file = fget(a.fd); 238 file = fget(a.fd);
240 if (!file) 239 if (!file)
241 return -EBADF; 240 return -EBADF;
242 } 241 }
243 242
244 mm = current->mm; 243 mm = current->mm;
245 down_write(&mm->mmap_sem); 244 down_write(&mm->mmap_sem);
246 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT); 245 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags,
246 a.offset>>PAGE_SHIFT);
247 if (file) 247 if (file)
248 fput(file); 248 fput(file);
249 249
250 up_write(&mm->mmap_sem); 250 up_write(&mm->mmap_sem);
251 251
252 return retval; 252 return retval;
253} 253}
254 254
255asmlinkage long 255asmlinkage long sys32_mprotect(unsigned long start, size_t len,
256sys32_mprotect(unsigned long start, size_t len, unsigned long prot) 256 unsigned long prot)
257{ 257{
258 return sys_mprotect(start,len,prot); 258 return sys_mprotect(start, len, prot);
259} 259}
260 260
261asmlinkage long 261asmlinkage long sys32_pipe(int __user *fd)
262sys32_pipe(int __user *fd)
263{ 262{
264 int retval; 263 int retval;
265 int fds[2]; 264 int fds[2];
@@ -269,13 +268,13 @@ sys32_pipe(int __user *fd)
269 goto out; 268 goto out;
270 if (copy_to_user(fd, fds, sizeof(fds))) 269 if (copy_to_user(fd, fds, sizeof(fds)))
271 retval = -EFAULT; 270 retval = -EFAULT;
272 out: 271out:
273 return retval; 272 return retval;
274} 273}
275 274
276asmlinkage long 275asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
277sys32_rt_sigaction(int sig, struct sigaction32 __user *act, 276 struct sigaction32 __user *oact,
278 struct sigaction32 __user *oact, unsigned int sigsetsize) 277 unsigned int sigsetsize)
279{ 278{
280 struct k_sigaction new_ka, old_ka; 279 struct k_sigaction new_ka, old_ka;
281 int ret; 280 int ret;
@@ -291,12 +290,17 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
291 if (!access_ok(VERIFY_READ, act, sizeof(*act)) || 290 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
292 __get_user(handler, &act->sa_handler) || 291 __get_user(handler, &act->sa_handler) ||
293 __get_user(new_ka.sa.sa_flags, &act->sa_flags) || 292 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
294 __get_user(restorer, &act->sa_restorer)|| 293 __get_user(restorer, &act->sa_restorer) ||
295 __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t))) 294 __copy_from_user(&set32, &act->sa_mask,
295 sizeof(compat_sigset_t)))
296 return -EFAULT; 296 return -EFAULT;
297 new_ka.sa.sa_handler = compat_ptr(handler); 297 new_ka.sa.sa_handler = compat_ptr(handler);
298 new_ka.sa.sa_restorer = compat_ptr(restorer); 298 new_ka.sa.sa_restorer = compat_ptr(restorer);
299 /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ 299
300 /*
301 * FIXME: here we rely on _COMPAT_NSIG_WORS to be >=
302 * than _NSIG_WORDS << 1
303 */
300 switch (_NSIG_WORDS) { 304 switch (_NSIG_WORDS) {
301 case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6] 305 case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6]
302 | (((long)set32.sig[7]) << 32); 306 | (((long)set32.sig[7]) << 32);
@@ -312,7 +316,10 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
312 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); 316 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
313 317
314 if (!ret && oact) { 318 if (!ret && oact) {
315 /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ 319 /*
320 * FIXME: here we rely on _COMPAT_NSIG_WORS to be >=
321 * than _NSIG_WORDS << 1
322 */
316 switch (_NSIG_WORDS) { 323 switch (_NSIG_WORDS) {
317 case 4: 324 case 4:
318 set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32); 325 set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32);
@@ -328,23 +335,26 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
328 set32.sig[0] = old_ka.sa.sa_mask.sig[0]; 335 set32.sig[0] = old_ka.sa.sa_mask.sig[0];
329 } 336 }
330 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || 337 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
331 __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || 338 __put_user(ptr_to_compat(old_ka.sa.sa_handler),
332 __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || 339 &oact->sa_handler) ||
340 __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
341 &oact->sa_restorer) ||
333 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || 342 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
334 __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t))) 343 __copy_to_user(&oact->sa_mask, &set32,
344 sizeof(compat_sigset_t)))
335 return -EFAULT; 345 return -EFAULT;
336 } 346 }
337 347
338 return ret; 348 return ret;
339} 349}
340 350
341asmlinkage long 351asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act,
342sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact) 352 struct old_sigaction32 __user *oact)
343{ 353{
344 struct k_sigaction new_ka, old_ka; 354 struct k_sigaction new_ka, old_ka;
345 int ret; 355 int ret;
346 356
347 if (act) { 357 if (act) {
348 compat_old_sigset_t mask; 358 compat_old_sigset_t mask;
349 compat_uptr_t handler, restorer; 359 compat_uptr_t handler, restorer;
350 360
@@ -359,33 +369,35 @@ sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigacti
359 new_ka.sa.sa_restorer = compat_ptr(restorer); 369 new_ka.sa.sa_restorer = compat_ptr(restorer);
360 370
361 siginitset(&new_ka.sa.sa_mask, mask); 371 siginitset(&new_ka.sa.sa_mask, mask);
362 } 372 }
363 373
364 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); 374 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
365 375
366 if (!ret && oact) { 376 if (!ret && oact) {
367 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || 377 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
368 __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || 378 __put_user(ptr_to_compat(old_ka.sa.sa_handler),
369 __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || 379 &oact->sa_handler) ||
380 __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
381 &oact->sa_restorer) ||
370 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || 382 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
371 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) 383 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
372 return -EFAULT; 384 return -EFAULT;
373 } 385 }
374 386
375 return ret; 387 return ret;
376} 388}
377 389
378asmlinkage long 390asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
379sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, 391 compat_sigset_t __user *oset,
380 compat_sigset_t __user *oset, unsigned int sigsetsize) 392 unsigned int sigsetsize)
381{ 393{
382 sigset_t s; 394 sigset_t s;
383 compat_sigset_t s32; 395 compat_sigset_t s32;
384 int ret; 396 int ret;
385 mm_segment_t old_fs = get_fs(); 397 mm_segment_t old_fs = get_fs();
386 398
387 if (set) { 399 if (set) {
388 if (copy_from_user (&s32, set, sizeof(compat_sigset_t))) 400 if (copy_from_user(&s32, set, sizeof(compat_sigset_t)))
389 return -EFAULT; 401 return -EFAULT;
390 switch (_NSIG_WORDS) { 402 switch (_NSIG_WORDS) {
391 case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); 403 case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
@@ -394,13 +406,14 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
394 case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); 406 case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
395 } 407 }
396 } 408 }
397 set_fs (KERNEL_DS); 409 set_fs(KERNEL_DS);
398 ret = sys_rt_sigprocmask(how, 410 ret = sys_rt_sigprocmask(how,
399 set ? (sigset_t __user *)&s : NULL, 411 set ? (sigset_t __user *)&s : NULL,
400 oset ? (sigset_t __user *)&s : NULL, 412 oset ? (sigset_t __user *)&s : NULL,
401 sigsetsize); 413 sigsetsize);
402 set_fs (old_fs); 414 set_fs(old_fs);
403 if (ret) return ret; 415 if (ret)
416 return ret;
404 if (oset) { 417 if (oset) {
405 switch (_NSIG_WORDS) { 418 switch (_NSIG_WORDS) {
406 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; 419 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
@@ -408,52 +421,49 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
408 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; 421 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
409 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; 422 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
410 } 423 }
411 if (copy_to_user (oset, &s32, sizeof(compat_sigset_t))) 424 if (copy_to_user(oset, &s32, sizeof(compat_sigset_t)))
412 return -EFAULT; 425 return -EFAULT;
413 } 426 }
414 return 0; 427 return 0;
415} 428}
416 429
417static inline long 430static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i)
418get_tv32(struct timeval *o, struct compat_timeval __user *i)
419{ 431{
420 int err = -EFAULT; 432 int err = -EFAULT;
421 if (access_ok(VERIFY_READ, i, sizeof(*i))) { 433
434 if (access_ok(VERIFY_READ, i, sizeof(*i))) {
422 err = __get_user(o->tv_sec, &i->tv_sec); 435 err = __get_user(o->tv_sec, &i->tv_sec);
423 err |= __get_user(o->tv_usec, &i->tv_usec); 436 err |= __get_user(o->tv_usec, &i->tv_usec);
424 } 437 }
425 return err; 438 return err;
426} 439}
427 440
428static inline long 441static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
429put_tv32(struct compat_timeval __user *o, struct timeval *i)
430{ 442{
431 int err = -EFAULT; 443 int err = -EFAULT;
432 if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { 444
445 if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
433 err = __put_user(i->tv_sec, &o->tv_sec); 446 err = __put_user(i->tv_sec, &o->tv_sec);
434 err |= __put_user(i->tv_usec, &o->tv_usec); 447 err |= __put_user(i->tv_usec, &o->tv_usec);
435 } 448 }
436 return err; 449 return err;
437} 450}
438 451
439extern unsigned int alarm_setitimer(unsigned int seconds); 452asmlinkage long sys32_alarm(unsigned int seconds)
440
441asmlinkage long
442sys32_alarm(unsigned int seconds)
443{ 453{
444 return alarm_setitimer(seconds); 454 return alarm_setitimer(seconds);
445} 455}
446 456
447/* Translations due to time_t size differences. Which affects all 457/*
448 sorts of things, like timeval and itimerval. */ 458 * Translations due to time_t size differences. Which affects all
449 459 * sorts of things, like timeval and itimerval.
450extern struct timezone sys_tz; 460 */
451 461asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv,
452asmlinkage long 462 struct timezone __user *tz)
453sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
454{ 463{
455 if (tv) { 464 if (tv) {
456 struct timeval ktv; 465 struct timeval ktv;
466
457 do_gettimeofday(&ktv); 467 do_gettimeofday(&ktv);
458 if (put_tv32(tv, &ktv)) 468 if (put_tv32(tv, &ktv))
459 return -EFAULT; 469 return -EFAULT;
@@ -465,14 +475,14 @@ sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
465 return 0; 475 return 0;
466} 476}
467 477
468asmlinkage long 478asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv,
469sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) 479 struct timezone __user *tz)
470{ 480{
471 struct timeval ktv; 481 struct timeval ktv;
472 struct timespec kts; 482 struct timespec kts;
473 struct timezone ktz; 483 struct timezone ktz;
474 484
475 if (tv) { 485 if (tv) {
476 if (get_tv32(&ktv, tv)) 486 if (get_tv32(&ktv, tv))
477 return -EFAULT; 487 return -EFAULT;
478 kts.tv_sec = ktv.tv_sec; 488 kts.tv_sec = ktv.tv_sec;
@@ -494,8 +504,7 @@ struct sel_arg_struct {
494 unsigned int tvp; 504 unsigned int tvp;
495}; 505};
496 506
497asmlinkage long 507asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg)
498sys32_old_select(struct sel_arg_struct __user *arg)
499{ 508{
500 struct sel_arg_struct a; 509 struct sel_arg_struct a;
501 510
@@ -505,50 +514,45 @@ sys32_old_select(struct sel_arg_struct __user *arg)
505 compat_ptr(a.exp), compat_ptr(a.tvp)); 514 compat_ptr(a.exp), compat_ptr(a.tvp));
506} 515}
507 516
508extern asmlinkage long 517asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
509compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options, 518 int options)
510 struct compat_rusage *ru);
511
512asmlinkage long
513sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)
514{ 519{
515 return compat_sys_wait4(pid, stat_addr, options, NULL); 520 return compat_sys_wait4(pid, stat_addr, options, NULL);
516} 521}
517 522
518/* 32-bit timeval and related flotsam. */ 523/* 32-bit timeval and related flotsam. */
519 524
520asmlinkage long 525asmlinkage long sys32_sysfs(int option, u32 arg1, u32 arg2)
521sys32_sysfs(int option, u32 arg1, u32 arg2)
522{ 526{
523 return sys_sysfs(option, arg1, arg2); 527 return sys_sysfs(option, arg1, arg2);
524} 528}
525 529
526asmlinkage long 530asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid,
527sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) 531 struct compat_timespec __user *interval)
528{ 532{
529 struct timespec t; 533 struct timespec t;
530 int ret; 534 int ret;
531 mm_segment_t old_fs = get_fs (); 535 mm_segment_t old_fs = get_fs();
532 536
533 set_fs (KERNEL_DS); 537 set_fs(KERNEL_DS);
534 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); 538 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
535 set_fs (old_fs); 539 set_fs(old_fs);
536 if (put_compat_timespec(&t, interval)) 540 if (put_compat_timespec(&t, interval))
537 return -EFAULT; 541 return -EFAULT;
538 return ret; 542 return ret;
539} 543}
540 544
541asmlinkage long 545asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *set,
542sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) 546 compat_size_t sigsetsize)
543{ 547{
544 sigset_t s; 548 sigset_t s;
545 compat_sigset_t s32; 549 compat_sigset_t s32;
546 int ret; 550 int ret;
547 mm_segment_t old_fs = get_fs(); 551 mm_segment_t old_fs = get_fs();
548 552
549 set_fs (KERNEL_DS); 553 set_fs(KERNEL_DS);
550 ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize); 554 ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize);
551 set_fs (old_fs); 555 set_fs(old_fs);
552 if (!ret) { 556 if (!ret) {
553 switch (_NSIG_WORDS) { 557 switch (_NSIG_WORDS) {
554 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; 558 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
@@ -556,30 +560,29 @@ sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
556 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; 560 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
557 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; 561 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
558 } 562 }
559 if (copy_to_user (set, &s32, sizeof(compat_sigset_t))) 563 if (copy_to_user(set, &s32, sizeof(compat_sigset_t)))
560 return -EFAULT; 564 return -EFAULT;
561 } 565 }
562 return ret; 566 return ret;
563} 567}
564 568
565asmlinkage long 569asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
-sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
+                                      compat_siginfo_t __user *uinfo)
 {
         siginfo_t info;
         int ret;
         mm_segment_t old_fs = get_fs();
 
         if (copy_siginfo_from_user32(&info, uinfo))
                 return -EFAULT;
-        set_fs (KERNEL_DS);
+        set_fs(KERNEL_DS);
         ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info);
-        set_fs (old_fs);
+        set_fs(old_fs);
         return ret;
 }
 
 /* These are here just in case some old ia32 binary calls it. */
-asmlinkage long
-sys32_pause(void)
+asmlinkage long sys32_pause(void)
 {
         current->state = TASK_INTERRUPTIBLE;
         schedule();
@@ -599,25 +602,25 @@ struct sysctl_ia32 {
 };
 
 
-asmlinkage long
-sys32_sysctl(struct sysctl_ia32 __user *args32)
+asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
 {
         struct sysctl_ia32 a32;
-        mm_segment_t old_fs = get_fs ();
+        mm_segment_t old_fs = get_fs();
         void __user *oldvalp, *newvalp;
         size_t oldlen;
         int __user *namep;
         long ret;
 
-        if (copy_from_user(&a32, args32, sizeof (a32)))
+        if (copy_from_user(&a32, args32, sizeof(a32)))
                 return -EFAULT;
 
         /*
-         * We need to pre-validate these because we have to disable address checking
-         * before calling do_sysctl() because of OLDLEN but we can't run the risk of the
-         * user specifying bad addresses here. Well, since we're dealing with 32 bit
-         * addresses, we KNOW that access_ok() will always succeed, so this is an
-         * expensive NOP, but so what...
+         * We need to pre-validate these because we have to disable
+         * address checking before calling do_sysctl() because of
+         * OLDLEN but we can't run the risk of the user specifying bad
+         * addresses here. Well, since we're dealing with 32 bit
+         * addresses, we KNOW that access_ok() will always succeed, so
+         * this is an expensive NOP, but so what...
          */
         namep = compat_ptr(a32.name);
         oldvalp = compat_ptr(a32.oldval);
@@ -636,34 +639,34 @@ sys32_sysctl(struct sysctl_ia32 __user *args32)
         unlock_kernel();
         set_fs(old_fs);
 
-        if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp)))
+        if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
                 return -EFAULT;
 
         return ret;
 }
 #endif
 
 /* warning: next two assume little endian */
-asmlinkage long
-sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
+asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
+                            u32 poslo, u32 poshi)
 {
         return sys_pread64(fd, ubuf, count,
                            ((loff_t)AA(poshi) << 32) | AA(poslo));
 }
 
-asmlinkage long
-sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
+asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count,
+                             u32 poslo, u32 poshi)
 {
         return sys_pwrite64(fd, ubuf, count,
                             ((loff_t)AA(poshi) << 32) | AA(poslo));
 }
 
 
-asmlinkage long
-sys32_personality(unsigned long personality)
+asmlinkage long sys32_personality(unsigned long personality)
 {
         int ret;
-        if (personality(current->personality) == PER_LINUX32 &&
+
+        if (personality(current->personality) == PER_LINUX32 &&
             personality == PER_LINUX)
                 personality = PER_LINUX32;
         ret = sys_personality(personality);
@@ -672,34 +675,33 @@ sys32_personality(unsigned long personality)
         return ret;
 }
 
-asmlinkage long
-sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
+asmlinkage long sys32_sendfile(int out_fd, int in_fd,
+                               compat_off_t __user *offset, s32 count)
 {
         mm_segment_t old_fs = get_fs();
         int ret;
         off_t of;
 
         if (offset && get_user(of, offset))
                 return -EFAULT;
 
         set_fs(KERNEL_DS);
         ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL,
                            count);
         set_fs(old_fs);
 
         if (offset && put_user(of, offset))
                 return -EFAULT;
-
         return ret;
 }
 
 asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
                             unsigned long prot, unsigned long flags,
                             unsigned long fd, unsigned long pgoff)
 {
         struct mm_struct *mm = current->mm;
         unsigned long error;
-        struct file * file = NULL;
+        struct file *file = NULL;
 
         flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
         if (!(flags & MAP_ANONYMOUS)) {
@@ -717,36 +719,35 @@ asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
         return error;
 }
 
-asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
+asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
 {
+        char *arch = "x86_64";
         int err;
 
         if (!name)
                 return -EFAULT;
         if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
                 return -EFAULT;
 
         down_read(&uts_sem);
 
-        err = __copy_to_user(&name->sysname,&utsname()->sysname,
+        err = __copy_to_user(&name->sysname, &utsname()->sysname,
                              __OLD_UTS_LEN);
-        err |= __put_user(0,name->sysname+__OLD_UTS_LEN);
-        err |= __copy_to_user(&name->nodename,&utsname()->nodename,
+        err |= __put_user(0, name->sysname+__OLD_UTS_LEN);
+        err |= __copy_to_user(&name->nodename, &utsname()->nodename,
                               __OLD_UTS_LEN);
-        err |= __put_user(0,name->nodename+__OLD_UTS_LEN);
-        err |= __copy_to_user(&name->release,&utsname()->release,
+        err |= __put_user(0, name->nodename+__OLD_UTS_LEN);
+        err |= __copy_to_user(&name->release, &utsname()->release,
                               __OLD_UTS_LEN);
-        err |= __put_user(0,name->release+__OLD_UTS_LEN);
-        err |= __copy_to_user(&name->version,&utsname()->version,
+        err |= __put_user(0, name->release+__OLD_UTS_LEN);
+        err |= __copy_to_user(&name->version, &utsname()->version,
                               __OLD_UTS_LEN);
-        err |= __put_user(0,name->version+__OLD_UTS_LEN);
-        {
-                char *arch = "x86_64";
-                if (personality(current->personality) == PER_LINUX32)
-                        arch = "i686";
-
-                err |= __copy_to_user(&name->machine, arch, strlen(arch)+1);
-        }
+        err |= __put_user(0, name->version+__OLD_UTS_LEN);
+
+        if (personality(current->personality) == PER_LINUX32)
+                arch = "i686";
+
+        err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1);
 
         up_read(&uts_sem);
 
@@ -755,17 +756,19 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
         return err;
 }
 
-long sys32_uname(struct old_utsname __user * name)
+long sys32_uname(struct old_utsname __user *name)
 {
         int err;
+
         if (!name)
                 return -EFAULT;
         down_read(&uts_sem);
-        err = copy_to_user(name, utsname(), sizeof (*name));
+        err = copy_to_user(name, utsname(), sizeof(*name));
         up_read(&uts_sem);
         if (personality(current->personality) == PER_LINUX32)
                 err |= copy_to_user(&name->machine, "i686", 5);
-        return err?-EFAULT:0;
+
+        return err ? -EFAULT : 0;
 }
 
 long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
@@ -773,27 +776,28 @@ long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
         struct ustat u;
         mm_segment_t seg;
         int ret;
 
         seg = get_fs();
         set_fs(KERNEL_DS);
         ret = sys_ustat(dev, (struct ustat __user *)&u);
         set_fs(seg);
-        if (ret >= 0) {
-                if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) ||
-                    __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
-                    __put_user((__u32) u.f_tinode, &u32p->f_tfree) ||
-                    __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
-                    __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
-                        ret = -EFAULT;
-        }
+        if (ret < 0)
+                return ret;
+
+        if (!access_ok(VERIFY_WRITE, u32p, sizeof(struct ustat32)) ||
+            __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
+            __put_user((__u32) u.f_tinode, &u32p->f_tfree) ||
+            __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
+            __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
+                ret = -EFAULT;
         return ret;
 }
 
 asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
                              compat_uptr_t __user *envp, struct pt_regs *regs)
 {
         long error;
-        char * filename;
+        char *filename;
 
         filename = getname(name);
         error = PTR_ERR(filename);
@@ -812,18 +816,19 @@ asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
 asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp,
                             struct pt_regs *regs)
 {
-        void __user *parent_tid = (void __user *)regs->rdx;
-        void __user *child_tid = (void __user *)regs->rdi;
+        void __user *parent_tid = (void __user *)regs->dx;
+        void __user *child_tid = (void __user *)regs->di;
+
         if (!newsp)
-                newsp = regs->rsp;
+                newsp = regs->sp;
         return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
 }
 
 /*
- * Some system calls that need sign extended arguments. This could be done by a generic wrapper.
+ * Some system calls that need sign extended arguments. This could be
+ * done by a generic wrapper.
  */
-
-long sys32_lseek (unsigned int fd, int offset, unsigned int whence)
+long sys32_lseek(unsigned int fd, int offset, unsigned int whence)
 {
         return sys_lseek(fd, offset, whence);
 }
@@ -832,49 +837,52 @@ long sys32_kill(int pid, int sig)
 {
         return sys_kill(pid, sig);
 }
 
 long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
                         __u32 len_low, __u32 len_high, int advice)
 {
         return sys_fadvise64_64(fd,
                                (((u64)offset_high)<<32) | offset_low,
                                (((u64)len_high)<<32) | len_low,
                                advice);
 }
 
 long sys32_vm86_warning(void)
 {
         struct task_struct *me = current;
         static char lastcomm[sizeof(me->comm)];
+
         if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-                compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
-                       me->comm);
+                compat_printk(KERN_INFO
+                              "%s: vm86 mode not supported on 64 bit kernel\n",
+                              me->comm);
                 strncpy(lastcomm, me->comm, sizeof(lastcomm));
         }
         return -ENOSYS;
 }
 
 long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
-                char __user * buf, size_t len)
+                          char __user *buf, size_t len)
 {
         return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
 }
 
-asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count)
+asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
+                                   size_t count)
 {
         return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
 }
 
 asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi,
                                       unsigned n_low, unsigned n_hi, int flags)
 {
         return sys_sync_file_range(fd,
                         ((u64)off_hi << 32) | off_low,
                         ((u64)n_hi << 32) | n_low, flags);
 }
 
-asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, size_t len,
-                int advice)
+asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi,
+                                size_t len, int advice)
 {
         return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
                                len, advice);
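The compat wrappers above (sys32_pread, sys32_pwrite, sys32_readahead, sys32_fadvise64, sys32_sync_file_range) all rebuild a 64-bit quantity from the two 32-bit halves that a 32-bit caller passes in separate registers. A minimal stand-alone sketch of that reassembly, written in plain C with invented names purely for illustration (it is not part of the patch; the AA() helper in the file is presumably just a zero-extension macro):

#include <stdint.h>
#include <stdio.h>

/* Reassemble a 64-bit offset from the lo/hi halves a 32-bit caller passes,
 * mirroring ((loff_t)AA(poshi) << 32) | AA(poslo) in the wrappers above. */
static int64_t combine_offset(uint32_t poslo, uint32_t poshi)
{
        return ((int64_t)poshi << 32) | poslo;
}

int main(void)
{
        /* 0x100000200 = 4294967808, an offset that does not fit in 32 bits */
        printf("%lld\n", (long long)combine_offset(0x200, 0x1));
        return 0;
}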
diff --git a/arch/x86/ia32/syscall32.c b/arch/x86/ia32/syscall32.c
deleted file mode 100644
index 15013bac181..00000000000
--- a/arch/x86/ia32/syscall32.c
+++ /dev/null
@@ -1,83 +0,0 @@
1/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
2
3/* vsyscall handling for 32bit processes. Map a stub page into it
4 on demand because 32bit cannot reach the kernel's fixmaps */
5
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/kernel.h>
9#include <linux/gfp.h>
10#include <linux/init.h>
11#include <linux/stringify.h>
12#include <linux/security.h>
13#include <asm/proto.h>
14#include <asm/tlbflush.h>
15#include <asm/ia32_unistd.h>
16#include <asm/vsyscall32.h>
17
18extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
19extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
20extern int sysctl_vsyscall32;
21
22static struct page *syscall32_pages[1];
23static int use_sysenter = -1;
24
25struct linux_binprm;
26
27/* Setup a VMA at program startup for the vsyscall page */
28int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
29{
30 struct mm_struct *mm = current->mm;
31 int ret;
32
33 down_write(&mm->mmap_sem);
34 /*
35 * MAYWRITE to allow gdb to COW and set breakpoints
36 *
37 * Make sure the vDSO gets into every core dump.
38 * Dumping its contents makes post-mortem fully interpretable later
39 * without matching up the same kernel and hardware config to see
40 * what PC values meant.
41 */
42 /* Could randomize here */
43 ret = install_special_mapping(mm, VSYSCALL32_BASE, PAGE_SIZE,
44 VM_READ|VM_EXEC|
45 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
46 VM_ALWAYSDUMP,
47 syscall32_pages);
48 up_write(&mm->mmap_sem);
49 return ret;
50}
51
52static int __init init_syscall32(void)
53{
54 char *syscall32_page = (void *)get_zeroed_page(GFP_KERNEL);
55 if (!syscall32_page)
56 panic("Cannot allocate syscall32 page");
57 syscall32_pages[0] = virt_to_page(syscall32_page);
58 if (use_sysenter > 0) {
59 memcpy(syscall32_page, syscall32_sysenter,
60 syscall32_sysenter_end - syscall32_sysenter);
61 } else {
62 memcpy(syscall32_page, syscall32_syscall,
63 syscall32_syscall_end - syscall32_syscall);
64 }
65 return 0;
66}
67
68__initcall(init_syscall32);
69
70/* May not be __init: called during resume */
71void syscall32_cpu_init(void)
72{
73 if (use_sysenter < 0)
74 use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
75
76 /* Load these always in case some future AMD CPU supports
77 SYSENTER from compat mode too. */
78 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
79 checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
80 checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
81
82 wrmsrl(MSR_CSTAR, ia32_cstar_target);
83}
diff --git a/arch/x86/ia32/syscall32_syscall.S b/arch/x86/ia32/syscall32_syscall.S
deleted file mode 100644
index 933f0f08b1c..00000000000
--- a/arch/x86/ia32/syscall32_syscall.S
+++ /dev/null
@@ -1,17 +0,0 @@
1/* 32bit VDSOs mapped into user space. */
2
3 .section ".init.data","aw"
4
5 .globl syscall32_syscall
6 .globl syscall32_syscall_end
7
8syscall32_syscall:
9 .incbin "arch/x86/ia32/vsyscall-syscall.so"
10syscall32_syscall_end:
11
12 .globl syscall32_sysenter
13 .globl syscall32_sysenter_end
14
15syscall32_sysenter:
16 .incbin "arch/x86/ia32/vsyscall-sysenter.so"
17syscall32_sysenter_end:
diff --git a/arch/x86/ia32/tls32.c b/arch/x86/ia32/tls32.c
deleted file mode 100644
index 1cc4340de3c..00000000000
--- a/arch/x86/ia32/tls32.c
+++ /dev/null
@@ -1,163 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/sched.h>
4#include <linux/user.h>
5
6#include <asm/uaccess.h>
7#include <asm/desc.h>
8#include <asm/system.h>
9#include <asm/ldt.h>
10#include <asm/processor.h>
11#include <asm/proto.h>
12
13/*
14 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
15 */
16static int get_free_idx(void)
17{
18 struct thread_struct *t = &current->thread;
19 int idx;
20
21 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
22 if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx))
23 return idx + GDT_ENTRY_TLS_MIN;
24 return -ESRCH;
25}
26
27/*
28 * Set a given TLS descriptor:
29 * When you want addresses > 32bit use arch_prctl()
30 */
31int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
32{
33 struct user_desc info;
34 struct n_desc_struct *desc;
35 int cpu, idx;
36
37 if (copy_from_user(&info, u_info, sizeof(info)))
38 return -EFAULT;
39
40 idx = info.entry_number;
41
42 /*
43 * index -1 means the kernel should try to find and
44 * allocate an empty descriptor:
45 */
46 if (idx == -1) {
47 idx = get_free_idx();
48 if (idx < 0)
49 return idx;
50 if (put_user(idx, &u_info->entry_number))
51 return -EFAULT;
52 }
53
54 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
55 return -EINVAL;
56
57 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
58
59 /*
60 * We must not get preempted while modifying the TLS.
61 */
62 cpu = get_cpu();
63
64 if (LDT_empty(&info)) {
65 desc->a = 0;
66 desc->b = 0;
67 } else {
68 desc->a = LDT_entry_a(&info);
69 desc->b = LDT_entry_b(&info);
70 }
71 if (t == &current->thread)
72 load_TLS(t, cpu);
73
74 put_cpu();
75 return 0;
76}
77
78asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info)
79{
80 return do_set_thread_area(&current->thread, u_info);
81}
82
83
84/*
85 * Get the current Thread-Local Storage area:
86 */
87
88#define GET_BASE(desc) ( \
89 (((desc)->a >> 16) & 0x0000ffff) | \
90 (((desc)->b << 16) & 0x00ff0000) | \
91 ( (desc)->b & 0xff000000) )
92
93#define GET_LIMIT(desc) ( \
94 ((desc)->a & 0x0ffff) | \
95 ((desc)->b & 0xf0000) )
96
97#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
98#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
99#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
100#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
101#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
102#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
103#define GET_LONGMODE(desc) (((desc)->b >> 21) & 1)
104
105int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
106{
107 struct user_desc info;
108 struct n_desc_struct *desc;
109 int idx;
110
111 if (get_user(idx, &u_info->entry_number))
112 return -EFAULT;
113 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
114 return -EINVAL;
115
116 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
117
118 memset(&info, 0, sizeof(struct user_desc));
119 info.entry_number = idx;
120 info.base_addr = GET_BASE(desc);
121 info.limit = GET_LIMIT(desc);
122 info.seg_32bit = GET_32BIT(desc);
123 info.contents = GET_CONTENTS(desc);
124 info.read_exec_only = !GET_WRITABLE(desc);
125 info.limit_in_pages = GET_LIMIT_PAGES(desc);
126 info.seg_not_present = !GET_PRESENT(desc);
127 info.useable = GET_USEABLE(desc);
128 info.lm = GET_LONGMODE(desc);
129
130 if (copy_to_user(u_info, &info, sizeof(info)))
131 return -EFAULT;
132 return 0;
133}
134
135asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info)
136{
137 return do_get_thread_area(&current->thread, u_info);
138}
139
140
141int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs)
142{
143 struct n_desc_struct *desc;
144 struct user_desc info;
145 struct user_desc __user *cp;
146 int idx;
147
148 cp = (void __user *)childregs->rsi;
149 if (copy_from_user(&info, cp, sizeof(info)))
150 return -EFAULT;
151 if (LDT_empty(&info))
152 return -EINVAL;
153
154 idx = info.entry_number;
155 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
156 return -EINVAL;
157
158 desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN;
159 desc->a = LDT_entry_a(&info);
160 desc->b = LDT_entry_b(&info);
161
162 return 0;
163}
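For context on the interface the deleted tls32.c implemented: a 32-bit task installs one of its three per-thread GDT TLS descriptors through set_thread_area(2) with a struct user_desc, and do_get_thread_area() above decodes the installed descriptor back using the GET_* macros. A hedged user-space sketch of the set side (assumes an i386 build and the struct user_desc layout from <asm/ldt.h>; not part of this patch):

#include <asm/ldt.h>            /* struct user_desc */
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        static char tls_block[256];
        struct user_desc desc;

        memset(&desc, 0, sizeof(desc));
        desc.entry_number = -1;                 /* ask the kernel for a free GDT slot */
        desc.base_addr    = (unsigned long)tls_block;
        desc.limit        = sizeof(tls_block) - 1;
        desc.seg_32bit    = 1;
        desc.useable      = 1;

        if (syscall(SYS_set_thread_area, &desc) != 0) {
                perror("set_thread_area");
                return 1;
        }
        /* The kernel writes back the slot it allocated (entry_number == -1 case above). */
        printf("TLS descriptor installed in GDT entry %u\n", desc.entry_number);
        return 0;
}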
diff --git a/arch/x86/ia32/vsyscall-sigreturn.S b/arch/x86/ia32/vsyscall-sigreturn.S
deleted file mode 100644
index b383be00bae..00000000000
--- a/arch/x86/ia32/vsyscall-sigreturn.S
+++ /dev/null
@@ -1,143 +0,0 @@
1/*
2 * Common code for the sigreturn entry points on the vsyscall page.
3 * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80)
4 * to enter the kernel.
5 * This file is #include'd by vsyscall-*.S to define them after the
6 * vsyscall entry point. The addresses we get for these entry points
7 * by doing ".balign 32" must match in both versions of the page.
8 */
9
10 .code32
11 .section .text.sigreturn,"ax"
12 .balign 32
13 .globl __kernel_sigreturn
14 .type __kernel_sigreturn,@function
15__kernel_sigreturn:
16.LSTART_sigreturn:
17 popl %eax
18 movl $__NR_ia32_sigreturn, %eax
19 SYSCALL_ENTER_KERNEL
20.LEND_sigreturn:
21 .size __kernel_sigreturn,.-.LSTART_sigreturn
22
23 .section .text.rtsigreturn,"ax"
24 .balign 32
25 .globl __kernel_rt_sigreturn
26 .type __kernel_rt_sigreturn,@function
27__kernel_rt_sigreturn:
28.LSTART_rt_sigreturn:
29 movl $__NR_ia32_rt_sigreturn, %eax
30 SYSCALL_ENTER_KERNEL
31.LEND_rt_sigreturn:
32 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
33
34 .section .eh_frame,"a",@progbits
35.LSTARTFRAMES:
36 .long .LENDCIES-.LSTARTCIES
37.LSTARTCIES:
38 .long 0 /* CIE ID */
39 .byte 1 /* Version number */
40 .string "zRS" /* NUL-terminated augmentation string */
41 .uleb128 1 /* Code alignment factor */
42 .sleb128 -4 /* Data alignment factor */
43 .byte 8 /* Return address register column */
44 .uleb128 1 /* Augmentation value length */
45 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
46 .byte 0x0c /* DW_CFA_def_cfa */
47 .uleb128 4
48 .uleb128 4
49 .byte 0x88 /* DW_CFA_offset, column 0x8 */
50 .uleb128 1
51 .align 4
52.LENDCIES:
53
54 .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */
55.LSTARTFDE2:
56 .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */
57 /* HACK: The dwarf2 unwind routines will subtract 1 from the
58 return address to get an address in the middle of the
59 presumed call instruction. Since we didn't get here via
60 a call, we need to include the nop before the real start
61 to make up for it. */
62 .long .LSTART_sigreturn-1-. /* PC-relative start address */
63 .long .LEND_sigreturn-.LSTART_sigreturn+1
64 .uleb128 0 /* Augmentation length */
65 /* What follows are the instructions for the table generation.
66 We record the locations of each register saved. This is
67 complicated by the fact that the "CFA" is always assumed to
68 be the value of the stack pointer in the caller. This means
69 that we must define the CFA of this body of code to be the
70 saved value of the stack pointer in the sigcontext. Which
71 also means that there is no fixed relation to the other
72 saved registers, which means that we must use DW_CFA_expression
73 to compute their addresses. It also means that when we
74 adjust the stack with the popl, we have to do it all over again. */
75
76#define do_cfa_expr(offset) \
77 .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
78 .uleb128 1f-0f; /* length */ \
790: .byte 0x74; /* DW_OP_breg4 */ \
80 .sleb128 offset; /* offset */ \
81 .byte 0x06; /* DW_OP_deref */ \
821:
83
84#define do_expr(regno, offset) \
85 .byte 0x10; /* DW_CFA_expression */ \
86 .uleb128 regno; /* regno */ \
87 .uleb128 1f-0f; /* length */ \
880: .byte 0x74; /* DW_OP_breg4 */ \
89 .sleb128 offset; /* offset */ \
901:
91
92 do_cfa_expr(IA32_SIGCONTEXT_esp+4)
93 do_expr(0, IA32_SIGCONTEXT_eax+4)
94 do_expr(1, IA32_SIGCONTEXT_ecx+4)
95 do_expr(2, IA32_SIGCONTEXT_edx+4)
96 do_expr(3, IA32_SIGCONTEXT_ebx+4)
97 do_expr(5, IA32_SIGCONTEXT_ebp+4)
98 do_expr(6, IA32_SIGCONTEXT_esi+4)
99 do_expr(7, IA32_SIGCONTEXT_edi+4)
100 do_expr(8, IA32_SIGCONTEXT_eip+4)
101
102 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
103
104 do_cfa_expr(IA32_SIGCONTEXT_esp)
105 do_expr(0, IA32_SIGCONTEXT_eax)
106 do_expr(1, IA32_SIGCONTEXT_ecx)
107 do_expr(2, IA32_SIGCONTEXT_edx)
108 do_expr(3, IA32_SIGCONTEXT_ebx)
109 do_expr(5, IA32_SIGCONTEXT_ebp)
110 do_expr(6, IA32_SIGCONTEXT_esi)
111 do_expr(7, IA32_SIGCONTEXT_edi)
112 do_expr(8, IA32_SIGCONTEXT_eip)
113
114 .align 4
115.LENDFDE2:
116
117 .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */
118.LSTARTFDE3:
119 .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */
120 /* HACK: See above wrt unwind library assumptions. */
121 .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
122 .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
123 .uleb128 0 /* Augmentation */
124 /* What follows are the instructions for the table generation.
125 We record the locations of each register saved. This is
126 slightly less complicated than the above, since we don't
127 modify the stack pointer in the process. */
128
129 do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp)
130 do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax)
131 do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx)
132 do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx)
133 do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx)
134 do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp)
135 do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi)
136 do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi)
137 do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip)
138
139 .align 4
140.LENDFDE3:
141
142#include "../../x86/kernel/vsyscall-note_32.S"
143
diff --git a/arch/x86/ia32/vsyscall-sysenter.S b/arch/x86/ia32/vsyscall-sysenter.S
deleted file mode 100644
index ae056e553d1..00000000000
--- a/arch/x86/ia32/vsyscall-sysenter.S
+++ /dev/null
@@ -1,95 +0,0 @@
1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction.
3 */
4
5#include <asm/ia32_unistd.h>
6#include <asm/asm-offsets.h>
7
8 .code32
9 .text
10 .section .text.vsyscall,"ax"
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 push %ecx
16.Lpush_ecx:
17 push %edx
18.Lpush_edx:
19 push %ebp
20.Lenter_kernel:
21 movl %esp,%ebp
22 sysenter
23 .space 7,0x90
24 jmp .Lenter_kernel
25 /* 16: System call normal return point is here! */
26 pop %ebp
27.Lpop_ebp:
28 pop %edx
29.Lpop_edx:
30 pop %ecx
31.Lpop_ecx:
32 ret
33.LEND_vsyscall:
34 .size __kernel_vsyscall,.-.LSTART_vsyscall
35
36 .section .eh_frame,"a",@progbits
37.LSTARTFRAME:
38 .long .LENDCIE-.LSTARTCIE
39.LSTARTCIE:
40 .long 0 /* CIE ID */
41 .byte 1 /* Version number */
42 .string "zR" /* NUL-terminated augmentation string */
43 .uleb128 1 /* Code alignment factor */
44 .sleb128 -4 /* Data alignment factor */
45 .byte 8 /* Return address register column */
46 .uleb128 1 /* Augmentation value length */
47 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
48 .byte 0x0c /* DW_CFA_def_cfa */
49 .uleb128 4
50 .uleb128 4
51 .byte 0x88 /* DW_CFA_offset, column 0x8 */
52 .uleb128 1
53 .align 4
54.LENDCIE:
55
56 .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
57.LSTARTFDE1:
58 .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
59 .long .LSTART_vsyscall-. /* PC-relative start address */
60 .long .LEND_vsyscall-.LSTART_vsyscall
61 .uleb128 0 /* Augmentation length */
62 /* What follows are the instructions for the table generation.
63 We have to record all changes of the stack pointer. */
64 .byte 0x04 /* DW_CFA_advance_loc4 */
65 .long .Lpush_ecx-.LSTART_vsyscall
66 .byte 0x0e /* DW_CFA_def_cfa_offset */
67 .byte 0x08 /* RA at offset 8 now */
68 .byte 0x04 /* DW_CFA_advance_loc4 */
69 .long .Lpush_edx-.Lpush_ecx
70 .byte 0x0e /* DW_CFA_def_cfa_offset */
71 .byte 0x0c /* RA at offset 12 now */
72 .byte 0x04 /* DW_CFA_advance_loc4 */
73 .long .Lenter_kernel-.Lpush_edx
74 .byte 0x0e /* DW_CFA_def_cfa_offset */
75 .byte 0x10 /* RA at offset 16 now */
76 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
77 /* Finally the epilogue. */
78 .byte 0x04 /* DW_CFA_advance_loc4 */
79 .long .Lpop_ebp-.Lenter_kernel
80 .byte 0x0e /* DW_CFA_def_cfa_offset */
81 .byte 0x12 /* RA at offset 12 now */
82 .byte 0xc5 /* DW_CFA_restore %ebp */
83 .byte 0x04 /* DW_CFA_advance_loc4 */
84 .long .Lpop_edx-.Lpop_ebp
85 .byte 0x0e /* DW_CFA_def_cfa_offset */
86 .byte 0x08 /* RA at offset 8 now */
87 .byte 0x04 /* DW_CFA_advance_loc4 */
88 .long .Lpop_ecx-.Lpop_edx
89 .byte 0x0e /* DW_CFA_def_cfa_offset */
90 .byte 0x04 /* RA at offset 4 now */
91 .align 4
92.LENDFDE1:
93
94#define SYSCALL_ENTER_KERNEL int $0x80
95#include "vsyscall-sigreturn.S"
diff --git a/arch/x86/ia32/vsyscall.lds b/arch/x86/ia32/vsyscall.lds
deleted file mode 100644
index 1dc86ff5bcb..00000000000
--- a/arch/x86/ia32/vsyscall.lds
+++ /dev/null
@@ -1,80 +0,0 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address. This script controls its layout.
4 */
5
6/* This must match <asm/fixmap.h>. */
7VSYSCALL_BASE = 0xffffe000;
8
9SECTIONS
10{
11 . = VSYSCALL_BASE + SIZEOF_HEADERS;
12
13 .hash : { *(.hash) } :text
14 .gnu.hash : { *(.gnu.hash) }
15 .dynsym : { *(.dynsym) }
16 .dynstr : { *(.dynstr) }
17 .gnu.version : { *(.gnu.version) }
18 .gnu.version_d : { *(.gnu.version_d) }
19 .gnu.version_r : { *(.gnu.version_r) }
20
21 /* This linker script is used both with -r and with -shared.
22 For the layouts to match, we need to skip more than enough
23 space for the dynamic symbol table et al. If this amount
24 is insufficient, ld -shared will barf. Just increase it here. */
25 . = VSYSCALL_BASE + 0x400;
26
27 .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090
28
29 /* This is an 32bit object and we cannot easily get the offsets
30 into the 64bit kernel. Just hardcode them here. This assumes
31 that all the stubs don't need more than 0x100 bytes. */
32 . = VSYSCALL_BASE + 0x500;
33
34 .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090
35
36 . = VSYSCALL_BASE + 0x600;
37
38 .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090
39
40 .note : { *(.note.*) } :text :note
41 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
42 .eh_frame : { KEEP (*(.eh_frame)) } :text
43 .dynamic : { *(.dynamic) } :text :dynamic
44 .useless : {
45 *(.got.plt) *(.got)
46 *(.data .data.* .gnu.linkonce.d.*)
47 *(.dynbss)
48 *(.bss .bss.* .gnu.linkonce.b.*)
49 } :text
50}
51
52/*
53 * We must supply the ELF program headers explicitly to get just one
54 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
55 */
56PHDRS
57{
58 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
59 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
60 note PT_NOTE FLAGS(4); /* PF_R */
61 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
62}
63
64/*
65 * This controls what symbols we export from the DSO.
66 */
67VERSION
68{
69 LINUX_2.5 {
70 global:
71 __kernel_vsyscall;
72 __kernel_sigreturn;
73 __kernel_rt_sigreturn;
74
75 local: *;
76 };
77}
78
79/* The ELF entry point can be used to set the AT_SYSINFO value. */
80ENTRY(__kernel_vsyscall);
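The deleted linker script is the last piece of the old 32-bit vsyscall DSO: it prelinks the stub page, exports __kernel_vsyscall, and sets the ELF entry point that the kernel advertises to 32-bit processes through the AT_SYSINFO auxiliary-vector entry. A rough user-space sketch of looking that entry up (assumes glibc 2.16+ for getauxval() and that <elf.h> provides AT_SYSINFO; older code walks the auxv after envp instead; illustration only, not part of this patch):

#include <elf.h>                /* AT_SYSINFO */
#include <stdio.h>
#include <sys/auxv.h>           /* getauxval(), glibc 2.16+ */

int main(void)
{
        /* For ia32 tasks, AT_SYSINFO carries the address of __kernel_vsyscall;
         * it is 0 if the kernel did not map a syscall entry page. */
        unsigned long vsyscall = getauxval(AT_SYSINFO);

        if (vsyscall)
                printf("__kernel_vsyscall entry at %#lx\n", vsyscall);
        else
                printf("no AT_SYSINFO entry; fall back to int $0x80\n");
        return 0;
}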
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 38573340b14..21dc1a061bf 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -1,9 +1,93 @@
-ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/x86/kernel/Makefile_32
-else
-include ${srctree}/arch/x86/kernel/Makefile_64
+#
+# Makefile for the linux kernel.
+#
+
+extra-y := head_$(BITS).o init_task.o vmlinux.lds
+extra-$(CONFIG_X86_64) += head64.o
+
+CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
+CFLAGS_vsyscall_64.o := $(PROFILING) -g0
+
+obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
+obj-y += traps_$(BITS).o irq_$(BITS).o
+obj-y += time_$(BITS).o ioport.o ldt.o
+obj-y += setup_$(BITS).o i8259_$(BITS).o
+obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o
+obj-y += pci-dma_$(BITS).o bootflag.o e820_$(BITS).o
+obj-y += quirks.o i8237.o topology.o kdebugfs.o
+obj-y += alternative.o i8253.o
+obj-$(CONFIG_X86_64) += pci-nommu_64.o bugs_64.o
+obj-y += tsc_$(BITS).o io_delay.o rtc.o
+
+obj-y += i387.o
+obj-y += ptrace.o
+obj-y += ds.o
+obj-$(CONFIG_X86_32) += tls.o
+obj-$(CONFIG_IA32_EMULATION) += tls.o
+obj-y += step.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += cpu/
+obj-y += acpi/
+obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
+obj-$(CONFIG_X86_64) += reboot.o
+obj-$(CONFIG_MCA) += mca_32.o
+obj-$(CONFIG_X86_MSR) += msr.o
+obj-$(CONFIG_X86_CPUID) += cpuid.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+obj-$(CONFIG_PCI) += early-quirks.o
+apm-y := apm_32.o
+obj-$(CONFIG_APM) += apm.o
+obj-$(CONFIG_X86_SMP) += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o
+obj-$(CONFIG_X86_32_SMP) += smpcommon_32.o
+obj-$(CONFIG_X86_64_SMP) += smp_64.o smpboot_64.o tsc_sync.o
+obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
+obj-$(CONFIG_X86_MPPARSE) += mpparse_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o
+obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
+obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
+obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
+obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
+obj-$(CONFIG_X86_VSMP) += vsmp_64.o
+obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_MODULES) += module_$(BITS).o
+obj-$(CONFIG_ACPI_SRAT) += srat_32.o
+obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
+obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
+obj-$(CONFIG_VM86) += vm86_32.o
+obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+
+obj-$(CONFIG_HPET_TIMER) += hpet.o
+
+obj-$(CONFIG_K8_NB) += k8.o
+obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
+obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
+obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
+
+obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+
+ifdef CONFIG_INPUT_PCSPKR
+obj-y += pcspeaker.o
 endif
 
-# Workaround to delete .lds files with make clean
-# The problem is that we do not enter Makefile_32 with make clean.
-clean-files := vsyscall*.lds vsyscall*.so
+obj-$(CONFIG_SCx200) += scx200.o
+scx200-y += scx200_32.o
+
+###
+# 64 bit specific files
+ifeq ($(CONFIG_X86_64),y)
+        obj-y += genapic_64.o genapic_flat_64.o
+        obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
+        obj-$(CONFIG_AUDIT) += audit_64.o
+        obj-$(CONFIG_PM) += suspend_64.o
+        obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
+
+        obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
+        obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+        obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
+endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
deleted file mode 100644
index a7bc93c2766..00000000000
--- a/arch/x86/kernel/Makefile_32
+++ /dev/null
@@ -1,88 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_32.o init_task.o vmlinux.lds
6CPPFLAGS_vmlinux.lds += -Ui386
7
8obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
9 ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
10 pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
11 quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o
12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-y += cpu/
15obj-y += acpi/
16obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o
17obj-$(CONFIG_MCA) += mca_32.o
18obj-$(CONFIG_X86_MSR) += msr.o
19obj-$(CONFIG_X86_CPUID) += cpuid.o
20obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_PCI) += early-quirks.o
22obj-$(CONFIG_APM) += apm_32.o
23obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
24obj-$(CONFIG_SMP) += smpcommon_32.o
25obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
26obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
27obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
28obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
29obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
30obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash.o
31obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o
32obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
33obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
34obj-$(CONFIG_KPROBES) += kprobes_32.o
35obj-$(CONFIG_MODULES) += module_32.o
36obj-y += sysenter_32.o vsyscall_32.o
37obj-$(CONFIG_ACPI_SRAT) += srat_32.o
38obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o
39obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
40obj-$(CONFIG_VM86) += vm86_32.o
41obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
42obj-$(CONFIG_HPET_TIMER) += hpet.o
43obj-$(CONFIG_K8_NB) += k8.o
44obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
45
46obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
47obj-$(CONFIG_PARAVIRT) += paravirt_32.o
48obj-y += pcspeaker.o
49
50obj-$(CONFIG_SCx200) += scx200_32.o
51
52# vsyscall_32.o contains the vsyscall DSO images as __initdata.
53# We must build both images before we can assemble it.
54# Note: kbuild does not track this dependency due to usage of .incbin
55$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
56targets += $(foreach F,int80 sysenter,vsyscall-$F_32.o vsyscall-$F_32.so)
57targets += vsyscall-note_32.o vsyscall_32.lds
58
59# The DSO images are built using a special linker script.
60quiet_cmd_syscall = SYSCALL $@
61 cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
62 -Wl,-T,$(filter-out FORCE,$^) -o $@
63
64export CPPFLAGS_vsyscall_32.lds += -P -C -Ui386
65
66vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
67 $(call ld-option, -Wl$(comma)--hash-style=sysv)
68SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags)
69SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)
70
71$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
72$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
73 $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
74 $(call if_changed,syscall)
75
76# We also create a special relocatable object that should mirror the symbol
77# table and layout of the linked DSO. With ld -R we can then refer to
78# these symbols in the kernel code rather than hand-coded addresses.
79extra-y += vsyscall-syms.o
80$(obj)/built-in.o: $(obj)/vsyscall-syms.o
81$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
82
83SYSCFLAGS_vsyscall-syms.o = -r
84$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
85 $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
86 $(call if_changed,syscall)
87
88
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
deleted file mode 100644
index 5a88890d8ee..00000000000
--- a/arch/x86/kernel/Makefile_64
+++ /dev/null
@@ -1,45 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_64.o head64.o init_task.o vmlinux.lds
6CPPFLAGS_vmlinux.lds += -Ux86_64
7EXTRA_AFLAGS := -traditional
8
9obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
10 ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \
11 x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
12 setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
13 pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \
14 i8253.o
15
16obj-$(CONFIG_STACKTRACE) += stacktrace.o
17obj-y += cpu/
18obj-y += acpi/
19obj-$(CONFIG_X86_MSR) += msr.o
20obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_X86_CPUID) += cpuid.o
22obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o
23obj-y += apic_64.o nmi_64.o
24obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o
25obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash.o
26obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o
27obj-$(CONFIG_PM) += suspend_64.o
28obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
29obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
30obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
31obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
32obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
33obj-$(CONFIG_KPROBES) += kprobes_64.o
34obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
35obj-$(CONFIG_X86_VSMP) += vsmp_64.o
36obj-$(CONFIG_K8_NB) += k8.o
37obj-$(CONFIG_AUDIT) += audit_64.o
38
39obj-$(CONFIG_MODULES) += module_64.o
40obj-$(CONFIG_PCI) += early-quirks.o
41
42obj-y += topology.o
43obj-y += pcspeaker.o
44
45CFLAGS_vsyscall_64.o := $(PROFILING) -g0
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 1351c3982ee..19d3d6e9d09 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_ACPI) += boot.o
-obj-$(CONFIG_ACPI_SLEEP) += sleep_$(BITS).o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
 
 ifneq ($(CONFIG_ACPI_PROCESSOR),)
 obj-y += cstate.o processor.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0ca27c7b0e8..fc8825d4b99 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -496,7 +496,8 @@ EXPORT_SYMBOL(acpi_register_gsi);
  * ACPI based hotplug support for CPU
  */
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
-int acpi_map_lsapic(acpi_handle handle, int *pcpu)
+
+static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 {
         struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
         union acpi_object *obj;
@@ -551,6 +552,11 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu)
         return 0;
 }
 
+/* wrapper to silence section mismatch warning */
+int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
+{
+        return _acpi_map_lsapic(handle, pcpu);
+}
 EXPORT_SYMBOL(acpi_map_lsapic);
 
 int acpi_unmap_lsapic(int cpu)
@@ -581,25 +587,6 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
 
 EXPORT_SYMBOL(acpi_unregister_ioapic);
 
-static unsigned long __init
-acpi_scan_rsdp(unsigned long start, unsigned long length)
-{
-        unsigned long offset = 0;
-        unsigned long sig_len = sizeof("RSD PTR ") - 1;
-
-        /*
-         * Scan all 16-byte boundaries of the physical memory region for the
-         * RSDP signature.
-         */
-        for (offset = 0; offset < length; offset += 16) {
-                if (strncmp((char *)(phys_to_virt(start) + offset), "RSD PTR ", sig_len))
-                        continue;
-                return (start + offset);
-        }
-
-        return 0;
-}
-
 static int __init acpi_parse_sbf(struct acpi_table_header *table)
 {
         struct acpi_table_boot *sb;
@@ -742,27 +729,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
         return 0;
 }
 
-unsigned long __init acpi_find_rsdp(void)
-{
-        unsigned long rsdp_phys = 0;
-
-        if (efi_enabled) {
-                if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
-                        return efi.acpi20;
-                else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
-                        return efi.acpi;
-        }
-        /*
-         * Scan memory looking for the RSDP signature. First search EBDA (low
-         * memory) paragraphs and then search upper memory (E0000-FFFFF).
-         */
-        rsdp_phys = acpi_scan_rsdp(0, 0x400);
-        if (!rsdp_phys)
-                rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
-
-        return rsdp_phys;
-}
-
 #ifdef CONFIG_X86_LOCAL_APIC
 /*
  * Parse LAPIC entries in MADT
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
new file mode 100644
index 00000000000..6bc815cd8cb
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -0,0 +1,87 @@
1/*
2 * sleep.c - x86-specific ACPI sleep support.
3 *
4 * Copyright (C) 2001-2003 Patrick Mochel
5 * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
6 */
7
8#include <linux/acpi.h>
9#include <linux/bootmem.h>
10#include <linux/dmi.h>
11#include <linux/cpumask.h>
12
13#include <asm/smp.h>
14
15/* address in low memory of the wakeup routine. */
16unsigned long acpi_wakeup_address = 0;
17unsigned long acpi_realmode_flags;
18extern char wakeup_start, wakeup_end;
19
20extern unsigned long acpi_copy_wakeup_routine(unsigned long);
21
22/**
23 * acpi_save_state_mem - save kernel state
24 *
25 * Create an identity mapped page table and copy the wakeup routine to
26 * low memory.
27 */
28int acpi_save_state_mem(void)
29{
30 if (!acpi_wakeup_address) {
31 printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
32 return -ENOMEM;
33 }
34 memcpy((void *)acpi_wakeup_address, &wakeup_start,
35 &wakeup_end - &wakeup_start);
36 acpi_copy_wakeup_routine(acpi_wakeup_address);
37
38 return 0;
39}
40
41/*
42 * acpi_restore_state - undo effects of acpi_save_state_mem
43 */
44void acpi_restore_state_mem(void)
45{
46}
47
48
49/**
50 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
51 *
52 * We allocate a page from the first 1MB of memory for the wakeup
53 * routine for when we come back from a sleep state. The
54 * runtime allocator allows specification of <16MB pages, but not
55 * <1MB pages.
56 */
57void __init acpi_reserve_bootmem(void)
58{
59 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
60 printk(KERN_ERR
61 "ACPI: Wakeup code way too big, S3 disabled.\n");
62 return;
63 }
64
65 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
66 if (!acpi_wakeup_address)
67 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
68}
69
70
71static int __init acpi_sleep_setup(char *str)
72{
73 while ((str != NULL) && (*str != '\0')) {
74 if (strncmp(str, "s3_bios", 7) == 0)
75 acpi_realmode_flags |= 1;
76 if (strncmp(str, "s3_mode", 7) == 0)
77 acpi_realmode_flags |= 2;
78 if (strncmp(str, "s3_beep", 7) == 0)
79 acpi_realmode_flags |= 4;
80 str = strchr(str, ',');
81 if (str != NULL)
82 str += strspn(str, ", \t");
83 }
84 return 1;
85}
86
87__setup("acpi_sleep=", acpi_sleep_setup);
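acpi_sleep_setup() maps the comma-separated acpi_sleep= boot options onto bits of acpi_realmode_flags (s3_bios -> 1, s3_mode -> 2, s3_beep -> 4). A stand-alone copy of that mapping, useful only as an illustration of the resulting flag values outside the kernel:

#include <stdio.h>
#include <string.h>

/* Out-of-kernel rewrite of the acpi_sleep= option parser shown above,
 * so the resulting flag word can be checked in ordinary user space. */
static unsigned long parse_acpi_sleep(const char *str)
{
        unsigned long flags = 0;

        while (str && *str) {
                if (strncmp(str, "s3_bios", 7) == 0)
                        flags |= 1;
                if (strncmp(str, "s3_mode", 7) == 0)
                        flags |= 2;
                if (strncmp(str, "s3_beep", 7) == 0)
                        flags |= 4;
                str = strchr(str, ',');
                if (str)
                        str += strspn(str, ", \t");
        }
        return flags;
}

int main(void)
{
        /* booting with "acpi_sleep=s3_bios,s3_mode" yields flags == 3 */
        printf("%lu\n", parse_acpi_sleep("s3_bios,s3_mode"));
        return 0;
}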
diff --git a/arch/x86/kernel/acpi/sleep_32.c b/arch/x86/kernel/acpi/sleep_32.c
index 10699489cfe..63fe5525e02 100644
--- a/arch/x86/kernel/acpi/sleep_32.c
+++ b/arch/x86/kernel/acpi/sleep_32.c
@@ -12,76 +12,6 @@
12 12
13#include <asm/smp.h> 13#include <asm/smp.h>
14 14
15/* address in low memory of the wakeup routine. */
16unsigned long acpi_wakeup_address = 0;
17unsigned long acpi_realmode_flags;
18extern char wakeup_start, wakeup_end;
19
20extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
21
22/**
23 * acpi_save_state_mem - save kernel state
24 *
25 * Create an identity mapped page table and copy the wakeup routine to
26 * low memory.
27 */
28int acpi_save_state_mem(void)
29{
30 if (!acpi_wakeup_address)
31 return 1;
32 memcpy((void *)acpi_wakeup_address, &wakeup_start,
33 &wakeup_end - &wakeup_start);
34 acpi_copy_wakeup_routine(acpi_wakeup_address);
35
36 return 0;
37}
38
39/*
40 * acpi_restore_state - undo effects of acpi_save_state_mem
41 */
42void acpi_restore_state_mem(void)
43{
44}
45
46/**
47 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
48 *
49 * We allocate a page from the first 1MB of memory for the wakeup
50 * routine for when we come back from a sleep state. The
51 * runtime allocator allows specification of <16MB pages, but not
52 * <1MB pages.
53 */
54void __init acpi_reserve_bootmem(void)
55{
56 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
57 printk(KERN_ERR
58 "ACPI: Wakeup code way too big, S3 disabled.\n");
59 return;
60 }
61
62 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
63 if (!acpi_wakeup_address)
64 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
65}
66
67static int __init acpi_sleep_setup(char *str)
68{
69 while ((str != NULL) && (*str != '\0')) {
70 if (strncmp(str, "s3_bios", 7) == 0)
71 acpi_realmode_flags |= 1;
72 if (strncmp(str, "s3_mode", 7) == 0)
73 acpi_realmode_flags |= 2;
74 if (strncmp(str, "s3_beep", 7) == 0)
75 acpi_realmode_flags |= 4;
76 str = strchr(str, ',');
77 if (str != NULL)
78 str += strspn(str, ", \t");
79 }
80 return 1;
81}
82
83__setup("acpi_sleep=", acpi_sleep_setup);
84
85/* Ouch, we want to delete this. We already have better version in userspace, in 15/* Ouch, we want to delete this. We already have better version in userspace, in
86 s2ram from suspend.sf.net project */ 16 s2ram from suspend.sf.net project */
87static __init int reset_videomode_after_s3(const struct dmi_system_id *d) 17static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
diff --git a/arch/x86/kernel/acpi/sleep_64.c b/arch/x86/kernel/acpi/sleep_64.c
deleted file mode 100644
index da42de261ba..00000000000
--- a/arch/x86/kernel/acpi/sleep_64.c
+++ /dev/null
@@ -1,117 +0,0 @@
1/*
2 * acpi.c - Architecture-Specific Low-Level ACPI Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7 * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
8 * Copyright (C) 2003 Pavel Machek, SuSE Labs
9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 */
28
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/types.h>
32#include <linux/stddef.h>
33#include <linux/slab.h>
34#include <linux/pci.h>
35#include <linux/bootmem.h>
36#include <linux/acpi.h>
37#include <linux/cpumask.h>
38
39#include <asm/mpspec.h>
40#include <asm/io.h>
41#include <asm/apic.h>
42#include <asm/apicdef.h>
43#include <asm/page.h>
44#include <asm/pgtable.h>
45#include <asm/pgalloc.h>
46#include <asm/io_apic.h>
47#include <asm/proto.h>
48#include <asm/tlbflush.h>
49
50/* --------------------------------------------------------------------------
51 Low-Level Sleep Support
52 -------------------------------------------------------------------------- */
53
54/* address in low memory of the wakeup routine. */
55unsigned long acpi_wakeup_address = 0;
56unsigned long acpi_realmode_flags;
57extern char wakeup_start, wakeup_end;
58
59extern unsigned long acpi_copy_wakeup_routine(unsigned long);
60
61/**
62 * acpi_save_state_mem - save kernel state
63 *
64 * Create an identity mapped page table and copy the wakeup routine to
65 * low memory.
66 */
67int acpi_save_state_mem(void)
68{
69 memcpy((void *)acpi_wakeup_address, &wakeup_start,
70 &wakeup_end - &wakeup_start);
71 acpi_copy_wakeup_routine(acpi_wakeup_address);
72
73 return 0;
74}
75
76/*
77 * acpi_restore_state
78 */
79void acpi_restore_state_mem(void)
80{
81}
82
83/**
84 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
85 *
86 * We allocate a page in low memory for the wakeup
87 * routine for when we come back from a sleep state. The
88 * runtime allocator allows specification of <16M pages, but not
89 * <1M pages.
90 */
91void __init acpi_reserve_bootmem(void)
92{
93 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
94 if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
95 printk(KERN_CRIT
96 "ACPI: Wakeup code way too big, will crash on attempt"
97 " to suspend\n");
98}
99
100static int __init acpi_sleep_setup(char *str)
101{
102 while ((str != NULL) && (*str != '\0')) {
103 if (strncmp(str, "s3_bios", 7) == 0)
104 acpi_realmode_flags |= 1;
105 if (strncmp(str, "s3_mode", 7) == 0)
106 acpi_realmode_flags |= 2;
107 if (strncmp(str, "s3_beep", 7) == 0)
108 acpi_realmode_flags |= 4;
109 str = strchr(str, ',');
110 if (str != NULL)
111 str += strspn(str, ", \t");
112 }
113 return 1;
114}
115
116__setup("acpi_sleep=", acpi_sleep_setup);
117
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 1e931aaf2ef..f53e3277f8e 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,4 +1,4 @@
-.text
+        .section .text.page_aligned
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/page.h>
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 5ed3bc5c61d..2e1b9e0d076 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -344,13 +344,13 @@ do_suspend_lowlevel:
         call save_processor_state
 
         movq $saved_context, %rax
-        movq %rsp, pt_regs_rsp(%rax)
-        movq %rbp, pt_regs_rbp(%rax)
-        movq %rsi, pt_regs_rsi(%rax)
-        movq %rdi, pt_regs_rdi(%rax)
-        movq %rbx, pt_regs_rbx(%rax)
-        movq %rcx, pt_regs_rcx(%rax)
-        movq %rdx, pt_regs_rdx(%rax)
+        movq %rsp, pt_regs_sp(%rax)
+        movq %rbp, pt_regs_bp(%rax)
+        movq %rsi, pt_regs_si(%rax)
+        movq %rdi, pt_regs_di(%rax)
+        movq %rbx, pt_regs_bx(%rax)
+        movq %rcx, pt_regs_cx(%rax)
+        movq %rdx, pt_regs_dx(%rax)
         movq %r8, pt_regs_r8(%rax)
         movq %r9, pt_regs_r9(%rax)
         movq %r10, pt_regs_r10(%rax)
@@ -360,7 +360,7 @@ do_suspend_lowlevel:
         movq %r14, pt_regs_r14(%rax)
         movq %r15, pt_regs_r15(%rax)
         pushfq
-        popq pt_regs_eflags(%rax)
+        popq pt_regs_flags(%rax)
 
         movq $.L97, saved_rip(%rip)
 
@@ -391,15 +391,15 @@ do_suspend_lowlevel:
         movq %rbx, %cr2
         movq saved_context_cr0(%rax), %rbx
         movq %rbx, %cr0
-        pushq pt_regs_eflags(%rax)
+        pushq pt_regs_flags(%rax)
         popfq
-        movq pt_regs_rsp(%rax), %rsp
-        movq pt_regs_rbp(%rax), %rbp
-        movq pt_regs_rsi(%rax), %rsi
-        movq pt_regs_rdi(%rax), %rdi
-        movq pt_regs_rbx(%rax), %rbx
-        movq pt_regs_rcx(%rax), %rcx
-        movq pt_regs_rdx(%rax), %rdx
+        movq pt_regs_sp(%rax), %rsp
+        movq pt_regs_bp(%rax), %rbp
+        movq pt_regs_si(%rax), %rsi
+        movq pt_regs_di(%rax), %rdi
+        movq pt_regs_bx(%rax), %rbx
+        movq pt_regs_cx(%rax), %rcx
+        movq pt_regs_dx(%rax), %rdx
         movq pt_regs_r8(%rax), %r8
         movq pt_regs_r9(%rax), %r9
         movq pt_regs_r10(%rax), %r10
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d6405e0842b..45d79ea890a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -273,6 +273,7 @@ struct smp_alt_module {
273}; 273};
274static LIST_HEAD(smp_alt_modules); 274static LIST_HEAD(smp_alt_modules);
275static DEFINE_SPINLOCK(smp_alt); 275static DEFINE_SPINLOCK(smp_alt);
276static int smp_mode = 1; /* protected by smp_alt */
276 277
277void alternatives_smp_module_add(struct module *mod, char *name, 278void alternatives_smp_module_add(struct module *mod, char *name,
278 void *locks, void *locks_end, 279 void *locks, void *locks_end,
@@ -341,12 +342,13 @@ void alternatives_smp_switch(int smp)
341 342
342#ifdef CONFIG_LOCKDEP 343#ifdef CONFIG_LOCKDEP
343 /* 344 /*
344 * A not yet fixed binutils section handling bug prevents 345 * Older binutils section handling bug prevented
345 * alternatives-replacement from working reliably, so turn 346 * alternatives-replacement from working reliably.
346 * it off: 347 *
348 * If this still occurs then you should see a hang
349 * or crash shortly after this line:
347 */ 350 */
348 printk("lockdep: not fixing up alternatives.\n"); 351 printk("lockdep: fixing up alternatives.\n");
349 return;
350#endif 352#endif
351 353
352 if (noreplace_smp || smp_alt_once) 354 if (noreplace_smp || smp_alt_once)
@@ -354,21 +356,29 @@ void alternatives_smp_switch(int smp)
354 BUG_ON(!smp && (num_online_cpus() > 1)); 356 BUG_ON(!smp && (num_online_cpus() > 1));
355 357
356 spin_lock_irqsave(&smp_alt, flags); 358 spin_lock_irqsave(&smp_alt, flags);
357 if (smp) { 359
360 /*
361 * Avoid unnecessary switches because it forces JIT based VMs to
362 * throw away all cached translations, which can be quite costly.
363 */
364 if (smp == smp_mode) {
365 /* nothing */
366 } else if (smp) {
358 printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); 367 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
359 clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 368 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
360 clear_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); 369 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
361 list_for_each_entry(mod, &smp_alt_modules, next) 370 list_for_each_entry(mod, &smp_alt_modules, next)
362 alternatives_smp_lock(mod->locks, mod->locks_end, 371 alternatives_smp_lock(mod->locks, mod->locks_end,
363 mod->text, mod->text_end); 372 mod->text, mod->text_end);
364 } else { 373 } else {
365 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 374 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
366 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 375 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
367 set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); 376 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
368 list_for_each_entry(mod, &smp_alt_modules, next) 377 list_for_each_entry(mod, &smp_alt_modules, next)
369 alternatives_smp_unlock(mod->locks, mod->locks_end, 378 alternatives_smp_unlock(mod->locks, mod->locks_end,
370 mod->text, mod->text_end); 379 mod->text, mod->text_end);
371 } 380 }
381 smp_mode = smp;
372 spin_unlock_irqrestore(&smp_alt, flags); 382 spin_unlock_irqrestore(&smp_alt, flags);
373} 383}
374 384
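The core of this hunk is a cheap state cache: remember which mode (SMP or UP) the alternatives were last patched to and return early when asked to switch to the same mode, since every real switch forces JIT-based VMs to throw away cached translations. A stripped-down, standalone sketch of that guard (apply_mode() is a hypothetical stand-in for the real patching loop):

    #include <stdio.h>

    static int smp_mode = 1;            /* mode currently applied */

    static void apply_mode(int smp)     /* placeholder for the expensive patching */
    {
            printf("patching text for %s\n", smp ? "SMP" : "UP");
    }

    static void alternatives_switch(int smp)
    {
            if (smp == smp_mode)
                    return;             /* nothing to do, avoid costly re-patching */
            apply_mode(smp);
            smp_mode = smp;
    }

    int main(void)
    {
            alternatives_switch(1);     /* no output: already in SMP mode */
            alternatives_switch(0);     /* prints: patching text for UP */
            return 0;
    }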
@@ -431,8 +441,9 @@ void __init alternative_instructions(void)
431 if (smp_alt_once) { 441 if (smp_alt_once) {
432 if (1 == num_possible_cpus()) { 442 if (1 == num_possible_cpus()) {
433 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 443 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
434 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 444 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
435 set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); 445 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
446
436 alternatives_smp_unlock(__smp_locks, __smp_locks_end, 447 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
437 _text, _etext); 448 _text, _etext);
438 } 449 }
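set_cpu_cap()/clear_cpu_cap() replace the open-coded set_bit()/clear_bit() calls on the x86_capability bitmap. A standalone sketch of that accessor pattern; the struct layout, NCAPINTS value, and bit number below are invented for the example, while the real helpers live in the kernel's cpufeature code:

    #include <stdio.h>

    #define NCAPINTS 8                          /* assumed capability-word count */
    #define X86_FEATURE_UP 19                   /* hypothetical bit number */

    struct cpuinfo_x86 { unsigned int x86_capability[NCAPINTS]; };

    static void set_cpu_cap(struct cpuinfo_x86 *c, int bit)
    {
            c->x86_capability[bit / 32] |= 1u << (bit % 32);
    }

    static void clear_cpu_cap(struct cpuinfo_x86 *c, int bit)
    {
            c->x86_capability[bit / 32] &= ~(1u << (bit % 32));
    }

    int main(void)
    {
            struct cpuinfo_x86 c = { {0} };

            set_cpu_cap(&c, X86_FEATURE_UP);
            printf("word0=%#x\n", c.x86_capability[0]);  /* prints word0=0x80000 */
            clear_cpu_cap(&c, X86_FEATURE_UP);
            printf("word0=%#x\n", c.x86_capability[0]);  /* prints word0=0 */
            return 0;
    }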
@@ -440,7 +451,10 @@ void __init alternative_instructions(void)
440 alternatives_smp_module_add(NULL, "core kernel", 451 alternatives_smp_module_add(NULL, "core kernel",
441 __smp_locks, __smp_locks_end, 452 __smp_locks, __smp_locks_end,
442 _text, _etext); 453 _text, _etext);
443 alternatives_smp_switch(0); 454
455 /* Only switch to UP mode if we don't immediately boot others */
456 if (num_possible_cpus() == 1 || setup_max_cpus <= 1)
457 alternatives_smp_switch(0);
444 } 458 }
445#endif 459#endif
446 apply_paravirt(__parainstructions, __parainstructions_end); 460 apply_paravirt(__parainstructions, __parainstructions_end);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 5b6992799c9..608152a2a05 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * Firmware replacement code. 2 * Firmware replacement code.
3 * 3 *
4 * Work around broken BIOSes that don't set an aperture or only set the 4 * Work around broken BIOSes that don't set an aperture or only set the
5 * aperture in the AGP bridge. 5 * aperture in the AGP bridge.
6 * If all fails map the aperture over some low memory. This is cheaper than 6 * If all fails map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot 7 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB. 8 * because only the bootmem allocator can allocate 32+MB.
9 * 9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs. 10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */ 11 */
12#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -30,7 +30,7 @@ int gart_iommu_aperture_disabled __initdata = 0;
30int gart_iommu_aperture_allowed __initdata = 0; 30int gart_iommu_aperture_allowed __initdata = 0;
31 31
32int fallback_aper_order __initdata = 1; /* 64MB */ 32int fallback_aper_order __initdata = 1; /* 64MB */
33int fallback_aper_force __initdata = 0; 33int fallback_aper_force __initdata = 0;
34 34
35int fix_aperture __initdata = 1; 35int fix_aperture __initdata = 1;
36 36
@@ -49,167 +49,270 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
49/* This code runs before the PCI subsystem is initialized, so just 49/* This code runs before the PCI subsystem is initialized, so just
50 access the northbridge directly. */ 50 access the northbridge directly. */
51 51
52static u32 __init allocate_aperture(void) 52static u32 __init allocate_aperture(void)
53{ 53{
54 u32 aper_size; 54 u32 aper_size;
55 void *p; 55 void *p;
56 56
57 if (fallback_aper_order > 7) 57 if (fallback_aper_order > 7)
58 fallback_aper_order = 7; 58 fallback_aper_order = 7;
59 aper_size = (32 * 1024 * 1024) << fallback_aper_order; 59 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
60 60
61 /* 61 /*
62 * Aperture has to be naturally aligned. This means an 2GB aperture won't 62 * Aperture has to be naturally aligned. This means a 2GB aperture
63 * have much chance of finding a place in the lower 4GB of memory. 63 * won't have much chance of finding a place in the lower 4GB of
64 * Unfortunately we cannot move it up because that would make the 64 * memory. Unfortunately we cannot move it up because that would
65 * IOMMU useless. 65 * make the IOMMU useless.
66 */ 66 */
67 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); 67 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
68 if (!p || __pa(p)+aper_size > 0xffffffff) { 68 if (!p || __pa(p)+aper_size > 0xffffffff) {
69 printk("Cannot allocate aperture memory hole (%p,%uK)\n", 69 printk(KERN_ERR
70 p, aper_size>>10); 70 "Cannot allocate aperture memory hole (%p,%uK)\n",
71 p, aper_size>>10);
71 if (p) 72 if (p)
72 free_bootmem(__pa(p), aper_size); 73 free_bootmem(__pa(p), aper_size);
73 return 0; 74 return 0;
74 } 75 }
75 printk("Mapping aperture over %d KB of RAM @ %lx\n", 76 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
76 aper_size >> 10, __pa(p)); 77 aper_size >> 10, __pa(p));
77 insert_aperture_resource((u32)__pa(p), aper_size); 78 insert_aperture_resource((u32)__pa(p), aper_size);
78 return (u32)__pa(p); 79
80 return (u32)__pa(p);
79} 81}
80 82
81static int __init aperture_valid(u64 aper_base, u32 aper_size) 83static int __init aperture_valid(u64 aper_base, u32 aper_size)
82{ 84{
83 if (!aper_base) 85 if (!aper_base)
84 return 0;
85 if (aper_size < 64*1024*1024) {
86 printk("Aperture too small (%d MB)\n", aper_size>>20);
87 return 0; 86 return 0;
88 } 87
89 if (aper_base + aper_size > 0x100000000UL) { 88 if (aper_base + aper_size > 0x100000000UL) {
90 printk("Aperture beyond 4GB. Ignoring.\n"); 89 printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
91 return 0; 90 return 0;
92 } 91 }
93 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { 92 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
94 printk("Aperture pointing to e820 RAM. Ignoring.\n"); 93 printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
95 return 0; 94 return 0;
96 } 95 }
96 if (aper_size < 64*1024*1024) {
97 printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20);
98 return 0;
99 }
100
97 return 1; 101 return 1;
98} 102}
99 103
100/* Find a PCI capability */ 104/* Find a PCI capability */
101static __u32 __init find_cap(int num, int slot, int func, int cap) 105static __u32 __init find_cap(int num, int slot, int func, int cap)
102{ 106{
103 u8 pos;
104 int bytes; 107 int bytes;
105 if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) 108 u8 pos;
109
110 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
111 PCI_STATUS_CAP_LIST))
106 return 0; 112 return 0;
107 pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); 113
108 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 114 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
115 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
109 u8 id; 116 u8 id;
110 pos &= ~3; 117
111 id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); 118 pos &= ~3;
119 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
112 if (id == 0xff) 120 if (id == 0xff)
113 break; 121 break;
114 if (id == cap) 122 if (id == cap)
115 return pos; 123 return pos;
116 pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); 124 pos = read_pci_config_byte(num, slot, func,
117 } 125 pos+PCI_CAP_LIST_NEXT);
126 }
118 return 0; 127 return 0;
119} 128}
120 129
121/* Read a standard AGPv3 bridge header */ 130/* Read a standard AGPv3 bridge header */
122static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) 131static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
123{ 132{
124 u32 apsize; 133 u32 apsize;
125 u32 apsizereg; 134 u32 apsizereg;
126 int nbits; 135 int nbits;
127 u32 aper_low, aper_hi; 136 u32 aper_low, aper_hi;
128 u64 aper; 137 u64 aper;
129 138
130 printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); 139 printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func);
131 apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); 140 apsizereg = read_pci_config_16(num, slot, func, cap + 0x14);
132 if (apsizereg == 0xffffffff) { 141 if (apsizereg == 0xffffffff) {
133 printk("APSIZE in AGP bridge unreadable\n"); 142 printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
134 return 0; 143 return 0;
135 } 144 }
136 145
137 apsize = apsizereg & 0xfff; 146 apsize = apsizereg & 0xfff;
138 /* Some BIOS use weird encodings not in the AGPv3 table. */ 147 /* Some BIOS use weird encodings not in the AGPv3 table. */
139 if (apsize & 0xff) 148 if (apsize & 0xff)
140 apsize |= 0xf00; 149 apsize |= 0xf00;
141 nbits = hweight16(apsize); 150 nbits = hweight16(apsize);
142 *order = 7 - nbits; 151 *order = 7 - nbits;
143 if ((int)*order < 0) /* < 32MB */ 152 if ((int)*order < 0) /* < 32MB */
144 *order = 0; 153 *order = 0;
145 154
146 aper_low = read_pci_config(num,slot,func, 0x10); 155 aper_low = read_pci_config(num, slot, func, 0x10);
147 aper_hi = read_pci_config(num,slot,func,0x14); 156 aper_hi = read_pci_config(num, slot, func, 0x14);
148 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); 157 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
149 158
150 printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 159 printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
151 aper, 32 << *order, apsizereg); 160 aper, 32 << *order, apsizereg);
152 161
153 if (!aperture_valid(aper, (32*1024*1024) << *order)) 162 if (!aperture_valid(aper, (32*1024*1024) << *order))
154 return 0; 163 return 0;
155 return (u32)aper; 164 return (u32)aper;
156} 165}
157
158/* Look for an AGP bridge. Windows only expects the aperture in the
159 AGP bridge and some BIOS forget to initialize the Northbridge too.
160 Work around this here.
161
162 Do an PCI bus scan by hand because we're running before the PCI
163 subsystem.
164 166
165 All K8 AGP bridges are AGPv3 compliant, so we can do this scan 167/*
166 generically. It's probably overkill to always scan all slots because 168 * Look for an AGP bridge. Windows only expects the aperture in the
167 the AGP bridges should be always an own bus on the HT hierarchy, 169 * AGP bridge and some BIOS forget to initialize the Northbridge too.
168 but do it here for future safety. */ 170 * Work around this here.
171 *
 172 * Do a PCI bus scan by hand because we're running before the PCI
173 * subsystem.
174 *
175 * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
176 * generically. It's probably overkill to always scan all slots because
 177 * the AGP bridges should always be their own bus on the HT hierarchy,
178 * but do it here for future safety.
179 */
169static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) 180static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
170{ 181{
171 int num, slot, func; 182 int num, slot, func;
172 183
173 /* Poor man's PCI discovery */ 184 /* Poor man's PCI discovery */
174 for (num = 0; num < 256; num++) { 185 for (num = 0; num < 256; num++) {
175 for (slot = 0; slot < 32; slot++) { 186 for (slot = 0; slot < 32; slot++) {
176 for (func = 0; func < 8; func++) { 187 for (func = 0; func < 8; func++) {
177 u32 class, cap; 188 u32 class, cap;
178 u8 type; 189 u8 type;
179 class = read_pci_config(num,slot,func, 190 class = read_pci_config(num, slot, func,
180 PCI_CLASS_REVISION); 191 PCI_CLASS_REVISION);
181 if (class == 0xffffffff) 192 if (class == 0xffffffff)
182 break; 193 break;
183 194
184 switch (class >> 16) { 195 switch (class >> 16) {
185 case PCI_CLASS_BRIDGE_HOST: 196 case PCI_CLASS_BRIDGE_HOST:
186 case PCI_CLASS_BRIDGE_OTHER: /* needed? */ 197 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
187 /* AGP bridge? */ 198 /* AGP bridge? */
188 cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); 199 cap = find_cap(num, slot, func,
200 PCI_CAP_ID_AGP);
189 if (!cap) 201 if (!cap)
190 break; 202 break;
191 *valid_agp = 1; 203 *valid_agp = 1;
192 return read_agp(num,slot,func,cap,order); 204 return read_agp(num, slot, func, cap,
193 } 205 order);
194 206 }
207
195 /* No multi-function device? */ 208 /* No multi-function device? */
196 type = read_pci_config_byte(num,slot,func, 209 type = read_pci_config_byte(num, slot, func,
197 PCI_HEADER_TYPE); 210 PCI_HEADER_TYPE);
198 if (!(type & 0x80)) 211 if (!(type & 0x80))
199 break; 212 break;
200 } 213 }
201 } 214 }
202 } 215 }
203 printk("No AGP bridge found\n"); 216 printk(KERN_INFO "No AGP bridge found\n");
217
204 return 0; 218 return 0;
205} 219}
206 220
221static int gart_fix_e820 __initdata = 1;
222
223static int __init parse_gart_mem(char *p)
224{
225 if (!p)
226 return -EINVAL;
227
228 if (!strncmp(p, "off", 3))
229 gart_fix_e820 = 0;
230 else if (!strncmp(p, "on", 2))
231 gart_fix_e820 = 1;
232
233 return 0;
234}
235early_param("gart_fix_e820", parse_gart_mem);
236
237void __init early_gart_iommu_check(void)
238{
239 /*
 240 * In case the GART is already enabled, especially for kexec/kdump
 241 * where the previous kernel enabled it: the memset called by
 242 * allocate_aperture/__alloc_bootmem_nopanic can cause a restart,
 243 * or the second kernel may have a different position for the GART
 244 * hole and could use that hole as RAM while it is still in use by
 245 * the GART set up by the first kernel,
 246 * or the BIOS forgot to put the region into the reserved map.
 247 * Try to update e820 to mark that region as reserved.
248 */
249 int fix, num;
250 u32 ctl;
251 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
252 u64 aper_base = 0, last_aper_base = 0;
253 int aper_enabled = 0, last_aper_enabled = 0;
254
255 if (!early_pci_allowed())
256 return;
257
258 fix = 0;
259 for (num = 24; num < 32; num++) {
260 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
261 continue;
262
263 ctl = read_pci_config(0, num, 3, 0x90);
264 aper_enabled = ctl & 1;
265 aper_order = (ctl >> 1) & 7;
266 aper_size = (32 * 1024 * 1024) << aper_order;
267 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
268 aper_base <<= 25;
269
270 if ((last_aper_order && aper_order != last_aper_order) ||
271 (last_aper_base && aper_base != last_aper_base) ||
272 (last_aper_enabled && aper_enabled != last_aper_enabled)) {
273 fix = 1;
274 break;
275 }
276 last_aper_order = aper_order;
277 last_aper_base = aper_base;
278 last_aper_enabled = aper_enabled;
279 }
280
281 if (!fix && !aper_enabled)
282 return;
283
284 if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL)
285 fix = 1;
286
287 if (gart_fix_e820 && !fix && aper_enabled) {
288 if (e820_any_mapped(aper_base, aper_base + aper_size,
289 E820_RAM)) {
 290 /* reserve it, so we can reuse it in the second kernel */
291 printk(KERN_INFO "update e820 for GART\n");
292 add_memory_region(aper_base, aper_size, E820_RESERVED);
293 update_e820();
294 }
295 return;
296 }
297
 298 /* different nodes have different settings, disable them all first */
299 for (num = 24; num < 32; num++) {
300 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
301 continue;
302
303 ctl = read_pci_config(0, num, 3, 0x90);
304 ctl &= ~1;
305 write_pci_config(0, num, 3, 0x90, ctl);
306 }
307
308}
309
207void __init gart_iommu_hole_init(void) 310void __init gart_iommu_hole_init(void)
208{ 311{
209 int fix, num;
210 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; 312 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
211 u64 aper_base, last_aper_base = 0; 313 u64 aper_base, last_aper_base = 0;
212 int valid_agp = 0; 314 int fix, num, valid_agp = 0;
315 int node;
213 316
214 if (gart_iommu_aperture_disabled || !fix_aperture || 317 if (gart_iommu_aperture_disabled || !fix_aperture ||
215 !early_pci_allowed()) 318 !early_pci_allowed())
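A quick worked example of the APSIZE decoding done in read_agp() above, runnable in user space: the aperture order is 7 minus the number of set bits in the (fixed-up) APSIZE field, and the aperture size is 32 MB shifted left by that order. The register value below is made up purely to exercise the arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    static int popcount16(uint16_t v)            /* stands in for hweight16() */
    {
            int n = 0;
            while (v) { n += v & 1; v >>= 1; }
            return n;
    }

    int main(void)
    {
            uint32_t apsizereg = 0xf30;          /* hypothetical APSIZE register */
            uint32_t apsize = apsizereg & 0xfff;

            if (apsize & 0xff)                   /* non-AGPv3 encodings */
                    apsize |= 0xf00;

            int order = 7 - popcount16(apsize);
            if (order < 0)
                    order = 0;                   /* < 32MB */

            printf("aperture order %d -> %u MB\n", order, 32u << order);
            /* prints: aperture order 1 -> 64 MB */
            return 0;
    }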
@@ -218,24 +321,26 @@ void __init gart_iommu_hole_init(void)
218 printk(KERN_INFO "Checking aperture...\n"); 321 printk(KERN_INFO "Checking aperture...\n");
219 322
220 fix = 0; 323 fix = 0;
221 for (num = 24; num < 32; num++) { 324 node = 0;
325 for (num = 24; num < 32; num++) {
222 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 326 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
223 continue; 327 continue;
224 328
225 iommu_detected = 1; 329 iommu_detected = 1;
226 gart_iommu_aperture = 1; 330 gart_iommu_aperture = 1;
227 331
228 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 332 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
229 aper_size = (32 * 1024 * 1024) << aper_order; 333 aper_size = (32 * 1024 * 1024) << aper_order;
230 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; 334 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
231 aper_base <<= 25; 335 aper_base <<= 25;
336
337 printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
338 node, aper_base, aper_size >> 20);
339 node++;
232 340
233 printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
234 aper_base, aper_size>>20);
235
236 if (!aperture_valid(aper_base, aper_size)) { 341 if (!aperture_valid(aper_base, aper_size)) {
237 fix = 1; 342 fix = 1;
238 break; 343 break;
239 } 344 }
240 345
241 if ((last_aper_order && aper_order != last_aper_order) || 346 if ((last_aper_order && aper_order != last_aper_order) ||
@@ -245,55 +350,64 @@ void __init gart_iommu_hole_init(void)
245 } 350 }
246 last_aper_order = aper_order; 351 last_aper_order = aper_order;
247 last_aper_base = aper_base; 352 last_aper_base = aper_base;
248 } 353 }
249 354
250 if (!fix && !fallback_aper_force) { 355 if (!fix && !fallback_aper_force) {
251 if (last_aper_base) { 356 if (last_aper_base) {
252 unsigned long n = (32 * 1024 * 1024) << last_aper_order; 357 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
358
253 insert_aperture_resource((u32)last_aper_base, n); 359 insert_aperture_resource((u32)last_aper_base, n);
254 } 360 }
255 return; 361 return;
256 } 362 }
257 363
258 if (!fallback_aper_force) 364 if (!fallback_aper_force)
259 aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 365 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
260 366
261 if (aper_alloc) { 367 if (aper_alloc) {
262 /* Got the aperture from the AGP bridge */ 368 /* Got the aperture from the AGP bridge */
263 } else if (swiotlb && !valid_agp) { 369 } else if (swiotlb && !valid_agp) {
264 /* Do nothing */ 370 /* Do nothing */
265 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || 371 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
266 force_iommu || 372 force_iommu ||
267 valid_agp || 373 valid_agp ||
268 fallback_aper_force) { 374 fallback_aper_force) {
269 printk("Your BIOS doesn't leave a aperture memory hole\n"); 375 printk(KERN_ERR
270 printk("Please enable the IOMMU option in the BIOS setup\n"); 376 "Your BIOS doesn't leave a aperture memory hole\n");
271 printk("This costs you %d MB of RAM\n", 377 printk(KERN_ERR
272 32 << fallback_aper_order); 378 "Please enable the IOMMU option in the BIOS setup\n");
379 printk(KERN_ERR
380 "This costs you %d MB of RAM\n",
381 32 << fallback_aper_order);
273 382
274 aper_order = fallback_aper_order; 383 aper_order = fallback_aper_order;
275 aper_alloc = allocate_aperture(); 384 aper_alloc = allocate_aperture();
276 if (!aper_alloc) { 385 if (!aper_alloc) {
277 /* Could disable AGP and IOMMU here, but it's probably 386 /*
278 not worth it. But the later users cannot deal with 387 * Could disable AGP and IOMMU here, but it's
279 bad apertures and turning on the aperture over memory 388 * probably not worth it. But the later users
280 causes very strange problems, so it's better to 389 * cannot deal with bad apertures and turning
281 panic early. */ 390 * on the aperture over memory causes very
391 * strange problems, so it's better to panic
392 * early.
393 */
282 panic("Not enough memory for aperture"); 394 panic("Not enough memory for aperture");
283 } 395 }
284 } else { 396 } else {
285 return; 397 return;
286 } 398 }
287 399
288 /* Fix up the north bridges */ 400 /* Fix up the north bridges */
289 for (num = 24; num < 32; num++) { 401 for (num = 24; num < 32; num++) {
290 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 402 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
291 continue; 403 continue;
292 404
293 /* Don't enable translation yet. That is done later. 405 /*
294 Assume this BIOS didn't initialise the GART so 406 * Don't enable translation yet. That is done later.
295 just overwrite all previous bits */ 407 * Assume this BIOS didn't initialise the GART so
296 write_pci_config(0, num, 3, 0x90, aper_order<<1); 408 * just overwrite all previous bits
297 write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 409 */
298 } 410 write_pci_config(0, num, 3, 0x90, aper_order<<1);
299} 411 write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
412 }
413}
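Both early_gart_iommu_check() and gart_iommu_hole_init() decode the K8 northbridge aperture registers the same way: config offset 0x90 holds the enable bit and a 3-bit order, offset 0x94 holds bits 39:25 of the base. A standalone sketch of that decoding with made-up register contents:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t ctl  = 0x05;        /* hypothetical contents of config reg 0x90 */
            uint32_t base = 0x0040;      /* hypothetical contents of config reg 0x94 */

            int enabled        = ctl & 1;
            unsigned order     = (ctl >> 1) & 7;
            uint64_t aper_size = (32ULL * 1024 * 1024) << order;
            uint64_t aper_base = (uint64_t)(base & 0x7fff) << 25;

            printf("enabled=%d size=%llu MB base=%#llx\n",
                   enabled, (unsigned long long)(aper_size >> 20),
                   (unsigned long long)aper_base);
            /* prints: enabled=1 size=128 MB base=0x80000000 */
            return 0;
    }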
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index edb5108e5d0..35a568ea840 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -43,12 +43,10 @@
43#include <mach_apicdef.h> 43#include <mach_apicdef.h>
44#include <mach_ipi.h> 44#include <mach_ipi.h>
45 45
46#include "io_ports.h"
47
48/* 46/*
49 * Sanity check 47 * Sanity check
50 */ 48 */
51#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F 49#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
52# error SPURIOUS_APIC_VECTOR definition error 50# error SPURIOUS_APIC_VECTOR definition error
53#endif 51#endif
54 52
@@ -57,7 +55,7 @@
57 * 55 *
58 * -1=force-disable, +1=force-enable 56 * -1=force-disable, +1=force-enable
59 */ 57 */
60static int enable_local_apic __initdata = 0; 58static int enable_local_apic __initdata;
61 59
62/* Local APIC timer verification ok */ 60/* Local APIC timer verification ok */
63static int local_apic_timer_verify_ok; 61static int local_apic_timer_verify_ok;
@@ -101,6 +99,8 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
101/* Local APIC was disabled by the BIOS and enabled by the kernel */ 99/* Local APIC was disabled by the BIOS and enabled by the kernel */
102static int enabled_via_apicbase; 100static int enabled_via_apicbase;
103 101
102static unsigned long apic_phys;
103
104/* 104/*
105 * Get the LAPIC version 105 * Get the LAPIC version
106 */ 106 */
@@ -110,7 +110,7 @@ static inline int lapic_get_version(void)
110} 110}
111 111
112/* 112/*
113 * Check, if the APIC is integrated or a seperate chip 113 * Check, if the APIC is integrated or a separate chip
114 */ 114 */
115static inline int lapic_is_integrated(void) 115static inline int lapic_is_integrated(void)
116{ 116{
@@ -135,9 +135,9 @@ void apic_wait_icr_idle(void)
135 cpu_relax(); 135 cpu_relax();
136} 136}
137 137
138unsigned long safe_apic_wait_icr_idle(void) 138u32 safe_apic_wait_icr_idle(void)
139{ 139{
140 unsigned long send_status; 140 u32 send_status;
141 int timeout; 141 int timeout;
142 142
143 timeout = 0; 143 timeout = 0;
@@ -154,7 +154,7 @@ unsigned long safe_apic_wait_icr_idle(void)
154/** 154/**
155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
156 */ 156 */
157void enable_NMI_through_LVT0 (void * dummy) 157void __cpuinit enable_NMI_through_LVT0(void)
158{ 158{
159 unsigned int v = APIC_DM_NMI; 159 unsigned int v = APIC_DM_NMI;
160 160
@@ -379,8 +379,10 @@ void __init setup_boot_APIC_clock(void)
379 */ 379 */
380 if (local_apic_timer_disabled) { 380 if (local_apic_timer_disabled) {
381 /* No broadcast on UP ! */ 381 /* No broadcast on UP ! */
382 if (num_possible_cpus() > 1) 382 if (num_possible_cpus() > 1) {
383 lapic_clockevent.mult = 1;
383 setup_APIC_timer(); 384 setup_APIC_timer();
385 }
384 return; 386 return;
385 } 387 }
386 388
@@ -434,7 +436,7 @@ void __init setup_boot_APIC_clock(void)
434 "with PM Timer: %ldms instead of 100ms\n", 436 "with PM Timer: %ldms instead of 100ms\n",
435 (long)res); 437 (long)res);
436 /* Correct the lapic counter value */ 438 /* Correct the lapic counter value */
437 res = (((u64) delta ) * pm_100ms); 439 res = (((u64) delta) * pm_100ms);
438 do_div(res, deltapm); 440 do_div(res, deltapm);
439 printk(KERN_INFO "APIC delta adjusted to PM-Timer: " 441 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
440 "%lu (%ld)\n", (unsigned long) res, delta); 442 "%lu (%ld)\n", (unsigned long) res, delta);
@@ -472,6 +474,19 @@ void __init setup_boot_APIC_clock(void)
472 474
473 local_apic_timer_verify_ok = 1; 475 local_apic_timer_verify_ok = 1;
474 476
477 /*
478 * Do a sanity check on the APIC calibration result
479 */
480 if (calibration_result < (1000000 / HZ)) {
481 local_irq_enable();
482 printk(KERN_WARNING
483 "APIC frequency too slow, disabling apic timer\n");
484 /* No broadcast on UP ! */
485 if (num_possible_cpus() > 1)
486 setup_APIC_timer();
487 return;
488 }
489
475 /* We trust the pm timer based calibration */ 490 /* We trust the pm timer based calibration */
476 if (!pm_referenced) { 491 if (!pm_referenced) {
477 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 492 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -563,6 +578,9 @@ static void local_apic_timer_interrupt(void)
563 return; 578 return;
564 } 579 }
565 580
581 /*
582 * the NMI deadlock-detector uses this.
583 */
566 per_cpu(irq_stat, cpu).apic_timer_irqs++; 584 per_cpu(irq_stat, cpu).apic_timer_irqs++;
567 585
568 evt->event_handler(evt); 586 evt->event_handler(evt);
@@ -576,8 +594,7 @@ static void local_apic_timer_interrupt(void)
576 * [ if a single-CPU system runs an SMP kernel then we call the local 594 * [ if a single-CPU system runs an SMP kernel then we call the local
577 * interrupt as well. Thus we cannot inline the local irq ... ] 595 * interrupt as well. Thus we cannot inline the local irq ... ]
578 */ 596 */
579 597void smp_apic_timer_interrupt(struct pt_regs *regs)
580void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
581{ 598{
582 struct pt_regs *old_regs = set_irq_regs(regs); 599 struct pt_regs *old_regs = set_irq_regs(regs);
583 600
@@ -616,9 +633,14 @@ int setup_profiling_timer(unsigned int multiplier)
616 */ 633 */
617void clear_local_APIC(void) 634void clear_local_APIC(void)
618{ 635{
619 int maxlvt = lapic_get_maxlvt(); 636 int maxlvt;
620 unsigned long v; 637 u32 v;
638
639 /* APIC hasn't been mapped yet */
640 if (!apic_phys)
641 return;
621 642
643 maxlvt = lapic_get_maxlvt();
622 /* 644 /*
623 * Masking an LVT entry can trigger a local APIC error 645 * Masking an LVT entry can trigger a local APIC error
624 * if the vector is zero. Mask LVTERR first to prevent this. 646 * if the vector is zero. Mask LVTERR first to prevent this.
@@ -976,7 +998,8 @@ void __cpuinit setup_local_APIC(void)
976 value |= APIC_LVT_LEVEL_TRIGGER; 998 value |= APIC_LVT_LEVEL_TRIGGER;
977 apic_write_around(APIC_LVT1, value); 999 apic_write_around(APIC_LVT1, value);
978 1000
979 if (integrated && !esr_disable) { /* !82489DX */ 1001 if (integrated && !esr_disable) {
1002 /* !82489DX */
980 maxlvt = lapic_get_maxlvt(); 1003 maxlvt = lapic_get_maxlvt();
981 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1004 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
982 apic_write(APIC_ESR, 0); 1005 apic_write(APIC_ESR, 0);
@@ -1020,7 +1043,7 @@ void __cpuinit setup_local_APIC(void)
1020/* 1043/*
1021 * Detect and initialize APIC 1044 * Detect and initialize APIC
1022 */ 1045 */
1023static int __init detect_init_APIC (void) 1046static int __init detect_init_APIC(void)
1024{ 1047{
1025 u32 h, l, features; 1048 u32 h, l, features;
1026 1049
@@ -1077,7 +1100,7 @@ static int __init detect_init_APIC (void)
1077 printk(KERN_WARNING "Could not enable APIC!\n"); 1100 printk(KERN_WARNING "Could not enable APIC!\n");
1078 return -1; 1101 return -1;
1079 } 1102 }
1080 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1103 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1081 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 1104 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1082 1105
1083 /* The BIOS may have set up the APIC at some other address */ 1106 /* The BIOS may have set up the APIC at some other address */
@@ -1104,8 +1127,6 @@ no_apic:
1104 */ 1127 */
1105void __init init_apic_mappings(void) 1128void __init init_apic_mappings(void)
1106{ 1129{
1107 unsigned long apic_phys;
1108
1109 /* 1130 /*
1110 * If no local APIC can be found then set up a fake all 1131 * If no local APIC can be found then set up a fake all
1111 * zeroes page to simulate the local APIC and another 1132 * zeroes page to simulate the local APIC and another
@@ -1164,10 +1185,10 @@ fake_ioapic_page:
1164 * This initializes the IO-APIC and APIC hardware if this is 1185 * This initializes the IO-APIC and APIC hardware if this is
1165 * a UP kernel. 1186 * a UP kernel.
1166 */ 1187 */
1167int __init APIC_init_uniprocessor (void) 1188int __init APIC_init_uniprocessor(void)
1168{ 1189{
1169 if (enable_local_apic < 0) 1190 if (enable_local_apic < 0)
1170 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1191 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1171 1192
1172 if (!smp_found_config && !cpu_has_apic) 1193 if (!smp_found_config && !cpu_has_apic)
1173 return -1; 1194 return -1;
@@ -1179,7 +1200,7 @@ int __init APIC_init_uniprocessor (void)
1179 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1200 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1180 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1201 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1181 boot_cpu_physical_apicid); 1202 boot_cpu_physical_apicid);
1182 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1203 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1183 return -1; 1204 return -1;
1184 } 1205 }
1185 1206
@@ -1210,50 +1231,6 @@ int __init APIC_init_uniprocessor (void)
1210} 1231}
1211 1232
1212/* 1233/*
1213 * APIC command line parameters
1214 */
1215static int __init parse_lapic(char *arg)
1216{
1217 enable_local_apic = 1;
1218 return 0;
1219}
1220early_param("lapic", parse_lapic);
1221
1222static int __init parse_nolapic(char *arg)
1223{
1224 enable_local_apic = -1;
1225 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1226 return 0;
1227}
1228early_param("nolapic", parse_nolapic);
1229
1230static int __init parse_disable_lapic_timer(char *arg)
1231{
1232 local_apic_timer_disabled = 1;
1233 return 0;
1234}
1235early_param("nolapic_timer", parse_disable_lapic_timer);
1236
1237static int __init parse_lapic_timer_c2_ok(char *arg)
1238{
1239 local_apic_timer_c2_ok = 1;
1240 return 0;
1241}
1242early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1243
1244static int __init apic_set_verbosity(char *str)
1245{
1246 if (strcmp("debug", str) == 0)
1247 apic_verbosity = APIC_DEBUG;
1248 else if (strcmp("verbose", str) == 0)
1249 apic_verbosity = APIC_VERBOSE;
1250 return 1;
1251}
1252
1253__setup("apic=", apic_set_verbosity);
1254
1255
1256/*
1257 * Local APIC interrupts 1234 * Local APIC interrupts
1258 */ 1235 */
1259 1236
@@ -1306,7 +1283,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1306 6: Received illegal vector 1283 6: Received illegal vector
1307 7: Illegal register address 1284 7: Illegal register address
1308 */ 1285 */
1309 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", 1286 printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
1310 smp_processor_id(), v , v1); 1287 smp_processor_id(), v , v1);
1311 irq_exit(); 1288 irq_exit();
1312} 1289}
@@ -1393,7 +1370,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1393 value = apic_read(APIC_LVT0); 1370 value = apic_read(APIC_LVT0);
1394 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | 1371 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1395 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1372 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1396 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); 1373 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1397 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1374 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1398 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); 1375 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1399 apic_write_around(APIC_LVT0, value); 1376 apic_write_around(APIC_LVT0, value);
@@ -1530,7 +1507,7 @@ static int lapic_resume(struct sys_device *dev)
1530 */ 1507 */
1531 1508
1532static struct sysdev_class lapic_sysclass = { 1509static struct sysdev_class lapic_sysclass = {
1533 set_kset_name("lapic"), 1510 .name = "lapic",
1534 .resume = lapic_resume, 1511 .resume = lapic_resume,
1535 .suspend = lapic_suspend, 1512 .suspend = lapic_suspend,
1536}; 1513};
@@ -1565,3 +1542,46 @@ device_initcall(init_lapic_sysfs);
1565static void apic_pm_activate(void) { } 1542static void apic_pm_activate(void) { }
1566 1543
1567#endif /* CONFIG_PM */ 1544#endif /* CONFIG_PM */
1545
1546/*
1547 * APIC command line parameters
1548 */
1549static int __init parse_lapic(char *arg)
1550{
1551 enable_local_apic = 1;
1552 return 0;
1553}
1554early_param("lapic", parse_lapic);
1555
1556static int __init parse_nolapic(char *arg)
1557{
1558 enable_local_apic = -1;
1559 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1560 return 0;
1561}
1562early_param("nolapic", parse_nolapic);
1563
1564static int __init parse_disable_lapic_timer(char *arg)
1565{
1566 local_apic_timer_disabled = 1;
1567 return 0;
1568}
1569early_param("nolapic_timer", parse_disable_lapic_timer);
1570
1571static int __init parse_lapic_timer_c2_ok(char *arg)
1572{
1573 local_apic_timer_c2_ok = 1;
1574 return 0;
1575}
1576early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1577
1578static int __init apic_set_verbosity(char *str)
1579{
1580 if (strcmp("debug", str) == 0)
1581 apic_verbosity = APIC_DEBUG;
1582 else if (strcmp("verbose", str) == 0)
1583 apic_verbosity = APIC_VERBOSE;
1584 return 1;
1585}
1586__setup("apic=", apic_set_verbosity);
1587
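The new sanity check in setup_boot_APIC_clock() rejects calibration_result values below 1000000/HZ, i.e. it insists the APIC timer ticks at least one million times per second. A tiny standalone illustration of that threshold; the HZ and tick-count values are assumptions chosen only for the example:

    #include <stdio.h>

    #define HZ 250                              /* assumed tick rate for the example */

    int main(void)
    {
            long calibration_result = 3000;     /* APIC timer ticks per jiffy (made up) */

            /* same condition as the patch: below 1 MHz is considered too slow */
            if (calibration_result < (1000000 / HZ))
                    printf("APIC frequency too slow (%ld ticks/jiffy < %d)\n",
                           calibration_result, 1000000 / HZ);
            else
                    printf("APIC timer fast enough\n");
            /* with HZ=250 the threshold is 4000 ticks per jiffy, so this prints
               the "too slow" message */
            return 0;
    }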
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index f28ccb588fb..d8d03e09dea 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -23,32 +23,37 @@
23#include <linux/mc146818rtc.h> 23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h> 24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h> 25#include <linux/sysdev.h>
26#include <linux/module.h>
27#include <linux/ioport.h> 26#include <linux/ioport.h>
28#include <linux/clockchips.h> 27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h>
29 30
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/smp.h> 32#include <asm/smp.h>
32#include <asm/mtrr.h> 33#include <asm/mtrr.h>
33#include <asm/mpspec.h> 34#include <asm/mpspec.h>
35#include <asm/hpet.h>
34#include <asm/pgalloc.h> 36#include <asm/pgalloc.h>
35#include <asm/mach_apic.h> 37#include <asm/mach_apic.h>
36#include <asm/nmi.h> 38#include <asm/nmi.h>
37#include <asm/idle.h> 39#include <asm/idle.h>
38#include <asm/proto.h> 40#include <asm/proto.h>
39#include <asm/timex.h> 41#include <asm/timex.h>
40#include <asm/hpet.h>
41#include <asm/apic.h> 42#include <asm/apic.h>
42 43
43int apic_verbosity;
44int disable_apic_timer __cpuinitdata; 44int disable_apic_timer __cpuinitdata;
45static int apic_calibrate_pmtmr __initdata; 45static int apic_calibrate_pmtmr __initdata;
46int disable_apic;
46 47
47/* Local APIC timer works in C2? */ 48/* Local APIC timer works in C2 */
48int local_apic_timer_c2_ok; 49int local_apic_timer_c2_ok;
49EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 50EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
50 51
51static struct resource *ioapic_resources; 52/*
53 * Debug level, exported for io_apic.c
54 */
55int apic_verbosity;
56
52static struct resource lapic_resource = { 57static struct resource lapic_resource = {
53 .name = "Local APIC", 58 .name = "Local APIC",
54 .flags = IORESOURCE_MEM | IORESOURCE_BUSY, 59 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
@@ -60,10 +65,8 @@ static int lapic_next_event(unsigned long delta,
60 struct clock_event_device *evt); 65 struct clock_event_device *evt);
61static void lapic_timer_setup(enum clock_event_mode mode, 66static void lapic_timer_setup(enum clock_event_mode mode,
62 struct clock_event_device *evt); 67 struct clock_event_device *evt);
63
64static void lapic_timer_broadcast(cpumask_t mask); 68static void lapic_timer_broadcast(cpumask_t mask);
65 69static void apic_pm_activate(void);
66static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen);
67 70
68static struct clock_event_device lapic_clockevent = { 71static struct clock_event_device lapic_clockevent = {
69 .name = "lapic", 72 .name = "lapic",
@@ -78,6 +81,150 @@ static struct clock_event_device lapic_clockevent = {
78}; 81};
79static DEFINE_PER_CPU(struct clock_event_device, lapic_events); 82static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
80 83
84static unsigned long apic_phys;
85
86/*
87 * Get the LAPIC version
88 */
89static inline int lapic_get_version(void)
90{
91 return GET_APIC_VERSION(apic_read(APIC_LVR));
92}
93
94/*
 95 * Check, if the APIC is integrated or a separate chip
96 */
97static inline int lapic_is_integrated(void)
98{
99 return 1;
100}
101
102/*
103 * Check, whether this is a modern or a first generation APIC
104 */
105static int modern_apic(void)
106{
107 /* AMD systems use old APIC versions, so check the CPU */
108 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
109 boot_cpu_data.x86 >= 0xf)
110 return 1;
111 return lapic_get_version() >= 0x14;
112}
113
114void apic_wait_icr_idle(void)
115{
116 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
117 cpu_relax();
118}
119
120u32 safe_apic_wait_icr_idle(void)
121{
122 u32 send_status;
123 int timeout;
124
125 timeout = 0;
126 do {
127 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
128 if (!send_status)
129 break;
130 udelay(100);
131 } while (timeout++ < 1000);
132
133 return send_status;
134}
135
136/**
137 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
138 */
139void __cpuinit enable_NMI_through_LVT0(void)
140{
141 unsigned int v;
142
143 /* unmask and set to NMI */
144 v = APIC_DM_NMI;
145 apic_write(APIC_LVT0, v);
146}
147
148/**
149 * lapic_get_maxlvt - get the maximum number of local vector table entries
150 */
151int lapic_get_maxlvt(void)
152{
153 unsigned int v, maxlvt;
154
155 v = apic_read(APIC_LVR);
156 maxlvt = GET_APIC_MAXLVT(v);
157 return maxlvt;
158}
159
160/*
161 * This function sets up the local APIC timer, with a timeout of
162 * 'clocks' APIC bus clock. During calibration we actually call
163 * this function twice on the boot CPU, once with a bogus timeout
164 * value, second time for real. The other (noncalibrating) CPUs
165 * call this function only once, with the real, calibrated value.
166 *
167 * We do reads before writes even if unnecessary, to get around the
168 * P5 APIC double write bug.
169 */
170
171static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
172{
173 unsigned int lvtt_value, tmp_value;
174
175 lvtt_value = LOCAL_TIMER_VECTOR;
176 if (!oneshot)
177 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
178 if (!irqen)
179 lvtt_value |= APIC_LVT_MASKED;
180
181 apic_write(APIC_LVTT, lvtt_value);
182
183 /*
184 * Divide PICLK by 16
185 */
186 tmp_value = apic_read(APIC_TDCR);
187 apic_write(APIC_TDCR, (tmp_value
188 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
189 | APIC_TDR_DIV_16);
190
191 if (!oneshot)
192 apic_write(APIC_TMICT, clocks);
193}
194
195/*
196 * Setup extended LVT, AMD specific (K8, family 10h)
197 *
198 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
199 * MCE interrupts are supported. Thus MCE offset must be set to 0.
200 */
201
202#define APIC_EILVT_LVTOFF_MCE 0
203#define APIC_EILVT_LVTOFF_IBS 1
204
205static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
206{
207 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
208 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
209
210 apic_write(reg, v);
211}
212
213u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
214{
215 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
216 return APIC_EILVT_LVTOFF_MCE;
217}
218
219u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
220{
221 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
222 return APIC_EILVT_LVTOFF_IBS;
223}
224
225/*
226 * Program the next event, relative to now
227 */
81static int lapic_next_event(unsigned long delta, 228static int lapic_next_event(unsigned long delta,
82 struct clock_event_device *evt) 229 struct clock_event_device *evt)
83{ 230{
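setup_APIC_eilvt() above packs an extended-LVT entry as (mask << 16) | (msg_type << 8) | vector and writes it at APIC_EILVT0 + (offset << 4); per the comment, offset 0 (APIC500) is reserved for MCE and offset 1 for IBS. A standalone check of that packing; the base address is inferred from the comment and the vector/msg_type/mask values are made up:

    #include <stdio.h>
    #include <stdint.h>

    #define APIC_EILVT0 0x500                   /* extended LVT base, per the comment */

    int main(void)
    {
            uint8_t lvt_off = 1;                /* IBS slot */
            uint8_t vector = 0xf4, msg_type = 0, mask = 0;   /* hypothetical values */

            uint32_t reg = (lvt_off << 4) + APIC_EILVT0;
            uint32_t val = ((uint32_t)mask << 16) | ((uint32_t)msg_type << 8) | vector;

            printf("write %#x to APIC register %#x\n", (unsigned)val, (unsigned)reg);
            /* prints: write 0xf4 to APIC register 0x510 */
            return 0;
    }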
@@ -85,6 +232,9 @@ static int lapic_next_event(unsigned long delta,
85 return 0; 232 return 0;
86} 233}
87 234
235/*
236 * Setup the lapic timer in periodic or oneshot mode
237 */
88static void lapic_timer_setup(enum clock_event_mode mode, 238static void lapic_timer_setup(enum clock_event_mode mode,
89 struct clock_event_device *evt) 239 struct clock_event_device *evt)
90{ 240{
@@ -127,75 +277,261 @@ static void lapic_timer_broadcast(cpumask_t mask)
127#endif 277#endif
128} 278}
129 279
130static void apic_pm_activate(void); 280/*
 281 * Setup the local APIC timer for this CPU. Copy the initialized values
282 * of the boot CPU and register the clock event in the framework.
283 */
284static void setup_APIC_timer(void)
285{
286 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
131 287
132void apic_wait_icr_idle(void) 288 memcpy(levt, &lapic_clockevent, sizeof(*levt));
289 levt->cpumask = cpumask_of_cpu(smp_processor_id());
290
291 clockevents_register_device(levt);
292}
293
294/*
295 * In this function we calibrate APIC bus clocks to the external
296 * timer. Unfortunately we cannot use jiffies and the timer irq
297 * to calibrate, since some later bootup code depends on getting
298 * the first irq? Ugh.
299 *
300 * We want to do the calibration only once since we
 301 * want to have the local timer irqs in sync. CPUs connected
302 * by the same APIC bus have the very same bus frequency.
303 * And we want to have irqs off anyways, no accidental
304 * APIC irq that way.
305 */
306
307#define TICK_COUNT 100000000
308
309static void __init calibrate_APIC_clock(void)
133{ 310{
134 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 311 unsigned apic, apic_start;
135 cpu_relax(); 312 unsigned long tsc, tsc_start;
313 int result;
314
315 local_irq_disable();
316
317 /*
318 * Put whatever arbitrary (but long enough) timeout
319 * value into the APIC clock, we just want to get the
320 * counter running for calibration.
321 *
322 * No interrupt enable !
323 */
324 __setup_APIC_LVTT(250000000, 0, 0);
325
326 apic_start = apic_read(APIC_TMCCT);
327#ifdef CONFIG_X86_PM_TIMER
328 if (apic_calibrate_pmtmr && pmtmr_ioport) {
329 pmtimer_wait(5000); /* 5ms wait */
330 apic = apic_read(APIC_TMCCT);
331 result = (apic_start - apic) * 1000L / 5;
332 } else
333#endif
334 {
335 rdtscll(tsc_start);
336
337 do {
338 apic = apic_read(APIC_TMCCT);
339 rdtscll(tsc);
340 } while ((tsc - tsc_start) < TICK_COUNT &&
341 (apic_start - apic) < TICK_COUNT);
342
343 result = (apic_start - apic) * 1000L * tsc_khz /
344 (tsc - tsc_start);
345 }
346
347 local_irq_enable();
348
349 printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
350
351 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
352 result / 1000 / 1000, result / 1000 % 1000);
353
354 /* Calculate the scaled math multiplication factor */
355 lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32);
356 lapic_clockevent.max_delta_ns =
357 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
358 lapic_clockevent.min_delta_ns =
359 clockevent_delta2ns(0xF, &lapic_clockevent);
360
361 calibration_result = result / HZ;
136} 362}
137 363
138unsigned int safe_apic_wait_icr_idle(void) 364/*
365 * Setup the boot APIC
366 *
367 * Calibrate and verify the result.
368 */
369void __init setup_boot_APIC_clock(void)
139{ 370{
140 unsigned int send_status; 371 /*
141 int timeout; 372 * The local apic timer can be disabled via the kernel commandline.
373 * Register the lapic timer as a dummy clock event source on SMP
374 * systems, so the broadcast mechanism is used. On UP systems simply
375 * ignore it.
376 */
377 if (disable_apic_timer) {
378 printk(KERN_INFO "Disabling APIC timer\n");
379 /* No broadcast on UP ! */
380 if (num_possible_cpus() > 1) {
381 lapic_clockevent.mult = 1;
382 setup_APIC_timer();
383 }
384 return;
385 }
142 386
143 timeout = 0; 387 printk(KERN_INFO "Using local APIC timer interrupts.\n");
144 do { 388 calibrate_APIC_clock();
145 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
146 if (!send_status)
147 break;
148 udelay(100);
149 } while (timeout++ < 1000);
150 389
151 return send_status; 390 /*
391 * Do a sanity check on the APIC calibration result
392 */
393 if (calibration_result < (1000000 / HZ)) {
394 printk(KERN_WARNING
395 "APIC frequency too slow, disabling apic timer\n");
396 /* No broadcast on UP ! */
397 if (num_possible_cpus() > 1)
398 setup_APIC_timer();
399 return;
400 }
401
402 /*
403 * If nmi_watchdog is set to IO_APIC, we need the
404 * PIT/HPET going. Otherwise register lapic as a dummy
405 * device.
406 */
407 if (nmi_watchdog != NMI_IO_APIC)
408 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
409 else
410 printk(KERN_WARNING "APIC timer registered as dummy,"
411 " due to nmi_watchdog=1!\n");
412
413 setup_APIC_timer();
152} 414}
153 415
154void enable_NMI_through_LVT0 (void * dummy) 416/*
417 * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
418 * C1E flag only in the secondary CPU, so when we detect the wreckage
419 * we already have enabled the boot CPU local apic timer. Check, if
420 * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
421 * set the DUMMY flag again and force the broadcast mode in the
422 * clockevents layer.
423 */
424void __cpuinit check_boot_apic_timer_broadcast(void)
155{ 425{
156 unsigned int v; 426 if (!disable_apic_timer ||
427 (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
428 return;
157 429
158 /* unmask and set to NMI */ 430 printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
159 v = APIC_DM_NMI; 431 lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
160 apic_write(APIC_LVT0, v); 432
433 local_irq_enable();
434 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id);
435 local_irq_disable();
161} 436}
162 437
163int get_maxlvt(void) 438void __cpuinit setup_secondary_APIC_clock(void)
164{ 439{
165 unsigned int v, maxlvt; 440 check_boot_apic_timer_broadcast();
441 setup_APIC_timer();
442}
166 443
167 v = apic_read(APIC_LVR); 444/*
168 maxlvt = GET_APIC_MAXLVT(v); 445 * The guts of the apic timer interrupt
169 return maxlvt; 446 */
447static void local_apic_timer_interrupt(void)
448{
449 int cpu = smp_processor_id();
450 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
451
452 /*
453 * Normally we should not be here till LAPIC has been initialized but
 454 * in some cases like kdump, it's possible that there is a pending LAPIC
 455 * timer interrupt from the previous kernel's context and it is delivered in
 456 * the new kernel the moment interrupts are enabled.
457 *
 458 * Interrupts are enabled early and the LAPIC is set up much later, hence
 459 * it's possible that when we get here evt->event_handler is NULL.
460 * Check for event_handler being NULL and discard the interrupt as
461 * spurious.
462 */
463 if (!evt->event_handler) {
464 printk(KERN_WARNING
465 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
466 /* Switch it off */
467 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
468 return;
469 }
470
471 /*
472 * the NMI deadlock-detector uses this.
473 */
474 add_pda(apic_timer_irqs, 1);
475
476 evt->event_handler(evt);
170} 477}
171 478
172/* 479/*
173 * 'what should we do if we get a hw irq event on an illegal vector'. 480 * Local APIC timer interrupt. This is the most natural way for doing
174 * each architecture has to answer this themselves. 481 * local interrupts, but local timer interrupts can be emulated by
482 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
483 *
484 * [ if a single-CPU system runs an SMP kernel then we call the local
485 * interrupt as well. Thus we cannot inline the local irq ... ]
175 */ 486 */
176void ack_bad_irq(unsigned int irq) 487void smp_apic_timer_interrupt(struct pt_regs *regs)
177{ 488{
178 printk("unexpected IRQ trap at vector %02x\n", irq); 489 struct pt_regs *old_regs = set_irq_regs(regs);
490
179 /* 491 /*
180 * Currently unexpected vectors happen only on SMP and APIC. 492 * NOTE! We'd better ACK the irq immediately,
181 * We _must_ ack these because every local APIC has only N 493 * because timer handling can be slow.
182 * irq slots per priority level, and a 'hanging, unacked' IRQ
183 * holds up an irq slot - in excessive cases (when multiple
184 * unexpected vectors occur) that might lock up the APIC
185 * completely.
186 * But don't ack when the APIC is disabled. -AK
187 */ 494 */
188 if (!disable_apic) 495 ack_APIC_irq();
189 ack_APIC_irq(); 496 /*
497 * update_process_times() expects us to have done irq_enter().
498 * Besides, if we don't timer interrupts ignore the global
499 * interrupt lock, which is the WrongThing (tm) to do.
500 */
501 exit_idle();
502 irq_enter();
503 local_apic_timer_interrupt();
504 irq_exit();
505 set_irq_regs(old_regs);
506}
507
508int setup_profiling_timer(unsigned int multiplier)
509{
510 return -EINVAL;
190} 511}
191 512
513
514/*
515 * Local APIC start and shutdown
516 */
517
518/**
519 * clear_local_APIC - shutdown the local APIC
520 *
521 * This is called, when a CPU is disabled and before rebooting, so the state of
522 * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
523 * leftovers during boot.
524 */
192void clear_local_APIC(void) 525void clear_local_APIC(void)
193{ 526{
194 int maxlvt; 527 int maxlvt = lapic_get_maxlvt();
195 unsigned int v; 528 u32 v;
196 529
197 maxlvt = get_maxlvt(); 530 /* APIC hasn't been mapped yet */
531 if (!apic_phys)
532 return;
198 533
534 maxlvt = lapic_get_maxlvt();
199 /* 535 /*
200 * Masking an LVT entry can trigger a local APIC error 536 * Masking an LVT entry can trigger a local APIC error
201 * if the vector is zero. Mask LVTERR first to prevent this. 537 * if the vector is zero. Mask LVTERR first to prevent this.
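When no PM timer is available, calibrate_APIC_clock() above derives the APIC timer frequency purely from how many APIC counts and TSC cycles elapse over the same window: result = apic_delta * 1000 * tsc_khz / tsc_delta. A worked example with invented numbers, runnable in user space:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* all values hypothetical, chosen only to exercise the formula */
            uint64_t tsc_khz    = 2000000;      /* 2 GHz TSC */
            uint64_t apic_delta = 1000000;      /* APIC timer counts elapsed */
            uint64_t tsc_delta  = 8000000;      /* TSC cycles in the same window */

            uint64_t result = apic_delta * 1000 * tsc_khz / tsc_delta;

            printf("APIC timer runs at %llu.%03llu MHz\n",
                   (unsigned long long)(result / 1000000),
                   (unsigned long long)(result / 1000 % 1000));
            /* prints: APIC timer runs at 250.000 MHz */
            return 0;
    }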
@@ -233,45 +569,9 @@ void clear_local_APIC(void)
233 apic_read(APIC_ESR); 569 apic_read(APIC_ESR);
234} 570}
235 571
236void disconnect_bsp_APIC(int virt_wire_setup) 572/**
237{ 573 * disable_local_APIC - clear and disable the local APIC
238 /* Go back to Virtual Wire compatibility mode */ 574 */
239 unsigned long value;
240
241 /* For the spurious interrupt use vector F, and enable it */
242 value = apic_read(APIC_SPIV);
243 value &= ~APIC_VECTOR_MASK;
244 value |= APIC_SPIV_APIC_ENABLED;
245 value |= 0xf;
246 apic_write(APIC_SPIV, value);
247
248 if (!virt_wire_setup) {
249 /*
250 * For LVT0 make it edge triggered, active high,
251 * external and enabled
252 */
253 value = apic_read(APIC_LVT0);
254 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
255 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
256 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
257 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
258 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
259 apic_write(APIC_LVT0, value);
260 } else {
261 /* Disable LVT0 */
262 apic_write(APIC_LVT0, APIC_LVT_MASKED);
263 }
264
265 /* For LVT1 make it edge triggered, active high, nmi and enabled */
266 value = apic_read(APIC_LVT1);
267 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
268 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
269 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
270 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
271 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
272 apic_write(APIC_LVT1, value);
273}
274
275void disable_local_APIC(void) 575void disable_local_APIC(void)
276{ 576{
277 unsigned int value; 577 unsigned int value;
@@ -333,7 +633,7 @@ int __init verify_local_APIC(void)
333 reg1 = GET_APIC_VERSION(reg0); 633 reg1 = GET_APIC_VERSION(reg0);
334 if (reg1 == 0x00 || reg1 == 0xff) 634 if (reg1 == 0x00 || reg1 == 0xff)
335 return 0; 635 return 0;
336 reg1 = get_maxlvt(); 636 reg1 = lapic_get_maxlvt();
337 if (reg1 < 0x02 || reg1 == 0xff) 637 if (reg1 < 0x02 || reg1 == 0xff)
338 return 0; 638 return 0;
339 639
@@ -355,18 +655,20 @@ int __init verify_local_APIC(void)
355 * compatibility mode, but most boxes are anymore. 655 * compatibility mode, but most boxes are anymore.
356 */ 656 */
357 reg0 = apic_read(APIC_LVT0); 657 reg0 = apic_read(APIC_LVT0);
358 apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); 658 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
359 reg1 = apic_read(APIC_LVT1); 659 reg1 = apic_read(APIC_LVT1);
360 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); 660 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
361 661
362 return 1; 662 return 1;
363} 663}
364 664
665/**
666 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
667 */
365void __init sync_Arb_IDs(void) 668void __init sync_Arb_IDs(void)
366{ 669{
367 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ 670 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
368 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); 671 if (modern_apic())
369 if (ver >= 0x14) /* P4 or higher */
370 return; 672 return;
371 673
372 /* 674 /*
@@ -418,9 +720,12 @@ void __init init_bsp_APIC(void)
418 apic_write(APIC_LVT1, value); 720 apic_write(APIC_LVT1, value);
419} 721}
420 722
421void __cpuinit setup_local_APIC (void) 723/**
724 * setup_local_APIC - setup the local APIC
725 */
726void __cpuinit setup_local_APIC(void)
422{ 727{
423 unsigned int value, maxlvt; 728 unsigned int value;
424 int i, j; 729 int i, j;
425 730
426 value = apic_read(APIC_LVR); 731 value = apic_read(APIC_LVR);
@@ -516,30 +821,217 @@ void __cpuinit setup_local_APIC (void)
516 else 821 else
517 value = APIC_DM_NMI | APIC_LVT_MASKED; 822 value = APIC_DM_NMI | APIC_LVT_MASKED;
518 apic_write(APIC_LVT1, value); 823 apic_write(APIC_LVT1, value);
824}
519 825
520 { 826void __cpuinit lapic_setup_esr(void)
521 unsigned oldvalue; 827{
522 maxlvt = get_maxlvt(); 828 unsigned maxlvt = lapic_get_maxlvt();
523 oldvalue = apic_read(APIC_ESR); 829
524 value = ERROR_APIC_VECTOR; // enables sending errors 830 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
525 apic_write(APIC_LVTERR, value); 831 /*
526 /* 832 * spec says clear errors after enabling vector.
527 * spec says clear errors after enabling vector. 833 */
528 */ 834 if (maxlvt > 3)
529 if (maxlvt > 3) 835 apic_write(APIC_ESR, 0);
530 apic_write(APIC_ESR, 0); 836}
531 value = apic_read(APIC_ESR);
532 if (value != oldvalue)
533 apic_printk(APIC_VERBOSE,
534 "ESR value after enabling vector: %08x, after %08x\n",
535 oldvalue, value);
536 }
537 837
838void __cpuinit end_local_APIC_setup(void)
839{
840 lapic_setup_esr();
538 nmi_watchdog_default(); 841 nmi_watchdog_default();
539 setup_apic_nmi_watchdog(NULL); 842 setup_apic_nmi_watchdog(NULL);
540 apic_pm_activate(); 843 apic_pm_activate();
541} 844}
542 845
846/*
847 * Detect and enable local APICs on non-SMP boards.
848 * Original code written by Keir Fraser.
849 * On AMD64 we trust the BIOS - if it says no APIC it is likely
850 * not correctly set up (usually the APIC timer won't work etc.)
851 */
852static int __init detect_init_APIC(void)
853{
854 if (!cpu_has_apic) {
855 printk(KERN_INFO "No local APIC present\n");
856 return -1;
857 }
858
859 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
860 boot_cpu_id = 0;
861 return 0;
862}
863
864/**
865 * init_apic_mappings - initialize APIC mappings
866 */
867void __init init_apic_mappings(void)
868{
869 /*
870 * If no local APIC can be found then set up a fake all
871 * zeroes page to simulate the local APIC and another
872 * one for the IO-APIC.
873 */
874 if (!smp_found_config && detect_init_APIC()) {
875 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
876 apic_phys = __pa(apic_phys);
877 } else
878 apic_phys = mp_lapic_addr;
879
880 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
881 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
882 APIC_BASE, apic_phys);
883
884 /* Put local APIC into the resource map. */
885 lapic_resource.start = apic_phys;
886 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
887 insert_resource(&iomem_resource, &lapic_resource);
888
889 /*
890 * Fetch the APIC ID of the BSP in case we have a
891 * default configuration (or the MP table is broken).
892 */
893 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
894}
895
896/*
897 * This initializes the IO-APIC and APIC hardware if this is
898 * a UP kernel.
899 */
900int __init APIC_init_uniprocessor(void)
901{
902 if (disable_apic) {
903 printk(KERN_INFO "Apic disabled\n");
904 return -1;
905 }
906 if (!cpu_has_apic) {
907 disable_apic = 1;
908 printk(KERN_INFO "Apic disabled by BIOS\n");
909 return -1;
910 }
911
912 verify_local_APIC();
913
914 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
915 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
916
917 setup_local_APIC();
918
919 /*
 920	 * Now enable the IO-APICs; this actually calls clear_IO_APIC.
 921	 * We need clear_IO_APIC before enabling the vector on the BP.
922 */
923 if (!skip_ioapic_setup && nr_ioapics)
924 enable_IO_APIC();
925
926 end_local_APIC_setup();
927
928 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
929 setup_IO_APIC();
930 else
931 nr_ioapics = 0;
932 setup_boot_APIC_clock();
933 check_nmi_watchdog();
934 return 0;
935}
936
937/*
938 * Local APIC interrupts
939 */
940
941/*
942 * This interrupt should _never_ happen with our APIC/SMP architecture
943 */
944asmlinkage void smp_spurious_interrupt(void)
945{
946 unsigned int v;
947 exit_idle();
948 irq_enter();
949 /*
950 * Check if this really is a spurious interrupt and ACK it
951 * if it is a vectored one. Just in case...
952 * Spurious interrupts should not be ACKed.
953 */
954 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
955 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
956 ack_APIC_irq();
957
958 add_pda(irq_spurious_count, 1);
959 irq_exit();
960}
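
The register/bit arithmetic in smp_spurious_interrupt() above is easy to verify in isolation: each 32-bit ISR register covers 32 vectors and consecutive registers sit 0x10 apart, so ((vector & ~0x1f) >> 1) is simply (vector / 32) * 0x10. A minimal user-space sketch of that arithmetic; the APIC_ISR and SPURIOUS_APIC_VECTOR values are assumptions matching apicdef.h, not part of this patch.

	#include <stdio.h>

	/* Assumed stand-ins for the kernel constants referenced above. */
	#define APIC_ISR             0x100  /* offset of the first ISR register */
	#define SPURIOUS_APIC_VECTOR 0xff   /* spurious vector on x86-64 */

	int main(void)
	{
		unsigned int vector = SPURIOUS_APIC_VECTOR;

		/* Each ISR register holds 32 vectors, registers are 0x10 apart,
		 * so (vector / 32) * 0x10 == ((vector & ~0x1f) >> 1). */
		unsigned int reg = APIC_ISR + ((vector & ~0x1f) >> 1);
		unsigned int bit = 1u << (vector & 0x1f);

		printf("vector 0x%02x -> ISR offset 0x%03x, bit mask 0x%08x\n",
		       vector, reg, bit);
		return 0;
	}
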
961
962/*
963 * This interrupt should never happen with our APIC/SMP architecture
964 */
965asmlinkage void smp_error_interrupt(void)
966{
967 unsigned int v, v1;
968
969 exit_idle();
970 irq_enter();
971 /* First tickle the hardware, only then report what went on. -- REW */
972 v = apic_read(APIC_ESR);
973 apic_write(APIC_ESR, 0);
974 v1 = apic_read(APIC_ESR);
975 ack_APIC_irq();
976 atomic_inc(&irq_err_count);
977
978 /* Here is what the APIC error bits mean:
979 0: Send CS error
980 1: Receive CS error
981 2: Send accept error
982 3: Receive accept error
983 4: Reserved
984 5: Send illegal vector
985 6: Received illegal vector
986 7: Illegal register address
987 */
988 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
989 smp_processor_id(), v , v1);
990 irq_exit();
991}
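
For reference, the eight ESR bits listed in the comment above can be decoded into readable strings with a few lines of C. This is only an illustrative sketch; decode_esr() is a hypothetical helper, not something this patch adds.

	#include <stdio.h>

	/* Bit meanings exactly as listed in the smp_error_interrupt() comment. */
	static const char *const esr_bits[8] = {
		"Send CS error",
		"Receive CS error",
		"Send accept error",
		"Receive accept error",
		"Reserved",
		"Send illegal vector",
		"Received illegal vector",
		"Illegal register address",
	};

	/* Hypothetical helper: print every error bit set in an ESR value. */
	static void decode_esr(unsigned int esr)
	{
		int i;

		for (i = 0; i < 8; i++)
			if (esr & (1u << i))
				printf("  bit %d: %s\n", i, esr_bits[i]);
	}

	int main(void)
	{
		printf("ESR 0x60 decodes as:\n");
		decode_esr(0x60);	/* bits 5 and 6: illegal vector sent/received */
		return 0;
	}
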
992
993void disconnect_bsp_APIC(int virt_wire_setup)
994{
995 /* Go back to Virtual Wire compatibility mode */
996 unsigned long value;
997
998 /* For the spurious interrupt use vector F, and enable it */
999 value = apic_read(APIC_SPIV);
1000 value &= ~APIC_VECTOR_MASK;
1001 value |= APIC_SPIV_APIC_ENABLED;
1002 value |= 0xf;
1003 apic_write(APIC_SPIV, value);
1004
1005 if (!virt_wire_setup) {
1006 /*
1007 * For LVT0 make it edge triggered, active high,
1008 * external and enabled
1009 */
1010 value = apic_read(APIC_LVT0);
1011 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1012 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1013 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1014 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1015 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1016 apic_write(APIC_LVT0, value);
1017 } else {
1018 /* Disable LVT0 */
1019 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1020 }
1021
1022 /* For LVT1 make it edge triggered, active high, nmi and enabled */
1023 value = apic_read(APIC_LVT1);
1024 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1025 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1026 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1027 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1028 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1029 apic_write(APIC_LVT1, value);
1030}
1031
1032/*
1033 * Power management
1034 */
543#ifdef CONFIG_PM 1035#ifdef CONFIG_PM
544 1036
545static struct { 1037static struct {
@@ -571,7 +1063,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
571 if (!apic_pm_state.active) 1063 if (!apic_pm_state.active)
572 return 0; 1064 return 0;
573 1065
574 maxlvt = get_maxlvt(); 1066 maxlvt = lapic_get_maxlvt();
575 1067
576 apic_pm_state.apic_id = apic_read(APIC_ID); 1068 apic_pm_state.apic_id = apic_read(APIC_ID);
577 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 1069 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
@@ -605,7 +1097,7 @@ static int lapic_resume(struct sys_device *dev)
605 if (!apic_pm_state.active) 1097 if (!apic_pm_state.active)
606 return 0; 1098 return 0;
607 1099
608 maxlvt = get_maxlvt(); 1100 maxlvt = lapic_get_maxlvt();
609 1101
610 local_irq_save(flags); 1102 local_irq_save(flags);
611 rdmsr(MSR_IA32_APICBASE, l, h); 1103 rdmsr(MSR_IA32_APICBASE, l, h);
@@ -639,14 +1131,14 @@ static int lapic_resume(struct sys_device *dev)
639} 1131}
640 1132
641static struct sysdev_class lapic_sysclass = { 1133static struct sysdev_class lapic_sysclass = {
642 set_kset_name("lapic"), 1134 .name = "lapic",
643 .resume = lapic_resume, 1135 .resume = lapic_resume,
644 .suspend = lapic_suspend, 1136 .suspend = lapic_suspend,
645}; 1137};
646 1138
647static struct sys_device device_lapic = { 1139static struct sys_device device_lapic = {
648 .id = 0, 1140 .id = 0,
649 .cls = &lapic_sysclass, 1141 .cls = &lapic_sysclass,
650}; 1142};
651 1143
652static void __cpuinit apic_pm_activate(void) 1144static void __cpuinit apic_pm_activate(void)
@@ -657,9 +1149,11 @@ static void __cpuinit apic_pm_activate(void)
657static int __init init_lapic_sysfs(void) 1149static int __init init_lapic_sysfs(void)
658{ 1150{
659 int error; 1151 int error;
1152
660 if (!cpu_has_apic) 1153 if (!cpu_has_apic)
661 return 0; 1154 return 0;
662 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ 1155 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1156
663 error = sysdev_class_register(&lapic_sysclass); 1157 error = sysdev_class_register(&lapic_sysclass);
664 if (!error) 1158 if (!error)
665 error = sysdev_register(&device_lapic); 1159 error = sysdev_register(&device_lapic);
@@ -673,423 +1167,6 @@ static void apic_pm_activate(void) { }
673 1167
674#endif /* CONFIG_PM */ 1168#endif /* CONFIG_PM */
675 1169
676static int __init apic_set_verbosity(char *str)
677{
678 if (str == NULL) {
679 skip_ioapic_setup = 0;
680 ioapic_force = 1;
681 return 0;
682 }
683 if (strcmp("debug", str) == 0)
684 apic_verbosity = APIC_DEBUG;
685 else if (strcmp("verbose", str) == 0)
686 apic_verbosity = APIC_VERBOSE;
687 else {
688 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
689 " use apic=verbose or apic=debug\n", str);
690 return -EINVAL;
691 }
692
693 return 0;
694}
695early_param("apic", apic_set_verbosity);
696
697/*
698 * Detect and enable local APICs on non-SMP boards.
699 * Original code written by Keir Fraser.
700 * On AMD64 we trust the BIOS - if it says no APIC it is likely
701 * not correctly set up (usually the APIC timer won't work etc.)
702 */
703
704static int __init detect_init_APIC (void)
705{
706 if (!cpu_has_apic) {
707 printk(KERN_INFO "No local APIC present\n");
708 return -1;
709 }
710
711 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
712 boot_cpu_id = 0;
713 return 0;
714}
715
716#ifdef CONFIG_X86_IO_APIC
717static struct resource * __init ioapic_setup_resources(void)
718{
719#define IOAPIC_RESOURCE_NAME_SIZE 11
720 unsigned long n;
721 struct resource *res;
722 char *mem;
723 int i;
724
725 if (nr_ioapics <= 0)
726 return NULL;
727
728 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
729 n *= nr_ioapics;
730
731 mem = alloc_bootmem(n);
732 res = (void *)mem;
733
734 if (mem != NULL) {
735 memset(mem, 0, n);
736 mem += sizeof(struct resource) * nr_ioapics;
737
738 for (i = 0; i < nr_ioapics; i++) {
739 res[i].name = mem;
740 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
741 sprintf(mem, "IOAPIC %u", i);
742 mem += IOAPIC_RESOURCE_NAME_SIZE;
743 }
744 }
745
746 ioapic_resources = res;
747
748 return res;
749}
750
751static int __init ioapic_insert_resources(void)
752{
753 int i;
754 struct resource *r = ioapic_resources;
755
756 if (!r) {
 757		printk("IO APIC resources could not be allocated.\n");
758 return -1;
759 }
760
761 for (i = 0; i < nr_ioapics; i++) {
762 insert_resource(&iomem_resource, r);
763 r++;
764 }
765
766 return 0;
767}
768
 769/* Insert the IO APIC resources after PCI initialization has occurred to handle
 770 * IO APICs that are mapped in on a BAR in PCI space. */
771late_initcall(ioapic_insert_resources);
772#endif
773
774void __init init_apic_mappings(void)
775{
776 unsigned long apic_phys;
777
778 /*
779 * If no local APIC can be found then set up a fake all
780 * zeroes page to simulate the local APIC and another
781 * one for the IO-APIC.
782 */
783 if (!smp_found_config && detect_init_APIC()) {
784 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
785 apic_phys = __pa(apic_phys);
786 } else
787 apic_phys = mp_lapic_addr;
788
789 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
790 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
791 APIC_BASE, apic_phys);
792
793 /* Put local APIC into the resource map. */
794 lapic_resource.start = apic_phys;
795 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
796 insert_resource(&iomem_resource, &lapic_resource);
797
798 /*
799 * Fetch the APIC ID of the BSP in case we have a
800 * default configuration (or the MP table is broken).
801 */
802 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
803
804 {
805 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
806 int i;
807 struct resource *ioapic_res;
808
809 ioapic_res = ioapic_setup_resources();
810 for (i = 0; i < nr_ioapics; i++) {
811 if (smp_found_config) {
812 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
813 } else {
814 ioapic_phys = (unsigned long)
815 alloc_bootmem_pages(PAGE_SIZE);
816 ioapic_phys = __pa(ioapic_phys);
817 }
818 set_fixmap_nocache(idx, ioapic_phys);
819 apic_printk(APIC_VERBOSE,
820 "mapped IOAPIC to %016lx (%016lx)\n",
821 __fix_to_virt(idx), ioapic_phys);
822 idx++;
823
824 if (ioapic_res != NULL) {
825 ioapic_res->start = ioapic_phys;
826 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
827 ioapic_res++;
828 }
829 }
830 }
831}
832
833/*
834 * This function sets up the local APIC timer, with a timeout of
835 * 'clocks' APIC bus clock. During calibration we actually call
836 * this function twice on the boot CPU, once with a bogus timeout
837 * value, second time for real. The other (noncalibrating) CPUs
838 * call this function only once, with the real, calibrated value.
839 *
840 * We do reads before writes even if unnecessary, to get around the
841 * P5 APIC double write bug.
842 */
843
844static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
845{
846 unsigned int lvtt_value, tmp_value;
847
848 lvtt_value = LOCAL_TIMER_VECTOR;
849 if (!oneshot)
850 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
851 if (!irqen)
852 lvtt_value |= APIC_LVT_MASKED;
853
854 apic_write(APIC_LVTT, lvtt_value);
855
856 /*
857 * Divide PICLK by 16
858 */
859 tmp_value = apic_read(APIC_TDCR);
860 apic_write(APIC_TDCR, (tmp_value
861 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
862 | APIC_TDR_DIV_16);
863
864 if (!oneshot)
865 apic_write(APIC_TMICT, clocks);
866}
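
__setup_APIC_LVTT() above programs the timer to count at the APIC bus clock divided by 16, so the APIC_TMICT count needed for a given tick rate is plain arithmetic. A sketch with assumed example numbers (the 200 MHz bus clock and HZ=250 are illustrative, not values measured by this code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long bus_clock_hz = 200000000UL; /* assumed 200 MHz bus clock */
		unsigned int divisor = 16;                /* APIC_TDR_DIV_16, as above */
		unsigned int hz = 250;                    /* assumed kernel tick rate  */

		/* The timer decrements at bus_clock / divisor, so one tick of
		 * 1/HZ seconds needs this initial count in APIC_TMICT. */
		unsigned long ticks_per_irq = bus_clock_hz / divisor / hz;

		printf("APIC_TMICT initial count for %u Hz ticks: %lu\n",
		       hz, ticks_per_irq);
		return 0;
	}
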
867
868static void setup_APIC_timer(void)
869{
870 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
871
872 memcpy(levt, &lapic_clockevent, sizeof(*levt));
873 levt->cpumask = cpumask_of_cpu(smp_processor_id());
874
875 clockevents_register_device(levt);
876}
877
878/*
879 * In this function we calibrate APIC bus clocks to the external
880 * timer. Unfortunately we cannot use jiffies and the timer irq
881 * to calibrate, since some later bootup code depends on getting
882 * the first irq? Ugh.
883 *
884 * We want to do the calibration only once since we
 885 * want to have the local timer irqs synchronized. CPUs connected
 886 * by the same APIC bus have the very same bus frequency.
 887 * And we want to have irqs off anyway, so no accidental
888 * APIC irq that way.
889 */
890
891#define TICK_COUNT 100000000
892
893static void __init calibrate_APIC_clock(void)
894{
895 unsigned apic, apic_start;
896 unsigned long tsc, tsc_start;
897 int result;
898
899 local_irq_disable();
900
901 /*
902 * Put whatever arbitrary (but long enough) timeout
903 * value into the APIC clock, we just want to get the
904 * counter running for calibration.
905 *
906 * No interrupt enable !
907 */
908 __setup_APIC_LVTT(250000000, 0, 0);
909
910 apic_start = apic_read(APIC_TMCCT);
911#ifdef CONFIG_X86_PM_TIMER
912 if (apic_calibrate_pmtmr && pmtmr_ioport) {
913 pmtimer_wait(5000); /* 5ms wait */
914 apic = apic_read(APIC_TMCCT);
915 result = (apic_start - apic) * 1000L / 5;
916 } else
917#endif
918 {
919 rdtscll(tsc_start);
920
921 do {
922 apic = apic_read(APIC_TMCCT);
923 rdtscll(tsc);
924 } while ((tsc - tsc_start) < TICK_COUNT &&
925 (apic_start - apic) < TICK_COUNT);
926
927 result = (apic_start - apic) * 1000L * tsc_khz /
928 (tsc - tsc_start);
929 }
930
931 local_irq_enable();
932
933 printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
934
935 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
936 result / 1000 / 1000, result / 1000 % 1000);
937
938 /* Calculate the scaled math multiplication factor */
939 lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32);
940 lapic_clockevent.max_delta_ns =
941 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
942 lapic_clockevent.min_delta_ns =
943 clockevent_delta2ns(0xF, &lapic_clockevent);
944
945 calibration_result = result / HZ;
946}
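
The conversion in calibrate_APIC_clock() can be checked with a worked example: if the APIC timer ticked 1,000,000 times while the TSC advanced 100,000,000 cycles on a 2 GHz part, the timer runs at 20 MHz. The numbers below are illustrative, not measurements; the arithmetic assumes a 64-bit unsigned long, as on x86-64.

	#include <stdio.h>

	int main(void)
	{
		unsigned long apic_delta = 1000000UL;   /* APIC timer counts elapsed */
		unsigned long tsc_delta  = 100000000UL; /* TSC cycles over same span */
		unsigned long tsc_khz    = 2000000UL;   /* 2 GHz TSC                 */
		unsigned int  hz         = 250;         /* assumed tick rate         */

		/* Same formula as above: APIC timer counts per second. */
		unsigned long result = apic_delta * 1000UL * tsc_khz / tsc_delta;

		printf("Detected %lu.%03lu MHz APIC timer.\n",
		       result / 1000 / 1000, result / 1000 % 1000);
		printf("calibration_result = %lu counts per tick\n", result / hz);
		return 0;
	}
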
947
948void __init setup_boot_APIC_clock (void)
949{
950 /*
951 * The local apic timer can be disabled via the kernel commandline.
952 * Register the lapic timer as a dummy clock event source on SMP
953 * systems, so the broadcast mechanism is used. On UP systems simply
954 * ignore it.
955 */
956 if (disable_apic_timer) {
957 printk(KERN_INFO "Disabling APIC timer\n");
958 /* No broadcast on UP ! */
959 if (num_possible_cpus() > 1)
960 setup_APIC_timer();
961 return;
962 }
963
964 printk(KERN_INFO "Using local APIC timer interrupts.\n");
965 calibrate_APIC_clock();
966
967 /*
968 * If nmi_watchdog is set to IO_APIC, we need the
969 * PIT/HPET going. Otherwise register lapic as a dummy
970 * device.
971 */
972 if (nmi_watchdog != NMI_IO_APIC)
973 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
974 else
975 printk(KERN_WARNING "APIC timer registered as dummy,"
976 " due to nmi_watchdog=1!\n");
977
978 setup_APIC_timer();
979}
980
981/*
982 * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
983 * C1E flag only in the secondary CPU, so when we detect the wreckage
 984 * we already have enabled the boot CPU local apic timer. Check if
 985 * disable_apic_timer is set and the DUMMY flag is cleared. If so,
986 * set the DUMMY flag again and force the broadcast mode in the
987 * clockevents layer.
988 */
989void __cpuinit check_boot_apic_timer_broadcast(void)
990{
991 if (!disable_apic_timer ||
992 (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
993 return;
994
995 printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
996 lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
997
998 local_irq_enable();
999 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id);
1000 local_irq_disable();
1001}
1002
1003void __cpuinit setup_secondary_APIC_clock(void)
1004{
1005 check_boot_apic_timer_broadcast();
1006 setup_APIC_timer();
1007}
1008
1009int setup_profiling_timer(unsigned int multiplier)
1010{
1011 return -EINVAL;
1012}
1013
1014void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
1015 unsigned char msg_type, unsigned char mask)
1016{
1017 unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
1018 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
1019 apic_write(reg, v);
1020}
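
setup_APIC_extended_lvt() above composes both the register offset and the LVT value from its arguments. A small sketch of that composition; the K8_APIC_EXT_LVT_BASE value and the 0xf9 example vector are assumptions, not taken from this hunk.

	#include <stdio.h>

	#define K8_APIC_EXT_LVT_BASE 0x500  /* assumed value of the kernel constant */

	int main(void)
	{
		unsigned char lvt_off  = 0;    /* extended LVT entry 0 */
		unsigned char vector   = 0xf9; /* example vector       */
		unsigned char msg_type = 0;    /* fixed delivery       */
		unsigned char mask     = 0;    /* entry left unmasked  */

		/* Same composition as in setup_APIC_extended_lvt(). */
		unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
		unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;

		printf("apic_write(0x%03lx, 0x%08x)\n", reg, v);
		return 0;
	}
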
1021
1022/*
1023 * Local timer interrupt handler. It does both profiling and
1024 * process statistics/rescheduling.
1025 *
1026 * We do profiling in every local tick, statistics/rescheduling
1027 * happen only every 'profiling multiplier' ticks. The default
1028 * multiplier is 1 and it can be changed by writing the new multiplier
1029 * value into /proc/profile.
1030 */
1031
1032void smp_local_timer_interrupt(void)
1033{
1034 int cpu = smp_processor_id();
1035 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
1036
1037 /*
1038 * Normally we should not be here till LAPIC has been initialized but
 1039	 * in some cases, like kdump, it's possible that a pending LAPIC
 1040	 * timer interrupt from the previous kernel's context is delivered in
 1041	 * the new kernel the moment interrupts are enabled.
 1042	 *
 1043	 * Interrupts are enabled early and the LAPIC is set up much later, hence
 1044	 * it's possible that when we get here evt->event_handler is NULL.
1045 * Check for event_handler being NULL and discard the interrupt as
1046 * spurious.
1047 */
1048 if (!evt->event_handler) {
1049 printk(KERN_WARNING
1050 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
1051 /* Switch it off */
1052 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
1053 return;
1054 }
1055
1056 /*
1057 * the NMI deadlock-detector uses this.
1058 */
1059 add_pda(apic_timer_irqs, 1);
1060
1061 evt->event_handler(evt);
1062}
1063
1064/*
1065 * Local APIC timer interrupt. This is the most natural way for doing
1066 * local interrupts, but local timer interrupts can be emulated by
1067 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
1068 *
1069 * [ if a single-CPU system runs an SMP kernel then we call the local
1070 * interrupt as well. Thus we cannot inline the local irq ... ]
1071 */
1072void smp_apic_timer_interrupt(struct pt_regs *regs)
1073{
1074 struct pt_regs *old_regs = set_irq_regs(regs);
1075
1076 /*
1077 * NOTE! We'd better ACK the irq immediately,
1078 * because timer handling can be slow.
1079 */
1080 ack_APIC_irq();
1081 /*
1082 * update_process_times() expects us to have done irq_enter().
 1083	 * Besides, if we don't, timer interrupts ignore the global
 1084	 * interrupt lock, which is the WrongThing (tm) to do.
1085 */
1086 exit_idle();
1087 irq_enter();
1088 smp_local_timer_interrupt();
1089 irq_exit();
1090 set_irq_regs(old_regs);
1091}
1092
1093/* 1170/*
1094 * apic_is_clustered_box() -- Check if we can expect good TSC 1171 * apic_is_clustered_box() -- Check if we can expect good TSC
1095 * 1172 *
@@ -1103,21 +1180,34 @@ __cpuinit int apic_is_clustered_box(void)
1103{ 1180{
1104 int i, clusters, zeros; 1181 int i, clusters, zeros;
1105 unsigned id; 1182 unsigned id;
1183 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
1106 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); 1184 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
1107 1185
1108 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 1186 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1109 1187
1110 for (i = 0; i < NR_CPUS; i++) { 1188 for (i = 0; i < NR_CPUS; i++) {
1111 id = bios_cpu_apicid[i]; 1189 /* are we being called early in kernel startup? */
1190 if (bios_cpu_apicid) {
1191 id = bios_cpu_apicid[i];
1192 }
1193 else if (i < nr_cpu_ids) {
1194 if (cpu_present(i))
1195 id = per_cpu(x86_bios_cpu_apicid, i);
1196 else
1197 continue;
1198 }
1199 else
1200 break;
1201
1112 if (id != BAD_APICID) 1202 if (id != BAD_APICID)
1113 __set_bit(APIC_CLUSTERID(id), clustermap); 1203 __set_bit(APIC_CLUSTERID(id), clustermap);
1114 } 1204 }
1115 1205
1116 /* Problem: Partially populated chassis may not have CPUs in some of 1206 /* Problem: Partially populated chassis may not have CPUs in some of
1117 * the APIC clusters they have been allocated. Only present CPUs have 1207 * the APIC clusters they have been allocated. Only present CPUs have
1118 * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since 1208 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
1119 * clusters are allocated sequentially, count zeros only if they are 1209 * Since clusters are allocated sequentially, count zeros only if
1120 * bounded by ones. 1210 * they are bounded by ones.
1121 */ 1211 */
1122 clusters = 0; 1212 clusters = 0;
1123 zeros = 0; 1213 zeros = 0;
@@ -1138,96 +1228,33 @@ __cpuinit int apic_is_clustered_box(void)
1138} 1228}
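
The counting rule described in the comment a few lines up (empty clusters count only when bounded by populated ones) can be sketched in plain C. This is an illustrative reimplementation over an int array, not the body of apic_is_clustered_box(), which this hunk does not show.

	#include <stdio.h>

	#define NUM_CLUSTERS 16

	int main(void)
	{
		/* Example map: clusters 0, 1 and 4 are populated; 2-3 form an
		 * empty gap bounded by ones; 5..15 are trailing zeros. */
		int clustermap[NUM_CLUSTERS] = { 1, 1, 0, 0, 1 };
		int clusters = 0, zeros = 0, i;

		for (i = 0; i < NUM_CLUSTERS; i++) {
			if (clustermap[i]) {
				clusters += 1 + zeros; /* count the gap we just crossed */
				zeros = 0;
			} else {
				zeros++;               /* counted only if bounded later */
			}
		}

		printf("clusters counted: %d\n", clusters); /* prints 5 */
		return 0;
	}
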
1139 1229
1140/* 1230/*
1141 * This interrupt should _never_ happen with our APIC/SMP architecture 1231 * APIC command line parameters
1142 */
1143asmlinkage void smp_spurious_interrupt(void)
1144{
1145 unsigned int v;
1146 exit_idle();
1147 irq_enter();
1148 /*
1149 * Check if this really is a spurious interrupt and ACK it
1150 * if it is a vectored one. Just in case...
1151 * Spurious interrupts should not be ACKed.
1152 */
1153 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1154 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1155 ack_APIC_irq();
1156
1157 add_pda(irq_spurious_count, 1);
1158 irq_exit();
1159}
1160
1161/*
1162 * This interrupt should never happen with our APIC/SMP architecture
1163 */ 1232 */
1164 1233static int __init apic_set_verbosity(char *str)
1165asmlinkage void smp_error_interrupt(void)
1166{
1167 unsigned int v, v1;
1168
1169 exit_idle();
1170 irq_enter();
1171 /* First tickle the hardware, only then report what went on. -- REW */
1172 v = apic_read(APIC_ESR);
1173 apic_write(APIC_ESR, 0);
1174 v1 = apic_read(APIC_ESR);
1175 ack_APIC_irq();
1176 atomic_inc(&irq_err_count);
1177
1178 /* Here is what the APIC error bits mean:
1179 0: Send CS error
1180 1: Receive CS error
1181 2: Send accept error
1182 3: Receive accept error
1183 4: Reserved
1184 5: Send illegal vector
1185 6: Received illegal vector
1186 7: Illegal register address
1187 */
1188 printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1189 smp_processor_id(), v , v1);
1190 irq_exit();
1191}
1192
1193int disable_apic;
1194
1195/*
1196 * This initializes the IO-APIC and APIC hardware if this is
1197 * a UP kernel.
1198 */
1199int __init APIC_init_uniprocessor (void)
1200{ 1234{
1201 if (disable_apic) { 1235 if (str == NULL) {
1202 printk(KERN_INFO "Apic disabled\n"); 1236 skip_ioapic_setup = 0;
1203 return -1; 1237 ioapic_force = 1;
1238 return 0;
1204 } 1239 }
1205 if (!cpu_has_apic) { 1240 if (strcmp("debug", str) == 0)
1206 disable_apic = 1; 1241 apic_verbosity = APIC_DEBUG;
1207 printk(KERN_INFO "Apic disabled by BIOS\n"); 1242 else if (strcmp("verbose", str) == 0)
1208 return -1; 1243 apic_verbosity = APIC_VERBOSE;
1244 else {
1245 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1246 " use apic=verbose or apic=debug\n", str);
1247 return -EINVAL;
1209 } 1248 }
1210 1249
1211 verify_local_APIC();
1212
1213 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
1214 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
1215
1216 setup_local_APIC();
1217
1218 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1219 setup_IO_APIC();
1220 else
1221 nr_ioapics = 0;
1222 setup_boot_APIC_clock();
1223 check_nmi_watchdog();
1224 return 0; 1250 return 0;
1225} 1251}
1252early_param("apic", apic_set_verbosity);
1226 1253
1227static __init int setup_disableapic(char *str) 1254static __init int setup_disableapic(char *str)
1228{ 1255{
1229 disable_apic = 1; 1256 disable_apic = 1;
1230 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1257 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1231 return 0; 1258 return 0;
1232} 1259}
1233early_param("disableapic", setup_disableapic); 1260early_param("disableapic", setup_disableapic);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 17089a04102..d4438ef296d 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -227,6 +227,7 @@
227#include <linux/dmi.h> 227#include <linux/dmi.h>
228#include <linux/suspend.h> 228#include <linux/suspend.h>
229#include <linux/kthread.h> 229#include <linux/kthread.h>
230#include <linux/jiffies.h>
230 231
231#include <asm/system.h> 232#include <asm/system.h>
232#include <asm/uaccess.h> 233#include <asm/uaccess.h>
@@ -235,8 +236,6 @@
235#include <asm/paravirt.h> 236#include <asm/paravirt.h>
236#include <asm/reboot.h> 237#include <asm/reboot.h>
237 238
238#include "io_ports.h"
239
240#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) 239#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
241extern int (*console_blank_hook)(int); 240extern int (*console_blank_hook)(int);
242#endif 241#endif
@@ -324,7 +323,7 @@ extern int (*console_blank_hook)(int);
324/* 323/*
325 * Ignore suspend events for this amount of time after a resume 324 * Ignore suspend events for this amount of time after a resume
326 */ 325 */
327#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) 326#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
328 327
329/* 328/*
330 * Maximum number of events stored 329 * Maximum number of events stored
@@ -336,7 +335,7 @@ extern int (*console_blank_hook)(int);
336 */ 335 */
337struct apm_user { 336struct apm_user {
338 int magic; 337 int magic;
339 struct apm_user * next; 338 struct apm_user *next;
340 unsigned int suser: 1; 339 unsigned int suser: 1;
341 unsigned int writer: 1; 340 unsigned int writer: 1;
342 unsigned int reader: 1; 341 unsigned int reader: 1;
@@ -372,44 +371,44 @@ struct apm_user {
372static struct { 371static struct {
373 unsigned long offset; 372 unsigned long offset;
374 unsigned short segment; 373 unsigned short segment;
375} apm_bios_entry; 374} apm_bios_entry;
376static int clock_slowed; 375static int clock_slowed;
377static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; 376static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
378static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; 377static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
379static int set_pm_idle; 378static int set_pm_idle;
380static int suspends_pending; 379static int suspends_pending;
381static int standbys_pending; 380static int standbys_pending;
382static int ignore_sys_suspend; 381static int ignore_sys_suspend;
383static int ignore_normal_resume; 382static int ignore_normal_resume;
384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; 383static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
385 384
386static int debug __read_mostly; 385static int debug __read_mostly;
387static int smp __read_mostly; 386static int smp __read_mostly;
388static int apm_disabled = -1; 387static int apm_disabled = -1;
389#ifdef CONFIG_SMP 388#ifdef CONFIG_SMP
390static int power_off; 389static int power_off;
391#else 390#else
392static int power_off = 1; 391static int power_off = 1;
393#endif 392#endif
394#ifdef CONFIG_APM_REAL_MODE_POWER_OFF 393#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
395static int realmode_power_off = 1; 394static int realmode_power_off = 1;
396#else 395#else
397static int realmode_power_off; 396static int realmode_power_off;
398#endif 397#endif
399#ifdef CONFIG_APM_ALLOW_INTS 398#ifdef CONFIG_APM_ALLOW_INTS
400static int allow_ints = 1; 399static int allow_ints = 1;
401#else 400#else
402static int allow_ints; 401static int allow_ints;
403#endif 402#endif
404static int broken_psr; 403static int broken_psr;
405 404
406static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); 405static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
407static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 406static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
408static struct apm_user * user_list; 407static struct apm_user *user_list;
409static DEFINE_SPINLOCK(user_list_lock); 408static DEFINE_SPINLOCK(user_list_lock);
410static const struct desc_struct bad_bios_desc = { 0, 0x00409200 }; 409static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } };
411 410
412static const char driver_version[] = "1.16ac"; /* no spaces */ 411static const char driver_version[] = "1.16ac"; /* no spaces */
413 412
414static struct task_struct *kapmd_task; 413static struct task_struct *kapmd_task;
415 414
@@ -417,7 +416,7 @@ static struct task_struct *kapmd_task;
417 * APM event names taken from the APM 1.2 specification. These are 416 * APM event names taken from the APM 1.2 specification. These are
418 * the message codes that the BIOS uses to tell us about events 417 * the message codes that the BIOS uses to tell us about events
419 */ 418 */
420static const char * const apm_event_name[] = { 419static const char * const apm_event_name[] = {
421 "system standby", 420 "system standby",
422 "system suspend", 421 "system suspend",
423 "normal resume", 422 "normal resume",
@@ -435,14 +434,14 @@ static const char * const apm_event_name[] = {
435 434
436typedef struct lookup_t { 435typedef struct lookup_t {
437 int key; 436 int key;
438 char * msg; 437 char *msg;
439} lookup_t; 438} lookup_t;
440 439
441/* 440/*
442 * The BIOS returns a set of standard error codes in AX when the 441 * The BIOS returns a set of standard error codes in AX when the
443 * carry flag is set. 442 * carry flag is set.
444 */ 443 */
445 444
446static const lookup_t error_table[] = { 445static const lookup_t error_table[] = {
447/* N/A { APM_SUCCESS, "Operation succeeded" }, */ 446/* N/A { APM_SUCCESS, "Operation succeeded" }, */
448 { APM_DISABLED, "Power management disabled" }, 447 { APM_DISABLED, "Power management disabled" },
@@ -472,24 +471,25 @@ static const lookup_t error_table[] = {
472 * Write a meaningful log entry to the kernel log in the event of 471 * Write a meaningful log entry to the kernel log in the event of
473 * an APM error. 472 * an APM error.
474 */ 473 */
475 474
476static void apm_error(char *str, int err) 475static void apm_error(char *str, int err)
477{ 476{
478 int i; 477 int i;
479 478
480 for (i = 0; i < ERROR_COUNT; i++) 479 for (i = 0; i < ERROR_COUNT; i++)
481 if (error_table[i].key == err) break; 480 if (error_table[i].key == err)
481 break;
482 if (i < ERROR_COUNT) 482 if (i < ERROR_COUNT)
483 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); 483 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
484 else 484 else
485 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", 485 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
486 str, err); 486 str, err);
487} 487}
488 488
489/* 489/*
490 * Lock APM functionality to physical CPU 0 490 * Lock APM functionality to physical CPU 0
491 */ 491 */
492 492
493#ifdef CONFIG_SMP 493#ifdef CONFIG_SMP
494 494
495static cpumask_t apm_save_cpus(void) 495static cpumask_t apm_save_cpus(void)
@@ -511,7 +511,7 @@ static inline void apm_restore_cpus(cpumask_t mask)
511/* 511/*
512 * No CPU lockdown needed on a uniprocessor 512 * No CPU lockdown needed on a uniprocessor
513 */ 513 */
514 514
515#define apm_save_cpus() (current->cpus_allowed) 515#define apm_save_cpus() (current->cpus_allowed)
516#define apm_restore_cpus(x) (void)(x) 516#define apm_restore_cpus(x) (void)(x)
517 517
@@ -590,7 +590,7 @@ static inline void apm_irq_restore(unsigned long flags)
590 * code is returned in AH (bits 8-15 of eax) and this function 590 * code is returned in AH (bits 8-15 of eax) and this function
591 * returns non-zero. 591 * returns non-zero.
592 */ 592 */
593 593
594static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, 594static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
595 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) 595 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
596{ 596{
@@ -602,7 +602,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
602 struct desc_struct *gdt; 602 struct desc_struct *gdt;
603 603
604 cpus = apm_save_cpus(); 604 cpus = apm_save_cpus();
605 605
606 cpu = get_cpu(); 606 cpu = get_cpu();
607 gdt = get_cpu_gdt_table(cpu); 607 gdt = get_cpu_gdt_table(cpu);
608 save_desc_40 = gdt[0x40 / 8]; 608 save_desc_40 = gdt[0x40 / 8];
@@ -616,7 +616,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
616 gdt[0x40 / 8] = save_desc_40; 616 gdt[0x40 / 8] = save_desc_40;
617 put_cpu(); 617 put_cpu();
618 apm_restore_cpus(cpus); 618 apm_restore_cpus(cpus);
619 619
620 return *eax & 0xff; 620 return *eax & 0xff;
621} 621}
622 622
@@ -645,7 +645,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
645 struct desc_struct *gdt; 645 struct desc_struct *gdt;
646 646
647 cpus = apm_save_cpus(); 647 cpus = apm_save_cpus();
648 648
649 cpu = get_cpu(); 649 cpu = get_cpu();
650 gdt = get_cpu_gdt_table(cpu); 650 gdt = get_cpu_gdt_table(cpu);
651 save_desc_40 = gdt[0x40 / 8]; 651 save_desc_40 = gdt[0x40 / 8];
@@ -680,7 +680,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
680 680
681static int apm_driver_version(u_short *val) 681static int apm_driver_version(u_short *val)
682{ 682{
683 u32 eax; 683 u32 eax;
684 684
685 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) 685 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
686 return (eax >> 8) & 0xff; 686 return (eax >> 8) & 0xff;
@@ -704,16 +704,16 @@ static int apm_driver_version(u_short *val)
 704 * that APM 1.2 is in use. If no messages are pending the value 0x80 704 * that APM 1.2 is in use. If no messages are pending the value 0x80
 705 * is returned (No power management events pending). 705 * is returned (No power management events pending).
706 */ 706 */
707 707
708static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) 708static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
709{ 709{
710 u32 eax; 710 u32 eax;
711 u32 ebx; 711 u32 ebx;
712 u32 ecx; 712 u32 ecx;
713 u32 dummy; 713 u32 dummy;
714 714
715 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, 715 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
716 &dummy, &dummy)) 716 &dummy, &dummy))
717 return (eax >> 8) & 0xff; 717 return (eax >> 8) & 0xff;
718 *event = ebx; 718 *event = ebx;
719 if (apm_info.connection_version < 0x0102) 719 if (apm_info.connection_version < 0x0102)
@@ -736,10 +736,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
736 * The state holds the state to transition to, which may in fact 736 * The state holds the state to transition to, which may in fact
737 * be an acceptance of a BIOS requested state change. 737 * be an acceptance of a BIOS requested state change.
738 */ 738 */
739 739
740static int set_power_state(u_short what, u_short state) 740static int set_power_state(u_short what, u_short state)
741{ 741{
742 u32 eax; 742 u32 eax;
743 743
744 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) 744 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
745 return (eax >> 8) & 0xff; 745 return (eax >> 8) & 0xff;
@@ -752,7 +752,7 @@ static int set_power_state(u_short what, u_short state)
752 * 752 *
753 * Transition the entire system into a new APM power state. 753 * Transition the entire system into a new APM power state.
754 */ 754 */
755 755
756static int set_system_power_state(u_short state) 756static int set_system_power_state(u_short state)
757{ 757{
758 return set_power_state(APM_DEVICE_ALL, state); 758 return set_power_state(APM_DEVICE_ALL, state);
@@ -766,13 +766,13 @@ static int set_system_power_state(u_short state)
766 * to handle the idle request. On a success the function returns 1 766 * to handle the idle request. On a success the function returns 1
767 * if the BIOS did clock slowing or 0 otherwise. 767 * if the BIOS did clock slowing or 0 otherwise.
768 */ 768 */
769 769
770static int apm_do_idle(void) 770static int apm_do_idle(void)
771{ 771{
772 u32 eax; 772 u32 eax;
773 u8 ret = 0; 773 u8 ret = 0;
774 int idled = 0; 774 int idled = 0;
775 int polling; 775 int polling;
776 776
777 polling = !!(current_thread_info()->status & TS_POLLING); 777 polling = !!(current_thread_info()->status & TS_POLLING);
778 if (polling) { 778 if (polling) {
@@ -799,10 +799,9 @@ static int apm_do_idle(void)
799 /* This always fails on some SMP boards running UP kernels. 799 /* This always fails on some SMP boards running UP kernels.
800 * Only report the failure the first 5 times. 800 * Only report the failure the first 5 times.
801 */ 801 */
802 if (++t < 5) 802 if (++t < 5) {
803 {
804 printk(KERN_DEBUG "apm_do_idle failed (%d)\n", 803 printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
805 (eax >> 8) & 0xff); 804 (eax >> 8) & 0xff);
806 t = jiffies; 805 t = jiffies;
807 } 806 }
808 return -1; 807 return -1;
@@ -814,15 +813,15 @@ static int apm_do_idle(void)
814/** 813/**
815 * apm_do_busy - inform the BIOS the CPU is busy 814 * apm_do_busy - inform the BIOS the CPU is busy
816 * 815 *
817 * Request that the BIOS brings the CPU back to full performance. 816 * Request that the BIOS brings the CPU back to full performance.
818 */ 817 */
819 818
820static void apm_do_busy(void) 819static void apm_do_busy(void)
821{ 820{
822 u32 dummy; 821 u32 dummy;
823 822
824 if (clock_slowed || ALWAYS_CALL_BUSY) { 823 if (clock_slowed || ALWAYS_CALL_BUSY) {
825 (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); 824 (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
826 clock_slowed = 0; 825 clock_slowed = 0;
827 } 826 }
828} 827}
@@ -833,15 +832,15 @@ static void apm_do_busy(void)
833 * power management - we probably want 832 * power management - we probably want
834 * to conserve power. 833 * to conserve power.
835 */ 834 */
836#define IDLE_CALC_LIMIT (HZ * 100) 835#define IDLE_CALC_LIMIT (HZ * 100)
837#define IDLE_LEAKY_MAX 16 836#define IDLE_LEAKY_MAX 16
838 837
839static void (*original_pm_idle)(void) __read_mostly; 838static void (*original_pm_idle)(void) __read_mostly;
840 839
841/** 840/**
842 * apm_cpu_idle - cpu idling for APM capable Linux 841 * apm_cpu_idle - cpu idling for APM capable Linux
843 * 842 *
844 * This is the idling function the kernel executes when APM is available. It 843 * This is the idling function the kernel executes when APM is available. It
845 * tries to do BIOS powermanagement based on the average system idle time. 844 * tries to do BIOS powermanagement based on the average system idle time.
846 * Furthermore it calls the system default idle routine. 845 * Furthermore it calls the system default idle routine.
847 */ 846 */
@@ -882,7 +881,8 @@ recalc:
882 881
883 t = jiffies; 882 t = jiffies;
884 switch (apm_do_idle()) { 883 switch (apm_do_idle()) {
885 case 0: apm_idle_done = 1; 884 case 0:
885 apm_idle_done = 1;
886 if (t != jiffies) { 886 if (t != jiffies) {
887 if (bucket) { 887 if (bucket) {
888 bucket = IDLE_LEAKY_MAX; 888 bucket = IDLE_LEAKY_MAX;
@@ -893,7 +893,8 @@ recalc:
893 continue; 893 continue;
894 } 894 }
895 break; 895 break;
896 case 1: apm_idle_done = 1; 896 case 1:
897 apm_idle_done = 1;
897 break; 898 break;
898 default: /* BIOS refused */ 899 default: /* BIOS refused */
899 break; 900 break;
@@ -921,10 +922,10 @@ recalc:
921 * the SMP call on CPU0 as some systems will only honour this call 922 * the SMP call on CPU0 as some systems will only honour this call
922 * on their first cpu. 923 * on their first cpu.
923 */ 924 */
924 925
925static void apm_power_off(void) 926static void apm_power_off(void)
926{ 927{
927 unsigned char po_bios_call[] = { 928 unsigned char po_bios_call[] = {
928 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ 929 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
929 0x8e, 0xd0, /* movw ax,ss */ 930 0x8e, 0xd0, /* movw ax,ss */
930 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ 931 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
@@ -935,13 +936,12 @@ static void apm_power_off(void)
935 }; 936 };
936 937
937 /* Some bioses don't like being called from CPU != 0 */ 938 /* Some bioses don't like being called from CPU != 0 */
938 if (apm_info.realmode_power_off) 939 if (apm_info.realmode_power_off) {
939 {
940 (void)apm_save_cpus(); 940 (void)apm_save_cpus();
941 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 941 machine_real_restart(po_bios_call, sizeof(po_bios_call));
942 } else {
943 (void)set_system_power_state(APM_STATE_OFF);
942 } 944 }
943 else
944 (void) set_system_power_state(APM_STATE_OFF);
945} 945}
946 946
947#ifdef CONFIG_APM_DO_ENABLE 947#ifdef CONFIG_APM_DO_ENABLE
@@ -950,17 +950,17 @@ static void apm_power_off(void)
950 * apm_enable_power_management - enable BIOS APM power management 950 * apm_enable_power_management - enable BIOS APM power management
951 * @enable: enable yes/no 951 * @enable: enable yes/no
952 * 952 *
953 * Enable or disable the APM BIOS power services. 953 * Enable or disable the APM BIOS power services.
954 */ 954 */
955 955
956static int apm_enable_power_management(int enable) 956static int apm_enable_power_management(int enable)
957{ 957{
958 u32 eax; 958 u32 eax;
959 959
960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) 960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
961 return APM_NOT_ENGAGED; 961 return APM_NOT_ENGAGED;
962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, 962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
963 enable, &eax)) 963 enable, &eax))
964 return (eax >> 8) & 0xff; 964 return (eax >> 8) & 0xff;
965 if (enable) 965 if (enable)
966 apm_info.bios.flags &= ~APM_BIOS_DISABLED; 966 apm_info.bios.flags &= ~APM_BIOS_DISABLED;
@@ -983,19 +983,19 @@ static int apm_enable_power_management(int enable)
 983 * if reported is a lifetime in seconds/minutes at current power 983 * if reported is a lifetime in seconds/minutes at current power
 984 * consumption. 984 * consumption.
985 */ 985 */
986 986
987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) 987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
988{ 988{
989 u32 eax; 989 u32 eax;
990 u32 ebx; 990 u32 ebx;
991 u32 ecx; 991 u32 ecx;
992 u32 edx; 992 u32 edx;
993 u32 dummy; 993 u32 dummy;
994 994
995 if (apm_info.get_power_status_broken) 995 if (apm_info.get_power_status_broken)
996 return APM_32_UNSUPPORTED; 996 return APM_32_UNSUPPORTED;
997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, 997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
998 &eax, &ebx, &ecx, &edx, &dummy)) 998 &eax, &ebx, &ecx, &edx, &dummy))
999 return (eax >> 8) & 0xff; 999 return (eax >> 8) & 0xff;
1000 *status = ebx; 1000 *status = ebx;
1001 *bat = ecx; 1001 *bat = ecx;
@@ -1011,11 +1011,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
1011static int apm_get_battery_status(u_short which, u_short *status, 1011static int apm_get_battery_status(u_short which, u_short *status,
1012 u_short *bat, u_short *life, u_short *nbat) 1012 u_short *bat, u_short *life, u_short *nbat)
1013{ 1013{
1014 u32 eax; 1014 u32 eax;
1015 u32 ebx; 1015 u32 ebx;
1016 u32 ecx; 1016 u32 ecx;
1017 u32 edx; 1017 u32 edx;
1018 u32 esi; 1018 u32 esi;
1019 1019
1020 if (apm_info.connection_version < 0x0102) { 1020 if (apm_info.connection_version < 0x0102) {
1021 /* pretend we only have one battery. */ 1021 /* pretend we only have one battery. */
@@ -1026,7 +1026,7 @@ static int apm_get_battery_status(u_short which, u_short *status,
1026 } 1026 }
1027 1027
1028 if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, 1028 if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
1029 &ebx, &ecx, &edx, &esi)) 1029 &ebx, &ecx, &edx, &esi))
1030 return (eax >> 8) & 0xff; 1030 return (eax >> 8) & 0xff;
1031 *status = ebx; 1031 *status = ebx;
1032 *bat = ecx; 1032 *bat = ecx;
@@ -1044,10 +1044,10 @@ static int apm_get_battery_status(u_short which, u_short *status,
 1044 * Activate or deactivate power management on either a specific device 1044 * Activate or deactivate power management on either a specific device
 1045 * or the entire system (%APM_DEVICE_ALL). 1045 * or the entire system (%APM_DEVICE_ALL).
1046 */ 1046 */
1047 1047
1048static int apm_engage_power_management(u_short device, int enable) 1048static int apm_engage_power_management(u_short device, int enable)
1049{ 1049{
1050 u32 eax; 1050 u32 eax;
1051 1051
1052 if ((enable == 0) && (device == APM_DEVICE_ALL) 1052 if ((enable == 0) && (device == APM_DEVICE_ALL)
1053 && (apm_info.bios.flags & APM_BIOS_DISABLED)) 1053 && (apm_info.bios.flags & APM_BIOS_DISABLED))
@@ -1074,7 +1074,7 @@ static int apm_engage_power_management(u_short device, int enable)
1074 * all video devices. Typically the BIOS will do laptop backlight and 1074 * all video devices. Typically the BIOS will do laptop backlight and
1075 * monitor powerdown for us. 1075 * monitor powerdown for us.
1076 */ 1076 */
1077 1077
1078static int apm_console_blank(int blank) 1078static int apm_console_blank(int blank)
1079{ 1079{
1080 int error = APM_NOT_ENGAGED; /* silence gcc */ 1080 int error = APM_NOT_ENGAGED; /* silence gcc */
@@ -1126,7 +1126,7 @@ static apm_event_t get_queued_event(struct apm_user *as)
1126 1126
1127static void queue_event(apm_event_t event, struct apm_user *sender) 1127static void queue_event(apm_event_t event, struct apm_user *sender)
1128{ 1128{
1129 struct apm_user * as; 1129 struct apm_user *as;
1130 1130
1131 spin_lock(&user_list_lock); 1131 spin_lock(&user_list_lock);
1132 if (user_list == NULL) 1132 if (user_list == NULL)
@@ -1174,11 +1174,11 @@ static void reinit_timer(void)
1174 1174
1175 spin_lock_irqsave(&i8253_lock, flags); 1175 spin_lock_irqsave(&i8253_lock, flags);
1176 /* set the clock to HZ */ 1176 /* set the clock to HZ */
1177 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 1177 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1178 udelay(10); 1178 udelay(10);
1179 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ 1179 outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */
1180 udelay(10); 1180 udelay(10);
1181 outb(LATCH >> 8, PIT_CH0); /* MSB */ 1181 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
1182 udelay(10); 1182 udelay(10);
1183 spin_unlock_irqrestore(&i8253_lock, flags); 1183 spin_unlock_irqrestore(&i8253_lock, flags);
1184#endif 1184#endif
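
The three outb_pit() writes in reinit_timer() reprogram PIT channel 0 back to the kernel tick: mode byte 0x34, then the LATCH divisor as LSB/MSB. A stand-alone sketch of that divisor arithmetic; HZ=250 and the rounding in LATCH are assumptions, not part of this diff.

	#include <stdio.h>

	#define PIT_TICK_RATE 1193182UL /* i8253 input clock in Hz  */
	#define HZ            250       /* assumed kernel tick rate */

	int main(void)
	{
		/* Rounded divisor, in the spirit of the kernel's LATCH macro. */
		unsigned long latch = (PIT_TICK_RATE + HZ / 2) / HZ;

		printf("mode byte : 0x34 (binary, mode 2, LSB/MSB, channel 0)\n");
		printf("LATCH     : %lu\n", latch);
		printf("LSB write : 0x%02lx\n", latch & 0xff);
		printf("MSB write : 0x%02lx\n", latch >> 8);
		return 0;
	}
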
@@ -1186,7 +1186,7 @@ static void reinit_timer(void)
1186 1186
1187static int suspend(int vetoable) 1187static int suspend(int vetoable)
1188{ 1188{
1189 int err; 1189 int err;
1190 struct apm_user *as; 1190 struct apm_user *as;
1191 1191
1192 if (pm_send_all(PM_SUSPEND, (void *)3)) { 1192 if (pm_send_all(PM_SUSPEND, (void *)3)) {
@@ -1239,7 +1239,7 @@ static int suspend(int vetoable)
1239 1239
1240static void standby(void) 1240static void standby(void)
1241{ 1241{
1242 int err; 1242 int err;
1243 1243
1244 local_irq_disable(); 1244 local_irq_disable();
1245 device_power_down(PMSG_SUSPEND); 1245 device_power_down(PMSG_SUSPEND);
@@ -1256,8 +1256,8 @@ static void standby(void)
1256 1256
1257static apm_event_t get_event(void) 1257static apm_event_t get_event(void)
1258{ 1258{
1259 int error; 1259 int error;
1260 apm_event_t event = APM_NO_EVENTS; /* silence gcc */ 1260 apm_event_t event = APM_NO_EVENTS; /* silence gcc */
1261 apm_eventinfo_t info; 1261 apm_eventinfo_t info;
1262 1262
1263 static int notified; 1263 static int notified;
@@ -1275,9 +1275,9 @@ static apm_event_t get_event(void)
1275 1275
1276static void check_events(void) 1276static void check_events(void)
1277{ 1277{
1278 apm_event_t event; 1278 apm_event_t event;
1279 static unsigned long last_resume; 1279 static unsigned long last_resume;
1280 static int ignore_bounce; 1280 static int ignore_bounce;
1281 1281
1282 while ((event = get_event()) != 0) { 1282 while ((event = get_event()) != 0) {
1283 if (debug) { 1283 if (debug) {
@@ -1289,7 +1289,7 @@ static void check_events(void)
1289 "event 0x%02x\n", event); 1289 "event 0x%02x\n", event);
1290 } 1290 }
1291 if (ignore_bounce 1291 if (ignore_bounce
1292 && ((jiffies - last_resume) > bounce_interval)) 1292 && (time_after(jiffies, last_resume + bounce_interval)))
1293 ignore_bounce = 0; 1293 ignore_bounce = 0;
1294 1294
1295 switch (event) { 1295 switch (event) {
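
The hunk above replaces the open-coded jiffies comparison with time_after(), which stays correct when the jiffies counter wraps. A simplified user-space sketch of that comparison (the macro below mimics, but is not, the jiffies.h definition; HZ=250 is assumed):

	#include <stdio.h>

	/* Wraparound-safe "a is after b", in the spirit of the kernel's
	 * time_after() macro (simplified sketch, not the real definition). */
	#define time_after(a, b) ((long)((b) - (a)) < 0)

	int main(void)
	{
		unsigned long hz          = 250;                /* assumed HZ           */
		unsigned long bounce      = 3 * hz;             /* 3 * HZ bounce window */
		unsigned long last_resume = (unsigned long)-16; /* just before the wrap */
		unsigned long jiffies_now = 1280;               /* counter has wrapped  */

		if (time_after(jiffies_now, last_resume + bounce))
			printf("bounce interval expired, stop ignoring events\n");
		else
			printf("still inside the bounce interval\n");
		return 0;
	}
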
@@ -1357,7 +1357,7 @@ static void check_events(void)
1357 /* 1357 /*
1358 * We are not allowed to reject a critical suspend. 1358 * We are not allowed to reject a critical suspend.
1359 */ 1359 */
1360 (void) suspend(0); 1360 (void)suspend(0);
1361 break; 1361 break;
1362 } 1362 }
1363 } 1363 }
@@ -1365,12 +1365,12 @@ static void check_events(void)
1365 1365
1366static void apm_event_handler(void) 1366static void apm_event_handler(void)
1367{ 1367{
1368 static int pending_count = 4; 1368 static int pending_count = 4;
1369 int err; 1369 int err;
1370 1370
1371 if ((standbys_pending > 0) || (suspends_pending > 0)) { 1371 if ((standbys_pending > 0) || (suspends_pending > 0)) {
1372 if ((apm_info.connection_version > 0x100) && 1372 if ((apm_info.connection_version > 0x100) &&
1373 (pending_count-- <= 0)) { 1373 (pending_count-- <= 0)) {
1374 pending_count = 4; 1374 pending_count = 4;
1375 if (debug) 1375 if (debug)
1376 printk(KERN_DEBUG "apm: setting state busy\n"); 1376 printk(KERN_DEBUG "apm: setting state busy\n");
@@ -1418,9 +1418,9 @@ static int check_apm_user(struct apm_user *as, const char *func)
1418 1418
1419static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) 1419static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
1420{ 1420{
1421 struct apm_user * as; 1421 struct apm_user *as;
1422 int i; 1422 int i;
1423 apm_event_t event; 1423 apm_event_t event;
1424 1424
1425 as = fp->private_data; 1425 as = fp->private_data;
1426 if (check_apm_user(as, "read")) 1426 if (check_apm_user(as, "read"))
@@ -1459,9 +1459,9 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *
1459 return 0; 1459 return 0;
1460} 1460}
1461 1461
1462static unsigned int do_poll(struct file *fp, poll_table * wait) 1462static unsigned int do_poll(struct file *fp, poll_table *wait)
1463{ 1463{
1464 struct apm_user * as; 1464 struct apm_user *as;
1465 1465
1466 as = fp->private_data; 1466 as = fp->private_data;
1467 if (check_apm_user(as, "poll")) 1467 if (check_apm_user(as, "poll"))
@@ -1472,10 +1472,10 @@ static unsigned int do_poll(struct file *fp, poll_table * wait)
1472 return 0; 1472 return 0;
1473} 1473}
1474 1474
1475static int do_ioctl(struct inode * inode, struct file *filp, 1475static int do_ioctl(struct inode *inode, struct file *filp,
1476 u_int cmd, u_long arg) 1476 u_int cmd, u_long arg)
1477{ 1477{
1478 struct apm_user * as; 1478 struct apm_user *as;
1479 1479
1480 as = filp->private_data; 1480 as = filp->private_data;
1481 if (check_apm_user(as, "ioctl")) 1481 if (check_apm_user(as, "ioctl"))
@@ -1515,9 +1515,9 @@ static int do_ioctl(struct inode * inode, struct file *filp,
1515 return 0; 1515 return 0;
1516} 1516}
1517 1517
1518static int do_release(struct inode * inode, struct file * filp) 1518static int do_release(struct inode *inode, struct file *filp)
1519{ 1519{
1520 struct apm_user * as; 1520 struct apm_user *as;
1521 1521
1522 as = filp->private_data; 1522 as = filp->private_data;
1523 if (check_apm_user(as, "release")) 1523 if (check_apm_user(as, "release"))
@@ -1533,11 +1533,11 @@ static int do_release(struct inode * inode, struct file * filp)
1533 if (suspends_pending <= 0) 1533 if (suspends_pending <= 0)
1534 (void) suspend(1); 1534 (void) suspend(1);
1535 } 1535 }
1536 spin_lock(&user_list_lock); 1536 spin_lock(&user_list_lock);
1537 if (user_list == as) 1537 if (user_list == as)
1538 user_list = as->next; 1538 user_list = as->next;
1539 else { 1539 else {
1540 struct apm_user * as1; 1540 struct apm_user *as1;
1541 1541
1542 for (as1 = user_list; 1542 for (as1 = user_list;
1543 (as1 != NULL) && (as1->next != as); 1543 (as1 != NULL) && (as1->next != as);
@@ -1553,9 +1553,9 @@ static int do_release(struct inode * inode, struct file * filp)
1553 return 0; 1553 return 0;
1554} 1554}
1555 1555
1556static int do_open(struct inode * inode, struct file * filp) 1556static int do_open(struct inode *inode, struct file *filp)
1557{ 1557{
1558 struct apm_user * as; 1558 struct apm_user *as;
1559 1559
1560 as = kmalloc(sizeof(*as), GFP_KERNEL); 1560 as = kmalloc(sizeof(*as), GFP_KERNEL);
1561 if (as == NULL) { 1561 if (as == NULL) {
@@ -1569,7 +1569,7 @@ static int do_open(struct inode * inode, struct file * filp)
1569 as->suspends_read = as->standbys_read = 0; 1569 as->suspends_read = as->standbys_read = 0;
1570 /* 1570 /*
1571 * XXX - this is a tiny bit broken, when we consider BSD 1571 * XXX - this is a tiny bit broken, when we consider BSD
1572 * process accounting. If the device is opened by root, we 1572 * process accounting. If the device is opened by root, we
1573 * instantly flag that we used superuser privs. Who knows, 1573 * instantly flag that we used superuser privs. Who knows,
1574 * we might close the device immediately without doing a 1574 * we might close the device immediately without doing a
1575 * privileged operation -- cevans 1575 * privileged operation -- cevans
@@ -1652,16 +1652,16 @@ static int proc_apm_show(struct seq_file *m, void *v)
1652 8) min = minutes; sec = seconds */ 1652 8) min = minutes; sec = seconds */
1653 1653
1654 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", 1654 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
1655 driver_version, 1655 driver_version,
1656 (apm_info.bios.version >> 8) & 0xff, 1656 (apm_info.bios.version >> 8) & 0xff,
1657 apm_info.bios.version & 0xff, 1657 apm_info.bios.version & 0xff,
1658 apm_info.bios.flags, 1658 apm_info.bios.flags,
1659 ac_line_status, 1659 ac_line_status,
1660 battery_status, 1660 battery_status,
1661 battery_flag, 1661 battery_flag,
1662 percentage, 1662 percentage,
1663 time_units, 1663 time_units,
1664 units); 1664 units);
1665 return 0; 1665 return 0;
1666} 1666}
1667 1667
@@ -1684,8 +1684,8 @@ static int apm(void *unused)
1684 unsigned short cx; 1684 unsigned short cx;
1685 unsigned short dx; 1685 unsigned short dx;
1686 int error; 1686 int error;
1687 char * power_stat; 1687 char *power_stat;
1688 char * bat_stat; 1688 char *bat_stat;
1689 1689
1690#ifdef CONFIG_SMP 1690#ifdef CONFIG_SMP
1691 /* 2002/08/01 - WT 1691 /* 2002/08/01 - WT
@@ -1744,23 +1744,41 @@ static int apm(void *unused)
1744 } 1744 }
1745 } 1745 }
1746 1746
1747 if (debug && (num_online_cpus() == 1 || smp )) { 1747 if (debug && (num_online_cpus() == 1 || smp)) {
1748 error = apm_get_power_status(&bx, &cx, &dx); 1748 error = apm_get_power_status(&bx, &cx, &dx);
1749 if (error) 1749 if (error)
1750 printk(KERN_INFO "apm: power status not available\n"); 1750 printk(KERN_INFO "apm: power status not available\n");
1751 else { 1751 else {
1752 switch ((bx >> 8) & 0xff) { 1752 switch ((bx >> 8) & 0xff) {
1753 case 0: power_stat = "off line"; break; 1753 case 0:
1754 case 1: power_stat = "on line"; break; 1754 power_stat = "off line";
1755 case 2: power_stat = "on backup power"; break; 1755 break;
1756 default: power_stat = "unknown"; break; 1756 case 1:
1757 power_stat = "on line";
1758 break;
1759 case 2:
1760 power_stat = "on backup power";
1761 break;
1762 default:
1763 power_stat = "unknown";
1764 break;
1757 } 1765 }
1758 switch (bx & 0xff) { 1766 switch (bx & 0xff) {
1759 case 0: bat_stat = "high"; break; 1767 case 0:
1760 case 1: bat_stat = "low"; break; 1768 bat_stat = "high";
1761 case 2: bat_stat = "critical"; break; 1769 break;
1762 case 3: bat_stat = "charging"; break; 1770 case 1:
1763 default: bat_stat = "unknown"; break; 1771 bat_stat = "low";
1772 break;
1773 case 2:
1774 bat_stat = "critical";
1775 break;
1776 case 3:
1777 bat_stat = "charging";
1778 break;
1779 default:
1780 bat_stat = "unknown";
1781 break;
1764 } 1782 }
1765 printk(KERN_INFO 1783 printk(KERN_INFO
1766 "apm: AC %s, battery status %s, battery life ", 1784 "apm: AC %s, battery status %s, battery life ",
@@ -1777,8 +1795,8 @@ static int apm(void *unused)
1777 printk("unknown\n"); 1795 printk("unknown\n");
1778 else 1796 else
1779 printk("%d %s\n", dx & 0x7fff, 1797 printk("%d %s\n", dx & 0x7fff,
1780 (dx & 0x8000) ? 1798 (dx & 0x8000) ?
1781 "minutes" : "seconds"); 1799 "minutes" : "seconds");
1782 } 1800 }
1783 } 1801 }
1784 } 1802 }
@@ -1803,7 +1821,7 @@ static int apm(void *unused)
1803#ifndef MODULE 1821#ifndef MODULE
1804static int __init apm_setup(char *str) 1822static int __init apm_setup(char *str)
1805{ 1823{
1806 int invert; 1824 int invert;
1807 1825
1808 while ((str != NULL) && (*str != '\0')) { 1826 while ((str != NULL) && (*str != '\0')) {
1809 if (strncmp(str, "off", 3) == 0) 1827 if (strncmp(str, "off", 3) == 0)
@@ -1828,14 +1846,13 @@ static int __init apm_setup(char *str)
1828 if ((strncmp(str, "power-off", 9) == 0) || 1846 if ((strncmp(str, "power-off", 9) == 0) ||
1829 (strncmp(str, "power_off", 9) == 0)) 1847 (strncmp(str, "power_off", 9) == 0))
1830 power_off = !invert; 1848 power_off = !invert;
1831 if (strncmp(str, "smp", 3) == 0) 1849 if (strncmp(str, "smp", 3) == 0) {
1832 {
1833 smp = !invert; 1850 smp = !invert;
1834 idle_threshold = 100; 1851 idle_threshold = 100;
1835 } 1852 }
1836 if ((strncmp(str, "allow-ints", 10) == 0) || 1853 if ((strncmp(str, "allow-ints", 10) == 0) ||
1837 (strncmp(str, "allow_ints", 10) == 0)) 1854 (strncmp(str, "allow_ints", 10) == 0))
1838 apm_info.allow_ints = !invert; 1855 apm_info.allow_ints = !invert;
1839 if ((strncmp(str, "broken-psr", 10) == 0) || 1856 if ((strncmp(str, "broken-psr", 10) == 0) ||
1840 (strncmp(str, "broken_psr", 10) == 0)) 1857 (strncmp(str, "broken_psr", 10) == 0))
1841 apm_info.get_power_status_broken = !invert; 1858 apm_info.get_power_status_broken = !invert;
@@ -1881,7 +1898,8 @@ static int __init print_if_true(const struct dmi_system_id *d)
1881 */ 1898 */
1882static int __init broken_ps2_resume(const struct dmi_system_id *d) 1899static int __init broken_ps2_resume(const struct dmi_system_id *d)
1883{ 1900{
1884 printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident); 1901 printk(KERN_INFO "%s machine detected. Mousepad Resume Bug "
1902 "workaround hopefully not needed.\n", d->ident);
1885 return 0; 1903 return 0;
1886} 1904}
1887 1905
@@ -1890,7 +1908,8 @@ static int __init set_realmode_power_off(const struct dmi_system_id *d)
1890{ 1908{
1891 if (apm_info.realmode_power_off == 0) { 1909 if (apm_info.realmode_power_off == 0) {
1892 apm_info.realmode_power_off = 1; 1910 apm_info.realmode_power_off = 1;
1893 printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident); 1911 printk(KERN_INFO "%s bios detected. "
1912 "Using realmode poweroff only.\n", d->ident);
1894 } 1913 }
1895 return 0; 1914 return 0;
1896} 1915}
@@ -1900,7 +1919,8 @@ static int __init set_apm_ints(const struct dmi_system_id *d)
1900{ 1919{
1901 if (apm_info.allow_ints == 0) { 1920 if (apm_info.allow_ints == 0) {
1902 apm_info.allow_ints = 1; 1921 apm_info.allow_ints = 1;
1903 printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident); 1922 printk(KERN_INFO "%s machine detected. "
1923 "Enabling interrupts during APM calls.\n", d->ident);
1904 } 1924 }
1905 return 0; 1925 return 0;
1906} 1926}
@@ -1910,7 +1930,8 @@ static int __init apm_is_horked(const struct dmi_system_id *d)
1910{ 1930{
1911 if (apm_info.disabled == 0) { 1931 if (apm_info.disabled == 0) {
1912 apm_info.disabled = 1; 1932 apm_info.disabled = 1;
1913 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); 1933 printk(KERN_INFO "%s machine detected. "
1934 "Disabling APM.\n", d->ident);
1914 } 1935 }
1915 return 0; 1936 return 0;
1916} 1937}
@@ -1919,7 +1940,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1919{ 1940{
1920 if (apm_info.disabled == 0) { 1941 if (apm_info.disabled == 0) {
1921 apm_info.disabled = 1; 1942 apm_info.disabled = 1;
1922 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); 1943 printk(KERN_INFO "%s machine detected. "
1944 "Disabling APM.\n", d->ident);
1923 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); 1945 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
1924 printk(KERN_INFO "download from support.intel.com \n"); 1946 printk(KERN_INFO "download from support.intel.com \n");
1925 } 1947 }
@@ -1931,7 +1953,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d)
1931{ 1953{
1932 if (apm_info.forbid_idle == 0) { 1954 if (apm_info.forbid_idle == 0) {
1933 apm_info.forbid_idle = 1; 1955 apm_info.forbid_idle = 1;
1934 printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident); 1956 printk(KERN_INFO "%s machine detected. "
1957 "Disabling APM idle calls.\n", d->ident);
1935 } 1958 }
1936 return 0; 1959 return 0;
1937} 1960}
@@ -1954,7 +1977,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d)
1954static int __init broken_apm_power(const struct dmi_system_id *d) 1977static int __init broken_apm_power(const struct dmi_system_id *d)
1955{ 1978{
1956 apm_info.get_power_status_broken = 1; 1979 apm_info.get_power_status_broken = 1;
1957 printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n"); 1980 printk(KERN_WARNING "BIOS strings suggest APM bugs, "
1981 "disabling power status reporting.\n");
1958 return 0; 1982 return 0;
1959} 1983}
1960 1984
@@ -1965,7 +1989,8 @@ static int __init broken_apm_power(const struct dmi_system_id *d)
1965static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d) 1989static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
1966{ 1990{
1967 apm_info.get_power_status_swabinminutes = 1; 1991 apm_info.get_power_status_swabinminutes = 1;
1968 printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n"); 1992 printk(KERN_WARNING "BIOS strings suggest APM reports battery life "
1993 "in minutes and wrong byte order.\n");
1969 return 0; 1994 return 0;
1970} 1995}
1971 1996
@@ -1990,8 +2015,8 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
1990 apm_is_horked, "Dell Inspiron 2500", 2015 apm_is_horked, "Dell Inspiron 2500",
1991 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), 2016 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1992 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), 2017 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
1993 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), 2018 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
1994 DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, 2019 DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
1995 }, 2020 },
1996 { /* Allow interrupts during suspend on Dell Inspiron laptops*/ 2021 { /* Allow interrupts during suspend on Dell Inspiron laptops*/
1997 set_apm_ints, "Dell Inspiron", { 2022 set_apm_ints, "Dell Inspiron", {
@@ -2014,15 +2039,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
2014 apm_is_horked, "Dell Dimension 4100", 2039 apm_is_horked, "Dell Dimension 4100",
2015 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), 2040 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2016 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), 2041 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
2017 DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."), 2042 DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2018 DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, 2043 DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
2019 }, 2044 },
2020 { /* Allow interrupts during suspend on Compaq Laptops*/ 2045 { /* Allow interrupts during suspend on Compaq Laptops*/
2021 set_apm_ints, "Compaq 12XL125", 2046 set_apm_ints, "Compaq 12XL125",
2022 { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), 2047 { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
2023 DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), 2048 DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
2024 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), 2049 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2025 DMI_MATCH(DMI_BIOS_VERSION,"4.06"), }, 2050 DMI_MATCH(DMI_BIOS_VERSION, "4.06"), },
2026 }, 2051 },
2027 { /* Allow interrupts during APM or the clock goes slow */ 2052 { /* Allow interrupts during APM or the clock goes slow */
2028 set_apm_ints, "ASUSTeK", 2053 set_apm_ints, "ASUSTeK",
@@ -2064,15 +2089,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
2064 apm_is_horked, "Sharp PC-PJ/AX", 2089 apm_is_horked, "Sharp PC-PJ/AX",
2065 { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), 2090 { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
2066 DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), 2091 DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
2067 DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"), 2092 DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"),
2068 DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), }, 2093 DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), },
2069 }, 2094 },
2070 { /* APM crashes */ 2095 { /* APM crashes */
2071 apm_is_horked, "Dell Inspiron 2500", 2096 apm_is_horked, "Dell Inspiron 2500",
2072 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), 2097 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2073 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), 2098 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
2074 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), 2099 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2075 DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, 2100 DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
2076 }, 2101 },
2077 { /* APM idle hangs */ 2102 { /* APM idle hangs */
2078 apm_likes_to_melt, "Jabil AMD", 2103 apm_likes_to_melt, "Jabil AMD",
@@ -2203,11 +2228,11 @@ static int __init apm_init(void)
2203 return -ENODEV; 2228 return -ENODEV;
2204 } 2229 }
2205 printk(KERN_INFO 2230 printk(KERN_INFO
2206 "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", 2231 "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
2207 ((apm_info.bios.version >> 8) & 0xff), 2232 ((apm_info.bios.version >> 8) & 0xff),
2208 (apm_info.bios.version & 0xff), 2233 (apm_info.bios.version & 0xff),
2209 apm_info.bios.flags, 2234 apm_info.bios.flags,
2210 driver_version); 2235 driver_version);
2211 if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { 2236 if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
2212 printk(KERN_INFO "apm: no 32 bit BIOS support\n"); 2237 printk(KERN_INFO "apm: no 32 bit BIOS support\n");
2213 return -ENODEV; 2238 return -ENODEV;
@@ -2256,14 +2281,12 @@ static int __init apm_init(void)
2256 apm_info.disabled = 1; 2281 apm_info.disabled = 1;
2257 return -ENODEV; 2282 return -ENODEV;
2258 } 2283 }
2259 if (PM_IS_ACTIVE()) { 2284 if (pm_flags & PM_ACPI) {
2260 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2285 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2261 apm_info.disabled = 1; 2286 apm_info.disabled = 1;
2262 return -ENODEV; 2287 return -ENODEV;
2263 } 2288 }
2264#ifdef CONFIG_PM_LEGACY 2289 pm_flags |= PM_APM;
2265 pm_active = 1;
2266#endif
2267 2290
2268 /* 2291 /*
2269 * Set up a segment that references the real mode segment 0x40 2292 * Set up a segment that references the real mode segment 0x40
@@ -2314,9 +2337,9 @@ static int __init apm_init(void)
2314 } 2337 }
2315 wake_up_process(kapmd_task); 2338 wake_up_process(kapmd_task);
2316 2339
2317 if (num_online_cpus() > 1 && !smp ) { 2340 if (num_online_cpus() > 1 && !smp) {
2318 printk(KERN_NOTICE 2341 printk(KERN_NOTICE
2319 "apm: disabled - APM is not SMP safe (power off active).\n"); 2342 "apm: disabled - APM is not SMP safe (power off active).\n");
2320 return 0; 2343 return 0;
2321 } 2344 }
2322 2345
@@ -2341,7 +2364,7 @@ static int __init apm_init(void)
2341 2364
2342static void __exit apm_exit(void) 2365static void __exit apm_exit(void)
2343{ 2366{
2344 int error; 2367 int error;
2345 2368
2346 if (set_pm_idle) { 2369 if (set_pm_idle) {
2347 pm_idle = original_pm_idle; 2370 pm_idle = original_pm_idle;
@@ -2366,9 +2389,7 @@ static void __exit apm_exit(void)
2366 kthread_stop(kapmd_task); 2389 kthread_stop(kapmd_task);
2367 kapmd_task = NULL; 2390 kapmd_task = NULL;
2368 } 2391 }
2369#ifdef CONFIG_PM_LEGACY 2392 pm_flags &= ~PM_APM;
2370 pm_active = 0;
2371#endif
2372} 2393}
2373 2394
2374module_init(apm_init); 2395module_init(apm_init);
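
The apm_32.c hunks above drop the CONFIG_PM_LEGACY pm_active flag in favour of the pm_flags bitmask, so APM and ACPI now arbitrate ownership of power management explicitly. A minimal sketch of that claim/release pattern, with the PM_APM/PM_ACPI bit values assumed for illustration rather than copied from <linux/pm.h>:

	/* Sketch only: bit values are assumptions, not the real header. */
	#define PM_APM   1
	#define PM_ACPI  2

	static unsigned int pm_flags;

	static int apm_claim_pm(void)
	{
		if (pm_flags & PM_ACPI)		/* ACPI already owns PM */
			return -1;		/* driver would return -ENODEV */
		pm_flags |= PM_APM;		/* APM is now the active scheme */
		return 0;
	}

	static void apm_release_pm(void)
	{
		pm_flags &= ~PM_APM;		/* let another scheme take over */
	}

apm_init() and apm_exit() above follow exactly this shape: bail out when PM_ACPI is set, otherwise set PM_APM on init and clear it on exit.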
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 0e45981b2dd..afd84463b71 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -38,15 +38,15 @@ void foo(void);
38 38
39void foo(void) 39void foo(void)
40{ 40{
41 OFFSET(SIGCONTEXT_eax, sigcontext, eax); 41 OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
42 OFFSET(SIGCONTEXT_ebx, sigcontext, ebx); 42 OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
43 OFFSET(SIGCONTEXT_ecx, sigcontext, ecx); 43 OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
44 OFFSET(SIGCONTEXT_edx, sigcontext, edx); 44 OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
45 OFFSET(SIGCONTEXT_esi, sigcontext, esi); 45 OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
46 OFFSET(SIGCONTEXT_edi, sigcontext, edi); 46 OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
47 OFFSET(SIGCONTEXT_ebp, sigcontext, ebp); 47 OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
48 OFFSET(SIGCONTEXT_esp, sigcontext, esp); 48 OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
49 OFFSET(SIGCONTEXT_eip, sigcontext, eip); 49 OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
50 BLANK(); 50 BLANK();
51 51
52 OFFSET(CPUINFO_x86, cpuinfo_x86, x86); 52 OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
@@ -70,39 +70,38 @@ void foo(void)
70 OFFSET(TI_cpu, thread_info, cpu); 70 OFFSET(TI_cpu, thread_info, cpu);
71 BLANK(); 71 BLANK();
72 72
73 OFFSET(GDS_size, Xgt_desc_struct, size); 73 OFFSET(GDS_size, desc_ptr, size);
74 OFFSET(GDS_address, Xgt_desc_struct, address); 74 OFFSET(GDS_address, desc_ptr, address);
75 OFFSET(GDS_pad, Xgt_desc_struct, pad);
76 BLANK(); 75 BLANK();
77 76
78 OFFSET(PT_EBX, pt_regs, ebx); 77 OFFSET(PT_EBX, pt_regs, bx);
79 OFFSET(PT_ECX, pt_regs, ecx); 78 OFFSET(PT_ECX, pt_regs, cx);
80 OFFSET(PT_EDX, pt_regs, edx); 79 OFFSET(PT_EDX, pt_regs, dx);
81 OFFSET(PT_ESI, pt_regs, esi); 80 OFFSET(PT_ESI, pt_regs, si);
82 OFFSET(PT_EDI, pt_regs, edi); 81 OFFSET(PT_EDI, pt_regs, di);
83 OFFSET(PT_EBP, pt_regs, ebp); 82 OFFSET(PT_EBP, pt_regs, bp);
84 OFFSET(PT_EAX, pt_regs, eax); 83 OFFSET(PT_EAX, pt_regs, ax);
85 OFFSET(PT_DS, pt_regs, xds); 84 OFFSET(PT_DS, pt_regs, ds);
86 OFFSET(PT_ES, pt_regs, xes); 85 OFFSET(PT_ES, pt_regs, es);
87 OFFSET(PT_FS, pt_regs, xfs); 86 OFFSET(PT_FS, pt_regs, fs);
88 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); 87 OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
89 OFFSET(PT_EIP, pt_regs, eip); 88 OFFSET(PT_EIP, pt_regs, ip);
90 OFFSET(PT_CS, pt_regs, xcs); 89 OFFSET(PT_CS, pt_regs, cs);
91 OFFSET(PT_EFLAGS, pt_regs, eflags); 90 OFFSET(PT_EFLAGS, pt_regs, flags);
92 OFFSET(PT_OLDESP, pt_regs, esp); 91 OFFSET(PT_OLDESP, pt_regs, sp);
93 OFFSET(PT_OLDSS, pt_regs, xss); 92 OFFSET(PT_OLDSS, pt_regs, ss);
94 BLANK(); 93 BLANK();
95 94
96 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); 95 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
97 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 96 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
98 BLANK(); 97 BLANK();
99 98
100 OFFSET(pbe_address, pbe, address); 99 OFFSET(pbe_address, pbe, address);
101 OFFSET(pbe_orig_address, pbe, orig_address); 100 OFFSET(pbe_orig_address, pbe, orig_address);
102 OFFSET(pbe_next, pbe, next); 101 OFFSET(pbe_next, pbe, next);
103 102
104 /* Offset from the sysenter stack to tss.esp0 */ 103 /* Offset from the sysenter stack to tss.sp0 */
105 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - 104 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
106 sizeof(struct tss_struct)); 105 sizeof(struct tss_struct));
107 106
108 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 107 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
@@ -111,8 +110,6 @@ void foo(void)
111 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); 110 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
112 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); 111 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
113 112
114 DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
115
116 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 113 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
117 114
118#ifdef CONFIG_PARAVIRT 115#ifdef CONFIG_PARAVIRT
@@ -123,7 +120,7 @@ void foo(void)
123 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); 120 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
124 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); 121 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
125 OFFSET(PV_CPU_iret, pv_cpu_ops, iret); 122 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
126 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); 123 OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
127 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); 124 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
128#endif 125#endif
129 126
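
The renames above (SIGCONTEXT_eax becoming IA32_SIGCONTEXT_ax, PT_EBX now pointing at pt_regs.bx, and so on) work because these assembler constants are generated from the C structures at build time. A small standalone illustration of the asm-offsets technique; the macro bodies paraphrase the kernel's, and regs_example is a hypothetical stand-in structure:

	#include <stddef.h>

	/* Emit "->SYM value" markers into the compiler's assembly output;
	 * a build-time script scrapes them into asm-offsets.h. */
	#define DEFINE(sym, val) \
		asm volatile("\n->" #sym " %0 " #val : : "i" (val))
	#define OFFSET(sym, str, mem)	DEFINE(sym, offsetof(struct str, mem))
	#define BLANK()			asm volatile("\n->" : :)

	struct regs_example { long bx, cx, dx; };	/* hypothetical */

	void foo(void)
	{
		OFFSET(PT_EBX, regs_example, bx);	/* constant tracks the C field */
		OFFSET(PT_ECX, regs_example, cx);
		BLANK();
	}

Because the constants are derived from offsetof(), renaming a pt_regs field only requires updating this file; the .S consumers keep using the generated names.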
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d1b6ed98774..494e1e096ee 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -38,7 +38,6 @@ int main(void)
38#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) 38#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
39 ENTRY(state); 39 ENTRY(state);
40 ENTRY(flags); 40 ENTRY(flags);
41 ENTRY(thread);
42 ENTRY(pid); 41 ENTRY(pid);
43 BLANK(); 42 BLANK();
44#undef ENTRY 43#undef ENTRY
@@ -47,6 +46,9 @@ int main(void)
47 ENTRY(addr_limit); 46 ENTRY(addr_limit);
48 ENTRY(preempt_count); 47 ENTRY(preempt_count);
49 ENTRY(status); 48 ENTRY(status);
49#ifdef CONFIG_IA32_EMULATION
50 ENTRY(sysenter_return);
51#endif
50 BLANK(); 52 BLANK();
51#undef ENTRY 53#undef ENTRY
52#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) 54#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
@@ -59,17 +61,31 @@ int main(void)
59 ENTRY(data_offset); 61 ENTRY(data_offset);
60 BLANK(); 62 BLANK();
61#undef ENTRY 63#undef ENTRY
64#ifdef CONFIG_PARAVIRT
65 BLANK();
66 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
67 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
68 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
69 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
70 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
71 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
72 OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
73 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
74 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
75#endif
76
77
62#ifdef CONFIG_IA32_EMULATION 78#ifdef CONFIG_IA32_EMULATION
63#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) 79#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
64 ENTRY(eax); 80 ENTRY(ax);
65 ENTRY(ebx); 81 ENTRY(bx);
66 ENTRY(ecx); 82 ENTRY(cx);
67 ENTRY(edx); 83 ENTRY(dx);
68 ENTRY(esi); 84 ENTRY(si);
69 ENTRY(edi); 85 ENTRY(di);
70 ENTRY(ebp); 86 ENTRY(bp);
71 ENTRY(esp); 87 ENTRY(sp);
72 ENTRY(eip); 88 ENTRY(ip);
73 BLANK(); 89 BLANK();
74#undef ENTRY 90#undef ENTRY
75 DEFINE(IA32_RT_SIGFRAME_sigcontext, 91 DEFINE(IA32_RT_SIGFRAME_sigcontext,
@@ -81,14 +97,14 @@ int main(void)
81 DEFINE(pbe_next, offsetof(struct pbe, next)); 97 DEFINE(pbe_next, offsetof(struct pbe, next));
82 BLANK(); 98 BLANK();
83#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) 99#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
84 ENTRY(rbx); 100 ENTRY(bx);
85 ENTRY(rbx); 101 ENTRY(bx);
86 ENTRY(rcx); 102 ENTRY(cx);
87 ENTRY(rdx); 103 ENTRY(dx);
88 ENTRY(rsp); 104 ENTRY(sp);
89 ENTRY(rbp); 105 ENTRY(bp);
90 ENTRY(rsi); 106 ENTRY(si);
91 ENTRY(rdi); 107 ENTRY(di);
92 ENTRY(r8); 108 ENTRY(r8);
93 ENTRY(r9); 109 ENTRY(r9);
94 ENTRY(r10); 110 ENTRY(r10);
@@ -97,7 +113,7 @@ int main(void)
97 ENTRY(r13); 113 ENTRY(r13);
98 ENTRY(r14); 114 ENTRY(r14);
99 ENTRY(r15); 115 ENTRY(r15);
100 ENTRY(eflags); 116 ENTRY(flags);
101 BLANK(); 117 BLANK();
102#undef ENTRY 118#undef ENTRY
103#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) 119#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry))
@@ -108,7 +124,7 @@ int main(void)
108 ENTRY(cr8); 124 ENTRY(cr8);
109 BLANK(); 125 BLANK();
110#undef ENTRY 126#undef ENTRY
111 DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); 127 DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
112 BLANK(); 128 BLANK();
113 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); 129 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
114 BLANK(); 130 BLANK();
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 0b9860530a6..30f25a75fe2 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -1,8 +1,6 @@
1/* 1/*
2 * Implement 'Simple Boot Flag Specification 2.0' 2 * Implement 'Simple Boot Flag Specification 2.0'
3 */ 3 */
4
5
6#include <linux/types.h> 4#include <linux/types.h>
7#include <linux/kernel.h> 5#include <linux/kernel.h>
8#include <linux/init.h> 6#include <linux/init.h>
@@ -14,40 +12,38 @@
14 12
15#include <linux/mc146818rtc.h> 13#include <linux/mc146818rtc.h>
16 14
17
18#define SBF_RESERVED (0x78) 15#define SBF_RESERVED (0x78)
19#define SBF_PNPOS (1<<0) 16#define SBF_PNPOS (1<<0)
20#define SBF_BOOTING (1<<1) 17#define SBF_BOOTING (1<<1)
21#define SBF_DIAG (1<<2) 18#define SBF_DIAG (1<<2)
22#define SBF_PARITY (1<<7) 19#define SBF_PARITY (1<<7)
23 20
24
25int sbf_port __initdata = -1; /* set via acpi_boot_init() */ 21int sbf_port __initdata = -1; /* set via acpi_boot_init() */
26 22
27
28static int __init parity(u8 v) 23static int __init parity(u8 v)
29{ 24{
30 int x = 0; 25 int x = 0;
31 int i; 26 int i;
32 27
33 for(i=0;i<8;i++) 28 for (i = 0; i < 8; i++) {
34 { 29 x ^= (v & 1);
35 x^=(v&1); 30 v >>= 1;
36 v>>=1;
37 } 31 }
32
38 return x; 33 return x;
39} 34}
40 35
41static void __init sbf_write(u8 v) 36static void __init sbf_write(u8 v)
42{ 37{
43 unsigned long flags; 38 unsigned long flags;
44 if(sbf_port != -1) 39
45 { 40 if (sbf_port != -1) {
46 v &= ~SBF_PARITY; 41 v &= ~SBF_PARITY;
47 if(!parity(v)) 42 if (!parity(v))
48 v|=SBF_PARITY; 43 v |= SBF_PARITY;
49 44
50 printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); 45 printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
46 sbf_port, v);
51 47
52 spin_lock_irqsave(&rtc_lock, flags); 48 spin_lock_irqsave(&rtc_lock, flags);
53 CMOS_WRITE(v, sbf_port); 49 CMOS_WRITE(v, sbf_port);
@@ -57,33 +53,41 @@ static void __init sbf_write(u8 v)
57 53
58static u8 __init sbf_read(void) 54static u8 __init sbf_read(void)
59{ 55{
60 u8 v;
61 unsigned long flags; 56 unsigned long flags;
62 if(sbf_port == -1) 57 u8 v;
58
59 if (sbf_port == -1)
63 return 0; 60 return 0;
61
64 spin_lock_irqsave(&rtc_lock, flags); 62 spin_lock_irqsave(&rtc_lock, flags);
65 v = CMOS_READ(sbf_port); 63 v = CMOS_READ(sbf_port);
66 spin_unlock_irqrestore(&rtc_lock, flags); 64 spin_unlock_irqrestore(&rtc_lock, flags);
65
67 return v; 66 return v;
68} 67}
69 68
70static int __init sbf_value_valid(u8 v) 69static int __init sbf_value_valid(u8 v)
71{ 70{
72 if(v&SBF_RESERVED) /* Reserved bits */ 71 if (v & SBF_RESERVED) /* Reserved bits */
73 return 0; 72 return 0;
74 if(!parity(v)) 73 if (!parity(v))
75 return 0; 74 return 0;
75
76 return 1; 76 return 1;
77} 77}
78 78
79static int __init sbf_init(void) 79static int __init sbf_init(void)
80{ 80{
81 u8 v; 81 u8 v;
82 if(sbf_port == -1) 82
83 if (sbf_port == -1)
83 return 0; 84 return 0;
85
84 v = sbf_read(); 86 v = sbf_read();
85 if(!sbf_value_valid(v)) 87 if (!sbf_value_valid(v)) {
86 printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v); 88 printk(KERN_WARNING "Simple Boot Flag value 0x%x read from "
89 "CMOS RAM was invalid\n", v);
90 }
87 91
88 v &= ~SBF_RESERVED; 92 v &= ~SBF_RESERVED;
89 v &= ~SBF_BOOTING; 93 v &= ~SBF_BOOTING;
@@ -92,7 +96,7 @@ static int __init sbf_init(void)
92 v |= SBF_PNPOS; 96 v |= SBF_PNPOS;
93#endif 97#endif
94 sbf_write(v); 98 sbf_write(v);
99
95 return 0; 100 return 0;
96} 101}
97
98module_init(sbf_init); 102module_init(sbf_init);
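
The rewritten parity()/sbf_write() pair above still enforces the Simple Boot Flag rule that the byte stored in CMOS must have odd parity. A standalone user-space sketch of that check, for illustration only:

	#include <stdio.h>
	#include <stdint.h>

	#define SBF_PARITY	(1 << 7)

	static int parity(uint8_t v)
	{
		int x = 0, i;

		for (i = 0; i < 8; i++) {
			x ^= (v & 1);
			v >>= 1;
		}
		return x;
	}

	int main(void)
	{
		uint8_t v = 1 << 1;		/* e.g. the BOOTING bit */

		v &= ~SBF_PARITY;
		if (!parity(v))			/* force odd parity over all 8 bits */
			v |= SBF_PARITY;
		printf("byte written to CMOS: 0x%02x\n", v);
		return 0;
	}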
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c
index 9a189cef640..8f520f93ffd 100644
--- a/arch/x86/kernel/bugs_64.c
+++ b/arch/x86/kernel/bugs_64.c
@@ -13,7 +13,6 @@
13void __init check_bugs(void) 13void __init check_bugs(void)
14{ 14{
15 identify_cpu(&boot_cpu_data); 15 identify_cpu(&boot_cpu_data);
16 mtrr_bp_init();
17#if !defined(CONFIG_SMP) 16#if !defined(CONFIG_SMP)
18 printk("CPU: "); 17 printk("CPU: ");
19 print_cpu_info(&boot_cpu_data); 18 print_cpu_info(&boot_cpu_data);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index cfdb2f3bd76..a0c4d7c5dbd 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,6 +3,7 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += feature_names.o
6 7
7obj-$(CONFIG_X86_32) += common.o proc.o bugs.o 8obj-$(CONFIG_X86_32) += common.o proc.o bugs.o
8obj-$(CONFIG_X86_32) += amd.o 9obj-$(CONFIG_X86_32) += amd.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 3e91d3ee26e..238468ae199 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -45,6 +45,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
45 &regs[CR_ECX], &regs[CR_EDX]); 45 &regs[CR_ECX], &regs[CR_EDX]);
46 46
47 if (regs[cb->reg] & (1 << cb->bit)) 47 if (regs[cb->reg] & (1 << cb->bit))
48 set_bit(cb->feature, c->x86_capability); 48 set_cpu_cap(c, cb->feature);
49 } 49 }
50} 50}
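
set_cpu_cap() above replaces the open-coded set_bit() on x86_capability. Roughly, the wrapper is a thin macro along these lines (an approximation for orientation; the exact definition lives in <asm/cpufeature.h>):

	#define set_cpu_cap(c, bit)	set_bit(bit, (c)->x86_capability)
	#define clear_cpu_cap(c, bit)	clear_bit(bit, (c)->x86_capability)

Keeping capability updates behind one name lets them pair up with the cleared_cpu_caps bookkeeping introduced in common.c further down.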
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1ff88c7f45c..693e353999c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -63,6 +63,15 @@ static __cpuinit int amd_apic_timer_broken(void)
63 63
64int force_mwait __cpuinitdata; 64int force_mwait __cpuinitdata;
65 65
66void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
67{
68 if (cpuid_eax(0x80000000) >= 0x80000007) {
69 c->x86_power = cpuid_edx(0x80000007);
70 if (c->x86_power & (1<<8))
71 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
72 }
73}
74
66static void __cpuinit init_amd(struct cpuinfo_x86 *c) 75static void __cpuinit init_amd(struct cpuinfo_x86 *c)
67{ 76{
68 u32 l, h; 77 u32 l, h;
@@ -85,6 +94,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
85 } 94 }
86#endif 95#endif
87 96
97 early_init_amd(c);
98
88 /* 99 /*
89 * FIXME: We should handle the K5 here. Set up the write 100 * FIXME: We should handle the K5 here. Set up the write
90 * range and also turn on MSR 83 bits 4 and 31 (write alloc, 101 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
@@ -257,12 +268,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
257 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 268 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
258 } 269 }
259 270
260 if (cpuid_eax(0x80000000) >= 0x80000007) {
261 c->x86_power = cpuid_edx(0x80000007);
262 if (c->x86_power & (1<<8))
263 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
264 }
265
266#ifdef CONFIG_X86_HT 271#ifdef CONFIG_X86_HT
267 /* 272 /*
268 * On a AMD multi core setup the lower bits of the APIC id 273 * On a AMD multi core setup the lower bits of the APIC id
@@ -295,12 +300,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
295 local_apic_timer_disabled = 1; 300 local_apic_timer_disabled = 1;
296#endif 301#endif
297 302
298 if (c->x86 == 0x10 && !force_mwait)
299 clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
300
301 /* K6s reports MCEs but don't actually have all the MSRs */ 303 /* K6s reports MCEs but don't actually have all the MSRs */
302 if (c->x86 < 6) 304 if (c->x86 < 6)
303 clear_bit(X86_FEATURE_MCE, c->x86_capability); 305 clear_bit(X86_FEATURE_MCE, c->x86_capability);
306
307 if (cpu_has_xmm2)
308 set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
304} 309}
305 310
306static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) 311static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
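
The new X86_FEATURE_MFENCE_RDTSC bit set above (AMD CPUs with SSE2) records that RDTSC can be ordered with a cheap MFENCE instead of a fully serializing instruction. A hedged illustration of what code relying on that flag effectively does; this is not the kernel's actual helper:

	static inline unsigned long long ordered_rdtsc(void)
	{
		unsigned int lo, hi;

		/* MFENCE keeps earlier memory accesses from drifting past the read. */
		asm volatile("mfence; rdtsc" : "=a" (lo), "=d" (hi) : : "memory");
		return ((unsigned long long)hi << 32) | lo;
	}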
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 205fd5ba57f..9b95edcfc6a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -11,6 +11,7 @@
11#include <linux/utsname.h> 11#include <linux/utsname.h>
12#include <asm/bugs.h> 12#include <asm/bugs.h>
13#include <asm/processor.h> 13#include <asm/processor.h>
14#include <asm/processor-flags.h>
14#include <asm/i387.h> 15#include <asm/i387.h>
15#include <asm/msr.h> 16#include <asm/msr.h>
16#include <asm/paravirt.h> 17#include <asm/paravirt.h>
@@ -35,7 +36,7 @@ __setup("mca-pentium", mca_pentium);
35static int __init no_387(char *s) 36static int __init no_387(char *s)
36{ 37{
37 boot_cpu_data.hard_math = 0; 38 boot_cpu_data.hard_math = 0;
38 write_cr0(0xE | read_cr0()); 39 write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0());
39 return 1; 40 return 1;
40} 41}
41 42
@@ -153,7 +154,7 @@ static void __init check_config(void)
153 * If we configured ourselves for a TSC, we'd better have one! 154 * If we configured ourselves for a TSC, we'd better have one!
154 */ 155 */
155#ifdef CONFIG_X86_TSC 156#ifdef CONFIG_X86_TSC
156 if (!cpu_has_tsc && !tsc_disable) 157 if (!cpu_has_tsc)
157 panic("Kernel compiled for Pentium+, requires TSC feature!"); 158 panic("Kernel compiled for Pentium+, requires TSC feature!");
158#endif 159#endif
159 160
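
The no_387() change above swaps the magic 0xE for named CR0 bits; the arithmetic is easy to verify, since MP, EM and TS occupy bits 1 through 3:

	#include <stdio.h>

	#define X86_CR0_MP	0x00000002	/* monitor coprocessor */
	#define X86_CR0_EM	0x00000004	/* emulate FPU */
	#define X86_CR0_TS	0x00000008	/* task switched */

	int main(void)
	{
		printf("0x%x\n", X86_CR0_TS | X86_CR0_EM | X86_CR0_MP);	/* 0xe */
		return 0;
	}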
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2fcf2051bd..f86a3c4a266 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -22,43 +22,48 @@
22#include "cpu.h" 22#include "cpu.h"
23 23
24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
25 [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, 25 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
26 [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, 26 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, 27 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
28 [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, 28 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
29 /* 29 /*
30 * Segments used for calling PnP BIOS have byte granularity. 30 * Segments used for calling PnP BIOS have byte granularity.
31 * They code segments and data segments have fixed 64k limits, 31 * They code segments and data segments have fixed 64k limits,
32 * the transfer segment sizes are set at run time. 32 * the transfer segment sizes are set at run time.
33 */ 33 */
34 [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ 34 /* 32-bit code */
35 [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ 35 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
36 [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ 36 /* 16-bit code */
37 [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ 37 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
38 [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ 38 /* 16-bit data */
39 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
40 /* 16-bit data */
41 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
42 /* 16-bit data */
43 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
39 /* 44 /*
40 * The APM segments have byte granularity and their bases 45 * The APM segments have byte granularity and their bases
41 * are set at run time. All have 64k limits. 46 * are set at run time. All have 64k limits.
42 */ 47 */
43 [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ 48 /* 32-bit code */
49 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
44 /* 16-bit code */ 50 /* 16-bit code */
45 [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, 51 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
46 [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ 52 /* data */
53 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
47 54
48 [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, 55 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
49 [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, 56 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
50} }; 57} };
51EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 58EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
52 59
60__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
61
53static int cachesize_override __cpuinitdata = -1; 62static int cachesize_override __cpuinitdata = -1;
54static int disable_x86_fxsr __cpuinitdata;
55static int disable_x86_serial_nr __cpuinitdata = 1; 63static int disable_x86_serial_nr __cpuinitdata = 1;
56static int disable_x86_sep __cpuinitdata;
57 64
58struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; 65struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
59 66
60extern int disable_pse;
61
62static void __cpuinit default_init(struct cpuinfo_x86 * c) 67static void __cpuinit default_init(struct cpuinfo_x86 * c)
63{ 68{
64 /* Not much we can do here... */ 69 /* Not much we can do here... */
@@ -207,16 +212,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
207 212
208static int __init x86_fxsr_setup(char * s) 213static int __init x86_fxsr_setup(char * s)
209{ 214{
210 /* Tell all the other CPUs to not use it... */ 215 setup_clear_cpu_cap(X86_FEATURE_FXSR);
211 disable_x86_fxsr = 1; 216 setup_clear_cpu_cap(X86_FEATURE_XMM);
212
213 /*
214 * ... and clear the bits early in the boot_cpu_data
215 * so that the bootup process doesn't try to do this
216 * either.
217 */
218 clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
219 clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
220 return 1; 217 return 1;
221} 218}
222__setup("nofxsr", x86_fxsr_setup); 219__setup("nofxsr", x86_fxsr_setup);
@@ -224,7 +221,7 @@ __setup("nofxsr", x86_fxsr_setup);
224 221
225static int __init x86_sep_setup(char * s) 222static int __init x86_sep_setup(char * s)
226{ 223{
227 disable_x86_sep = 1; 224 setup_clear_cpu_cap(X86_FEATURE_SEP);
228 return 1; 225 return 1;
229} 226}
230__setup("nosep", x86_sep_setup); 227__setup("nosep", x86_sep_setup);
@@ -261,10 +258,10 @@ static int __cpuinit have_cpuid_p(void)
261void __init cpu_detect(struct cpuinfo_x86 *c) 258void __init cpu_detect(struct cpuinfo_x86 *c)
262{ 259{
263 /* Get vendor name */ 260 /* Get vendor name */
264 cpuid(0x00000000, &c->cpuid_level, 261 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
265 (int *)&c->x86_vendor_id[0], 262 (unsigned int *)&c->x86_vendor_id[0],
266 (int *)&c->x86_vendor_id[8], 263 (unsigned int *)&c->x86_vendor_id[8],
267 (int *)&c->x86_vendor_id[4]); 264 (unsigned int *)&c->x86_vendor_id[4]);
268 265
269 c->x86 = 4; 266 c->x86 = 4;
270 if (c->cpuid_level >= 0x00000001) { 267 if (c->cpuid_level >= 0x00000001) {
@@ -277,10 +274,39 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
277 if (c->x86 >= 0x6) 274 if (c->x86 >= 0x6)
278 c->x86_model += ((tfms >> 16) & 0xF) << 4; 275 c->x86_model += ((tfms >> 16) & 0xF) << 4;
279 c->x86_mask = tfms & 15; 276 c->x86_mask = tfms & 15;
280 if (cap0 & (1<<19)) 277 if (cap0 & (1<<19)) {
281 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; 278 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
279 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
280 }
282 } 281 }
283} 282}
283static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
284{
285 u32 tfms, xlvl;
286 unsigned int ebx;
287
288 memset(&c->x86_capability, 0, sizeof c->x86_capability);
289 if (have_cpuid_p()) {
290 /* Intel-defined flags: level 0x00000001 */
291 if (c->cpuid_level >= 0x00000001) {
292 u32 capability, excap;
293 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
294 c->x86_capability[0] = capability;
295 c->x86_capability[4] = excap;
296 }
297
298 /* AMD-defined flags: level 0x80000001 */
299 xlvl = cpuid_eax(0x80000000);
300 if ((xlvl & 0xffff0000) == 0x80000000) {
301 if (xlvl >= 0x80000001) {
302 c->x86_capability[1] = cpuid_edx(0x80000001);
303 c->x86_capability[6] = cpuid_ecx(0x80000001);
304 }
305 }
306
307 }
308
309}
284 310
285/* Do minimum CPU detection early. 311/* Do minimum CPU detection early.
286 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. 312 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
@@ -293,6 +319,7 @@ static void __init early_cpu_detect(void)
293 struct cpuinfo_x86 *c = &boot_cpu_data; 319 struct cpuinfo_x86 *c = &boot_cpu_data;
294 320
295 c->x86_cache_alignment = 32; 321 c->x86_cache_alignment = 32;
322 c->x86_clflush_size = 32;
296 323
297 if (!have_cpuid_p()) 324 if (!have_cpuid_p())
298 return; 325 return;
@@ -300,19 +327,30 @@ static void __init early_cpu_detect(void)
300 cpu_detect(c); 327 cpu_detect(c);
301 328
302 get_cpu_vendor(c, 1); 329 get_cpu_vendor(c, 1);
330
331 switch (c->x86_vendor) {
332 case X86_VENDOR_AMD:
333 early_init_amd(c);
334 break;
335 case X86_VENDOR_INTEL:
336 early_init_intel(c);
337 break;
338 }
339
340 early_get_cap(c);
303} 341}
304 342
305static void __cpuinit generic_identify(struct cpuinfo_x86 * c) 343static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
306{ 344{
307 u32 tfms, xlvl; 345 u32 tfms, xlvl;
308 int ebx; 346 unsigned int ebx;
309 347
310 if (have_cpuid_p()) { 348 if (have_cpuid_p()) {
311 /* Get vendor name */ 349 /* Get vendor name */
312 cpuid(0x00000000, &c->cpuid_level, 350 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
313 (int *)&c->x86_vendor_id[0], 351 (unsigned int *)&c->x86_vendor_id[0],
314 (int *)&c->x86_vendor_id[8], 352 (unsigned int *)&c->x86_vendor_id[8],
315 (int *)&c->x86_vendor_id[4]); 353 (unsigned int *)&c->x86_vendor_id[4]);
316 354
317 get_cpu_vendor(c, 0); 355 get_cpu_vendor(c, 0);
318 /* Initialize the standard set of capabilities */ 356 /* Initialize the standard set of capabilities */
@@ -357,8 +395,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
357 init_scattered_cpuid_features(c); 395 init_scattered_cpuid_features(c);
358 } 396 }
359 397
360 early_intel_workaround(c);
361
362#ifdef CONFIG_X86_HT 398#ifdef CONFIG_X86_HT
363 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; 399 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
364#endif 400#endif
@@ -392,7 +428,7 @@ __setup("serialnumber", x86_serial_nr_setup);
392/* 428/*
393 * This does the hard work of actually picking apart the CPU stuff... 429 * This does the hard work of actually picking apart the CPU stuff...
394 */ 430 */
395static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) 431void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
396{ 432{
397 int i; 433 int i;
398 434
@@ -418,20 +454,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
418 454
419 generic_identify(c); 455 generic_identify(c);
420 456
421 printk(KERN_DEBUG "CPU: After generic identify, caps:"); 457 if (this_cpu->c_identify)
422 for (i = 0; i < NCAPINTS; i++)
423 printk(" %08lx", c->x86_capability[i]);
424 printk("\n");
425
426 if (this_cpu->c_identify) {
427 this_cpu->c_identify(c); 458 this_cpu->c_identify(c);
428 459
429 printk(KERN_DEBUG "CPU: After vendor identify, caps:");
430 for (i = 0; i < NCAPINTS; i++)
431 printk(" %08lx", c->x86_capability[i]);
432 printk("\n");
433 }
434
435 /* 460 /*
436 * Vendor-specific initialization. In this section we 461 * Vendor-specific initialization. In this section we
437 * canonicalize the feature flags, meaning if there are 462 * canonicalize the feature flags, meaning if there are
@@ -453,23 +478,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
453 * we do "generic changes." 478 * we do "generic changes."
454 */ 479 */
455 480
456 /* TSC disabled? */
457 if ( tsc_disable )
458 clear_bit(X86_FEATURE_TSC, c->x86_capability);
459
460 /* FXSR disabled? */
461 if (disable_x86_fxsr) {
462 clear_bit(X86_FEATURE_FXSR, c->x86_capability);
463 clear_bit(X86_FEATURE_XMM, c->x86_capability);
464 }
465
466 /* SEP disabled? */
467 if (disable_x86_sep)
468 clear_bit(X86_FEATURE_SEP, c->x86_capability);
469
470 if (disable_pse)
471 clear_bit(X86_FEATURE_PSE, c->x86_capability);
472
473 /* If the model name is still unset, do table lookup. */ 481 /* If the model name is still unset, do table lookup. */
474 if ( !c->x86_model_id[0] ) { 482 if ( !c->x86_model_id[0] ) {
475 char *p; 483 char *p;
@@ -482,13 +490,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
482 c->x86, c->x86_model); 490 c->x86, c->x86_model);
483 } 491 }
484 492
485 /* Now the feature flags better reflect actual CPU features! */
486
487 printk(KERN_DEBUG "CPU: After all inits, caps:");
488 for (i = 0; i < NCAPINTS; i++)
489 printk(" %08lx", c->x86_capability[i]);
490 printk("\n");
491
492 /* 493 /*
493 * On SMP, boot_cpu_data holds the common feature set between 494 * On SMP, boot_cpu_data holds the common feature set between
494 * all CPUs; so make sure that we indicate which features are 495 * all CPUs; so make sure that we indicate which features are
@@ -501,8 +502,14 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
501 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 502 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
502 } 503 }
503 504
505 /* Clear all flags overriden by options */
506 for (i = 0; i < NCAPINTS; i++)
507 c->x86_capability[i] ^= cleared_cpu_caps[i];
508
504 /* Init Machine Check Exception if available. */ 509 /* Init Machine Check Exception if available. */
505 mcheck_init(c); 510 mcheck_init(c);
511
512 select_idle_routine(c);
506} 513}
507 514
508void __init identify_boot_cpu(void) 515void __init identify_boot_cpu(void)
@@ -510,7 +517,6 @@ void __init identify_boot_cpu(void)
510 identify_cpu(&boot_cpu_data); 517 identify_cpu(&boot_cpu_data);
511 sysenter_setup(); 518 sysenter_setup();
512 enable_sep_cpu(); 519 enable_sep_cpu();
513 mtrr_bp_init();
514} 520}
515 521
516void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 522void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -567,6 +573,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
567} 573}
568#endif 574#endif
569 575
576static __init int setup_noclflush(char *arg)
577{
578 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
579 return 1;
580}
581__setup("noclflush", setup_noclflush);
582
570void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 583void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
571{ 584{
572 char *vendor = NULL; 585 char *vendor = NULL;
@@ -590,6 +603,17 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
590 printk("\n"); 603 printk("\n");
591} 604}
592 605
606static __init int setup_disablecpuid(char *arg)
607{
608 int bit;
609 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
610 setup_clear_cpu_cap(bit);
611 else
612 return 0;
613 return 1;
614}
615__setup("clearcpuid=", setup_disablecpuid);
616
593cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 617cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
594 618
595/* This is hacky. :) 619/* This is hacky. :)
@@ -599,16 +623,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
599 * They will insert themselves into the cpu_devs structure. 623 * They will insert themselves into the cpu_devs structure.
600 * Then, when cpu_init() is called, we can just iterate over that array. 624 * Then, when cpu_init() is called, we can just iterate over that array.
601 */ 625 */
602
603extern int intel_cpu_init(void);
604extern int cyrix_init_cpu(void);
605extern int nsc_init_cpu(void);
606extern int amd_init_cpu(void);
607extern int centaur_init_cpu(void);
608extern int transmeta_init_cpu(void);
609extern int nexgen_init_cpu(void);
610extern int umc_init_cpu(void);
611
612void __init early_cpu_init(void) 626void __init early_cpu_init(void)
613{ 627{
614 intel_cpu_init(); 628 intel_cpu_init();
@@ -620,21 +634,13 @@ void __init early_cpu_init(void)
620 nexgen_init_cpu(); 634 nexgen_init_cpu();
621 umc_init_cpu(); 635 umc_init_cpu();
622 early_cpu_detect(); 636 early_cpu_detect();
623
624#ifdef CONFIG_DEBUG_PAGEALLOC
625 /* pse is not compatible with on-the-fly unmapping,
626 * disable it even if the cpus claim to support it.
627 */
628 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
629 disable_pse = 1;
630#endif
631} 637}
632 638
633/* Make sure %fs is initialized properly in idle threads */ 639/* Make sure %fs is initialized properly in idle threads */
634struct pt_regs * __devinit idle_regs(struct pt_regs *regs) 640struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
635{ 641{
636 memset(regs, 0, sizeof(struct pt_regs)); 642 memset(regs, 0, sizeof(struct pt_regs));
637 regs->xfs = __KERNEL_PERCPU; 643 regs->fs = __KERNEL_PERCPU;
638 return regs; 644 return regs;
639} 645}
640 646
@@ -642,7 +648,7 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
642 * it's on the real one. */ 648 * it's on the real one. */
643void switch_to_new_gdt(void) 649void switch_to_new_gdt(void)
644{ 650{
645 struct Xgt_desc_struct gdt_descr; 651 struct desc_ptr gdt_descr;
646 652
647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); 653 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
648 gdt_descr.size = GDT_SIZE - 1; 654 gdt_descr.size = GDT_SIZE - 1;
@@ -672,12 +678,6 @@ void __cpuinit cpu_init(void)
672 678
673 if (cpu_has_vme || cpu_has_tsc || cpu_has_de) 679 if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
674 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 680 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
675 if (tsc_disable && cpu_has_tsc) {
676 printk(KERN_NOTICE "Disabling TSC...\n");
677 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
678 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
679 set_in_cr4(X86_CR4_TSD);
680 }
681 681
682 load_idt(&idt_descr); 682 load_idt(&idt_descr);
683 switch_to_new_gdt(); 683 switch_to_new_gdt();
@@ -691,7 +691,7 @@ void __cpuinit cpu_init(void)
691 BUG(); 691 BUG();
692 enter_lazy_tlb(&init_mm, curr); 692 enter_lazy_tlb(&init_mm, curr);
693 693
694 load_esp0(t, thread); 694 load_sp0(t, thread);
695 set_tss_desc(cpu,t); 695 set_tss_desc(cpu,t);
696 load_TR_desc(); 696 load_TR_desc();
697 load_LDT(&init_mm.context); 697 load_LDT(&init_mm.context);
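
The common.c rework above retires the ad-hoc disable_x86_fxsr/disable_x86_sep variables: "nofxsr", "nosep", "noclflush" and the new "clearcpuid=" option all funnel through setup_clear_cpu_cap() and the cleared_cpu_caps[] mask. A rough sketch of how the pair fits together; the real setup_clear_cpu_cap() is a macro in <asm/cpufeature.h>, and the plain masking below is equivalent to the XOR in the hunk above only because the recorded bits were set to begin with:

	/* record a capability bit that boot options want forced off */
	static void setup_clear_cpu_cap_sketch(int bit)
	{
		clear_bit(bit, boot_cpu_data.x86_capability);
		set_bit(bit, cleared_cpu_caps);
	}

	/* identify_cpu() then re-applies the recorded mask for each CPU */
	static void apply_cleared_caps(struct cpuinfo_x86 *c)
	{
		int i;

		for (i = 0; i < NCAPINTS; i++)
			c->x86_capability[i] &= ~cleared_cpu_caps[i];
	}

With the setup_disablecpuid() handler added above, booting with clearcpuid=<bit> forces that capability bit off before any CPU is identified.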
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 2f6432cef6f..e0b38c33d84 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -24,5 +24,15 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
24extern int get_model_name(struct cpuinfo_x86 *c); 24extern int get_model_name(struct cpuinfo_x86 *c);
25extern void display_cacheinfo(struct cpuinfo_x86 *c); 25extern void display_cacheinfo(struct cpuinfo_x86 *c);
26 26
27extern void early_intel_workaround(struct cpuinfo_x86 *c); 27extern void early_init_intel(struct cpuinfo_x86 *c);
28 28extern void early_init_amd(struct cpuinfo_x86 *c);
29
30/* Specific CPU type init functions */
31int intel_cpu_init(void);
32int amd_init_cpu(void);
33int cyrix_init_cpu(void);
34int nsc_init_cpu(void);
35int centaur_init_cpu(void);
36int transmeta_init_cpu(void);
37int nexgen_init_cpu(void);
38int umc_init_cpu(void);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index fea0af0476b..a962dcb9c40 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -67,7 +67,8 @@ struct acpi_cpufreq_data {
67 unsigned int cpu_feature; 67 unsigned int cpu_feature;
68}; 68};
69 69
70static struct acpi_cpufreq_data *drv_data[NR_CPUS]; 70static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
71
71/* acpi_perf_data is a pointer to percpu data. */ 72/* acpi_perf_data is a pointer to percpu data. */
72static struct acpi_processor_performance *acpi_perf_data; 73static struct acpi_processor_performance *acpi_perf_data;
73 74
@@ -218,14 +219,14 @@ static u32 get_cur_val(cpumask_t mask)
218 if (unlikely(cpus_empty(mask))) 219 if (unlikely(cpus_empty(mask)))
219 return 0; 220 return 0;
220 221
221 switch (drv_data[first_cpu(mask)]->cpu_feature) { 222 switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) {
222 case SYSTEM_INTEL_MSR_CAPABLE: 223 case SYSTEM_INTEL_MSR_CAPABLE:
223 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 224 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
224 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; 225 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
225 break; 226 break;
226 case SYSTEM_IO_CAPABLE: 227 case SYSTEM_IO_CAPABLE:
227 cmd.type = SYSTEM_IO_CAPABLE; 228 cmd.type = SYSTEM_IO_CAPABLE;
228 perf = drv_data[first_cpu(mask)]->acpi_data; 229 perf = per_cpu(drv_data, first_cpu(mask))->acpi_data;
229 cmd.addr.io.port = perf->control_register.address; 230 cmd.addr.io.port = perf->control_register.address;
230 cmd.addr.io.bit_width = perf->control_register.bit_width; 231 cmd.addr.io.bit_width = perf->control_register.bit_width;
231 break; 232 break;
@@ -325,7 +326,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
325 326
326#endif 327#endif
327 328
328 retval = drv_data[cpu]->max_freq * perf_percent / 100; 329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100;
329 330
330 put_cpu(); 331 put_cpu();
331 set_cpus_allowed(current, saved_mask); 332 set_cpus_allowed(current, saved_mask);
@@ -336,7 +337,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
336 337
337static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 338static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
338{ 339{
339 struct acpi_cpufreq_data *data = drv_data[cpu]; 340 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
340 unsigned int freq; 341 unsigned int freq;
341 342
342 dprintk("get_cur_freq_on_cpu (%d)\n", cpu); 343 dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
@@ -370,7 +371,7 @@ static unsigned int check_freqs(cpumask_t mask, unsigned int freq,
370static int acpi_cpufreq_target(struct cpufreq_policy *policy, 371static int acpi_cpufreq_target(struct cpufreq_policy *policy,
371 unsigned int target_freq, unsigned int relation) 372 unsigned int target_freq, unsigned int relation)
372{ 373{
373 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 374 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
374 struct acpi_processor_performance *perf; 375 struct acpi_processor_performance *perf;
375 struct cpufreq_freqs freqs; 376 struct cpufreq_freqs freqs;
376 cpumask_t online_policy_cpus; 377 cpumask_t online_policy_cpus;
@@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 467
467static int acpi_cpufreq_verify(struct cpufreq_policy *policy) 468static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
468{ 469{
469 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 470 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
470 471
471 dprintk("acpi_cpufreq_verify\n"); 472 dprintk("acpi_cpufreq_verify\n");
472 473
@@ -570,7 +571,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
570 return -ENOMEM; 571 return -ENOMEM;
571 572
572 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 573 data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
573 drv_data[cpu] = data; 574 per_cpu(drv_data, cpu) = data;
574 575
575 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 576 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
576 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; 577 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -714,20 +715,20 @@ err_unreg:
714 acpi_processor_unregister_performance(perf, cpu); 715 acpi_processor_unregister_performance(perf, cpu);
715err_free: 716err_free:
716 kfree(data); 717 kfree(data);
717 drv_data[cpu] = NULL; 718 per_cpu(drv_data, cpu) = NULL;
718 719
719 return result; 720 return result;
720} 721}
721 722
722static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) 723static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
723{ 724{
724 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 725 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
725 726
726 dprintk("acpi_cpufreq_cpu_exit\n"); 727 dprintk("acpi_cpufreq_cpu_exit\n");
727 728
728 if (data) { 729 if (data) {
729 cpufreq_frequency_table_put_attr(policy->cpu); 730 cpufreq_frequency_table_put_attr(policy->cpu);
730 drv_data[policy->cpu] = NULL; 731 per_cpu(drv_data, policy->cpu) = NULL;
731 acpi_processor_unregister_performance(data->acpi_data, 732 acpi_processor_unregister_performance(data->acpi_data,
732 policy->cpu); 733 policy->cpu);
733 kfree(data); 734 kfree(data);
@@ -738,7 +739,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
738 739
739static int acpi_cpufreq_resume(struct cpufreq_policy *policy) 740static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
740{ 741{
741 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 742 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
742 743
743 dprintk("acpi_cpufreq_resume\n"); 744 dprintk("acpi_cpufreq_resume\n");
744 745
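
The acpi-cpufreq hunks above convert an NR_CPUS-sized pointer array into a per-CPU variable, so storage scales with the CPUs actually present rather than the compile-time maximum. The access pattern changes mechanically from indexing to per_cpu(); a minimal sketch of the before/after shape, with surrounding driver code omitted:

	/* before: static struct acpi_cpufreq_data *drv_data[NR_CPUS]; */
	static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);

	static struct acpi_cpufreq_data *cpu_drv_data(unsigned int cpu)
	{
		return per_cpu(drv_data, cpu);	/* was: drv_data[cpu] */
	}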
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 749d00cb2eb..06fcce516d5 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -694,7 +694,7 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
694 if ( acpi_bus_get_device(obj_handle, &d) ) { 694 if ( acpi_bus_get_device(obj_handle, &d) ) {
695 return 0; 695 return 0;
696 } 696 }
697 *return_value = (void *)acpi_driver_data(d); 697 *return_value = acpi_driver_data(d);
698 return 1; 698 return 1;
699} 699}
700 700
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 99e1ef9939b..5affe91ca1e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -52,7 +52,7 @@
52/* serialize freq changes */ 52/* serialize freq changes */
53static DEFINE_MUTEX(fidvid_mutex); 53static DEFINE_MUTEX(fidvid_mutex);
54 54
55static struct powernow_k8_data *powernow_data[NR_CPUS]; 55static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
56 56
57static int cpu_family = CPU_OPTERON; 57static int cpu_family = CPU_OPTERON;
58 58
@@ -827,7 +827,6 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
827 827
828 for (i = 0; i < data->acpi_data.state_count; i++) { 828 for (i = 0; i < data->acpi_data.state_count; i++) {
829 u32 index; 829 u32 index;
830 u32 hi = 0, lo = 0;
831 830
832 index = data->acpi_data.states[i].control & HW_PSTATE_MASK; 831 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
833 if (index > data->max_hw_pstate) { 832 if (index > data->max_hw_pstate) {
@@ -1018,7 +1017,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1018static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) 1017static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
1019{ 1018{
1020 cpumask_t oldmask = CPU_MASK_ALL; 1019 cpumask_t oldmask = CPU_MASK_ALL;
1021 struct powernow_k8_data *data = powernow_data[pol->cpu]; 1020 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1022 u32 checkfid; 1021 u32 checkfid;
1023 u32 checkvid; 1022 u32 checkvid;
1024 unsigned int newstate; 1023 unsigned int newstate;
@@ -1094,7 +1093,7 @@ err_out:
1094/* Driver entry point to verify the policy and range of frequencies */ 1093/* Driver entry point to verify the policy and range of frequencies */
1095static int powernowk8_verify(struct cpufreq_policy *pol) 1094static int powernowk8_verify(struct cpufreq_policy *pol)
1096{ 1095{
1097 struct powernow_k8_data *data = powernow_data[pol->cpu]; 1096 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1098 1097
1099 if (!data) 1098 if (!data)
1100 return -EINVAL; 1099 return -EINVAL;
@@ -1202,7 +1201,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1202 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", 1201 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1203 data->currfid, data->currvid); 1202 data->currfid, data->currvid);
1204 1203
1205 powernow_data[pol->cpu] = data; 1204 per_cpu(powernow_data, pol->cpu) = data;
1206 1205
1207 return 0; 1206 return 0;
1208 1207
@@ -1216,7 +1215,7 @@ err_out:
1216 1215
1217static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol) 1216static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
1218{ 1217{
1219 struct powernow_k8_data *data = powernow_data[pol->cpu]; 1218 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1220 1219
1221 if (!data) 1220 if (!data)
1222 return -EINVAL; 1221 return -EINVAL;
@@ -1237,7 +1236,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
1237 cpumask_t oldmask = current->cpus_allowed; 1236 cpumask_t oldmask = current->cpus_allowed;
1238 unsigned int khz = 0; 1237 unsigned int khz = 0;
1239 1238
1240 data = powernow_data[first_cpu(per_cpu(cpu_core_map, cpu))]; 1239 data = per_cpu(powernow_data, first_cpu(per_cpu(cpu_core_map, cpu)));
1241 1240
1242 if (!data) 1241 if (!data)
1243 return -EINVAL; 1242 return -EINVAL;
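
powernow-k8.c gets the same per-CPU conversion, with one extra step in powernowk8_get(): the data is owned by the first CPU of the core map, so the lookup goes through that owner. A sketch of the two-step access used in the hunk above:

	static struct powernow_k8_data *core_data(unsigned int cpu)
	{
		/* siblings of a core share one powernow_k8_data instance */
		unsigned int owner = first_cpu(per_cpu(cpu_core_map, cpu));

		return per_cpu(powernow_data, owner);
	}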
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index 76c3ab0da46..98d4fdb7dc0 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -189,10 +189,7 @@ static unsigned int pentium4_get_frequency(void)
189 printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n"); 189 printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n");
190 190
191 /* Multiplier. */ 191 /* Multiplier. */
192 if (c->x86_model < 2) 192 mult = msr_lo >> 24;
193 mult = msr_lo >> 27;
194 else
195 mult = msr_lo >> 24;
196 193
197 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult)); 194 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult));
198 195
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 88d66fb8411..7139b026270 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -5,6 +5,7 @@
5#include <asm/dma.h> 5#include <asm/dma.h>
6#include <asm/io.h> 6#include <asm/io.h>
7#include <asm/processor-cyrix.h> 7#include <asm/processor-cyrix.h>
8#include <asm/processor-flags.h>
8#include <asm/timer.h> 9#include <asm/timer.h>
9#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
10#include <asm/tsc.h> 11#include <asm/tsc.h>
@@ -82,8 +83,6 @@ static char cyrix_model_mult2[] __cpuinitdata = "12233445";
82 * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP 83 * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
83 */ 84 */
84 85
85extern void calibrate_delay(void) __init;
86
87static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) 86static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
88{ 87{
89 unsigned long flags; 88 unsigned long flags;
@@ -126,15 +125,12 @@ static void __cpuinit set_cx86_reorder(void)
126 125
127static void __cpuinit set_cx86_memwb(void) 126static void __cpuinit set_cx86_memwb(void)
128{ 127{
129 u32 cr0;
130
131 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); 128 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
132 129
133 /* CCR2 bit 2: unlock NW bit */ 130 /* CCR2 bit 2: unlock NW bit */
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); 131 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
135 /* set 'Not Write-through' */ 132 /* set 'Not Write-through' */
136 cr0 = 0x20000000; 133 write_cr0(read_cr0() | X86_CR0_NW);
137 write_cr0(read_cr0() | cr0);
138 /* CCR2 bit 2: lock NW bit and set WT1 */ 134 /* CCR2 bit 2: lock NW bit and set WT1 */
139 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); 135 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
140} 136}
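
The set_cx86_memwb() change above replaces the magic constant 0x20000000 with the named flag X86_CR0_NW when setting the NW bit in CR0. A userspace sketch of the same read-modify-write pattern (CR0 cannot be touched from user mode, so a plain variable stands in for it; the initial value is invented):

	#include <stdio.h>

	#define X86_CR0_NW (1UL << 29)	/* 0x20000000, 'Not Write-through' */

	static unsigned long fake_cr0 = 0x80000011UL;	/* stand-in for the register */

	static unsigned long read_cr0(void) { return fake_cr0; }
	static void write_cr0(unsigned long val) { fake_cr0 = val; }

	int main(void)
	{
		write_cr0(read_cr0() | X86_CR0_NW);	/* set NW, leave other bits alone */
		printf("cr0 = 0x%lx\n", fake_cr0);
		return 0;
	}
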
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
new file mode 100644
index 00000000000..ee975ac6bbc
--- /dev/null
+++ b/arch/x86/kernel/cpu/feature_names.c
@@ -0,0 +1,83 @@
1/*
2 * Strings for the various x86 capability flags.
3 *
4 * This file must not contain any executable code.
5 */
6
7#include "asm/cpufeature.h"
8
9/*
10 * These flag bits must match the definitions in <asm/cpufeature.h>.
11 * NULL means this bit is undefined or reserved; either way it doesn't
12 * have meaning as far as Linux is concerned. Note that it's important
13 * to realize there is a difference between this table and CPUID -- if
14 * applications want to get the raw CPUID data, they should access
15 * /dev/cpu/<cpu_nr>/cpuid instead.
16 */
17const char * const x86_cap_flags[NCAPINTS*32] = {
18 /* Intel-defined */
19 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
20 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
21 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
22 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
23
24 /* AMD-defined */
25 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
26 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
27 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
28 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
29 "3dnowext", "3dnow",
30
31 /* Transmeta-defined */
32 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
33 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
34 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36
37 /* Other (Linux-defined) */
38 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
39 NULL, NULL, NULL, NULL,
40 "constant_tsc", "up", NULL, "arch_perfmon",
41 "pebs", "bts", NULL, NULL,
42 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
43 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
44
45 /* Intel-defined (#2) */
46 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
47 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
48 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
49 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
50
51 /* VIA/Cyrix/Centaur-defined */
52 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
53 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
54 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
55 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
56
57 /* AMD-defined (#2) */
58 "lahf_lm", "cmp_legacy", "svm", "extapic",
59 "cr8_legacy", "abm", "sse4a", "misalignsse",
60 "3dnowprefetch", "osvw", "ibs", "sse5",
61 "skinit", "wdt", NULL, NULL,
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64
65 /* Auxiliary (Linux-defined) */
66 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
67 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
69 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
70};
71
72const char *const x86_power_flags[32] = {
73 "ts", /* temperature sensor */
74 "fid", /* frequency id control */
75 "vid", /* voltage id control */
76 "ttp", /* thermal trip */
77 "tm",
78 "stc",
79 "100mhzsteps",
80 "hwpstate",
81 "", /* tsc invariant mapped to constant_tsc */
82 /* nothing */
83};
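
The new feature_names.c keeps the capability names in one flat array of NCAPINTS*32 strings, so a flag is looked up as word*32 + bit, with NULL marking reserved bits. A toy sketch of that indexing scheme (the two-word table below is illustrative, not the real data):

	#include <stdio.h>

	#define DEMO_NCAPINTS 2

	static const char * const demo_cap_flags[DEMO_NCAPINTS * 32] = {
		/* word 0 */
		[0] = "fpu", [4] = "tsc", [25] = "sse",
		/* word 1 */
		[32 + 11] = "syscall", [32 + 29] = "lm",
	};

	static const char *cap_name(unsigned int word, unsigned int bit)
	{
		return demo_cap_flags[word * 32 + bit];	/* NULL means reserved */
	}

	int main(void)
	{
		printf("word 1, bit 29 -> %s\n", cap_name(1, 29));
		return 0;
	}
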
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cc8c501b9f3..fae31ce747b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,6 +11,9 @@
11#include <asm/pgtable.h> 11#include <asm/pgtable.h>
12#include <asm/msr.h> 12#include <asm/msr.h>
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include <asm/ptrace.h>
15#include <asm/ds.h>
16#include <asm/bugs.h>
14 17
15#include "cpu.h" 18#include "cpu.h"
16 19
@@ -27,13 +30,14 @@
27struct movsl_mask movsl_mask __read_mostly; 30struct movsl_mask movsl_mask __read_mostly;
28#endif 31#endif
29 32
30void __cpuinit early_intel_workaround(struct cpuinfo_x86 *c) 33void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
31{ 34{
32 if (c->x86_vendor != X86_VENDOR_INTEL)
33 return;
34 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ 35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
35 if (c->x86 == 15 && c->x86_cache_alignment == 64) 36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
36 c->x86_cache_alignment = 128; 37 c->x86_cache_alignment = 128;
38 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
39 (c->x86 == 0x6 && c->x86_model >= 0x0e))
40 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
37} 41}
38 42
39/* 43/*
@@ -113,6 +117,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
113 unsigned int l2 = 0; 117 unsigned int l2 = 0;
114 char *p = NULL; 118 char *p = NULL;
115 119
120 early_init_intel(c);
121
116#ifdef CONFIG_X86_F00F_BUG 122#ifdef CONFIG_X86_F00F_BUG
117 /* 123 /*
118 * All current models of Pentium and Pentium with MMX technology CPUs 124 * All current models of Pentium and Pentium with MMX technology CPUs
@@ -132,7 +138,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
132 } 138 }
133#endif 139#endif
134 140
135 select_idle_routine(c);
136 l2 = init_intel_cacheinfo(c); 141 l2 = init_intel_cacheinfo(c);
137 if (c->cpuid_level > 9 ) { 142 if (c->cpuid_level > 9 ) {
138 unsigned eax = cpuid_eax(10); 143 unsigned eax = cpuid_eax(10);
@@ -201,16 +206,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
201 } 206 }
202#endif 207#endif
203 208
209 if (cpu_has_xmm2)
210 set_bit(X86_FEATURE_LFENCE_RDTSC, c->x86_capability);
204 if (c->x86 == 15) { 211 if (c->x86 == 15) {
205 set_bit(X86_FEATURE_P4, c->x86_capability); 212 set_bit(X86_FEATURE_P4, c->x86_capability);
206 set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
207 } 213 }
208 if (c->x86 == 6) 214 if (c->x86 == 6)
209 set_bit(X86_FEATURE_P3, c->x86_capability); 215 set_bit(X86_FEATURE_P3, c->x86_capability);
210 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
211 (c->x86 == 0x6 && c->x86_model >= 0x0e))
212 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
213
214 if (cpu_has_ds) { 216 if (cpu_has_ds) {
215 unsigned int l1; 217 unsigned int l1;
216 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); 218 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
@@ -219,6 +221,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
219 if (!(l1 & (1<<12))) 221 if (!(l1 & (1<<12)))
220 set_bit(X86_FEATURE_PEBS, c->x86_capability); 222 set_bit(X86_FEATURE_PEBS, c->x86_capability);
221 } 223 }
224
225 if (cpu_has_bts)
226 ds_init_intel(c);
222} 227}
223 228
224static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) 229static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
@@ -342,5 +347,22 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
342EXPORT_SYMBOL(cmpxchg_386_u32); 347EXPORT_SYMBOL(cmpxchg_386_u32);
343#endif 348#endif
344 349
350#ifndef CONFIG_X86_CMPXCHG64
351unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
352{
353 u64 prev;
354 unsigned long flags;
355
356 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
357 local_irq_save(flags);
358 prev = *(u64 *)ptr;
359 if (prev == old)
360 *(u64 *)ptr = new;
361 local_irq_restore(flags);
362 return prev;
363}
364EXPORT_SYMBOL(cmpxchg_486_u64);
365#endif
366
345// arch_initcall(intel_cpu_init); 367// arch_initcall(intel_cpu_init);
346 368
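
The cmpxchg_486_u64() fallback added to intel.c above emulates a 64-bit compare-and-exchange for CPUs without cmpxchg8b by disabling local interrupts around a plain load/compare/store; as its comment says, that is not SMP-safe. A userspace analogue using a pthread mutex in place of local_irq_save/restore (names and values here are illustrative):

	#include <stdio.h>
	#include <stdint.h>
	#include <pthread.h>

	static pthread_mutex_t emu_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Compare *ptr with old; store new only if they match; return the prior value. */
	static uint64_t emu_cmpxchg64(volatile uint64_t *ptr, uint64_t old, uint64_t new)
	{
		uint64_t prev;

		pthread_mutex_lock(&emu_lock);
		prev = *ptr;
		if (prev == old)
			*ptr = new;
		pthread_mutex_unlock(&emu_lock);
		return prev;	/* caller treats prev == old as success */
	}

	int main(void)
	{
		volatile uint64_t v = 42;

		if (emu_cmpxchg64(&v, 42, 100) == 42)
			printf("swap succeeded, v = %llu\n", (unsigned long long)v);
		return 0;
	}
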
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 9f530ff43c2..1b889860eb7 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -352,8 +352,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
352 */ 352 */
353 if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { 353 if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
354 /* supports eax=2 call */ 354 /* supports eax=2 call */
355 int i, j, n; 355 int j, n;
356 int regs[4]; 356 unsigned int regs[4];
357 unsigned char *dp = (unsigned char *)regs; 357 unsigned char *dp = (unsigned char *)regs;
358 int only_trace = 0; 358 int only_trace = 0;
359 359
@@ -368,7 +368,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
368 368
369 /* If bit 31 is set, this is an unknown format */ 369 /* If bit 31 is set, this is an unknown format */
370 for ( j = 0 ; j < 3 ; j++ ) { 370 for ( j = 0 ; j < 3 ; j++ ) {
371 if ( regs[j] < 0 ) regs[j] = 0; 371 if (regs[j] & (1 << 31)) regs[j] = 0;
372 } 372 }
373 373
374 /* Byte 0 is level count, not a descriptor */ 374 /* Byte 0 is level count, not a descriptor */
@@ -733,10 +733,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
733 if (unlikely(retval < 0)) 733 if (unlikely(retval < 0))
734 return retval; 734 return retval;
735 735
736 cache_kobject[cpu]->parent = &sys_dev->kobj; 736 retval = kobject_init_and_add(cache_kobject[cpu], &ktype_percpu_entry,
737 kobject_set_name(cache_kobject[cpu], "%s", "cache"); 737 &sys_dev->kobj, "%s", "cache");
738 cache_kobject[cpu]->ktype = &ktype_percpu_entry;
739 retval = kobject_register(cache_kobject[cpu]);
740 if (retval < 0) { 738 if (retval < 0) {
741 cpuid4_cache_sysfs_exit(cpu); 739 cpuid4_cache_sysfs_exit(cpu);
742 return retval; 740 return retval;
@@ -746,23 +744,23 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
746 this_object = INDEX_KOBJECT_PTR(cpu,i); 744 this_object = INDEX_KOBJECT_PTR(cpu,i);
747 this_object->cpu = cpu; 745 this_object->cpu = cpu;
748 this_object->index = i; 746 this_object->index = i;
749 this_object->kobj.parent = cache_kobject[cpu]; 747 retval = kobject_init_and_add(&(this_object->kobj),
750 kobject_set_name(&(this_object->kobj), "index%1lu", i); 748 &ktype_cache, cache_kobject[cpu],
751 this_object->kobj.ktype = &ktype_cache; 749 "index%1lu", i);
752 retval = kobject_register(&(this_object->kobj));
753 if (unlikely(retval)) { 750 if (unlikely(retval)) {
754 for (j = 0; j < i; j++) { 751 for (j = 0; j < i; j++) {
755 kobject_unregister( 752 kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj));
756 &(INDEX_KOBJECT_PTR(cpu,j)->kobj));
757 } 753 }
758 kobject_unregister(cache_kobject[cpu]); 754 kobject_put(cache_kobject[cpu]);
759 cpuid4_cache_sysfs_exit(cpu); 755 cpuid4_cache_sysfs_exit(cpu);
760 break; 756 break;
761 } 757 }
758 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
762 } 759 }
763 if (!retval) 760 if (!retval)
764 cpu_set(cpu, cache_dev_map); 761 cpu_set(cpu, cache_dev_map);
765 762
763 kobject_uevent(cache_kobject[cpu], KOBJ_ADD);
766 return retval; 764 return retval;
767} 765}
768 766
@@ -778,8 +776,8 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
778 cpu_clear(cpu, cache_dev_map); 776 cpu_clear(cpu, cache_dev_map);
779 777
780 for (i = 0; i < num_cache_leaves; i++) 778 for (i = 0; i < num_cache_leaves; i++)
781 kobject_unregister(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); 779 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
782 kobject_unregister(cache_kobject[cpu]); 780 kobject_put(cache_kobject[cpu]);
783 cpuid4_cache_sysfs_exit(cpu); 781 cpuid4_cache_sysfs_exit(cpu);
784} 782}
785 783
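
The first intel_cacheinfo.c hunk changes regs[] from int to unsigned int, so the old "unknown descriptor" test regs[j] < 0 can no longer work and is replaced by an explicit bit-31 mask. A small self-contained demonstration of why:

	#include <stdio.h>

	int main(void)
	{
		unsigned int reg = 0x80000001u;	/* bit 31 set: unknown descriptor format */

		/* With signed storage, bit 31 makes the value negative, so "< 0" worked. */
		if ((int)reg < 0)
			printf("signed view:   detected via < 0\n");

		/* With unsigned storage the value is never negative; test the bit instead. */
		if (reg & (1u << 31))
			printf("unsigned view: detected via (reg & (1 << 31))\n");

		return 0;
	}
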
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index eef63e3630c..e633c9c2b76 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -16,7 +16,7 @@
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine Check Handler For AMD Athlon/Duron */ 18/* Machine Check Handler For AMD Athlon/Duron */
19static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) 19static void k7_machine_check(struct pt_regs * regs, long error_code)
20{ 20{
21 int recover=1; 21 int recover=1;
22 u32 alow, ahigh, high, low; 22 u32 alow, ahigh, high, low;
@@ -27,29 +27,32 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code)
27 if (mcgstl & (1<<0)) /* Recoverable ? */ 27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0; 28 recover=0;
29 29
30 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 30 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl); 31 smp_processor_id(), mcgsth, mcgstl);
32 32
33 for (i=1; i<nr_mce_banks; i++) { 33 for (i = 1; i < nr_mce_banks; i++) {
34 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); 34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
35 if (high&(1<<31)) { 35 if (high&(1<<31)) {
36 char misc[20];
37 char addr[24];
38 misc[0] = addr[0] = '\0';
36 if (high & (1<<29)) 39 if (high & (1<<29))
37 recover |= 1; 40 recover |= 1;
38 if (high & (1<<25)) 41 if (high & (1<<25))
39 recover |= 2; 42 recover |= 2;
40 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
41 high &= ~(1<<31); 43 high &= ~(1<<31);
42 if (high & (1<<27)) { 44 if (high & (1<<27)) {
43 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); 45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
44 printk ("[%08x%08x]", ahigh, alow); 46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
45 } 47 }
46 if (high & (1<<26)) { 48 if (high & (1<<26)) {
47 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
48 printk (" at %08x%08x", ahigh, alow); 50 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
49 } 51 }
50 printk ("\n"); 52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr);
51 /* Clear it */ 54 /* Clear it */
52 wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 55 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
53 /* Serialize */ 56 /* Serialize */
54 wmb(); 57 wmb();
55 add_taint(TAINT_MACHINE_CHECK); 58 add_taint(TAINT_MACHINE_CHECK);
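
The k7.c rework above (mirrored later for the P4 and P6 handlers) renders the optional MISC and ADDR parts into small bounded buffers with snprintf() and then emits each bank report as one complete line with its own log level, instead of several partial prints. A self-contained sketch with invented register values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int bank = 3, high = 0x9c000000u, low = 0x1234u;
		unsigned int ahigh = 0xdeadu, alow = 0xbeefu;
		char misc[20], addr[24];

		misc[0] = addr[0] = '\0';
		if (high & (1 << 27))	/* MISC register valid */
			snprintf(misc, sizeof(misc), "[%08x%08x]", ahigh, alow);
		if (high & (1 << 26))	/* ADDR register valid */
			snprintf(addr, sizeof(addr), " at %08x%08x", ahigh, alow);

		printf("CPU %d: Bank %u: %08x%08x%s%s\n", 0, bank, high, low, misc, addr);
		return 0;
	}
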
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index 81fb6e2d35f..ae9f628838f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -8,7 +8,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8void winchip_mcheck_init(struct cpuinfo_x86 *c); 8void winchip_mcheck_init(struct cpuinfo_x86 *c);
9 9
10/* Call the installed machine check handler for this CPU setup. */ 10/* Call the installed machine check handler for this CPU setup. */
11extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); 11extern void (*machine_check_vector)(struct pt_regs *, long error_code);
12 12
13extern int nr_mce_banks; 13extern int nr_mce_banks;
14 14
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 34c781eddee..a5182dcd94a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -22,13 +22,13 @@ int nr_mce_banks;
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23 23
24/* Handle unconfigured int18 (should never happen) */ 24/* Handle unconfigured int18 (should never happen) */
25static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code) 25static void unexpected_machine_check(struct pt_regs * regs, long error_code)
26{ 26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); 27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28} 28}
29 29
30/* Call the installed machine check handler for this CPU setup. */ 30/* Call the installed machine check handler for this CPU setup. */
31void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; 31void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32 32
33/* This has to be run for each processor */ 33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c) 34void mcheck_init(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 4b21d29fb5a..9a699ed0359 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -63,7 +63,7 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
63 * separate MCEs from kernel messages to avoid bogus bug reports. 63 * separate MCEs from kernel messages to avoid bogus bug reports.
64 */ 64 */
65 65
66struct mce_log mcelog = { 66static struct mce_log mcelog = {
67 MCE_LOG_SIGNATURE, 67 MCE_LOG_SIGNATURE,
68 MCE_LOG_LEN, 68 MCE_LOG_LEN,
69}; 69};
@@ -80,7 +80,7 @@ void mce_log(struct mce *mce)
80 /* When the buffer fills up discard new entries. Assume 80 /* When the buffer fills up discard new entries. Assume
81 that the earlier errors are the more interesting. */ 81 that the earlier errors are the more interesting. */
82 if (entry >= MCE_LOG_LEN) { 82 if (entry >= MCE_LOG_LEN) {
83 set_bit(MCE_OVERFLOW, &mcelog.flags); 83 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
84 return; 84 return;
85 } 85 }
86 /* Old left over entry. Skip. */ 86 /* Old left over entry. Skip. */
@@ -110,12 +110,12 @@ static void print_mce(struct mce *m)
110 KERN_EMERG 110 KERN_EMERG
111 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 111 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
112 m->cpu, m->mcgstatus, m->bank, m->status); 112 m->cpu, m->mcgstatus, m->bank, m->status);
113 if (m->rip) { 113 if (m->ip) {
114 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 114 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
115 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 115 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
116 m->cs, m->rip); 116 m->cs, m->ip);
117 if (m->cs == __KERNEL_CS) 117 if (m->cs == __KERNEL_CS)
118 print_symbol("{%s}", m->rip); 118 print_symbol("{%s}", m->ip);
119 printk("\n"); 119 printk("\n");
120 } 120 }
121 printk(KERN_EMERG "TSC %Lx ", m->tsc); 121 printk(KERN_EMERG "TSC %Lx ", m->tsc);
@@ -156,16 +156,16 @@ static int mce_available(struct cpuinfo_x86 *c)
156static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 156static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
157{ 157{
158 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { 158 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
159 m->rip = regs->rip; 159 m->ip = regs->ip;
160 m->cs = regs->cs; 160 m->cs = regs->cs;
161 } else { 161 } else {
162 m->rip = 0; 162 m->ip = 0;
163 m->cs = 0; 163 m->cs = 0;
164 } 164 }
165 if (rip_msr) { 165 if (rip_msr) {
166 /* Assume the RIP in the MSR is exact. Is this true? */ 166 /* Assume the RIP in the MSR is exact. Is this true? */
167 m->mcgstatus |= MCG_STATUS_EIPV; 167 m->mcgstatus |= MCG_STATUS_EIPV;
168 rdmsrl(rip_msr, m->rip); 168 rdmsrl(rip_msr, m->ip);
169 m->cs = 0; 169 m->cs = 0;
170 } 170 }
171} 171}
@@ -192,10 +192,10 @@ void do_machine_check(struct pt_regs * regs, long error_code)
192 192
193 atomic_inc(&mce_entry); 193 atomic_inc(&mce_entry);
194 194
195 if (regs) 195 if ((regs
196 notify_die(DIE_NMI, "machine check", regs, error_code, 18, 196 && notify_die(DIE_NMI, "machine check", regs, error_code,
197 SIGKILL); 197 18, SIGKILL) == NOTIFY_STOP)
198 if (!banks) 198 || !banks)
199 goto out2; 199 goto out2;
200 200
201 memset(&m, 0, sizeof(struct mce)); 201 memset(&m, 0, sizeof(struct mce));
@@ -288,7 +288,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
288 * instruction which caused the MCE. 288 * instruction which caused the MCE.
289 */ 289 */
290 if (m.mcgstatus & MCG_STATUS_EIPV) 290 if (m.mcgstatus & MCG_STATUS_EIPV)
291 user_space = panicm.rip && (panicm.cs & 3); 291 user_space = panicm.ip && (panicm.cs & 3);
292 292
293 /* 293 /*
294 * If we know that the error was in user space, send a 294 * If we know that the error was in user space, send a
@@ -564,7 +564,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
564 loff_t *off) 564 loff_t *off)
565{ 565{
566 unsigned long *cpu_tsc; 566 unsigned long *cpu_tsc;
567 static DECLARE_MUTEX(mce_read_sem); 567 static DEFINE_MUTEX(mce_read_mutex);
568 unsigned next; 568 unsigned next;
569 char __user *buf = ubuf; 569 char __user *buf = ubuf;
570 int i, err; 570 int i, err;
@@ -573,12 +573,12 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
573 if (!cpu_tsc) 573 if (!cpu_tsc)
574 return -ENOMEM; 574 return -ENOMEM;
575 575
576 down(&mce_read_sem); 576 mutex_lock(&mce_read_mutex);
577 next = rcu_dereference(mcelog.next); 577 next = rcu_dereference(mcelog.next);
578 578
579 /* Only supports full reads right now */ 579 /* Only supports full reads right now */
580 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 580 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
581 up(&mce_read_sem); 581 mutex_unlock(&mce_read_mutex);
582 kfree(cpu_tsc); 582 kfree(cpu_tsc);
583 return -EINVAL; 583 return -EINVAL;
584 } 584 }
@@ -621,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
621 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 621 memset(&mcelog.entry[i], 0, sizeof(struct mce));
622 } 622 }
623 } 623 }
624 up(&mce_read_sem); 624 mutex_unlock(&mce_read_mutex);
625 kfree(cpu_tsc); 625 kfree(cpu_tsc);
626 return err ? -EFAULT : buf - ubuf; 626 return err ? -EFAULT : buf - ubuf;
627} 627}
@@ -634,8 +634,7 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
634 return 0; 634 return 0;
635} 635}
636 636
637static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, 637static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
638 unsigned long arg)
639{ 638{
640 int __user *p = (int __user *)arg; 639 int __user *p = (int __user *)arg;
641 640
@@ -664,7 +663,7 @@ static const struct file_operations mce_chrdev_ops = {
664 .release = mce_release, 663 .release = mce_release,
665 .read = mce_read, 664 .read = mce_read,
666 .poll = mce_poll, 665 .poll = mce_poll,
667 .ioctl = mce_ioctl, 666 .unlocked_ioctl = mce_ioctl,
668}; 667};
669 668
670static struct miscdevice mce_log_device = { 669static struct miscdevice mce_log_device = {
@@ -745,7 +744,7 @@ static void mce_restart(void)
745 744
746static struct sysdev_class mce_sysclass = { 745static struct sysdev_class mce_sysclass = {
747 .resume = mce_resume, 746 .resume = mce_resume,
748 set_kset_name("machinecheck"), 747 .name = "machinecheck",
749}; 748};
750 749
751DEFINE_PER_CPU(struct sys_device, device_mce); 750DEFINE_PER_CPU(struct sys_device, device_mce);
@@ -855,8 +854,8 @@ static void mce_remove_device(unsigned int cpu)
855} 854}
856 855
857/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 856/* Get notified when a cpu comes on/off. Be hotplug friendly. */
858static int 857static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
859mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 858 unsigned long action, void *hcpu)
860{ 859{
861 unsigned int cpu = (unsigned long)hcpu; 860 unsigned int cpu = (unsigned long)hcpu;
862 861
@@ -873,7 +872,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
873 return NOTIFY_OK; 872 return NOTIFY_OK;
874} 873}
875 874
876static struct notifier_block mce_cpu_notifier = { 875static struct notifier_block mce_cpu_notifier __cpuinitdata = {
877 .notifier_call = mce_cpu_callback, 876 .notifier_call = mce_cpu_callback,
878}; 877};
879 878
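
Among the mce_64.c changes above, a semaphore that was only ever used as a binary lock (DECLARE_MUTEX/down/up) becomes a real mutex (DEFINE_MUTEX/mutex_lock/mutex_unlock) around the log-read path. A userspace analogue of that reader serialization (the log layout is invented):

	#include <stdio.h>
	#include <pthread.h>

	#define LOG_LEN 4

	static int log_buf[LOG_LEN] = { 1, 2, 3, 4 };
	static pthread_mutex_t read_mutex = PTHREAD_MUTEX_INITIALIZER;	/* was a semaphore set to 1 */

	static void read_log(int *out)
	{
		pthread_mutex_lock(&read_mutex);	/* was: down(&sem) */
		for (int i = 0; i < LOG_LEN; i++)
			out[i] = log_buf[i];
		pthread_mutex_unlock(&read_mutex);	/* was: up(&sem) */
	}

	int main(void)
	{
		int copy[LOG_LEN];

		read_log(copy);
		printf("first entry: %d\n", copy[0]);
		return 0;
	}
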
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 752fb16a817..32671da8184 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -65,7 +65,7 @@ static struct threshold_block threshold_defaults = {
65}; 65};
66 66
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_t cpus; 70 cpumask_t cpus;
71}; 71};
@@ -118,6 +118,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
118{ 118{
119 unsigned int bank, block; 119 unsigned int bank, block;
120 unsigned int cpu = smp_processor_id(); 120 unsigned int cpu = smp_processor_id();
121 u8 lvt_off;
121 u32 low = 0, high = 0, address = 0; 122 u32 low = 0, high = 0, address = 0;
122 123
123 for (bank = 0; bank < NR_BANKS; ++bank) { 124 for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -153,14 +154,13 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
153 if (shared_bank[bank] && c->cpu_core_id) 154 if (shared_bank[bank] && c->cpu_core_id)
154 break; 155 break;
155#endif 156#endif
157 lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR,
158 APIC_EILVT_MSG_FIX, 0);
159
156 high &= ~MASK_LVTOFF_HI; 160 high &= ~MASK_LVTOFF_HI;
157 high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; 161 high |= lvt_off << 20;
158 wrmsr(address, low, high); 162 wrmsr(address, low, high);
159 163
160 setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
161 THRESHOLD_APIC_VECTOR,
162 K8_APIC_EXT_INT_MSG_FIX, 0);
163
164 threshold_defaults.address = address; 164 threshold_defaults.address = address;
165 threshold_restart_bank(&threshold_defaults, 0, 0); 165 threshold_restart_bank(&threshold_defaults, 0, 0);
166 } 166 }
@@ -432,10 +432,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
432 else 432 else
433 per_cpu(threshold_banks, cpu)[bank]->blocks = b; 433 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
434 434
435 kobject_set_name(&b->kobj, "misc%i", block); 435 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
436 b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; 436 per_cpu(threshold_banks, cpu)[bank]->kobj,
437 b->kobj.ktype = &threshold_ktype; 437 "misc%i", block);
438 err = kobject_register(&b->kobj);
439 if (err) 438 if (err)
440 goto out_free; 439 goto out_free;
441recurse: 440recurse:
@@ -451,11 +450,14 @@ recurse:
451 if (err) 450 if (err)
452 goto out_free; 451 goto out_free;
453 452
453 if (b)
454 kobject_uevent(&b->kobj, KOBJ_ADD);
455
454 return err; 456 return err;
455 457
456out_free: 458out_free:
457 if (b) { 459 if (b) {
458 kobject_unregister(&b->kobj); 460 kobject_put(&b->kobj);
459 kfree(b); 461 kfree(b);
460 } 462 }
461 return err; 463 return err;
@@ -489,7 +491,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
489 goto out; 491 goto out;
490 492
491 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, 493 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
492 &b->kobj, name); 494 b->kobj, name);
493 if (err) 495 if (err)
494 goto out; 496 goto out;
495 497
@@ -505,16 +507,15 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
505 goto out; 507 goto out;
506 } 508 }
507 509
508 kobject_set_name(&b->kobj, "threshold_bank%i", bank); 510 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
509 b->kobj.parent = &per_cpu(device_mce, cpu).kobj; 511 if (!b->kobj)
512 goto out_free;
513
510#ifndef CONFIG_SMP 514#ifndef CONFIG_SMP
511 b->cpus = CPU_MASK_ALL; 515 b->cpus = CPU_MASK_ALL;
512#else 516#else
513 b->cpus = per_cpu(cpu_core_map, cpu); 517 b->cpus = per_cpu(cpu_core_map, cpu);
514#endif 518#endif
515 err = kobject_register(&b->kobj);
516 if (err)
517 goto out_free;
518 519
519 per_cpu(threshold_banks, cpu)[bank] = b; 520 per_cpu(threshold_banks, cpu)[bank] = b;
520 521
@@ -531,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
531 continue; 532 continue;
532 533
533 err = sysfs_create_link(&per_cpu(device_mce, i).kobj, 534 err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
534 &b->kobj, name); 535 b->kobj, name);
535 if (err) 536 if (err)
536 goto out; 537 goto out;
537 538
@@ -554,7 +555,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
554 int err = 0; 555 int err = 0;
555 556
556 for (bank = 0; bank < NR_BANKS; ++bank) { 557 for (bank = 0; bank < NR_BANKS; ++bank) {
557 if (!(per_cpu(bank_map, cpu) & 1 << bank)) 558 if (!(per_cpu(bank_map, cpu) & (1 << bank)))
558 continue; 559 continue;
559 err = threshold_create_bank(cpu, bank); 560 err = threshold_create_bank(cpu, bank);
560 if (err) 561 if (err)
@@ -581,7 +582,7 @@ static void deallocate_threshold_block(unsigned int cpu,
581 return; 582 return;
582 583
583 list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { 584 list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
584 kobject_unregister(&pos->kobj); 585 kobject_put(&pos->kobj);
585 list_del(&pos->miscj); 586 list_del(&pos->miscj);
586 kfree(pos); 587 kfree(pos);
587 } 588 }
@@ -627,7 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
627 deallocate_threshold_block(cpu, bank); 628 deallocate_threshold_block(cpu, bank);
628 629
629free_out: 630free_out:
630 kobject_unregister(&b->kobj); 631 kobject_put(b->kobj);
631 kfree(b); 632 kfree(b);
632 per_cpu(threshold_banks, cpu)[bank] = NULL; 633 per_cpu(threshold_banks, cpu)[bank] = NULL;
633} 634}
@@ -637,14 +638,14 @@ static void threshold_remove_device(unsigned int cpu)
637 unsigned int bank; 638 unsigned int bank;
638 639
639 for (bank = 0; bank < NR_BANKS; ++bank) { 640 for (bank = 0; bank < NR_BANKS; ++bank) {
640 if (!(per_cpu(bank_map, cpu) & 1 << bank)) 641 if (!(per_cpu(bank_map, cpu) & (1 << bank)))
641 continue; 642 continue;
642 threshold_remove_bank(cpu, bank); 643 threshold_remove_bank(cpu, bank);
643 } 644 }
644} 645}
645 646
646/* get notified when a cpu comes on/off */ 647/* get notified when a cpu comes on/off */
647static int threshold_cpu_callback(struct notifier_block *nfb, 648static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
648 unsigned long action, void *hcpu) 649 unsigned long action, void *hcpu)
649{ 650{
650 /* cpu was unsigned int to begin with */ 651 /* cpu was unsigned int to begin with */
@@ -669,7 +670,7 @@ static int threshold_cpu_callback(struct notifier_block *nfb,
669 return NOTIFY_OK; 670 return NOTIFY_OK;
670} 671}
671 672
672static struct notifier_block threshold_cpu_notifier = { 673static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
673 .notifier_call = threshold_cpu_callback, 674 .notifier_call = threshold_cpu_callback,
674}; 675};
675 676
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index be4dabfee1f..cb03345554a 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -57,7 +57,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
57/* Thermal interrupt handler for this CPU setup */ 57/* Thermal interrupt handler for this CPU setup */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; 58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
59 59
60fastcall void smp_thermal_interrupt(struct pt_regs *regs) 60void smp_thermal_interrupt(struct pt_regs *regs)
61{ 61{
62 irq_enter(); 62 irq_enter();
63 vendor_thermal_interrupt(regs); 63 vendor_thermal_interrupt(regs);
@@ -141,7 +141,7 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
141 rdmsr (MSR_IA32_MCG_EIP, r->eip, h); 141 rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
142} 142}
143 143
144static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) 144static void intel_machine_check(struct pt_regs * regs, long error_code)
145{ 145{
146 int recover=1; 146 int recover=1;
147 u32 alow, ahigh, high, low; 147 u32 alow, ahigh, high, low;
@@ -152,38 +152,41 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
152 if (mcgstl & (1<<0)) /* Recoverable ? */ 152 if (mcgstl & (1<<0)) /* Recoverable ? */
153 recover=0; 153 recover=0;
154 154
155 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 155 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
156 smp_processor_id(), mcgsth, mcgstl); 156 smp_processor_id(), mcgsth, mcgstl);
157 157
158 if (mce_num_extended_msrs > 0) { 158 if (mce_num_extended_msrs > 0) {
159 struct intel_mce_extended_msrs dbg; 159 struct intel_mce_extended_msrs dbg;
160 intel_get_extended_msrs(&dbg); 160 intel_get_extended_msrs(&dbg);
161 printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", 161 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
162 smp_processor_id(), dbg.eip, dbg.eflags); 162 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
163 printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", 163 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
164 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); 164 smp_processor_id(), dbg.eip, dbg.eflags,
165 printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", 165 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
166 dbg.esi, dbg.edi, dbg.ebp, dbg.esp); 166 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
167 } 167 }
168 168
169 for (i=0; i<nr_mce_banks; i++) { 169 for (i = 0; i < nr_mce_banks; i++) {
170 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); 170 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
171 if (high & (1<<31)) { 171 if (high & (1<<31)) {
172 char misc[20];
173 char addr[24];
174 misc[0] = addr[0] = '\0';
172 if (high & (1<<29)) 175 if (high & (1<<29))
173 recover |= 1; 176 recover |= 1;
174 if (high & (1<<25)) 177 if (high & (1<<25))
175 recover |= 2; 178 recover |= 2;
176 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
177 high &= ~(1<<31); 179 high &= ~(1<<31);
178 if (high & (1<<27)) { 180 if (high & (1<<27)) {
179 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); 181 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
180 printk ("[%08x%08x]", ahigh, alow); 182 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
181 } 183 }
182 if (high & (1<<26)) { 184 if (high & (1<<26)) {
183 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 185 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
184 printk (" at %08x%08x", ahigh, alow); 186 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
185 } 187 }
186 printk ("\n"); 188 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
189 smp_processor_id(), i, high, low, misc, addr);
187 } 190 }
188 } 191 }
189 192
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 94bc43d950c..a18310aaae0 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -16,7 +16,7 @@
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine check handler for Pentium class Intel */ 18/* Machine check handler for Pentium class Intel */
19static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code) 19static void pentium_machine_check(struct pt_regs * regs, long error_code)
20{ 20{
21 u32 loaddr, hi, lotype; 21 u32 loaddr, hi, lotype;
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index deeae42ce19..74342604d30 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -16,7 +16,7 @@
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine Check Handler For PII/PIII */ 18/* Machine Check Handler For PII/PIII */
19static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) 19static void intel_machine_check(struct pt_regs * regs, long error_code)
20{ 20{
21 int recover=1; 21 int recover=1;
22 u32 alow, ahigh, high, low; 22 u32 alow, ahigh, high, low;
@@ -27,27 +27,30 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
27 if (mcgstl & (1<<0)) /* Recoverable ? */ 27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0; 28 recover=0;
29 29
30 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 30 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl); 31 smp_processor_id(), mcgsth, mcgstl);
32 32
33 for (i=0; i<nr_mce_banks; i++) { 33 for (i = 0; i < nr_mce_banks; i++) {
34 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); 34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
35 if (high & (1<<31)) { 35 if (high & (1<<31)) {
36 char misc[20];
37 char addr[24];
38 misc[0] = addr[0] = '\0';
36 if (high & (1<<29)) 39 if (high & (1<<29))
37 recover |= 1; 40 recover |= 1;
38 if (high & (1<<25)) 41 if (high & (1<<25))
39 recover |= 2; 42 recover |= 2;
40 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
41 high &= ~(1<<31); 43 high &= ~(1<<31);
42 if (high & (1<<27)) { 44 if (high & (1<<27)) {
43 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); 45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
44 printk ("[%08x%08x]", ahigh, alow); 46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
45 } 47 }
46 if (high & (1<<26)) { 48 if (high & (1<<26)) {
47 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
48 printk (" at %08x%08x", ahigh, alow); 50 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
49 } 51 }
50 printk ("\n"); 52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr);
51 } 54 }
52 } 55 }
53 56
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 9e424b6c293..3d428d5afc5 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,7 +15,7 @@
15#include "mce.h" 15#include "mce.h"
16 16
17/* Machine check handler for WinChip C6 */ 17/* Machine check handler for WinChip C6 */
18static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code) 18static void winchip_machine_check(struct pt_regs * regs, long error_code)
19{ 19{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
21 add_taint(TAINT_MACHINE_CHECK); 21 add_taint(TAINT_MACHINE_CHECK);
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 0949cdbf848..ee2331b0e58 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -53,8 +53,6 @@ static void amd_set_mtrr(unsigned int reg, unsigned long base,
53 <base> The base address of the region. 53 <base> The base address of the region.
54 <size> The size of the region. If this is 0 the region is disabled. 54 <size> The size of the region. If this is 0 the region is disabled.
55 <type> The type of the region. 55 <type> The type of the region.
56 <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
57 be done externally.
58 [RETURNS] Nothing. 56 [RETURNS] Nothing.
59*/ 57*/
60{ 58{
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9964be3de2b..ff14c320040 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -4,10 +4,9 @@
4#include <asm/msr.h> 4#include <asm/msr.h>
5#include <asm/io.h> 5#include <asm/io.h>
6#include <asm/processor-cyrix.h> 6#include <asm/processor-cyrix.h>
7#include <asm/processor-flags.h>
7#include "mtrr.h" 8#include "mtrr.h"
8 9
9int arr3_protected;
10
11static void 10static void
12cyrix_get_arr(unsigned int reg, unsigned long *base, 11cyrix_get_arr(unsigned int reg, unsigned long *base,
13 unsigned long *size, mtrr_type * type) 12 unsigned long *size, mtrr_type * type)
@@ -98,8 +97,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
98 case 4: 97 case 4:
99 return replace_reg; 98 return replace_reg;
100 case 3: 99 case 3:
101 if (arr3_protected)
102 break;
103 case 2: 100 case 2:
104 case 1: 101 case 1:
105 case 0: 102 case 0:
@@ -114,8 +111,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
114 } else { 111 } else {
115 for (i = 0; i < 7; i++) { 112 for (i = 0; i < 7; i++) {
116 cyrix_get_arr(i, &lbase, &lsize, &ltype); 113 cyrix_get_arr(i, &lbase, &lsize, &ltype);
117 if ((i == 3) && arr3_protected)
118 continue;
119 if (lsize == 0) 114 if (lsize == 0)
120 return i; 115 return i;
121 } 116 }
@@ -142,7 +137,7 @@ static void prepare_set(void)
142 137
143 /* Disable and flush caches. Note that wbinvd flushes the TLBs as 138 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
144 a side-effect */ 139 a side-effect */
145 cr0 = read_cr0() | 0x40000000; 140 cr0 = read_cr0() | X86_CR0_CD;
146 wbinvd(); 141 wbinvd();
147 write_cr0(cr0); 142 write_cr0(cr0);
148 wbinvd(); 143 wbinvd();
@@ -259,107 +254,6 @@ static void cyrix_set_all(void)
259 post_set(); 254 post_set();
260} 255}
261 256
262#if 0
263/*
264 * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection
265 * with the SMM (System Management Mode) mode. So we need the following:
266 * Check whether SMI_LOCK (CCR3 bit 0) is set
267 * if it is set, write a warning message: ARR3 cannot be changed!
268 * (it cannot be changed until the next processor reset)
269 * if it is reset, then we can change it, set all the needed bits:
270 * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset)
271 * - disable access to SMM memory (CCR1 bit 2 reset)
272 * - disable SMM mode (CCR1 bit 1 reset)
273 * - disable write protection of ARR3 (CCR6 bit 1 reset)
274 * - (maybe) disable ARR3
275 * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set)
276 */
277static void __init
278cyrix_arr_init(void)
279{
280 struct set_mtrr_context ctxt;
281 unsigned char ccr[7];
282 int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 };
283#ifdef CONFIG_SMP
284 int i;
285#endif
286
287 /* flush cache and enable MAPEN */
288 set_mtrr_prepare_save(&ctxt);
289 set_mtrr_cache_disable(&ctxt);
290
291 /* Save all CCRs locally */
292 ccr[0] = getCx86(CX86_CCR0);
293 ccr[1] = getCx86(CX86_CCR1);
294 ccr[2] = getCx86(CX86_CCR2);
295 ccr[3] = ctxt.ccr3;
296 ccr[4] = getCx86(CX86_CCR4);
297 ccr[5] = getCx86(CX86_CCR5);
298 ccr[6] = getCx86(CX86_CCR6);
299
300 if (ccr[3] & 1) {
301 ccrc[3] = 1;
302 arr3_protected = 1;
303 } else {
304 /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and
305 * access to SMM memory through ARR3 (bit 7).
306 */
307 if (ccr[1] & 0x80) {
308 ccr[1] &= 0x7f;
309 ccrc[1] |= 0x80;
310 }
311 if (ccr[1] & 0x04) {
312 ccr[1] &= 0xfb;
313 ccrc[1] |= 0x04;
314 }
315 if (ccr[1] & 0x02) {
316 ccr[1] &= 0xfd;
317 ccrc[1] |= 0x02;
318 }
319 arr3_protected = 0;
320 if (ccr[6] & 0x02) {
321 ccr[6] &= 0xfd;
322 ccrc[6] = 1; /* Disable write protection of ARR3 */
323 setCx86(CX86_CCR6, ccr[6]);
324 }
325 /* Disable ARR3. This is safe now that we disabled SMM. */
326 /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */
327 }
328 /* If we changed CCR1 in memory, change it in the processor, too. */
329 if (ccrc[1])
330 setCx86(CX86_CCR1, ccr[1]);
331
332 /* Enable ARR usage by the processor */
333 if (!(ccr[5] & 0x20)) {
334 ccr[5] |= 0x20;
335 ccrc[5] = 1;
336 setCx86(CX86_CCR5, ccr[5]);
337 }
338#ifdef CONFIG_SMP
339 for (i = 0; i < 7; i++)
340 ccr_state[i] = ccr[i];
341 for (i = 0; i < 8; i++)
342 cyrix_get_arr(i,
343 &arr_state[i].base, &arr_state[i].size,
344 &arr_state[i].type);
345#endif
346
347 set_mtrr_done(&ctxt); /* flush cache and disable MAPEN */
348
349 if (ccrc[5])
350 printk(KERN_INFO "mtrr: ARR usage was not enabled, enabled manually\n");
351 if (ccrc[3])
352 printk(KERN_INFO "mtrr: ARR3 cannot be changed\n");
353/*
354 if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n");
355 if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n");
356 if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n");
357*/
358 if (ccrc[6])
359 printk(KERN_INFO "mtrr: ARR3 was write protected, unprotected\n");
360}
361#endif
362
363static struct mtrr_ops cyrix_mtrr_ops = { 257static struct mtrr_ops cyrix_mtrr_ops = {
364 .vendor = X86_VENDOR_CYRIX, 258 .vendor = X86_VENDOR_CYRIX,
365// .init = cyrix_arr_init, 259// .init = cyrix_arr_init,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 992f08dfbb6..103d61a59b1 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -9,11 +9,12 @@
9#include <asm/msr.h> 9#include <asm/msr.h>
10#include <asm/system.h> 10#include <asm/system.h>
11#include <asm/cpufeature.h> 11#include <asm/cpufeature.h>
12#include <asm/processor-flags.h>
12#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
13#include "mtrr.h" 14#include "mtrr.h"
14 15
15struct mtrr_state { 16struct mtrr_state {
16 struct mtrr_var_range *var_ranges; 17 struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
17 mtrr_type fixed_ranges[NUM_FIXED_RANGES]; 18 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
18 unsigned char enabled; 19 unsigned char enabled;
19 unsigned char have_fixed; 20 unsigned char have_fixed;
@@ -85,12 +86,6 @@ void __init get_mtrr_state(void)
85 struct mtrr_var_range *vrs; 86 struct mtrr_var_range *vrs;
86 unsigned lo, dummy; 87 unsigned lo, dummy;
87 88
88 if (!mtrr_state.var_ranges) {
89 mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range),
90 GFP_KERNEL);
91 if (!mtrr_state.var_ranges)
92 return;
93 }
94 vrs = mtrr_state.var_ranges; 89 vrs = mtrr_state.var_ranges;
95 90
96 rdmsr(MTRRcap_MSR, lo, dummy); 91 rdmsr(MTRRcap_MSR, lo, dummy);
@@ -188,7 +183,7 @@ static inline void k8_enable_fixed_iorrs(void)
188 * \param changed pointer which indicates whether the MTRR needed to be changed 183 * \param changed pointer which indicates whether the MTRR needed to be changed
189 * \param msrwords pointer to the MSR values which the MSR should have 184 * \param msrwords pointer to the MSR values which the MSR should have
190 */ 185 */
191static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) 186static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
192{ 187{
193 unsigned lo, hi; 188 unsigned lo, hi;
194 189
@@ -200,7 +195,7 @@ static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
200 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) 195 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
201 k8_enable_fixed_iorrs(); 196 k8_enable_fixed_iorrs();
202 mtrr_wrmsr(msr, msrwords[0], msrwords[1]); 197 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
203 *changed = TRUE; 198 *changed = true;
204 } 199 }
205} 200}
206 201
@@ -260,7 +255,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
260static int set_fixed_ranges(mtrr_type * frs) 255static int set_fixed_ranges(mtrr_type * frs)
261{ 256{
262 unsigned long long *saved = (unsigned long long *) frs; 257 unsigned long long *saved = (unsigned long long *) frs;
263 int changed = FALSE; 258 bool changed = false;
264 int block=-1, range; 259 int block=-1, range;
265 260
266 while (fixed_range_blocks[++block].ranges) 261 while (fixed_range_blocks[++block].ranges)
@@ -273,17 +268,17 @@ static int set_fixed_ranges(mtrr_type * frs)
273 268
274/* Set the MSR pair relating to a var range. Returns TRUE if 269/* Set the MSR pair relating to a var range. Returns TRUE if
275 changes are made */ 270 changes are made */
276static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) 271static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
277{ 272{
278 unsigned int lo, hi; 273 unsigned int lo, hi;
279 int changed = FALSE; 274 bool changed = false;
280 275
281 rdmsr(MTRRphysBase_MSR(index), lo, hi); 276 rdmsr(MTRRphysBase_MSR(index), lo, hi);
282 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) 277 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
283 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != 278 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
284 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { 279 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
285 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); 280 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
286 changed = TRUE; 281 changed = true;
287 } 282 }
288 283
289 rdmsr(MTRRphysMask_MSR(index), lo, hi); 284 rdmsr(MTRRphysMask_MSR(index), lo, hi);
@@ -292,7 +287,7 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
292 || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != 287 || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
293 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { 288 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
294 mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); 289 mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
295 changed = TRUE; 290 changed = true;
296 } 291 }
297 return changed; 292 return changed;
298} 293}
@@ -350,7 +345,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
350 spin_lock(&set_atomicity_lock); 345 spin_lock(&set_atomicity_lock);
351 346
352 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 347 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
353 cr0 = read_cr0() | 0x40000000; /* set CD flag */ 348 cr0 = read_cr0() | X86_CR0_CD;
354 write_cr0(cr0); 349 write_cr0(cr0);
355 wbinvd(); 350 wbinvd();
356 351
@@ -417,8 +412,6 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
417 <base> The base address of the region. 412 <base> The base address of the region.
418 <size> The size of the region. If this is 0 the region is disabled. 413 <size> The size of the region. If this is 0 the region is disabled.
419 <type> The type of the region. 414 <type> The type of the region.
420 <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
421 be done externally.
422 [RETURNS] Nothing. 415 [RETURNS] Nothing.
423*/ 416*/
424{ 417{
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index c7d8f175674..91e150acb46 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -11,10 +11,6 @@
11#include <asm/mtrr.h> 11#include <asm/mtrr.h>
12#include "mtrr.h" 12#include "mtrr.h"
13 13
14/* RED-PEN: this is accessed without any locking */
15extern unsigned int *usage_table;
16
17
18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) 14#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19 15
20static const char *const mtrr_strings[MTRR_NUM_TYPES] = 16static const char *const mtrr_strings[MTRR_NUM_TYPES] =
@@ -37,7 +33,7 @@ const char *mtrr_attrib_to_str(int x)
37 33
38static int 34static int
39mtrr_file_add(unsigned long base, unsigned long size, 35mtrr_file_add(unsigned long base, unsigned long size,
40 unsigned int type, char increment, struct file *file, int page) 36 unsigned int type, bool increment, struct file *file, int page)
41{ 37{
42 int reg, max; 38 int reg, max;
43 unsigned int *fcount = FILE_FCOUNT(file); 39 unsigned int *fcount = FILE_FCOUNT(file);
@@ -55,7 +51,7 @@ mtrr_file_add(unsigned long base, unsigned long size,
55 base >>= PAGE_SHIFT; 51 base >>= PAGE_SHIFT;
56 size >>= PAGE_SHIFT; 52 size >>= PAGE_SHIFT;
57 } 53 }
58 reg = mtrr_add_page(base, size, type, 1); 54 reg = mtrr_add_page(base, size, type, true);
59 if (reg >= 0) 55 if (reg >= 0)
60 ++fcount[reg]; 56 ++fcount[reg];
61 return reg; 57 return reg;
@@ -141,7 +137,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
141 size >>= PAGE_SHIFT; 137 size >>= PAGE_SHIFT;
142 err = 138 err =
143 mtrr_add_page((unsigned long) base, (unsigned long) size, i, 139 mtrr_add_page((unsigned long) base, (unsigned long) size, i,
144 1); 140 true);
145 if (err < 0) 141 if (err < 0)
146 return err; 142 return err;
147 return len; 143 return len;
@@ -217,7 +213,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
217 if (!capable(CAP_SYS_ADMIN)) 213 if (!capable(CAP_SYS_ADMIN))
218 return -EPERM; 214 return -EPERM;
219 err = 215 err =
220 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, 216 mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
221 file, 0); 217 file, 0);
222 break; 218 break;
223 case MTRRIOC_SET_ENTRY: 219 case MTRRIOC_SET_ENTRY:
@@ -226,7 +222,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
226#endif 222#endif
227 if (!capable(CAP_SYS_ADMIN)) 223 if (!capable(CAP_SYS_ADMIN))
228 return -EPERM; 224 return -EPERM;
229 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); 225 err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
230 break; 226 break;
231 case MTRRIOC_DEL_ENTRY: 227 case MTRRIOC_DEL_ENTRY:
232#ifdef CONFIG_COMPAT 228#ifdef CONFIG_COMPAT
@@ -270,7 +266,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
270 if (!capable(CAP_SYS_ADMIN)) 266 if (!capable(CAP_SYS_ADMIN))
271 return -EPERM; 267 return -EPERM;
272 err = 268 err =
273 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, 269 mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
274 file, 1); 270 file, 1);
275 break; 271 break;
276 case MTRRIOC_SET_PAGE_ENTRY: 272 case MTRRIOC_SET_PAGE_ENTRY:
@@ -279,7 +275,8 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
279#endif 275#endif
280 if (!capable(CAP_SYS_ADMIN)) 276 if (!capable(CAP_SYS_ADMIN))
281 return -EPERM; 277 return -EPERM;
282 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); 278 err =
279 mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
283 break; 280 break;
284 case MTRRIOC_DEL_PAGE_ENTRY: 281 case MTRRIOC_DEL_PAGE_ENTRY:
285#ifdef CONFIG_COMPAT 282#ifdef CONFIG_COMPAT
@@ -396,7 +393,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
396 for (i = 0; i < max; i++) { 393 for (i = 0; i < max; i++) {
397 mtrr_if->get(i, &base, &size, &type); 394 mtrr_if->get(i, &base, &size, &type);
398 if (size == 0) 395 if (size == 0)
399 usage_table[i] = 0; 396 mtrr_usage_table[i] = 0;
400 else { 397 else {
401 if (size < (0x100000 >> PAGE_SHIFT)) { 398 if (size < (0x100000 >> PAGE_SHIFT)) {
402 /* less than 1MB */ 399 /* less than 1MB */
@@ -410,7 +407,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
410 len += seq_printf(seq, 407 len += seq_printf(seq,
411 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
412 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
413 mtrr_attrib_to_str(type), usage_table[i]); 410 mtrr_attrib_to_str(type), mtrr_usage_table[i]);
414 } 411 }
415 } 412 }
416 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 3b20613325d..b6e136f23d3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -38,8 +38,8 @@
38#include <linux/cpu.h> 38#include <linux/cpu.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40 40
41#include <asm/e820.h>
41#include <asm/mtrr.h> 42#include <asm/mtrr.h>
42
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44#include <asm/processor.h> 44#include <asm/processor.h>
45#include <asm/msr.h> 45#include <asm/msr.h>
@@ -47,7 +47,7 @@
47 47
48u32 num_var_ranges = 0; 48u32 num_var_ranges = 0;
49 49
50unsigned int *usage_table; 50unsigned int mtrr_usage_table[MAX_VAR_RANGES];
51static DEFINE_MUTEX(mtrr_mutex); 51static DEFINE_MUTEX(mtrr_mutex);
52 52
53u64 size_or_mask, size_and_mask; 53u64 size_or_mask, size_and_mask;
@@ -59,12 +59,6 @@ struct mtrr_ops * mtrr_if = NULL;
59static void set_mtrr(unsigned int reg, unsigned long base, 59static void set_mtrr(unsigned int reg, unsigned long base,
60 unsigned long size, mtrr_type type); 60 unsigned long size, mtrr_type type);
61 61
62#ifndef CONFIG_X86_64
63extern int arr3_protected;
64#else
65#define arr3_protected 0
66#endif
67
68void set_mtrr_ops(struct mtrr_ops * ops) 62void set_mtrr_ops(struct mtrr_ops * ops)
69{ 63{
70 if (ops->vendor && ops->vendor < X86_VENDOR_NUM) 64 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
@@ -121,13 +115,8 @@ static void __init init_table(void)
121 int i, max; 115 int i, max;
122 116
123 max = num_var_ranges; 117 max = num_var_ranges;
124 if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
125 == NULL) {
126 printk(KERN_ERR "mtrr: could not allocate\n");
127 return;
128 }
129 for (i = 0; i < max; i++) 118 for (i = 0; i < max; i++)
130 usage_table[i] = 1; 119 mtrr_usage_table[i] = 1;
131} 120}
132 121
133struct set_mtrr_data { 122struct set_mtrr_data {
@@ -311,7 +300,7 @@ static void set_mtrr(unsigned int reg, unsigned long base,
311 */ 300 */
312 301
313int mtrr_add_page(unsigned long base, unsigned long size, 302int mtrr_add_page(unsigned long base, unsigned long size,
314 unsigned int type, char increment) 303 unsigned int type, bool increment)
315{ 304{
316 int i, replace, error; 305 int i, replace, error;
317 mtrr_type ltype; 306 mtrr_type ltype;
@@ -349,7 +338,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
349 replace = -1; 338 replace = -1;
350 339
351 /* No CPU hotplug when we change MTRR entries */ 340 /* No CPU hotplug when we change MTRR entries */
352 lock_cpu_hotplug(); 341 get_online_cpus();
353 /* Search for existing MTRR */ 342 /* Search for existing MTRR */
354 mutex_lock(&mtrr_mutex); 343 mutex_lock(&mtrr_mutex);
355 for (i = 0; i < num_var_ranges; ++i) { 344 for (i = 0; i < num_var_ranges; ++i) {
@@ -383,7 +372,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
383 goto out; 372 goto out;
384 } 373 }
385 if (increment) 374 if (increment)
386 ++usage_table[i]; 375 ++mtrr_usage_table[i];
387 error = i; 376 error = i;
388 goto out; 377 goto out;
389 } 378 }
@@ -391,13 +380,15 @@ int mtrr_add_page(unsigned long base, unsigned long size,
391 i = mtrr_if->get_free_region(base, size, replace); 380 i = mtrr_if->get_free_region(base, size, replace);
392 if (i >= 0) { 381 if (i >= 0) {
393 set_mtrr(i, base, size, type); 382 set_mtrr(i, base, size, type);
394 if (likely(replace < 0)) 383 if (likely(replace < 0)) {
395 usage_table[i] = 1; 384 mtrr_usage_table[i] = 1;
396 else { 385 } else {
397 usage_table[i] = usage_table[replace] + !!increment; 386 mtrr_usage_table[i] = mtrr_usage_table[replace];
387 if (increment)
388 mtrr_usage_table[i]++;
398 if (unlikely(replace != i)) { 389 if (unlikely(replace != i)) {
399 set_mtrr(replace, 0, 0, 0); 390 set_mtrr(replace, 0, 0, 0);
400 usage_table[replace] = 0; 391 mtrr_usage_table[replace] = 0;
401 } 392 }
402 } 393 }
403 } else 394 } else
@@ -405,7 +396,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
405 error = i; 396 error = i;
406 out: 397 out:
407 mutex_unlock(&mtrr_mutex); 398 mutex_unlock(&mtrr_mutex);
408 unlock_cpu_hotplug(); 399 put_online_cpus();
409 return error; 400 return error;
410} 401}
411 402
@@ -460,7 +451,7 @@ static int mtrr_check(unsigned long base, unsigned long size)
460 451
461int 452int
462mtrr_add(unsigned long base, unsigned long size, unsigned int type, 453mtrr_add(unsigned long base, unsigned long size, unsigned int type,
463 char increment) 454 bool increment)
464{ 455{
465 if (mtrr_check(base, size)) 456 if (mtrr_check(base, size))
466 return -EINVAL; 457 return -EINVAL;
@@ -495,7 +486,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
495 486
496 max = num_var_ranges; 487 max = num_var_ranges;
497 /* No CPU hotplug when we change MTRR entries */ 488 /* No CPU hotplug when we change MTRR entries */
498 lock_cpu_hotplug(); 489 get_online_cpus();
499 mutex_lock(&mtrr_mutex); 490 mutex_lock(&mtrr_mutex);
500 if (reg < 0) { 491 if (reg < 0) {
501 /* Search for existing MTRR */ 492 /* Search for existing MTRR */
@@ -516,27 +507,21 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
516 printk(KERN_WARNING "mtrr: register: %d too big\n", reg); 507 printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
517 goto out; 508 goto out;
518 } 509 }
519 if (is_cpu(CYRIX) && !use_intel()) {
520 if ((reg == 3) && arr3_protected) {
521 printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n");
522 goto out;
523 }
524 }
525 mtrr_if->get(reg, &lbase, &lsize, &ltype); 510 mtrr_if->get(reg, &lbase, &lsize, &ltype);
526 if (lsize < 1) { 511 if (lsize < 1) {
527 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); 512 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
528 goto out; 513 goto out;
529 } 514 }
530 if (usage_table[reg] < 1) { 515 if (mtrr_usage_table[reg] < 1) {
531 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); 516 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
532 goto out; 517 goto out;
533 } 518 }
534 if (--usage_table[reg] < 1) 519 if (--mtrr_usage_table[reg] < 1)
535 set_mtrr(reg, 0, 0, 0); 520 set_mtrr(reg, 0, 0, 0);
536 error = reg; 521 error = reg;
537 out: 522 out:
538 mutex_unlock(&mtrr_mutex); 523 mutex_unlock(&mtrr_mutex);
539 unlock_cpu_hotplug(); 524 put_online_cpus();
540 return error; 525 return error;
541} 526}
542/** 527/**
@@ -569,10 +554,6 @@ EXPORT_SYMBOL(mtrr_del);
569 * These should be called implicitly, but we can't yet until all the initcall 554 * These should be called implicitly, but we can't yet until all the initcall
570 * stuff is done... 555 * stuff is done...
571 */ 556 */
572extern void amd_init_mtrr(void);
573extern void cyrix_init_mtrr(void);
574extern void centaur_init_mtrr(void);
575
576static void __init init_ifs(void) 557static void __init init_ifs(void)
577{ 558{
578#ifndef CONFIG_X86_64 559#ifndef CONFIG_X86_64
@@ -591,16 +572,11 @@ struct mtrr_value {
591 unsigned long lsize; 572 unsigned long lsize;
592}; 573};
593 574
594static struct mtrr_value * mtrr_state; 575static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
595 576
596static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 577static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
597{ 578{
598 int i; 579 int i;
599 int size = num_var_ranges * sizeof(struct mtrr_value);
600
601 mtrr_state = kzalloc(size,GFP_ATOMIC);
602 if (!mtrr_state)
603 return -ENOMEM;
604 580
605 for (i = 0; i < num_var_ranges; i++) { 581 for (i = 0; i < num_var_ranges; i++) {
606 mtrr_if->get(i, 582 mtrr_if->get(i,
@@ -622,7 +598,6 @@ static int mtrr_restore(struct sys_device * sysdev)
622 mtrr_state[i].lsize, 598 mtrr_state[i].lsize,
623 mtrr_state[i].ltype); 599 mtrr_state[i].ltype);
624 } 600 }
625 kfree(mtrr_state);
626 return 0; 601 return 0;
627} 602}
628 603
@@ -633,6 +608,111 @@ static struct sysdev_driver mtrr_sysdev_driver = {
633 .resume = mtrr_restore, 608 .resume = mtrr_restore,
634}; 609};
635 610
611static int disable_mtrr_trim;
612
613static int __init disable_mtrr_trim_setup(char *str)
614{
615 disable_mtrr_trim = 1;
616 return 0;
617}
618early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
619
620/*
621 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
622 * for memory >4GB. Check for that here.
623 * Note this won't check whether the MTRRs below 4GB (where the magic bit
624 * doesn't apply) are wrong, but so far we don't know of any such case in the wild.
625 */
626#define Tom2Enabled (1U << 21)
627#define Tom2ForceMemTypeWB (1U << 22)
628
629static __init int amd_special_default_mtrr(void)
630{
631 u32 l, h;
632
633 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
634 return 0;
635 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
636 return 0;
637 /* In case some hypervisor doesn't pass SYSCFG through */
638 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
639 return 0;
640 /*
641 * Memory between 4GB and top of mem is forced WB by this magic bit.
642 * Reserved before K8RevF, but should be zero there.
643 */
644 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
645 (Tom2Enabled | Tom2ForceMemTypeWB))
646 return 1;
647 return 0;
648}
649
650/**
651 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
652 *
653 * Some buggy BIOSes don't set up the MTRRs properly for systems with certain
654 * memory configurations. This routine checks that the highest MTRR matches
655 * the end of memory, to make sure that MTRRs of write-back type cover
656 * all of the memory the kernel intends to use. If not, it trims any
657 * memory off the end by adjusting end_pfn, removing it from the kernel's
658 * allocation pools, and warns the user with an obnoxious message.
659 */
660int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
661{
662 unsigned long i, base, size, highest_pfn = 0, def, dummy;
663 mtrr_type type;
664 u64 trim_start, trim_size;
665
666 /*
667 * Make sure we only trim uncachable memory on machines that
668 * support the Intel MTRR architecture:
669 */
670 if (!is_cpu(INTEL) || disable_mtrr_trim)
671 return 0;
672 rdmsr(MTRRdefType_MSR, def, dummy);
673 def &= 0xff;
674 if (def != MTRR_TYPE_UNCACHABLE)
675 return 0;
676
677 if (amd_special_default_mtrr())
678 return 0;
679
680 /* Find highest cached pfn */
681 for (i = 0; i < num_var_ranges; i++) {
682 mtrr_if->get(i, &base, &size, &type);
683 if (type != MTRR_TYPE_WRBACK)
684 continue;
685 if (highest_pfn < base + size)
686 highest_pfn = base + size;
687 }
688
689 /* kvm/qemu doesn't have MTRRs set right; don't trim it all */
690 if (!highest_pfn) {
691 printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n");
692 WARN_ON(1);
693 return 0;
694 }
695
696 if (highest_pfn < end_pfn) {
697 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
698 " all of memory, losing %luMB of RAM.\n",
699 (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT));
700
701 WARN_ON(1);
702
703 printk(KERN_INFO "update e820 for mtrr\n");
704 trim_start = highest_pfn;
705 trim_start <<= PAGE_SHIFT;
706 trim_size = end_pfn;
707 trim_size <<= PAGE_SHIFT;
708 trim_size -= trim_start;
709 add_memory_region(trim_start, trim_size, E820_RESERVED);
710 update_e820();
711 return 1;
712 }
713
714 return 0;
715}
636 716
637/** 717/**
638 * mtrr_bp_init - initialize mtrrs on the boot CPU 718 * mtrr_bp_init - initialize mtrrs on the boot CPU
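The trimming logic added above scans the variable MTRRs for write-back ranges, takes the highest covered pfn, and reserves anything beyond it that the kernel would otherwise use. Below is a minimal, self-contained sketch of that arithmetic; the range table, loop and constants are made-up stand-ins for mtrr_if->get() and the kernel's MTRR types, not kernel API.

/*
 * Sketch of the mtrr_trim_uncached_memory() arithmetic.  All names
 * except PAGE_SHIFT are illustrative assumptions, not kernel code.
 */
#include <stdio.h>

#define PAGE_SHIFT   12
#define TYPE_WRBACK  6
#define NUM_RANGES   3

struct range { unsigned long base, size; int type; };	/* in pages */

static const struct range ranges[NUM_RANGES] = {
	{ 0x00000, 0x80000, TYPE_WRBACK },	/* 0-2GB   write-back */
	{ 0x80000, 0x40000, TYPE_WRBACK },	/* 2-3GB   write-back */
	{ 0xc0000, 0x10000, 0 },		/* 3-3.25GB uncached  */
};

int main(void)
{
	unsigned long end_pfn = 0xd0000;	/* RAM really ends at 3.25GB */
	unsigned long highest_pfn = 0, trim_start, trim_size;
	int i;

	/* highest pfn covered by a write-back MTRR */
	for (i = 0; i < NUM_RANGES; i++) {
		if (ranges[i].type != TYPE_WRBACK)
			continue;
		if (highest_pfn < ranges[i].base + ranges[i].size)
			highest_pfn = ranges[i].base + ranges[i].size;
	}

	if (highest_pfn < end_pfn) {
		trim_start = highest_pfn << PAGE_SHIFT;		/* bytes */
		trim_size  = (end_pfn << PAGE_SHIFT) - trim_start;
		printf("would reserve %lu MB above pfn 0x%lx\n",
		       trim_size >> 20, highest_pfn);
	}
	return 0;
}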
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 289dfe6030e..2cc77eb6fea 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -2,10 +2,8 @@
2 * local mtrr defines. 2 * local mtrr defines.
3 */ 3 */
4 4
5#ifndef TRUE 5#include <linux/types.h>
6#define TRUE 1 6#include <linux/stddef.h>
7#define FALSE 0
8#endif
9 7
10#define MTRRcap_MSR 0x0fe 8#define MTRRcap_MSR 0x0fe
11#define MTRRdefType_MSR 0x2ff 9#define MTRRdefType_MSR 0x2ff
@@ -14,6 +12,7 @@
14#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) 12#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
15 13
16#define NUM_FIXED_RANGES 88 14#define NUM_FIXED_RANGES 88
15#define MAX_VAR_RANGES 256
17#define MTRRfix64K_00000_MSR 0x250 16#define MTRRfix64K_00000_MSR 0x250
18#define MTRRfix16K_80000_MSR 0x258 17#define MTRRfix16K_80000_MSR 0x258
19#define MTRRfix16K_A0000_MSR 0x259 18#define MTRRfix16K_A0000_MSR 0x259
@@ -34,6 +33,8 @@
34 an 8 bit field: */ 33 an 8 bit field: */
35typedef u8 mtrr_type; 34typedef u8 mtrr_type;
36 35
36extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
37
37struct mtrr_ops { 38struct mtrr_ops {
38 u32 vendor; 39 u32 vendor;
39 u32 use_intel_if; 40 u32 use_intel_if;
@@ -96,3 +97,7 @@ void mtrr_state_warn(void);
96const char *mtrr_attrib_to_str(int x); 97const char *mtrr_attrib_to_str(int x);
97void mtrr_wrmsr(unsigned, unsigned, unsigned); 98void mtrr_wrmsr(unsigned, unsigned, unsigned);
98 99
100/* CPU specific mtrr init functions */
101int amd_init_mtrr(void);
102int cyrix_init_mtrr(void);
103int centaur_init_mtrr(void);
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 49e20c2afcd..9f8ba923d1c 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -4,6 +4,7 @@
4#include <asm/mtrr.h> 4#include <asm/mtrr.h>
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include <asm/processor-cyrix.h> 6#include <asm/processor-cyrix.h>
7#include <asm/processor-flags.h>
7#include "mtrr.h" 8#include "mtrr.h"
8 9
9 10
@@ -25,7 +26,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
25 26
26 /* Disable and flush caches. Note that wbinvd flushes the TLBs as 27 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
27 a side-effect */ 28 a side-effect */
28 cr0 = read_cr0() | 0x40000000; 29 cr0 = read_cr0() | X86_CR0_CD;
29 wbinvd(); 30 wbinvd();
30 write_cr0(cr0); 31 write_cr0(cr0);
31 wbinvd(); 32 wbinvd();
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index c02541e6e65..9b838324b81 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -167,7 +167,6 @@ void release_evntsel_nmi(unsigned int msr)
167 clear_bit(counter, evntsel_nmi_owner); 167 clear_bit(counter, evntsel_nmi_owner);
168} 168}
169 169
170EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
171EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 170EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
172EXPORT_SYMBOL(reserve_perfctr_nmi); 171EXPORT_SYMBOL(reserve_perfctr_nmi);
173EXPORT_SYMBOL(release_perfctr_nmi); 172EXPORT_SYMBOL(release_perfctr_nmi);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 3900e46d66d..af11d31dce0 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -10,80 +10,6 @@
10 */ 10 */
11static int show_cpuinfo(struct seq_file *m, void *v) 11static int show_cpuinfo(struct seq_file *m, void *v)
12{ 12{
13 /*
14 * These flag bits must match the definitions in <asm/cpufeature.h>.
15 * NULL means this bit is undefined or reserved; either way it doesn't
16 * have meaning as far as Linux is concerned. Note that it's important
17 * to realize there is a difference between this table and CPUID -- if
18 * applications want to get the raw CPUID data, they should access
19 * /dev/cpu/<cpu_nr>/cpuid instead.
20 */
21 static const char * const x86_cap_flags[] = {
22 /* Intel-defined */
23 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
24 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
25 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
26 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
27
28 /* AMD-defined */
29 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
30 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
31 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
32 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
33 "3dnowext", "3dnow",
34
35 /* Transmeta-defined */
36 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
37 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
39 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
40
41 /* Other (Linux-defined) */
42 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
43 NULL, NULL, NULL, NULL,
44 "constant_tsc", "up", NULL, "arch_perfmon",
45 "pebs", "bts", NULL, "sync_rdtsc",
46 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
47 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
48
49 /* Intel-defined (#2) */
50 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
51 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
52 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
53 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
54
55 /* VIA/Cyrix/Centaur-defined */
56 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
57 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
58 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
59 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
60
61 /* AMD-defined (#2) */
62 "lahf_lm", "cmp_legacy", "svm", "extapic",
63 "cr8_legacy", "abm", "sse4a", "misalignsse",
64 "3dnowprefetch", "osvw", "ibs", "sse5",
65 "skinit", "wdt", NULL, NULL,
66 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
67 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68
69 /* Auxiliary (Linux-defined) */
70 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
71 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
72 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
73 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
74 };
75 static const char * const x86_power_flags[] = {
76 "ts", /* temperature sensor */
77 "fid", /* frequency id control */
78 "vid", /* voltage id control */
79 "ttp", /* thermal trip */
80 "tm",
81 "stc",
82 "100mhzsteps",
83 "hwpstate",
84 "", /* constant_tsc - moved to flags */
85 /* nothing */
86 };
87 struct cpuinfo_x86 *c = v; 13 struct cpuinfo_x86 *c = v;
88 int i, n = 0; 14 int i, n = 0;
89 int fpu_exception; 15 int fpu_exception;
@@ -188,7 +114,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
188static void c_stop(struct seq_file *m, void *v) 114static void c_stop(struct seq_file *m, void *v)
189{ 115{
190} 116}
191struct seq_operations cpuinfo_op = { 117const struct seq_operations cpuinfo_op = {
192 .start = c_start, 118 .start = c_start,
193 .next = c_next, 119 .next = c_next,
194 .stop = c_stop, 120 .stop = c_stop,
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 05c9936a16c..288e7a6598a 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -1,6 +1,6 @@
1/* ----------------------------------------------------------------------- * 1/* ----------------------------------------------------------------------- *
2 * 2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved 3 * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -17,6 +17,10 @@
17 * and then read in chunks of 16 bytes. A larger size means multiple 17 * and then read in chunks of 16 bytes. A larger size means multiple
18 * reads of consecutive levels. 18 * reads of consecutive levels.
19 * 19 *
20 * The lower 32 bits of the file position is used as the incoming %eax,
21 * and the upper 32 bits of the file position as the incoming %ecx,
22 * the latter intended for "counting" eax levels like eax=4.
23 *
20 * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on 24 * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
21 * an SMP box will direct the access to CPU %d. 25 * an SMP box will direct the access to CPU %d.
22 */ 26 */
@@ -43,35 +47,24 @@
43 47
44static struct class *cpuid_class; 48static struct class *cpuid_class;
45 49
46struct cpuid_command { 50struct cpuid_regs {
47 u32 reg; 51 u32 eax, ebx, ecx, edx;
48 u32 *data;
49}; 52};
50 53
51static void cpuid_smp_cpuid(void *cmd_block) 54static void cpuid_smp_cpuid(void *cmd_block)
52{ 55{
53 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; 56 struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block;
54
55 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
56 &cmd->data[3]);
57}
58
59static inline void do_cpuid(int cpu, u32 reg, u32 * data)
60{
61 struct cpuid_command cmd;
62
63 cmd.reg = reg;
64 cmd.data = data;
65 57
66 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); 58 cpuid_count(cmd->eax, cmd->ecx,
59 &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx);
67} 60}
68 61
69static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) 62static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
70{ 63{
71 loff_t ret; 64 loff_t ret;
65 struct inode *inode = file->f_mapping->host;
72 66
73 lock_kernel(); 67 mutex_lock(&inode->i_mutex);
74
75 switch (orig) { 68 switch (orig) {
76 case 0: 69 case 0:
77 file->f_pos = offset; 70 file->f_pos = offset;
@@ -84,8 +77,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
84 default: 77 default:
85 ret = -EINVAL; 78 ret = -EINVAL;
86 } 79 }
87 80 mutex_unlock(&inode->i_mutex);
88 unlock_kernel();
89 return ret; 81 return ret;
90} 82}
91 83
@@ -93,19 +85,21 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
93 size_t count, loff_t * ppos) 85 size_t count, loff_t * ppos)
94{ 86{
95 char __user *tmp = buf; 87 char __user *tmp = buf;
96 u32 data[4]; 88 struct cpuid_regs cmd;
97 u32 reg = *ppos;
98 int cpu = iminor(file->f_path.dentry->d_inode); 89 int cpu = iminor(file->f_path.dentry->d_inode);
90 u64 pos = *ppos;
99 91
100 if (count % 16) 92 if (count % 16)
101 return -EINVAL; /* Invalid chunk size */ 93 return -EINVAL; /* Invalid chunk size */
102 94
103 for (; count; count -= 16) { 95 for (; count; count -= 16) {
104 do_cpuid(cpu, reg, data); 96 cmd.eax = pos;
105 if (copy_to_user(tmp, &data, 16)) 97 cmd.ecx = pos >> 32;
98 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
99 if (copy_to_user(tmp, &cmd, 16))
106 return -EFAULT; 100 return -EFAULT;
107 tmp += 16; 101 tmp += 16;
108 *ppos = reg++; 102 *ppos = ++pos;
109 } 103 }
110 104
111 return tmp - buf; 105 return tmp - buf;
@@ -157,20 +151,20 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
157 151
158 switch (action) { 152 switch (action) {
159 case CPU_UP_PREPARE: 153 case CPU_UP_PREPARE:
160 case CPU_UP_PREPARE_FROZEN:
161 err = cpuid_device_create(cpu); 154 err = cpuid_device_create(cpu);
162 break; 155 break;
163 case CPU_UP_CANCELED: 156 case CPU_UP_CANCELED:
164 case CPU_UP_CANCELED_FROZEN:
165 case CPU_DEAD: 157 case CPU_DEAD:
166 case CPU_DEAD_FROZEN:
167 cpuid_device_destroy(cpu); 158 cpuid_device_destroy(cpu);
168 break; 159 break;
160 case CPU_UP_CANCELED_FROZEN:
161 destroy_suspended_device(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
162 break;
169 } 163 }
170 return err ? NOTIFY_BAD : NOTIFY_OK; 164 return err ? NOTIFY_BAD : NOTIFY_OK;
171} 165}
172 166
173static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier = 167static struct notifier_block __refdata cpuid_class_cpu_notifier =
174{ 168{
175 .notifier_call = cpuid_class_cpu_callback, 169 .notifier_call = cpuid_class_cpu_callback,
176}; 170};
@@ -193,7 +187,7 @@ static int __init cpuid_init(void)
193 } 187 }
194 for_each_online_cpu(i) { 188 for_each_online_cpu(i) {
195 err = cpuid_device_create(i); 189 err = cpuid_device_create(i);
196 if (err != 0) 190 if (err != 0)
197 goto out_class; 191 goto out_class;
198 } 192 }
199 register_hotcpu_notifier(&cpuid_class_cpu_notifier); 193 register_hotcpu_notifier(&cpuid_class_cpu_notifier);
@@ -208,7 +202,7 @@ out_class:
208 } 202 }
209 class_destroy(cpuid_class); 203 class_destroy(cpuid_class);
210out_chrdev: 204out_chrdev:
211 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 205 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
212out: 206out:
213 return err; 207 return err;
214} 208}
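The comment added to the driver header describes the new file-position encoding: the low 32 bits select the incoming %eax (the CPUID leaf) and the high 32 bits select %ecx (the sub-leaf). A hypothetical user-space reader built on that description might look like the sketch below; the device path follows the driver's own comment, everything else is illustrative.

/*
 * User-space sketch, assuming a 64-bit off_t (e.g. -D_FILE_OFFSET_BITS=64):
 * offset = (ecx << 32) | eax, read 16 bytes to get eax/ebx/ecx/edx back.
 */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];			/* eax, ebx, ecx, edx */
	uint32_t leaf = 4, subleaf = 1;		/* e.g. second cache level */
	off_t pos = ((off_t)subleaf << 32) | leaf;
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0)
		return 1;
	if (pread(fd, regs, sizeof(regs), pos) != sizeof(regs))
		return 1;
	printf("eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
	       regs[0], regs[1], regs[2], regs[3]);
	close(fd);
	return 0;
}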
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 40978af630e..a47798b59f0 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -17,7 +17,7 @@ static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
17 17
18static void doublefault_fn(void) 18static void doublefault_fn(void)
19{ 19{
20 struct Xgt_desc_struct gdt_desc = {0, 0}; 20 struct desc_ptr gdt_desc = {0, 0};
21 unsigned long gdt, tss; 21 unsigned long gdt, tss;
22 22
23 store_gdt(&gdt_desc); 23 store_gdt(&gdt_desc);
@@ -33,14 +33,15 @@ static void doublefault_fn(void)
33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss); 33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
34 34
35 if (ptr_ok(tss)) { 35 if (ptr_ok(tss)) {
36 struct i386_hw_tss *t = (struct i386_hw_tss *)tss; 36 struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
37 37
38 printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp); 38 printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
39 t->ip, t->sp);
39 40
40 printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", 41 printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
41 t->eax, t->ebx, t->ecx, t->edx); 42 t->ax, t->bx, t->cx, t->dx);
42 printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", 43 printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
43 t->esi, t->edi); 44 t->si, t->di);
44 } 45 }
45 } 46 }
46 47
@@ -50,15 +51,15 @@ static void doublefault_fn(void)
50 51
51struct tss_struct doublefault_tss __cacheline_aligned = { 52struct tss_struct doublefault_tss __cacheline_aligned = {
52 .x86_tss = { 53 .x86_tss = {
53 .esp0 = STACK_START, 54 .sp0 = STACK_START,
54 .ss0 = __KERNEL_DS, 55 .ss0 = __KERNEL_DS,
55 .ldt = 0, 56 .ldt = 0,
56 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, 57 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
57 58
58 .eip = (unsigned long) doublefault_fn, 59 .ip = (unsigned long) doublefault_fn,
59 /* 0x2 bit is always set */ 60 /* 0x2 bit is always set */
60 .eflags = X86_EFLAGS_SF | 0x2, 61 .flags = X86_EFLAGS_SF | 0x2,
61 .esp = STACK_START, 62 .sp = STACK_START,
62 .es = __USER_DS, 63 .es = __USER_DS,
63 .cs = __KERNEL_CS, 64 .cs = __KERNEL_CS,
64 .ss = __KERNEL_DS, 65 .ss = __KERNEL_DS,
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
new file mode 100644
index 00000000000..dcd918c1580
--- /dev/null
+++ b/arch/x86/kernel/ds.c
@@ -0,0 +1,464 @@
1/*
2 * Debug Store support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and
6 * precise-event based sampling (PEBS).
7 *
8 * Different architectures use a different DS layout/pointer size.
9 * The below functions therefore work on a void*.
10 *
11 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 *
15 *
16 * Copyright (C) 2007 Intel Corporation.
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */
19
20#include <asm/ds.h>
21
22#include <linux/errno.h>
23#include <linux/string.h>
24#include <linux/slab.h>
25
26
27/*
28 * Debug Store (DS) save area configuration (see Intel64 and IA32
29 * Architectures Software Developer's Manual, section 18.5)
30 *
31 * The DS configuration consists of the following fields; different
32 * architectures vary in the size of those fields.
33 * - double-word aligned base linear address of the BTS buffer
34 * - write pointer into the BTS buffer
35 * - end linear address of the BTS buffer (one byte beyond the end of
36 * the buffer)
37 * - interrupt pointer into BTS buffer
38 * (interrupt occurs when write pointer passes interrupt pointer)
39 * - double-word aligned base linear address of the PEBS buffer
40 * - write pointer into the PEBS buffer
41 * - end linear address of the PEBS buffer (one byte beyond the end of
42 * the buffer)
43 * - interrupt pointer into PEBS buffer
44 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow
46 *
47 * On later architectures, the last branch recording hardware uses
48 * 64bit pointers even in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not support it.
58 *
59 *
60 * In order to abstract from the actual DS and BTS layout, we describe
61 * the access to the relevant fields.
62 * Thanks to Andi Kleen for proposing this design.
63 *
64 * The implementation, however, is not as general as it might seem. In
65 * order to stay somewhat simple and efficient, we assume an
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */
69
70/*
71 * A special from_ip address to indicate that the BTS record is an
72 * info record that needs to be interpreted or skipped.
73 */
74#define BTS_ESCAPE_ADDRESS (-1)
75
76/*
77 * A field access descriptor
78 */
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82};
83
84/*
85 * The configuration for a particular DS/BTS hardware implementation.
86 */
87struct ds_configuration {
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104
105/*
106 * The global configuration used by the below accessor functions
107 */
108static struct ds_configuration ds_cfg;
109
110/*
111 * Accessor functions for some DS and BTS fields using the above
112 * global ptrace_bts_cfg.
113 */
114static inline unsigned long get_bts_buffer_base(char *base)
115{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset);
117}
118static inline void set_bts_buffer_base(char *base, unsigned long value)
119{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value;
121}
122static inline unsigned long get_bts_index(char *base)
123{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset);
125}
126static inline void set_bts_index(char *base, unsigned long value)
127{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value;
129}
130static inline unsigned long get_bts_absolute_maximum(char *base)
131{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset);
133}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value)
135{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
137}
138static inline unsigned long get_bts_interrupt_threshold(char *base)
139{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset);
141}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value)
143{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value;
145}
146static inline unsigned long get_from_ip(char *base)
147{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset);
149}
150static inline void set_from_ip(char *base, unsigned long value)
151{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value;
153}
154static inline unsigned long get_to_ip(char *base)
155{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset);
157}
158static inline void set_to_ip(char *base, unsigned long value)
159{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value;
161}
162static inline unsigned char get_info_type(char *base)
163{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset);
165}
166static inline void set_info_type(char *base, unsigned char value)
167{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value;
169}
170static inline unsigned long get_info_data(char *base)
171{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset);
173}
174static inline void set_info_data(char *base, unsigned long value)
175{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value;
177}
178
179
180int ds_allocate(void **dsp, size_t bts_size_in_bytes)
181{
182 size_t bts_size_in_records;
183 unsigned long bts;
184 void *ds;
185
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
187 return -EOPNOTSUPP;
188
189 if (bts_size_in_bytes < 0)
190 return -EINVAL;
191
192 bts_size_in_records =
193 bts_size_in_bytes / ds_cfg.sizeof_bts;
194 bts_size_in_bytes =
195 bts_size_in_records * ds_cfg.sizeof_bts;
196
197 if (bts_size_in_bytes <= 0)
198 return -EINVAL;
199
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL);
201
202 if (!bts)
203 return -ENOMEM;
204
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
206
207 if (!ds) {
208 kfree((void *)bts);
209 return -ENOMEM;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216
217 *dsp = ds;
218 return 0;
219}
220
221int ds_free(void **dsp)
222{
223 if (*dsp)
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227
228 return 0;
229}
230
231int ds_get_bts_size(void *ds)
232{
233 int size_in_bytes;
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245}
246
247int ds_get_bts_end(void *ds)
248{
249 int size_in_bytes = ds_get_bts_size(ds);
250
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253
254 return size_in_bytes / ds_cfg.sizeof_bts;
255}
256
257int ds_get_bts_index(void *ds)
258{
259 int index_offset_in_bytes;
260
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
262 return -EOPNOTSUPP;
263
264 index_offset_in_bytes =
265 get_bts_index(ds) -
266 get_bts_buffer_base(ds);
267
268 return index_offset_in_bytes / ds_cfg.sizeof_bts;
269}
270
271int ds_set_overflow(void *ds, int method)
272{
273 switch (method) {
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281}
282
283int ds_get_overflow(void *ds)
284{
285 return DS_O_WRAP;
286}
287
288int ds_clear(void *ds)
289{
290 int bts_size = ds_get_bts_size(ds);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301}
302
303int ds_read_bts(void *ds, int index, struct bts_struct *out)
304{
305 void *bts;
306
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
308 return -EOPNOTSUPP;
309
310 if (index < 0)
311 return -EINVAL;
312
313 if (index >= ds_get_bts_size(ds))
314 return -EINVAL;
315
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts));
317
318 memset(out, 0, sizeof(*out));
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327
328	return sizeof(*out);
329}
330
331int ds_write_bts(void *ds, const struct bts_struct *in)
332{
333 unsigned long bts;
334
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340
341 bts = get_bts_index(ds);
342
343 memset((void *)bts, 0, ds_cfg.sizeof_bts);
344 switch (in->qualifier) {
345 case BTS_INVALID:
346 break;
347
348 case BTS_BRANCH:
349 set_from_ip((void *)bts, in->variant.lbr.from_ip);
350 set_to_ip((void *)bts, in->variant.lbr.to_ip);
351 break;
352
353 case BTS_TASK_ARRIVES:
354 case BTS_TASK_DEPARTS:
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS);
356 set_info_type((void *)bts, in->qualifier);
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359
360 default:
361 return -EINVAL;
362 }
363
364 bts = bts + ds_cfg.sizeof_bts;
365 if (bts >= get_bts_absolute_maximum(ds))
366 bts = get_bts_buffer_base(ds);
367 set_bts_index(ds, bts);
368
369 return ds_cfg.sizeof_bts;
370}
371
372unsigned long ds_debugctl_mask(void)
373{
374 return ds_cfg.debugctl_mask;
375}
376
377#ifdef __i386__
378static const struct ds_configuration ds_cfg_netburst = {
379 .sizeof_ds = 9 * 4,
380 .bts_buffer_base = { 0, 4 },
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391
392static const struct ds_configuration ds_cfg_pentium_m = {
393 .sizeof_ds = 9 * 4,
394 .bts_buffer_base = { 0, 4 },
395 .bts_index = { 4, 4 },
396 .bts_absolute_maximum = { 8, 4 },
397 .bts_interrupt_threshold = { 12, 4 },
398 .sizeof_bts = 3 * 4,
399 .from_ip = { 0, 4 },
400 .to_ip = { 4, 4 },
401 .info_type = { 4, 1 },
402 .info_data = { 8, 4 },
403 .debugctl_mask = (1<<6)|(1<<7)
404};
405#endif /* _i386_ */
406
407static const struct ds_configuration ds_cfg_core2 = {
408 .sizeof_ds = 9 * 8,
409 .bts_buffer_base = { 0, 8 },
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419};
420
421static inline void
422ds_configure(const struct ds_configuration *cfg)
423{
424 ds_cfg = *cfg;
425}
426
427void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
428{
429 switch (c->x86) {
430 case 0x6:
431 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD:
434 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m);
436 break;
437#endif /* _i386_ */
438 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2);
440 break;
441 default:
442 /* sorry, don't know about them */
443 break;
444 }
445 break;
446 case 0xF:
447 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0:
450 case 0x1:
451 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst);
453 break;
454#endif /* _i386_ */
455 default:
456 /* sorry, don't know about them */
457 break;
458 }
459 break;
460 default:
461 /* sorry, don't know about them */
462 break;
463 }
464}
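The long comment at the top of ds.c explains that DS/BTS fields are reached through per-architecture access descriptors (an offset and a size) rather than a fixed struct, so 32-bit and 64-bit layouts can share one set of accessors. The sketch below shows that idea in isolation; the descriptor and field names are illustrative, and memcpy is used in place of the kernel's direct unsigned-long casts.

/*
 * Self-contained sketch of the access-descriptor pattern; not kernel API.
 */
#include <stdio.h>
#include <string.h>

struct field_desc { unsigned char offset; unsigned char size; };

/* pretend configuration: "index" lives 8 bytes into the save area */
static const struct field_desc bts_index = { 8, sizeof(unsigned long) };

static unsigned long get_field(const char *base, struct field_desc d)
{
	unsigned long v = 0;

	memcpy(&v, base + d.offset, d.size);
	return v;
}

static void set_field(char *base, struct field_desc d, unsigned long v)
{
	memcpy(base + d.offset, &v, d.size);
}

int main(void)
{
	char ds_area[64] = { 0 };

	set_field(ds_area, bts_index, 0x1234);
	printf("bts_index = 0x%lx\n", get_field(ds_area, bts_index));
	return 0;
}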
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 18f500d185a..4e16ef4a265 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -7,7 +7,6 @@
7#include <linux/kexec.h> 7#include <linux/kexec.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h> 10#include <linux/pfn.h>
12#include <linux/uaccess.h> 11#include <linux/uaccess.h>
13#include <linux/suspend.h> 12#include <linux/suspend.h>
@@ -17,11 +16,6 @@
17#include <asm/e820.h> 16#include <asm/e820.h>
18#include <asm/setup.h> 17#include <asm/setup.h>
19 18
20#ifdef CONFIG_EFI
21int efi_enabled = 0;
22EXPORT_SYMBOL(efi_enabled);
23#endif
24
25struct e820map e820; 19struct e820map e820;
26struct change_member { 20struct change_member {
27 struct e820entry *pbios; /* pointer to original bios entry */ 21 struct e820entry *pbios; /* pointer to original bios entry */
@@ -37,26 +31,6 @@ unsigned long pci_mem_start = 0x10000000;
37EXPORT_SYMBOL(pci_mem_start); 31EXPORT_SYMBOL(pci_mem_start);
38#endif 32#endif
39extern int user_defined_memmap; 33extern int user_defined_memmap;
40struct resource data_resource = {
41 .name = "Kernel data",
42 .start = 0,
43 .end = 0,
44 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
45};
46
47struct resource code_resource = {
48 .name = "Kernel code",
49 .start = 0,
50 .end = 0,
51 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
52};
53
54struct resource bss_resource = {
55 .name = "Kernel bss",
56 .start = 0,
57 .end = 0,
58 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
59};
60 34
61static struct resource system_rom_resource = { 35static struct resource system_rom_resource = {
62 .name = "System ROM", 36 .name = "System ROM",
@@ -111,60 +85,6 @@ static struct resource video_rom_resource = {
111 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM 85 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
112}; 86};
113 87
114static struct resource video_ram_resource = {
115 .name = "Video RAM area",
116 .start = 0xa0000,
117 .end = 0xbffff,
118 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
119};
120
121static struct resource standard_io_resources[] = { {
122 .name = "dma1",
123 .start = 0x0000,
124 .end = 0x001f,
125 .flags = IORESOURCE_BUSY | IORESOURCE_IO
126}, {
127 .name = "pic1",
128 .start = 0x0020,
129 .end = 0x0021,
130 .flags = IORESOURCE_BUSY | IORESOURCE_IO
131}, {
132 .name = "timer0",
133 .start = 0x0040,
134 .end = 0x0043,
135 .flags = IORESOURCE_BUSY | IORESOURCE_IO
136}, {
137 .name = "timer1",
138 .start = 0x0050,
139 .end = 0x0053,
140 .flags = IORESOURCE_BUSY | IORESOURCE_IO
141}, {
142 .name = "keyboard",
143 .start = 0x0060,
144 .end = 0x006f,
145 .flags = IORESOURCE_BUSY | IORESOURCE_IO
146}, {
147 .name = "dma page reg",
148 .start = 0x0080,
149 .end = 0x008f,
150 .flags = IORESOURCE_BUSY | IORESOURCE_IO
151}, {
152 .name = "pic2",
153 .start = 0x00a0,
154 .end = 0x00a1,
155 .flags = IORESOURCE_BUSY | IORESOURCE_IO
156}, {
157 .name = "dma2",
158 .start = 0x00c0,
159 .end = 0x00df,
160 .flags = IORESOURCE_BUSY | IORESOURCE_IO
161}, {
162 .name = "fpu",
163 .start = 0x00f0,
164 .end = 0x00ff,
165 .flags = IORESOURCE_BUSY | IORESOURCE_IO
166} };
167
168#define ROMSIGNATURE 0xaa55 88#define ROMSIGNATURE 0xaa55
169 89
170static int __init romsignature(const unsigned char *rom) 90static int __init romsignature(const unsigned char *rom)
@@ -260,10 +180,9 @@ static void __init probe_roms(void)
260 * Request address space for all standard RAM and ROM resources 180 * Request address space for all standard RAM and ROM resources
261 * and also for regions reported as reserved by the e820. 181 * and also for regions reported as reserved by the e820.
262 */ 182 */
263static void __init 183void __init init_iomem_resources(struct resource *code_resource,
264legacy_init_iomem_resources(struct resource *code_resource, 184 struct resource *data_resource,
265 struct resource *data_resource, 185 struct resource *bss_resource)
266 struct resource *bss_resource)
267{ 186{
268 int i; 187 int i;
269 188
@@ -305,35 +224,6 @@ legacy_init_iomem_resources(struct resource *code_resource,
305 } 224 }
306} 225}
307 226
308/*
309 * Request address space for all standard resources
310 *
311 * This is called just before pcibios_init(), which is also a
312 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
313 */
314static int __init request_standard_resources(void)
315{
316 int i;
317
318 printk("Setting up standard PCI resources\n");
319 if (efi_enabled)
320 efi_initialize_iomem_resources(&code_resource,
321 &data_resource, &bss_resource);
322 else
323 legacy_init_iomem_resources(&code_resource,
324 &data_resource, &bss_resource);
325
326 /* EFI systems may still have VGA */
327 request_resource(&iomem_resource, &video_ram_resource);
328
329 /* request I/O space for devices used on all i[345]86 PCs */
330 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
331 request_resource(&ioport_resource, &standard_io_resources[i]);
332 return 0;
333}
334
335subsys_initcall(request_standard_resources);
336
337#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) 227#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
338/** 228/**
339 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not 229 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
@@ -370,19 +260,17 @@ void __init add_memory_region(unsigned long long start,
370{ 260{
371 int x; 261 int x;
372 262
373 if (!efi_enabled) { 263 x = e820.nr_map;
374 x = e820.nr_map;
375
376 if (x == E820MAX) {
377 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
378 return;
379 }
380 264
381 e820.map[x].addr = start; 265 if (x == E820MAX) {
382 e820.map[x].size = size; 266 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
383 e820.map[x].type = type; 267 return;
384 e820.nr_map++;
385 } 268 }
269
270 e820.map[x].addr = start;
271 e820.map[x].size = size;
272 e820.map[x].type = type;
273 e820.nr_map++;
386} /* add_memory_region */ 274} /* add_memory_region */
387 275
388/* 276/*
@@ -598,29 +486,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
598} 486}
599 487
600/* 488/*
601 * Callback for efi_memory_walk.
602 */
603static int __init
604efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
605{
606 unsigned long *max_pfn = arg, pfn;
607
608 if (start < end) {
609 pfn = PFN_UP(end -1);
610 if (pfn > *max_pfn)
611 *max_pfn = pfn;
612 }
613 return 0;
614}
615
616static int __init
617efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
618{
619 memory_present(0, PFN_UP(start), PFN_DOWN(end));
620 return 0;
621}
622
623/*
624 * Find the highest page frame number we have available 489 * Find the highest page frame number we have available
625 */ 490 */
626void __init find_max_pfn(void) 491void __init find_max_pfn(void)
@@ -628,11 +493,6 @@ void __init find_max_pfn(void)
628 int i; 493 int i;
629 494
630 max_pfn = 0; 495 max_pfn = 0;
631 if (efi_enabled) {
632 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
633 efi_memmap_walk(efi_memory_present_wrapper, NULL);
634 return;
635 }
636 496
637 for (i = 0; i < e820.nr_map; i++) { 497 for (i = 0; i < e820.nr_map; i++) {
638 unsigned long start, end; 498 unsigned long start, end;
@@ -650,34 +510,12 @@ void __init find_max_pfn(void)
650} 510}
651 511
652/* 512/*
653 * Free all available memory for boot time allocation. Used
654 * as a callback function by efi_memory_walk()
655 */
656
657static int __init
658free_available_memory(unsigned long start, unsigned long end, void *arg)
659{
660 /* check max_low_pfn */
661 if (start >= (max_low_pfn << PAGE_SHIFT))
662 return 0;
663 if (end >= (max_low_pfn << PAGE_SHIFT))
664 end = max_low_pfn << PAGE_SHIFT;
665 if (start < end)
666 free_bootmem(start, end - start);
667
668 return 0;
669}
670/*
671 * Register fully available low RAM pages with the bootmem allocator. 513 * Register fully available low RAM pages with the bootmem allocator.
672 */ 514 */
673void __init register_bootmem_low_pages(unsigned long max_low_pfn) 515void __init register_bootmem_low_pages(unsigned long max_low_pfn)
674{ 516{
675 int i; 517 int i;
676 518
677 if (efi_enabled) {
678 efi_memmap_walk(free_available_memory, NULL);
679 return;
680 }
681 for (i = 0; i < e820.nr_map; i++) { 519 for (i = 0; i < e820.nr_map; i++) {
682 unsigned long curr_pfn, last_pfn, size; 520 unsigned long curr_pfn, last_pfn, size;
683 /* 521 /*
@@ -785,56 +623,12 @@ void __init print_memory_map(char *who)
785 } 623 }
786} 624}
787 625
788static __init __always_inline void efi_limit_regions(unsigned long long size)
789{
790 unsigned long long current_addr = 0;
791 efi_memory_desc_t *md, *next_md;
792 void *p, *p1;
793 int i, j;
794
795 j = 0;
796 p1 = memmap.map;
797 for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
798 md = p;
799 next_md = p1;
800 current_addr = md->phys_addr +
801 PFN_PHYS(md->num_pages);
802 if (is_available_memory(md)) {
803 if (md->phys_addr >= size) continue;
804 memcpy(next_md, md, memmap.desc_size);
805 if (current_addr >= size) {
806 next_md->num_pages -=
807 PFN_UP(current_addr-size);
808 }
809 p1 += memmap.desc_size;
810 next_md = p1;
811 j++;
812 } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
813 EFI_MEMORY_RUNTIME) {
814 /* In order to make runtime services
815 * available we have to include runtime
816 * memory regions in memory map */
817 memcpy(next_md, md, memmap.desc_size);
818 p1 += memmap.desc_size;
819 next_md = p1;
820 j++;
821 }
822 }
823 memmap.nr_map = j;
824 memmap.map_end = memmap.map +
825 (memmap.nr_map * memmap.desc_size);
826}
827
828void __init limit_regions(unsigned long long size) 626void __init limit_regions(unsigned long long size)
829{ 627{
830 unsigned long long current_addr; 628 unsigned long long current_addr;
831 int i; 629 int i;
832 630
833 print_memory_map("limit_regions start"); 631 print_memory_map("limit_regions start");
834 if (efi_enabled) {
835 efi_limit_regions(size);
836 return;
837 }
838 for (i = 0; i < e820.nr_map; i++) { 632 for (i = 0; i < e820.nr_map; i++) {
839 current_addr = e820.map[i].addr + e820.map[i].size; 633 current_addr = e820.map[i].addr + e820.map[i].size;
840 if (current_addr < size) 634 if (current_addr < size)
@@ -955,3 +749,14 @@ static int __init parse_memmap(char *arg)
955 return 0; 749 return 0;
956} 750}
957early_param("memmap", parse_memmap); 751early_param("memmap", parse_memmap);
752void __init update_e820(void)
753{
754 u8 nr_map;
755
756 nr_map = e820.nr_map;
757 if (sanitize_e820_map(e820.map, &nr_map))
758 return;
759 e820.nr_map = nr_map;
760 printk(KERN_INFO "modified physical RAM map:\n");
761 print_memory_map("modified");
762}
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 04698e0b056..9f65b4cc323 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -1,4 +1,4 @@
1/* 1/*
2 * Handle the memory map. 2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over. 3 * The functions here do the job until bootmem takes over.
4 * 4 *
@@ -26,80 +26,92 @@
26#include <asm/proto.h> 26#include <asm/proto.h>
27#include <asm/setup.h> 27#include <asm/setup.h>
28#include <asm/sections.h> 28#include <asm/sections.h>
29#include <asm/kdebug.h>
29 30
30struct e820map e820; 31struct e820map e820;
31 32
32/* 33/*
33 * PFN of last memory page. 34 * PFN of last memory page.
34 */ 35 */
35unsigned long end_pfn; 36unsigned long end_pfn;
36EXPORT_SYMBOL(end_pfn);
37 37
38/* 38/*
39 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. 39 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
40 * The direct mapping extends to end_pfn_map, so that we can directly access 40 * The direct mapping extends to end_pfn_map, so that we can directly access
41 * apertures, ACPI and other tables without having to play with fixmaps. 41 * apertures, ACPI and other tables without having to play with fixmaps.
42 */ 42 */
43unsigned long end_pfn_map; 43unsigned long end_pfn_map;
44 44
45/* 45/*
46 * Last pfn which the user wants to use. 46 * Last pfn which the user wants to use.
47 */ 47 */
48static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; 48static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
49 49
50extern struct resource code_resource, data_resource, bss_resource; 50/*
51 51 * Early reserved memory areas.
52/* Check for some hardcoded bad areas that early boot is not allowed to touch */ 52 */
53static inline int bad_addr(unsigned long *addrp, unsigned long size) 53#define MAX_EARLY_RES 20
54{ 54
55 unsigned long addr = *addrp, last = addr + size; 55struct early_res {
56 56 unsigned long start, end;
57 /* various gunk below that needed for SMP startup */ 57 char name[16];
58 if (addr < 0x8000) { 58};
59 *addrp = PAGE_ALIGN(0x8000); 59static struct early_res early_res[MAX_EARLY_RES] __initdata = {
60 return 1; 60 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
61 } 61#ifdef CONFIG_SMP
62 62 { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
63 /* direct mapping tables of the kernel */
64 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
65 *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
66 return 1;
67 }
68
69 /* initrd */
70#ifdef CONFIG_BLK_DEV_INITRD
71 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
72 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
73 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
74 unsigned long ramdisk_end = ramdisk_image+ramdisk_size;
75
76 if (last >= ramdisk_image && addr < ramdisk_end) {
77 *addrp = PAGE_ALIGN(ramdisk_end);
78 return 1;
79 }
80 }
81#endif 63#endif
82 /* kernel code */ 64 {}
83 if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { 65};
84 *addrp = PAGE_ALIGN(__pa_symbol(&_end)); 66
85 return 1; 67void __init reserve_early(unsigned long start, unsigned long end, char *name)
68{
69 int i;
70 struct early_res *r;
71 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
72 r = &early_res[i];
73 if (end > r->start && start < r->end)
74 panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
75 start, end - 1, name?name:"", r->start, r->end - 1, r->name);
86 } 76 }
77 if (i >= MAX_EARLY_RES)
78 panic("Too many early reservations");
79 r = &early_res[i];
80 r->start = start;
81 r->end = end;
82 if (name)
83 strncpy(r->name, name, sizeof(r->name) - 1);
84}
87 85
88 if (last >= ebda_addr && addr < ebda_addr + ebda_size) { 86void __init early_res_to_bootmem(void)
89 *addrp = PAGE_ALIGN(ebda_addr + ebda_size); 87{
90 return 1; 88 int i;
89 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
90 struct early_res *r = &early_res[i];
91 printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
92 r->start, r->end - 1, r->name);
93 reserve_bootmem_generic(r->start, r->end - r->start);
91 } 94 }
95}
92 96
93#ifdef CONFIG_NUMA 97/* Check for already reserved areas */
94 /* NUMA memory to node map */ 98static inline int bad_addr(unsigned long *addrp, unsigned long size)
95 if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { 99{
96 *addrp = nodemap_addr + nodemap_size; 100 int i;
97 return 1; 101 unsigned long addr = *addrp, last;
102 int changed = 0;
103again:
104 last = addr + size;
105 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
106 struct early_res *r = &early_res[i];
107 if (last >= r->start && addr < r->end) {
108 *addrp = addr = r->end;
109 changed = 1;
110 goto again;
111 }
98 } 112 }
99#endif 113 return changed;
100 /* XXX ramdisk image here? */ 114}
101 return 0;
102}
103 115
104/* 116/*
105 * This function checks if any part of the range <start,end> is mapped 117 * This function checks if any part of the range <start,end> is mapped
@@ -107,16 +119,18 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
107 */ 119 */
108int 120int
109e820_any_mapped(unsigned long start, unsigned long end, unsigned type) 121e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
110{ 122{
111 int i; 123 int i;
112 for (i = 0; i < e820.nr_map; i++) { 124
113 struct e820entry *ei = &e820.map[i]; 125 for (i = 0; i < e820.nr_map; i++) {
114 if (type && ei->type != type) 126 struct e820entry *ei = &e820.map[i];
127
128 if (type && ei->type != type)
115 continue; 129 continue;
116 if (ei->addr >= end || ei->addr + ei->size <= start) 130 if (ei->addr >= end || ei->addr + ei->size <= start)
117 continue; 131 continue;
118 return 1; 132 return 1;
119 } 133 }
120 return 0; 134 return 0;
121} 135}
122EXPORT_SYMBOL_GPL(e820_any_mapped); 136EXPORT_SYMBOL_GPL(e820_any_mapped);
@@ -127,11 +141,14 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
127 * Note: this function only works correct if the e820 table is sorted and 141 * Note: this function only works correct if the e820 table is sorted and
128 * not-overlapping, which is the case 142 * not-overlapping, which is the case
129 */ 143 */
130int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) 144int __init e820_all_mapped(unsigned long start, unsigned long end,
145 unsigned type)
131{ 146{
132 int i; 147 int i;
148
133 for (i = 0; i < e820.nr_map; i++) { 149 for (i = 0; i < e820.nr_map; i++) {
134 struct e820entry *ei = &e820.map[i]; 150 struct e820entry *ei = &e820.map[i];
151
135 if (type && ei->type != type) 152 if (type && ei->type != type)
136 continue; 153 continue;
137 /* is the region (part) in overlap with the current region ?*/ 154 /* is the region (part) in overlap with the current region ?*/
@@ -143,65 +160,75 @@ int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type
143 */ 160 */
144 if (ei->addr <= start) 161 if (ei->addr <= start)
145 start = ei->addr + ei->size; 162 start = ei->addr + ei->size;
146 /* if start is now at or beyond end, we're done, full coverage */ 163 /*
164 * if start is now at or beyond end, we're done, full
165 * coverage
166 */
147 if (start >= end) 167 if (start >= end)
148 return 1; /* we're done */ 168 return 1;
149 } 169 }
150 return 0; 170 return 0;
151} 171}
152 172
153/* 173/*
154 * Find a free area in a specific range. 174 * Find a free area with specified alignment in a specific range.
155 */ 175 */
156unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) 176unsigned long __init find_e820_area(unsigned long start, unsigned long end,
157{ 177 unsigned size, unsigned long align)
158 int i; 178{
159 for (i = 0; i < e820.nr_map; i++) { 179 int i;
160 struct e820entry *ei = &e820.map[i]; 180 unsigned long mask = ~(align - 1);
161 unsigned long addr = ei->addr, last; 181
162 if (ei->type != E820_RAM) 182 for (i = 0; i < e820.nr_map; i++) {
163 continue; 183 struct e820entry *ei = &e820.map[i];
164 if (addr < start) 184 unsigned long addr = ei->addr, last;
185
186 if (ei->type != E820_RAM)
187 continue;
188 if (addr < start)
165 addr = start; 189 addr = start;
166 if (addr > ei->addr + ei->size) 190 if (addr > ei->addr + ei->size)
167 continue; 191 continue;
168 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) 192 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
169 ; 193 ;
170 last = PAGE_ALIGN(addr) + size; 194 addr = (addr + align - 1) & mask;
195 last = addr + size;
171 if (last > ei->addr + ei->size) 196 if (last > ei->addr + ei->size)
172 continue; 197 continue;
173 if (last > end) 198 if (last > end)
174 continue; 199 continue;
175 return addr; 200 return addr;
176 } 201 }
177 return -1UL; 202 return -1UL;
178} 203}
179 204
180/* 205/*
181 * Find the highest page frame number we have available 206 * Find the highest page frame number we have available
182 */ 207 */
183unsigned long __init e820_end_of_ram(void) 208unsigned long __init e820_end_of_ram(void)
184{ 209{
185 unsigned long end_pfn = 0; 210 unsigned long end_pfn;
211
186 end_pfn = find_max_pfn_with_active_regions(); 212 end_pfn = find_max_pfn_with_active_regions();
187 213
188 if (end_pfn > end_pfn_map) 214 if (end_pfn > end_pfn_map)
189 end_pfn_map = end_pfn; 215 end_pfn_map = end_pfn;
190 if (end_pfn_map > MAXMEM>>PAGE_SHIFT) 216 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
191 end_pfn_map = MAXMEM>>PAGE_SHIFT; 217 end_pfn_map = MAXMEM>>PAGE_SHIFT;
192 if (end_pfn > end_user_pfn) 218 if (end_pfn > end_user_pfn)
193 end_pfn = end_user_pfn; 219 end_pfn = end_user_pfn;
194 if (end_pfn > end_pfn_map) 220 if (end_pfn > end_pfn_map)
195 end_pfn = end_pfn_map; 221 end_pfn = end_pfn_map;
196 222
197 printk("end_pfn_map = %lu\n", end_pfn_map); 223 printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
198 return end_pfn; 224 return end_pfn;
199} 225}
200 226
201/* 227/*
202 * Mark e820 reserved areas as busy for the resource manager. 228 * Mark e820 reserved areas as busy for the resource manager.
203 */ 229 */
204void __init e820_reserve_resources(void) 230void __init e820_reserve_resources(struct resource *code_resource,
231 struct resource *data_resource, struct resource *bss_resource)
205{ 232{
206 int i; 233 int i;
207 for (i = 0; i < e820.nr_map; i++) { 234 for (i = 0; i < e820.nr_map; i++) {
@@ -219,13 +246,13 @@ void __init e820_reserve_resources(void)
219 request_resource(&iomem_resource, res); 246 request_resource(&iomem_resource, res);
220 if (e820.map[i].type == E820_RAM) { 247 if (e820.map[i].type == E820_RAM) {
221 /* 248 /*
222 * We don't know which RAM region contains kernel data, 249 * We don't know which RAM region contains kernel data,
223 * so we try it repeatedly and let the resource manager 250 * so we try it repeatedly and let the resource manager
224 * test it. 251 * test it.
225 */ 252 */
226 request_resource(res, &code_resource); 253 request_resource(res, code_resource);
227 request_resource(res, &data_resource); 254 request_resource(res, data_resource);
228 request_resource(res, &bss_resource); 255 request_resource(res, bss_resource);
229#ifdef CONFIG_KEXEC 256#ifdef CONFIG_KEXEC
230 if (crashk_res.start != crashk_res.end) 257 if (crashk_res.start != crashk_res.end)
231 request_resource(res, &crashk_res); 258 request_resource(res, &crashk_res);
@@ -322,9 +349,9 @@ e820_register_active_regions(int nid, unsigned long start_pfn,
322 add_active_range(nid, ei_startpfn, ei_endpfn); 349 add_active_range(nid, ei_startpfn, ei_endpfn);
323} 350}
324 351
325/* 352/*
326 * Add a memory region to the kernel e820 map. 353 * Add a memory region to the kernel e820 map.
327 */ 354 */
328void __init add_memory_region(unsigned long start, unsigned long size, int type) 355void __init add_memory_region(unsigned long start, unsigned long size, int type)
329{ 356{
330 int x = e820.nr_map; 357 int x = e820.nr_map;
@@ -349,9 +376,7 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
349{ 376{
350 unsigned long start_pfn = start >> PAGE_SHIFT; 377 unsigned long start_pfn = start >> PAGE_SHIFT;
351 unsigned long end_pfn = end >> PAGE_SHIFT; 378 unsigned long end_pfn = end >> PAGE_SHIFT;
352 unsigned long ei_startpfn; 379 unsigned long ei_startpfn, ei_endpfn, ram = 0;
353 unsigned long ei_endpfn;
354 unsigned long ram = 0;
355 int i; 380 int i;
356 381
357 for (i = 0; i < e820.nr_map; i++) { 382 for (i = 0; i < e820.nr_map; i++) {
@@ -363,28 +388,31 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
363 return end - start - (ram << PAGE_SHIFT); 388 return end - start - (ram << PAGE_SHIFT);
364} 389}
365 390
366void __init e820_print_map(char *who) 391static void __init e820_print_map(char *who)
367{ 392{
368 int i; 393 int i;
369 394
370 for (i = 0; i < e820.nr_map; i++) { 395 for (i = 0; i < e820.nr_map; i++) {
371 printk(KERN_INFO " %s: %016Lx - %016Lx ", who, 396 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
372 (unsigned long long) e820.map[i].addr, 397 (unsigned long long) e820.map[i].addr,
373 (unsigned long long) (e820.map[i].addr + e820.map[i].size)); 398 (unsigned long long)
399 (e820.map[i].addr + e820.map[i].size));
374 switch (e820.map[i].type) { 400 switch (e820.map[i].type) {
375 case E820_RAM: printk("(usable)\n"); 401 case E820_RAM:
376 break; 402 printk(KERN_CONT "(usable)\n");
403 break;
377 case E820_RESERVED: 404 case E820_RESERVED:
378 printk("(reserved)\n"); 405 printk(KERN_CONT "(reserved)\n");
379 break; 406 break;
380 case E820_ACPI: 407 case E820_ACPI:
381 printk("(ACPI data)\n"); 408 printk(KERN_CONT "(ACPI data)\n");
382 break; 409 break;
383 case E820_NVS: 410 case E820_NVS:
384 printk("(ACPI NVS)\n"); 411 printk(KERN_CONT "(ACPI NVS)\n");
385 break; 412 break;
386 default: printk("type %u\n", e820.map[i].type); 413 default:
387 break; 414 printk(KERN_CONT "type %u\n", e820.map[i].type);
415 break;
388 } 416 }
389 } 417 }
390} 418}
@@ -392,11 +420,11 @@ void __init e820_print_map(char *who)
392/* 420/*
393 * Sanitize the BIOS e820 map. 421 * Sanitize the BIOS e820 map.
394 * 422 *
395 * Some e820 responses include overlapping entries. The following 423 * Some e820 responses include overlapping entries. The following
396 * replaces the original e820 map with a new one, removing overlaps. 424 * replaces the original e820 map with a new one, removing overlaps.
397 * 425 *
398 */ 426 */
399static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) 427static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
400{ 428{
401 struct change_member { 429 struct change_member {
402 struct e820entry *pbios; /* pointer to original bios entry */ 430 struct e820entry *pbios; /* pointer to original bios entry */
@@ -416,7 +444,8 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
416 int i; 444 int i;
417 445
418 /* 446 /*
419 Visually we're performing the following (1,2,3,4 = memory types)... 447 Visually we're performing the following
448 (1,2,3,4 = memory types)...
420 449
421 Sample memory map (w/overlaps): 450 Sample memory map (w/overlaps):
422 ____22__________________ 451 ____22__________________
@@ -458,22 +487,23 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
458 old_nr = *pnr_map; 487 old_nr = *pnr_map;
459 488
460 /* bail out if we find any unreasonable addresses in bios map */ 489 /* bail out if we find any unreasonable addresses in bios map */
461 for (i=0; i<old_nr; i++) 490 for (i = 0; i < old_nr; i++)
462 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) 491 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
463 return -1; 492 return -1;
464 493
465 /* create pointers for initial change-point information (for sorting) */ 494 /* create pointers for initial change-point information (for sorting) */
466 for (i=0; i < 2*old_nr; i++) 495 for (i = 0; i < 2 * old_nr; i++)
467 change_point[i] = &change_point_list[i]; 496 change_point[i] = &change_point_list[i];
468 497
469 /* record all known change-points (starting and ending addresses), 498 /* record all known change-points (starting and ending addresses),
470 omitting those that are for empty memory regions */ 499 omitting those that are for empty memory regions */
471 chgidx = 0; 500 chgidx = 0;
472 for (i=0; i < old_nr; i++) { 501 for (i = 0; i < old_nr; i++) {
473 if (biosmap[i].size != 0) { 502 if (biosmap[i].size != 0) {
474 change_point[chgidx]->addr = biosmap[i].addr; 503 change_point[chgidx]->addr = biosmap[i].addr;
475 change_point[chgidx++]->pbios = &biosmap[i]; 504 change_point[chgidx++]->pbios = &biosmap[i];
476 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; 505 change_point[chgidx]->addr = biosmap[i].addr +
506 biosmap[i].size;
477 change_point[chgidx++]->pbios = &biosmap[i]; 507 change_point[chgidx++]->pbios = &biosmap[i];
478 } 508 }
479 } 509 }
@@ -483,75 +513,106 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
483 still_changing = 1; 513 still_changing = 1;
484 while (still_changing) { 514 while (still_changing) {
485 still_changing = 0; 515 still_changing = 0;
486 for (i=1; i < chg_nr; i++) { 516 for (i = 1; i < chg_nr; i++) {
487 /* if <current_addr> > <last_addr>, swap */ 517 unsigned long long curaddr, lastaddr;
488 /* or, if current=<start_addr> & last=<end_addr>, swap */ 518 unsigned long long curpbaddr, lastpbaddr;
489 if ((change_point[i]->addr < change_point[i-1]->addr) || 519
490 ((change_point[i]->addr == change_point[i-1]->addr) && 520 curaddr = change_point[i]->addr;
491 (change_point[i]->addr == change_point[i]->pbios->addr) && 521 lastaddr = change_point[i - 1]->addr;
492 (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) 522 curpbaddr = change_point[i]->pbios->addr;
493 ) 523 lastpbaddr = change_point[i - 1]->pbios->addr;
494 { 524
525 /*
526 * swap entries, when:
527 *
528 * curaddr > lastaddr or
529 * curaddr == lastaddr and curaddr == curpbaddr and
530 * lastaddr != lastpbaddr
531 */
532 if (curaddr < lastaddr ||
533 (curaddr == lastaddr && curaddr == curpbaddr &&
534 lastaddr != lastpbaddr)) {
495 change_tmp = change_point[i]; 535 change_tmp = change_point[i];
496 change_point[i] = change_point[i-1]; 536 change_point[i] = change_point[i-1];
497 change_point[i-1] = change_tmp; 537 change_point[i-1] = change_tmp;
498 still_changing=1; 538 still_changing = 1;
499 } 539 }
500 } 540 }
501 } 541 }
502 542
503 /* create a new bios memory map, removing overlaps */ 543 /* create a new bios memory map, removing overlaps */
504 overlap_entries=0; /* number of entries in the overlap table */ 544 overlap_entries = 0; /* number of entries in the overlap table */
505 new_bios_entry=0; /* index for creating new bios map entries */ 545 new_bios_entry = 0; /* index for creating new bios map entries */
506 last_type = 0; /* start with undefined memory type */ 546 last_type = 0; /* start with undefined memory type */
507 last_addr = 0; /* start with 0 as last starting address */ 547 last_addr = 0; /* start with 0 as last starting address */
548
508 /* loop through change-points, determining effect on the new bios map */ 549 /* loop through change-points, determining effect on the new bios map */
509 for (chgidx=0; chgidx < chg_nr; chgidx++) 550 for (chgidx = 0; chgidx < chg_nr; chgidx++) {
510 {
511 /* keep track of all overlapping bios entries */ 551 /* keep track of all overlapping bios entries */
512 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) 552 if (change_point[chgidx]->addr ==
513 { 553 change_point[chgidx]->pbios->addr) {
514 /* add map entry to overlap list (> 1 entry implies an overlap) */ 554 /*
515 overlap_list[overlap_entries++]=change_point[chgidx]->pbios; 555 * add map entry to overlap list (> 1 entry
516 } 556 * implies an overlap)
517 else 557 */
518 { 558 overlap_list[overlap_entries++] =
519 /* remove entry from list (order independent, so swap with last) */ 559 change_point[chgidx]->pbios;
520 for (i=0; i<overlap_entries; i++) 560 } else {
521 { 561 /*
522 if (overlap_list[i] == change_point[chgidx]->pbios) 562 * remove entry from list (order independent,
523 overlap_list[i] = overlap_list[overlap_entries-1]; 563 * so swap with last)
564 */
565 for (i = 0; i < overlap_entries; i++) {
566 if (overlap_list[i] ==
567 change_point[chgidx]->pbios)
568 overlap_list[i] =
569 overlap_list[overlap_entries-1];
524 } 570 }
525 overlap_entries--; 571 overlap_entries--;
526 } 572 }
527 /* if there are overlapping entries, decide which "type" to use */ 573 /*
528 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ 574 * if there are overlapping entries, decide which
575 * "type" to use (larger value takes precedence --
576 * 1=usable, 2,3,4,4+=unusable)
577 */
529 current_type = 0; 578 current_type = 0;
530 for (i=0; i<overlap_entries; i++) 579 for (i = 0; i < overlap_entries; i++)
531 if (overlap_list[i]->type > current_type) 580 if (overlap_list[i]->type > current_type)
532 current_type = overlap_list[i]->type; 581 current_type = overlap_list[i]->type;
533 /* continue building up new bios map based on this information */ 582 /*
583 * continue building up new bios map based on this
584 * information
585 */
534 if (current_type != last_type) { 586 if (current_type != last_type) {
535 if (last_type != 0) { 587 if (last_type != 0) {
536 new_bios[new_bios_entry].size = 588 new_bios[new_bios_entry].size =
537 change_point[chgidx]->addr - last_addr; 589 change_point[chgidx]->addr - last_addr;
538 /* move forward only if the new size was non-zero */ 590 /*
591 * move forward only if the new size
592 * was non-zero
593 */
539 if (new_bios[new_bios_entry].size != 0) 594 if (new_bios[new_bios_entry].size != 0)
595 /*
596 * no more space left for new
597 * bios entries ?
598 */
540 if (++new_bios_entry >= E820MAX) 599 if (++new_bios_entry >= E820MAX)
541 break; /* no more space left for new bios entries */ 600 break;
542 } 601 }
543 if (current_type != 0) { 602 if (current_type != 0) {
544 new_bios[new_bios_entry].addr = change_point[chgidx]->addr; 603 new_bios[new_bios_entry].addr =
604 change_point[chgidx]->addr;
545 new_bios[new_bios_entry].type = current_type; 605 new_bios[new_bios_entry].type = current_type;
546 last_addr=change_point[chgidx]->addr; 606 last_addr = change_point[chgidx]->addr;
547 } 607 }
548 last_type = current_type; 608 last_type = current_type;
549 } 609 }
550 } 610 }
551 new_nr = new_bios_entry; /* retain count for new bios entries */ 611 /* retain count for new bios entries */
612 new_nr = new_bios_entry;
552 613
553 /* copy new bios mapping into original location */ 614 /* copy new bios mapping into original location */
554 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); 615 memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
555 *pnr_map = new_nr; 616 *pnr_map = new_nr;
556 617
557 return 0; 618 return 0;
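Most of the sanitize_e820_map() hunk above is re-indentation, so the algorithm is easy to lose in the noise: every non-empty entry contributes two change points (its start and its end), the points are sorted by address, and a single sweep tracks which entries are currently open, emitting an output entry whenever the highest open type changes. The following is a compressed standalone sketch of that sweep in userspace C; the fixed-size arrays, the sample input and the bubble sort are illustrative only, and the kernel's extra tie-break for coincident start/end addresses and its E820MAX/zero-size guards are omitted.

#include <stdio.h>

struct entry { unsigned long long addr, size; unsigned type; };
struct cp    { unsigned long long addr; struct entry *e; };

/* two overlapping inputs: a reserved entry (type 2) inside a RAM entry (type 1) */
static struct entry in[] = {
	{ 0x000000, 0x800000, 1 },
	{ 0x600000, 0x100000, 2 },
};
#define NR (sizeof(in) / sizeof(in[0]))

int main(void)
{
	struct cp cps[2 * NR];
	struct entry *live[2 * NR], out[2 * NR];
	unsigned n = 0, nlive = 0, nout = 0, i, j;
	unsigned last_type = 0, cur;
	unsigned long long last_addr = 0;

	/* one change point at each entry's start and one at its end */
	for (i = 0; i < NR; i++) {
		cps[n].addr = in[i].addr;              cps[n++].e = &in[i];
		cps[n].addr = in[i].addr + in[i].size; cps[n++].e = &in[i];
	}

	/* sort change points by address (input is tiny, bubble sort will do) */
	for (i = 0; i < n; i++)
		for (j = i + 1; j < n; j++)
			if (cps[j].addr < cps[i].addr) {
				struct cp t = cps[i]; cps[i] = cps[j]; cps[j] = t;
			}

	/* sweep: track open entries, emit a range whenever the max type changes */
	for (i = 0; i < n; i++) {
		if (cps[i].addr == cps[i].e->addr) {
			live[nlive++] = cps[i].e;		/* entry opens here */
		} else {
			for (j = 0; j < nlive; j++)		/* entry closes here */
				if (live[j] == cps[i].e)
					live[j] = live[--nlive];
		}

		cur = 0;
		for (j = 0; j < nlive; j++)
			if (live[j]->type > cur)
				cur = live[j]->type;		/* larger type wins */

		if (cur != last_type) {
			if (last_type != 0)
				out[nout++].size = cps[i].addr - last_addr;
			if (cur != 0) {
				out[nout].addr = cps[i].addr;
				out[nout].type = cur;
				last_addr = cps[i].addr;
			}
			last_type = cur;
		}
	}

	for (i = 0; i < nout; i++)
		printf("%#llx - %#llx type %u\n",
		       out[i].addr, out[i].addr + out[i].size, out[i].type);
	return 0;
}

Running it on a RAM entry overlapped by a reserved entry yields three output ranges, with the reserved (higher) type winning in the overlap, which is exactly the "larger value takes precedence" rule from the comment above.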
@@ -566,7 +627,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
566 * will have given us a memory map that we can use to properly 627 * will have given us a memory map that we can use to properly
567 * set up memory. If we aren't, we'll fake a memory map. 628 * set up memory. If we aren't, we'll fake a memory map.
568 */ 629 */
569static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) 630static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
570{ 631{
571 /* Only one memory region (or negative)? Ignore it */ 632 /* Only one memory region (or negative)? Ignore it */
572 if (nr_map < 2) 633 if (nr_map < 2)
@@ -583,18 +644,20 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
583 return -1; 644 return -1;
584 645
585 add_memory_region(start, size, type); 646 add_memory_region(start, size, type);
586 } while (biosmap++,--nr_map); 647 } while (biosmap++, --nr_map);
587 return 0; 648 return 0;
588} 649}
589 650
590void early_panic(char *msg) 651static void early_panic(char *msg)
591{ 652{
592 early_printk(msg); 653 early_printk(msg);
593 panic(msg); 654 panic(msg);
594} 655}
595 656
596void __init setup_memory_region(void) 657/* We're not void only for x86 32-bit compat */
658char * __init machine_specific_memory_setup(void)
597{ 659{
660 char *who = "BIOS-e820";
598 /* 661 /*
599 * Try to copy the BIOS-supplied E820-map. 662 * Try to copy the BIOS-supplied E820-map.
600 * 663 *
@@ -605,7 +668,10 @@ void __init setup_memory_region(void)
605 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) 668 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
606 early_panic("Cannot find a valid memory map"); 669 early_panic("Cannot find a valid memory map");
607 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 670 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
608 e820_print_map("BIOS-e820"); 671 e820_print_map(who);
672
673 /* In case someone cares... */
674 return who;
609} 675}
610 676
611static int __init parse_memopt(char *p) 677static int __init parse_memopt(char *p)
@@ -613,9 +679,9 @@ static int __init parse_memopt(char *p)
613 if (!p) 679 if (!p)
614 return -EINVAL; 680 return -EINVAL;
615 end_user_pfn = memparse(p, &p); 681 end_user_pfn = memparse(p, &p);
616 end_user_pfn >>= PAGE_SHIFT; 682 end_user_pfn >>= PAGE_SHIFT;
617 return 0; 683 return 0;
618} 684}
619early_param("mem", parse_memopt); 685early_param("mem", parse_memopt);
620 686
621static int userdef __initdata; 687static int userdef __initdata;
@@ -627,9 +693,9 @@ static int __init parse_memmap_opt(char *p)
627 693
628 if (!strcmp(p, "exactmap")) { 694 if (!strcmp(p, "exactmap")) {
629#ifdef CONFIG_CRASH_DUMP 695#ifdef CONFIG_CRASH_DUMP
630 /* If we are doing a crash dump, we 696 /*
631 * still need to know the real mem 697 * If we are doing a crash dump, we still need to know
632 * size before original memory map is 698 * the real mem size before original memory map is
633 * reset. 699 * reset.
634 */ 700 */
635 e820_register_active_regions(0, 0, -1UL); 701 e820_register_active_regions(0, 0, -1UL);
@@ -646,6 +712,8 @@ static int __init parse_memmap_opt(char *p)
646 mem_size = memparse(p, &p); 712 mem_size = memparse(p, &p);
647 if (p == oldp) 713 if (p == oldp)
648 return -EINVAL; 714 return -EINVAL;
715
716 userdef = 1;
649 if (*p == '@') { 717 if (*p == '@') {
650 start_at = memparse(p+1, &p); 718 start_at = memparse(p+1, &p);
651 add_memory_region(start_at, mem_size, E820_RAM); 719 add_memory_region(start_at, mem_size, E820_RAM);
@@ -665,11 +733,29 @@ early_param("memmap", parse_memmap_opt);
665void __init finish_e820_parsing(void) 733void __init finish_e820_parsing(void)
666{ 734{
667 if (userdef) { 735 if (userdef) {
736 char nr = e820.nr_map;
737
738 if (sanitize_e820_map(e820.map, &nr) < 0)
739 early_panic("Invalid user supplied memory map");
740 e820.nr_map = nr;
741
668 printk(KERN_INFO "user-defined physical RAM map:\n"); 742 printk(KERN_INFO "user-defined physical RAM map:\n");
669 e820_print_map("user"); 743 e820_print_map("user");
670 } 744 }
671} 745}
672 746
747void __init update_e820(void)
748{
749 u8 nr_map;
750
751 nr_map = e820.nr_map;
752 if (sanitize_e820_map(e820.map, &nr_map))
753 return;
754 e820.nr_map = nr_map;
755 printk(KERN_INFO "modified physical RAM map:\n");
756 e820_print_map("modified");
757}
758
673unsigned long pci_mem_start = 0xaeedbabe; 759unsigned long pci_mem_start = 0xaeedbabe;
674EXPORT_SYMBOL(pci_mem_start); 760EXPORT_SYMBOL(pci_mem_start);
675 761
@@ -713,8 +799,10 @@ __init void e820_setup_gap(void)
713 799
714 if (!found) { 800 if (!found) {
715 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; 801 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
716 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" 802 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
717 KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); 803 "address range\n"
804 KERN_ERR "PCI: Unassigned devices with 32bit resource "
805 "registers may break!\n");
718 } 806 }
719 807
720 /* 808 /*
@@ -727,8 +815,9 @@ __init void e820_setup_gap(void)
727 /* Fun with two's complement */ 815 /* Fun with two's complement */
728 pci_mem_start = (gapstart + round) & -round; 816 pci_mem_start = (gapstart + round) & -round;
729 817
730 printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", 818 printk(KERN_INFO
731 pci_mem_start, gapstart, gapsize); 819 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
820 pci_mem_start, gapstart, gapsize);
732} 821}
733 822
734int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) 823int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
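Two changes in this file lean on the same power-of-two arithmetic: find_e820_area() now rounds a candidate address up with (addr + align - 1) & ~(align - 1) instead of using PAGE_ALIGN(), and e820_setup_gap() keeps its (gapstart + round) & -round rounding. A tiny standalone check of both expressions follows (userspace C, made-up sample values).

#include <stdio.h>

/*
 * Both tricks assume "align" is a power of two: (align - 1) is then a mask
 * of the low bits, and -align == ~(align - 1) in two's complement.
 */
static unsigned long align_up(unsigned long addr, unsigned long align)
{
	return (addr + align - 1) & ~(align - 1);
}

int main(void)
{
	unsigned long round = 0x100000;		/* 1 MB, as in e820_setup_gap() */
	unsigned long gapstart = 0x3f789000;	/* illustrative value only */

	printf("%#lx\n", align_up(0x12345, 0x1000));	/* 0x13000 */
	printf("%#lx\n", align_up(0x13000, 0x1000));	/* 0x13000, already aligned */
	printf("%#lx\n", (gapstart + round) & -round);	/* 0x3f800000 */
	return 0;
}

Both identities hold only when the alignment is a power of two; for any other value the masks no longer isolate the low bits.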
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 88bb83ec895..9f51e1ea9e8 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -21,7 +21,33 @@
21#include <asm/gart.h> 21#include <asm/gart.h>
22#endif 22#endif
23 23
24static void __init via_bugs(void) 24static void __init fix_hypertransport_config(int num, int slot, int func)
25{
26 u32 htcfg;
27 /*
28 * we found a hypertransport bus
29 * make sure that we are broadcasting
30 * interrupts to all cpus on the ht bus
31 * if we're using extended apic ids
32 */
33 htcfg = read_pci_config(num, slot, func, 0x68);
34 if (htcfg & (1 << 18)) {
35 printk(KERN_INFO "Detected use of extended apic ids "
36 "on hypertransport bus\n");
37 if ((htcfg & (1 << 17)) == 0) {
38 printk(KERN_INFO "Enabling hypertransport extended "
39 "apic interrupt broadcast\n");
40 printk(KERN_INFO "Note this is a bios bug, "
41 "please contact your hw vendor\n");
42 htcfg |= (1 << 17);
43 write_pci_config(num, slot, func, 0x68, htcfg);
44 }
45 }
46
47
48}
49
50static void __init via_bugs(int num, int slot, int func)
25{ 51{
26#ifdef CONFIG_GART_IOMMU 52#ifdef CONFIG_GART_IOMMU
27 if ((end_pfn > MAX_DMA32_PFN || force_iommu) && 53 if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
@@ -44,7 +70,7 @@ static int __init nvidia_hpet_check(struct acpi_table_header *header)
44#endif /* CONFIG_X86_IO_APIC */ 70#endif /* CONFIG_X86_IO_APIC */
45#endif /* CONFIG_ACPI */ 71#endif /* CONFIG_ACPI */
46 72
47static void __init nvidia_bugs(void) 73static void __init nvidia_bugs(int num, int slot, int func)
48{ 74{
49#ifdef CONFIG_ACPI 75#ifdef CONFIG_ACPI
50#ifdef CONFIG_X86_IO_APIC 76#ifdef CONFIG_X86_IO_APIC
@@ -72,7 +98,7 @@ static void __init nvidia_bugs(void)
72 98
73} 99}
74 100
75static void __init ati_bugs(void) 101static void __init ati_bugs(int num, int slot, int func)
76{ 102{
77#ifdef CONFIG_X86_IO_APIC 103#ifdef CONFIG_X86_IO_APIC
78 if (timer_over_8254 == 1) { 104 if (timer_over_8254 == 1) {
@@ -83,18 +109,67 @@ static void __init ati_bugs(void)
83#endif 109#endif
84} 110}
85 111
112#define QFLAG_APPLY_ONCE 0x1
113#define QFLAG_APPLIED 0x2
114#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
86struct chipset { 115struct chipset {
87 u16 vendor; 116 u32 vendor;
88 void (*f)(void); 117 u32 device;
118 u32 class;
119 u32 class_mask;
120 u32 flags;
121 void (*f)(int num, int slot, int func);
89}; 122};
90 123
91static struct chipset early_qrk[] __initdata = { 124static struct chipset early_qrk[] __initdata = {
92 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, 125 { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
93 { PCI_VENDOR_ID_VIA, via_bugs }, 126 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
94 { PCI_VENDOR_ID_ATI, ati_bugs }, 127 { PCI_VENDOR_ID_VIA, PCI_ANY_ID,
128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
129 { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
130 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs },
131 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
132 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
95 {} 133 {}
96}; 134};
97 135
136static void __init check_dev_quirk(int num, int slot, int func)
137{
138 u16 class;
139 u16 vendor;
140 u16 device;
141 u8 type;
142 int i;
143
144 class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
145
146 if (class == 0xffff)
147 return;
148
149 vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID);
150
151 device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
152
153 for (i = 0; early_qrk[i].f != NULL; i++) {
154 if (((early_qrk[i].vendor == PCI_ANY_ID) ||
155 (early_qrk[i].vendor == vendor)) &&
156 ((early_qrk[i].device == PCI_ANY_ID) ||
157 (early_qrk[i].device == device)) &&
158 (!((early_qrk[i].class ^ class) &
159 early_qrk[i].class_mask))) {
160 if ((early_qrk[i].flags &
161 QFLAG_DONE) != QFLAG_DONE)
162 early_qrk[i].f(num, slot, func);
163 early_qrk[i].flags |= QFLAG_APPLIED;
164 }
165 }
166
167 type = read_pci_config_byte(num, slot, func,
168 PCI_HEADER_TYPE);
169 if (!(type & 0x80))
170 return;
171}
172
98void __init early_quirks(void) 173void __init early_quirks(void)
99{ 174{
100 int num, slot, func; 175 int num, slot, func;
@@ -103,36 +178,8 @@ void __init early_quirks(void)
103 return; 178 return;
104 179
105 /* Poor man's PCI discovery */ 180 /* Poor man's PCI discovery */
106 for (num = 0; num < 32; num++) { 181 for (num = 0; num < 32; num++)
107 for (slot = 0; slot < 32; slot++) { 182 for (slot = 0; slot < 32; slot++)
108 for (func = 0; func < 8; func++) { 183 for (func = 0; func < 8; func++)
109 u32 class; 184 check_dev_quirk(num, slot, func);
110 u32 vendor;
111 u8 type;
112 int i;
113 class = read_pci_config(num,slot,func,
114 PCI_CLASS_REVISION);
115 if (class == 0xffffffff)
116 break;
117
118 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
119 continue;
120
121 vendor = read_pci_config(num, slot, func,
122 PCI_VENDOR_ID);
123 vendor &= 0xffff;
124
125 for (i = 0; early_qrk[i].f; i++)
126 if (early_qrk[i].vendor == vendor) {
127 early_qrk[i].f();
128 return;
129 }
130
131 type = read_pci_config_byte(num, slot, func,
132 PCI_HEADER_TYPE);
133 if (!(type & 0x80))
134 break;
135 }
136 }
137 }
138} 185}
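The early-quirks.c rework above swaps the vendor-only table for vendor/device/class matching with apply-once flags. Here is a small standalone model of check_dev_quirk()'s matching rule in userspace C; instead of reading PCI config space, the sample IDs are passed in directly, and ANY_ID, demo_fixup() and the ID values are made up for illustration rather than taken from the kernel.

#include <stdio.h>

#define ANY_ID			(~0u)
#define QFLAG_APPLY_ONCE	0x1
#define QFLAG_APPLIED		0x2
#define QFLAG_DONE		(QFLAG_APPLY_ONCE | QFLAG_APPLIED)

struct chipset {
	unsigned vendor, device, class, class_mask, flags;
	void (*f)(int num, int slot, int func);
};

static void demo_fixup(int num, int slot, int func)
{
	printf("fixup applied to %02x:%02x.%x\n", num, slot, func);
}

static struct chipset quirks[] = {
	/* match any device of vendor 0x10de whose class is a PCI bridge */
	{ 0x10de, ANY_ID, 0x0604, 0xffff, QFLAG_APPLY_ONCE, demo_fixup },
	{ 0 }
};

/* same test as check_dev_quirk(): vendor, device and masked class must match */
static void check_dev(int num, int slot, int func,
		      unsigned vendor, unsigned device, unsigned class)
{
	int i;

	for (i = 0; quirks[i].f != NULL; i++) {
		struct chipset *q = &quirks[i];

		if ((q->vendor == ANY_ID || q->vendor == vendor) &&
		    (q->device == ANY_ID || q->device == device) &&
		    !((q->class ^ class) & q->class_mask)) {
			if ((q->flags & QFLAG_DONE) != QFLAG_DONE)
				q->f(num, slot, func);
			q->flags |= QFLAG_APPLIED;
		}
	}
}

int main(void)
{
	/* two hypothetical bridges from the same vendor: only the first fires */
	check_dev(0, 1, 0, 0x10de, 0x005e, 0x0604);
	check_dev(0, 9, 0, 0x10de, 0x026f, 0x0604);
	return 0;
}

Note that QFLAG_APPLIED is set on every match, so a QFLAG_APPLY_ONCE quirk fires for the first matching device on the bus and is merely recorded as applied for the rest.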
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index b7d6c23f287..cff84cd9987 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -193,7 +193,7 @@ static struct console simnow_console = {
193}; 193};
194 194
195/* Direct interface for emergencies */ 195/* Direct interface for emergencies */
196struct console *early_console = &early_vga_console; 196static struct console *early_console = &early_vga_console;
197static int early_console_initialized = 0; 197static int early_console_initialized = 0;
198 198
199void early_printk(const char *fmt, ...) 199void early_printk(const char *fmt, ...)
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
new file mode 100644
index 00000000000..32dd62b36ff
--- /dev/null
+++ b/arch/x86/kernel/efi.c
@@ -0,0 +1,515 @@
1/*
2 * Common EFI (Extensible Firmware Interface) support functions
3 * Based on Extensible Firmware Interface Specification version 1.0
4 *
5 * Copyright (C) 1999 VA Linux Systems
6 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
7 * Copyright (C) 1999-2002 Hewlett-Packard Co.
8 * David Mosberger-Tang <davidm@hpl.hp.com>
9 * Stephane Eranian <eranian@hpl.hp.com>
10 * Copyright (C) 2005-2008 Intel Co.
11 * Fenghua Yu <fenghua.yu@intel.com>
12 * Bibo Mao <bibo.mao@intel.com>
13 * Chandramouli Narayanan <mouli@linux.intel.com>
14 * Huang Ying <ying.huang@intel.com>
15 *
16 * Copied from efi_32.c to eliminate the duplicated code between EFI
17 * 32/64 support code. --ying 2007-10-26
18 *
19 * All EFI Runtime Services are not implemented yet as EFI only
20 * supports physical mode addressing on SoftSDV. This is to be fixed
21 * in a future version. --drummond 1999-07-20
22 *
23 * Implemented EFI runtime services and virtual mode calls. --davidm
24 *
25 * Goutham Rao: <goutham.rao@intel.com>
26 * Skip non-WB memory and ignore empty memory ranges.
27 */
28
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/efi.h>
32#include <linux/bootmem.h>
33#include <linux/spinlock.h>
34#include <linux/uaccess.h>
35#include <linux/time.h>
36#include <linux/io.h>
37#include <linux/reboot.h>
38#include <linux/bcd.h>
39
40#include <asm/setup.h>
41#include <asm/efi.h>
42#include <asm/time.h>
43#include <asm/cacheflush.h>
44#include <asm/tlbflush.h>
45
46#define EFI_DEBUG 1
47#define PFX "EFI: "
48
49int efi_enabled;
50EXPORT_SYMBOL(efi_enabled);
51
52struct efi efi;
53EXPORT_SYMBOL(efi);
54
55struct efi_memory_map memmap;
56
57struct efi efi_phys __initdata;
58static efi_system_table_t efi_systab __initdata;
59
60static int __init setup_noefi(char *arg)
61{
62 efi_enabled = 0;
63 return 0;
64}
65early_param("noefi", setup_noefi);
66
67static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
68{
69 return efi_call_virt2(get_time, tm, tc);
70}
71
72static efi_status_t virt_efi_set_time(efi_time_t *tm)
73{
74 return efi_call_virt1(set_time, tm);
75}
76
77static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
78 efi_bool_t *pending,
79 efi_time_t *tm)
80{
81 return efi_call_virt3(get_wakeup_time,
82 enabled, pending, tm);
83}
84
85static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
86{
87 return efi_call_virt2(set_wakeup_time,
88 enabled, tm);
89}
90
91static efi_status_t virt_efi_get_variable(efi_char16_t *name,
92 efi_guid_t *vendor,
93 u32 *attr,
94 unsigned long *data_size,
95 void *data)
96{
97 return efi_call_virt5(get_variable,
98 name, vendor, attr,
99 data_size, data);
100}
101
102static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
103 efi_char16_t *name,
104 efi_guid_t *vendor)
105{
106 return efi_call_virt3(get_next_variable,
107 name_size, name, vendor);
108}
109
110static efi_status_t virt_efi_set_variable(efi_char16_t *name,
111 efi_guid_t *vendor,
112 unsigned long attr,
113 unsigned long data_size,
114 void *data)
115{
116 return efi_call_virt5(set_variable,
117 name, vendor, attr,
118 data_size, data);
119}
120
121static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
122{
123 return efi_call_virt1(get_next_high_mono_count, count);
124}
125
126static void virt_efi_reset_system(int reset_type,
127 efi_status_t status,
128 unsigned long data_size,
129 efi_char16_t *data)
130{
131 efi_call_virt4(reset_system, reset_type, status,
132 data_size, data);
133}
134
135static efi_status_t virt_efi_set_virtual_address_map(
136 unsigned long memory_map_size,
137 unsigned long descriptor_size,
138 u32 descriptor_version,
139 efi_memory_desc_t *virtual_map)
140{
141 return efi_call_virt4(set_virtual_address_map,
142 memory_map_size, descriptor_size,
143 descriptor_version, virtual_map);
144}
145
146static efi_status_t __init phys_efi_set_virtual_address_map(
147 unsigned long memory_map_size,
148 unsigned long descriptor_size,
149 u32 descriptor_version,
150 efi_memory_desc_t *virtual_map)
151{
152 efi_status_t status;
153
154 efi_call_phys_prelog();
155 status = efi_call_phys4(efi_phys.set_virtual_address_map,
156 memory_map_size, descriptor_size,
157 descriptor_version, virtual_map);
158 efi_call_phys_epilog();
159 return status;
160}
161
162static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
163 efi_time_cap_t *tc)
164{
165 efi_status_t status;
166
167 efi_call_phys_prelog();
168 status = efi_call_phys2(efi_phys.get_time, tm, tc);
169 efi_call_phys_epilog();
170 return status;
171}
172
173int efi_set_rtc_mmss(unsigned long nowtime)
174{
175 int real_seconds, real_minutes;
176 efi_status_t status;
177 efi_time_t eft;
178 efi_time_cap_t cap;
179
180 status = efi.get_time(&eft, &cap);
181 if (status != EFI_SUCCESS) {
182 printk(KERN_ERR "Oops: efitime: can't read time!\n");
183 return -1;
184 }
185
186 real_seconds = nowtime % 60;
187 real_minutes = nowtime / 60;
188 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
189 real_minutes += 30;
190 real_minutes %= 60;
191 eft.minute = real_minutes;
192 eft.second = real_seconds;
193
194 status = efi.set_time(&eft);
195 if (status != EFI_SUCCESS) {
196 printk(KERN_ERR "Oops: efitime: can't write time!\n");
197 return -1;
198 }
199 return 0;
200}
201
202unsigned long efi_get_time(void)
203{
204 efi_status_t status;
205 efi_time_t eft;
206 efi_time_cap_t cap;
207
208 status = efi.get_time(&eft, &cap);
209 if (status != EFI_SUCCESS)
210 printk(KERN_ERR "Oops: efitime: can't read time!\n");
211
212 return mktime(eft.year, eft.month, eft.day, eft.hour,
213 eft.minute, eft.second);
214}
215
216#if EFI_DEBUG
217static void __init print_efi_memmap(void)
218{
219 efi_memory_desc_t *md;
220 void *p;
221 int i;
222
223 for (p = memmap.map, i = 0;
224 p < memmap.map_end;
225 p += memmap.desc_size, i++) {
226 md = p;
227 printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, "
228 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
229 i, md->type, md->attribute, md->phys_addr,
230 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
231 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
232 }
233}
234#endif /* EFI_DEBUG */
235
236void __init efi_init(void)
237{
238 efi_config_table_t *config_tables;
239 efi_runtime_services_t *runtime;
240 efi_char16_t *c16;
241 char vendor[100] = "unknown";
242 int i = 0;
243 void *tmp;
244
245#ifdef CONFIG_X86_32
246 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
247 memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
248#else
249 efi_phys.systab = (efi_system_table_t *)
250 (boot_params.efi_info.efi_systab |
251 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
252 memmap.phys_map = (void *)
253 (boot_params.efi_info.efi_memmap |
254 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
255#endif
256 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
257 boot_params.efi_info.efi_memdesc_size;
258 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
259 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
260
261 efi.systab = early_ioremap((unsigned long)efi_phys.systab,
262 sizeof(efi_system_table_t));
263 if (efi.systab == NULL)
264 printk(KERN_ERR "Couldn't map the EFI system table!\n");
265 memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
266 early_iounmap(efi.systab, sizeof(efi_system_table_t));
267 efi.systab = &efi_systab;
268
269 /*
270 * Verify the EFI Table
271 */
272 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
273 printk(KERN_ERR "EFI system table signature incorrect!\n");
274 if ((efi.systab->hdr.revision >> 16) == 0)
275 printk(KERN_ERR "Warning: EFI system table version "
276 "%d.%02d, expected 1.00 or greater!\n",
277 efi.systab->hdr.revision >> 16,
278 efi.systab->hdr.revision & 0xffff);
279
280 /*
281 * Show what we know for posterity
282 */
283 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
284 if (c16) {
285 for (i = 0; i < sizeof(vendor) && *c16; ++i)
286 vendor[i] = *c16++;
287 vendor[i] = '\0';
288 } else
289 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
290 early_iounmap(tmp, 2);
291
292 printk(KERN_INFO "EFI v%u.%.02u by %s \n",
293 efi.systab->hdr.revision >> 16,
294 efi.systab->hdr.revision & 0xffff, vendor);
295
296 /*
297 * Let's see what config tables the firmware passed to us.
298 */
299 config_tables = early_ioremap(
300 efi.systab->tables,
301 efi.systab->nr_tables * sizeof(efi_config_table_t));
302 if (config_tables == NULL)
303 printk(KERN_ERR "Could not map EFI Configuration Table!\n");
304
305 printk(KERN_INFO);
306 for (i = 0; i < efi.systab->nr_tables; i++) {
307 if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) {
308 efi.mps = config_tables[i].table;
309 printk(" MPS=0x%lx ", config_tables[i].table);
310 } else if (!efi_guidcmp(config_tables[i].guid,
311 ACPI_20_TABLE_GUID)) {
312 efi.acpi20 = config_tables[i].table;
313 printk(" ACPI 2.0=0x%lx ", config_tables[i].table);
314 } else if (!efi_guidcmp(config_tables[i].guid,
315 ACPI_TABLE_GUID)) {
316 efi.acpi = config_tables[i].table;
317 printk(" ACPI=0x%lx ", config_tables[i].table);
318 } else if (!efi_guidcmp(config_tables[i].guid,
319 SMBIOS_TABLE_GUID)) {
320 efi.smbios = config_tables[i].table;
321 printk(" SMBIOS=0x%lx ", config_tables[i].table);
322 } else if (!efi_guidcmp(config_tables[i].guid,
323 HCDP_TABLE_GUID)) {
324 efi.hcdp = config_tables[i].table;
325 printk(" HCDP=0x%lx ", config_tables[i].table);
326 } else if (!efi_guidcmp(config_tables[i].guid,
327 UGA_IO_PROTOCOL_GUID)) {
328 efi.uga = config_tables[i].table;
329 printk(" UGA=0x%lx ", config_tables[i].table);
330 }
331 }
332 printk("\n");
333 early_iounmap(config_tables,
334 efi.systab->nr_tables * sizeof(efi_config_table_t));
335
336 /*
337 * Check out the runtime services table. We need to map
338 * the runtime services table so that we can grab the physical
339 * address of several of the EFI runtime functions, needed to
340 * set the firmware into virtual mode.
341 */
342 runtime = early_ioremap((unsigned long)efi.systab->runtime,
343 sizeof(efi_runtime_services_t));
344 if (runtime != NULL) {
345 /*
346 * We will only need *early* access to the following
347 * two EFI runtime services before set_virtual_address_map
348 * is invoked.
349 */
350 efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
351 efi_phys.set_virtual_address_map =
352 (efi_set_virtual_address_map_t *)
353 runtime->set_virtual_address_map;
354 /*
355 * Make efi_get_time callable before entering
356 * virtual mode.
357 */
358 efi.get_time = phys_efi_get_time;
359 } else
360 printk(KERN_ERR "Could not map the EFI runtime service "
361 "table!\n");
362 early_iounmap(runtime, sizeof(efi_runtime_services_t));
363
364 /* Map the EFI memory map */
365 memmap.map = early_ioremap((unsigned long)memmap.phys_map,
366 memmap.nr_map * memmap.desc_size);
367 if (memmap.map == NULL)
368 printk(KERN_ERR "Could not map the EFI memory map!\n");
369 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
370 if (memmap.desc_size != sizeof(efi_memory_desc_t))
371 printk(KERN_WARNING "Kernel-defined memdesc"
372 "doesn't match the one from EFI!\n");
373
374 /* Setup for EFI runtime service */
375 reboot_type = BOOT_EFI;
376
377#if EFI_DEBUG
378 print_efi_memmap();
379#endif
380}
381
382static void __init runtime_code_page_mkexec(void)
383{
384 efi_memory_desc_t *md;
385 void *p;
386
387 if (!(__supported_pte_mask & _PAGE_NX))
388 return;
389
390 /* Make EFI runtime service code area executable */
391 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
392 md = p;
393
394 if (md->type != EFI_RUNTIME_SERVICES_CODE)
395 continue;
396
397 set_memory_x(md->virt_addr, md->num_pages << EFI_PAGE_SHIFT);
398 }
399}
400
401/*
402 * This function will switch the EFI runtime services to virtual mode.
403 * Essentially, look through the EFI memmap and map every region that
404 * has the runtime attribute bit set in its memory descriptor and update
405 * that memory descriptor with the virtual address obtained from ioremap().
406 * This enables the runtime services to be called without having to
407 * thunk back into physical mode for every invocation.
408 */
409void __init efi_enter_virtual_mode(void)
410{
411 efi_memory_desc_t *md;
412 efi_status_t status;
413 unsigned long size;
414 u64 end, systab;
415 void *p, *va;
416
417 efi.systab = NULL;
418 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
419 md = p;
420 if (!(md->attribute & EFI_MEMORY_RUNTIME))
421 continue;
422
423 size = md->num_pages << EFI_PAGE_SHIFT;
424 end = md->phys_addr + size;
425
426 if ((end >> PAGE_SHIFT) <= max_pfn_mapped)
427 va = __va(md->phys_addr);
428 else
429 va = efi_ioremap(md->phys_addr, size);
430
431 if (md->attribute & EFI_MEMORY_WB)
432 set_memory_uc(md->virt_addr, size);
433
434 md->virt_addr = (u64) (unsigned long) va;
435
436 if (!va) {
437 printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n",
438 (unsigned long long)md->phys_addr);
439 continue;
440 }
441
442 systab = (u64) (unsigned long) efi_phys.systab;
443 if (md->phys_addr <= systab && systab < end) {
444 systab += md->virt_addr - md->phys_addr;
445 efi.systab = (efi_system_table_t *) (unsigned long) systab;
446 }
447 }
448
449 BUG_ON(!efi.systab);
450
451 status = phys_efi_set_virtual_address_map(
452 memmap.desc_size * memmap.nr_map,
453 memmap.desc_size,
454 memmap.desc_version,
455 memmap.phys_map);
456
457 if (status != EFI_SUCCESS) {
458 printk(KERN_ALERT "Unable to switch EFI into virtual mode "
459 "(status=%lx)!\n", status);
460 panic("EFI call to SetVirtualAddressMap() failed!");
461 }
462
463 /*
464 * Now that EFI is in virtual mode, update the function
465 * pointers in the runtime service table to the new virtual addresses.
466 *
467 * Call EFI services through wrapper functions.
468 */
469 efi.get_time = virt_efi_get_time;
470 efi.set_time = virt_efi_set_time;
471 efi.get_wakeup_time = virt_efi_get_wakeup_time;
472 efi.set_wakeup_time = virt_efi_set_wakeup_time;
473 efi.get_variable = virt_efi_get_variable;
474 efi.get_next_variable = virt_efi_get_next_variable;
475 efi.set_variable = virt_efi_set_variable;
476 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
477 efi.reset_system = virt_efi_reset_system;
478 efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
479 runtime_code_page_mkexec();
480 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
481 memmap.map = NULL;
482}
483
484/*
485 * Convenience functions to obtain memory types and attributes
486 */
487u32 efi_mem_type(unsigned long phys_addr)
488{
489 efi_memory_desc_t *md;
490 void *p;
491
492 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
493 md = p;
494 if ((md->phys_addr <= phys_addr) &&
495 (phys_addr < (md->phys_addr +
496 (md->num_pages << EFI_PAGE_SHIFT))))
497 return md->type;
498 }
499 return 0;
500}
501
502u64 efi_mem_attributes(unsigned long phys_addr)
503{
504 efi_memory_desc_t *md;
505 void *p;
506
507 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
508 md = p;
509 if ((md->phys_addr <= phys_addr) &&
510 (phys_addr < (md->phys_addr +
511 (md->num_pages << EFI_PAGE_SHIFT))))
512 return md->attribute;
513 }
514 return 0;
515}
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index e2be78f4939..cb91f985b4a 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -20,40 +20,15 @@
20 */ 20 */
21 21
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/init.h>
24#include <linux/mm.h>
25#include <linux/types.h> 23#include <linux/types.h>
26#include <linux/time.h>
27#include <linux/spinlock.h>
28#include <linux/bootmem.h>
29#include <linux/ioport.h> 24#include <linux/ioport.h>
30#include <linux/module.h>
31#include <linux/efi.h> 25#include <linux/efi.h>
32#include <linux/kexec.h>
33 26
34#include <asm/setup.h>
35#include <asm/io.h> 27#include <asm/io.h>
36#include <asm/page.h> 28#include <asm/page.h>
37#include <asm/pgtable.h> 29#include <asm/pgtable.h>
38#include <asm/processor.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
41 31
42#define EFI_DEBUG 0
43#define PFX "EFI: "
44
45extern efi_status_t asmlinkage efi_call_phys(void *, ...);
46
47struct efi efi;
48EXPORT_SYMBOL(efi);
49static struct efi efi_phys;
50struct efi_memory_map memmap;
51
52/*
53 * We require an early boot_ioremap mapping mechanism initially
54 */
55extern void * boot_ioremap(unsigned long, unsigned long);
56
57/* 32/*
58 * To make EFI call EFI runtime service in physical addressing mode we need 33 * To make EFI call EFI runtime service in physical addressing mode we need
59 * prelog/epilog before/after the invocation to disable interrupt, to 34 * prelog/epilog before/after the invocation to disable interrupt, to
@@ -62,16 +37,14 @@ extern void * boot_ioremap(unsigned long, unsigned long);
62 */ 37 */
63 38
64static unsigned long efi_rt_eflags; 39static unsigned long efi_rt_eflags;
65static DEFINE_SPINLOCK(efi_rt_lock);
66static pgd_t efi_bak_pg_dir_pointer[2]; 40static pgd_t efi_bak_pg_dir_pointer[2];
67 41
68static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) 42void efi_call_phys_prelog(void)
69{ 43{
70 unsigned long cr4; 44 unsigned long cr4;
71 unsigned long temp; 45 unsigned long temp;
72 struct Xgt_desc_struct gdt_descr; 46 struct desc_ptr gdt_descr;
73 47
74 spin_lock(&efi_rt_lock);
75 local_irq_save(efi_rt_eflags); 48 local_irq_save(efi_rt_eflags);
76 49
77 /* 50 /*
@@ -101,17 +74,17 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
101 /* 74 /*
102 * After the lock is released, the original page table is restored. 75 * After the lock is released, the original page table is restored.
103 */ 76 */
104 local_flush_tlb(); 77 __flush_tlb_all();
105 78
106 gdt_descr.address = __pa(get_cpu_gdt_table(0)); 79 gdt_descr.address = __pa(get_cpu_gdt_table(0));
107 gdt_descr.size = GDT_SIZE - 1; 80 gdt_descr.size = GDT_SIZE - 1;
108 load_gdt(&gdt_descr); 81 load_gdt(&gdt_descr);
109} 82}
110 83
111static void efi_call_phys_epilog(void) __releases(efi_rt_lock) 84void efi_call_phys_epilog(void)
112{ 85{
113 unsigned long cr4; 86 unsigned long cr4;
114 struct Xgt_desc_struct gdt_descr; 87 struct desc_ptr gdt_descr;
115 88
116 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); 89 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
117 gdt_descr.size = GDT_SIZE - 1; 90 gdt_descr.size = GDT_SIZE - 1;
@@ -132,586 +105,7 @@ static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
132 /* 105 /*
133 * After the lock is released, the original page table is restored. 106 * After the lock is released, the original page table is restored.
134 */ 107 */
135 local_flush_tlb(); 108 __flush_tlb_all();
136 109
137 local_irq_restore(efi_rt_eflags); 110 local_irq_restore(efi_rt_eflags);
138 spin_unlock(&efi_rt_lock);
139}
140
141static efi_status_t
142phys_efi_set_virtual_address_map(unsigned long memory_map_size,
143 unsigned long descriptor_size,
144 u32 descriptor_version,
145 efi_memory_desc_t *virtual_map)
146{
147 efi_status_t status;
148
149 efi_call_phys_prelog();
150 status = efi_call_phys(efi_phys.set_virtual_address_map,
151 memory_map_size, descriptor_size,
152 descriptor_version, virtual_map);
153 efi_call_phys_epilog();
154 return status;
155}
156
157static efi_status_t
158phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
159{
160 efi_status_t status;
161
162 efi_call_phys_prelog();
163 status = efi_call_phys(efi_phys.get_time, tm, tc);
164 efi_call_phys_epilog();
165 return status;
166}
167
168inline int efi_set_rtc_mmss(unsigned long nowtime)
169{
170 int real_seconds, real_minutes;
171 efi_status_t status;
172 efi_time_t eft;
173 efi_time_cap_t cap;
174
175 spin_lock(&efi_rt_lock);
176 status = efi.get_time(&eft, &cap);
177 spin_unlock(&efi_rt_lock);
178 if (status != EFI_SUCCESS)
179 panic("Ooops, efitime: can't read time!\n");
180 real_seconds = nowtime % 60;
181 real_minutes = nowtime / 60;
182
183 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
184 real_minutes += 30;
185 real_minutes %= 60;
186
187 eft.minute = real_minutes;
188 eft.second = real_seconds;
189
190 if (status != EFI_SUCCESS) {
191 printk("Ooops: efitime: can't read time!\n");
192 return -1;
193 }
194 return 0;
195}
196/*
197 * This is used during kernel init before runtime
198 * services have been remapped and also during suspend, therefore,
199 * we'll need to call both in physical and virtual modes.
200 */
201inline unsigned long efi_get_time(void)
202{
203 efi_status_t status;
204 efi_time_t eft;
205 efi_time_cap_t cap;
206
207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
215 if (status != EFI_SUCCESS)
216 printk("Oops: efitime: can't read time status: 0x%lx\n",status);
217
218 return mktime(eft.year, eft.month, eft.day, eft.hour,
219 eft.minute, eft.second);
220}
221
222int is_available_memory(efi_memory_desc_t * md)
223{
224 if (!(md->attribute & EFI_MEMORY_WB))
225 return 0;
226
227 switch (md->type) {
228 case EFI_LOADER_CODE:
229 case EFI_LOADER_DATA:
230 case EFI_BOOT_SERVICES_CODE:
231 case EFI_BOOT_SERVICES_DATA:
232 case EFI_CONVENTIONAL_MEMORY:
233 return 1;
234 }
235 return 0;
236}
237
238/*
239 * We need to map the EFI memory map again after paging_init().
240 */
241void __init efi_map_memmap(void)
242{
243 memmap.map = NULL;
244
245 memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
246 (memmap.nr_map * memmap.desc_size));
247 if (memmap.map == NULL)
248 printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
249
250 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
251}
252
253#if EFI_DEBUG
254static void __init print_efi_memmap(void)
255{
256 efi_memory_desc_t *md;
257 void *p;
258 int i;
259
260 for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
261 md = p;
262 printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
263 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
264 i, md->type, md->attribute, md->phys_addr,
265 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
266 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
267 }
268}
269#endif /* EFI_DEBUG */
270
271/*
272 * Walks the EFI memory map and calls CALLBACK once for each EFI
273 * memory descriptor that has memory that is available for kernel use.
274 */
275void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
276{
277 int prev_valid = 0;
278 struct range {
279 unsigned long start;
280 unsigned long end;
281 } uninitialized_var(prev), curr;
282 efi_memory_desc_t *md;
283 unsigned long start, end;
284 void *p;
285
286 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
287 md = p;
288
289 if ((md->num_pages == 0) || (!is_available_memory(md)))
290 continue;
291
292 curr.start = md->phys_addr;
293 curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
294
295 if (!prev_valid) {
296 prev = curr;
297 prev_valid = 1;
298 } else {
299 if (curr.start < prev.start)
300 printk(KERN_INFO PFX "Unordered memory map\n");
301 if (prev.end == curr.start)
302 prev.end = curr.end;
303 else {
304 start =
305 (unsigned long) (PAGE_ALIGN(prev.start));
306 end = (unsigned long) (prev.end & PAGE_MASK);
307 if ((end > start)
308 && (*callback) (start, end, arg) < 0)
309 return;
310 prev = curr;
311 }
312 }
313 }
314 if (prev_valid) {
315 start = (unsigned long) PAGE_ALIGN(prev.start);
316 end = (unsigned long) (prev.end & PAGE_MASK);
317 if (end > start)
318 (*callback) (start, end, arg);
319 }
320}
321
322void __init efi_init(void)
323{
324 efi_config_table_t *config_tables;
325 efi_runtime_services_t *runtime;
326 efi_char16_t *c16;
327 char vendor[100] = "unknown";
328 unsigned long num_config_tables;
329 int i = 0;
330
331 memset(&efi, 0, sizeof(efi) );
332 memset(&efi_phys, 0, sizeof(efi_phys));
333
334 efi_phys.systab =
335 (efi_system_table_t *)boot_params.efi_info.efi_systab;
336 memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
337 memmap.nr_map = boot_params.efi_info.efi_memmap_size/
338 boot_params.efi_info.efi_memdesc_size;
339 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
340 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
341
342 efi.systab = (efi_system_table_t *)
343 boot_ioremap((unsigned long) efi_phys.systab,
344 sizeof(efi_system_table_t));
345 /*
346 * Verify the EFI Table
347 */
348 if (efi.systab == NULL)
349 printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
350 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
351 printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
352 if ((efi.systab->hdr.revision >> 16) == 0)
353 printk(KERN_ERR PFX "Warning: EFI system table version "
354 "%d.%02d, expected 1.00 or greater\n",
355 efi.systab->hdr.revision >> 16,
356 efi.systab->hdr.revision & 0xffff);
357
358 /*
359 * Grab some details from the system table
360 */
361 num_config_tables = efi.systab->nr_tables;
362 config_tables = (efi_config_table_t *)efi.systab->tables;
363 runtime = efi.systab->runtime;
364
365 /*
366 * Show what we know for posterity
367 */
368 c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
369 if (c16) {
370 for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
371 vendor[i] = *c16++;
372 vendor[i] = '\0';
373 } else
374 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
375
376 printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n",
377 efi.systab->hdr.revision >> 16,
378 efi.systab->hdr.revision & 0xffff, vendor);
379
380 /*
381 * Let's see what config tables the firmware passed to us.
382 */
383 config_tables = (efi_config_table_t *)
384 boot_ioremap((unsigned long) config_tables,
385 num_config_tables * sizeof(efi_config_table_t));
386
387 if (config_tables == NULL)
388 printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
389
390 efi.mps = EFI_INVALID_TABLE_ADDR;
391 efi.acpi = EFI_INVALID_TABLE_ADDR;
392 efi.acpi20 = EFI_INVALID_TABLE_ADDR;
393 efi.smbios = EFI_INVALID_TABLE_ADDR;
394 efi.sal_systab = EFI_INVALID_TABLE_ADDR;
395 efi.boot_info = EFI_INVALID_TABLE_ADDR;
396 efi.hcdp = EFI_INVALID_TABLE_ADDR;
397 efi.uga = EFI_INVALID_TABLE_ADDR;
398
399 for (i = 0; i < num_config_tables; i++) {
400 if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
401 efi.mps = config_tables[i].table;
402 printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
403 } else
404 if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
405 efi.acpi20 = config_tables[i].table;
406 printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
407 } else
408 if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
409 efi.acpi = config_tables[i].table;
410 printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
411 } else
412 if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
413 efi.smbios = config_tables[i].table;
414 printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
415 } else
416 if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
417 efi.hcdp = config_tables[i].table;
418 printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
419 } else
420 if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
421 efi.uga = config_tables[i].table;
422 printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
423 }
424 }
425 printk("\n");
426
427 /*
428 * Check out the runtime services table. We need to map
429 * the runtime services table so that we can grab the physical
430 * address of several of the EFI runtime functions, needed to
431 * set the firmware into virtual mode.
432 */
433
434 runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
435 runtime,
436 sizeof(efi_runtime_services_t));
437 if (runtime != NULL) {
438 /*
439 * We will only need *early* access to the following
440 * two EFI runtime services before set_virtual_address_map
441 * is invoked.
442 */
443 efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
444 efi_phys.set_virtual_address_map =
445 (efi_set_virtual_address_map_t *)
446 runtime->set_virtual_address_map;
447 } else
448 printk(KERN_ERR PFX "Could not map the runtime service table!\n");
449
450 /* Map the EFI memory map for use until paging_init() */
451 memmap.map = boot_ioremap(boot_params.efi_info.efi_memmap,
452 boot_params.efi_info.efi_memmap_size);
453 if (memmap.map == NULL)
454 printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
455
456 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
457
458#if EFI_DEBUG
459 print_efi_memmap();
460#endif
461}
462
463static inline void __init check_range_for_systab(efi_memory_desc_t *md)
464{
465 if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
466 ((unsigned long)efi_phys.systab < md->phys_addr +
467 ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
468 unsigned long addr;
469
470 addr = md->virt_addr - md->phys_addr +
471 (unsigned long)efi_phys.systab;
472 efi.systab = (efi_system_table_t *)addr;
473 }
474}
475
476/*
477 * Wrap all the virtual calls in a way that forces the parameters on the stack.
478 */
479
480#define efi_call_virt(f, args...) \
481 ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
482
483static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
484{
485 return efi_call_virt(get_time, tm, tc);
486}
487
488static efi_status_t virt_efi_set_time (efi_time_t *tm)
489{
490 return efi_call_virt(set_time, tm);
491}
492
493static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled,
494 efi_bool_t *pending,
495 efi_time_t *tm)
496{
497 return efi_call_virt(get_wakeup_time, enabled, pending, tm);
498}
499
500static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled,
501 efi_time_t *tm)
502{
503 return efi_call_virt(set_wakeup_time, enabled, tm);
504}
505
506static efi_status_t virt_efi_get_variable (efi_char16_t *name,
507 efi_guid_t *vendor, u32 *attr,
508 unsigned long *data_size, void *data)
509{
510 return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
511}
512
513static efi_status_t virt_efi_get_next_variable (unsigned long *name_size,
514 efi_char16_t *name,
515 efi_guid_t *vendor)
516{
517 return efi_call_virt(get_next_variable, name_size, name, vendor);
518}
519
520static efi_status_t virt_efi_set_variable (efi_char16_t *name,
521 efi_guid_t *vendor,
522 unsigned long attr,
523 unsigned long data_size, void *data)
524{
525 return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
526}
527
528static efi_status_t virt_efi_get_next_high_mono_count (u32 *count)
529{
530 return efi_call_virt(get_next_high_mono_count, count);
531}
532
533static void virt_efi_reset_system (int reset_type, efi_status_t status,
534 unsigned long data_size,
535 efi_char16_t *data)
536{
537 efi_call_virt(reset_system, reset_type, status, data_size, data);
538}
539
540/*
541 * This function will switch the EFI runtime services to virtual mode.
542 * Essentially, look through the EFI memmap and map every region that
543 * has the runtime attribute bit set in its memory descriptor and update
544 * that memory descriptor with the virtual address obtained from ioremap().
545 * This enables the runtime services to be called without having to
546 * thunk back into physical mode for every invocation.
547 */
548
549void __init efi_enter_virtual_mode(void)
550{
551 efi_memory_desc_t *md;
552 efi_status_t status;
553 void *p;
554
555 efi.systab = NULL;
556
557 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
558 md = p;
559
560 if (!(md->attribute & EFI_MEMORY_RUNTIME))
561 continue;
562
563 md->virt_addr = (unsigned long)ioremap(md->phys_addr,
564 md->num_pages << EFI_PAGE_SHIFT);
565 if (!(unsigned long)md->virt_addr) {
566 printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
567 (unsigned long)md->phys_addr);
568 }
569 /* update the virtual address of the EFI system table */
570 check_range_for_systab(md);
571 }
572
573 BUG_ON(!efi.systab);
574
575 status = phys_efi_set_virtual_address_map(
576 memmap.desc_size * memmap.nr_map,
577 memmap.desc_size,
578 memmap.desc_version,
579 memmap.phys_map);
580
581 if (status != EFI_SUCCESS) {
582 printk (KERN_ALERT "You are screwed! "
583 "Unable to switch EFI into virtual mode "
584 "(status=%lx)\n", status);
585 panic("EFI call to SetVirtualAddressMap() failed!");
586 }
587
588 /*
589 * Now that EFI is in virtual mode, update the function
590 * pointers in the runtime service table to the new virtual addresses.
591 */
592
593 efi.get_time = virt_efi_get_time;
594 efi.set_time = virt_efi_set_time;
595 efi.get_wakeup_time = virt_efi_get_wakeup_time;
596 efi.set_wakeup_time = virt_efi_set_wakeup_time;
597 efi.get_variable = virt_efi_get_variable;
598 efi.get_next_variable = virt_efi_get_next_variable;
599 efi.set_variable = virt_efi_set_variable;
600 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
601 efi.reset_system = virt_efi_reset_system;
602}
603
604void __init
605efi_initialize_iomem_resources(struct resource *code_resource,
606 struct resource *data_resource,
607 struct resource *bss_resource)
608{
609 struct resource *res;
610 efi_memory_desc_t *md;
611 void *p;
612
613 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
614 md = p;
615
616 if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
617 0x100000000ULL)
618 continue;
619 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
620 switch (md->type) {
621 case EFI_RESERVED_TYPE:
622 res->name = "Reserved Memory";
623 break;
624 case EFI_LOADER_CODE:
625 res->name = "Loader Code";
626 break;
627 case EFI_LOADER_DATA:
628 res->name = "Loader Data";
629 break;
630 case EFI_BOOT_SERVICES_DATA:
631 res->name = "BootServices Data";
632 break;
633 case EFI_BOOT_SERVICES_CODE:
634 res->name = "BootServices Code";
635 break;
636 case EFI_RUNTIME_SERVICES_CODE:
637 res->name = "Runtime Service Code";
638 break;
639 case EFI_RUNTIME_SERVICES_DATA:
640 res->name = "Runtime Service Data";
641 break;
642 case EFI_CONVENTIONAL_MEMORY:
643 res->name = "Conventional Memory";
644 break;
645 case EFI_UNUSABLE_MEMORY:
646 res->name = "Unusable Memory";
647 break;
648 case EFI_ACPI_RECLAIM_MEMORY:
649 res->name = "ACPI Reclaim";
650 break;
651 case EFI_ACPI_MEMORY_NVS:
652 res->name = "ACPI NVS";
653 break;
654 case EFI_MEMORY_MAPPED_IO:
655 res->name = "Memory Mapped IO";
656 break;
657 case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
658 res->name = "Memory Mapped IO Port Space";
659 break;
660 default:
661 res->name = "Reserved";
662 break;
663 }
664 res->start = md->phys_addr;
665 res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
666 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
667 if (request_resource(&iomem_resource, res) < 0)
668 printk(KERN_ERR PFX "Failed to allocate res %s : "
669 "0x%llx-0x%llx\n", res->name,
670 (unsigned long long)res->start,
671 (unsigned long long)res->end);
672 /*
673 * We don't know which region contains kernel data so we try
674 * it repeatedly and let the resource manager test it.
675 */
676 if (md->type == EFI_CONVENTIONAL_MEMORY) {
677 request_resource(res, code_resource);
678 request_resource(res, data_resource);
679 request_resource(res, bss_resource);
680#ifdef CONFIG_KEXEC
681 request_resource(res, &crashk_res);
682#endif
683 }
684 }
685}
686
687/*
688 * Convenience functions to obtain memory types and attributes
689 */
690
691u32 efi_mem_type(unsigned long phys_addr)
692{
693 efi_memory_desc_t *md;
694 void *p;
695
696 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
697 md = p;
698 if ((md->phys_addr <= phys_addr) && (phys_addr <
699 (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
700 return md->type;
701 }
702 return 0;
703}
704
705u64 efi_mem_attributes(unsigned long phys_addr)
706{
707 efi_memory_desc_t *md;
708 void *p;
709
710 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
711 md = p;
712 if ((md->phys_addr <= phys_addr) && (phys_addr <
713 (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
714 return md->attribute;
715 }
716 return 0;
717}
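
The lookup loops above (and efi_enter_virtual_mode() itself) advance through the firmware memory map with p += memmap.desc_size rather than p += sizeof(efi_memory_desc_t), because the firmware may report descriptors that are larger than the structure the kernel was built against. A minimal user-space sketch of that walking pattern; the descriptor layout, the extra padding and the sample map are invented here and are not taken from the kernel sources:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fake_desc {                      /* stand-in for efi_memory_desc_t */
	uint32_t type;
	uint64_t phys_addr;
	uint64_t num_pages;
};

int main(void)
{
	/* Pretend the firmware's descriptor stride is 8 bytes larger than
	 * the structure we know about. */
	const size_t desc_size = sizeof(struct fake_desc) + 8;
	const unsigned nr_map = 3;
	unsigned char map[3 * (sizeof(struct fake_desc) + 8)];

	memset(map, 0, sizeof(map));
	for (unsigned i = 0; i < nr_map; i++) {
		struct fake_desc d = { i, i * 0x100000ULL, 16 };
		memcpy(map + i * desc_size, &d, sizeof(d));
	}

	/* Same walk as efi_mem_type()/efi_mem_attributes(): step by
	 * desc_size so any trailing bytes we do not understand are skipped. */
	unsigned char *end = map + nr_map * desc_size;
	for (unsigned char *p = map; p < end; p += desc_size) {
		struct fake_desc *md = (struct fake_desc *)p;
		printf("type=%u phys=%#llx pages=%llu\n", md->type,
		       (unsigned long long)md->phys_addr,
		       (unsigned long long)md->num_pages);
	}
	return 0;
}
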
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
new file mode 100644
index 00000000000..09d5c233093
--- /dev/null
+++ b/arch/x86/kernel/efi_64.c
@@ -0,0 +1,134 @@
1/*
2 * x86_64 specific EFI support functions
3 * Based on Extensible Firmware Interface Specification version 1.0
4 *
5 * Copyright (C) 2005-2008 Intel Co.
6 * Fenghua Yu <fenghua.yu@intel.com>
7 * Bibo Mao <bibo.mao@intel.com>
8 * Chandramouli Narayanan <mouli@linux.intel.com>
9 * Huang Ying <ying.huang@intel.com>
10 *
11 * Code to convert EFI to E820 map has been implemented in elilo bootloader
12 * based on an EFI patch by Edgar Hucek. Based on the E820 map, the page table
13 * is set up appropriately for EFI runtime code.
14 * - mouli 06/14/2007.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/mm.h>
21#include <linux/types.h>
22#include <linux/spinlock.h>
23#include <linux/bootmem.h>
24#include <linux/ioport.h>
25#include <linux/module.h>
26#include <linux/efi.h>
27#include <linux/uaccess.h>
28#include <linux/io.h>
29#include <linux/reboot.h>
30
31#include <asm/setup.h>
32#include <asm/page.h>
33#include <asm/e820.h>
34#include <asm/pgtable.h>
35#include <asm/tlbflush.h>
36#include <asm/proto.h>
37#include <asm/efi.h>
38
39static pgd_t save_pgd __initdata;
40static unsigned long efi_flags __initdata;
41
42static void __init early_mapping_set_exec(unsigned long start,
43 unsigned long end,
44 int executable)
45{
46 pte_t *kpte;
47 unsigned int level;
48
49 while (start < end) {
50 kpte = lookup_address((unsigned long)__va(start), &level);
51 BUG_ON(!kpte);
52 if (executable)
53 set_pte(kpte, pte_mkexec(*kpte));
54 else
55 set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \
56 __supported_pte_mask));
57 if (level == PG_LEVEL_4K)
58 start = (start + PAGE_SIZE) & PAGE_MASK;
59 else
60 start = (start + PMD_SIZE) & PMD_MASK;
61 }
62}
63
64static void __init early_runtime_code_mapping_set_exec(int executable)
65{
66 efi_memory_desc_t *md;
67 void *p;
68
69 if (!(__supported_pte_mask & _PAGE_NX))
70 return;
71
72 /* Make EFI runtime service code area executable */
73 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
74 md = p;
75 if (md->type == EFI_RUNTIME_SERVICES_CODE) {
76 unsigned long end;
77 end = md->phys_addr + (md->num_pages << PAGE_SHIFT);
78 early_mapping_set_exec(md->phys_addr, end, executable);
79 }
80 }
81}
82
83void __init efi_call_phys_prelog(void)
84{
85 unsigned long vaddress;
86
87 local_irq_save(efi_flags);
88 early_runtime_code_mapping_set_exec(1);
89 vaddress = (unsigned long)__va(0x0UL);
90 save_pgd = *pgd_offset_k(0x0UL);
91 set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
92 __flush_tlb_all();
93}
94
95void __init efi_call_phys_epilog(void)
96{
97 /*
98	 * Restore the original page table that was saved in efi_call_phys_prelog().
99 */
100 set_pgd(pgd_offset_k(0x0UL), save_pgd);
101 early_runtime_code_mapping_set_exec(0);
102 __flush_tlb_all();
103 local_irq_restore(efi_flags);
104}
105
106void __init efi_reserve_bootmem(void)
107{
108 reserve_bootmem_generic((unsigned long)memmap.phys_map,
109 memmap.nr_map * memmap.desc_size);
110}
111
112void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
113{
114 static unsigned pages_mapped;
115 unsigned i, pages;
116
117 /* phys_addr and size must be page aligned */
118 if ((phys_addr & ~PAGE_MASK) || (size & ~PAGE_MASK))
119 return NULL;
120
121 pages = size >> PAGE_SHIFT;
122 if (pages_mapped + pages > MAX_EFI_IO_PAGES)
123 return NULL;
124
125 for (i = 0; i < pages; i++) {
126 __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
127 phys_addr, PAGE_KERNEL);
128 phys_addr += PAGE_SIZE;
129 pages_mapped++;
130 }
131
132 return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
133 (pages_mapped - pages));
134}
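
efi_ioremap() above hands out mappings from a small, fixed pool of fixmap slots: it refuses unaligned requests, counts how many pages have been handed out so far across all calls, and returns the virtual address of the slot belonging to the first page of the new mapping. A rough user-space model of just that bookkeeping; the pool size and slot numbers are invented and stand in for MAX_EFI_IO_PAGES and FIX_EFI_IO_MAP_FIRST_PAGE:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define POOL_PAGES 16        /* stands in for MAX_EFI_IO_PAGES */
#define FIRST_SLOT 100       /* stands in for FIX_EFI_IO_MAP_FIRST_PAGE */

static unsigned pages_mapped;

/* Returns the fixmap slot of the first page of the mapping, or -1. */
static int fake_efi_ioremap(uint64_t phys_addr, uint64_t size)
{
	if ((phys_addr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -1;                      /* must be page aligned */

	unsigned pages = size >> PAGE_SHIFT;
	if (pages_mapped + pages > POOL_PAGES)
		return -1;                      /* pool exhausted */

	int first = FIRST_SLOT - pages_mapped;  /* slot for this mapping's first page */
	pages_mapped += pages;                  /* later calls get lower slot numbers */
	return first;
}

int main(void)
{
	printf("%d\n", fake_efi_ioremap(0x100000, 2 * PAGE_SIZE)); /* 100 */
	printf("%d\n", fake_efi_ioremap(0x200000, 1 * PAGE_SIZE)); /* 98  */
	printf("%d\n", fake_efi_ioremap(0x300001, PAGE_SIZE));     /* -1, unaligned */
	return 0;
}
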
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
new file mode 100644
index 00000000000..99b47d48c9f
--- /dev/null
+++ b/arch/x86/kernel/efi_stub_64.S
@@ -0,0 +1,109 @@
1/*
2 * Function calling ABI conversion from Linux to EFI for x86_64
3 *
4 * Copyright (C) 2007 Intel Corp
5 * Bibo Mao <bibo.mao@intel.com>
6 * Huang Ying <ying.huang@intel.com>
7 */
8
9#include <linux/linkage.h>
10
11#define SAVE_XMM \
12 mov %rsp, %rax; \
13 subq $0x70, %rsp; \
14 and $~0xf, %rsp; \
15 mov %rax, (%rsp); \
16 mov %cr0, %rax; \
17 clts; \
18 mov %rax, 0x8(%rsp); \
19 movaps %xmm0, 0x60(%rsp); \
20 movaps %xmm1, 0x50(%rsp); \
21 movaps %xmm2, 0x40(%rsp); \
22 movaps %xmm3, 0x30(%rsp); \
23 movaps %xmm4, 0x20(%rsp); \
24 movaps %xmm5, 0x10(%rsp)
25
26#define RESTORE_XMM \
27 movaps 0x60(%rsp), %xmm0; \
28 movaps 0x50(%rsp), %xmm1; \
29 movaps 0x40(%rsp), %xmm2; \
30 movaps 0x30(%rsp), %xmm3; \
31 movaps 0x20(%rsp), %xmm4; \
32 movaps 0x10(%rsp), %xmm5; \
33 mov 0x8(%rsp), %rsi; \
34 mov %rsi, %cr0; \
35 mov (%rsp), %rsp
36
37ENTRY(efi_call0)
38 SAVE_XMM
39 subq $32, %rsp
40 call *%rdi
41 addq $32, %rsp
42 RESTORE_XMM
43 ret
44
45ENTRY(efi_call1)
46 SAVE_XMM
47 subq $32, %rsp
48 mov %rsi, %rcx
49 call *%rdi
50 addq $32, %rsp
51 RESTORE_XMM
52 ret
53
54ENTRY(efi_call2)
55 SAVE_XMM
56 subq $32, %rsp
57 mov %rsi, %rcx
58 call *%rdi
59 addq $32, %rsp
60 RESTORE_XMM
61 ret
62
63ENTRY(efi_call3)
64 SAVE_XMM
65 subq $32, %rsp
66 mov %rcx, %r8
67 mov %rsi, %rcx
68 call *%rdi
69 addq $32, %rsp
70 RESTORE_XMM
71 ret
72
73ENTRY(efi_call4)
74 SAVE_XMM
75 subq $32, %rsp
76 mov %r8, %r9
77 mov %rcx, %r8
78 mov %rsi, %rcx
79 call *%rdi
80 addq $32, %rsp
81 RESTORE_XMM
82 ret
83
84ENTRY(efi_call5)
85 SAVE_XMM
86 subq $48, %rsp
87 mov %r9, 32(%rsp)
88 mov %r8, %r9
89 mov %rcx, %r8
90 mov %rsi, %rcx
91 call *%rdi
92 addq $48, %rsp
93 RESTORE_XMM
94 ret
95
96ENTRY(efi_call6)
97 SAVE_XMM
98 mov (%rsp), %rax
99 mov 8(%rax), %rax
100 subq $48, %rsp
101 mov %r9, 32(%rsp)
102 mov %rax, 40(%rsp)
103 mov %r8, %r9
104 mov %rcx, %r8
105 mov %rsi, %rcx
106 call *%rdi
107 addq $48, %rsp
108 RESTORE_XMM
109 ret
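
The stubs above bridge two calling conventions: the kernel calls efi_callN() with the target function pointer in %rdi and the EFI arguments in the System V registers (%rsi, %rdx, %rcx, %r8, %r9, then the stack), and each stub shuffles those into the Microsoft x64 convention the firmware expects (%rcx, %rdx, %r8, %r9, then 32(%rsp) and 40(%rsp) above a 32-byte shadow area), after parking %xmm0-%xmm5 in a 16-byte aligned scratch frame. A small stand-alone check of the SAVE_XMM frame arithmetic, nothing more:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

int main(void)
{
	/* Mirror SAVE_XMM: %rax = old rsp; rsp -= 0x70; rsp &= ~0xf;
	 * the old rsp is then stored at (%rsp) so RESTORE_XMM can undo it. */
	for (uint64_t sp = 0x7fffffffe000ULL; sp < 0x7fffffffe010ULL; sp++) {
		uint64_t scratch = (sp - 0x70) & ~0xfULL;
		assert((scratch & 0xf) == 0);   /* 16-byte aligned for movaps */
		assert(sp - scratch >= 0x70);   /* room for xmm0-5, cr0, old rsp */
		assert(sp - scratch < 0x80);
	}
	printf("SAVE_XMM frame is always 16-byte aligned with 0x70..0x7f bytes of room\n");
	return 0;
}
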
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index dc7f938e501..be5c31d0488 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -58,7 +58,7 @@
58 * for paravirtualization. The following will never clobber any registers: 58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret") 59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") 60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). 61 * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
62 * 62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must 63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). 64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -283,12 +283,12 @@ END(resume_kernel)
283 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 283 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
284 284
285 # sysenter call handler stub 285 # sysenter call handler stub
286ENTRY(sysenter_entry) 286ENTRY(ia32_sysenter_target)
287 CFI_STARTPROC simple 287 CFI_STARTPROC simple
288 CFI_SIGNAL_FRAME 288 CFI_SIGNAL_FRAME
289 CFI_DEF_CFA esp, 0 289 CFI_DEF_CFA esp, 0
290 CFI_REGISTER esp, ebp 290 CFI_REGISTER esp, ebp
291 movl TSS_sysenter_esp0(%esp),%esp 291 movl TSS_sysenter_sp0(%esp),%esp
292sysenter_past_esp: 292sysenter_past_esp:
293 /* 293 /*
294 * No need to follow this irqs on/off section: the syscall 294 * No need to follow this irqs on/off section: the syscall
@@ -351,7 +351,7 @@ sysenter_past_esp:
351 xorl %ebp,%ebp 351 xorl %ebp,%ebp
352 TRACE_IRQS_ON 352 TRACE_IRQS_ON
3531: mov PT_FS(%esp), %fs 3531: mov PT_FS(%esp), %fs
354 ENABLE_INTERRUPTS_SYSEXIT 354 ENABLE_INTERRUPTS_SYSCALL_RET
355 CFI_ENDPROC 355 CFI_ENDPROC
356.pushsection .fixup,"ax" 356.pushsection .fixup,"ax"
3572: movl $0,PT_FS(%esp) 3572: movl $0,PT_FS(%esp)
@@ -360,7 +360,7 @@ sysenter_past_esp:
360 .align 4 360 .align 4
361 .long 1b,2b 361 .long 1b,2b
362.popsection 362.popsection
363ENDPROC(sysenter_entry) 363ENDPROC(ia32_sysenter_target)
364 364
365 # system call handler stub 365 # system call handler stub
366ENTRY(system_call) 366ENTRY(system_call)
@@ -583,7 +583,7 @@ END(syscall_badsys)
583 * Build the entry stubs and pointer table with 583 * Build the entry stubs and pointer table with
584 * some assembler magic. 584 * some assembler magic.
585 */ 585 */
586.data 586.section .rodata,"a"
587ENTRY(interrupt) 587ENTRY(interrupt)
588.text 588.text
589 589
@@ -743,7 +743,7 @@ END(device_not_available)
743 * that sets up the real kernel stack. Check here, since we can't 743 * that sets up the real kernel stack. Check here, since we can't
744 * allow the wrong stack to be used. 744 * allow the wrong stack to be used.
745 * 745 *
746 * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have 746 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
747 * already pushed 3 words if it hits on the sysenter instruction: 747 * already pushed 3 words if it hits on the sysenter instruction:
748 * eflags, cs and eip. 748 * eflags, cs and eip.
749 * 749 *
@@ -755,7 +755,7 @@ END(device_not_available)
755 cmpw $__KERNEL_CS,4(%esp); \ 755 cmpw $__KERNEL_CS,4(%esp); \
756 jne ok; \ 756 jne ok; \
757label: \ 757label: \
758 movl TSS_sysenter_esp0+offset(%esp),%esp; \ 758 movl TSS_sysenter_sp0+offset(%esp),%esp; \
759 CFI_DEF_CFA esp, 0; \ 759 CFI_DEF_CFA esp, 0; \
760 CFI_UNDEFINED eip; \ 760 CFI_UNDEFINED eip; \
761 pushfl; \ 761 pushfl; \
@@ -768,7 +768,7 @@ label: \
768 768
769KPROBE_ENTRY(debug) 769KPROBE_ENTRY(debug)
770 RING0_INT_FRAME 770 RING0_INT_FRAME
771 cmpl $sysenter_entry,(%esp) 771 cmpl $ia32_sysenter_target,(%esp)
772 jne debug_stack_correct 772 jne debug_stack_correct
773 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) 773 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
774debug_stack_correct: 774debug_stack_correct:
@@ -799,7 +799,7 @@ KPROBE_ENTRY(nmi)
799 popl %eax 799 popl %eax
800 CFI_ADJUST_CFA_OFFSET -4 800 CFI_ADJUST_CFA_OFFSET -4
801 je nmi_espfix_stack 801 je nmi_espfix_stack
802 cmpl $sysenter_entry,(%esp) 802 cmpl $ia32_sysenter_target,(%esp)
803 je nmi_stack_fixup 803 je nmi_stack_fixup
804 pushl %eax 804 pushl %eax
805 CFI_ADJUST_CFA_OFFSET 4 805 CFI_ADJUST_CFA_OFFSET 4
@@ -812,7 +812,7 @@ KPROBE_ENTRY(nmi)
812 popl %eax 812 popl %eax
813 CFI_ADJUST_CFA_OFFSET -4 813 CFI_ADJUST_CFA_OFFSET -4
814 jae nmi_stack_correct 814 jae nmi_stack_correct
815 cmpl $sysenter_entry,12(%esp) 815 cmpl $ia32_sysenter_target,12(%esp)
816 je nmi_debug_stack_check 816 je nmi_debug_stack_check
817nmi_stack_correct: 817nmi_stack_correct:
818 /* We have a RING0_INT_FRAME here */ 818 /* We have a RING0_INT_FRAME here */
@@ -882,10 +882,10 @@ ENTRY(native_iret)
882.previous 882.previous
883END(native_iret) 883END(native_iret)
884 884
885ENTRY(native_irq_enable_sysexit) 885ENTRY(native_irq_enable_syscall_ret)
886 sti 886 sti
887 sysexit 887 sysexit
888END(native_irq_enable_sysexit) 888END(native_irq_enable_syscall_ret)
889#endif 889#endif
890 890
891KPROBE_ENTRY(int3) 891KPROBE_ENTRY(int3)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3a058bb1640..c7341e81941 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -50,6 +50,7 @@
50#include <asm/hw_irq.h> 50#include <asm/hw_irq.h>
51#include <asm/page.h> 51#include <asm/page.h>
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h>
53 54
54 .code64 55 .code64
55 56
@@ -57,6 +58,13 @@
57#define retint_kernel retint_restore_args 58#define retint_kernel retint_restore_args
58#endif 59#endif
59 60
61#ifdef CONFIG_PARAVIRT
62ENTRY(native_irq_enable_syscall_ret)
63 movq %gs:pda_oldrsp,%rsp
64 swapgs
65 sysretq
66#endif /* CONFIG_PARAVIRT */
67
60 68
61.macro TRACE_IRQS_IRETQ offset=ARGOFFSET 69.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
62#ifdef CONFIG_TRACE_IRQFLAGS 70#ifdef CONFIG_TRACE_IRQFLAGS
@@ -216,14 +224,21 @@ ENTRY(system_call)
216 CFI_DEF_CFA rsp,PDA_STACKOFFSET 224 CFI_DEF_CFA rsp,PDA_STACKOFFSET
217 CFI_REGISTER rip,rcx 225 CFI_REGISTER rip,rcx
218 /*CFI_REGISTER rflags,r11*/ 226 /*CFI_REGISTER rflags,r11*/
219 swapgs 227 SWAPGS_UNSAFE_STACK
228 /*
229 * A hypervisor implementation might want to use a label
230 * after the swapgs, so that it can do the swapgs
231 * for the guest and jump here on syscall.
232 */
233ENTRY(system_call_after_swapgs)
234
220 movq %rsp,%gs:pda_oldrsp 235 movq %rsp,%gs:pda_oldrsp
221 movq %gs:pda_kernelstack,%rsp 236 movq %gs:pda_kernelstack,%rsp
222 /* 237 /*
223 * No need to follow this irqs off/on section - it's straight 238 * No need to follow this irqs off/on section - it's straight
224 * and short: 239 * and short:
225 */ 240 */
226 sti 241 ENABLE_INTERRUPTS(CLBR_NONE)
227 SAVE_ARGS 8,1 242 SAVE_ARGS 8,1
228 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 243 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
229 movq %rcx,RIP-ARGOFFSET(%rsp) 244 movq %rcx,RIP-ARGOFFSET(%rsp)
@@ -246,7 +261,7 @@ ret_from_sys_call:
246sysret_check: 261sysret_check:
247 LOCKDEP_SYS_EXIT 262 LOCKDEP_SYS_EXIT
248 GET_THREAD_INFO(%rcx) 263 GET_THREAD_INFO(%rcx)
249 cli 264 DISABLE_INTERRUPTS(CLBR_NONE)
250 TRACE_IRQS_OFF 265 TRACE_IRQS_OFF
251 movl threadinfo_flags(%rcx),%edx 266 movl threadinfo_flags(%rcx),%edx
252 andl %edi,%edx 267 andl %edi,%edx
@@ -260,9 +275,7 @@ sysret_check:
260 CFI_REGISTER rip,rcx 275 CFI_REGISTER rip,rcx
261 RESTORE_ARGS 0,-ARG_SKIP,1 276 RESTORE_ARGS 0,-ARG_SKIP,1
262 /*CFI_REGISTER rflags,r11*/ 277 /*CFI_REGISTER rflags,r11*/
263 movq %gs:pda_oldrsp,%rsp 278 ENABLE_INTERRUPTS_SYSCALL_RET
264 swapgs
265 sysretq
266 279
267 CFI_RESTORE_STATE 280 CFI_RESTORE_STATE
268 /* Handle reschedules */ 281 /* Handle reschedules */
@@ -271,7 +284,7 @@ sysret_careful:
271 bt $TIF_NEED_RESCHED,%edx 284 bt $TIF_NEED_RESCHED,%edx
272 jnc sysret_signal 285 jnc sysret_signal
273 TRACE_IRQS_ON 286 TRACE_IRQS_ON
274 sti 287 ENABLE_INTERRUPTS(CLBR_NONE)
275 pushq %rdi 288 pushq %rdi
276 CFI_ADJUST_CFA_OFFSET 8 289 CFI_ADJUST_CFA_OFFSET 8
277 call schedule 290 call schedule
@@ -282,8 +295,8 @@ sysret_careful:
282 /* Handle a signal */ 295 /* Handle a signal */
283sysret_signal: 296sysret_signal:
284 TRACE_IRQS_ON 297 TRACE_IRQS_ON
285 sti 298 ENABLE_INTERRUPTS(CLBR_NONE)
286 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx 299 testl $_TIF_DO_NOTIFY_MASK,%edx
287 jz 1f 300 jz 1f
288 301
289 /* Really a signal */ 302 /* Really a signal */
@@ -295,7 +308,7 @@ sysret_signal:
2951: movl $_TIF_NEED_RESCHED,%edi 3081: movl $_TIF_NEED_RESCHED,%edi
296 /* Use IRET because user could have changed frame. This 309 /* Use IRET because user could have changed frame. This
297 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 310 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
298 cli 311 DISABLE_INTERRUPTS(CLBR_NONE)
299 TRACE_IRQS_OFF 312 TRACE_IRQS_OFF
300 jmp int_with_check 313 jmp int_with_check
301 314
@@ -327,7 +340,7 @@ tracesys:
327 */ 340 */
328 .globl int_ret_from_sys_call 341 .globl int_ret_from_sys_call
329int_ret_from_sys_call: 342int_ret_from_sys_call:
330 cli 343 DISABLE_INTERRUPTS(CLBR_NONE)
331 TRACE_IRQS_OFF 344 TRACE_IRQS_OFF
332 testl $3,CS-ARGOFFSET(%rsp) 345 testl $3,CS-ARGOFFSET(%rsp)
333 je retint_restore_args 346 je retint_restore_args
@@ -349,20 +362,20 @@ int_careful:
349 bt $TIF_NEED_RESCHED,%edx 362 bt $TIF_NEED_RESCHED,%edx
350 jnc int_very_careful 363 jnc int_very_careful
351 TRACE_IRQS_ON 364 TRACE_IRQS_ON
352 sti 365 ENABLE_INTERRUPTS(CLBR_NONE)
353 pushq %rdi 366 pushq %rdi
354 CFI_ADJUST_CFA_OFFSET 8 367 CFI_ADJUST_CFA_OFFSET 8
355 call schedule 368 call schedule
356 popq %rdi 369 popq %rdi
357 CFI_ADJUST_CFA_OFFSET -8 370 CFI_ADJUST_CFA_OFFSET -8
358 cli 371 DISABLE_INTERRUPTS(CLBR_NONE)
359 TRACE_IRQS_OFF 372 TRACE_IRQS_OFF
360 jmp int_with_check 373 jmp int_with_check
361 374
362 /* handle signals and tracing -- both require a full stack frame */ 375 /* handle signals and tracing -- both require a full stack frame */
363int_very_careful: 376int_very_careful:
364 TRACE_IRQS_ON 377 TRACE_IRQS_ON
365 sti 378 ENABLE_INTERRUPTS(CLBR_NONE)
366 SAVE_REST 379 SAVE_REST
367 /* Check for syscall exit trace */ 380 /* Check for syscall exit trace */
368 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 381 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
@@ -377,7 +390,7 @@ int_very_careful:
377 jmp int_restore_rest 390 jmp int_restore_rest
378 391
379int_signal: 392int_signal:
380 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx 393 testl $_TIF_DO_NOTIFY_MASK,%edx
381 jz 1f 394 jz 1f
382 movq %rsp,%rdi # &ptregs -> arg1 395 movq %rsp,%rdi # &ptregs -> arg1
383 xorl %esi,%esi # oldset -> arg2 396 xorl %esi,%esi # oldset -> arg2
@@ -385,7 +398,7 @@ int_signal:
3851: movl $_TIF_NEED_RESCHED,%edi 3981: movl $_TIF_NEED_RESCHED,%edi
386int_restore_rest: 399int_restore_rest:
387 RESTORE_REST 400 RESTORE_REST
388 cli 401 DISABLE_INTERRUPTS(CLBR_NONE)
389 TRACE_IRQS_OFF 402 TRACE_IRQS_OFF
390 jmp int_with_check 403 jmp int_with_check
391 CFI_ENDPROC 404 CFI_ENDPROC
@@ -506,7 +519,7 @@ END(stub_rt_sigreturn)
506 CFI_DEF_CFA_REGISTER rbp 519 CFI_DEF_CFA_REGISTER rbp
507 testl $3,CS(%rdi) 520 testl $3,CS(%rdi)
508 je 1f 521 je 1f
509 swapgs 522 SWAPGS
510 /* irqcount is used to check if a CPU is already on an interrupt 523 /* irqcount is used to check if a CPU is already on an interrupt
511 stack or not. While this is essentially redundant with preempt_count 524 stack or not. While this is essentially redundant with preempt_count
512 it is a little cheaper to use a separate counter in the PDA 525 it is a little cheaper to use a separate counter in the PDA
@@ -527,7 +540,7 @@ ENTRY(common_interrupt)
527 interrupt do_IRQ 540 interrupt do_IRQ
528 /* 0(%rsp): oldrsp-ARGOFFSET */ 541 /* 0(%rsp): oldrsp-ARGOFFSET */
529ret_from_intr: 542ret_from_intr:
530 cli 543 DISABLE_INTERRUPTS(CLBR_NONE)
531 TRACE_IRQS_OFF 544 TRACE_IRQS_OFF
532 decl %gs:pda_irqcount 545 decl %gs:pda_irqcount
533 leaveq 546 leaveq
@@ -556,64 +569,76 @@ retint_swapgs: /* return to user-space */
556 /* 569 /*
557 * The iretq could re-enable interrupts: 570 * The iretq could re-enable interrupts:
558 */ 571 */
559 cli 572 DISABLE_INTERRUPTS(CLBR_ANY)
560 TRACE_IRQS_IRETQ 573 TRACE_IRQS_IRETQ
561 swapgs 574 SWAPGS
562 jmp restore_args 575 jmp restore_args
563 576
564retint_restore_args: /* return to kernel space */ 577retint_restore_args: /* return to kernel space */
565 cli 578 DISABLE_INTERRUPTS(CLBR_ANY)
566 /* 579 /*
567 * The iretq could re-enable interrupts: 580 * The iretq could re-enable interrupts:
568 */ 581 */
569 TRACE_IRQS_IRETQ 582 TRACE_IRQS_IRETQ
570restore_args: 583restore_args:
571 RESTORE_ARGS 0,8,0 584 RESTORE_ARGS 0,8,0
572iret_label: 585#ifdef CONFIG_PARAVIRT
586 INTERRUPT_RETURN
587#endif
588ENTRY(native_iret)
573 iretq 589 iretq
574 590
575 .section __ex_table,"a" 591 .section __ex_table,"a"
576 .quad iret_label,bad_iret 592 .quad native_iret, bad_iret
577 .previous 593 .previous
578 .section .fixup,"ax" 594 .section .fixup,"ax"
579 /* force a signal here? this matches i386 behaviour */
580 /* running with kernel gs */
581bad_iret: 595bad_iret:
582 movq $11,%rdi /* SIGSEGV */ 596 /*
583 TRACE_IRQS_ON 597 * The iret traps when the %cs or %ss being restored is bogus.
584 sti 598 * We've lost the original trap vector and error code.
585 jmp do_exit 599 * #GPF is the most likely one to get for an invalid selector.
586 .previous 600 * So pretend we completed the iret and took the #GPF in user mode.
587 601 *
602 * We are now running with the kernel GS after exception recovery.
603 * But error_entry expects us to have user GS to match the user %cs,
604 * so swap back.
605 */
606 pushq $0
607
608 SWAPGS
609 jmp general_protection
610
611 .previous
612
588 /* edi: workmask, edx: work */ 613 /* edi: workmask, edx: work */
589retint_careful: 614retint_careful:
590 CFI_RESTORE_STATE 615 CFI_RESTORE_STATE
591 bt $TIF_NEED_RESCHED,%edx 616 bt $TIF_NEED_RESCHED,%edx
592 jnc retint_signal 617 jnc retint_signal
593 TRACE_IRQS_ON 618 TRACE_IRQS_ON
594 sti 619 ENABLE_INTERRUPTS(CLBR_NONE)
595 pushq %rdi 620 pushq %rdi
596 CFI_ADJUST_CFA_OFFSET 8 621 CFI_ADJUST_CFA_OFFSET 8
597 call schedule 622 call schedule
598 popq %rdi 623 popq %rdi
599 CFI_ADJUST_CFA_OFFSET -8 624 CFI_ADJUST_CFA_OFFSET -8
600 GET_THREAD_INFO(%rcx) 625 GET_THREAD_INFO(%rcx)
601 cli 626 DISABLE_INTERRUPTS(CLBR_NONE)
602 TRACE_IRQS_OFF 627 TRACE_IRQS_OFF
603 jmp retint_check 628 jmp retint_check
604 629
605retint_signal: 630retint_signal:
606 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx 631 testl $_TIF_DO_NOTIFY_MASK,%edx
607 jz retint_swapgs 632 jz retint_swapgs
608 TRACE_IRQS_ON 633 TRACE_IRQS_ON
609 sti 634 ENABLE_INTERRUPTS(CLBR_NONE)
610 SAVE_REST 635 SAVE_REST
611 movq $-1,ORIG_RAX(%rsp) 636 movq $-1,ORIG_RAX(%rsp)
612 xorl %esi,%esi # oldset 637 xorl %esi,%esi # oldset
613 movq %rsp,%rdi # &pt_regs 638 movq %rsp,%rdi # &pt_regs
614 call do_notify_resume 639 call do_notify_resume
615 RESTORE_REST 640 RESTORE_REST
616 cli 641 DISABLE_INTERRUPTS(CLBR_NONE)
617 TRACE_IRQS_OFF 642 TRACE_IRQS_OFF
618 movl $_TIF_NEED_RESCHED,%edi 643 movl $_TIF_NEED_RESCHED,%edi
619 GET_THREAD_INFO(%rcx) 644 GET_THREAD_INFO(%rcx)
@@ -731,7 +756,7 @@ END(spurious_interrupt)
731 rdmsr 756 rdmsr
732 testl %edx,%edx 757 testl %edx,%edx
733 js 1f 758 js 1f
734 swapgs 759 SWAPGS
735 xorl %ebx,%ebx 760 xorl %ebx,%ebx
7361: 7611:
737 .if \ist 762 .if \ist
@@ -747,7 +772,7 @@ END(spurious_interrupt)
747 .if \ist 772 .if \ist
748 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 773 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
749 .endif 774 .endif
750 cli 775 DISABLE_INTERRUPTS(CLBR_NONE)
751 .if \irqtrace 776 .if \irqtrace
752 TRACE_IRQS_OFF 777 TRACE_IRQS_OFF
753 .endif 778 .endif
@@ -776,10 +801,10 @@ paranoid_swapgs\trace:
776 .if \trace 801 .if \trace
777 TRACE_IRQS_IRETQ 0 802 TRACE_IRQS_IRETQ 0
778 .endif 803 .endif
779 swapgs 804 SWAPGS_UNSAFE_STACK
780paranoid_restore\trace: 805paranoid_restore\trace:
781 RESTORE_ALL 8 806 RESTORE_ALL 8
782 iretq 807 INTERRUPT_RETURN
783paranoid_userspace\trace: 808paranoid_userspace\trace:
784 GET_THREAD_INFO(%rcx) 809 GET_THREAD_INFO(%rcx)
785 movl threadinfo_flags(%rcx),%ebx 810 movl threadinfo_flags(%rcx),%ebx
@@ -794,11 +819,11 @@ paranoid_userspace\trace:
794 .if \trace 819 .if \trace
795 TRACE_IRQS_ON 820 TRACE_IRQS_ON
796 .endif 821 .endif
797 sti 822 ENABLE_INTERRUPTS(CLBR_NONE)
798 xorl %esi,%esi /* arg2: oldset */ 823 xorl %esi,%esi /* arg2: oldset */
799 movq %rsp,%rdi /* arg1: &pt_regs */ 824 movq %rsp,%rdi /* arg1: &pt_regs */
800 call do_notify_resume 825 call do_notify_resume
801 cli 826 DISABLE_INTERRUPTS(CLBR_NONE)
802 .if \trace 827 .if \trace
803 TRACE_IRQS_OFF 828 TRACE_IRQS_OFF
804 .endif 829 .endif
@@ -807,9 +832,9 @@ paranoid_schedule\trace:
807 .if \trace 832 .if \trace
808 TRACE_IRQS_ON 833 TRACE_IRQS_ON
809 .endif 834 .endif
810 sti 835 ENABLE_INTERRUPTS(CLBR_ANY)
811 call schedule 836 call schedule
812 cli 837 DISABLE_INTERRUPTS(CLBR_ANY)
813 .if \trace 838 .if \trace
814 TRACE_IRQS_OFF 839 TRACE_IRQS_OFF
815 .endif 840 .endif
@@ -862,7 +887,7 @@ KPROBE_ENTRY(error_entry)
862 testl $3,CS(%rsp) 887 testl $3,CS(%rsp)
863 je error_kernelspace 888 je error_kernelspace
864error_swapgs: 889error_swapgs:
865 swapgs 890 SWAPGS
866error_sti: 891error_sti:
867 movq %rdi,RDI(%rsp) 892 movq %rdi,RDI(%rsp)
868 CFI_REL_OFFSET rdi,RDI 893 CFI_REL_OFFSET rdi,RDI
@@ -874,7 +899,7 @@ error_sti:
874error_exit: 899error_exit:
875 movl %ebx,%eax 900 movl %ebx,%eax
876 RESTORE_REST 901 RESTORE_REST
877 cli 902 DISABLE_INTERRUPTS(CLBR_NONE)
878 TRACE_IRQS_OFF 903 TRACE_IRQS_OFF
879 GET_THREAD_INFO(%rcx) 904 GET_THREAD_INFO(%rcx)
880 testl %eax,%eax 905 testl %eax,%eax
@@ -894,7 +919,7 @@ error_kernelspace:
894 iret run with kernel gs again, so don't set the user space flag. 919 iret run with kernel gs again, so don't set the user space flag.
895 B stepping K8s sometimes report a truncated RIP for IRET 920 B stepping K8s sometimes report a truncated RIP for IRET
896 exceptions returning to compat mode. Check for these here too. */ 921 exceptions returning to compat mode. Check for these here too. */
897 leaq iret_label(%rip),%rbp 922 leaq native_iret(%rip),%rbp
898 cmpq %rbp,RIP(%rsp) 923 cmpq %rbp,RIP(%rsp)
899 je error_swapgs 924 je error_swapgs
900 movl %ebp,%ebp /* zero extend */ 925 movl %ebp,%ebp /* zero extend */
@@ -911,12 +936,12 @@ ENTRY(load_gs_index)
911 CFI_STARTPROC 936 CFI_STARTPROC
912 pushf 937 pushf
913 CFI_ADJUST_CFA_OFFSET 8 938 CFI_ADJUST_CFA_OFFSET 8
914 cli 939 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
915 swapgs 940 SWAPGS
916gs_change: 941gs_change:
917 movl %edi,%gs 942 movl %edi,%gs
9182: mfence /* workaround */ 9432: mfence /* workaround */
919 swapgs 944 SWAPGS
920 popf 945 popf
921 CFI_ADJUST_CFA_OFFSET -8 946 CFI_ADJUST_CFA_OFFSET -8
922 ret 947 ret
@@ -930,7 +955,7 @@ ENDPROC(load_gs_index)
930 .section .fixup,"ax" 955 .section .fixup,"ax"
931 /* running with kernelgs */ 956 /* running with kernelgs */
932bad_gs: 957bad_gs:
933 swapgs /* switch back to user gs */ 958 SWAPGS /* switch back to user gs */
934 xorl %eax,%eax 959 xorl %eax,%eax
935 movl %eax,%gs 960 movl %eax,%gs
936 jmp 2b 961 jmp 2b
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index ce703e21c91..4ae7b644026 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -24,18 +24,11 @@
24#include <acpi/acpi_bus.h> 24#include <acpi/acpi_bus.h>
25#endif 25#endif
26 26
27/* 27/* which logical CPU number maps to which CPU (physical APIC ID) */
28 * which logical CPU number maps to which CPU (physical APIC ID) 28u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
29 *
30 * The following static array is used during kernel startup
31 * and the x86_cpu_to_apicid_ptr contains the address of the
32 * array during this time. Is it zeroed when the per_cpu
33 * data area is removed.
34 */
35u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
36 = { [0 ... NR_CPUS-1] = BAD_APICID }; 29 = { [0 ... NR_CPUS-1] = BAD_APICID };
37void *x86_cpu_to_apicid_ptr; 30void *x86_cpu_to_apicid_early_ptr;
38DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; 31DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
39EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); 32EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
40 33
41struct genapic __read_mostly *genapic = &apic_flat; 34struct genapic __read_mostly *genapic = &apic_flat;
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
index f12d8c5d980..9c7f7d39596 100644
--- a/arch/x86/kernel/geode_32.c
+++ b/arch/x86/kernel/geode_32.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * AMD Geode southbridge support code 2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc. 3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public License 7 * modify it under the terms of version 2 of the GNU General Public License
@@ -51,45 +52,62 @@ EXPORT_SYMBOL_GPL(geode_get_dev_base);
51 52
52/* === GPIO API === */ 53/* === GPIO API === */
53 54
54void geode_gpio_set(unsigned int gpio, unsigned int reg) 55void geode_gpio_set(u32 gpio, unsigned int reg)
55{ 56{
56 u32 base = geode_get_dev_base(GEODE_DEV_GPIO); 57 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
57 58
58 if (!base) 59 if (!base)
59 return; 60 return;
60 61
61 if (gpio < 16) 62 /* low bank register */
62 outl(1 << gpio, base + reg); 63 if (gpio & 0xFFFF)
63 else 64 outl(gpio & 0xFFFF, base + reg);
64 outl(1 << (gpio - 16), base + 0x80 + reg); 65 /* high bank register */
66 gpio >>= 16;
67 if (gpio)
68 outl(gpio, base + 0x80 + reg);
65} 69}
66EXPORT_SYMBOL_GPL(geode_gpio_set); 70EXPORT_SYMBOL_GPL(geode_gpio_set);
67 71
68void geode_gpio_clear(unsigned int gpio, unsigned int reg) 72void geode_gpio_clear(u32 gpio, unsigned int reg)
69{ 73{
70 u32 base = geode_get_dev_base(GEODE_DEV_GPIO); 74 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
71 75
72 if (!base) 76 if (!base)
73 return; 77 return;
74 78
75 if (gpio < 16) 79 /* low bank register */
76 outl(1 << (gpio + 16), base + reg); 80 if (gpio & 0xFFFF)
77 else 81 outl((gpio & 0xFFFF) << 16, base + reg);
78 outl(1 << gpio, base + 0x80 + reg); 82 /* high bank register */
83 gpio &= (0xFFFF << 16);
84 if (gpio)
85 outl(gpio, base + 0x80 + reg);
79} 86}
80EXPORT_SYMBOL_GPL(geode_gpio_clear); 87EXPORT_SYMBOL_GPL(geode_gpio_clear);
81 88
82int geode_gpio_isset(unsigned int gpio, unsigned int reg) 89int geode_gpio_isset(u32 gpio, unsigned int reg)
83{ 90{
84 u32 base = geode_get_dev_base(GEODE_DEV_GPIO); 91 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
92 u32 val;
85 93
86 if (!base) 94 if (!base)
87 return 0; 95 return 0;
88 96
89 if (gpio < 16) 97 /* low bank register */
90 return (inl(base + reg) & (1 << gpio)) ? 1 : 0; 98 if (gpio & 0xFFFF) {
91 else 99 val = inl(base + reg) & (gpio & 0xFFFF);
92 return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0; 100 if ((gpio & 0xFFFF) == val)
101 return 1;
102 }
103 /* high bank register */
104 gpio >>= 16;
105 if (gpio) {
106 val = inl(base + 0x80 + reg) & gpio;
107 if (gpio == val)
108 return 1;
109 }
110 return 0;
93} 111}
94EXPORT_SYMBOL_GPL(geode_gpio_isset); 112EXPORT_SYMBOL_GPL(geode_gpio_isset);
95 113
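
The reworked helpers above no longer take a GPIO line number; they take a 32-bit mask whose low 16 bits select lines in the low bank and whose high 16 bits select lines in the high bank, and each operation is split across the two register banks at base + reg and base + 0x80 + reg. A user-space sketch of that split with outl() replaced by a printf; the I/O base and register offset below are invented, only the masking mirrors geode_gpio_set():

#include <stdint.h>
#include <stdio.h>

static void fake_outl(uint32_t val, uint32_t port)
{
	printf("outl(%#010x, %#x)\n", val, port);
}

/* Mirrors geode_gpio_set(): the low half goes to the low-bank register,
 * the high half (shifted down) goes to the high-bank register at +0x80. */
static void fake_gpio_set(uint32_t gpio, uint32_t base, uint32_t reg)
{
	if (gpio & 0xFFFF)
		fake_outl(gpio & 0xFFFF, base + reg);
	gpio >>= 16;
	if (gpio)
		fake_outl(gpio, base + 0x80 + reg);
}

int main(void)
{
	uint32_t base = 0x6100;   /* made-up MSR-derived I/O base */
	fake_gpio_set((1u << 3) | (1u << 20), base, 0x04);
	/* expected: outl(0x00000008, 0x6104) and outl(0x00000010, 0x6184) */
	return 0;
}
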
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 6b3469311e4..24dbf56928d 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -10,6 +10,7 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/start_kernel.h>
13 14
14#include <asm/processor.h> 15#include <asm/processor.h>
15#include <asm/proto.h> 16#include <asm/proto.h>
@@ -19,12 +20,14 @@
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/sections.h> 22#include <asm/sections.h>
23#include <asm/kdebug.h>
24#include <asm/e820.h>
22 25
23static void __init zap_identity_mappings(void) 26static void __init zap_identity_mappings(void)
24{ 27{
25 pgd_t *pgd = pgd_offset_k(0UL); 28 pgd_t *pgd = pgd_offset_k(0UL);
26 pgd_clear(pgd); 29 pgd_clear(pgd);
27 __flush_tlb(); 30 __flush_tlb_all();
28} 31}
29 32
30/* Don't add a printk in there. printk relies on the PDA which is not initialized 33/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -46,6 +49,35 @@ static void __init copy_bootdata(char *real_mode_data)
46 } 49 }
47} 50}
48 51
52#define EBDA_ADDR_POINTER 0x40E
53
54static __init void reserve_ebda(void)
55{
56 unsigned ebda_addr, ebda_size;
57
58 /*
59 * there is a real-mode segmented pointer pointing to the
60 * 4K EBDA area at 0x40E
61 */
62 ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
63 ebda_addr <<= 4;
64
65 if (!ebda_addr)
66 return;
67
68 ebda_size = *(unsigned short *)__va(ebda_addr);
69
70 /* Round EBDA up to pages */
71 if (ebda_size == 0)
72 ebda_size = 1;
73 ebda_size <<= 10;
74 ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
75 if (ebda_size > 64*1024)
76 ebda_size = 64*1024;
77
78 reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
79}
80
49void __init x86_64_start_kernel(char * real_mode_data) 81void __init x86_64_start_kernel(char * real_mode_data)
50{ 82{
51 int i; 83 int i;
@@ -56,8 +88,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
56 /* Make NULL pointers segfault */ 88 /* Make NULL pointers segfault */
57 zap_identity_mappings(); 89 zap_identity_mappings();
58 90
59 for (i = 0; i < IDT_ENTRIES; i++) 91 for (i = 0; i < IDT_ENTRIES; i++) {
92#ifdef CONFIG_EARLY_PRINTK
93 set_intr_gate(i, &early_idt_handlers[i]);
94#else
60 set_intr_gate(i, early_idt_handler); 95 set_intr_gate(i, early_idt_handler);
96#endif
97 }
61 load_idt((const struct desc_ptr *)&idt_descr); 98 load_idt((const struct desc_ptr *)&idt_descr);
62 99
63 early_printk("Kernel alive\n"); 100 early_printk("Kernel alive\n");
@@ -67,8 +104,24 @@ void __init x86_64_start_kernel(char * real_mode_data)
67 104
68 pda_init(0); 105 pda_init(0);
69 copy_bootdata(__va(real_mode_data)); 106 copy_bootdata(__va(real_mode_data));
70#ifdef CONFIG_SMP 107
71 cpu_set(0, cpu_online_map); 108 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
72#endif 109
110 /* Reserve INITRD */
111 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
112 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
113 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
114 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
115 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
116 }
117
118 reserve_ebda();
119
120 /*
121 * At this point everything still needed from the boot loader
122 * or BIOS or kernel text should be early reserved or marked not
123 * RAM in e820. All other memory is free game.
124 */
125
73 start_kernel(); 126 start_kernel();
74} 127}
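
reserve_ebda() above recovers the EBDA location from the real-mode word at 0x40E: that word is a segment value, so shifting it left four bits yields the linear address; the first word of the EBDA then gives its size in KiB, which is rounded up to whole pages (taking the offset within the first page into account) and capped at 64 KiB. The same arithmetic as a stand-alone sketch; the sample segment and size values are invented:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long round_up_to(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	uint16_t ebda_segment = 0x9fc0;   /* invented value of the word at 0x40E */
	uint16_t ebda_kib     = 1;        /* invented first word of the EBDA */

	unsigned long ebda_addr = (unsigned long)ebda_segment << 4;  /* 0x9fc00 */
	unsigned long ebda_size = ebda_kib ? ebda_kib : 1;
	ebda_size <<= 10;                                            /* KiB -> bytes */
	ebda_size = round_up_to(ebda_size + (ebda_addr & (PAGE_SIZE - 1)), PAGE_SIZE);
	if (ebda_size > 64 * 1024)
		ebda_size = 64 * 1024;

	printf("reserve [%#lx, %#lx) for the EBDA\n", ebda_addr, ebda_addr + ebda_size);
	return 0;
}
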
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fbad51fce67..5d8c5730686 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -9,6 +9,7 @@
9 9
10.text 10.text
11#include <linux/threads.h> 11#include <linux/threads.h>
12#include <linux/init.h>
12#include <linux/linkage.h> 13#include <linux/linkage.h>
13#include <asm/segment.h> 14#include <asm/segment.h>
14#include <asm/page.h> 15#include <asm/page.h>
@@ -151,7 +152,9 @@ WEAK(xen_entry)
151 /* Unknown implementation; there's really 152 /* Unknown implementation; there's really
152 nothing we can do at this point. */ 153 nothing we can do at this point. */
153 ud2a 154 ud2a
154.data 155
156 __INITDATA
157
155subarch_entries: 158subarch_entries:
156 .long default_entry /* normal x86/PC */ 159 .long default_entry /* normal x86/PC */
157 .long lguest_entry /* lguest hypervisor */ 160 .long lguest_entry /* lguest hypervisor */
@@ -199,7 +202,6 @@ default_entry:
199 addl $0x67, %eax /* 0x67 == _PAGE_TABLE */ 202 addl $0x67, %eax /* 0x67 == _PAGE_TABLE */
200 movl %eax, 4092(%edx) 203 movl %eax, 4092(%edx)
201 204
202 xorl %ebx,%ebx /* This is the boot CPU (BSP) */
203 jmp 3f 205 jmp 3f
204/* 206/*
205 * Non-boot CPU entry point; entered from trampoline.S 207 * Non-boot CPU entry point; entered from trampoline.S
@@ -222,6 +224,8 @@ ENTRY(startup_32_smp)
222 movl %eax,%es 224 movl %eax,%es
223 movl %eax,%fs 225 movl %eax,%fs
224 movl %eax,%gs 226 movl %eax,%gs
227#endif /* CONFIG_SMP */
2283:
225 229
226/* 230/*
227 * New page tables may be in 4Mbyte page mode and may 231 * New page tables may be in 4Mbyte page mode and may
@@ -268,12 +272,6 @@ ENTRY(startup_32_smp)
268 wrmsr 272 wrmsr
269 273
2706: 2746:
271 /* This is a secondary processor (AP) */
272 xorl %ebx,%ebx
273 incl %ebx
274
275#endif /* CONFIG_SMP */
2763:
277 275
278/* 276/*
279 * Enable paging 277 * Enable paging
@@ -297,7 +295,7 @@ ENTRY(startup_32_smp)
297 popfl 295 popfl
298 296
299#ifdef CONFIG_SMP 297#ifdef CONFIG_SMP
300 andl %ebx,%ebx 298 cmpb $0, ready
301 jz 1f /* Initial CPU cleans BSS */ 299 jz 1f /* Initial CPU cleans BSS */
302 jmp checkCPUtype 300 jmp checkCPUtype
3031: 3011:
@@ -502,6 +500,7 @@ early_fault:
502 call printk 500 call printk
503#endif 501#endif
504#endif 502#endif
503 call dump_stack
505hlt_loop: 504hlt_loop:
506 hlt 505 hlt
507 jmp hlt_loop 506 jmp hlt_loop
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b6167fe3330..09b38d539b0 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,13 @@
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21 21
22#ifdef CONFIG_PARAVIRT
23#include <asm/asm-offsets.h>
24#include <asm/paravirt.h>
25#else
26#define GET_CR2_INTO_RCX movq %cr2, %rcx
27#endif
28
22/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE 29 * because we need identity-mapped pages.
23 * because we need identity-mapped pages. 30 * because we need identity-mapped pages.
24 * 31 *
@@ -56,7 +63,7 @@ startup_64:
56 63
57 /* Is the address not 2M aligned? */ 64 /* Is the address not 2M aligned? */
58 movq %rbp, %rax 65 movq %rbp, %rax
59 andl $~LARGE_PAGE_MASK, %eax 66 andl $~PMD_PAGE_MASK, %eax
60 testl %eax, %eax 67 testl %eax, %eax
61 jnz bad_address 68 jnz bad_address
62 69
@@ -81,7 +88,7 @@ startup_64:
81 88
82 /* Add an Identity mapping if I am above 1G */ 89 /* Add an Identity mapping if I am above 1G */
83 leaq _text(%rip), %rdi 90 leaq _text(%rip), %rdi
84 andq $LARGE_PAGE_MASK, %rdi 91 andq $PMD_PAGE_MASK, %rdi
85 92
86 movq %rdi, %rax 93 movq %rdi, %rax
87 shrq $PUD_SHIFT, %rax 94 shrq $PUD_SHIFT, %rax
@@ -243,31 +250,55 @@ ENTRY(secondary_startup_64)
243 lretq 250 lretq
244 251
245 /* SMP bootup changes these two */ 252 /* SMP bootup changes these two */
246#ifndef CONFIG_HOTPLUG_CPU 253 __CPUINITDATA
247 .pushsection .init.data
248#endif
249 .align 8 254 .align 8
250 .globl initial_code 255 ENTRY(initial_code)
251initial_code:
252 .quad x86_64_start_kernel 256 .quad x86_64_start_kernel
253#ifndef CONFIG_HOTPLUG_CPU 257 __FINITDATA
254 .popsection 258
255#endif 259 ENTRY(init_rsp)
256 .globl init_rsp
257init_rsp:
258 .quad init_thread_union+THREAD_SIZE-8 260 .quad init_thread_union+THREAD_SIZE-8
259 261
260bad_address: 262bad_address:
261 jmp bad_address 263 jmp bad_address
262 264
265#ifdef CONFIG_EARLY_PRINTK
266.macro early_idt_tramp first, last
267 .ifgt \last-\first
268 early_idt_tramp \first, \last-1
269 .endif
270 movl $\last,%esi
271 jmp early_idt_handler
272.endm
273
274 .globl early_idt_handlers
275early_idt_handlers:
276 early_idt_tramp 0, 63
277 early_idt_tramp 64, 127
278 early_idt_tramp 128, 191
279 early_idt_tramp 192, 255
280#endif
281
263ENTRY(early_idt_handler) 282ENTRY(early_idt_handler)
283#ifdef CONFIG_EARLY_PRINTK
264 cmpl $2,early_recursion_flag(%rip) 284 cmpl $2,early_recursion_flag(%rip)
265 jz 1f 285 jz 1f
266 incl early_recursion_flag(%rip) 286 incl early_recursion_flag(%rip)
287 GET_CR2_INTO_RCX
288 movq %rcx,%r9
289 xorl %r8d,%r8d # zero for error code
290 movl %esi,%ecx # get vector number
291 # Test %ecx against mask of vectors that push error code.
292 cmpl $31,%ecx
293 ja 0f
294 movl $1,%eax
295 salq %cl,%rax
296 testl $0x27d00,%eax
297 je 0f
298 popq %r8 # get error code
2990: movq 0(%rsp),%rcx # get ip
300 movq 8(%rsp),%rdx # get cs
267 xorl %eax,%eax 301 xorl %eax,%eax
268 movq 8(%rsp),%rsi # get rip
269 movq (%rsp),%rdx
270 movq %cr2,%rcx
271 leaq early_idt_msg(%rip),%rdi 302 leaq early_idt_msg(%rip),%rdi
272 call early_printk 303 call early_printk
273 cmpl $2,early_recursion_flag(%rip) 304 cmpl $2,early_recursion_flag(%rip)
@@ -278,15 +309,19 @@ ENTRY(early_idt_handler)
278 movq 8(%rsp),%rsi # get rip again 309 movq 8(%rsp),%rsi # get rip again
279 call __print_symbol 310 call __print_symbol
280#endif 311#endif
312#endif /* EARLY_PRINTK */
2811: hlt 3131: hlt
282 jmp 1b 314 jmp 1b
315
316#ifdef CONFIG_EARLY_PRINTK
283early_recursion_flag: 317early_recursion_flag:
284 .long 0 318 .long 0
285 319
286early_idt_msg: 320early_idt_msg:
287 .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" 321 .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
288early_idt_ripmsg: 322early_idt_ripmsg:
289 .asciz "RIP %s\n" 323 .asciz "RIP %s\n"
324#endif /* CONFIG_EARLY_PRINTK */
290 325
291.balign PAGE_SIZE 326.balign PAGE_SIZE
292 327
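
The new early_idt_handler above prints the vector number, %cr2 and, when the CPU pushed one, the error code; whether an error code is present is decided by testing 1 << vector against the constant 0x27d00. That constant is just the set of x86 exceptions that push an error code, which the following stand-alone check confirms:

#include <stdio.h>
#include <assert.h>

int main(void)
{
	/* x86 exceptions that push an error code: #DF(8), #TS(10), #NP(11),
	 * #SS(12), #GP(13), #PF(14), #AC(17). */
	int vectors[] = { 8, 10, 11, 12, 13, 14, 17 };
	unsigned long mask = 0;

	for (unsigned i = 0; i < sizeof(vectors) / sizeof(vectors[0]); i++)
		mask |= 1UL << vectors[i];

	assert(mask == 0x27d00);   /* matches the testl $0x27d00 in early_idt_handler */
	printf("error-code vector mask = %#lx\n", mask);
	return 0;
}
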
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4a86ffd67ec..429d084e014 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -6,7 +6,6 @@
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/sysdev.h> 7#include <linux/sysdev.h>
8#include <linux/pm.h> 8#include <linux/pm.h>
9#include <linux/delay.h>
10 9
11#include <asm/fixmap.h> 10#include <asm/fixmap.h>
12#include <asm/hpet.h> 11#include <asm/hpet.h>
@@ -16,7 +15,8 @@
16#define HPET_MASK CLOCKSOURCE_MASK(32) 15#define HPET_MASK CLOCKSOURCE_MASK(32)
17#define HPET_SHIFT 22 16#define HPET_SHIFT 22
18 17
19/* FSEC = 10^-15 NSEC = 10^-9 */ 18/* FSEC = 10^-15
19 NSEC = 10^-9 */
20#define FSEC_PER_NSEC 1000000 20#define FSEC_PER_NSEC 1000000
21 21
22/* 22/*
@@ -107,6 +107,7 @@ int is_hpet_enabled(void)
107{ 107{
108 return is_hpet_capable() && hpet_legacy_int_enabled; 108 return is_hpet_capable() && hpet_legacy_int_enabled;
109} 109}
110EXPORT_SYMBOL_GPL(is_hpet_enabled);
110 111
111/* 112/*
112 * When the hpet driver (/dev/hpet) is enabled, we need to reserve 113 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
@@ -132,16 +133,13 @@ static void hpet_reserve_platform_timers(unsigned long id)
132#ifdef CONFIG_HPET_EMULATE_RTC 133#ifdef CONFIG_HPET_EMULATE_RTC
133 hpet_reserve_timer(&hd, 1); 134 hpet_reserve_timer(&hd, 1);
134#endif 135#endif
135
136 hd.hd_irq[0] = HPET_LEGACY_8254; 136 hd.hd_irq[0] = HPET_LEGACY_8254;
137 hd.hd_irq[1] = HPET_LEGACY_RTC; 137 hd.hd_irq[1] = HPET_LEGACY_RTC;
138 138
139 for (i = 2; i < nrtimers; timer++, i++) 139 for (i = 2; i < nrtimers; timer++, i++)
140 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> 140 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
141 Tn_INT_ROUTE_CNF_SHIFT; 141 Tn_INT_ROUTE_CNF_SHIFT;
142
143 hpet_alloc(&hd); 142 hpet_alloc(&hd);
144
145} 143}
146#else 144#else
147static void hpet_reserve_platform_timers(unsigned long id) { } 145static void hpet_reserve_platform_timers(unsigned long id) { }
@@ -478,6 +476,7 @@ void hpet_disable(void)
478 */ 476 */
479#include <linux/mc146818rtc.h> 477#include <linux/mc146818rtc.h>
480#include <linux/rtc.h> 478#include <linux/rtc.h>
479#include <asm/rtc.h>
481 480
482#define DEFAULT_RTC_INT_FREQ 64 481#define DEFAULT_RTC_INT_FREQ 64
483#define DEFAULT_RTC_SHIFT 6 482#define DEFAULT_RTC_SHIFT 6
@@ -492,6 +491,38 @@ static unsigned long hpet_default_delta;
492static unsigned long hpet_pie_delta; 491static unsigned long hpet_pie_delta;
493static unsigned long hpet_pie_limit; 492static unsigned long hpet_pie_limit;
494 493
494static rtc_irq_handler irq_handler;
495
496/*
497 * Registers a IRQ handler.
498 */
499int hpet_register_irq_handler(rtc_irq_handler handler)
500{
501 if (!is_hpet_enabled())
502 return -ENODEV;
503 if (irq_handler)
504 return -EBUSY;
505
506 irq_handler = handler;
507
508 return 0;
509}
510EXPORT_SYMBOL_GPL(hpet_register_irq_handler);
511
512/*
513 * Deregisters the IRQ handler registered with hpet_register_irq_handler()
514 * and does cleanup.
515 */
516void hpet_unregister_irq_handler(rtc_irq_handler handler)
517{
518 if (!is_hpet_enabled())
519 return;
520
521 irq_handler = NULL;
522 hpet_rtc_flags = 0;
523}
524EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
525
495/* 526/*
496 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode 527 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
497 * is not supported by all HPET implementations for timer 1. 528 * is not supported by all HPET implementations for timer 1.
@@ -533,6 +564,7 @@ int hpet_rtc_timer_init(void)
533 564
534 return 1; 565 return 1;
535} 566}
567EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
536 568
537/* 569/*
538 * The functions below are called from rtc driver. 570 * The functions below are called from rtc driver.
@@ -547,6 +579,7 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
547 hpet_rtc_flags &= ~bit_mask; 579 hpet_rtc_flags &= ~bit_mask;
548 return 1; 580 return 1;
549} 581}
582EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
550 583
551int hpet_set_rtc_irq_bit(unsigned long bit_mask) 584int hpet_set_rtc_irq_bit(unsigned long bit_mask)
552{ 585{
@@ -562,6 +595,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
562 595
563 return 1; 596 return 1;
564} 597}
598EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit);
565 599
566int hpet_set_alarm_time(unsigned char hrs, unsigned char min, 600int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
567 unsigned char sec) 601 unsigned char sec)
@@ -575,6 +609,7 @@ int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
575 609
576 return 1; 610 return 1;
577} 611}
612EXPORT_SYMBOL_GPL(hpet_set_alarm_time);
578 613
579int hpet_set_periodic_freq(unsigned long freq) 614int hpet_set_periodic_freq(unsigned long freq)
580{ 615{
@@ -593,11 +628,13 @@ int hpet_set_periodic_freq(unsigned long freq)
593 } 628 }
594 return 1; 629 return 1;
595} 630}
631EXPORT_SYMBOL_GPL(hpet_set_periodic_freq);
596 632
597int hpet_rtc_dropped_irq(void) 633int hpet_rtc_dropped_irq(void)
598{ 634{
599 return is_hpet_enabled(); 635 return is_hpet_enabled();
600} 636}
637EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
601 638
602static void hpet_rtc_timer_reinit(void) 639static void hpet_rtc_timer_reinit(void)
603{ 640{
@@ -641,9 +678,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
641 unsigned long rtc_int_flag = 0; 678 unsigned long rtc_int_flag = 0;
642 679
643 hpet_rtc_timer_reinit(); 680 hpet_rtc_timer_reinit();
681 memset(&curr_time, 0, sizeof(struct rtc_time));
644 682
645 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) 683 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
646 rtc_get_rtc_time(&curr_time); 684 get_rtc_time(&curr_time);
647 685
648 if (hpet_rtc_flags & RTC_UIE && 686 if (hpet_rtc_flags & RTC_UIE &&
649 curr_time.tm_sec != hpet_prev_update_sec) { 687 curr_time.tm_sec != hpet_prev_update_sec) {
@@ -657,7 +695,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
657 hpet_pie_count = 0; 695 hpet_pie_count = 0;
658 } 696 }
659 697
660 if (hpet_rtc_flags & RTC_PIE && 698 if (hpet_rtc_flags & RTC_AIE &&
661 (curr_time.tm_sec == hpet_alarm_time.tm_sec) && 699 (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
662 (curr_time.tm_min == hpet_alarm_time.tm_min) && 700 (curr_time.tm_min == hpet_alarm_time.tm_min) &&
663 (curr_time.tm_hour == hpet_alarm_time.tm_hour)) 701 (curr_time.tm_hour == hpet_alarm_time.tm_hour))
@@ -665,8 +703,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
665 703
666 if (rtc_int_flag) { 704 if (rtc_int_flag) {
667 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); 705 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
668 rtc_interrupt(rtc_int_flag, dev_id); 706 if (irq_handler)
707 irq_handler(rtc_int_flag, dev_id);
669 } 708 }
670 return IRQ_HANDLED; 709 return IRQ_HANDLED;
671} 710}
711EXPORT_SYMBOL_GPL(hpet_rtc_interrupt);
672#endif 712#endif
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 02112fcc0de..061627806a2 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -22,12 +22,5 @@ EXPORT_SYMBOL(__put_user_8);
22 22
23EXPORT_SYMBOL(strstr); 23EXPORT_SYMBOL(strstr);
24 24
25#ifdef CONFIG_SMP
26extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
27extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
28EXPORT_SYMBOL(__write_lock_failed);
29EXPORT_SYMBOL(__read_lock_failed);
30#endif
31
32EXPORT_SYMBOL(csum_partial); 25EXPORT_SYMBOL(csum_partial);
33EXPORT_SYMBOL(empty_zero_page); 26EXPORT_SYMBOL(empty_zero_page);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
new file mode 100644
index 00000000000..26719bd2c77
--- /dev/null
+++ b/arch/x86/kernel/i387.c
@@ -0,0 +1,479 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * General FPU state handling cleanups
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9#include <linux/sched.h>
10#include <linux/module.h>
11#include <linux/regset.h>
12#include <asm/processor.h>
13#include <asm/i387.h>
14#include <asm/math_emu.h>
15#include <asm/sigcontext.h>
16#include <asm/user.h>
17#include <asm/ptrace.h>
18#include <asm/uaccess.h>
19
20#ifdef CONFIG_X86_64
21
22#include <asm/sigcontext32.h>
23#include <asm/user32.h>
24
25#else
26
27#define save_i387_ia32 save_i387
28#define restore_i387_ia32 restore_i387
29
30#define _fpstate_ia32 _fpstate
31#define user_i387_ia32_struct user_i387_struct
32#define user32_fxsr_struct user_fxsr_struct
33
34#endif
35
36#ifdef CONFIG_MATH_EMULATION
37#define HAVE_HWFP (boot_cpu_data.hard_math)
38#else
39#define HAVE_HWFP 1
40#endif
41
42unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
43
44void mxcsr_feature_mask_init(void)
45{
46 unsigned long mask = 0;
47 clts();
48 if (cpu_has_fxsr) {
49 memset(&current->thread.i387.fxsave, 0,
50 sizeof(struct i387_fxsave_struct));
51 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
52 mask = current->thread.i387.fxsave.mxcsr_mask;
53 if (mask == 0)
54 mask = 0x0000ffbf;
55 }
56 mxcsr_feature_mask &= mask;
57 stts();
58}
59
60#ifdef CONFIG_X86_64
61/*
62 * Called at bootup to set up the initial FPU state that is later cloned
63 * into all processes.
64 */
65void __cpuinit fpu_init(void)
66{
67 unsigned long oldcr0 = read_cr0();
68 extern void __bad_fxsave_alignment(void);
69
70 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
71 __bad_fxsave_alignment();
72 set_in_cr4(X86_CR4_OSFXSR);
73 set_in_cr4(X86_CR4_OSXMMEXCPT);
74
75 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
76
77 mxcsr_feature_mask_init();
78 /* clean state in init */
79 current_thread_info()->status = 0;
80 clear_used_math();
81}
82#endif /* CONFIG_X86_64 */
83
84/*
85 * The _current_ task is using the FPU for the first time
86 * so initialize it and set the mxcsr to its default
87 * value at reset if we support XMM instructions and then
88 * remember that the current task has used the FPU.
89 */
90void init_fpu(struct task_struct *tsk)
91{
92 if (tsk_used_math(tsk)) {
93 if (tsk == current)
94 unlazy_fpu(tsk);
95 return;
96 }
97
98 if (cpu_has_fxsr) {
99 memset(&tsk->thread.i387.fxsave, 0,
100 sizeof(struct i387_fxsave_struct));
101 tsk->thread.i387.fxsave.cwd = 0x37f;
102 if (cpu_has_xmm)
103 tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT;
104 } else {
105 memset(&tsk->thread.i387.fsave, 0,
106 sizeof(struct i387_fsave_struct));
107 tsk->thread.i387.fsave.cwd = 0xffff037fu;
108 tsk->thread.i387.fsave.swd = 0xffff0000u;
109 tsk->thread.i387.fsave.twd = 0xffffffffu;
110 tsk->thread.i387.fsave.fos = 0xffff0000u;
111 }
112 /*
113 * Only the device not available exception or ptrace can call init_fpu.
114 */
115 set_stopped_child_used_math(tsk);
116}
117
118int fpregs_active(struct task_struct *target, const struct user_regset *regset)
119{
120 return tsk_used_math(target) ? regset->n : 0;
121}
122
123int xfpregs_active(struct task_struct *target, const struct user_regset *regset)
124{
125 return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0;
126}
127
128int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
129 unsigned int pos, unsigned int count,
130 void *kbuf, void __user *ubuf)
131{
132 if (!cpu_has_fxsr)
133 return -ENODEV;
134
135 unlazy_fpu(target);
136
137 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
138 &target->thread.i387.fxsave, 0, -1);
139}
140
141int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
142 unsigned int pos, unsigned int count,
143 const void *kbuf, const void __user *ubuf)
144{
145 int ret;
146
147 if (!cpu_has_fxsr)
148 return -ENODEV;
149
150 unlazy_fpu(target);
151 set_stopped_child_used_math(target);
152
153 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
154 &target->thread.i387.fxsave, 0, -1);
155
156 /*
157 * mxcsr reserved bits must be masked to zero for security reasons.
158 */
159 target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
160
161 return ret;
162}
163
164#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
165
166/*
167 * FPU tag word conversions.
168 */
169
170static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
171{
172 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
173
174 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
175 tmp = ~twd;
176 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
177 /* and move the valid bits to the lower byte. */
178 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
179 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
180 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
181 return tmp;
182}
183
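The tag-word compression above can be sanity-checked in user space; the sketch below repeats the same bit-twiddle (assumed layout: two i387 tag bits per register, 11b meaning empty) and prints the abridged FXSR tag byte for a hypothetical sample value:

#include <stdio.h>

/* Same transform as twd_i387_to_fxsr(): collapse the 16-bit i387 tag word
 * (2 bits per register, 11b == empty) into the 8-bit FXSR tag byte
 * (1 bit per register, 1 == not empty). */
static unsigned short i387_to_fxsr_tag(unsigned short twd)
{
    unsigned int tmp = ~twd;

    tmp = (tmp | (tmp >> 1)) & 0x5555;
    tmp = (tmp | (tmp >> 1)) & 0x3333;
    tmp = (tmp | (tmp >> 2)) & 0x0f0f;
    tmp = (tmp | (tmp >> 4)) & 0x00ff;
    return tmp;
}

int main(void)
{
    /* st0 valid (00), st1 zero (01), st2 special (10), st3-st7 empty (11) */
    unsigned short twd = 0xffe4;  /* hypothetical sample tag word */

    printf("fxsr tag byte: %#04x\n", i387_to_fxsr_tag(twd));  /* prints 0x07 */
    return 0;
}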
 184#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16)
185#define FP_EXP_TAG_VALID 0
186#define FP_EXP_TAG_ZERO 1
187#define FP_EXP_TAG_SPECIAL 2
188#define FP_EXP_TAG_EMPTY 3
189
190static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
191{
192 struct _fpxreg *st;
193 u32 tos = (fxsave->swd >> 11) & 7;
194 u32 twd = (unsigned long) fxsave->twd;
195 u32 tag;
196 u32 ret = 0xffff0000u;
197 int i;
198
199 for (i = 0; i < 8; i++, twd >>= 1) {
200 if (twd & 0x1) {
201 st = FPREG_ADDR(fxsave, (i - tos) & 7);
202
203 switch (st->exponent & 0x7fff) {
204 case 0x7fff:
205 tag = FP_EXP_TAG_SPECIAL;
206 break;
207 case 0x0000:
208 if (!st->significand[0] &&
209 !st->significand[1] &&
210 !st->significand[2] &&
211 !st->significand[3])
212 tag = FP_EXP_TAG_ZERO;
213 else
214 tag = FP_EXP_TAG_SPECIAL;
215 break;
216 default:
217 if (st->significand[3] & 0x8000)
218 tag = FP_EXP_TAG_VALID;
219 else
220 tag = FP_EXP_TAG_SPECIAL;
221 break;
222 }
223 } else {
224 tag = FP_EXP_TAG_EMPTY;
225 }
226 ret |= tag << (2 * i);
227 }
228 return ret;
229}
230
231/*
232 * FXSR floating point environment conversions.
233 */
234
235static void convert_from_fxsr(struct user_i387_ia32_struct *env,
236 struct task_struct *tsk)
237{
238 struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
239 struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
240 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
241 int i;
242
243 env->cwd = fxsave->cwd | 0xffff0000u;
244 env->swd = fxsave->swd | 0xffff0000u;
245 env->twd = twd_fxsr_to_i387(fxsave);
246
247#ifdef CONFIG_X86_64
248 env->fip = fxsave->rip;
249 env->foo = fxsave->rdp;
250 if (tsk == current) {
251 /*
 252 * This should actually be ds/cs at FPU exception time, but
 253 * that information is not available in 64-bit mode.
254 */
255 asm("mov %%ds,%0" : "=r" (env->fos));
256 asm("mov %%cs,%0" : "=r" (env->fcs));
257 } else {
258 struct pt_regs *regs = task_pt_regs(tsk);
259 env->fos = 0xffff0000 | tsk->thread.ds;
260 env->fcs = regs->cs;
261 }
262#else
263 env->fip = fxsave->fip;
264 env->fcs = fxsave->fcs;
265 env->foo = fxsave->foo;
266 env->fos = fxsave->fos;
267#endif
268
269 for (i = 0; i < 8; ++i)
270 memcpy(&to[i], &from[i], sizeof(to[0]));
271}
272
273static void convert_to_fxsr(struct task_struct *tsk,
274 const struct user_i387_ia32_struct *env)
275
276{
277 struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
278 struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
279 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
280 int i;
281
282 fxsave->cwd = env->cwd;
283 fxsave->swd = env->swd;
284 fxsave->twd = twd_i387_to_fxsr(env->twd);
285 fxsave->fop = (u16) ((u32) env->fcs >> 16);
286#ifdef CONFIG_X86_64
287 fxsave->rip = env->fip;
288 fxsave->rdp = env->foo;
289 /* cs and ds ignored */
290#else
291 fxsave->fip = env->fip;
292 fxsave->fcs = (env->fcs & 0xffff);
293 fxsave->foo = env->foo;
294 fxsave->fos = env->fos;
295#endif
296
297 for (i = 0; i < 8; ++i)
298 memcpy(&to[i], &from[i], sizeof(from[0]));
299}
300
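The two memcpy loops above work because the legacy i387 frame packs each ST register into 10 bytes while the fxsave frame pads it to 16; a quick sketch with assumed layouts (not copied from the kernel headers) illustrates the size difference:

#include <stdio.h>
#include <stdint.h>

/* Assumed layouts: the legacy frame stores 80-bit registers back to back,
 * the fxsave frame pads each register slot to 16 bytes. */
struct fpreg  { uint16_t significand[4]; uint16_t exponent; };                   /* 10 bytes */
struct fpxreg { uint16_t significand[4]; uint16_t exponent; uint16_t pad[3]; };  /* 16 bytes */

int main(void)
{
    printf("legacy ST slot: %zu bytes, fxsave ST slot: %zu bytes\n",
           sizeof(struct fpreg), sizeof(struct fpxreg));
    /* Copying sizeof(to[0]) bytes per register, as convert_from_fxsr() does,
     * drops the padding and tightly packs the legacy image. */
    return 0;
}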
301int fpregs_get(struct task_struct *target, const struct user_regset *regset,
302 unsigned int pos, unsigned int count,
303 void *kbuf, void __user *ubuf)
304{
305 struct user_i387_ia32_struct env;
306
307 if (!HAVE_HWFP)
308 return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
309
310 unlazy_fpu(target);
311
312 if (!cpu_has_fxsr)
313 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
314 &target->thread.i387.fsave, 0, -1);
315
316 if (kbuf && pos == 0 && count == sizeof(env)) {
317 convert_from_fxsr(kbuf, target);
318 return 0;
319 }
320
321 convert_from_fxsr(&env, target);
322 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
323}
324
325int fpregs_set(struct task_struct *target, const struct user_regset *regset,
326 unsigned int pos, unsigned int count,
327 const void *kbuf, const void __user *ubuf)
328{
329 struct user_i387_ia32_struct env;
330 int ret;
331
332 if (!HAVE_HWFP)
333 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
334
335 unlazy_fpu(target);
336 set_stopped_child_used_math(target);
337
338 if (!cpu_has_fxsr)
339 return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
340 &target->thread.i387.fsave, 0, -1);
341
342 if (pos > 0 || count < sizeof(env))
343 convert_from_fxsr(&env, target);
344
345 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
346 if (!ret)
347 convert_to_fxsr(target, &env);
348
349 return ret;
350}
351
352/*
353 * Signal frame handlers.
354 */
355
356static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
357{
358 struct task_struct *tsk = current;
359
360 unlazy_fpu(tsk);
361 tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
362 if (__copy_to_user(buf, &tsk->thread.i387.fsave,
363 sizeof(struct i387_fsave_struct)))
364 return -1;
365 return 1;
366}
367
368static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
369{
370 struct task_struct *tsk = current;
371 struct user_i387_ia32_struct env;
372 int err = 0;
373
374 unlazy_fpu(tsk);
375
376 convert_from_fxsr(&env, tsk);
377 if (__copy_to_user(buf, &env, sizeof(env)))
378 return -1;
379
380 err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
381 err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
382 if (err)
383 return -1;
384
385 if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
386 sizeof(struct i387_fxsave_struct)))
387 return -1;
388 return 1;
389}
390
391int save_i387_ia32(struct _fpstate_ia32 __user *buf)
392{
393 if (!used_math())
394 return 0;
395
396 /* This will cause a "finit" to be triggered by the next
397 * attempted FPU operation by the 'current' process.
398 */
399 clear_used_math();
400
401 if (HAVE_HWFP) {
402 if (cpu_has_fxsr) {
403 return save_i387_fxsave(buf);
404 } else {
405 return save_i387_fsave(buf);
406 }
407 } else {
408 return fpregs_soft_get(current, NULL,
409 0, sizeof(struct user_i387_ia32_struct),
410 NULL, buf) ? -1 : 1;
411 }
412}
413
414static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
415{
416 struct task_struct *tsk = current;
417 clear_fpu(tsk);
418 return __copy_from_user(&tsk->thread.i387.fsave, buf,
419 sizeof(struct i387_fsave_struct));
420}
421
422static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
423{
424 int err;
425 struct task_struct *tsk = current;
426 struct user_i387_ia32_struct env;
427 clear_fpu(tsk);
428 err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
429 sizeof(struct i387_fxsave_struct));
430 /* mxcsr reserved bits must be masked to zero for security reasons */
431 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
432 if (err || __copy_from_user(&env, buf, sizeof(env)))
433 return 1;
434 convert_to_fxsr(tsk, &env);
435 return 0;
436}
437
438int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
439{
440 int err;
441
442 if (HAVE_HWFP) {
443 if (cpu_has_fxsr) {
444 err = restore_i387_fxsave(buf);
445 } else {
446 err = restore_i387_fsave(buf);
447 }
448 } else {
449 err = fpregs_soft_set(current, NULL,
450 0, sizeof(struct user_i387_ia32_struct),
451 NULL, buf) != 0;
452 }
453 set_used_math();
454 return err;
455}
456
457/*
458 * FPU state for core dumps.
459 * This is only used for a.out dumps now.
460 * It is declared generically using elf_fpregset_t (which is
461 * struct user_i387_struct) but is in fact only used for 32-bit
462 * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
463 */
464int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
465{
466 int fpvalid;
467 struct task_struct *tsk = current;
468
469 fpvalid = !!used_math();
470 if (fpvalid)
471 fpvalid = !fpregs_get(tsk, NULL,
472 0, sizeof(struct user_i387_ia32_struct),
473 fpu, NULL);
474
475 return fpvalid;
476}
477EXPORT_SYMBOL(dump_fpu);
478
479#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c
deleted file mode 100644
index 7d2e12f6c78..00000000000
--- a/arch/x86/kernel/i387_32.c
+++ /dev/null
@@ -1,544 +0,0 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * General FPU state handling cleanups
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9#include <linux/sched.h>
10#include <linux/module.h>
11#include <asm/processor.h>
12#include <asm/i387.h>
13#include <asm/math_emu.h>
14#include <asm/sigcontext.h>
15#include <asm/user.h>
16#include <asm/ptrace.h>
17#include <asm/uaccess.h>
18
19#ifdef CONFIG_MATH_EMULATION
20#define HAVE_HWFP (boot_cpu_data.hard_math)
21#else
22#define HAVE_HWFP 1
23#endif
24
25static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
26
27void mxcsr_feature_mask_init(void)
28{
29 unsigned long mask = 0;
30 clts();
31 if (cpu_has_fxsr) {
32 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
33 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
34 mask = current->thread.i387.fxsave.mxcsr_mask;
35 if (mask == 0) mask = 0x0000ffbf;
36 }
37 mxcsr_feature_mask &= mask;
38 stts();
39}
40
41/*
42 * The _current_ task is using the FPU for the first time
43 * so initialize it and set the mxcsr to its default
44 * value at reset if we support XMM instructions and then
 45 * remember that the current task has used the FPU.
46 */
47void init_fpu(struct task_struct *tsk)
48{
49 if (cpu_has_fxsr) {
50 memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
51 tsk->thread.i387.fxsave.cwd = 0x37f;
52 if (cpu_has_xmm)
53 tsk->thread.i387.fxsave.mxcsr = 0x1f80;
54 } else {
55 memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
56 tsk->thread.i387.fsave.cwd = 0xffff037fu;
57 tsk->thread.i387.fsave.swd = 0xffff0000u;
58 tsk->thread.i387.fsave.twd = 0xffffffffu;
59 tsk->thread.i387.fsave.fos = 0xffff0000u;
60 }
61 /* only the device not available exception or ptrace can call init_fpu */
62 set_stopped_child_used_math(tsk);
63}
64
65/*
66 * FPU lazy state save handling.
67 */
68
69void kernel_fpu_begin(void)
70{
71 struct thread_info *thread = current_thread_info();
72
73 preempt_disable();
74 if (thread->status & TS_USEDFPU) {
75 __save_init_fpu(thread->task);
76 return;
77 }
78 clts();
79}
80EXPORT_SYMBOL_GPL(kernel_fpu_begin);
81
82/*
83 * FPU tag word conversions.
84 */
85
86static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
87{
88 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
89
90 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
91 tmp = ~twd;
92 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
93 /* and move the valid bits to the lower byte. */
94 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
95 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
96 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
97 return tmp;
98}
99
100static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
101{
102 struct _fpxreg *st = NULL;
103 unsigned long tos = (fxsave->swd >> 11) & 7;
104 unsigned long twd = (unsigned long) fxsave->twd;
105 unsigned long tag;
106 unsigned long ret = 0xffff0000u;
107 int i;
108
109#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
110
111 for ( i = 0 ; i < 8 ; i++ ) {
112 if ( twd & 0x1 ) {
113 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
114
115 switch ( st->exponent & 0x7fff ) {
116 case 0x7fff:
117 tag = 2; /* Special */
118 break;
119 case 0x0000:
120 if ( !st->significand[0] &&
121 !st->significand[1] &&
122 !st->significand[2] &&
123 !st->significand[3] ) {
124 tag = 1; /* Zero */
125 } else {
126 tag = 2; /* Special */
127 }
128 break;
129 default:
130 if ( st->significand[3] & 0x8000 ) {
131 tag = 0; /* Valid */
132 } else {
133 tag = 2; /* Special */
134 }
135 break;
136 }
137 } else {
138 tag = 3; /* Empty */
139 }
140 ret |= (tag << (2 * i));
141 twd = twd >> 1;
142 }
143 return ret;
144}
145
146/*
147 * FPU state interaction.
148 */
149
150unsigned short get_fpu_cwd( struct task_struct *tsk )
151{
152 if ( cpu_has_fxsr ) {
153 return tsk->thread.i387.fxsave.cwd;
154 } else {
155 return (unsigned short)tsk->thread.i387.fsave.cwd;
156 }
157}
158
159unsigned short get_fpu_swd( struct task_struct *tsk )
160{
161 if ( cpu_has_fxsr ) {
162 return tsk->thread.i387.fxsave.swd;
163 } else {
164 return (unsigned short)tsk->thread.i387.fsave.swd;
165 }
166}
167
168#if 0
169unsigned short get_fpu_twd( struct task_struct *tsk )
170{
171 if ( cpu_has_fxsr ) {
172 return tsk->thread.i387.fxsave.twd;
173 } else {
174 return (unsigned short)tsk->thread.i387.fsave.twd;
175 }
176}
177#endif /* 0 */
178
179unsigned short get_fpu_mxcsr( struct task_struct *tsk )
180{
181 if ( cpu_has_xmm ) {
182 return tsk->thread.i387.fxsave.mxcsr;
183 } else {
184 return 0x1f80;
185 }
186}
187
188#if 0
189
190void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
191{
192 if ( cpu_has_fxsr ) {
193 tsk->thread.i387.fxsave.cwd = cwd;
194 } else {
195 tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
196 }
197}
198
199void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
200{
201 if ( cpu_has_fxsr ) {
202 tsk->thread.i387.fxsave.swd = swd;
203 } else {
204 tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
205 }
206}
207
208void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
209{
210 if ( cpu_has_fxsr ) {
211 tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
212 } else {
213 tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
214 }
215}
216
217#endif /* 0 */
218
219/*
220 * FXSR floating point environment conversions.
221 */
222
223static int convert_fxsr_to_user( struct _fpstate __user *buf,
224 struct i387_fxsave_struct *fxsave )
225{
226 unsigned long env[7];
227 struct _fpreg __user *to;
228 struct _fpxreg *from;
229 int i;
230
231 env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
232 env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
233 env[2] = twd_fxsr_to_i387(fxsave);
234 env[3] = fxsave->fip;
235 env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
236 env[5] = fxsave->foo;
237 env[6] = fxsave->fos;
238
239 if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
240 return 1;
241
242 to = &buf->_st[0];
243 from = (struct _fpxreg *) &fxsave->st_space[0];
244 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
245 unsigned long __user *t = (unsigned long __user *)to;
246 unsigned long *f = (unsigned long *)from;
247
248 if (__put_user(*f, t) ||
249 __put_user(*(f + 1), t + 1) ||
250 __put_user(from->exponent, &to->exponent))
251 return 1;
252 }
253 return 0;
254}
255
256static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
257 struct _fpstate __user *buf )
258{
259 unsigned long env[7];
260 struct _fpxreg *to;
261 struct _fpreg __user *from;
262 int i;
263
264 if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
265 return 1;
266
267 fxsave->cwd = (unsigned short)(env[0] & 0xffff);
268 fxsave->swd = (unsigned short)(env[1] & 0xffff);
269 fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
270 fxsave->fip = env[3];
271 fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
272 fxsave->fcs = (env[4] & 0xffff);
273 fxsave->foo = env[5];
274 fxsave->fos = env[6];
275
276 to = (struct _fpxreg *) &fxsave->st_space[0];
277 from = &buf->_st[0];
278 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
279 unsigned long *t = (unsigned long *)to;
280 unsigned long __user *f = (unsigned long __user *)from;
281
282 if (__get_user(*t, f) ||
283 __get_user(*(t + 1), f + 1) ||
284 __get_user(to->exponent, &from->exponent))
285 return 1;
286 }
287 return 0;
288}
289
290/*
291 * Signal frame handlers.
292 */
293
294static inline int save_i387_fsave( struct _fpstate __user *buf )
295{
296 struct task_struct *tsk = current;
297
298 unlazy_fpu( tsk );
299 tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
300 if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
301 sizeof(struct i387_fsave_struct) ) )
302 return -1;
303 return 1;
304}
305
306static int save_i387_fxsave( struct _fpstate __user *buf )
307{
308 struct task_struct *tsk = current;
309 int err = 0;
310
311 unlazy_fpu( tsk );
312
313 if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
314 return -1;
315
316 err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
317 err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
318 if ( err )
319 return -1;
320
321 if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
322 sizeof(struct i387_fxsave_struct) ) )
323 return -1;
324 return 1;
325}
326
327int save_i387( struct _fpstate __user *buf )
328{
329 if ( !used_math() )
330 return 0;
331
332 /* This will cause a "finit" to be triggered by the next
333 * attempted FPU operation by the 'current' process.
334 */
335 clear_used_math();
336
337 if ( HAVE_HWFP ) {
338 if ( cpu_has_fxsr ) {
339 return save_i387_fxsave( buf );
340 } else {
341 return save_i387_fsave( buf );
342 }
343 } else {
344 return save_i387_soft( &current->thread.i387.soft, buf );
345 }
346}
347
348static inline int restore_i387_fsave( struct _fpstate __user *buf )
349{
350 struct task_struct *tsk = current;
351 clear_fpu( tsk );
352 return __copy_from_user( &tsk->thread.i387.fsave, buf,
353 sizeof(struct i387_fsave_struct) );
354}
355
356static int restore_i387_fxsave( struct _fpstate __user *buf )
357{
358 int err;
359 struct task_struct *tsk = current;
360 clear_fpu( tsk );
361 err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
362 sizeof(struct i387_fxsave_struct) );
363 /* mxcsr reserved bits must be masked to zero for security reasons */
364 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
365 return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
366}
367
368int restore_i387( struct _fpstate __user *buf )
369{
370 int err;
371
372 if ( HAVE_HWFP ) {
373 if ( cpu_has_fxsr ) {
374 err = restore_i387_fxsave( buf );
375 } else {
376 err = restore_i387_fsave( buf );
377 }
378 } else {
379 err = restore_i387_soft( &current->thread.i387.soft, buf );
380 }
381 set_used_math();
382 return err;
383}
384
385/*
386 * ptrace request handlers.
387 */
388
389static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
390 struct task_struct *tsk )
391{
392 return __copy_to_user( buf, &tsk->thread.i387.fsave,
393 sizeof(struct user_i387_struct) );
394}
395
396static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
397 struct task_struct *tsk )
398{
399 return convert_fxsr_to_user( (struct _fpstate __user *)buf,
400 &tsk->thread.i387.fxsave );
401}
402
403int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
404{
405 if ( HAVE_HWFP ) {
406 if ( cpu_has_fxsr ) {
407 return get_fpregs_fxsave( buf, tsk );
408 } else {
409 return get_fpregs_fsave( buf, tsk );
410 }
411 } else {
412 return save_i387_soft( &tsk->thread.i387.soft,
413 (struct _fpstate __user *)buf );
414 }
415}
416
417static inline int set_fpregs_fsave( struct task_struct *tsk,
418 struct user_i387_struct __user *buf )
419{
420 return __copy_from_user( &tsk->thread.i387.fsave, buf,
421 sizeof(struct user_i387_struct) );
422}
423
424static inline int set_fpregs_fxsave( struct task_struct *tsk,
425 struct user_i387_struct __user *buf )
426{
427 return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
428 (struct _fpstate __user *)buf );
429}
430
431int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
432{
433 if ( HAVE_HWFP ) {
434 if ( cpu_has_fxsr ) {
435 return set_fpregs_fxsave( tsk, buf );
436 } else {
437 return set_fpregs_fsave( tsk, buf );
438 }
439 } else {
440 return restore_i387_soft( &tsk->thread.i387.soft,
441 (struct _fpstate __user *)buf );
442 }
443}
444
445int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
446{
447 if ( cpu_has_fxsr ) {
448 if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
449 sizeof(struct user_fxsr_struct) ))
450 return -EFAULT;
451 return 0;
452 } else {
453 return -EIO;
454 }
455}
456
457int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
458{
459 int ret = 0;
460
461 if ( cpu_has_fxsr ) {
462 if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
463 sizeof(struct user_fxsr_struct) ))
464 ret = -EFAULT;
465 /* mxcsr reserved bits must be masked to zero for security reasons */
466 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
467 } else {
468 ret = -EIO;
469 }
470 return ret;
471}
472
473/*
474 * FPU state for core dumps.
475 */
476
477static inline void copy_fpu_fsave( struct task_struct *tsk,
478 struct user_i387_struct *fpu )
479{
480 memcpy( fpu, &tsk->thread.i387.fsave,
481 sizeof(struct user_i387_struct) );
482}
483
484static inline void copy_fpu_fxsave( struct task_struct *tsk,
485 struct user_i387_struct *fpu )
486{
487 unsigned short *to;
488 unsigned short *from;
489 int i;
490
491 memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
492
493 to = (unsigned short *)&fpu->st_space[0];
494 from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
495 for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
496 memcpy( to, from, 5 * sizeof(unsigned short) );
497 }
498}
499
500int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
501{
502 int fpvalid;
503 struct task_struct *tsk = current;
504
505 fpvalid = !!used_math();
506 if ( fpvalid ) {
507 unlazy_fpu( tsk );
508 if ( cpu_has_fxsr ) {
509 copy_fpu_fxsave( tsk, fpu );
510 } else {
511 copy_fpu_fsave( tsk, fpu );
512 }
513 }
514
515 return fpvalid;
516}
517EXPORT_SYMBOL(dump_fpu);
518
519int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
520{
521 int fpvalid = !!tsk_used_math(tsk);
522
523 if (fpvalid) {
524 if (tsk == current)
525 unlazy_fpu(tsk);
526 if (cpu_has_fxsr)
527 copy_fpu_fxsave(tsk, fpu);
528 else
529 copy_fpu_fsave(tsk, fpu);
530 }
531 return fpvalid;
532}
533
534int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
535{
536 int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
537
538 if (fpvalid) {
539 if (tsk == current)
540 unlazy_fpu(tsk);
541 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
542 }
543 return fpvalid;
544}
diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c
deleted file mode 100644
index bfaff28fb13..00000000000
--- a/arch/x86/kernel/i387_64.c
+++ /dev/null
@@ -1,150 +0,0 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 * Copyright (C) 2002 Andi Kleen, SuSE Labs
4 *
5 * Pentium III FXSR, SSE support
6 * General FPU state handling cleanups
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * x86-64 rework 2002 Andi Kleen.
10 * Does direct fxsave in and out of user space now for signal handlers.
 11 * All the FSAVE<->FXSAVE conversion code has been moved to the 32-bit emulation;
 12 * 64-bit user space sees an FXSAVE frame directly.
13 */
14
15#include <linux/sched.h>
16#include <linux/init.h>
17#include <asm/processor.h>
18#include <asm/i387.h>
19#include <asm/sigcontext.h>
20#include <asm/user.h>
21#include <asm/ptrace.h>
22#include <asm/uaccess.h>
23
24unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff;
25
26void mxcsr_feature_mask_init(void)
27{
28 unsigned int mask;
29 clts();
30 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
31 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
32 mask = current->thread.i387.fxsave.mxcsr_mask;
33 if (mask == 0) mask = 0x0000ffbf;
34 mxcsr_feature_mask &= mask;
35 stts();
36}
37
38/*
39 * Called at bootup to set up the initial FPU state that is later cloned
40 * into all processes.
41 */
42void __cpuinit fpu_init(void)
43{
44 unsigned long oldcr0 = read_cr0();
45 extern void __bad_fxsave_alignment(void);
46
47 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
48 __bad_fxsave_alignment();
49 set_in_cr4(X86_CR4_OSFXSR);
50 set_in_cr4(X86_CR4_OSXMMEXCPT);
51
52 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
53
54 mxcsr_feature_mask_init();
55 /* clean state in init */
56 current_thread_info()->status = 0;
57 clear_used_math();
58}
59
60void init_fpu(struct task_struct *child)
61{
62 if (tsk_used_math(child)) {
63 if (child == current)
64 unlazy_fpu(child);
65 return;
66 }
67 memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
68 child->thread.i387.fxsave.cwd = 0x37f;
69 child->thread.i387.fxsave.mxcsr = 0x1f80;
70 /* only the device not available exception or ptrace can call init_fpu */
71 set_stopped_child_used_math(child);
72}
73
74/*
75 * Signal frame handlers.
76 */
77
78int save_i387(struct _fpstate __user *buf)
79{
80 struct task_struct *tsk = current;
81 int err = 0;
82
83 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
84 sizeof(tsk->thread.i387.fxsave));
85
86 if ((unsigned long)buf % 16)
87 printk("save_i387: bad fpstate %p\n",buf);
88
89 if (!used_math())
90 return 0;
91 clear_used_math(); /* trigger finit */
92 if (task_thread_info(tsk)->status & TS_USEDFPU) {
93 err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
94 if (err) return err;
95 task_thread_info(tsk)->status &= ~TS_USEDFPU;
96 stts();
97 } else {
98 if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
99 sizeof(struct i387_fxsave_struct)))
100 return -1;
101 }
102 return 1;
103}
104
105/*
106 * ptrace request handlers.
107 */
108
109int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
110{
111 init_fpu(tsk);
112 return __copy_to_user(buf, &tsk->thread.i387.fxsave,
113 sizeof(struct user_i387_struct)) ? -EFAULT : 0;
114}
115
116int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
117{
118 if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
119 sizeof(struct user_i387_struct)))
120 return -EFAULT;
121 return 0;
122}
123
124/*
125 * FPU state for core dumps.
126 */
127
128int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
129{
130 struct task_struct *tsk = current;
131
132 if (!used_math())
133 return 0;
134
135 unlazy_fpu(tsk);
136 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
137 return 1;
138}
139
140int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
141{
142 int fpvalid = !!tsk_used_math(tsk);
143
144 if (fpvalid) {
145 if (tsk == current)
146 unlazy_fpu(tsk);
147 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
148}
149 return fpvalid;
150}
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index 29313832df0..dbd6c1d1b63 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -51,7 +51,7 @@ static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
51} 51}
52 52
53static struct sysdev_class i8237_sysdev_class = { 53static struct sysdev_class i8237_sysdev_class = {
54 set_kset_name("i8237"), 54 .name = "i8237",
55 .suspend = i8237A_suspend, 55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume, 56 .resume = i8237A_resume,
57}; 57};
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index a42c8074532..ef62b07b2b4 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -13,10 +13,17 @@
13#include <asm/delay.h> 13#include <asm/delay.h>
14#include <asm/i8253.h> 14#include <asm/i8253.h>
15#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/hpet.h>
16 17
17DEFINE_SPINLOCK(i8253_lock); 18DEFINE_SPINLOCK(i8253_lock);
18EXPORT_SYMBOL(i8253_lock); 19EXPORT_SYMBOL(i8253_lock);
19 20
21#ifdef CONFIG_X86_32
22static void pit_disable_clocksource(void);
23#else
24static inline void pit_disable_clocksource(void) { }
25#endif
26
20/* 27/*
21 * HPET replaces the PIT, when enabled. So we need to know, which of 28 * HPET replaces the PIT, when enabled. So we need to know, which of
22 * the two timers is used 29 * the two timers is used
@@ -31,38 +38,38 @@ struct clock_event_device *global_clock_event;
31static void init_pit_timer(enum clock_event_mode mode, 38static void init_pit_timer(enum clock_event_mode mode,
32 struct clock_event_device *evt) 39 struct clock_event_device *evt)
33{ 40{
34 unsigned long flags; 41 spin_lock(&i8253_lock);
35
36 spin_lock_irqsave(&i8253_lock, flags);
37 42
38 switch(mode) { 43 switch(mode) {
39 case CLOCK_EVT_MODE_PERIODIC: 44 case CLOCK_EVT_MODE_PERIODIC:
40 /* binary, mode 2, LSB/MSB, ch 0 */ 45 /* binary, mode 2, LSB/MSB, ch 0 */
41 outb_p(0x34, PIT_MODE); 46 outb_pit(0x34, PIT_MODE);
42 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ 47 outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
43 outb(LATCH >> 8 , PIT_CH0); /* MSB */ 48 outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
44 break; 49 break;
45 50
46 case CLOCK_EVT_MODE_SHUTDOWN: 51 case CLOCK_EVT_MODE_SHUTDOWN:
47 case CLOCK_EVT_MODE_UNUSED: 52 case CLOCK_EVT_MODE_UNUSED:
48 if (evt->mode == CLOCK_EVT_MODE_PERIODIC || 53 if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
49 evt->mode == CLOCK_EVT_MODE_ONESHOT) { 54 evt->mode == CLOCK_EVT_MODE_ONESHOT) {
50 outb_p(0x30, PIT_MODE); 55 outb_pit(0x30, PIT_MODE);
51 outb_p(0, PIT_CH0); 56 outb_pit(0, PIT_CH0);
52 outb_p(0, PIT_CH0); 57 outb_pit(0, PIT_CH0);
53 } 58 }
59 pit_disable_clocksource();
54 break; 60 break;
55 61
56 case CLOCK_EVT_MODE_ONESHOT: 62 case CLOCK_EVT_MODE_ONESHOT:
57 /* One shot setup */ 63 /* One shot setup */
58 outb_p(0x38, PIT_MODE); 64 pit_disable_clocksource();
65 outb_pit(0x38, PIT_MODE);
59 break; 66 break;
60 67
61 case CLOCK_EVT_MODE_RESUME: 68 case CLOCK_EVT_MODE_RESUME:
62 /* Nothing to do here */ 69 /* Nothing to do here */
63 break; 70 break;
64 } 71 }
65 spin_unlock_irqrestore(&i8253_lock, flags); 72 spin_unlock(&i8253_lock);
66} 73}
67 74
68/* 75/*
@@ -72,12 +79,10 @@ static void init_pit_timer(enum clock_event_mode mode,
72 */ 79 */
73static int pit_next_event(unsigned long delta, struct clock_event_device *evt) 80static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
74{ 81{
75 unsigned long flags; 82 spin_lock(&i8253_lock);
76 83 outb_pit(delta & 0xff , PIT_CH0); /* LSB */
77 spin_lock_irqsave(&i8253_lock, flags); 84 outb_pit(delta >> 8 , PIT_CH0); /* MSB */
78 outb_p(delta & 0xff , PIT_CH0); /* LSB */ 85 spin_unlock(&i8253_lock);
79 outb(delta >> 8 , PIT_CH0); /* MSB */
80 spin_unlock_irqrestore(&i8253_lock, flags);
81 86
82 return 0; 87 return 0;
83} 88}
@@ -148,15 +153,15 @@ static cycle_t pit_read(void)
148 * count), it cannot be newer. 153 * count), it cannot be newer.
149 */ 154 */
150 jifs = jiffies; 155 jifs = jiffies;
151 outb_p(0x00, PIT_MODE); /* latch the count ASAP */ 156 outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
152 count = inb_p(PIT_CH0); /* read the latched count */ 157 count = inb_pit(PIT_CH0); /* read the latched count */
153 count |= inb_p(PIT_CH0) << 8; 158 count |= inb_pit(PIT_CH0) << 8;
154 159
155 /* VIA686a test code... reset the latch if count > max + 1 */ 160 /* VIA686a test code... reset the latch if count > max + 1 */
156 if (count > LATCH) { 161 if (count > LATCH) {
157 outb_p(0x34, PIT_MODE); 162 outb_pit(0x34, PIT_MODE);
158 outb_p(LATCH & 0xff, PIT_CH0); 163 outb_pit(LATCH & 0xff, PIT_CH0);
159 outb(LATCH >> 8, PIT_CH0); 164 outb_pit(LATCH >> 8, PIT_CH0);
160 count = LATCH - 1; 165 count = LATCH - 1;
161 } 166 }
162 167
@@ -195,9 +200,28 @@ static struct clocksource clocksource_pit = {
195 .shift = 20, 200 .shift = 20,
196}; 201};
197 202
203static void pit_disable_clocksource(void)
204{
205 /*
206 * Use mult to check whether it is registered or not
207 */
208 if (clocksource_pit.mult) {
209 clocksource_unregister(&clocksource_pit);
210 clocksource_pit.mult = 0;
211 }
212}
213
198static int __init init_pit_clocksource(void) 214static int __init init_pit_clocksource(void)
199{ 215{
200 if (num_possible_cpus() > 1) /* PIT does not scale! */ 216 /*
217 * Several reasons not to register PIT as a clocksource:
218 *
219 * - On SMP PIT does not scale due to i8253_lock
220 * - when HPET is enabled
221 * - when local APIC timer is active (PIT is switched off)
222 */
223 if (num_possible_cpus() > 1 || is_hpet_enabled() ||
224 pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
201 return 0; 225 return 0;
202 226
203 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); 227 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
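As a side note on the PIT programming in this hunk: the LATCH value written LSB-first then MSB is simply the divisor that makes channel 0 fire once per kernel tick. A stand-alone sketch, assuming the usual 1,193,182 Hz i8253 input clock and an illustrative HZ of 250, shows the same rounding-to-nearest division the kernel's LATCH macro uses:

#include <stdio.h>

#define PIT_TICK_RATE 1193182UL  /* i8253 input clock in Hz (assumed) */
#define HZ 250                   /* illustrative kernel tick rate */

int main(void)
{
    /* Divisor for one tick, rounded to the nearest integer. */
    unsigned long latch = (PIT_TICK_RATE + HZ / 2) / HZ;

    printf("LATCH = %lu (LSB %#02lx, MSB %#02lx)\n",
           latch, latch & 0xff, latch >> 8);
    printf("actual tick rate = %.3f Hz\n", (double)PIT_TICK_RATE / latch);
    return 0;
}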
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
index f634fc715c9..2d25b77102f 100644
--- a/arch/x86/kernel/i8259_32.c
+++ b/arch/x86/kernel/i8259_32.c
@@ -21,8 +21,6 @@
21#include <asm/arch_hooks.h> 21#include <asm/arch_hooks.h>
22#include <asm/i8259.h> 22#include <asm/i8259.h>
23 23
24#include <io_ports.h>
25
26/* 24/*
27 * This is the 'legacy' 8259A Programmable Interrupt Controller, 25 * This is the 'legacy' 8259A Programmable Interrupt Controller,
28 * present in the majority of PC/AT boxes. 26 * present in the majority of PC/AT boxes.
@@ -258,7 +256,7 @@ static int i8259A_shutdown(struct sys_device *dev)
258} 256}
259 257
260static struct sysdev_class i8259_sysdev_class = { 258static struct sysdev_class i8259_sysdev_class = {
261 set_kset_name("i8259"), 259 .name = "i8259",
262 .suspend = i8259A_suspend, 260 .suspend = i8259A_suspend,
263 .resume = i8259A_resume, 261 .resume = i8259A_resume,
264 .shutdown = i8259A_shutdown, 262 .shutdown = i8259A_shutdown,
@@ -291,20 +289,20 @@ void init_8259A(int auto_eoi)
291 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 289 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
292 290
293 /* 291 /*
294 * outb_p - this has to work on a wide range of PC hardware. 292 * outb_pic - this has to work on a wide range of PC hardware.
295 */ 293 */
296 outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ 294 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
297 outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ 295 outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
298 outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ 296 outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
299 if (auto_eoi) /* master does Auto EOI */ 297 if (auto_eoi) /* master does Auto EOI */
300 outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); 298 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
301 else /* master expects normal EOI */ 299 else /* master expects normal EOI */
302 outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); 300 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
303 301
304 outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ 302 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
305 outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ 303 outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
306 outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ 304 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
307 outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ 305 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
308 if (auto_eoi) 306 if (auto_eoi)
309 /* 307 /*
310 * In AEOI mode we just have to mask the interrupt 308 * In AEOI mode we just have to mask the interrupt
@@ -341,7 +339,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
341 outb(0,0xF0); 339 outb(0,0xF0);
342 if (ignore_fpu_irq || !boot_cpu_data.hard_math) 340 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
343 return IRQ_NONE; 341 return IRQ_NONE;
344 math_error((void __user *)get_irq_regs()->eip); 342 math_error((void __user *)get_irq_regs()->ip);
345 return IRQ_HANDLED; 343 return IRQ_HANDLED;
346} 344}
347 345
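As a quick reference for the ICW sequence in the hunk above, here is a stand-alone sketch (the vector bases 0x20/0x28 are the 32-bit kernel's choice; the table is illustrative, not kernel code) listing what init_8259A(0) ends up writing to the master and slave controllers:

#include <stdio.h>

struct icw { const char *name; unsigned char master, slave; };

int main(void)
{
    /* Initialization words sent to the 8259A pair, mirroring init_8259A(0). */
    struct icw seq[] = {
        { "ICW1 (edge, cascade, need ICW4)", 0x11, 0x11 },
        { "ICW2 (vector base)",              0x20, 0x28 },
        { "ICW3 (cascade wiring)",           0x04, 0x02 },
        { "ICW4 (8086 mode, normal EOI)",    0x01, 0x01 },
    };
    unsigned int i;

    for (i = 0; i < sizeof(seq) / sizeof(seq[0]); i++)
        printf("%-34s master=%#04x slave=%#04x\n",
               seq[i].name, seq[i].master, seq[i].slave);
    return 0;
}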
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
index 3f27ea0b981..fa57a156850 100644
--- a/arch/x86/kernel/i8259_64.c
+++ b/arch/x86/kernel/i8259_64.c
@@ -21,6 +21,7 @@
21#include <asm/delay.h> 21#include <asm/delay.h>
22#include <asm/desc.h> 22#include <asm/desc.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/i8259.h>
24 25
25/* 26/*
26 * Common place to define all x86 IRQ vectors 27 * Common place to define all x86 IRQ vectors
@@ -48,7 +49,7 @@
48 */ 49 */
49 50
50/* 51/*
51 * The IO-APIC gives us many more interrupt sources. Most of these 52 * The IO-APIC gives us many more interrupt sources. Most of these
52 * are unused but an SMP system is supposed to have enough memory ... 53 * are unused but an SMP system is supposed to have enough memory ...
53 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all 54 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
54 * across the spectrum, so we really want to be prepared to get all 55 * across the spectrum, so we really want to be prepared to get all
@@ -76,7 +77,7 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
76 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) 77 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
77 78
78/* for the irq vectors */ 79/* for the irq vectors */
79static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { 80static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
80 IRQLIST_16(0x2), IRQLIST_16(0x3), 81 IRQLIST_16(0x2), IRQLIST_16(0x3),
81 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), 82 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
82 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), 83 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
@@ -114,11 +115,7 @@ static struct irq_chip i8259A_chip = {
114/* 115/*
115 * This contains the irq mask for both 8259A irq controllers, 116 * This contains the irq mask for both 8259A irq controllers,
116 */ 117 */
117static unsigned int cached_irq_mask = 0xffff; 118unsigned int cached_irq_mask = 0xffff;
118
119#define __byte(x,y) (((unsigned char *)&(y))[x])
120#define cached_21 (__byte(0,cached_irq_mask))
121#define cached_A1 (__byte(1,cached_irq_mask))
122 119
123/* 120/*
124 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) 121 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
@@ -139,9 +136,9 @@ void disable_8259A_irq(unsigned int irq)
139 spin_lock_irqsave(&i8259A_lock, flags); 136 spin_lock_irqsave(&i8259A_lock, flags);
140 cached_irq_mask |= mask; 137 cached_irq_mask |= mask;
141 if (irq & 8) 138 if (irq & 8)
142 outb(cached_A1,0xA1); 139 outb(cached_slave_mask, PIC_SLAVE_IMR);
143 else 140 else
144 outb(cached_21,0x21); 141 outb(cached_master_mask, PIC_MASTER_IMR);
145 spin_unlock_irqrestore(&i8259A_lock, flags); 142 spin_unlock_irqrestore(&i8259A_lock, flags);
146} 143}
147 144
@@ -153,9 +150,9 @@ void enable_8259A_irq(unsigned int irq)
153 spin_lock_irqsave(&i8259A_lock, flags); 150 spin_lock_irqsave(&i8259A_lock, flags);
154 cached_irq_mask &= mask; 151 cached_irq_mask &= mask;
155 if (irq & 8) 152 if (irq & 8)
156 outb(cached_A1,0xA1); 153 outb(cached_slave_mask, PIC_SLAVE_IMR);
157 else 154 else
158 outb(cached_21,0x21); 155 outb(cached_master_mask, PIC_MASTER_IMR);
159 spin_unlock_irqrestore(&i8259A_lock, flags); 156 spin_unlock_irqrestore(&i8259A_lock, flags);
160} 157}
161 158
@@ -167,9 +164,9 @@ int i8259A_irq_pending(unsigned int irq)
167 164
168 spin_lock_irqsave(&i8259A_lock, flags); 165 spin_lock_irqsave(&i8259A_lock, flags);
169 if (irq < 8) 166 if (irq < 8)
170 ret = inb(0x20) & mask; 167 ret = inb(PIC_MASTER_CMD) & mask;
171 else 168 else
172 ret = inb(0xA0) & (mask >> 8); 169 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
173 spin_unlock_irqrestore(&i8259A_lock, flags); 170 spin_unlock_irqrestore(&i8259A_lock, flags);
174 171
175 return ret; 172 return ret;
@@ -196,14 +193,14 @@ static inline int i8259A_irq_real(unsigned int irq)
196 int irqmask = 1<<irq; 193 int irqmask = 1<<irq;
197 194
198 if (irq < 8) { 195 if (irq < 8) {
199 outb(0x0B,0x20); /* ISR register */ 196 outb(0x0B,PIC_MASTER_CMD); /* ISR register */
200 value = inb(0x20) & irqmask; 197 value = inb(PIC_MASTER_CMD) & irqmask;
201 outb(0x0A,0x20); /* back to the IRR register */ 198 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
202 return value; 199 return value;
203 } 200 }
204 outb(0x0B,0xA0); /* ISR register */ 201 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
205 value = inb(0xA0) & (irqmask >> 8); 202 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
206 outb(0x0A,0xA0); /* back to the IRR register */ 203 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
207 return value; 204 return value;
208} 205}
209 206
@@ -240,14 +237,17 @@ static void mask_and_ack_8259A(unsigned int irq)
240 237
241handle_real_irq: 238handle_real_irq:
242 if (irq & 8) { 239 if (irq & 8) {
243 inb(0xA1); /* DUMMY - (do we need this?) */ 240 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
244 outb(cached_A1,0xA1); 241 outb(cached_slave_mask, PIC_SLAVE_IMR);
245 outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ 242 /* 'Specific EOI' to slave */
246 outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ 243 outb(0x60+(irq&7),PIC_SLAVE_CMD);
244 /* 'Specific EOI' to master-IRQ2 */
245 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
247 } else { 246 } else {
248 inb(0x21); /* DUMMY - (do we need this?) */ 247 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
249 outb(cached_21,0x21); 248 outb(cached_master_mask, PIC_MASTER_IMR);
250 outb(0x60+irq,0x20); /* 'Specific EOI' to master */ 249 /* 'Specific EOI' to master */
250 outb(0x60+irq,PIC_MASTER_CMD);
251 } 251 }
252 spin_unlock_irqrestore(&i8259A_lock, flags); 252 spin_unlock_irqrestore(&i8259A_lock, flags);
253 return; 253 return;
@@ -270,7 +270,8 @@ spurious_8259A_irq:
270 * lets ACK and report it. [once per IRQ] 270 * lets ACK and report it. [once per IRQ]
271 */ 271 */
272 if (!(spurious_irq_mask & irqmask)) { 272 if (!(spurious_irq_mask & irqmask)) {
273 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); 273 printk(KERN_DEBUG
274 "spurious 8259A interrupt: IRQ%d.\n", irq);
274 spurious_irq_mask |= irqmask; 275 spurious_irq_mask |= irqmask;
275 } 276 }
276 atomic_inc(&irq_err_count); 277 atomic_inc(&irq_err_count);
@@ -283,51 +284,6 @@ spurious_8259A_irq:
283 } 284 }
284} 285}
285 286
286void init_8259A(int auto_eoi)
287{
288 unsigned long flags;
289
290 i8259A_auto_eoi = auto_eoi;
291
292 spin_lock_irqsave(&i8259A_lock, flags);
293
294 outb(0xff, 0x21); /* mask all of 8259A-1 */
295 outb(0xff, 0xA1); /* mask all of 8259A-2 */
296
297 /*
298 * outb_p - this has to work on a wide range of PC hardware.
299 */
300 outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
301 outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
302 outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
303 if (auto_eoi)
304 outb_p(0x03, 0x21); /* master does Auto EOI */
305 else
306 outb_p(0x01, 0x21); /* master expects normal EOI */
307
308 outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
309 outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
310 outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
311 outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
312 is to be investigated) */
313
314 if (auto_eoi)
315 /*
316 * in AEOI mode we just have to mask the interrupt
317 * when acking.
318 */
319 i8259A_chip.mask_ack = disable_8259A_irq;
320 else
321 i8259A_chip.mask_ack = mask_and_ack_8259A;
322
323 udelay(100); /* wait for 8259A to initialize */
324
325 outb(cached_21, 0x21); /* restore master IRQ mask */
326 outb(cached_A1, 0xA1); /* restore slave IRQ mask */
327
328 spin_unlock_irqrestore(&i8259A_lock, flags);
329}
330
331static char irq_trigger[2]; 287static char irq_trigger[2];
332/** 288/**
333 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ 289 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -364,13 +320,13 @@ static int i8259A_shutdown(struct sys_device *dev)
364 * the kernel initialization code can get it 320 * the kernel initialization code can get it
365 * out of. 321 * out of.
366 */ 322 */
367 outb(0xff, 0x21); /* mask all of 8259A-1 */ 323 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
368 outb(0xff, 0xA1); /* mask all of 8259A-1 */ 324 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
369 return 0; 325 return 0;
370} 326}
371 327
372static struct sysdev_class i8259_sysdev_class = { 328static struct sysdev_class i8259_sysdev_class = {
373 set_kset_name("i8259"), 329 .name = "i8259",
374 .suspend = i8259A_suspend, 330 .suspend = i8259A_suspend,
375 .resume = i8259A_resume, 331 .resume = i8259A_resume,
376 .shutdown = i8259A_shutdown, 332 .shutdown = i8259A_shutdown,
@@ -391,6 +347,58 @@ static int __init i8259A_init_sysfs(void)
391 347
392device_initcall(i8259A_init_sysfs); 348device_initcall(i8259A_init_sysfs);
393 349
350void init_8259A(int auto_eoi)
351{
352 unsigned long flags;
353
354 i8259A_auto_eoi = auto_eoi;
355
356 spin_lock_irqsave(&i8259A_lock, flags);
357
358 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
359 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
360
361 /*
362 * outb_pic - this has to work on a wide range of PC hardware.
363 */
364 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
365 /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
366 outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
367 /* 8259A-1 (the master) has a slave on IR2 */
368 outb_pic(0x04, PIC_MASTER_IMR);
369 if (auto_eoi) /* master does Auto EOI */
370 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
371 else /* master expects normal EOI */
372 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
373
374 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
375 /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
376 outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
377 /* 8259A-2 is a slave on master's IR2 */
378 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
379 /* (slave's support for AEOI in flat mode is to be investigated) */
380 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
381
382 if (auto_eoi)
383 /*
384 * In AEOI mode we just have to mask the interrupt
385 * when acking.
386 */
387 i8259A_chip.mask_ack = disable_8259A_irq;
388 else
389 i8259A_chip.mask_ack = mask_and_ack_8259A;
390
391 udelay(100); /* wait for 8259A to initialize */
392
393 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
394 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
395
396 spin_unlock_irqrestore(&i8259A_lock, flags);
397}
398
399
400
401
394/* 402/*
395 * IRQ2 is cascade interrupt to second interrupt controller 403 * IRQ2 is cascade interrupt to second interrupt controller
396 */ 404 */
@@ -448,7 +456,9 @@ void __init init_ISA_irqs (void)
448 } 456 }
449} 457}
450 458
451void __init init_IRQ(void) 459void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
460
461void __init native_init_IRQ(void)
452{ 462{
453 int i; 463 int i;
454 464
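To make the cached_irq_mask handling above concrete, a small stand-alone sketch (the function names are illustrative stand-ins for the outb() calls, not the kernel's API) shows how one 16-bit mask maps onto the two IMR registers, with master IRQs 0-7 in the low byte and slave IRQs 8-15 in the high byte:

#include <stdio.h>

static unsigned int cached_irq_mask = 0xffff;  /* all lines masked at boot */

/* Illustrative stand-ins for outb(..., PIC_MASTER_IMR/PIC_SLAVE_IMR). */
static void write_master_imr(unsigned char v) { printf("master IMR <- %#04x\n", v); }
static void write_slave_imr(unsigned char v)  { printf("slave  IMR <- %#04x\n", v); }

static void enable_irq(unsigned int irq)
{
    cached_irq_mask &= ~(1u << irq);
    if (irq & 8)
        write_slave_imr(cached_irq_mask >> 8);
    else
        write_master_imr(cached_irq_mask & 0xff);
}

int main(void)
{
    enable_irq(0);   /* timer: clears bit 0, master IMR becomes 0xfe */
    enable_irq(2);   /* cascade line to the slave PIC */
    enable_irq(12);  /* PS/2 mouse: clears bit 12, rewrites the slave IMR */
    return 0;
}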
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 468c9c43784..5b3ce793436 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -15,7 +15,6 @@ static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm); 17struct mm_struct init_mm = INIT_MM(init_mm);
18EXPORT_SYMBOL(init_mm);
19 18
20/* 19/*
21 * Initial thread structure. 20 * Initial thread structure.
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index c3a565bba10..4ca548632c8 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -35,6 +35,7 @@
35#include <linux/htirq.h> 35#include <linux/htirq.h>
36#include <linux/freezer.h> 36#include <linux/freezer.h>
37#include <linux/kthread.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */
38 39
39#include <asm/io.h> 40#include <asm/io.h>
40#include <asm/smp.h> 41#include <asm/smp.h>
@@ -48,8 +49,6 @@
48#include <mach_apic.h> 49#include <mach_apic.h>
49#include <mach_apicdef.h> 50#include <mach_apicdef.h>
50 51
51#include "io_ports.h"
52
53int (*ioapic_renumber_irq)(int ioapic, int irq); 52int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count; 53atomic_t irq_mis_count;
55 54
@@ -351,7 +350,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
351# include <asm/processor.h> /* kernel_thread() */ 350# include <asm/processor.h> /* kernel_thread() */
352# include <linux/kernel_stat.h> /* kstat */ 351# include <linux/kernel_stat.h> /* kstat */
353# include <linux/slab.h> /* kmalloc() */ 352# include <linux/slab.h> /* kmalloc() */
354# include <linux/timer.h> /* time_after() */ 353# include <linux/timer.h>
355 354
356#define IRQBALANCE_CHECK_ARCH -999 355#define IRQBALANCE_CHECK_ARCH -999
357#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) 356#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
@@ -727,7 +726,7 @@ late_initcall(balanced_irq_init);
727#endif /* CONFIG_SMP */ 726#endif /* CONFIG_SMP */
728 727
729#ifndef CONFIG_SMP 728#ifndef CONFIG_SMP
730void fastcall send_IPI_self(int vector) 729void send_IPI_self(int vector)
731{ 730{
732 unsigned int cfg; 731 unsigned int cfg;
733 732
@@ -1900,7 +1899,7 @@ static int __init timer_irq_works(void)
1900 * might have cached one ExtINT interrupt. Finally, at 1899 * might have cached one ExtINT interrupt. Finally, at
1901 * least one tick may be lost due to delays. 1900 * least one tick may be lost due to delays.
1902 */ 1901 */
1903 if (jiffies - t1 > 4) 1902 if (time_after(jiffies, t1 + 4))
1904 return 1; 1903 return 1;
1905 1904
1906 return 0; 1905 return 0;
@@ -2080,7 +2079,7 @@ static struct irq_chip lapic_chip __read_mostly = {
2080 .eoi = ack_apic, 2079 .eoi = ack_apic,
2081}; 2080};
2082 2081
2083static void setup_nmi (void) 2082static void __init setup_nmi(void)
2084{ 2083{
2085 /* 2084 /*
2086 * Dirty trick to enable the NMI watchdog ... 2085 * Dirty trick to enable the NMI watchdog ...
@@ -2093,7 +2092,7 @@ static void setup_nmi (void)
2093 */ 2092 */
2094 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); 2093 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2095 2094
2096 on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); 2095 enable_NMI_through_LVT0();
2097 2096
2098 apic_printk(APIC_VERBOSE, " done.\n"); 2097 apic_printk(APIC_VERBOSE, " done.\n");
2099} 2098}
@@ -2169,14 +2168,10 @@ static inline void __init check_timer(void)
2169{ 2168{
2170 int apic1, pin1, apic2, pin2; 2169 int apic1, pin1, apic2, pin2;
2171 int vector; 2170 int vector;
2172 unsigned int ver;
2173 unsigned long flags; 2171 unsigned long flags;
2174 2172
2175 local_irq_save(flags); 2173 local_irq_save(flags);
2176 2174
2177 ver = apic_read(APIC_LVR);
2178 ver = GET_APIC_VERSION(ver);
2179
2180 /* 2175 /*
2181 * get/set the timer IRQ vector: 2176 * get/set the timer IRQ vector:
2182 */ 2177 */
@@ -2189,15 +2184,11 @@ static inline void __init check_timer(void)
2189 * mode for the 8259A whenever interrupts are routed 2184 * mode for the 8259A whenever interrupts are routed
2190 * through I/O APICs. Also IRQ0 has to be enabled in 2185 * through I/O APICs. Also IRQ0 has to be enabled in
2191 * the 8259A which implies the virtual wire has to be 2186 * the 8259A which implies the virtual wire has to be
2192 * disabled in the local APIC. Finally timer interrupts 2187 * disabled in the local APIC.
2193 * need to be acknowledged manually in the 8259A for
2194 * timer_interrupt() and for the i82489DX when using
2195 * the NMI watchdog.
2196 */ 2188 */
2197 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2189 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2198 init_8259A(1); 2190 init_8259A(1);
2199 timer_ack = !cpu_has_tsc; 2191 timer_ack = 1;
2200 timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2201 if (timer_over_8254 > 0) 2192 if (timer_over_8254 > 0)
2202 enable_8259A_irq(0); 2193 enable_8259A_irq(0);
2203 2194
@@ -2409,7 +2400,7 @@ static int ioapic_resume(struct sys_device *dev)
2409} 2400}
2410 2401
2411static struct sysdev_class ioapic_sysdev_class = { 2402static struct sysdev_class ioapic_sysdev_class = {
2412 set_kset_name("ioapic"), 2403 .name = "ioapic",
2413 .suspend = ioapic_suspend, 2404 .suspend = ioapic_suspend,
2414 .resume = ioapic_resume, 2405 .resume = ioapic_resume,
2415}; 2406};
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index cbac1670c7c..1627c0d53e0 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -32,9 +32,11 @@
32#include <linux/msi.h> 32#include <linux/msi.h>
33#include <linux/htirq.h> 33#include <linux/htirq.h>
34#include <linux/dmar.h> 34#include <linux/dmar.h>
35#include <linux/jiffies.h>
35#ifdef CONFIG_ACPI 36#ifdef CONFIG_ACPI
36#include <acpi/acpi_bus.h> 37#include <acpi/acpi_bus.h>
37#endif 38#endif
39#include <linux/bootmem.h>
38 40
39#include <asm/idle.h> 41#include <asm/idle.h>
40#include <asm/io.h> 42#include <asm/io.h>
@@ -1069,7 +1071,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1069 v = apic_read(APIC_LVR); 1071 v = apic_read(APIC_LVR);
1070 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1072 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1071 ver = GET_APIC_VERSION(v); 1073 ver = GET_APIC_VERSION(v);
1072 maxlvt = get_maxlvt(); 1074 maxlvt = lapic_get_maxlvt();
1073 1075
1074 v = apic_read(APIC_TASKPRI); 1076 v = apic_read(APIC_TASKPRI);
1075 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); 1077 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
@@ -1171,7 +1173,7 @@ void __apicdebuginit print_PIC(void)
1171 1173
1172#endif /* 0 */ 1174#endif /* 0 */
1173 1175
1174static void __init enable_IO_APIC(void) 1176void __init enable_IO_APIC(void)
1175{ 1177{
1176 union IO_APIC_reg_01 reg_01; 1178 union IO_APIC_reg_01 reg_01;
1177 int i8259_apic, i8259_pin; 1179 int i8259_apic, i8259_pin;
@@ -1298,7 +1300,7 @@ static int __init timer_irq_works(void)
1298 */ 1300 */
1299 1301
1300 /* jiffies wrap? */ 1302 /* jiffies wrap? */
1301 if (jiffies - t1 > 4) 1303 if (time_after(jiffies, t1 + 4))
1302 return 1; 1304 return 1;
1303 return 0; 1305 return 0;
1304} 1306}
@@ -1411,7 +1413,7 @@ static void irq_complete_move(unsigned int irq)
1411 if (likely(!cfg->move_in_progress)) 1413 if (likely(!cfg->move_in_progress))
1412 return; 1414 return;
1413 1415
1414 vector = ~get_irq_regs()->orig_rax; 1416 vector = ~get_irq_regs()->orig_ax;
1415 me = smp_processor_id(); 1417 me = smp_processor_id();
1416 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { 1418 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1417 cpumask_t cleanup_mask; 1419 cpumask_t cleanup_mask;
@@ -1438,7 +1440,7 @@ static void ack_apic_level(unsigned int irq)
1438 int do_unmask_irq = 0; 1440 int do_unmask_irq = 0;
1439 1441
1440 irq_complete_move(irq); 1442 irq_complete_move(irq);
1441#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) 1443#ifdef CONFIG_GENERIC_PENDING_IRQ
1442 /* If we are moving the irq we need to mask it */ 1444 /* If we are moving the irq we need to mask it */
1443 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { 1445 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
1444 do_unmask_irq = 1; 1446 do_unmask_irq = 1;
@@ -1565,7 +1567,7 @@ static struct hw_interrupt_type lapic_irq_type __read_mostly = {
1565 .end = end_lapic_irq, 1567 .end = end_lapic_irq,
1566}; 1568};
1567 1569
1568static void setup_nmi (void) 1570static void __init setup_nmi(void)
1569{ 1571{
1570 /* 1572 /*
1571 * Dirty trick to enable the NMI watchdog ... 1573 * Dirty trick to enable the NMI watchdog ...
@@ -1578,7 +1580,7 @@ static void setup_nmi (void)
1578 */ 1580 */
1579 printk(KERN_INFO "activating NMI Watchdog ..."); 1581 printk(KERN_INFO "activating NMI Watchdog ...");
1580 1582
1581 enable_NMI_through_LVT0(NULL); 1583 enable_NMI_through_LVT0();
1582 1584
1583 printk(" done.\n"); 1585 printk(" done.\n");
1584} 1586}
@@ -1654,7 +1656,7 @@ static inline void unlock_ExtINT_logic(void)
1654 * 1656 *
1655 * FIXME: really need to revamp this for modern platforms only. 1657 * FIXME: really need to revamp this for modern platforms only.
1656 */ 1658 */
1657static inline void check_timer(void) 1659static inline void __init check_timer(void)
1658{ 1660{
1659 struct irq_cfg *cfg = irq_cfg + 0; 1661 struct irq_cfg *cfg = irq_cfg + 0;
1660 int apic1, pin1, apic2, pin2; 1662 int apic1, pin1, apic2, pin2;
@@ -1788,7 +1790,10 @@ __setup("no_timer_check", notimercheck);
1788 1790
1789void __init setup_IO_APIC(void) 1791void __init setup_IO_APIC(void)
1790{ 1792{
1791 enable_IO_APIC(); 1793
1794 /*
1795 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
1796 */
1792 1797
1793 if (acpi_ioapic) 1798 if (acpi_ioapic)
1794 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ 1799 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
@@ -1850,7 +1855,7 @@ static int ioapic_resume(struct sys_device *dev)
1850} 1855}
1851 1856
1852static struct sysdev_class ioapic_sysdev_class = { 1857static struct sysdev_class ioapic_sysdev_class = {
1853 set_kset_name("ioapic"), 1858 .name = "ioapic",
1854 .suspend = ioapic_suspend, 1859 .suspend = ioapic_suspend,
1855 .resume = ioapic_resume, 1860 .resume = ioapic_resume,
1856}; 1861};
@@ -2288,3 +2293,92 @@ void __init setup_ioapic_dest(void)
2288} 2293}
2289#endif 2294#endif
2290 2295
2296#define IOAPIC_RESOURCE_NAME_SIZE 11
2297
2298static struct resource *ioapic_resources;
2299
2300static struct resource * __init ioapic_setup_resources(void)
2301{
2302 unsigned long n;
2303 struct resource *res;
2304 char *mem;
2305 int i;
2306
2307 if (nr_ioapics <= 0)
2308 return NULL;
2309
2310 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
2311 n *= nr_ioapics;
2312
2313 mem = alloc_bootmem(n);
2314 res = (void *)mem;
2315
2316 if (mem != NULL) {
2317 memset(mem, 0, n);
2318 mem += sizeof(struct resource) * nr_ioapics;
2319
2320 for (i = 0; i < nr_ioapics; i++) {
2321 res[i].name = mem;
2322 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2323 sprintf(mem, "IOAPIC %u", i);
2324 mem += IOAPIC_RESOURCE_NAME_SIZE;
2325 }
2326 }
2327
2328 ioapic_resources = res;
2329
2330 return res;
2331}
2332
2333void __init ioapic_init_mappings(void)
2334{
2335 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
2336 struct resource *ioapic_res;
2337 int i;
2338
2339 ioapic_res = ioapic_setup_resources();
2340 for (i = 0; i < nr_ioapics; i++) {
2341 if (smp_found_config) {
2342 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
2343 } else {
2344 ioapic_phys = (unsigned long)
2345 alloc_bootmem_pages(PAGE_SIZE);
2346 ioapic_phys = __pa(ioapic_phys);
2347 }
2348 set_fixmap_nocache(idx, ioapic_phys);
2349 apic_printk(APIC_VERBOSE,
2350 "mapped IOAPIC to %016lx (%016lx)\n",
2351 __fix_to_virt(idx), ioapic_phys);
2352 idx++;
2353
2354 if (ioapic_res != NULL) {
2355 ioapic_res->start = ioapic_phys;
2356 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
2357 ioapic_res++;
2358 }
2359 }
2360}
2361
2362static int __init ioapic_insert_resources(void)
2363{
2364 int i;
2365 struct resource *r = ioapic_resources;
2366
2367 if (!r) {
2368 printk(KERN_ERR
 2369 "IO APIC resources could not be allocated.\n");
2370 return -1;
2371 }
2372
2373 for (i = 0; i < nr_ioapics; i++) {
2374 insert_resource(&iomem_resource, r);
2375 r++;
2376 }
2377
2378 return 0;
2379}
2380
 2381/* Insert the IO APIC resources after PCI initialization has occurred to handle
 2382 * IO APICs that are mapped in on a BAR in PCI space. */
2383late_initcall(ioapic_insert_resources);
2384
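
The new ioapic_setup_resources() above packs everything into one bootmem block:
nr_ioapics struct resource descriptors first, followed by one
IOAPIC_RESOURCE_NAME_SIZE name buffer per descriptor. The user-space sketch
below mirrors that single-allocation layout with plain calloc(); the structure
fields and sizes are illustrative, not the kernel definitions.

#include <stdio.h>
#include <stdlib.h>

struct resource { const char *name; unsigned long start, end, flags; };

#define NAME_SIZE 11	/* fits "IOAPIC 255" plus the terminating NUL */

/* One block: the descriptor array first, the name strings after it. */
static struct resource *setup_resources(unsigned int count)
{
	size_t n = (sizeof(struct resource) + NAME_SIZE) * count;
	char *mem = calloc(1, n);
	struct resource *res = (struct resource *)mem;
	unsigned int i;

	if (!mem)
		return NULL;

	mem += sizeof(struct resource) * count;	/* names follow the array */
	for (i = 0; i < count; i++) {
		res[i].name = mem;
		snprintf(mem, NAME_SIZE, "IOAPIC %u", i);
		mem += NAME_SIZE;
	}
	return res;
}

int main(void)
{
	struct resource *r = setup_resources(2);

	if (r)
		printf("%s, %s\n", r[0].name, r[1].name);	/* IOAPIC 0, IOAPIC 1 */
	return 0;
}
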
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
new file mode 100644
index 00000000000..bd49321034d
--- /dev/null
+++ b/arch/x86/kernel/io_delay.c
@@ -0,0 +1,114 @@
1/*
2 * I/O delay strategies for inb_p/outb_p
3 *
4 * Allow for a DMI based override of port 0x80, needed for certain HP laptops
5 * and possibly other systems. Also allow for the gradual elimination of
6 * outb_p/inb_p API uses.
7 */
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/delay.h>
12#include <linux/dmi.h>
13#include <asm/io.h>
14
15int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
16EXPORT_SYMBOL_GPL(io_delay_type);
17
18static int __initdata io_delay_override;
19
20/*
21 * Paravirt wants native_io_delay to be a constant.
22 */
23void native_io_delay(void)
24{
25 switch (io_delay_type) {
26 default:
27 case CONFIG_IO_DELAY_TYPE_0X80:
28 asm volatile ("outb %al, $0x80");
29 break;
30 case CONFIG_IO_DELAY_TYPE_0XED:
31 asm volatile ("outb %al, $0xed");
32 break;
33 case CONFIG_IO_DELAY_TYPE_UDELAY:
34 /*
35 * 2 usecs is an upper-bound for the outb delay but
36 * note that udelay doesn't have the bus-level
37 * side-effects that outb does, nor does udelay() have
38 * precise timings during very early bootup (the delays
39 * are shorter until calibrated):
40 */
41 udelay(2);
42 case CONFIG_IO_DELAY_TYPE_NONE:
43 break;
44 }
45}
46EXPORT_SYMBOL(native_io_delay);
47
48static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
49{
50 if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
51 printk(KERN_NOTICE "%s: using 0xed I/O delay port\n",
52 id->ident);
53 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
54 }
55
56 return 0;
57}
58
59/*
60 * Quirk table for systems that misbehave (lock up, etc.) if port
61 * 0x80 is used:
62 */
63static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
64 {
65 .callback = dmi_io_delay_0xed_port,
66 .ident = "Compaq Presario V6000",
67 .matches = {
68 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
69 DMI_MATCH(DMI_BOARD_NAME, "30B7")
70 }
71 },
72 {
73 .callback = dmi_io_delay_0xed_port,
74 .ident = "HP Pavilion dv9000z",
75 .matches = {
76 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
77 DMI_MATCH(DMI_BOARD_NAME, "30B9")
78 }
79 },
80 {
81 .callback = dmi_io_delay_0xed_port,
82 .ident = "HP Pavilion tx1000",
83 .matches = {
84 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
85 DMI_MATCH(DMI_BOARD_NAME, "30BF")
86 }
87 },
88 { }
89};
90
91void __init io_delay_init(void)
92{
93 if (!io_delay_override)
94 dmi_check_system(io_delay_0xed_port_dmi_table);
95}
96
97static int __init io_delay_param(char *s)
98{
99 if (!strcmp(s, "0x80"))
100 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
101 else if (!strcmp(s, "0xed"))
102 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
103 else if (!strcmp(s, "udelay"))
104 io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY;
105 else if (!strcmp(s, "none"))
106 io_delay_type = CONFIG_IO_DELAY_TYPE_NONE;
107 else
108 return -EINVAL;
109
110 io_delay_override = 1;
111 return 0;
112}
113
114early_param("io_delay", io_delay_param);
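
The io_delay= early parameter added above picks the strategy explicitly
(io_delay=0x80, io_delay=0xed, io_delay=udelay or io_delay=none) and, by
setting io_delay_override, suppresses the DMI quirk table. To see which board
strings the quirk table would be matched against on a given machine, the DMI
identifiers can be read back from sysfs; a minimal sketch, assuming the dmi-id
driver exposes /sys/class/dmi/id/ on the running system:

#include <stdio.h>

static void show(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);	/* value already ends in '\n' */
	if (f)
		fclose(f);
}

int main(void)
{
	/* The quirk table matches on DMI_BOARD_VENDOR and DMI_BOARD_NAME. */
	show("/sys/class/dmi/id/board_vendor");
	show("/sys/class/dmi/id/board_name");
	return 0;
}
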
diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport.c
index 4ed48dc8df1..50e5e4a31c8 100644
--- a/arch/x86/kernel/ioport_32.c
+++ b/arch/x86/kernel/ioport.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * This contains the io-permission bitmap code - written by obz, with changes 2 * This contains the io-permission bitmap code - written by obz, with changes
3 * by Linus. 3 * by Linus. 32/64 bits code unification by Miguel Botón.
4 */ 4 */
5 5
6#include <linux/sched.h> 6#include <linux/sched.h>
@@ -16,49 +16,27 @@
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17 17
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) 19static void set_bitmap(unsigned long *bitmap, unsigned int base,
20 unsigned int extent, int new_value)
20{ 21{
21 unsigned long mask; 22 unsigned int i;
22 unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
23 unsigned int low_index = base & (BITS_PER_LONG-1);
24 int length = low_index + extent;
25
26 if (low_index != 0) {
27 mask = (~0UL << low_index);
28 if (length < BITS_PER_LONG)
29 mask &= ~(~0UL << length);
30 if (new_value)
31 *bitmap_base++ |= mask;
32 else
33 *bitmap_base++ &= ~mask;
34 length -= BITS_PER_LONG;
35 }
36
37 mask = (new_value ? ~0UL : 0UL);
38 while (length >= BITS_PER_LONG) {
39 *bitmap_base++ = mask;
40 length -= BITS_PER_LONG;
41 }
42 23
43 if (length > 0) { 24 for (i = base; i < base + extent; i++) {
44 mask = ~(~0UL << length);
45 if (new_value) 25 if (new_value)
46 *bitmap_base++ |= mask; 26 __set_bit(i, bitmap);
47 else 27 else
48 *bitmap_base++ &= ~mask; 28 __clear_bit(i, bitmap);
49 } 29 }
50} 30}
51 31
52
53/* 32/*
54 * this changes the io permissions bitmap in the current task. 33 * this changes the io permissions bitmap in the current task.
55 */ 34 */
56asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) 35asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
57{ 36{
58 unsigned long i, max_long, bytes, bytes_updated;
59 struct thread_struct * t = &current->thread; 37 struct thread_struct * t = &current->thread;
60 struct tss_struct * tss; 38 struct tss_struct * tss;
61 unsigned long *bitmap; 39 unsigned int i, max_long, bytes, bytes_updated;
62 40
63 if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) 41 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
64 return -EINVAL; 42 return -EINVAL;
@@ -71,7 +49,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
71 * this is why we delay this operation until now: 49 * this is why we delay this operation until now:
72 */ 50 */
73 if (!t->io_bitmap_ptr) { 51 if (!t->io_bitmap_ptr) {
74 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 52 unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
53
75 if (!bitmap) 54 if (!bitmap)
76 return -ENOMEM; 55 return -ENOMEM;
77 56
@@ -100,11 +79,12 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
100 if (t->io_bitmap_ptr[i] != ~0UL) 79 if (t->io_bitmap_ptr[i] != ~0UL)
101 max_long = i; 80 max_long = i;
102 81
103 bytes = (max_long + 1) * sizeof(long); 82 bytes = (max_long + 1) * sizeof(unsigned long);
104 bytes_updated = max(bytes, t->io_bitmap_max); 83 bytes_updated = max(bytes, t->io_bitmap_max);
105 84
106 t->io_bitmap_max = bytes; 85 t->io_bitmap_max = bytes;
107 86
87#ifdef CONFIG_X86_32
108 /* 88 /*
109 * Sets the lazy trigger so that the next I/O operation will 89 * Sets the lazy trigger so that the next I/O operation will
110 * reload the correct bitmap. 90 * reload the correct bitmap.
@@ -113,6 +93,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
113 */ 93 */
114 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; 94 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
115 tss->io_bitmap_owner = NULL; 95 tss->io_bitmap_owner = NULL;
96#else
97 /* Update the TSS: */
98 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
99#endif
116 100
117 put_cpu(); 101 put_cpu();
118 102
@@ -124,18 +108,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
124 * beyond the 0x3ff range: to get the full 65536 ports bitmapped 108 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
125 * you'd need 8kB of bitmaps/process, which is a bit excessive. 109 * you'd need 8kB of bitmaps/process, which is a bit excessive.
126 * 110 *
127 * Here we just change the eflags value on the stack: we allow 111 * Here we just change the flags value on the stack: we allow
128 * only the super-user to do it. This depends on the stack-layout 112 * only the super-user to do it. This depends on the stack-layout
129 * on system-call entry - see also fork() and the signal handling 113 * on system-call entry - see also fork() and the signal handling
130 * code. 114 * code.
131 */ 115 */
132 116static int do_iopl(unsigned int level, struct pt_regs *regs)
133asmlinkage long sys_iopl(unsigned long unused)
134{ 117{
135 volatile struct pt_regs * regs = (struct pt_regs *) &unused; 118 unsigned int old = (regs->flags >> 12) & 3;
136 unsigned int level = regs->ebx;
137 unsigned int old = (regs->eflags >> 12) & 3;
138 struct thread_struct *t = &current->thread;
139 119
140 if (level > 3) 120 if (level > 3)
141 return -EINVAL; 121 return -EINVAL;
@@ -144,8 +124,31 @@ asmlinkage long sys_iopl(unsigned long unused)
144 if (!capable(CAP_SYS_RAWIO)) 124 if (!capable(CAP_SYS_RAWIO))
145 return -EPERM; 125 return -EPERM;
146 } 126 }
127 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
128
129 return 0;
130}
131
132#ifdef CONFIG_X86_32
133asmlinkage long sys_iopl(unsigned long regsp)
134{
135 struct pt_regs *regs = (struct pt_regs *)&regsp;
136 unsigned int level = regs->bx;
137 struct thread_struct *t = &current->thread;
138 int rc;
139
140 rc = do_iopl(level, regs);
141 if (rc < 0)
142 goto out;
143
147 t->iopl = level << 12; 144 t->iopl = level << 12;
148 regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
149 set_iopl_mask(t->iopl); 145 set_iopl_mask(t->iopl);
150 return 0; 146out:
147 return rc;
148}
149#else
150asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
151{
152 return do_iopl(level, regs);
151} 153}
154#endif
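
The unified sys_ioperm()/sys_iopl() above are reached through the usual glibc
ioperm(2)/iopl(2) wrappers. A minimal user-space sketch (needs CAP_SYS_RAWIO,
x86 only; port 0x80 is used purely because it is conventionally harmless):

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	/* Ask the kernel to set one bit in this thread's I/O bitmap. */
	if (ioperm(0x80, 1, 1)) {
		perror("ioperm");
		return 1;
	}
	outb(0x00, 0x80);	/* now allowed without raising IOPL */
	ioperm(0x80, 1, 0);	/* drop the permission again */
	return 0;
}
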
diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c
deleted file mode 100644
index 5f62fad64da..00000000000
--- a/arch/x86/kernel/ioport_64.c
+++ /dev/null
@@ -1,117 +0,0 @@
1/*
2 * This contains the io-permission bitmap code - written by obz, with changes
3 * by Linus.
4 */
5
6#include <linux/sched.h>
7#include <linux/kernel.h>
8#include <linux/capability.h>
9#include <linux/errno.h>
10#include <linux/types.h>
11#include <linux/ioport.h>
12#include <linux/smp.h>
13#include <linux/stddef.h>
14#include <linux/slab.h>
15#include <linux/thread_info.h>
16#include <linux/syscalls.h>
17
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
20{
21 int i;
22 if (new_value)
23 for (i = base; i < base + extent; i++)
24 __set_bit(i, bitmap);
25 else
26 for (i = base; i < base + extent; i++)
27 clear_bit(i, bitmap);
28}
29
30/*
31 * this changes the io permissions bitmap in the current task.
32 */
33asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
34{
35 unsigned int i, max_long, bytes, bytes_updated;
36 struct thread_struct * t = &current->thread;
37 struct tss_struct * tss;
38 unsigned long *bitmap;
39
40 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
41 return -EINVAL;
42 if (turn_on && !capable(CAP_SYS_RAWIO))
43 return -EPERM;
44
45 /*
46 * If it's the first ioperm() call in this thread's lifetime, set the
47 * IO bitmap up. ioperm() is much less timing critical than clone(),
48 * this is why we delay this operation until now:
49 */
50 if (!t->io_bitmap_ptr) {
51 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
52 if (!bitmap)
53 return -ENOMEM;
54
55 memset(bitmap, 0xff, IO_BITMAP_BYTES);
56 t->io_bitmap_ptr = bitmap;
57 set_thread_flag(TIF_IO_BITMAP);
58 }
59
60 /*
61 * do it in the per-thread copy and in the TSS ...
62 *
63 * Disable preemption via get_cpu() - we must not switch away
64 * because the ->io_bitmap_max value must match the bitmap
65 * contents:
66 */
67 tss = &per_cpu(init_tss, get_cpu());
68
69 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
70
71 /*
72 * Search for a (possibly new) maximum. This is simple and stupid,
73 * to keep it obviously correct:
74 */
75 max_long = 0;
76 for (i = 0; i < IO_BITMAP_LONGS; i++)
77 if (t->io_bitmap_ptr[i] != ~0UL)
78 max_long = i;
79
80 bytes = (max_long + 1) * sizeof(long);
81 bytes_updated = max(bytes, t->io_bitmap_max);
82
83 t->io_bitmap_max = bytes;
84
85 /* Update the TSS: */
86 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
87
88 put_cpu();
89
90 return 0;
91}
92
93/*
94 * sys_iopl has to be used when you want to access the IO ports
95 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
96 * you'd need 8kB of bitmaps/process, which is a bit excessive.
97 *
98 * Here we just change the eflags value on the stack: we allow
99 * only the super-user to do it. This depends on the stack-layout
100 * on system-call entry - see also fork() and the signal handling
101 * code.
102 */
103
104asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
105{
106 unsigned int old = (regs->eflags >> 12) & 3;
107
108 if (level > 3)
109 return -EINVAL;
110 /* Trying to gain more privileges? */
111 if (level > old) {
112 if (!capable(CAP_SYS_RAWIO))
113 return -EPERM;
114 }
115 regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
116 return 0;
117}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index d3fde94f734..cef054b09d2 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
66 * SMP cross-CPU interrupts have their own specific 66 * SMP cross-CPU interrupts have their own specific
67 * handlers). 67 * handlers).
68 */ 68 */
69fastcall unsigned int do_IRQ(struct pt_regs *regs) 69unsigned int do_IRQ(struct pt_regs *regs)
70{ 70{
71 struct pt_regs *old_regs; 71 struct pt_regs *old_regs;
72 /* high bit used in ret_from_ code */ 72 /* high bit used in ret_from_ code */
73 int irq = ~regs->orig_eax; 73 int irq = ~regs->orig_ax;
74 struct irq_desc *desc = irq_desc + irq; 74 struct irq_desc *desc = irq_desc + irq;
75#ifdef CONFIG_4KSTACKS 75#ifdef CONFIG_4KSTACKS
76 union irq_ctx *curctx, *irqctx; 76 union irq_ctx *curctx, *irqctx;
@@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
88#ifdef CONFIG_DEBUG_STACKOVERFLOW 88#ifdef CONFIG_DEBUG_STACKOVERFLOW
89 /* Debugging check for stack overflow: is there less than 1KB free? */ 89 /* Debugging check for stack overflow: is there less than 1KB free? */
90 { 90 {
91 long esp; 91 long sp;
92 92
93 __asm__ __volatile__("andl %%esp,%0" : 93 __asm__ __volatile__("andl %%esp,%0" :
94 "=r" (esp) : "0" (THREAD_SIZE - 1)); 94 "=r" (sp) : "0" (THREAD_SIZE - 1));
95 if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { 95 if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
96 printk("do_IRQ: stack overflow: %ld\n", 96 printk("do_IRQ: stack overflow: %ld\n",
97 esp - sizeof(struct thread_info)); 97 sp - sizeof(struct thread_info));
98 dump_stack(); 98 dump_stack();
99 } 99 }
100 } 100 }
@@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
112 * current stack (which is the irq stack already after all) 112 * current stack (which is the irq stack already after all)
113 */ 113 */
114 if (curctx != irqctx) { 114 if (curctx != irqctx) {
115 int arg1, arg2, ebx; 115 int arg1, arg2, bx;
116 116
117 /* build the stack frame on the IRQ stack */ 117 /* build the stack frame on the IRQ stack */
118 isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); 118 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
@@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
128 (curctx->tinfo.preempt_count & SOFTIRQ_MASK); 128 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
129 129
130 asm volatile( 130 asm volatile(
131 " xchgl %%ebx,%%esp \n" 131 " xchgl %%ebx,%%esp \n"
132 " call *%%edi \n" 132 " call *%%edi \n"
133 " movl %%ebx,%%esp \n" 133 " movl %%ebx,%%esp \n"
134 : "=a" (arg1), "=d" (arg2), "=b" (ebx) 134 : "=a" (arg1), "=d" (arg2), "=b" (bx)
135 : "0" (irq), "1" (desc), "2" (isp), 135 : "0" (irq), "1" (desc), "2" (isp),
136 "D" (desc->handle_irq) 136 "D" (desc->handle_irq)
137 : "memory", "cc" 137 : "memory", "cc"
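
The orig_eax -> orig_ax rename above is part of the "high bit used in
ret_from_ code" convention: the interrupt entry stubs store the bitwise
complement of the vector, so the saved value is negative for hardware
interrupts and do_IRQ() recovers the vector with another complement. A
stand-alone sketch of that round trip:

#include <stdio.h>

int main(void)
{
	unsigned int vector = 0x31;		/* example vector number */
	long orig_ax = ~(long)vector;		/* what the entry stub records */

	printf("stored orig_ax = %ld (negative marks a hw interrupt)\n", orig_ax);
	printf("recovered vector = 0x%x\n", (unsigned int)~orig_ax);
	return 0;
}
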
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 6b5c730d67b..3aac15466a9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,6 +20,26 @@
20 20
21atomic_t irq_err_count; 21atomic_t irq_err_count;
22 22
23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'.
 25 * Each architecture has to answer this itself.
26 */
27void ack_bad_irq(unsigned int irq)
28{
29 printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
30 /*
31 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N
33 * irq slots per priority level, and a 'hanging, unacked' IRQ
34 * holds up an irq slot - in excessive cases (when multiple
35 * unexpected vectors occur) that might lock up the APIC
36 * completely.
37 * But don't ack when the APIC is disabled. -AK
38 */
39 if (!disable_apic)
40 ack_APIC_irq();
41}
42
23#ifdef CONFIG_DEBUG_STACKOVERFLOW 43#ifdef CONFIG_DEBUG_STACKOVERFLOW
24/* 44/*
25 * Probabilistic stack overflow check: 45 * Probabilistic stack overflow check:
@@ -33,11 +53,11 @@ static inline void stack_overflow_check(struct pt_regs *regs)
33 u64 curbase = (u64)task_stack_page(current); 53 u64 curbase = (u64)task_stack_page(current);
34 static unsigned long warned = -60*HZ; 54 static unsigned long warned = -60*HZ;
35 55
36 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && 56 if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
37 regs->rsp < curbase + sizeof(struct thread_info) + 128 && 57 regs->sp < curbase + sizeof(struct thread_info) + 128 &&
38 time_after(jiffies, warned + 60*HZ)) { 58 time_after(jiffies, warned + 60*HZ)) {
39 printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", 59 printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
40 current->comm, curbase, regs->rsp); 60 current->comm, curbase, regs->sp);
41 show_stack(NULL,NULL); 61 show_stack(NULL,NULL);
42 warned = jiffies; 62 warned = jiffies;
43 } 63 }
@@ -142,7 +162,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
142 struct pt_regs *old_regs = set_irq_regs(regs); 162 struct pt_regs *old_regs = set_irq_regs(regs);
143 163
144 /* high bit used in ret_from_ code */ 164 /* high bit used in ret_from_ code */
145 unsigned vector = ~regs->orig_rax; 165 unsigned vector = ~regs->orig_ax;
146 unsigned irq; 166 unsigned irq;
147 167
148 exit_idle(); 168 exit_idle();
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
new file mode 100644
index 00000000000..73354302fda
--- /dev/null
+++ b/arch/x86/kernel/kdebugfs.c
@@ -0,0 +1,65 @@
1/*
2 * Architecture specific debugfs files
3 *
4 * Copyright (C) 2007, Intel Corp.
5 * Huang Ying <ying.huang@intel.com>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/debugfs.h>
11#include <linux/stat.h>
12#include <linux/init.h>
13
14#include <asm/setup.h>
15
16#ifdef CONFIG_DEBUG_BOOT_PARAMS
17static struct debugfs_blob_wrapper boot_params_blob = {
18 .data = &boot_params,
19 .size = sizeof(boot_params),
20};
21
22static int __init boot_params_kdebugfs_init(void)
23{
24 int error;
25 struct dentry *dbp, *version, *data;
26
27 dbp = debugfs_create_dir("boot_params", NULL);
28 if (!dbp) {
29 error = -ENOMEM;
30 goto err_return;
31 }
32 version = debugfs_create_x16("version", S_IRUGO, dbp,
33 &boot_params.hdr.version);
34 if (!version) {
35 error = -ENOMEM;
36 goto err_dir;
37 }
38 data = debugfs_create_blob("data", S_IRUGO, dbp,
39 &boot_params_blob);
40 if (!data) {
41 error = -ENOMEM;
42 goto err_version;
43 }
44 return 0;
45err_version:
46 debugfs_remove(version);
47err_dir:
48 debugfs_remove(dbp);
49err_return:
50 return error;
51}
52#endif
53
54static int __init arch_kdebugfs_init(void)
55{
56 int error = 0;
57
58#ifdef CONFIG_DEBUG_BOOT_PARAMS
59 error = boot_params_kdebugfs_init();
60#endif
61
62 return error;
63}
64
65arch_initcall(arch_kdebugfs_init);
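
With CONFIG_DEBUG_BOOT_PARAMS enabled, the files created above appear as
boot_params/version and boot_params/data under debugfs. A minimal reader,
assuming debugfs is mounted at the conventional /sys/kernel/debug:

#include <stdio.h>

int main(void)
{
	char buf[16];
	FILE *f = fopen("/sys/kernel/debug/boot_params/version", "r");

	if (!f) {
		perror("boot_params/version");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("boot protocol version: %s", buf);	/* e.g. 0x0208 */
	fclose(f);
	return 0;
}
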
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
new file mode 100644
index 00000000000..a99e764fd66
--- /dev/null
+++ b/arch/x86/kernel/kprobes.c
@@ -0,0 +1,1066 @@
1/*
2 * Kernel Probes (KProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 *
20 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
21 * Probes initial implementation ( includes contributions from
22 * Rusty Russell).
23 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
24 * interface to access function arguments.
25 * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
26 * <prasanna@in.ibm.com> adapted for x86_64 from i386.
27 * 2005-Mar Roland McGrath <roland@redhat.com>
28 * Fixed to handle %rip-relative addressing mode correctly.
29 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
31 * <prasanna@in.ibm.com> added function-return probes.
32 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
33 * Added function return probes functionality
34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
35 * kprobe-booster and kretprobe-booster for i386.
36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
37 * and kretprobe-booster for x86-64
38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
40 * unified x86 kprobes code.
41 */
42
43#include <linux/kprobes.h>
44#include <linux/ptrace.h>
45#include <linux/string.h>
46#include <linux/slab.h>
47#include <linux/hardirq.h>
48#include <linux/preempt.h>
49#include <linux/module.h>
50#include <linux/kdebug.h>
51
52#include <asm/cacheflush.h>
53#include <asm/desc.h>
54#include <asm/pgtable.h>
55#include <asm/uaccess.h>
56#include <asm/alternative.h>
57
58void jprobe_return_end(void);
59
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62
63#ifdef CONFIG_X86_64
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
79 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
80 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
81 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
82 << (row % 32))
83 /*
84 * Undefined/reserved opcodes, conditional jump, Opcode Extension
85 * Groups, and some special opcodes can not boost.
86 */
87static const u32 twobyte_is_boostable[256 / 32] = {
88 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
89 /* ---------------------------------------------- */
90 W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
91 W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */
92 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
93 W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
94 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
95 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
96 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
97 W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
98 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
99 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
100 W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
101 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
102 W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
103 W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
104 W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
105 W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */
106 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W
154
155struct kretprobe_blackpoint kretprobe_blacklist[] = {
156 {"__switch_to", }, /* This function switches only current task, but
157 doesn't switch kernel stack.*/
158 {NULL, NULL} /* Terminator */
159};
160const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
161
162/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
163static void __kprobes set_jmp_op(void *from, void *to)
164{
165 struct __arch_jmp_op {
166 char op;
167 s32 raddr;
168 } __attribute__((packed)) * jop;
169 jop = (struct __arch_jmp_op *)from;
170 jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
171 jop->op = RELATIVEJUMP_INSTRUCTION;
172}
173
174/*
 175 * Check for the REX prefix, which can only exist on X86_64;
 176 * on X86_32 this always returns 0.
177 */
178static int __kprobes is_REX_prefix(kprobe_opcode_t *insn)
179{
180#ifdef CONFIG_X86_64
181 if ((*insn & 0xf0) == 0x40)
182 return 1;
183#endif
184 return 0;
185}
186
187/*
188 * Returns non-zero if opcode is boostable.
 189 * RIP-relative instructions are adjusted at copying time in 64-bit mode.
190 */
191static int __kprobes can_boost(kprobe_opcode_t *opcodes)
192{
193 kprobe_opcode_t opcode;
194 kprobe_opcode_t *orig_opcodes = opcodes;
195
196retry:
197 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
198 return 0;
199 opcode = *(opcodes++);
200
201 /* 2nd-byte opcode */
202 if (opcode == 0x0f) {
203 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
204 return 0;
205 return test_bit(*opcodes,
206 (unsigned long *)twobyte_is_boostable);
207 }
208
209 switch (opcode & 0xf0) {
210#ifdef CONFIG_X86_64
211 case 0x40:
212 goto retry; /* REX prefix is boostable */
213#endif
214 case 0x60:
215 if (0x63 < opcode && opcode < 0x67)
216 goto retry; /* prefixes */
217 /* can't boost Address-size override and bound */
218 return (opcode != 0x62 && opcode != 0x67);
219 case 0x70:
220 return 0; /* can't boost conditional jump */
221 case 0xc0:
222 /* can't boost software-interruptions */
223 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
224 case 0xd0:
225 /* can boost AA* and XLAT */
226 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
227 case 0xe0:
228 /* can boost in/out and absolute jmps */
229 return ((opcode & 0x04) || opcode == 0xea);
230 case 0xf0:
231 if ((opcode & 0x0c) == 0 && opcode != 0xf1)
232 goto retry; /* lock/rep(ne) prefix */
233 /* clear and set flags are boostable */
234 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
235 default:
236 /* segment override prefixes are boostable */
237 if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
238 goto retry; /* prefixes */
239 /* CS override prefix and call are not boostable */
240 return (opcode != 0x2e && opcode != 0x9a);
241 }
242}
243
244/*
245 * Returns non-zero if opcode modifies the interrupt flag.
246 */
247static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
248{
249 switch (*insn) {
250 case 0xfa: /* cli */
251 case 0xfb: /* sti */
252 case 0xcf: /* iret/iretd */
253 case 0x9d: /* popf/popfd */
254 return 1;
255 }
256
257 /*
 258 * On X86_64, 0x40-0x4f are REX prefixes, so we need to look
 259 * at the next byte instead (but of course not recurse infinitely).
260 */
261 if (is_REX_prefix(insn))
262 return is_IF_modifier(++insn);
263
264 return 0;
265}
266
267/*
268 * Adjust the displacement if the instruction uses the %rip-relative
269 * addressing mode.
 270 * If it does, return the address of the 32-bit displacement word.
271 * If not, return null.
272 * Only applicable to 64-bit x86.
273 */
274static void __kprobes fix_riprel(struct kprobe *p)
275{
276#ifdef CONFIG_X86_64
277 u8 *insn = p->ainsn.insn;
278 s64 disp;
279 int need_modrm;
280
281 /* Skip legacy instruction prefixes. */
282 while (1) {
283 switch (*insn) {
284 case 0x66:
285 case 0x67:
286 case 0x2e:
287 case 0x3e:
288 case 0x26:
289 case 0x64:
290 case 0x65:
291 case 0x36:
292 case 0xf0:
293 case 0xf3:
294 case 0xf2:
295 ++insn;
296 continue;
297 }
298 break;
299 }
300
301 /* Skip REX instruction prefix. */
302 if (is_REX_prefix(insn))
303 ++insn;
304
305 if (*insn == 0x0f) {
306 /* Two-byte opcode. */
307 ++insn;
308 need_modrm = test_bit(*insn,
309 (unsigned long *)twobyte_has_modrm);
310 } else
311 /* One-byte opcode. */
312 need_modrm = test_bit(*insn,
313 (unsigned long *)onebyte_has_modrm);
314
315 if (need_modrm) {
316 u8 modrm = *++insn;
317 if ((modrm & 0xc7) == 0x05) {
318 /* %rip+disp32 addressing mode */
319 /* Displacement follows ModRM byte. */
320 ++insn;
321 /*
322 * The copied instruction uses the %rip-relative
323 * addressing mode. Adjust the displacement for the
324 * difference between the original location of this
325 * instruction and the location of the copy that will
326 * actually be run. The tricky bit here is making sure
327 * that the sign extension happens correctly in this
328 * calculation, since we need a signed 32-bit result to
329 * be sign-extended to 64 bits when it's added to the
330 * %rip value and yield the same 64-bit result that the
331 * sign-extension of the original signed 32-bit
332 * displacement would have given.
333 */
334 disp = (u8 *) p->addr + *((s32 *) insn) -
335 (u8 *) p->ainsn.insn;
336 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
337 *(s32 *)insn = (s32) disp;
338 }
339 }
340#endif
341}
342
343static void __kprobes arch_copy_kprobe(struct kprobe *p)
344{
345 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
346
347 fix_riprel(p);
348
349 if (can_boost(p->addr))
350 p->ainsn.boostable = 0;
351 else
352 p->ainsn.boostable = -1;
353
354 p->opcode = *p->addr;
355}
356
357int __kprobes arch_prepare_kprobe(struct kprobe *p)
358{
359 /* insn: must be on special executable page on x86. */
360 p->ainsn.insn = get_insn_slot();
361 if (!p->ainsn.insn)
362 return -ENOMEM;
363 arch_copy_kprobe(p);
364 return 0;
365}
366
367void __kprobes arch_arm_kprobe(struct kprobe *p)
368{
369 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
370}
371
372void __kprobes arch_disarm_kprobe(struct kprobe *p)
373{
374 text_poke(p->addr, &p->opcode, 1);
375}
376
377void __kprobes arch_remove_kprobe(struct kprobe *p)
378{
379 mutex_lock(&kprobe_mutex);
380 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
381 mutex_unlock(&kprobe_mutex);
382}
383
384static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
385{
386 kcb->prev_kprobe.kp = kprobe_running();
387 kcb->prev_kprobe.status = kcb->kprobe_status;
388 kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
389 kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
390}
391
392static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
393{
394 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
395 kcb->kprobe_status = kcb->prev_kprobe.status;
396 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
397 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
398}
399
400static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
401 struct kprobe_ctlblk *kcb)
402{
403 __get_cpu_var(current_kprobe) = p;
404 kcb->kprobe_saved_flags = kcb->kprobe_old_flags
405 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
406 if (is_IF_modifier(p->ainsn.insn))
407 kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
408}
409
410static void __kprobes clear_btf(void)
411{
412 if (test_thread_flag(TIF_DEBUGCTLMSR))
413 wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
414}
415
416static void __kprobes restore_btf(void)
417{
418 if (test_thread_flag(TIF_DEBUGCTLMSR))
419 wrmsrl(MSR_IA32_DEBUGCTLMSR, current->thread.debugctlmsr);
420}
421
422static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
423{
424 clear_btf();
425 regs->flags |= X86_EFLAGS_TF;
426 regs->flags &= ~X86_EFLAGS_IF;
427 /* single step inline if the instruction is an int3 */
428 if (p->opcode == BREAKPOINT_INSTRUCTION)
429 regs->ip = (unsigned long)p->addr;
430 else
431 regs->ip = (unsigned long)p->ainsn.insn;
432}
433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs)
437{
438 unsigned long *sara = stack_addr(regs);
439
440 ri->ret_addr = (kprobe_opcode_t *) *sara;
441
442 /* Replace the return addr with trampoline addr */
443 *sara = (unsigned long) &kretprobe_trampoline;
444}
445
446static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
447 struct kprobe_ctlblk *kcb)
448{
449#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
450 if (p->ainsn.boostable == 1 && !p->post_handler) {
451 /* Boost up -- we can execute copied instructions directly */
452 reset_current_kprobe();
453 regs->ip = (unsigned long)p->ainsn.insn;
454 preempt_enable_no_resched();
455 return;
456 }
457#endif
458 prepare_singlestep(p, regs);
459 kcb->kprobe_status = KPROBE_HIT_SS;
460}
461
462/*
463 * We have reentered the kprobe_handler(), since another probe was hit while
464 * within the handler. We save the original kprobes variables and just single
465 * step on the instruction of the new probe without calling any user handlers.
466 */
467static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
468 struct kprobe_ctlblk *kcb)
469{
470 switch (kcb->kprobe_status) {
471 case KPROBE_HIT_SSDONE:
472#ifdef CONFIG_X86_64
473 /* TODO: Provide re-entrancy from post_kprobes_handler() and
474 * avoid exception stack corruption while single-stepping on
475 * the instruction of the new probe.
476 */
477 arch_disarm_kprobe(p);
478 regs->ip = (unsigned long)p->addr;
479 reset_current_kprobe();
480 preempt_enable_no_resched();
481 break;
482#endif
483 case KPROBE_HIT_ACTIVE:
484 save_previous_kprobe(kcb);
485 set_current_kprobe(p, regs, kcb);
486 kprobes_inc_nmissed_count(p);
487 prepare_singlestep(p, regs);
488 kcb->kprobe_status = KPROBE_REENTER;
489 break;
490 case KPROBE_HIT_SS:
491 if (p == kprobe_running()) {
492 regs->flags &= ~TF_MASK;
493 regs->flags |= kcb->kprobe_saved_flags;
494 return 0;
495 } else {
496 /* A probe has been hit in the codepath leading up
497 * to, or just after, single-stepping of a probed
498 * instruction. This entire codepath should strictly
499 * reside in .kprobes.text section. Raise a warning
500 * to highlight this peculiar case.
501 */
502 }
503 default:
504 /* impossible cases */
505 WARN_ON(1);
506 return 0;
507 }
508
509 return 1;
510}
511
512/*
513 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
514 * remain disabled thorough out this function.
515 */
516static int __kprobes kprobe_handler(struct pt_regs *regs)
517{
518 kprobe_opcode_t *addr;
519 struct kprobe *p;
520 struct kprobe_ctlblk *kcb;
521
522 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
523 if (*addr != BREAKPOINT_INSTRUCTION) {
524 /*
525 * The breakpoint instruction was removed right
526 * after we hit it. Another cpu has removed
527 * either a probepoint or a debugger breakpoint
528 * at this address. In either case, no further
529 * handling of this interrupt is appropriate.
530 * Back up over the (now missing) int3 and run
531 * the original instruction.
532 */
533 regs->ip = (unsigned long)addr;
534 return 1;
535 }
536
537 /*
538 * We don't want to be preempted for the entire
539 * duration of kprobe processing. We conditionally
540 * re-enable preemption at the end of this function,
541 * and also in reenter_kprobe() and setup_singlestep().
542 */
543 preempt_disable();
544
545 kcb = get_kprobe_ctlblk();
546 p = get_kprobe(addr);
547
548 if (p) {
549 if (kprobe_running()) {
550 if (reenter_kprobe(p, regs, kcb))
551 return 1;
552 } else {
553 set_current_kprobe(p, regs, kcb);
554 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
555
556 /*
557 * If we have no pre-handler or it returned 0, we
558 * continue with normal processing. If we have a
559 * pre-handler and it returned non-zero, it prepped
560 * for calling the break_handler below on re-entry
561 * for jprobe processing, so get out doing nothing
562 * more here.
563 */
564 if (!p->pre_handler || !p->pre_handler(p, regs))
565 setup_singlestep(p, regs, kcb);
566 return 1;
567 }
568 } else if (kprobe_running()) {
569 p = __get_cpu_var(current_kprobe);
570 if (p->break_handler && p->break_handler(p, regs)) {
571 setup_singlestep(p, regs, kcb);
572 return 1;
573 }
574 } /* else: not a kprobe fault; let the kernel handle it */
575
576 preempt_enable_no_resched();
577 return 0;
578}
579
580/*
581 * When a retprobed function returns, this code saves registers and
 582 * calls trampoline_handler(), which calls the kretprobe's handler.
583 */
584void __kprobes kretprobe_trampoline_holder(void)
585{
586 asm volatile (
587 ".global kretprobe_trampoline\n"
588 "kretprobe_trampoline: \n"
589#ifdef CONFIG_X86_64
590 /* We don't bother saving the ss register */
591 " pushq %rsp\n"
592 " pushfq\n"
593 /*
594 * Skip cs, ip, orig_ax.
595 * trampoline_handler() will plug in these values
596 */
597 " subq $24, %rsp\n"
598 " pushq %rdi\n"
599 " pushq %rsi\n"
600 " pushq %rdx\n"
601 " pushq %rcx\n"
602 " pushq %rax\n"
603 " pushq %r8\n"
604 " pushq %r9\n"
605 " pushq %r10\n"
606 " pushq %r11\n"
607 " pushq %rbx\n"
608 " pushq %rbp\n"
609 " pushq %r12\n"
610 " pushq %r13\n"
611 " pushq %r14\n"
612 " pushq %r15\n"
613 " movq %rsp, %rdi\n"
614 " call trampoline_handler\n"
615 /* Replace saved sp with true return address. */
616 " movq %rax, 152(%rsp)\n"
617 " popq %r15\n"
618 " popq %r14\n"
619 " popq %r13\n"
620 " popq %r12\n"
621 " popq %rbp\n"
622 " popq %rbx\n"
623 " popq %r11\n"
624 " popq %r10\n"
625 " popq %r9\n"
626 " popq %r8\n"
627 " popq %rax\n"
628 " popq %rcx\n"
629 " popq %rdx\n"
630 " popq %rsi\n"
631 " popq %rdi\n"
632 /* Skip orig_ax, ip, cs */
633 " addq $24, %rsp\n"
634 " popfq\n"
635#else
636 " pushf\n"
637 /*
638 * Skip cs, ip, orig_ax.
639 * trampoline_handler() will plug in these values
640 */
641 " subl $12, %esp\n"
642 " pushl %fs\n"
643 " pushl %ds\n"
644 " pushl %es\n"
645 " pushl %eax\n"
646 " pushl %ebp\n"
647 " pushl %edi\n"
648 " pushl %esi\n"
649 " pushl %edx\n"
650 " pushl %ecx\n"
651 " pushl %ebx\n"
652 " movl %esp, %eax\n"
653 " call trampoline_handler\n"
654 /* Move flags to cs */
655 " movl 52(%esp), %edx\n"
656 " movl %edx, 48(%esp)\n"
657 /* Replace saved flags with true return address. */
658 " movl %eax, 52(%esp)\n"
659 " popl %ebx\n"
660 " popl %ecx\n"
661 " popl %edx\n"
662 " popl %esi\n"
663 " popl %edi\n"
664 " popl %ebp\n"
665 " popl %eax\n"
666 /* Skip ip, orig_ax, es, ds, fs */
667 " addl $20, %esp\n"
668 " popf\n"
669#endif
670 " ret\n");
671}
672
673/*
674 * Called from kretprobe_trampoline
675 */
676void * __kprobes trampoline_handler(struct pt_regs *regs)
677{
678 struct kretprobe_instance *ri = NULL;
679 struct hlist_head *head, empty_rp;
680 struct hlist_node *node, *tmp;
681 unsigned long flags, orig_ret_address = 0;
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683
684 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */
688#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS;
690#else
691 regs->cs = __KERNEL_CS | get_kernel_rpl();
692#endif
693 regs->ip = trampoline_address;
694 regs->orig_ax = ~0UL;
695
696 /*
697 * It is possible to have multiple instances associated with a given
698 * task either because multiple functions in the call path have
 699 * return probes installed on them, and/or more than one
700 * return probe was registered for a target function.
701 *
702 * We can handle this because:
703 * - instances are always pushed into the head of the list
704 * - when multiple return probes are registered for the same
705 * function, the (chronologically) first instance's ret_addr
706 * will be the real return address, and all the rest will
707 * point to kretprobe_trampoline.
708 */
709 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
710 if (ri->task != current)
711 /* another task is sharing our hash bucket */
712 continue;
713
714 if (ri->rp && ri->rp->handler) {
715 __get_cpu_var(current_kprobe) = &ri->rp->kp;
716 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
717 ri->rp->handler(ri, regs);
718 __get_cpu_var(current_kprobe) = NULL;
719 }
720
721 orig_ret_address = (unsigned long)ri->ret_addr;
722 recycle_rp_inst(ri, &empty_rp);
723
724 if (orig_ret_address != trampoline_address)
725 /*
726 * This is the real return address. Any other
727 * instances associated with this task are for
728 * other calls deeper on the call stack
729 */
730 break;
731 }
732
733 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734
735 spin_unlock_irqrestore(&kretprobe_lock, flags);
736
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist);
739 kfree(ri);
740 }
741 return (void *)orig_ret_address;
742}
743
744/*
745 * Called after single-stepping. p->addr is the address of the
746 * instruction whose first byte has been replaced by the "int 3"
747 * instruction. To avoid the SMP problems that can occur when we
748 * temporarily put back the original opcode to single-step, we
749 * single-stepped a copy of the instruction. The address of this
750 * copy is p->ainsn.insn.
751 *
752 * This function prepares to return from the post-single-step
753 * interrupt. We have to fix up the stack as follows:
754 *
755 * 0) Except in the case of absolute or indirect jump or call instructions,
756 * the new ip is relative to the copied instruction. We need to make
757 * it relative to the original instruction.
758 *
759 * 1) If the single-stepped instruction was pushfl, then the TF and IF
760 * flags are set in the just-pushed flags, and may need to be cleared.
761 *
762 * 2) If the single-stepped instruction was a call, the return address
763 * that is atop the stack is the address following the copied instruction.
764 * We need to make it the address following the original instruction.
765 *
766 * If this is the first time we've single-stepped the instruction at
767 * this probepoint, and the instruction is boostable, boost it: add a
768 * jump instruction after the copied instruction, that jumps to the next
769 * instruction after the probepoint.
770 */
771static void __kprobes resume_execution(struct kprobe *p,
772 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
773{
774 unsigned long *tos = stack_addr(regs);
775 unsigned long copy_ip = (unsigned long)p->ainsn.insn;
776 unsigned long orig_ip = (unsigned long)p->addr;
777 kprobe_opcode_t *insn = p->ainsn.insn;
778
779 /*skip the REX prefix*/
780 if (is_REX_prefix(insn))
781 insn++;
782
783 regs->flags &= ~X86_EFLAGS_TF;
784 switch (*insn) {
785 case 0x9c: /* pushfl */
786 *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
787 *tos |= kcb->kprobe_old_flags;
788 break;
789 case 0xc2: /* iret/ret/lret */
790 case 0xc3:
791 case 0xca:
792 case 0xcb:
793 case 0xcf:
794 case 0xea: /* jmp absolute -- ip is correct */
795 /* ip is already adjusted, no more changes required */
796 p->ainsn.boostable = 1;
797 goto no_change;
798 case 0xe8: /* call relative - Fix return addr */
799 *tos = orig_ip + (*tos - copy_ip);
800 break;
801#ifdef CONFIG_X86_32
802 case 0x9a: /* call absolute -- same as call absolute, indirect */
803 *tos = orig_ip + (*tos - copy_ip);
804 goto no_change;
805#endif
806 case 0xff:
807 if ((insn[1] & 0x30) == 0x10) {
808 /*
809 * call absolute, indirect
810 * Fix return addr; ip is correct.
811 * But this is not boostable
812 */
813 *tos = orig_ip + (*tos - copy_ip);
814 goto no_change;
815 } else if (((insn[1] & 0x31) == 0x20) ||
816 ((insn[1] & 0x31) == 0x21)) {
817 /*
818 * jmp near and far, absolute indirect
819 * ip is correct. And this is boostable
820 */
821 p->ainsn.boostable = 1;
822 goto no_change;
823 }
824 default:
825 break;
826 }
827
828 if (p->ainsn.boostable == 0) {
829 if ((regs->ip > copy_ip) &&
830 (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
831 /*
832 * These instructions can be executed directly if it
833 * jumps back to correct address.
834 */
835 set_jmp_op((void *)regs->ip,
836 (void *)orig_ip + (regs->ip - copy_ip));
837 p->ainsn.boostable = 1;
838 } else {
839 p->ainsn.boostable = -1;
840 }
841 }
842
843 regs->ip += orig_ip - copy_ip;
844
845no_change:
846 restore_btf();
847}
848
849/*
850 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
 851 * remain disabled throughout this function.
852 */
853static int __kprobes post_kprobe_handler(struct pt_regs *regs)
854{
855 struct kprobe *cur = kprobe_running();
856 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
857
858 if (!cur)
859 return 0;
860
861 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
862 kcb->kprobe_status = KPROBE_HIT_SSDONE;
863 cur->post_handler(cur, regs, 0);
864 }
865
866 resume_execution(cur, regs, kcb);
867 regs->flags |= kcb->kprobe_saved_flags;
868 trace_hardirqs_fixup_flags(regs->flags);
869
870 /* Restore back the original saved kprobes variables and continue. */
871 if (kcb->kprobe_status == KPROBE_REENTER) {
872 restore_previous_kprobe(kcb);
873 goto out;
874 }
875 reset_current_kprobe();
876out:
877 preempt_enable_no_resched();
878
879 /*
880 * if somebody else is singlestepping across a probe point, flags
881 * will have TF set, in which case, continue the remaining processing
882 * of do_debug, as if this is not a probe hit.
883 */
884 if (regs->flags & X86_EFLAGS_TF)
885 return 0;
886
887 return 1;
888}
889
890int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
891{
892 struct kprobe *cur = kprobe_running();
893 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
894
895 switch (kcb->kprobe_status) {
896 case KPROBE_HIT_SS:
897 case KPROBE_REENTER:
898 /*
899 * We are here because the instruction being single
900 * stepped caused a page fault. We reset the current
901 * kprobe and the ip points back to the probe address
902 * and allow the page fault handler to continue as a
903 * normal page fault.
904 */
905 regs->ip = (unsigned long)cur->addr;
906 regs->flags |= kcb->kprobe_old_flags;
907 if (kcb->kprobe_status == KPROBE_REENTER)
908 restore_previous_kprobe(kcb);
909 else
910 reset_current_kprobe();
911 preempt_enable_no_resched();
912 break;
913 case KPROBE_HIT_ACTIVE:
914 case KPROBE_HIT_SSDONE:
915 /*
 916 * We increment the nmissed count for accounting;
917 * we can also use npre/npostfault count for accounting
918 * these specific fault cases.
919 */
920 kprobes_inc_nmissed_count(cur);
921
922 /*
923 * We come here because instructions in the pre/post
924 * handler caused the page_fault, this could happen
925 * if handler tries to access user space by
926 * copy_from_user(), get_user() etc. Let the
927 * user-specified handler try to fix it first.
928 */
929 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
930 return 1;
931
932 /*
933 * In case the user-specified fault handler returned
934 * zero, try to fix up.
935 */
936 if (fixup_exception(regs))
937 return 1;
938
939 /*
940 * fixup routine could not handle it,
941 * Let do_page_fault() fix it.
942 */
943 break;
944 default:
945 break;
946 }
947 return 0;
948}
949
950/*
951 * Wrapper routine for handling exceptions.
952 */
953int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
954 unsigned long val, void *data)
955{
956 struct die_args *args = data;
957 int ret = NOTIFY_DONE;
958
959 if (args->regs && user_mode_vm(args->regs))
960 return ret;
961
962 switch (val) {
963 case DIE_INT3:
964 if (kprobe_handler(args->regs))
965 ret = NOTIFY_STOP;
966 break;
967 case DIE_DEBUG:
968 if (post_kprobe_handler(args->regs))
969 ret = NOTIFY_STOP;
970 break;
971 case DIE_GPF:
972 /*
973 * To be potentially processing a kprobe fault and to
974 * trust the result from kprobe_running(), we have
975 * be non-preemptible.
976 */
977 if (!preemptible() && kprobe_running() &&
978 kprobe_fault_handler(args->regs, args->trapnr))
979 ret = NOTIFY_STOP;
980 break;
981 default:
982 break;
983 }
984 return ret;
985}
986
987int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
988{
989 struct jprobe *jp = container_of(p, struct jprobe, kp);
990 unsigned long addr;
991 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
992
993 kcb->jprobe_saved_regs = *regs;
994 kcb->jprobe_saved_sp = stack_addr(regs);
995 addr = (unsigned long)(kcb->jprobe_saved_sp);
996
997 /*
998 * As Linus pointed out, gcc assumes that the callee
999 * owns the argument space and could overwrite it, e.g.
1000 * tailcall optimization. So, to be absolutely safe
1001 * we also save and restore enough stack bytes to cover
1002 * the argument area.
1003 */
1004 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
1005 MIN_STACK_SIZE(addr));
1006 regs->flags &= ~X86_EFLAGS_IF;
1007 trace_hardirqs_off();
1008 regs->ip = (unsigned long)(jp->entry);
1009 return 1;
1010}
1011
1012void __kprobes jprobe_return(void)
1013{
1014 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1015
1016 asm volatile (
1017#ifdef CONFIG_X86_64
1018 " xchg %%rbx,%%rsp \n"
1019#else
1020 " xchgl %%ebx,%%esp \n"
1021#endif
1022 " int3 \n"
1023 " .globl jprobe_return_end\n"
1024 " jprobe_return_end: \n"
1025 " nop \n"::"b"
1026 (kcb->jprobe_saved_sp):"memory");
1027}
1028
1029int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1030{
1031 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1032 u8 *addr = (u8 *) (regs->ip - 1);
1033 struct jprobe *jp = container_of(p, struct jprobe, kp);
1034
1035 if ((addr > (u8 *) jprobe_return) &&
1036 (addr < (u8 *) jprobe_return_end)) {
1037 if (stack_addr(regs) != kcb->jprobe_saved_sp) {
1038 struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
1039 printk(KERN_ERR
1040 "current sp %p does not match saved sp %p\n",
1041 stack_addr(regs), kcb->jprobe_saved_sp);
1042 printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
1043 show_registers(saved_regs);
1044 printk(KERN_ERR "Current registers\n");
1045 show_registers(regs);
1046 BUG();
1047 }
1048 *regs = kcb->jprobe_saved_regs;
1049 memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
1050 kcb->jprobes_stack,
1051 MIN_STACK_SIZE(kcb->jprobe_saved_sp));
1052 preempt_enable_no_resched();
1053 return 1;
1054 }
1055 return 0;
1056}
1057
1058int __init arch_init_kprobes(void)
1059{
1060 return 0;
1061}
1062
1063int __kprobes arch_trampoline_kprobe(struct kprobe *p)
1064{
1065 return 0;
1066}
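/*
 * For context, the pre_handler/post_handler/fault_handler hooks exercised
 * throughout this file are supplied by kprobes clients through the generic
 * register_kprobe() interface. A minimal, illustrative usage sketch (not
 * part of this patch; the probed symbol name is hypothetical):
 */
#include <linux/module.h>
#include <linux/kprobes.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;	/* 0 = let kprobe_handler() single-step the insn */
}

static struct kprobe example_kp = {
	.symbol_name	= "do_fork",	/* hypothetical probe target */
	.pre_handler	= example_pre,
};

static int __init example_init(void)
{
	return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");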
diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c
deleted file mode 100644
index 3a020f79f82..00000000000
--- a/arch/x86/kernel/kprobes_32.c
+++ /dev/null
@@ -1,756 +0,0 @@
1/*
2 * Kernel Probes (KProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 *
20 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
21 * Probes initial implementation ( includes contributions from
22 * Rusty Russell).
23 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
24 * interface to access function arguments.
25 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
26 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
27 * <prasanna@in.ibm.com> added function-return probes.
28 */
29
30#include <linux/kprobes.h>
31#include <linux/ptrace.h>
32#include <linux/preempt.h>
33#include <linux/kdebug.h>
34#include <asm/cacheflush.h>
35#include <asm/desc.h>
36#include <asm/uaccess.h>
37#include <asm/alternative.h>
38
39void jprobe_return_end(void);
40
41DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
42DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
43
44struct kretprobe_blackpoint kretprobe_blacklist[] = {
45 {"__switch_to", }, /* This function switches only current task, but
46 doesn't switch kernel stack.*/
47 {NULL, NULL} /* Terminator */
48};
49const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
50
51/* insert a jmp code */
52static __always_inline void set_jmp_op(void *from, void *to)
53{
54 struct __arch_jmp_op {
55 char op;
56 long raddr;
57 } __attribute__((packed)) *jop;
58 jop = (struct __arch_jmp_op *)from;
59 jop->raddr = (long)(to) - ((long)(from) + 5);
60 jop->op = RELATIVEJUMP_INSTRUCTION;
61}
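/*
 * Worked example: from = 0x1000, to = 0x2000 gives
 * raddr = 0x2000 - (0x1000 + 5) = 0xffb, so the five bytes written are
 * e9 fb 0f 00 00 (assuming RELATIVEJUMP_INSTRUCTION is the usual 0xe9
 * "jmp rel32" opcode) -- a jump whose displacement is measured from the
 * end of the 5-byte instruction.
 */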
62
63/*
64 * returns non-zero if opcodes can be boosted.
65 */
66static __always_inline int can_boost(kprobe_opcode_t *opcodes)
67{
68#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
69 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
70 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
71 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
72 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
73 << (row % 32))
74 /*
 75	 * Undefined/reserved opcodes, conditional jumps, Opcode Extension
 76	 * Groups, and some special opcodes cannot be boosted.
77 */
78 static const unsigned long twobyte_is_boostable[256 / 32] = {
79 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
80 /* ------------------------------- */
81 W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
82 W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
83 W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
84 W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
85 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
86 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
87 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
88 W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
89 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
90 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
91 W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
92 W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
93 W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
94 W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
95 W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
96 W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */
97 /* ------------------------------- */
98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
99 };
100#undef W
101 kprobe_opcode_t opcode;
102 kprobe_opcode_t *orig_opcodes = opcodes;
103retry:
104 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
105 return 0;
106 opcode = *(opcodes++);
107
108 /* 2nd-byte opcode */
109 if (opcode == 0x0f) {
110 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
111 return 0;
112 return test_bit(*opcodes, twobyte_is_boostable);
113 }
114
115 switch (opcode & 0xf0) {
116 case 0x60:
117 if (0x63 < opcode && opcode < 0x67)
118 goto retry; /* prefixes */
119 /* can't boost Address-size override and bound */
120 return (opcode != 0x62 && opcode != 0x67);
121 case 0x70:
122 return 0; /* can't boost conditional jump */
123 case 0xc0:
124 /* can't boost software-interruptions */
125 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
126 case 0xd0:
127 /* can boost AA* and XLAT */
128 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
129 case 0xe0:
130 /* can boost in/out and absolute jmps */
131 return ((opcode & 0x04) || opcode == 0xea);
132 case 0xf0:
133 if ((opcode & 0x0c) == 0 && opcode != 0xf1)
134 goto retry; /* lock/rep(ne) prefix */
 135		/* clear and set flags can be boosted */
136 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
137 default:
138 if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
139 goto retry; /* prefixes */
140 /* can't boost CS override and call */
141 return (opcode != 0x2e && opcode != 0x9a);
142 }
143}
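/*
 * Each W() row above packs sixteen 0/1 flags into one bit per opcode, so
 * the table is just a 256-bit bitmap and the lookup is a single
 * test_bit(). For example, 0f 90 (seto) has its bit set in row 0x90 and
 * may be boosted, while 0f 05 (syscall) has its bit clear in row 0x00
 * and always takes the single-step-only path.
 */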
144
145/*
146 * returns non-zero if opcode modifies the interrupt flag.
147 */
148static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
149{
150 switch (opcode) {
151 case 0xfa: /* cli */
152 case 0xfb: /* sti */
153 case 0xcf: /* iret/iretd */
154 case 0x9d: /* popf/popfd */
155 return 1;
156 }
157 return 0;
158}
159
160int __kprobes arch_prepare_kprobe(struct kprobe *p)
161{
162 /* insn: must be on special executable page on i386. */
163 p->ainsn.insn = get_insn_slot();
164 if (!p->ainsn.insn)
165 return -ENOMEM;
166
167 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
168 p->opcode = *p->addr;
169 if (can_boost(p->addr)) {
170 p->ainsn.boostable = 0;
171 } else {
172 p->ainsn.boostable = -1;
173 }
174 return 0;
175}
176
177void __kprobes arch_arm_kprobe(struct kprobe *p)
178{
179 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
180}
181
182void __kprobes arch_disarm_kprobe(struct kprobe *p)
183{
184 text_poke(p->addr, &p->opcode, 1);
185}
186
187void __kprobes arch_remove_kprobe(struct kprobe *p)
188{
189 mutex_lock(&kprobe_mutex);
190 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
191 mutex_unlock(&kprobe_mutex);
192}
193
194static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
195{
196 kcb->prev_kprobe.kp = kprobe_running();
197 kcb->prev_kprobe.status = kcb->kprobe_status;
198 kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
199 kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
200}
201
202static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
203{
204 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
205 kcb->kprobe_status = kcb->prev_kprobe.status;
206 kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
207 kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
208}
209
210static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
211 struct kprobe_ctlblk *kcb)
212{
213 __get_cpu_var(current_kprobe) = p;
214 kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags
215 = (regs->eflags & (TF_MASK | IF_MASK));
216 if (is_IF_modifier(p->opcode))
217 kcb->kprobe_saved_eflags &= ~IF_MASK;
218}
219
220static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
221{
222 regs->eflags |= TF_MASK;
223 regs->eflags &= ~IF_MASK;
 224	/* single step inline if the instruction is an int3 */
225 if (p->opcode == BREAKPOINT_INSTRUCTION)
226 regs->eip = (unsigned long)p->addr;
227 else
228 regs->eip = (unsigned long)p->ainsn.insn;
229}
230
231/* Called with kretprobe_lock held */
232void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
233 struct pt_regs *regs)
234{
235 unsigned long *sara = (unsigned long *)&regs->esp;
236
237 ri->ret_addr = (kprobe_opcode_t *) *sara;
238
239 /* Replace the return addr with trampoline addr */
240 *sara = (unsigned long) &kretprobe_trampoline;
241}
242
243/*
244 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 245 * remain disabled throughout this function.
246 */
247static int __kprobes kprobe_handler(struct pt_regs *regs)
248{
249 struct kprobe *p;
250 int ret = 0;
251 kprobe_opcode_t *addr;
252 struct kprobe_ctlblk *kcb;
253
254 addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
255
256 /*
257 * We don't want to be preempted for the entire
258 * duration of kprobe processing
259 */
260 preempt_disable();
261 kcb = get_kprobe_ctlblk();
262
263 /* Check we're not actually recursing */
264 if (kprobe_running()) {
265 p = get_kprobe(addr);
266 if (p) {
267 if (kcb->kprobe_status == KPROBE_HIT_SS &&
268 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
269 regs->eflags &= ~TF_MASK;
270 regs->eflags |= kcb->kprobe_saved_eflags;
271 goto no_kprobe;
272 }
273 /* We have reentered the kprobe_handler(), since
274 * another probe was hit while within the handler.
275 * We here save the original kprobes variables and
276 * just single step on the instruction of the new probe
277 * without calling any user handlers.
278 */
279 save_previous_kprobe(kcb);
280 set_current_kprobe(p, regs, kcb);
281 kprobes_inc_nmissed_count(p);
282 prepare_singlestep(p, regs);
283 kcb->kprobe_status = KPROBE_REENTER;
284 return 1;
285 } else {
286 if (*addr != BREAKPOINT_INSTRUCTION) {
287 /* The breakpoint instruction was removed by
288 * another cpu right after we hit, no further
289 * handling of this interrupt is appropriate
290 */
291 regs->eip -= sizeof(kprobe_opcode_t);
292 ret = 1;
293 goto no_kprobe;
294 }
295 p = __get_cpu_var(current_kprobe);
296 if (p->break_handler && p->break_handler(p, regs)) {
297 goto ss_probe;
298 }
299 }
300 goto no_kprobe;
301 }
302
303 p = get_kprobe(addr);
304 if (!p) {
305 if (*addr != BREAKPOINT_INSTRUCTION) {
306 /*
307 * The breakpoint instruction was removed right
308 * after we hit it. Another cpu has removed
309 * either a probepoint or a debugger breakpoint
310 * at this address. In either case, no further
311 * handling of this interrupt is appropriate.
312 * Back up over the (now missing) int3 and run
313 * the original instruction.
314 */
315 regs->eip -= sizeof(kprobe_opcode_t);
316 ret = 1;
317 }
318 /* Not one of ours: let kernel handle it */
319 goto no_kprobe;
320 }
321
322 set_current_kprobe(p, regs, kcb);
323 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
324
325 if (p->pre_handler && p->pre_handler(p, regs))
326 /* handler has already set things up, so skip ss setup */
327 return 1;
328
329ss_probe:
330#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
331 if (p->ainsn.boostable == 1 && !p->post_handler){
332 /* Boost up -- we can execute copied instructions directly */
333 reset_current_kprobe();
334 regs->eip = (unsigned long)p->ainsn.insn;
335 preempt_enable_no_resched();
336 return 1;
337 }
338#endif
339 prepare_singlestep(p, regs);
340 kcb->kprobe_status = KPROBE_HIT_SS;
341 return 1;
342
343no_kprobe:
344 preempt_enable_no_resched();
345 return ret;
346}
347
348/*
349 * For function-return probes, init_kprobes() establishes a probepoint
350 * here. When a retprobed function returns, this probe is hit and
351 * trampoline_probe_handler() runs, calling the kretprobe's handler.
352 */
353 void __kprobes kretprobe_trampoline_holder(void)
354 {
355 asm volatile ( ".global kretprobe_trampoline\n"
356 "kretprobe_trampoline: \n"
357 " pushf\n"
358 /* skip cs, eip, orig_eax */
359 " subl $12, %esp\n"
360 " pushl %fs\n"
361 " pushl %ds\n"
362 " pushl %es\n"
363 " pushl %eax\n"
364 " pushl %ebp\n"
365 " pushl %edi\n"
366 " pushl %esi\n"
367 " pushl %edx\n"
368 " pushl %ecx\n"
369 " pushl %ebx\n"
370 " movl %esp, %eax\n"
371 " call trampoline_handler\n"
372 /* move eflags to cs */
373 " movl 52(%esp), %edx\n"
374 " movl %edx, 48(%esp)\n"
375 /* save true return address on eflags */
376 " movl %eax, 52(%esp)\n"
377 " popl %ebx\n"
378 " popl %ecx\n"
379 " popl %edx\n"
380 " popl %esi\n"
381 " popl %edi\n"
382 " popl %ebp\n"
383 " popl %eax\n"
384 /* skip eip, orig_eax, es, ds, fs */
385 " addl $20, %esp\n"
386 " popf\n"
387 " ret\n");
388}
389
390/*
391 * Called from kretprobe_trampoline
392 */
393fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
394{
395 struct kretprobe_instance *ri = NULL;
396 struct hlist_head *head, empty_rp;
397 struct hlist_node *node, *tmp;
398 unsigned long flags, orig_ret_address = 0;
399 unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
400
401 INIT_HLIST_HEAD(&empty_rp);
402 spin_lock_irqsave(&kretprobe_lock, flags);
403 head = kretprobe_inst_table_head(current);
404 /* fixup registers */
405 regs->xcs = __KERNEL_CS | get_kernel_rpl();
406 regs->eip = trampoline_address;
407 regs->orig_eax = 0xffffffff;
408
409 /*
410 * It is possible to have multiple instances associated with a given
 411	 * task either because multiple functions in the call path
 412	 * have a return probe installed on them, and/or more than one
 413	 * return probe was registered for a target function.
414 *
415 * We can handle this because:
416 * - instances are always inserted at the head of the list
417 * - when multiple return probes are registered for the same
418 * function, the first instance's ret_addr will point to the
419 * real return address, and all the rest will point to
420 * kretprobe_trampoline
421 */
422 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
423 if (ri->task != current)
424 /* another task is sharing our hash bucket */
425 continue;
426
427 if (ri->rp && ri->rp->handler){
428 __get_cpu_var(current_kprobe) = &ri->rp->kp;
429 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
430 ri->rp->handler(ri, regs);
431 __get_cpu_var(current_kprobe) = NULL;
432 }
433
434 orig_ret_address = (unsigned long)ri->ret_addr;
435 recycle_rp_inst(ri, &empty_rp);
436
437 if (orig_ret_address != trampoline_address)
438 /*
439 * This is the real return address. Any other
440 * instances associated with this task are for
441 * other calls deeper on the call stack
442 */
443 break;
444 }
445
446 kretprobe_assert(ri, orig_ret_address, trampoline_address);
447 spin_unlock_irqrestore(&kretprobe_lock, flags);
448
449 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
450 hlist_del(&ri->hlist);
451 kfree(ri);
452 }
453 return (void*)orig_ret_address;
454}
455
456/*
457 * Called after single-stepping. p->addr is the address of the
458 * instruction whose first byte has been replaced by the "int 3"
459 * instruction. To avoid the SMP problems that can occur when we
460 * temporarily put back the original opcode to single-step, we
461 * single-stepped a copy of the instruction. The address of this
462 * copy is p->ainsn.insn.
463 *
464 * This function prepares to return from the post-single-step
465 * interrupt. We have to fix up the stack as follows:
466 *
467 * 0) Except in the case of absolute or indirect jump or call instructions,
468 * the new eip is relative to the copied instruction. We need to make
469 * it relative to the original instruction.
470 *
471 * 1) If the single-stepped instruction was pushfl, then the TF and IF
472 * flags are set in the just-pushed eflags, and may need to be cleared.
473 *
474 * 2) If the single-stepped instruction was a call, the return address
475 * that is atop the stack is the address following the copied instruction.
476 * We need to make it the address following the original instruction.
477 *
478 * This function also checks instruction size for preparing direct execution.
479 */
480static void __kprobes resume_execution(struct kprobe *p,
481 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
482{
483 unsigned long *tos = (unsigned long *)&regs->esp;
484 unsigned long copy_eip = (unsigned long)p->ainsn.insn;
485 unsigned long orig_eip = (unsigned long)p->addr;
486
487 regs->eflags &= ~TF_MASK;
488 switch (p->ainsn.insn[0]) {
489 case 0x9c: /* pushfl */
490 *tos &= ~(TF_MASK | IF_MASK);
491 *tos |= kcb->kprobe_old_eflags;
492 break;
493 case 0xc2: /* iret/ret/lret */
494 case 0xc3:
495 case 0xca:
496 case 0xcb:
497 case 0xcf:
498 case 0xea: /* jmp absolute -- eip is correct */
499 /* eip is already adjusted, no more changes required */
500 p->ainsn.boostable = 1;
501 goto no_change;
502 case 0xe8: /* call relative - Fix return addr */
503 *tos = orig_eip + (*tos - copy_eip);
504 break;
505 case 0x9a: /* call absolute -- same as call absolute, indirect */
506 *tos = orig_eip + (*tos - copy_eip);
507 goto no_change;
508 case 0xff:
509 if ((p->ainsn.insn[1] & 0x30) == 0x10) {
510 /*
511 * call absolute, indirect
512 * Fix return addr; eip is correct.
513 * But this is not boostable
514 */
515 *tos = orig_eip + (*tos - copy_eip);
516 goto no_change;
517 } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
518 ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
519 /* eip is correct. And this is boostable */
520 p->ainsn.boostable = 1;
521 goto no_change;
522 }
523 default:
524 break;
525 }
526
527 if (p->ainsn.boostable == 0) {
528 if ((regs->eip > copy_eip) &&
529 (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
530 /*
531 * These instructions can be executed directly if it
532 * jumps back to correct address.
533 */
534 set_jmp_op((void *)regs->eip,
535 (void *)orig_eip + (regs->eip - copy_eip));
536 p->ainsn.boostable = 1;
537 } else {
538 p->ainsn.boostable = -1;
539 }
540 }
541
542 regs->eip = orig_eip + (regs->eip - copy_eip);
543
544no_change:
545 return;
546}
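/*
 * Worked example for the "call relative" fixup above: with the probed
 * call at orig_eip = 0xc0101000 and its copy in the insn slot at
 * copy_eip = 0xd0800000 (addresses are illustrative), the single-stepped
 * copy pushed *tos = 0xd0800005 (end of the 5-byte call in the slot).
 * Rewriting *tos = orig_eip + (*tos - copy_eip) = 0xc0101005 makes the
 * callee return to the instruction following the original call site.
 */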
547
548/*
549 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
 550 * remain disabled throughout this function.
551 */
552static int __kprobes post_kprobe_handler(struct pt_regs *regs)
553{
554 struct kprobe *cur = kprobe_running();
555 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
556
557 if (!cur)
558 return 0;
559
560 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
561 kcb->kprobe_status = KPROBE_HIT_SSDONE;
562 cur->post_handler(cur, regs, 0);
563 }
564
565 resume_execution(cur, regs, kcb);
566 regs->eflags |= kcb->kprobe_saved_eflags;
567 trace_hardirqs_fixup_flags(regs->eflags);
568
 569	/* Restore the original saved kprobes variables and continue. */
570 if (kcb->kprobe_status == KPROBE_REENTER) {
571 restore_previous_kprobe(kcb);
572 goto out;
573 }
574 reset_current_kprobe();
575out:
576 preempt_enable_no_resched();
577
578 /*
579 * if somebody else is singlestepping across a probe point, eflags
580 * will have TF set, in which case, continue the remaining processing
581 * of do_debug, as if this is not a probe hit.
582 */
583 if (regs->eflags & TF_MASK)
584 return 0;
585
586 return 1;
587}
588
589int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
590{
591 struct kprobe *cur = kprobe_running();
592 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
593
594 switch(kcb->kprobe_status) {
595 case KPROBE_HIT_SS:
596 case KPROBE_REENTER:
597 /*
598 * We are here because the instruction being single
599 * stepped caused a page fault. We reset the current
600 * kprobe and the eip points back to the probe address
601 * and allow the page fault handler to continue as a
602 * normal page fault.
603 */
604 regs->eip = (unsigned long)cur->addr;
605 regs->eflags |= kcb->kprobe_old_eflags;
606 if (kcb->kprobe_status == KPROBE_REENTER)
607 restore_previous_kprobe(kcb);
608 else
609 reset_current_kprobe();
610 preempt_enable_no_resched();
611 break;
612 case KPROBE_HIT_ACTIVE:
613 case KPROBE_HIT_SSDONE:
614 /*
615 * We increment the nmissed count for accounting,
 616		 * we can also use npre/npostfault count for accounting
617 * these specific fault cases.
618 */
619 kprobes_inc_nmissed_count(cur);
620
621 /*
622 * We come here because instructions in the pre/post
623 * handler caused the page_fault, this could happen
624 * if handler tries to access user space by
625 * copy_from_user(), get_user() etc. Let the
626 * user-specified handler try to fix it first.
627 */
628 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
629 return 1;
630
631 /*
632 * In case the user-specified fault handler returned
633 * zero, try to fix up.
634 */
635 if (fixup_exception(regs))
636 return 1;
637
638 /*
 639		 * fixup_exception() could not handle it;
 640		 * let do_page_fault() fix it.
641 */
642 break;
643 default:
644 break;
645 }
646 return 0;
647}
648
649/*
 650 * Wrapper routine for handling exceptions.
651 */
652int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
653 unsigned long val, void *data)
654{
655 struct die_args *args = (struct die_args *)data;
656 int ret = NOTIFY_DONE;
657
658 if (args->regs && user_mode_vm(args->regs))
659 return ret;
660
661 switch (val) {
662 case DIE_INT3:
663 if (kprobe_handler(args->regs))
664 ret = NOTIFY_STOP;
665 break;
666 case DIE_DEBUG:
667 if (post_kprobe_handler(args->regs))
668 ret = NOTIFY_STOP;
669 break;
670 case DIE_GPF:
671 /* kprobe_running() needs smp_processor_id() */
672 preempt_disable();
673 if (kprobe_running() &&
674 kprobe_fault_handler(args->regs, args->trapnr))
675 ret = NOTIFY_STOP;
676 preempt_enable();
677 break;
678 default:
679 break;
680 }
681 return ret;
682}
683
684int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
685{
686 struct jprobe *jp = container_of(p, struct jprobe, kp);
687 unsigned long addr;
688 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
689
690 kcb->jprobe_saved_regs = *regs;
691 kcb->jprobe_saved_esp = &regs->esp;
692 addr = (unsigned long)(kcb->jprobe_saved_esp);
693
694 /*
695 * TBD: As Linus pointed out, gcc assumes that the callee
696 * owns the argument space and could overwrite it, e.g.
697 * tailcall optimization. So, to be absolutely safe
698 * we also save and restore enough stack bytes to cover
699 * the argument area.
700 */
701 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
702 MIN_STACK_SIZE(addr));
703 regs->eflags &= ~IF_MASK;
704 trace_hardirqs_off();
705 regs->eip = (unsigned long)(jp->entry);
706 return 1;
707}
708
709void __kprobes jprobe_return(void)
710{
711 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
712
713 asm volatile (" xchgl %%ebx,%%esp \n"
714 " int3 \n"
715 " .globl jprobe_return_end \n"
716 " jprobe_return_end: \n"
717 " nop \n"::"b"
718 (kcb->jprobe_saved_esp):"memory");
719}
720
721int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
722{
723 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
724 u8 *addr = (u8 *) (regs->eip - 1);
725 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
726 struct jprobe *jp = container_of(p, struct jprobe, kp);
727
728 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
729 if (&regs->esp != kcb->jprobe_saved_esp) {
730 struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
731 printk("current esp %p does not match saved esp %p\n",
732 &regs->esp, kcb->jprobe_saved_esp);
733 printk("Saved registers for jprobe %p\n", jp);
734 show_registers(saved_regs);
735 printk("Current registers\n");
736 show_registers(regs);
737 BUG();
738 }
739 *regs = kcb->jprobe_saved_regs;
740 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
741 MIN_STACK_SIZE(stack_addr));
742 preempt_enable_no_resched();
743 return 1;
744 }
745 return 0;
746}
747
748int __kprobes arch_trampoline_kprobe(struct kprobe *p)
749{
750 return 0;
751}
752
753int __init arch_init_kprobes(void)
754{
755 return 0;
756}
diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c
deleted file mode 100644
index 5df19a9f923..00000000000
--- a/arch/x86/kernel/kprobes_64.c
+++ /dev/null
@@ -1,749 +0,0 @@
1/*
2 * Kernel Probes (KProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 *
20 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
21 * Probes initial implementation ( includes contributions from
22 * Rusty Russell).
23 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
24 * interface to access function arguments.
25 * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
26 * <prasanna@in.ibm.com> adapted for x86_64
27 * 2005-Mar Roland McGrath <roland@redhat.com>
28 * Fixed to handle %rip-relative addressing mode correctly.
29 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
30 * Added function return probes functionality
31 */
32
33#include <linux/kprobes.h>
34#include <linux/ptrace.h>
35#include <linux/string.h>
36#include <linux/slab.h>
37#include <linux/preempt.h>
38#include <linux/module.h>
39#include <linux/kdebug.h>
40
41#include <asm/pgtable.h>
42#include <asm/uaccess.h>
43#include <asm/alternative.h>
44
45void jprobe_return_end(void);
46static void __kprobes arch_copy_kprobe(struct kprobe *p);
47
48DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
49DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
50
51struct kretprobe_blackpoint kretprobe_blacklist[] = {
52 {"__switch_to", }, /* This function switches only current task, but
53 doesn't switch kernel stack.*/
54 {NULL, NULL} /* Terminator */
55};
56const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
57
58/*
59 * returns non-zero if opcode modifies the interrupt flag.
60 */
61static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
62{
63 switch (*insn) {
64 case 0xfa: /* cli */
65 case 0xfb: /* sti */
66 case 0xcf: /* iret/iretd */
67 case 0x9d: /* popf/popfd */
68 return 1;
69 }
70
71 if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
72 return 1;
73 return 0;
74}
75
76int __kprobes arch_prepare_kprobe(struct kprobe *p)
77{
78 /* insn: must be on special executable page on x86_64. */
79 p->ainsn.insn = get_insn_slot();
80 if (!p->ainsn.insn) {
81 return -ENOMEM;
82 }
83 arch_copy_kprobe(p);
84 return 0;
85}
86
87/*
88 * Determine if the instruction uses the %rip-relative addressing mode.
89 * If it does, return the address of the 32-bit displacement word.
90 * If not, return null.
91 */
92static s32 __kprobes *is_riprel(u8 *insn)
93{
94#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
95 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
96 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
97 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
98 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
99 << (row % 64))
100 static const u64 onebyte_has_modrm[256 / 64] = {
101 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
102 /* ------------------------------- */
103 W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
104 W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
105 W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
106 W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
107 W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
108 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
109 W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
110 W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
111 W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
112 W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
113 W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
114 W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
115 W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
116 W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
117 W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
118 W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
119 /* ------------------------------- */
120 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
121 };
122 static const u64 twobyte_has_modrm[256 / 64] = {
123 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
124 /* ------------------------------- */
125 W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
126 W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
127 W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
128 W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
129 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
130 W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
131 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
132 W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
133 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
134 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
135 W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
136 W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
137 W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
138 W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
139 W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
140 W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
141 /* ------------------------------- */
142 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
143 };
144#undef W
145 int need_modrm;
146
147 /* Skip legacy instruction prefixes. */
148 while (1) {
149 switch (*insn) {
150 case 0x66:
151 case 0x67:
152 case 0x2e:
153 case 0x3e:
154 case 0x26:
155 case 0x64:
156 case 0x65:
157 case 0x36:
158 case 0xf0:
159 case 0xf3:
160 case 0xf2:
161 ++insn;
162 continue;
163 }
164 break;
165 }
166
167 /* Skip REX instruction prefix. */
168 if ((*insn & 0xf0) == 0x40)
169 ++insn;
170
171 if (*insn == 0x0f) { /* Two-byte opcode. */
172 ++insn;
173 need_modrm = test_bit(*insn, twobyte_has_modrm);
174 } else { /* One-byte opcode. */
175 need_modrm = test_bit(*insn, onebyte_has_modrm);
176 }
177
178 if (need_modrm) {
179 u8 modrm = *++insn;
180 if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
181 /* Displacement follows ModRM byte. */
182 return (s32 *) ++insn;
183 }
184 }
185
186 /* No %rip-relative addressing mode here. */
187 return NULL;
188}
189
190static void __kprobes arch_copy_kprobe(struct kprobe *p)
191{
192 s32 *ripdisp;
193 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
194 ripdisp = is_riprel(p->ainsn.insn);
195 if (ripdisp) {
196 /*
197 * The copied instruction uses the %rip-relative
198 * addressing mode. Adjust the displacement for the
199 * difference between the original location of this
200 * instruction and the location of the copy that will
201 * actually be run. The tricky bit here is making sure
202 * that the sign extension happens correctly in this
203 * calculation, since we need a signed 32-bit result to
204 * be sign-extended to 64 bits when it's added to the
205 * %rip value and yield the same 64-bit result that the
206 * sign-extension of the original signed 32-bit
207 * displacement would have given.
208 */
209 s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
210 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
211 *ripdisp = disp;
212 }
213 p->opcode = *p->addr;
214}
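/*
 * Worked example for the displacement fixup above: "mov 0x1234(%rip),%rax"
 * encodes as 48 8b 05 34 12 00 00, and is_riprel() returns a pointer to
 * the 32-bit displacement bytes (34 12 00 00). With the original
 * instruction at p->addr and its copy at p->ainsn.insn, the new
 * displacement is
 *
 *	disp = p->addr + 0x1234 - p->ainsn.insn
 *
 * so that "copy + disp" still resolves to the same absolute target; the
 * common instruction length cancels out of the calculation.
 */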
215
216void __kprobes arch_arm_kprobe(struct kprobe *p)
217{
218 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
219}
220
221void __kprobes arch_disarm_kprobe(struct kprobe *p)
222{
223 text_poke(p->addr, &p->opcode, 1);
224}
225
226void __kprobes arch_remove_kprobe(struct kprobe *p)
227{
228 mutex_lock(&kprobe_mutex);
229 free_insn_slot(p->ainsn.insn, 0);
230 mutex_unlock(&kprobe_mutex);
231}
232
233static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
234{
235 kcb->prev_kprobe.kp = kprobe_running();
236 kcb->prev_kprobe.status = kcb->kprobe_status;
237 kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
238 kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
239}
240
241static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
242{
243 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
244 kcb->kprobe_status = kcb->prev_kprobe.status;
245 kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
246 kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
247}
248
249static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
250 struct kprobe_ctlblk *kcb)
251{
252 __get_cpu_var(current_kprobe) = p;
253 kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
254 = (regs->eflags & (TF_MASK | IF_MASK));
255 if (is_IF_modifier(p->ainsn.insn))
256 kcb->kprobe_saved_rflags &= ~IF_MASK;
257}
258
259static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
260{
261 regs->eflags |= TF_MASK;
262 regs->eflags &= ~IF_MASK;
263 /*single step inline if the instruction is an int3*/
264 if (p->opcode == BREAKPOINT_INSTRUCTION)
265 regs->rip = (unsigned long)p->addr;
266 else
267 regs->rip = (unsigned long)p->ainsn.insn;
268}
269
270/* Called with kretprobe_lock held */
271void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
272 struct pt_regs *regs)
273{
274 unsigned long *sara = (unsigned long *)regs->rsp;
275
276 ri->ret_addr = (kprobe_opcode_t *) *sara;
277 /* Replace the return addr with trampoline addr */
278 *sara = (unsigned long) &kretprobe_trampoline;
279}
280
281int __kprobes kprobe_handler(struct pt_regs *regs)
282{
283 struct kprobe *p;
284 int ret = 0;
285 kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
286 struct kprobe_ctlblk *kcb;
287
288 /*
289 * We don't want to be preempted for the entire
290 * duration of kprobe processing
291 */
292 preempt_disable();
293 kcb = get_kprobe_ctlblk();
294
295 /* Check we're not actually recursing */
296 if (kprobe_running()) {
297 p = get_kprobe(addr);
298 if (p) {
299 if (kcb->kprobe_status == KPROBE_HIT_SS &&
300 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
301 regs->eflags &= ~TF_MASK;
302 regs->eflags |= kcb->kprobe_saved_rflags;
303 goto no_kprobe;
304 } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
305 /* TODO: Provide re-entrancy from
306 * post_kprobes_handler() and avoid exception
307 * stack corruption while single-stepping on
308 * the instruction of the new probe.
309 */
310 arch_disarm_kprobe(p);
311 regs->rip = (unsigned long)p->addr;
312 reset_current_kprobe();
313 ret = 1;
314 } else {
315 /* We have reentered the kprobe_handler(), since
316 * another probe was hit while within the
317 * handler. We here save the original kprobe
318 * variables and just single step on instruction
319 * of the new probe without calling any user
320 * handlers.
321 */
322 save_previous_kprobe(kcb);
323 set_current_kprobe(p, regs, kcb);
324 kprobes_inc_nmissed_count(p);
325 prepare_singlestep(p, regs);
326 kcb->kprobe_status = KPROBE_REENTER;
327 return 1;
328 }
329 } else {
330 if (*addr != BREAKPOINT_INSTRUCTION) {
331 /* The breakpoint instruction was removed by
332 * another cpu right after we hit, no further
333 * handling of this interrupt is appropriate
334 */
335 regs->rip = (unsigned long)addr;
336 ret = 1;
337 goto no_kprobe;
338 }
339 p = __get_cpu_var(current_kprobe);
340 if (p->break_handler && p->break_handler(p, regs)) {
341 goto ss_probe;
342 }
343 }
344 goto no_kprobe;
345 }
346
347 p = get_kprobe(addr);
348 if (!p) {
349 if (*addr != BREAKPOINT_INSTRUCTION) {
350 /*
351 * The breakpoint instruction was removed right
352 * after we hit it. Another cpu has removed
353 * either a probepoint or a debugger breakpoint
354 * at this address. In either case, no further
355 * handling of this interrupt is appropriate.
356 * Back up over the (now missing) int3 and run
357 * the original instruction.
358 */
359 regs->rip = (unsigned long)addr;
360 ret = 1;
361 }
362 /* Not one of ours: let kernel handle it */
363 goto no_kprobe;
364 }
365
366 set_current_kprobe(p, regs, kcb);
367 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
368
369 if (p->pre_handler && p->pre_handler(p, regs))
370 /* handler has already set things up, so skip ss setup */
371 return 1;
372
373ss_probe:
374 prepare_singlestep(p, regs);
375 kcb->kprobe_status = KPROBE_HIT_SS;
376 return 1;
377
378no_kprobe:
379 preempt_enable_no_resched();
380 return ret;
381}
382
383/*
384 * For function-return probes, init_kprobes() establishes a probepoint
385 * here. When a retprobed function returns, this probe is hit and
386 * trampoline_probe_handler() runs, calling the kretprobe's handler.
387 */
388 void kretprobe_trampoline_holder(void)
389 {
390 asm volatile ( ".global kretprobe_trampoline\n"
391 "kretprobe_trampoline: \n"
392 "nop\n");
393 }
394
395/*
396 * Called when we hit the probe point at kretprobe_trampoline
397 */
398int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
399{
400 struct kretprobe_instance *ri = NULL;
401 struct hlist_head *head, empty_rp;
402 struct hlist_node *node, *tmp;
403 unsigned long flags, orig_ret_address = 0;
404 unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
405
406 INIT_HLIST_HEAD(&empty_rp);
407 spin_lock_irqsave(&kretprobe_lock, flags);
408 head = kretprobe_inst_table_head(current);
409
410 /*
411 * It is possible to have multiple instances associated with a given
 412	 * task either because multiple functions in the call path
 413	 * have a return probe installed on them, and/or more than one
 414	 * return probe was registered for a target function.
415 *
416 * We can handle this because:
417 * - instances are always inserted at the head of the list
418 * - when multiple return probes are registered for the same
419 * function, the first instance's ret_addr will point to the
420 * real return address, and all the rest will point to
421 * kretprobe_trampoline
422 */
423 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
424 if (ri->task != current)
425 /* another task is sharing our hash bucket */
426 continue;
427
428 if (ri->rp && ri->rp->handler)
429 ri->rp->handler(ri, regs);
430
431 orig_ret_address = (unsigned long)ri->ret_addr;
432 recycle_rp_inst(ri, &empty_rp);
433
434 if (orig_ret_address != trampoline_address)
435 /*
436 * This is the real return address. Any other
437 * instances associated with this task are for
438 * other calls deeper on the call stack
439 */
440 break;
441 }
442
443 kretprobe_assert(ri, orig_ret_address, trampoline_address);
444 regs->rip = orig_ret_address;
445
446 reset_current_kprobe();
447 spin_unlock_irqrestore(&kretprobe_lock, flags);
448 preempt_enable_no_resched();
449
450 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
451 hlist_del(&ri->hlist);
452 kfree(ri);
453 }
454 /*
455 * By returning a non-zero value, we are telling
456 * kprobe_handler() that we don't want the post_handler
457 * to run (and have re-enabled preemption)
458 */
459 return 1;
460}
461
462/*
463 * Called after single-stepping. p->addr is the address of the
464 * instruction whose first byte has been replaced by the "int 3"
465 * instruction. To avoid the SMP problems that can occur when we
466 * temporarily put back the original opcode to single-step, we
467 * single-stepped a copy of the instruction. The address of this
468 * copy is p->ainsn.insn.
469 *
470 * This function prepares to return from the post-single-step
471 * interrupt. We have to fix up the stack as follows:
472 *
473 * 0) Except in the case of absolute or indirect jump or call instructions,
474 * the new rip is relative to the copied instruction. We need to make
475 * it relative to the original instruction.
476 *
477 * 1) If the single-stepped instruction was pushfl, then the TF and IF
478 * flags are set in the just-pushed eflags, and may need to be cleared.
479 *
480 * 2) If the single-stepped instruction was a call, the return address
481 * that is atop the stack is the address following the copied instruction.
482 * We need to make it the address following the original instruction.
483 */
484static void __kprobes resume_execution(struct kprobe *p,
485 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
486{
487 unsigned long *tos = (unsigned long *)regs->rsp;
488 unsigned long copy_rip = (unsigned long)p->ainsn.insn;
489 unsigned long orig_rip = (unsigned long)p->addr;
490 kprobe_opcode_t *insn = p->ainsn.insn;
491
492 /*skip the REX prefix*/
493 if (*insn >= 0x40 && *insn <= 0x4f)
494 insn++;
495
496 regs->eflags &= ~TF_MASK;
497 switch (*insn) {
498 case 0x9c: /* pushfl */
499 *tos &= ~(TF_MASK | IF_MASK);
500 *tos |= kcb->kprobe_old_rflags;
501 break;
502 case 0xc2: /* iret/ret/lret */
503 case 0xc3:
504 case 0xca:
505 case 0xcb:
506 case 0xcf:
507 case 0xea: /* jmp absolute -- ip is correct */
508 /* ip is already adjusted, no more changes required */
509 goto no_change;
510 case 0xe8: /* call relative - Fix return addr */
511 *tos = orig_rip + (*tos - copy_rip);
512 break;
513 case 0xff:
514 if ((insn[1] & 0x30) == 0x10) {
515 /* call absolute, indirect */
516 /* Fix return addr; ip is correct. */
517 *tos = orig_rip + (*tos - copy_rip);
518 goto no_change;
519 } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
520 ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
521 /* ip is correct. */
522 goto no_change;
523 }
524 default:
525 break;
526 }
527
528 regs->rip = orig_rip + (regs->rip - copy_rip);
529no_change:
530
531 return;
532}
533
534int __kprobes post_kprobe_handler(struct pt_regs *regs)
535{
536 struct kprobe *cur = kprobe_running();
537 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
538
539 if (!cur)
540 return 0;
541
542 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
543 kcb->kprobe_status = KPROBE_HIT_SSDONE;
544 cur->post_handler(cur, regs, 0);
545 }
546
547 resume_execution(cur, regs, kcb);
548 regs->eflags |= kcb->kprobe_saved_rflags;
549 trace_hardirqs_fixup_flags(regs->eflags);
550
551 /* Restore the original saved kprobes variables and continue. */
552 if (kcb->kprobe_status == KPROBE_REENTER) {
553 restore_previous_kprobe(kcb);
554 goto out;
555 }
556 reset_current_kprobe();
557out:
558 preempt_enable_no_resched();
559
560 /*
561 * if somebody else is singlestepping across a probe point, eflags
562 * will have TF set, in which case, continue the remaining processing
563 * of do_debug, as if this is not a probe hit.
564 */
565 if (regs->eflags & TF_MASK)
566 return 0;
567
568 return 1;
569}
570
571int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
572{
573 struct kprobe *cur = kprobe_running();
574 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
575 const struct exception_table_entry *fixup;
576
577 switch(kcb->kprobe_status) {
578 case KPROBE_HIT_SS:
579 case KPROBE_REENTER:
580 /*
581 * We are here because the instruction being single
582 * stepped caused a page fault. We reset the current
583 * kprobe and the rip points back to the probe address
584 * and allow the page fault handler to continue as a
585 * normal page fault.
586 */
587 regs->rip = (unsigned long)cur->addr;
588 regs->eflags |= kcb->kprobe_old_rflags;
589 if (kcb->kprobe_status == KPROBE_REENTER)
590 restore_previous_kprobe(kcb);
591 else
592 reset_current_kprobe();
593 preempt_enable_no_resched();
594 break;
595 case KPROBE_HIT_ACTIVE:
596 case KPROBE_HIT_SSDONE:
597 /*
598 * We increment the nmissed count for accounting,
 599		 * we can also use npre/npostfault count for accounting
600 * these specific fault cases.
601 */
602 kprobes_inc_nmissed_count(cur);
603
604 /*
605 * We come here because instructions in the pre/post
606 * handler caused the page_fault, this could happen
607 * if handler tries to access user space by
608 * copy_from_user(), get_user() etc. Let the
609 * user-specified handler try to fix it first.
610 */
611 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
612 return 1;
613
614 /*
615 * In case the user-specified fault handler returned
616 * zero, try to fix up.
617 */
618 fixup = search_exception_tables(regs->rip);
619 if (fixup) {
620 regs->rip = fixup->fixup;
621 return 1;
622 }
623
624 /*
 625		 * fixup() could not handle it;
 626		 * let do_page_fault() fix it.
627 */
628 break;
629 default:
630 break;
631 }
632 return 0;
633}
634
635/*
636 * Wrapper routine for handling exceptions.
637 */
638int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
639 unsigned long val, void *data)
640{
641 struct die_args *args = (struct die_args *)data;
642 int ret = NOTIFY_DONE;
643
644 if (args->regs && user_mode(args->regs))
645 return ret;
646
647 switch (val) {
648 case DIE_INT3:
649 if (kprobe_handler(args->regs))
650 ret = NOTIFY_STOP;
651 break;
652 case DIE_DEBUG:
653 if (post_kprobe_handler(args->regs))
654 ret = NOTIFY_STOP;
655 break;
656 case DIE_GPF:
657 /* kprobe_running() needs smp_processor_id() */
658 preempt_disable();
659 if (kprobe_running() &&
660 kprobe_fault_handler(args->regs, args->trapnr))
661 ret = NOTIFY_STOP;
662 preempt_enable();
663 break;
664 default:
665 break;
666 }
667 return ret;
668}
669
670int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
671{
672 struct jprobe *jp = container_of(p, struct jprobe, kp);
673 unsigned long addr;
674 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
675
676 kcb->jprobe_saved_regs = *regs;
677 kcb->jprobe_saved_rsp = (long *) regs->rsp;
678 addr = (unsigned long)(kcb->jprobe_saved_rsp);
679 /*
680 * As Linus pointed out, gcc assumes that the callee
681 * owns the argument space and could overwrite it, e.g.
682 * tailcall optimization. So, to be absolutely safe
683 * we also save and restore enough stack bytes to cover
684 * the argument area.
685 */
686 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
687 MIN_STACK_SIZE(addr));
688 regs->eflags &= ~IF_MASK;
689 trace_hardirqs_off();
690 regs->rip = (unsigned long)(jp->entry);
691 return 1;
692}
693
694void __kprobes jprobe_return(void)
695{
696 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
697
698 asm volatile (" xchg %%rbx,%%rsp \n"
699 " int3 \n"
700 " .globl jprobe_return_end \n"
701 " jprobe_return_end: \n"
702 " nop \n"::"b"
703 (kcb->jprobe_saved_rsp):"memory");
704}
705
706int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
707{
708 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
709 u8 *addr = (u8 *) (regs->rip - 1);
710 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
711 struct jprobe *jp = container_of(p, struct jprobe, kp);
712
713 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
714 if ((unsigned long *)regs->rsp != kcb->jprobe_saved_rsp) {
715 struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
716 printk("current rsp %p does not match saved rsp %p\n",
717 (long *)regs->rsp, kcb->jprobe_saved_rsp);
718 printk("Saved registers for jprobe %p\n", jp);
719 show_registers(saved_regs);
720 printk("Current registers\n");
721 show_registers(regs);
722 BUG();
723 }
724 *regs = kcb->jprobe_saved_regs;
725 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
726 MIN_STACK_SIZE(stack_addr));
727 preempt_enable_no_resched();
728 return 1;
729 }
730 return 0;
731}
732
733static struct kprobe trampoline_p = {
734 .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
735 .pre_handler = trampoline_probe_handler
736};
737
738int __init arch_init_kprobes(void)
739{
740 return register_kprobe(&trampoline_p);
741}
742
743int __kprobes arch_trampoline_kprobe(struct kprobe *p)
744{
745 if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
746 return 1;
747
748 return 0;
749}
diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt.c
index 9ff90a27c45..0224c3637c7 100644
--- a/arch/x86/kernel/ldt_32.c
+++ b/arch/x86/kernel/ldt.c
@@ -1,6 +1,9 @@
1/* 1/*
2 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds 2 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> 3 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4 * Copyright (C) 2002 Andi Kleen
5 *
6 * This handles calls from both 32bit and 64bit mode.
4 */ 7 */
5 8
6#include <linux/errno.h> 9#include <linux/errno.h>
@@ -9,7 +12,6 @@
9#include <linux/mm.h> 12#include <linux/mm.h>
10#include <linux/smp.h> 13#include <linux/smp.h>
11#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
12#include <linux/slab.h>
13 15
14#include <asm/uaccess.h> 16#include <asm/uaccess.h>
15#include <asm/system.h> 17#include <asm/system.h>
@@ -17,7 +19,7 @@
17#include <asm/desc.h> 19#include <asm/desc.h>
18#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
19 21
20#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ 22#ifdef CONFIG_SMP
21static void flush_ldt(void *null) 23static void flush_ldt(void *null)
22{ 24{
23 if (current->active_mm) 25 if (current->active_mm)
@@ -27,26 +29,32 @@ static void flush_ldt(void *null)
27 29
28static int alloc_ldt(mm_context_t *pc, int mincount, int reload) 30static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
29{ 31{
30 void *oldldt; 32 void *oldldt, *newldt;
31 void *newldt;
32 int oldsize; 33 int oldsize;
33 34
34 if (mincount <= pc->size) 35 if (mincount <= pc->size)
35 return 0; 36 return 0;
36 oldsize = pc->size; 37 oldsize = pc->size;
37 mincount = (mincount+511)&(~511); 38 mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
38 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) 39 (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
39 newldt = vmalloc(mincount*LDT_ENTRY_SIZE); 40 if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
41 newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
40 else 42 else
41 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); 43 newldt = (void *)__get_free_page(GFP_KERNEL);
42 44
43 if (!newldt) 45 if (!newldt)
44 return -ENOMEM; 46 return -ENOMEM;
45 47
46 if (oldsize) 48 if (oldsize)
47 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); 49 memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
48 oldldt = pc->ldt; 50 oldldt = pc->ldt;
49 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); 51 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
52 (mincount - oldsize) * LDT_ENTRY_SIZE);
53
54#ifdef CONFIG_X86_64
55 /* CHECKME: Do we really need this ? */
56 wmb();
57#endif
50 pc->ldt = newldt; 58 pc->ldt = newldt;
51 wmb(); 59 wmb();
52 pc->size = mincount; 60 pc->size = mincount;
@@ -55,6 +63,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
55 if (reload) { 63 if (reload) {
56#ifdef CONFIG_SMP 64#ifdef CONFIG_SMP
57 cpumask_t mask; 65 cpumask_t mask;
66
58 preempt_disable(); 67 preempt_disable();
59 load_LDT(pc); 68 load_LDT(pc);
60 mask = cpumask_of_cpu(smp_processor_id()); 69 mask = cpumask_of_cpu(smp_processor_id());
@@ -66,10 +75,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
66#endif 75#endif
67 } 76 }
68 if (oldsize) { 77 if (oldsize) {
69 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) 78 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
70 vfree(oldldt); 79 vfree(oldldt);
71 else 80 else
72 kfree(oldldt); 81 put_page(virt_to_page(oldldt));
73 } 82 }
74 return 0; 83 return 0;
75} 84}
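/*
 * With LDT_ENTRY_SIZE = 8 and PAGE_SIZE = 4096, PAGE_SIZE/LDT_ENTRY_SIZE
 * is 512, so the new expression rounds mincount up to a whole page of
 * entries exactly as the old (mincount+511)&(~511) did -- e.g. 1 -> 512,
 * 513 -> 1024 -- while making the page-sized granularity explicit.
 */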
@@ -77,9 +86,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
77static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 86static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
78{ 87{
79 int err = alloc_ldt(new, old->size, 0); 88 int err = alloc_ldt(new, old->size, 0);
89
80 if (err < 0) 90 if (err < 0)
81 return err; 91 return err;
82 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); 92 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
83 return 0; 93 return 0;
84} 94}
85 95
@@ -89,7 +99,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
89 */ 99 */
90int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 100int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
91{ 101{
92 struct mm_struct * old_mm; 102 struct mm_struct *old_mm;
93 int retval = 0; 103 int retval = 0;
94 104
95 mutex_init(&mm->context.lock); 105 mutex_init(&mm->context.lock);
@@ -105,33 +115,38 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
105 115
106/* 116/*
107 * No need to lock the MM as we are the last user 117 * No need to lock the MM as we are the last user
118 *
119 * 64bit: Don't touch the LDT register - we're already in the next thread.
108 */ 120 */
109void destroy_context(struct mm_struct *mm) 121void destroy_context(struct mm_struct *mm)
110{ 122{
111 if (mm->context.size) { 123 if (mm->context.size) {
124#ifdef CONFIG_X86_32
125 /* CHECKME: Can this ever happen ? */
112 if (mm == current->active_mm) 126 if (mm == current->active_mm)
113 clear_LDT(); 127 clear_LDT();
114 if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) 128#endif
129 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
115 vfree(mm->context.ldt); 130 vfree(mm->context.ldt);
116 else 131 else
117 kfree(mm->context.ldt); 132 put_page(virt_to_page(mm->context.ldt));
118 mm->context.size = 0; 133 mm->context.size = 0;
119 } 134 }
120} 135}
121 136
122static int read_ldt(void __user * ptr, unsigned long bytecount) 137static int read_ldt(void __user *ptr, unsigned long bytecount)
123{ 138{
124 int err; 139 int err;
125 unsigned long size; 140 unsigned long size;
126 struct mm_struct * mm = current->mm; 141 struct mm_struct *mm = current->mm;
127 142
128 if (!mm->context.size) 143 if (!mm->context.size)
129 return 0; 144 return 0;
130 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) 145 if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
131 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; 146 bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
132 147
133 mutex_lock(&mm->context.lock); 148 mutex_lock(&mm->context.lock);
134 size = mm->context.size*LDT_ENTRY_SIZE; 149 size = mm->context.size * LDT_ENTRY_SIZE;
135 if (size > bytecount) 150 if (size > bytecount)
136 size = bytecount; 151 size = bytecount;
137 152
@@ -143,7 +158,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
143 goto error_return; 158 goto error_return;
144 if (size != bytecount) { 159 if (size != bytecount) {
145 /* zero-fill the rest */ 160 /* zero-fill the rest */
146 if (clear_user(ptr+size, bytecount-size) != 0) { 161 if (clear_user(ptr + size, bytecount - size) != 0) {
147 err = -EFAULT; 162 err = -EFAULT;
148 goto error_return; 163 goto error_return;
149 } 164 }
@@ -153,34 +168,32 @@ error_return:
153 return err; 168 return err;
154} 169}
155 170
156static int read_default_ldt(void __user * ptr, unsigned long bytecount) 171static int read_default_ldt(void __user *ptr, unsigned long bytecount)
157{ 172{
158 int err; 173 /* CHECKME: Can we use _one_ random number ? */
159 unsigned long size; 174#ifdef CONFIG_X86_32
160 175 unsigned long size = 5 * sizeof(struct desc_struct);
161 err = 0; 176#else
162 size = 5*sizeof(struct desc_struct); 177 unsigned long size = 128;
163 if (size > bytecount) 178#endif
164 size = bytecount; 179 if (bytecount > size)
165 180 bytecount = size;
166 err = size; 181 if (clear_user(ptr, bytecount))
167 if (clear_user(ptr, size)) 182 return -EFAULT;
168 err = -EFAULT; 183 return bytecount;
169
170 return err;
171} 184}
172 185
173static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) 186static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
174{ 187{
175 struct mm_struct * mm = current->mm; 188 struct mm_struct *mm = current->mm;
176 __u32 entry_1, entry_2; 189 struct desc_struct ldt;
177 int error; 190 int error;
178 struct user_desc ldt_info; 191 struct user_desc ldt_info;
179 192
180 error = -EINVAL; 193 error = -EINVAL;
181 if (bytecount != sizeof(ldt_info)) 194 if (bytecount != sizeof(ldt_info))
182 goto out; 195 goto out;
183 error = -EFAULT; 196 error = -EFAULT;
184 if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) 197 if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
185 goto out; 198 goto out;
186 199
@@ -196,28 +209,27 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
196 209
197 mutex_lock(&mm->context.lock); 210 mutex_lock(&mm->context.lock);
198 if (ldt_info.entry_number >= mm->context.size) { 211 if (ldt_info.entry_number >= mm->context.size) {
199 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); 212 error = alloc_ldt(&current->mm->context,
213 ldt_info.entry_number + 1, 1);
200 if (error < 0) 214 if (error < 0)
201 goto out_unlock; 215 goto out_unlock;
202 } 216 }
203 217
204 /* Allow LDTs to be cleared by the user. */ 218 /* Allow LDTs to be cleared by the user. */
205 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { 219 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
206 if (oldmode || LDT_empty(&ldt_info)) { 220 if (oldmode || LDT_empty(&ldt_info)) {
207 entry_1 = 0; 221 memset(&ldt, 0, sizeof(ldt));
208 entry_2 = 0;
209 goto install; 222 goto install;
210 } 223 }
211 } 224 }
212 225
213 entry_1 = LDT_entry_a(&ldt_info); 226 fill_ldt(&ldt, &ldt_info);
214 entry_2 = LDT_entry_b(&ldt_info);
215 if (oldmode) 227 if (oldmode)
216 entry_2 &= ~(1 << 20); 228 ldt.avl = 0;
217 229
218 /* Install the new entry ... */ 230 /* Install the new entry ... */
219install: 231install:
220 write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); 232 write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
221 error = 0; 233 error = 0;
222 234
223out_unlock: 235out_unlock:
@@ -226,7 +238,8 @@ out:
226 return error; 238 return error;
227} 239}
228 240
229asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) 241asmlinkage int sys_modify_ldt(int func, void __user *ptr,
242 unsigned long bytecount)
230{ 243{
231 int ret = -ENOSYS; 244 int ret = -ENOSYS;
232 245
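For context, the unified ldt.c above still backs the modify_ldt(2) system call; a minimal user-space sketch of the read path (func 0), assuming an x86 Linux host with the UAPI <asm/ldt.h> header available, could look like this:

/* Minimal user-space sketch: dump the current task's LDT via modify_ldt(2).
 * func 0 maps to read_ldt() above; the return value is a byte count
 * (0 when no LDT has been installed). Assumes an x86 Linux host. */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>

int main(void)
{
	static unsigned char buf[LDT_ENTRIES * LDT_ENTRY_SIZE];
	long n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));

	if (n < 0) {
		perror("modify_ldt");
		return 1;
	}
	printf("read %ld bytes of LDT data\n", n);
	return 0;
}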
diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c
deleted file mode 100644
index 60e57abb8e9..00000000000
--- a/arch/x86/kernel/ldt_64.c
+++ /dev/null
@@ -1,250 +0,0 @@
1/*
2 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4 * Copyright (C) 2002 Andi Kleen
5 *
6 * This handles calls from both 32bit and 64bit mode.
7 */
8
9#include <linux/errno.h>
10#include <linux/sched.h>
11#include <linux/string.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/vmalloc.h>
15#include <linux/slab.h>
16
17#include <asm/uaccess.h>
18#include <asm/system.h>
19#include <asm/ldt.h>
20#include <asm/desc.h>
21#include <asm/proto.h>
22
23#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
24static void flush_ldt(void *null)
25{
26 if (current->active_mm)
27 load_LDT(&current->active_mm->context);
28}
29#endif
30
31static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
32{
33 void *oldldt;
34 void *newldt;
35 unsigned oldsize;
36
37 if (mincount <= (unsigned)pc->size)
38 return 0;
39 oldsize = pc->size;
40 mincount = (mincount+511)&(~511);
41 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
42 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
43 else
44 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
45
46 if (!newldt)
47 return -ENOMEM;
48
49 if (oldsize)
50 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
51 oldldt = pc->ldt;
52 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
53 wmb();
54 pc->ldt = newldt;
55 wmb();
56 pc->size = mincount;
57 wmb();
58 if (reload) {
59#ifdef CONFIG_SMP
60 cpumask_t mask;
61
62 preempt_disable();
63 mask = cpumask_of_cpu(smp_processor_id());
64 load_LDT(pc);
65 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
66 smp_call_function(flush_ldt, NULL, 1, 1);
67 preempt_enable();
68#else
69 load_LDT(pc);
70#endif
71 }
72 if (oldsize) {
73 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
74 vfree(oldldt);
75 else
76 kfree(oldldt);
77 }
78 return 0;
79}
80
81static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
82{
83 int err = alloc_ldt(new, old->size, 0);
84 if (err < 0)
85 return err;
86 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
87 return 0;
88}
89
90/*
91 * we do not have to muck with descriptors here, that is
92 * done in switch_mm() as needed.
93 */
94int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
95{
96 struct mm_struct * old_mm;
97 int retval = 0;
98
99 mutex_init(&mm->context.lock);
100 mm->context.size = 0;
101 old_mm = current->mm;
102 if (old_mm && old_mm->context.size > 0) {
103 mutex_lock(&old_mm->context.lock);
104 retval = copy_ldt(&mm->context, &old_mm->context);
105 mutex_unlock(&old_mm->context.lock);
106 }
107 return retval;
108}
109
110/*
111 *
112 * Don't touch the LDT register - we're already in the next thread.
113 */
114void destroy_context(struct mm_struct *mm)
115{
116 if (mm->context.size) {
117 if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
118 vfree(mm->context.ldt);
119 else
120 kfree(mm->context.ldt);
121 mm->context.size = 0;
122 }
123}
124
125static int read_ldt(void __user * ptr, unsigned long bytecount)
126{
127 int err;
128 unsigned long size;
129 struct mm_struct * mm = current->mm;
130
131 if (!mm->context.size)
132 return 0;
133 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
134 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
135
136 mutex_lock(&mm->context.lock);
137 size = mm->context.size*LDT_ENTRY_SIZE;
138 if (size > bytecount)
139 size = bytecount;
140
141 err = 0;
142 if (copy_to_user(ptr, mm->context.ldt, size))
143 err = -EFAULT;
144 mutex_unlock(&mm->context.lock);
145 if (err < 0)
146 goto error_return;
147 if (size != bytecount) {
148 /* zero-fill the rest */
149 if (clear_user(ptr+size, bytecount-size) != 0) {
150 err = -EFAULT;
151 goto error_return;
152 }
153 }
154 return bytecount;
155error_return:
156 return err;
157}
158
159static int read_default_ldt(void __user * ptr, unsigned long bytecount)
160{
161 /* Arbitrary number */
162 /* x86-64 default LDT is all zeros */
163 if (bytecount > 128)
164 bytecount = 128;
165 if (clear_user(ptr, bytecount))
166 return -EFAULT;
167 return bytecount;
168}
169
170static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
171{
172 struct task_struct *me = current;
173 struct mm_struct * mm = me->mm;
174 __u32 entry_1, entry_2, *lp;
175 int error;
176 struct user_desc ldt_info;
177
178 error = -EINVAL;
179
180 if (bytecount != sizeof(ldt_info))
181 goto out;
182 error = -EFAULT;
183 if (copy_from_user(&ldt_info, ptr, bytecount))
184 goto out;
185
186 error = -EINVAL;
187 if (ldt_info.entry_number >= LDT_ENTRIES)
188 goto out;
189 if (ldt_info.contents == 3) {
190 if (oldmode)
191 goto out;
192 if (ldt_info.seg_not_present == 0)
193 goto out;
194 }
195
196 mutex_lock(&mm->context.lock);
197 if (ldt_info.entry_number >= (unsigned)mm->context.size) {
198 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
199 if (error < 0)
200 goto out_unlock;
201 }
202
203 lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
204
205 /* Allow LDTs to be cleared by the user. */
206 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
207 if (oldmode || LDT_empty(&ldt_info)) {
208 entry_1 = 0;
209 entry_2 = 0;
210 goto install;
211 }
212 }
213
214 entry_1 = LDT_entry_a(&ldt_info);
215 entry_2 = LDT_entry_b(&ldt_info);
216 if (oldmode)
217 entry_2 &= ~(1 << 20);
218
219 /* Install the new entry ... */
220install:
221 *lp = entry_1;
222 *(lp+1) = entry_2;
223 error = 0;
224
225out_unlock:
226 mutex_unlock(&mm->context.lock);
227out:
228 return error;
229}
230
231asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
232{
233 int ret = -ENOSYS;
234
235 switch (func) {
236 case 0:
237 ret = read_ldt(ptr, bytecount);
238 break;
239 case 1:
240 ret = write_ldt(ptr, bytecount, 1);
241 break;
242 case 2:
243 ret = read_default_ldt(ptr, bytecount);
244 break;
245 case 0x11:
246 ret = write_ldt(ptr, bytecount, 0);
247 break;
248 }
249 return ret;
250}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 11b935f4f88..c1cfd60639d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -32,7 +32,7 @@ static u32 kexec_pte1[1024] PAGE_ALIGNED;
32 32
33static void set_idt(void *newidt, __u16 limit) 33static void set_idt(void *newidt, __u16 limit)
34{ 34{
35 struct Xgt_desc_struct curidt; 35 struct desc_ptr curidt;
36 36
 37 /* ia32 supports unaligned loads & stores */ 37 /* ia32 supports unaligned loads & stores */
38 curidt.size = limit; 38 curidt.size = limit;
@@ -44,7 +44,7 @@ static void set_idt(void *newidt, __u16 limit)
44 44
45static void set_gdt(void *newgdt, __u16 limit) 45static void set_gdt(void *newgdt, __u16 limit)
46{ 46{
47 struct Xgt_desc_struct curgdt; 47 struct desc_ptr curgdt;
48 48
49 /* ia32 supports unaligned loads & stores */ 49 /* ia32 supports unaligned loads & stores */
50 curgdt.size = limit; 50 curgdt.size = limit;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index aa3d2c8f773..a1fef42f8cd 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -234,10 +234,5 @@ NORET_TYPE void machine_kexec(struct kimage *image)
234void arch_crash_save_vmcoreinfo(void) 234void arch_crash_save_vmcoreinfo(void)
235{ 235{
236 VMCOREINFO_SYMBOL(init_level4_pgt); 236 VMCOREINFO_SYMBOL(init_level4_pgt);
237
238#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE
239 VMCOREINFO_SYMBOL(node_data);
240 VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
241#endif
242} 237}
243 238
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 0ab680f2d9d..219f86eb612 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -63,6 +63,21 @@ static int __init mfgpt_disable(char *s)
63} 63}
64__setup("nomfgpt", mfgpt_disable); 64__setup("nomfgpt", mfgpt_disable);
65 65
66/* Reset the MFGPT timers. This is required by some broken BIOSes which already
67 * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
68 * affected at least (0.99 is OK with MFGPT workaround left to off).
69 */
70static int __init mfgpt_fix(char *s)
71{
72 u32 val, dummy;
73
 74 /* The following undocumented bit resets the MFGPT timers */
75 val = 0xFF; dummy = 0;
76 wrmsr(0x5140002B, val, dummy);
77 return 1;
78}
79__setup("mfgptfix", mfgpt_fix);
80
66/* 81/*
67 * Check whether any MFGPTs are available for the kernel to use. In most 82 * Check whether any MFGPTs are available for the kernel to use. In most
68 * cases, firmware that uses AMD's VSA code will claim all timers during 83 * cases, firmware that uses AMD's VSA code will claim all timers during
@@ -278,12 +293,12 @@ static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
278 293
279static irqreturn_t mfgpt_tick(int irq, void *dev_id) 294static irqreturn_t mfgpt_tick(int irq, void *dev_id)
280{ 295{
296 /* Turn off the clock (and clear the event) */
297 mfgpt_disable_timer(mfgpt_event_clock);
298
281 if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN) 299 if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN)
282 return IRQ_HANDLED; 300 return IRQ_HANDLED;
283 301
284 /* Turn off the clock */
285 mfgpt_disable_timer(mfgpt_event_clock);
286
287 /* Clear the counter */ 302 /* Clear the counter */
288 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); 303 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
289 304
@@ -319,10 +334,6 @@ static int __init mfgpt_timer_setup(void)
319 } 334 }
320 335
321 mfgpt_event_clock = timer; 336 mfgpt_event_clock = timer;
322 /* Set the clock scale and enable the event mode for CMP2 */
323 val = MFGPT_SCALE | (3 << 8);
324
325 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
326 337
327 /* Set up the IRQ on the MFGPT side */ 338 /* Set up the IRQ on the MFGPT side */
328 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { 339 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) {
@@ -339,6 +350,11 @@ static int __init mfgpt_timer_setup(void)
339 goto err; 350 goto err;
340 } 351 }
341 352
353 /* Set the clock scale and enable the event mode for CMP2 */
354 val = MFGPT_SCALE | (3 << 8);
355
356 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
357
342 /* Set up the clock event */ 358 /* Set up the clock event */
343 mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, 32); 359 mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, 32);
344 mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF, 360 mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 09c315214a5..f2702d01b8a 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -244,8 +244,8 @@ static int microcode_sanity_check(void *mc)
244 return 0; 244 return 0;
245 /* check extended signature checksum */ 245 /* check extended signature checksum */
246 for (i = 0; i < ext_sigcount; i++) { 246 for (i = 0; i < ext_sigcount; i++) {
247 ext_sig = (struct extended_signature *)((void *)ext_header 247 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
248 + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i); 248 EXT_SIGNATURE_SIZE * i;
249 sum = orig_sum 249 sum = orig_sum
250 - (mc_header->sig + mc_header->pf + mc_header->cksum) 250 - (mc_header->sig + mc_header->pf + mc_header->cksum)
251 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 251 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
@@ -279,11 +279,9 @@ static int get_maching_microcode(void *mc, int cpu)
279 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) 279 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
280 return 0; 280 return 0;
281 281
282 ext_header = (struct extended_sigtable *)(mc + 282 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
283 get_datasize(mc_header) + MC_HEADER_SIZE);
284 ext_sigcount = ext_header->count; 283 ext_sigcount = ext_header->count;
285 ext_sig = (struct extended_signature *)((void *)ext_header 284 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
286 + EXT_HEADER_SIZE);
287 for (i = 0; i < ext_sigcount; i++) { 285 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header, 286 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf)) 287 ext_sig->sig, ext_sig->pf))
@@ -436,7 +434,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
436 return -EINVAL; 434 return -EINVAL;
437 } 435 }
438 436
439 lock_cpu_hotplug(); 437 get_online_cpus();
440 mutex_lock(&microcode_mutex); 438 mutex_lock(&microcode_mutex);
441 439
442 user_buffer = (void __user *) buf; 440 user_buffer = (void __user *) buf;
@@ -447,7 +445,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
447 ret = (ssize_t)len; 445 ret = (ssize_t)len;
448 446
449 mutex_unlock(&microcode_mutex); 447 mutex_unlock(&microcode_mutex);
450 unlock_cpu_hotplug(); 448 put_online_cpus();
451 449
452 return ret; 450 return ret;
453} 451}
@@ -539,7 +537,7 @@ static int cpu_request_microcode(int cpu)
539 pr_debug("ucode data file %s load failed\n", name); 537 pr_debug("ucode data file %s load failed\n", name);
540 return error; 538 return error;
541 } 539 }
542 buf = (void *)firmware->data; 540 buf = firmware->data;
543 size = firmware->size; 541 size = firmware->size;
544 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) 542 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
545 > 0) { 543 > 0) {
@@ -658,14 +656,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
658 656
659 old = current->cpus_allowed; 657 old = current->cpus_allowed;
660 658
661 lock_cpu_hotplug(); 659 get_online_cpus();
662 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 660 set_cpus_allowed(current, cpumask_of_cpu(cpu));
663 661
664 mutex_lock(&microcode_mutex); 662 mutex_lock(&microcode_mutex);
665 if (uci->valid) 663 if (uci->valid)
666 err = cpu_request_microcode(cpu); 664 err = cpu_request_microcode(cpu);
667 mutex_unlock(&microcode_mutex); 665 mutex_unlock(&microcode_mutex);
668 unlock_cpu_hotplug(); 666 put_online_cpus();
669 set_cpus_allowed(current, old); 667 set_cpus_allowed(current, old);
670 } 668 }
671 if (err) 669 if (err)
@@ -799,7 +797,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
799 return NOTIFY_OK; 797 return NOTIFY_OK;
800} 798}
801 799
802static struct notifier_block __cpuinitdata mc_cpu_notifier = { 800static struct notifier_block __refdata mc_cpu_notifier = {
803 .notifier_call = mc_cpu_callback, 801 .notifier_call = mc_cpu_callback,
804}; 802};
805 803
@@ -817,9 +815,9 @@ static int __init microcode_init (void)
817 return PTR_ERR(microcode_pdev); 815 return PTR_ERR(microcode_pdev);
818 } 816 }
819 817
820 lock_cpu_hotplug(); 818 get_online_cpus();
821 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 819 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
822 unlock_cpu_hotplug(); 820 put_online_cpus();
823 if (error) { 821 if (error) {
824 microcode_dev_exit(); 822 microcode_dev_exit();
825 platform_device_unregister(microcode_pdev); 823 platform_device_unregister(microcode_pdev);
@@ -839,9 +837,9 @@ static void __exit microcode_exit (void)
839 837
840 unregister_hotcpu_notifier(&mc_cpu_notifier); 838 unregister_hotcpu_notifier(&mc_cpu_notifier);
841 839
842 lock_cpu_hotplug(); 840 get_online_cpus();
843 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 841 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
844 unlock_cpu_hotplug(); 842 put_online_cpus();
845 843
846 platform_device_unregister(microcode_pdev); 844 platform_device_unregister(microcode_pdev);
847} 845}
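The lock_cpu_hotplug()/unlock_cpu_hotplug() pairs in this file are converted to get_online_cpus()/put_online_cpus(), the refcounted hotplug protection used elsewhere in this series. A hedged sketch of the pattern as used above; the work function is a placeholder, not part of this patch:

/* Sketch of the hotplug-safe section used above. update_per_cpu_state()
 * is a hypothetical helper standing in for the microcode work. */
#include <linux/cpu.h>

static void update_per_cpu_state(void);	/* assumption: must not race with CPU hotplug */

static void do_protected_update(void)
{
	get_online_cpus();		/* pin the set of online CPUs */
	update_per_cpu_state();
	put_online_cpus();		/* drop the hotplug reference */
}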
diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c
index 7a05a7f6099..67009cdd5ec 100644
--- a/arch/x86/kernel/mpparse_32.c
+++ b/arch/x86/kernel/mpparse_32.c
@@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
68/* Processor that is doing the boot up */ 68/* Processor that is doing the boot up */
69unsigned int boot_cpu_physical_apicid = -1U; 69unsigned int boot_cpu_physical_apicid = -1U;
70/* Internal processor count */ 70/* Internal processor count */
71unsigned int __cpuinitdata num_processors; 71unsigned int num_processors;
72 72
73/* Bitmask of physically existing CPUs */ 73/* Bitmask of physically existing CPUs */
74physid_mask_t phys_cpu_present_map; 74physid_mask_t phys_cpu_present_map;
@@ -258,7 +258,7 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
258 if (!(m->mpc_flags & MPC_APIC_USABLE)) 258 if (!(m->mpc_flags & MPC_APIC_USABLE))
259 return; 259 return;
260 260
261 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", 261 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
262 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); 262 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
263 if (nr_ioapics >= MAX_IO_APICS) { 263 if (nr_ioapics >= MAX_IO_APICS) {
264 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", 264 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
@@ -405,9 +405,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
405 405
406 mps_oem_check(mpc, oem, str); 406 mps_oem_check(mpc, oem, str);
407 407
408 printk("APIC at: 0x%lX\n",mpc->mpc_lapic); 408 printk("APIC at: 0x%X\n", mpc->mpc_lapic);
409 409
410 /* 410 /*
411 * Save the local APIC address (it might be non-default) -- but only 411 * Save the local APIC address (it might be non-default) -- but only
412 * if we're not using ACPI. 412 * if we're not using ACPI.
413 */ 413 */
@@ -721,7 +721,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
721 unsigned long *bp = phys_to_virt(base); 721 unsigned long *bp = phys_to_virt(base);
722 struct intel_mp_floating *mpf; 722 struct intel_mp_floating *mpf;
723 723
724 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); 724 printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
725 if (sizeof(*mpf) != 16) 725 if (sizeof(*mpf) != 16)
726 printk("Error: MPF size\n"); 726 printk("Error: MPF size\n");
727 727
@@ -734,8 +734,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
734 || (mpf->mpf_specification == 4)) ) { 734 || (mpf->mpf_specification == 4)) ) {
735 735
736 smp_found_config = 1; 736 smp_found_config = 1;
737 printk(KERN_INFO "found SMP MP-table at %08lx\n", 737 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
738 virt_to_phys(mpf)); 738 mpf, virt_to_phys(mpf));
739 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); 739 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
740 if (mpf->mpf_physptr) { 740 if (mpf->mpf_physptr) {
741 /* 741 /*
@@ -918,14 +918,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
918 */ 918 */
919 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; 919 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
920 mp_ioapic_routing[idx].gsi_base = gsi_base; 920 mp_ioapic_routing[idx].gsi_base = gsi_base;
921 mp_ioapic_routing[idx].gsi_end = gsi_base + 921 mp_ioapic_routing[idx].gsi_end = gsi_base +
922 io_apic_get_redir_entries(idx); 922 io_apic_get_redir_entries(idx);
923 923
924 printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " 924 printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
925 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 925 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
926 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, 926 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
927 mp_ioapic_routing[idx].gsi_base, 927 mp_ioapic_routing[idx].gsi_base,
928 mp_ioapic_routing[idx].gsi_end); 928 mp_ioapic_routing[idx].gsi_end);
929} 929}
930 930
931void __init 931void __init
@@ -1041,15 +1041,16 @@ void __init mp_config_acpi_legacy_irqs (void)
1041} 1041}
1042 1042
1043#define MAX_GSI_NUM 4096 1043#define MAX_GSI_NUM 4096
1044#define IRQ_COMPRESSION_START 64
1044 1045
1045int mp_register_gsi(u32 gsi, int triggering, int polarity) 1046int mp_register_gsi(u32 gsi, int triggering, int polarity)
1046{ 1047{
1047 int ioapic = -1; 1048 int ioapic = -1;
1048 int ioapic_pin = 0; 1049 int ioapic_pin = 0;
1049 int idx, bit = 0; 1050 int idx, bit = 0;
1050 static int pci_irq = 16; 1051 static int pci_irq = IRQ_COMPRESSION_START;
1051 /* 1052 /*
1052 * Mapping between Global System Interrups, which 1053 * Mapping between Global System Interrupts, which
1053 * represent all possible interrupts, and IRQs 1054 * represent all possible interrupts, and IRQs
1054 * assigned to actual devices. 1055 * assigned to actual devices.
1055 */ 1056 */
@@ -1086,12 +1087,16 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1086 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { 1087 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
1087 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", 1088 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
1088 mp_ioapic_routing[ioapic].apic_id, ioapic_pin); 1089 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1089 return gsi_to_irq[gsi]; 1090 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1090 } 1091 }
1091 1092
1092 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); 1093 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
1093 1094
1094 if (triggering == ACPI_LEVEL_SENSITIVE) { 1095 /*
1096 * For GSI >= 64, use IRQ compression
1097 */
1098 if ((gsi >= IRQ_COMPRESSION_START)
1099 && (triggering == ACPI_LEVEL_SENSITIVE)) {
1095 /* 1100 /*
1096 * For PCI devices assign IRQs in order, avoiding gaps 1101 * For PCI devices assign IRQs in order, avoiding gaps
1097 * due to unused I/O APIC pins. 1102 * due to unused I/O APIC pins.
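The hunk above introduces IRQ compression: only level-triggered GSIs at or above IRQ_COMPRESSION_START (64) are remapped onto a dense IRQ range, while lower GSIs keep their identity mapping. A simplified sketch of that idea, not the literal continuation of mp_register_gsi():

/* Simplified sketch of GSI->IRQ compression as introduced above; the real
 * code also programs the I/O APIC entry. Sizes mirror the constants used
 * in this file. */
#define IRQ_COMPRESSION_START	64
#define MAX_GSI_NUM		4096

static unsigned int gsi_to_irq[MAX_GSI_NUM];
static int next_pci_irq = IRQ_COMPRESSION_START;

static int compress_gsi(unsigned int gsi, int level_triggered)
{
	if (gsi < IRQ_COMPRESSION_START || !level_triggered)
		return gsi;				/* identity mapping, as before */
	if (!gsi_to_irq[gsi])
		gsi_to_irq[gsi] = next_pci_irq++;	/* pack into a dense range */
	return gsi_to_irq[gsi];
}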
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c
index ef4aab12358..72ab1403fed 100644
--- a/arch/x86/kernel/mpparse_64.c
+++ b/arch/x86/kernel/mpparse_64.c
@@ -60,14 +60,18 @@ unsigned int boot_cpu_id = -1U;
60EXPORT_SYMBOL(boot_cpu_id); 60EXPORT_SYMBOL(boot_cpu_id);
61 61
62/* Internal processor count */ 62/* Internal processor count */
63unsigned int num_processors __cpuinitdata = 0; 63unsigned int num_processors;
64 64
65unsigned disabled_cpus __cpuinitdata; 65unsigned disabled_cpus __cpuinitdata;
66 66
67/* Bitmask of physically existing CPUs */ 67/* Bitmask of physically existing CPUs */
68physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; 68physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
69 69
70u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; 70u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
71 = { [0 ... NR_CPUS-1] = BAD_APICID };
72void *x86_bios_cpu_apicid_early_ptr;
73DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
74EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
71 75
72 76
73/* 77/*
@@ -118,24 +122,22 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
118 physid_set(m->mpc_apicid, phys_cpu_present_map); 122 physid_set(m->mpc_apicid, phys_cpu_present_map);
119 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 123 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
120 /* 124 /*
121 * bios_cpu_apicid is required to have processors listed 125 * x86_bios_cpu_apicid is required to have processors listed
122 * in same order as logical cpu numbers. Hence the first 126 * in same order as logical cpu numbers. Hence the first
123 * entry is BSP, and so on. 127 * entry is BSP, and so on.
124 */ 128 */
125 cpu = 0; 129 cpu = 0;
126 } 130 }
127 bios_cpu_apicid[cpu] = m->mpc_apicid; 131 /* are we being called early in kernel startup? */
128 /* 132 if (x86_cpu_to_apicid_early_ptr) {
 129 * We get called early in the start_kernel initialization 133 u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
130 * process when the per_cpu data area is not yet setup, so we 134 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
131 * use a static array that is removed after the per_cpu data 135
132 * area is created. 136 cpu_to_apicid[cpu] = m->mpc_apicid;
133 */ 137 bios_cpu_apicid[cpu] = m->mpc_apicid;
134 if (x86_cpu_to_apicid_ptr) {
135 u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
136 x86_cpu_to_apicid[cpu] = m->mpc_apicid;
137 } else { 138 } else {
138 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; 139 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
140 per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
139 } 141 }
140 142
141 cpu_set(cpu, cpu_possible_map); 143 cpu_set(cpu, cpu_possible_map);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index ee6eba4ecfe..af51ea8400b 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -1,6 +1,6 @@
1/* ----------------------------------------------------------------------- * 1/* ----------------------------------------------------------------------- *
2 * 2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved 3 * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -45,9 +45,10 @@ static struct class *msr_class;
45 45
46static loff_t msr_seek(struct file *file, loff_t offset, int orig) 46static loff_t msr_seek(struct file *file, loff_t offset, int orig)
47{ 47{
48 loff_t ret = -EINVAL; 48 loff_t ret;
49 struct inode *inode = file->f_mapping->host;
49 50
50 lock_kernel(); 51 mutex_lock(&inode->i_mutex);
51 switch (orig) { 52 switch (orig) {
52 case 0: 53 case 0:
53 file->f_pos = offset; 54 file->f_pos = offset;
@@ -56,8 +57,11 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig)
56 case 1: 57 case 1:
57 file->f_pos += offset; 58 file->f_pos += offset;
58 ret = file->f_pos; 59 ret = file->f_pos;
60 break;
61 default:
62 ret = -EINVAL;
59 } 63 }
60 unlock_kernel(); 64 mutex_unlock(&inode->i_mutex);
61 return ret; 65 return ret;
62} 66}
63 67
@@ -155,20 +159,20 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
155 159
156 switch (action) { 160 switch (action) {
157 case CPU_UP_PREPARE: 161 case CPU_UP_PREPARE:
158 case CPU_UP_PREPARE_FROZEN:
159 err = msr_device_create(cpu); 162 err = msr_device_create(cpu);
160 break; 163 break;
161 case CPU_UP_CANCELED: 164 case CPU_UP_CANCELED:
162 case CPU_UP_CANCELED_FROZEN:
163 case CPU_DEAD: 165 case CPU_DEAD:
164 case CPU_DEAD_FROZEN:
165 msr_device_destroy(cpu); 166 msr_device_destroy(cpu);
166 break; 167 break;
168 case CPU_UP_CANCELED_FROZEN:
169 destroy_suspended_device(msr_class, MKDEV(MSR_MAJOR, cpu));
170 break;
167 } 171 }
168 return err ? NOTIFY_BAD : NOTIFY_OK; 172 return err ? NOTIFY_BAD : NOTIFY_OK;
169} 173}
170 174
171static struct notifier_block __cpuinitdata msr_class_cpu_notifier = { 175static struct notifier_block __refdata msr_class_cpu_notifier = {
172 .notifier_call = msr_class_cpu_callback, 176 .notifier_call = msr_class_cpu_callback,
173}; 177};
174 178
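The msr character driver touched above exposes model-specific registers as /dev/cpu/N/msr, with the register index carried in the file offset, which is why the llseek fix matters. A user-space sketch; the register chosen here, 0x10 (the IA32 time-stamp counter), is only an example, and the read needs root plus a loaded msr driver:

/* User-space sketch: read one 64-bit MSR from CPU 0 through the msr driver.
 * The MSR index is passed as the pread() offset; 0x10 is an illustrative
 * choice, not something this patch depends on. */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || pread(fd, &val, sizeof(val), 0x10) != sizeof(val)) {
		perror("msr read");
		return 1;
	}
	printf("MSR 0x10 = %#llx\n", (unsigned long long)val);
	return 0;
}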
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 80ca72e5ac2..edd413650b3 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -25,7 +25,6 @@
25 25
26#include <asm/smp.h> 26#include <asm/smp.h>
27#include <asm/nmi.h> 27#include <asm/nmi.h>
28#include <asm/timer.h>
29 28
30#include "mach_traps.h" 29#include "mach_traps.h"
31 30
@@ -52,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
52 51
53static int endflag __initdata = 0; 52static int endflag __initdata = 0;
54 53
54#ifdef CONFIG_SMP
55/* The performance counters used by NMI_LOCAL_APIC don't trigger when 55/* The performance counters used by NMI_LOCAL_APIC don't trigger when
56 * the CPU is idle. To make sure the NMI watchdog really ticks on all 56 * the CPU is idle. To make sure the NMI watchdog really ticks on all
57 * CPUs during the test make them busy. 57 * CPUs during the test make them busy.
58 */ 58 */
59static __init void nmi_cpu_busy(void *data) 59static __init void nmi_cpu_busy(void *data)
60{ 60{
61#ifdef CONFIG_SMP
62 local_irq_enable_in_hardirq(); 61 local_irq_enable_in_hardirq();
63 /* Intentionally don't use cpu_relax here. This is 62 /* Intentionally don't use cpu_relax here. This is
64 to make sure that the performance counter really ticks, 63 to make sure that the performance counter really ticks,
@@ -68,8 +67,8 @@ static __init void nmi_cpu_busy(void *data)
68 care if they get somewhat less cycles. */ 67 care if they get somewhat less cycles. */
69 while (endflag == 0) 68 while (endflag == 0)
70 mb(); 69 mb();
71#endif
72} 70}
71#endif
73 72
74static int __init check_nmi_watchdog(void) 73static int __init check_nmi_watchdog(void)
75{ 74{
@@ -84,15 +83,17 @@ static int __init check_nmi_watchdog(void)
84 83
85 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 84 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
86 if (!prev_nmi_count) 85 if (!prev_nmi_count)
87 goto error; 86 return -1;
88 87
89 printk(KERN_INFO "Testing NMI watchdog ... "); 88 printk(KERN_INFO "Testing NMI watchdog ... ");
90 89
90#ifdef CONFIG_SMP
91 if (nmi_watchdog == NMI_LOCAL_APIC) 91 if (nmi_watchdog == NMI_LOCAL_APIC)
92 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); 92 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
93#endif
93 94
94 for_each_possible_cpu(cpu) 95 for_each_possible_cpu(cpu)
95 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; 96 prev_nmi_count[cpu] = nmi_count(cpu);
96 local_irq_enable(); 97 local_irq_enable();
97 mdelay((20*1000)/nmi_hz); // wait 20 ticks 98 mdelay((20*1000)/nmi_hz); // wait 20 ticks
98 99
@@ -119,7 +120,7 @@ static int __init check_nmi_watchdog(void)
119 if (!atomic_read(&nmi_active)) { 120 if (!atomic_read(&nmi_active)) {
120 kfree(prev_nmi_count); 121 kfree(prev_nmi_count);
121 atomic_set(&nmi_active, -1); 122 atomic_set(&nmi_active, -1);
122 goto error; 123 return -1;
123 } 124 }
124 printk("OK.\n"); 125 printk("OK.\n");
125 126
@@ -130,10 +131,6 @@ static int __init check_nmi_watchdog(void)
130 131
131 kfree(prev_nmi_count); 132 kfree(prev_nmi_count);
132 return 0; 133 return 0;
133error:
134 timer_ack = !cpu_has_tsc;
135
136 return -1;
137} 134}
138/* This needs to happen later in boot so counters are working */ 135/* This needs to happen later in boot so counters are working */
139late_initcall(check_nmi_watchdog); 136late_initcall(check_nmi_watchdog);
@@ -181,7 +178,7 @@ static int lapic_nmi_resume(struct sys_device *dev)
181 178
182 179
183static struct sysdev_class nmi_sysclass = { 180static struct sysdev_class nmi_sysclass = {
184 set_kset_name("lapic_nmi"), 181 .name = "lapic_nmi",
185 .resume = lapic_nmi_resume, 182 .resume = lapic_nmi_resume,
186 .suspend = lapic_nmi_suspend, 183 .suspend = lapic_nmi_suspend,
187}; 184};
@@ -242,10 +239,10 @@ void acpi_nmi_disable(void)
242 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); 239 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
243} 240}
244 241
245void setup_apic_nmi_watchdog (void *unused) 242void setup_apic_nmi_watchdog(void *unused)
246{ 243{
247 if (__get_cpu_var(wd_enabled)) 244 if (__get_cpu_var(wd_enabled))
248 return; 245 return;
249 246
250 /* cheap hack to support suspend/resume */ 247 /* cheap hack to support suspend/resume */
251 /* if cpu0 is not active neither should the other cpus */ 248 /* if cpu0 is not active neither should the other cpus */
@@ -334,7 +331,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
334 unsigned int sum; 331 unsigned int sum;
335 int touched = 0; 332 int touched = 0;
336 int cpu = smp_processor_id(); 333 int cpu = smp_processor_id();
337 int rc=0; 334 int rc = 0;
338 335
339 /* check for other users first */ 336 /* check for other users first */
340 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 337 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 4253c4e8849..fb99484d21c 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -39,7 +39,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE;
39 * 0: the lapic NMI watchdog is disabled, but can be enabled 39 * 0: the lapic NMI watchdog is disabled, but can be enabled
40 */ 40 */
41atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ 41atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
42int panic_on_timeout; 42static int panic_on_timeout;
43 43
44unsigned int nmi_watchdog = NMI_DEFAULT; 44unsigned int nmi_watchdog = NMI_DEFAULT;
45static unsigned int nmi_hz = HZ; 45static unsigned int nmi_hz = HZ;
@@ -78,22 +78,22 @@ static __init void nmi_cpu_busy(void *data)
78} 78}
79#endif 79#endif
80 80
81int __init check_nmi_watchdog (void) 81int __init check_nmi_watchdog(void)
82{ 82{
83 int *counts; 83 int *prev_nmi_count;
84 int cpu; 84 int cpu;
85 85
86 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) 86 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
87 return 0; 87 return 0;
88 88
89 if (!atomic_read(&nmi_active)) 89 if (!atomic_read(&nmi_active))
90 return 0; 90 return 0;
91 91
92 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 92 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
93 if (!counts) 93 if (!prev_nmi_count)
94 return -1; 94 return -1;
95 95
96 printk(KERN_INFO "testing NMI watchdog ... "); 96 printk(KERN_INFO "Testing NMI watchdog ... ");
97 97
98#ifdef CONFIG_SMP 98#ifdef CONFIG_SMP
99 if (nmi_watchdog == NMI_LOCAL_APIC) 99 if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -101,30 +101,29 @@ int __init check_nmi_watchdog (void)
101#endif 101#endif
102 102
103 for (cpu = 0; cpu < NR_CPUS; cpu++) 103 for (cpu = 0; cpu < NR_CPUS; cpu++)
104 counts[cpu] = cpu_pda(cpu)->__nmi_count; 104 prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;
105 local_irq_enable(); 105 local_irq_enable();
106 mdelay((20*1000)/nmi_hz); // wait 20 ticks 106 mdelay((20*1000)/nmi_hz); // wait 20 ticks
107 107
108 for_each_online_cpu(cpu) { 108 for_each_online_cpu(cpu) {
109 if (!per_cpu(wd_enabled, cpu)) 109 if (!per_cpu(wd_enabled, cpu))
110 continue; 110 continue;
111 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { 111 if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) {
112 printk(KERN_WARNING "WARNING: CPU#%d: NMI " 112 printk(KERN_WARNING "WARNING: CPU#%d: NMI "
113 "appears to be stuck (%d->%d)!\n", 113 "appears to be stuck (%d->%d)!\n",
114 cpu, 114 cpu,
115 counts[cpu], 115 prev_nmi_count[cpu],
116 cpu_pda(cpu)->__nmi_count); 116 cpu_pda(cpu)->__nmi_count);
117 per_cpu(wd_enabled, cpu) = 0; 117 per_cpu(wd_enabled, cpu) = 0;
118 atomic_dec(&nmi_active); 118 atomic_dec(&nmi_active);
119 } 119 }
120 } 120 }
121 endflag = 1;
121 if (!atomic_read(&nmi_active)) { 122 if (!atomic_read(&nmi_active)) {
122 kfree(counts); 123 kfree(prev_nmi_count);
123 atomic_set(&nmi_active, -1); 124 atomic_set(&nmi_active, -1);
124 endflag = 1;
125 return -1; 125 return -1;
126 } 126 }
127 endflag = 1;
128 printk("OK.\n"); 127 printk("OK.\n");
129 128
130 /* now that we know it works we can reduce NMI frequency to 129 /* now that we know it works we can reduce NMI frequency to
@@ -132,11 +131,11 @@ int __init check_nmi_watchdog (void)
132 if (nmi_watchdog == NMI_LOCAL_APIC) 131 if (nmi_watchdog == NMI_LOCAL_APIC)
133 nmi_hz = lapic_adjust_nmi_hz(1); 132 nmi_hz = lapic_adjust_nmi_hz(1);
134 133
135 kfree(counts); 134 kfree(prev_nmi_count);
136 return 0; 135 return 0;
137} 136}
138 137
139int __init setup_nmi_watchdog(char *str) 138static int __init setup_nmi_watchdog(char *str)
140{ 139{
141 int nmi; 140 int nmi;
142 141
@@ -159,34 +158,6 @@ int __init setup_nmi_watchdog(char *str)
159 158
160__setup("nmi_watchdog=", setup_nmi_watchdog); 159__setup("nmi_watchdog=", setup_nmi_watchdog);
161 160
162
163static void __acpi_nmi_disable(void *__unused)
164{
165 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
166}
167
168/*
169 * Disable timer based NMIs on all CPUs:
170 */
171void acpi_nmi_disable(void)
172{
173 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
174 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
175}
176
177static void __acpi_nmi_enable(void *__unused)
178{
179 apic_write(APIC_LVT0, APIC_DM_NMI);
180}
181
182/*
183 * Enable timer based NMIs on all CPUs:
184 */
185void acpi_nmi_enable(void)
186{
187 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
188 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
189}
190#ifdef CONFIG_PM 161#ifdef CONFIG_PM
191 162
192static int nmi_pm_active; /* nmi_active before suspend */ 163static int nmi_pm_active; /* nmi_active before suspend */
@@ -211,13 +182,13 @@ static int lapic_nmi_resume(struct sys_device *dev)
211} 182}
212 183
213static struct sysdev_class nmi_sysclass = { 184static struct sysdev_class nmi_sysclass = {
214 set_kset_name("lapic_nmi"), 185 .name = "lapic_nmi",
215 .resume = lapic_nmi_resume, 186 .resume = lapic_nmi_resume,
216 .suspend = lapic_nmi_suspend, 187 .suspend = lapic_nmi_suspend,
217}; 188};
218 189
219static struct sys_device device_lapic_nmi = { 190static struct sys_device device_lapic_nmi = {
220 .id = 0, 191 .id = 0,
221 .cls = &nmi_sysclass, 192 .cls = &nmi_sysclass,
222}; 193};
223 194
@@ -231,7 +202,7 @@ static int __init init_lapic_nmi_sysfs(void)
231 if (nmi_watchdog != NMI_LOCAL_APIC) 202 if (nmi_watchdog != NMI_LOCAL_APIC)
232 return 0; 203 return 0;
233 204
234 if ( atomic_read(&nmi_active) < 0 ) 205 if (atomic_read(&nmi_active) < 0)
235 return 0; 206 return 0;
236 207
237 error = sysdev_class_register(&nmi_sysclass); 208 error = sysdev_class_register(&nmi_sysclass);
@@ -244,9 +215,37 @@ late_initcall(init_lapic_nmi_sysfs);
244 215
245#endif /* CONFIG_PM */ 216#endif /* CONFIG_PM */
246 217
218static void __acpi_nmi_enable(void *__unused)
219{
220 apic_write(APIC_LVT0, APIC_DM_NMI);
221}
222
223/*
224 * Enable timer based NMIs on all CPUs:
225 */
226void acpi_nmi_enable(void)
227{
228 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
229 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
230}
231
232static void __acpi_nmi_disable(void *__unused)
233{
234 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
235}
236
237/*
238 * Disable timer based NMIs on all CPUs:
239 */
240void acpi_nmi_disable(void)
241{
242 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
243 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
244}
245
247void setup_apic_nmi_watchdog(void *unused) 246void setup_apic_nmi_watchdog(void *unused)
248{ 247{
249 if (__get_cpu_var(wd_enabled) == 1) 248 if (__get_cpu_var(wd_enabled))
250 return; 249 return;
251 250
252 /* cheap hack to support suspend/resume */ 251 /* cheap hack to support suspend/resume */
@@ -311,8 +310,9 @@ void touch_nmi_watchdog(void)
311 } 310 }
312 } 311 }
313 312
314 touch_softlockup_watchdog(); 313 touch_softlockup_watchdog();
315} 314}
315EXPORT_SYMBOL(touch_nmi_watchdog);
316 316
317int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) 317int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
318{ 318{
@@ -479,4 +479,3 @@ void __trigger_all_cpu_backtrace(void)
479 479
480EXPORT_SYMBOL(nmi_active); 480EXPORT_SYMBOL(nmi_active);
481EXPORT_SYMBOL(nmi_watchdog); 481EXPORT_SYMBOL(nmi_watchdog);
482EXPORT_SYMBOL(touch_nmi_watchdog);
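The export of touch_nmi_watchdog() simply moves next to its definition; the interface is unchanged. A hedged reminder of how callers use it, with the per-item work left as a placeholder:

/* Sketch: a long-running, non-sleeping kernel loop petting the NMI watchdog
 * so the lockup detector does not trigger. handle_one_item() is hypothetical. */
#include <linux/nmi.h>

static void handle_one_item(int i);	/* assumption: slow per-item work */

static void long_running_loop(int items)
{
	int i;

	for (i = 0; i < items; i++) {
		handle_one_item(i);
		touch_nmi_watchdog();	/* reset the per-CPU alert counters */
	}
}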
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 9000d82c6dc..e65281b1634 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -82,7 +82,7 @@ static int __init numaq_tsc_disable(void)
82{ 82{
83 if (num_online_nodes() > 1) { 83 if (num_online_nodes() > 1) {
84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); 84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
85 tsc_disable = 1; 85 setup_clear_cpu_cap(X86_FEATURE_TSC);
86 } 86 }
87 return 0; 87 return 0;
88} 88}
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt.c
index f5000799f8e..075962cc75a 100644
--- a/arch/x86/kernel/paravirt_32.c
+++ b/arch/x86/kernel/paravirt.c
@@ -14,7 +14,10 @@
14 You should have received a copy of the GNU General Public License 14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
17*/ 19*/
20
18#include <linux/errno.h> 21#include <linux/errno.h>
19#include <linux/module.h> 22#include <linux/module.h>
20#include <linux/efi.h> 23#include <linux/efi.h>
@@ -55,59 +58,9 @@ char *memory_setup(void)
55 extern const char start_##ops##_##name[], end_##ops##_##name[]; \ 58 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
56 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") 59 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
57 60
58DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
59DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
60DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
61DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
62DEF_NATIVE(pv_cpu_ops, iret, "iret");
63DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
64DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
65DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
66DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
67DEF_NATIVE(pv_cpu_ops, clts, "clts");
68DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
69
70/* Undefined instruction for dealing with missing ops pointers. */ 61/* Undefined instruction for dealing with missing ops pointers. */
71static const unsigned char ud2a[] = { 0x0f, 0x0b }; 62static const unsigned char ud2a[] = { 0x0f, 0x0b };
72 63
73static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
74 unsigned long addr, unsigned len)
75{
76 const unsigned char *start, *end;
77 unsigned ret;
78
79 switch(type) {
80#define SITE(ops, x) \
81 case PARAVIRT_PATCH(ops.x): \
82 start = start_##ops##_##x; \
83 end = end_##ops##_##x; \
84 goto patch_site
85
86 SITE(pv_irq_ops, irq_disable);
87 SITE(pv_irq_ops, irq_enable);
88 SITE(pv_irq_ops, restore_fl);
89 SITE(pv_irq_ops, save_fl);
90 SITE(pv_cpu_ops, iret);
91 SITE(pv_cpu_ops, irq_enable_sysexit);
92 SITE(pv_mmu_ops, read_cr2);
93 SITE(pv_mmu_ops, read_cr3);
94 SITE(pv_mmu_ops, write_cr3);
95 SITE(pv_cpu_ops, clts);
96 SITE(pv_cpu_ops, read_tsc);
97#undef SITE
98
99 patch_site:
100 ret = paravirt_patch_insns(ibuf, len, start, end);
101 break;
102
103 default:
104 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
105 break;
106 }
107
108 return ret;
109}
110
111unsigned paravirt_patch_nop(void) 64unsigned paravirt_patch_nop(void)
112{ 65{
113 return 0; 66 return 0;
@@ -186,7 +139,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
186 /* If the operation is a nop, then nop the callsite */ 139 /* If the operation is a nop, then nop the callsite */
187 ret = paravirt_patch_nop(); 140 ret = paravirt_patch_nop();
188 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || 141 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
189 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit)) 142 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret))
190 /* If operation requires a jmp, then jmp */ 143 /* If operation requires a jmp, then jmp */
191 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); 144 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
192 else 145 else
@@ -237,7 +190,7 @@ static void native_flush_tlb_single(unsigned long addr)
237 190
238/* These are in entry.S */ 191/* These are in entry.S */
239extern void native_iret(void); 192extern void native_iret(void);
240extern void native_irq_enable_sysexit(void); 193extern void native_irq_enable_syscall_ret(void);
241 194
242static int __init print_banner(void) 195static int __init print_banner(void)
243{ 196{
@@ -285,18 +238,18 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
285 238
286static inline void enter_lazy(enum paravirt_lazy_mode mode) 239static inline void enter_lazy(enum paravirt_lazy_mode mode)
287{ 240{
288 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 241 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
289 BUG_ON(preemptible()); 242 BUG_ON(preemptible());
290 243
291 x86_write_percpu(paravirt_lazy_mode, mode); 244 __get_cpu_var(paravirt_lazy_mode) = mode;
292} 245}
293 246
294void paravirt_leave_lazy(enum paravirt_lazy_mode mode) 247void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
295{ 248{
296 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode); 249 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
297 BUG_ON(preemptible()); 250 BUG_ON(preemptible());
298 251
299 x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); 252 __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
300} 253}
301 254
302void paravirt_enter_lazy_mmu(void) 255void paravirt_enter_lazy_mmu(void)
@@ -321,7 +274,7 @@ void paravirt_leave_lazy_cpu(void)
321 274
322enum paravirt_lazy_mode paravirt_get_lazy_mode(void) 275enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
323{ 276{
324 return x86_read_percpu(paravirt_lazy_mode); 277 return __get_cpu_var(paravirt_lazy_mode);
325} 278}
326 279
327struct pv_info pv_info = { 280struct pv_info pv_info = {
@@ -366,11 +319,16 @@ struct pv_cpu_ops pv_cpu_ops = {
366 .read_cr4 = native_read_cr4, 319 .read_cr4 = native_read_cr4,
367 .read_cr4_safe = native_read_cr4_safe, 320 .read_cr4_safe = native_read_cr4_safe,
368 .write_cr4 = native_write_cr4, 321 .write_cr4 = native_write_cr4,
322#ifdef CONFIG_X86_64
323 .read_cr8 = native_read_cr8,
324 .write_cr8 = native_write_cr8,
325#endif
369 .wbinvd = native_wbinvd, 326 .wbinvd = native_wbinvd,
370 .read_msr = native_read_msr_safe, 327 .read_msr = native_read_msr_safe,
371 .write_msr = native_write_msr_safe, 328 .write_msr = native_write_msr_safe,
372 .read_tsc = native_read_tsc, 329 .read_tsc = native_read_tsc,
373 .read_pmc = native_read_pmc, 330 .read_pmc = native_read_pmc,
331 .read_tscp = native_read_tscp,
374 .load_tr_desc = native_load_tr_desc, 332 .load_tr_desc = native_load_tr_desc,
375 .set_ldt = native_set_ldt, 333 .set_ldt = native_set_ldt,
376 .load_gdt = native_load_gdt, 334 .load_gdt = native_load_gdt,
@@ -379,13 +337,14 @@ struct pv_cpu_ops pv_cpu_ops = {
379 .store_idt = native_store_idt, 337 .store_idt = native_store_idt,
380 .store_tr = native_store_tr, 338 .store_tr = native_store_tr,
381 .load_tls = native_load_tls, 339 .load_tls = native_load_tls,
382 .write_ldt_entry = write_dt_entry, 340 .write_ldt_entry = native_write_ldt_entry,
383 .write_gdt_entry = write_dt_entry, 341 .write_gdt_entry = native_write_gdt_entry,
384 .write_idt_entry = write_dt_entry, 342 .write_idt_entry = native_write_idt_entry,
385 .load_esp0 = native_load_esp0, 343 .load_sp0 = native_load_sp0,
386 344
387 .irq_enable_sysexit = native_irq_enable_sysexit, 345 .irq_enable_syscall_ret = native_irq_enable_syscall_ret,
388 .iret = native_iret, 346 .iret = native_iret,
347 .swapgs = native_swapgs,
389 348
390 .set_iopl_mask = native_set_iopl_mask, 349 .set_iopl_mask = native_set_iopl_mask,
391 .io_delay = native_io_delay, 350 .io_delay = native_io_delay,
@@ -408,8 +367,10 @@ struct pv_apic_ops pv_apic_ops = {
408}; 367};
409 368
410struct pv_mmu_ops pv_mmu_ops = { 369struct pv_mmu_ops pv_mmu_ops = {
370#ifndef CONFIG_X86_64
411 .pagetable_setup_start = native_pagetable_setup_start, 371 .pagetable_setup_start = native_pagetable_setup_start,
412 .pagetable_setup_done = native_pagetable_setup_done, 372 .pagetable_setup_done = native_pagetable_setup_done,
373#endif
413 374
414 .read_cr2 = native_read_cr2, 375 .read_cr2 = native_read_cr2,
415 .write_cr2 = native_write_cr2, 376 .write_cr2 = native_write_cr2,
@@ -437,16 +398,23 @@ struct pv_mmu_ops pv_mmu_ops = {
437 .kmap_atomic_pte = kmap_atomic, 398 .kmap_atomic_pte = kmap_atomic,
438#endif 399#endif
439 400
401#if PAGETABLE_LEVELS >= 3
440#ifdef CONFIG_X86_PAE 402#ifdef CONFIG_X86_PAE
441 .set_pte_atomic = native_set_pte_atomic, 403 .set_pte_atomic = native_set_pte_atomic,
442 .set_pte_present = native_set_pte_present, 404 .set_pte_present = native_set_pte_present,
443 .set_pud = native_set_pud,
444 .pte_clear = native_pte_clear, 405 .pte_clear = native_pte_clear,
445 .pmd_clear = native_pmd_clear, 406 .pmd_clear = native_pmd_clear,
446 407#endif
408 .set_pud = native_set_pud,
447 .pmd_val = native_pmd_val, 409 .pmd_val = native_pmd_val,
448 .make_pmd = native_make_pmd, 410 .make_pmd = native_make_pmd,
411
412#if PAGETABLE_LEVELS == 4
413 .pud_val = native_pud_val,
414 .make_pud = native_make_pud,
415 .set_pgd = native_set_pgd,
449#endif 416#endif
417#endif /* PAGETABLE_LEVELS >= 3 */
450 418
451 .pte_val = native_pte_val, 419 .pte_val = native_pte_val,
452 .pgd_val = native_pgd_val, 420 .pgd_val = native_pgd_val,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
new file mode 100644
index 00000000000..82fc5fcab4f
--- /dev/null
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -0,0 +1,49 @@
1#include <asm/paravirt.h>
2
3DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
4DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
5DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
6DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
7DEF_NATIVE(pv_cpu_ops, iret, "iret");
8DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit");
9DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
10DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
11DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
12DEF_NATIVE(pv_cpu_ops, clts, "clts");
13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
14
15unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
16 unsigned long addr, unsigned len)
17{
18 const unsigned char *start, *end;
19 unsigned ret;
20
21#define PATCH_SITE(ops, x) \
22 case PARAVIRT_PATCH(ops.x): \
23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \
25 goto patch_site
26 switch(type) {
27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl);
30 PATCH_SITE(pv_irq_ops, save_fl);
31 PATCH_SITE(pv_cpu_ops, iret);
32 PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
33 PATCH_SITE(pv_mmu_ops, read_cr2);
34 PATCH_SITE(pv_mmu_ops, read_cr3);
35 PATCH_SITE(pv_mmu_ops, write_cr3);
36 PATCH_SITE(pv_cpu_ops, clts);
37 PATCH_SITE(pv_cpu_ops, read_tsc);
38
39 patch_site:
40 ret = paravirt_patch_insns(ibuf, len, start, end);
41 break;
42
43 default:
44 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
45 break;
46 }
47#undef PATCH_SITE
48 return ret;
49}
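Each PATCH_SITE case above relies on the start_/end_ symbols emitted by DEF_NATIVE() in paravirt.c; hand-expanding one instance (a sketch of the preprocessor output, not generated code) makes the mechanism clearer:

/* Hand-expanded sketch of DEF_NATIVE(pv_irq_ops, irq_disable, "cli"):
 * it plants a labelled native instruction sequence whose bytes
 * native_patch() copies over the paravirt call site when they fit. */
extern const char start_pv_irq_ops_irq_disable[], end_pv_irq_ops_irq_disable[];
asm("start_pv_irq_ops_irq_disable: " "cli" "; end_pv_irq_ops_irq_disable:");

paravirt_patch_insns() copies the bytes between the two labels into the call site when they fit, and any op without a PATCH_SITE entry falls back to paravirt_patch_default().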
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
new file mode 100644
index 00000000000..7d904e138d7
--- /dev/null
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -0,0 +1,57 @@
1#include <asm/paravirt.h>
2#include <asm/asm-offsets.h>
3#include <linux/stringify.h>
4
5DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
6DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
7DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
8DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
9DEF_NATIVE(pv_cpu_ops, iret, "iretq");
10DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
11DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
12DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
13DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
14DEF_NATIVE(pv_cpu_ops, clts, "clts");
15DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
16
17/* the three commands give us more control over how to return from a syscall */
18DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;");
19DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
20
21unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
22 unsigned long addr, unsigned len)
23{
24 const unsigned char *start, *end;
25 unsigned ret;
26
27#define PATCH_SITE(ops, x) \
28 case PARAVIRT_PATCH(ops.x): \
29 start = start_##ops##_##x; \
30 end = end_##ops##_##x; \
31 goto patch_site
32 switch(type) {
33 PATCH_SITE(pv_irq_ops, restore_fl);
34 PATCH_SITE(pv_irq_ops, save_fl);
35 PATCH_SITE(pv_irq_ops, irq_enable);
36 PATCH_SITE(pv_irq_ops, irq_disable);
37 PATCH_SITE(pv_cpu_ops, iret);
38 PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
39 PATCH_SITE(pv_cpu_ops, swapgs);
40 PATCH_SITE(pv_mmu_ops, read_cr2);
41 PATCH_SITE(pv_mmu_ops, read_cr3);
42 PATCH_SITE(pv_mmu_ops, write_cr3);
43 PATCH_SITE(pv_cpu_ops, clts);
44 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
45 PATCH_SITE(pv_cpu_ops, wbinvd);
46
47 patch_site:
48 ret = paravirt_patch_insns(ibuf, len, start, end);
49 break;
50
51 default:
52 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
53 break;
54 }
55#undef PATCH_SITE
56 return ret;
57}
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6bf1f716909..1b5464c2434 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -30,12 +30,12 @@
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/dma-mapping.h> 32#include <linux/dma-mapping.h>
33#include <linux/init.h>
34#include <linux/bitops.h> 33#include <linux/bitops.h>
35#include <linux/pci_ids.h> 34#include <linux/pci_ids.h>
36#include <linux/pci.h> 35#include <linux/pci.h>
37#include <linux/delay.h> 36#include <linux/delay.h>
38#include <linux/scatterlist.h> 37#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h>
39#include <asm/gart.h> 39#include <asm/gart.h>
40#include <asm/calgary.h> 40#include <asm/calgary.h>
41#include <asm/tce.h> 41#include <asm/tce.h>
@@ -183,7 +183,7 @@ static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
183 183
184/* enable this to stress test the chip's TCE cache */ 184/* enable this to stress test the chip's TCE cache */
185#ifdef CONFIG_IOMMU_DEBUG 185#ifdef CONFIG_IOMMU_DEBUG
186int debugging __read_mostly = 1; 186static int debugging = 1;
187 187
188static inline unsigned long verify_bit_range(unsigned long* bitmap, 188static inline unsigned long verify_bit_range(unsigned long* bitmap,
189 int expected, unsigned long start, unsigned long end) 189 int expected, unsigned long start, unsigned long end)
@@ -202,7 +202,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
202 return ~0UL; 202 return ~0UL;
203} 203}
204#else /* debugging is disabled */ 204#else /* debugging is disabled */
205int debugging __read_mostly = 0; 205static int debugging;
206 206
207static inline unsigned long verify_bit_range(unsigned long* bitmap, 207static inline unsigned long verify_bit_range(unsigned long* bitmap,
208 int expected, unsigned long start, unsigned long end) 208 int expected, unsigned long start, unsigned long end)
@@ -261,22 +261,28 @@ static void iommu_range_reserve(struct iommu_table *tbl,
261 spin_unlock_irqrestore(&tbl->it_lock, flags); 261 spin_unlock_irqrestore(&tbl->it_lock, flags);
262} 262}
263 263
264static unsigned long iommu_range_alloc(struct iommu_table *tbl, 264static unsigned long iommu_range_alloc(struct device *dev,
265 unsigned int npages) 265 struct iommu_table *tbl,
266 unsigned int npages)
266{ 267{
267 unsigned long flags; 268 unsigned long flags;
268 unsigned long offset; 269 unsigned long offset;
270 unsigned long boundary_size;
271
272 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
273 PAGE_SIZE) >> PAGE_SHIFT;
269 274
270 BUG_ON(npages == 0); 275 BUG_ON(npages == 0);
271 276
272 spin_lock_irqsave(&tbl->it_lock, flags); 277 spin_lock_irqsave(&tbl->it_lock, flags);
273 278
274 offset = find_next_zero_string(tbl->it_map, tbl->it_hint, 279 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint,
275 tbl->it_size, npages); 280 npages, 0, boundary_size, 0);
276 if (offset == ~0UL) { 281 if (offset == ~0UL) {
277 tbl->chip_ops->tce_cache_blast(tbl); 282 tbl->chip_ops->tce_cache_blast(tbl);
278 offset = find_next_zero_string(tbl->it_map, 0, 283
279 tbl->it_size, npages); 284 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
285 npages, 0, boundary_size, 0);
280 if (offset == ~0UL) { 286 if (offset == ~0UL) {
281 printk(KERN_WARNING "Calgary: IOMMU full.\n"); 287 printk(KERN_WARNING "Calgary: IOMMU full.\n");
282 spin_unlock_irqrestore(&tbl->it_lock, flags); 288 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -287,7 +293,6 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl,
287 } 293 }
288 } 294 }
289 295
290 set_bit_string(tbl->it_map, offset, npages);
291 tbl->it_hint = offset + npages; 296 tbl->it_hint = offset + npages;
292 BUG_ON(tbl->it_hint > tbl->it_size); 297 BUG_ON(tbl->it_hint > tbl->it_size);
293 298
@@ -296,13 +301,13 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl,
296 return offset; 301 return offset;
297} 302}
298 303
299static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, 304static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
300 unsigned int npages, int direction) 305 void *vaddr, unsigned int npages, int direction)
301{ 306{
302 unsigned long entry; 307 unsigned long entry;
303 dma_addr_t ret = bad_dma_address; 308 dma_addr_t ret = bad_dma_address;
304 309
305 entry = iommu_range_alloc(tbl, npages); 310 entry = iommu_range_alloc(dev, tbl, npages);
306 311
307 if (unlikely(entry == bad_dma_address)) 312 if (unlikely(entry == bad_dma_address))
308 goto error; 313 goto error;
@@ -355,7 +360,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
355 badbit, tbl, dma_addr, entry, npages); 360 badbit, tbl, dma_addr, entry, npages);
356 } 361 }
357 362
358 __clear_bit_string(tbl->it_map, entry, npages); 363 iommu_area_free(tbl->it_map, entry, npages);
359 364
360 spin_unlock_irqrestore(&tbl->it_lock, flags); 365 spin_unlock_irqrestore(&tbl->it_lock, flags);
361} 366}
@@ -439,7 +444,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
439 vaddr = (unsigned long) sg_virt(s); 444 vaddr = (unsigned long) sg_virt(s);
440 npages = num_dma_pages(vaddr, s->length); 445 npages = num_dma_pages(vaddr, s->length);
441 446
442 entry = iommu_range_alloc(tbl, npages); 447 entry = iommu_range_alloc(dev, tbl, npages);
443 if (entry == bad_dma_address) { 448 if (entry == bad_dma_address) {
444 /* makes sure unmap knows to stop */ 449 /* makes sure unmap knows to stop */
445 s->dma_length = 0; 450 s->dma_length = 0;
@@ -477,7 +482,7 @@ static dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
477 npages = num_dma_pages(uaddr, size); 482 npages = num_dma_pages(uaddr, size);
478 483
479 if (translation_enabled(tbl)) 484 if (translation_enabled(tbl))
480 dma_handle = iommu_alloc(tbl, vaddr, npages, direction); 485 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
481 else 486 else
482 dma_handle = virt_to_bus(vaddr); 487 dma_handle = virt_to_bus(vaddr);
483 488
@@ -517,7 +522,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
517 522
518 if (translation_enabled(tbl)) { 523 if (translation_enabled(tbl)) {
519 /* set up tces to cover the allocated range */ 524 /* set up tces to cover the allocated range */
520 mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); 525 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
521 if (mapping == bad_dma_address) 526 if (mapping == bad_dma_address)
522 goto free; 527 goto free;
523 528
@@ -1007,7 +1012,7 @@ static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
1007 readq(target); /* flush */ 1012 readq(target); /* flush */
1008} 1013}
1009 1014
1010static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) 1015static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1011{ 1016{
1012 unsigned char busnum = dev->bus->number; 1017 unsigned char busnum = dev->bus->number;
1013 void __iomem *bbar = tbl->bbar; 1018 void __iomem *bbar = tbl->bbar;
@@ -1023,7 +1028,7 @@ static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1023 writel(cpu_to_be32(val), target); 1028 writel(cpu_to_be32(val), target);
1024} 1029}
1025 1030
1026static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) 1031static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1027{ 1032{
1028 unsigned char busnum = dev->bus->number; 1033 unsigned char busnum = dev->bus->number;
1029 1034
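
The substantive Calgary change above is the switch from the private find_next_zero_string()/set_bit_string()/__clear_bit_string() trio to the generic iommu-helper allocator, with the allocation now constrained by the device's DMA segment boundary (boundary_size is derived from dma_get_seg_boundary(dev) and rounded up to whole pages). The sketch below is a simplified, self-contained userspace model of such a boundary-respecting bitmap allocator; it illustrates the idea only, is not lib/iommu-helper, and all names and sizes are invented for the example.

#include <stdio.h>
#include <string.h>

#define MAP_PAGES 64UL                  /* size of the toy "IOMMU" in pages */

static unsigned char map[MAP_PAGES];    /* 1 byte per page: 0 = free, 1 = used */

/* Does [index, index + nr) cross a multiple of boundary_size? */
static int spans_boundary(unsigned long index, unsigned long nr,
                          unsigned long boundary_size)
{
        return (index / boundary_size) != ((index + nr - 1) / boundary_size);
}

/*
 * Find nr consecutive free pages at or after start that do not cross a
 * boundary_size boundary, mark them used and return the offset, or ~0UL
 * on failure - the same contract the iommu_area_alloc() calls above rely on.
 */
static unsigned long area_alloc(unsigned long start, unsigned long nr,
                                unsigned long boundary_size)
{
        unsigned long i, j;

        for (i = start; i + nr <= MAP_PAGES; i++) {
                if (spans_boundary(i, nr, boundary_size))
                        continue;
                for (j = 0; j < nr; j++)
                        if (map[i + j])
                                break;
                if (j == nr) {
                        memset(&map[i], 1, nr);
                        return i;
                }
        }
        return ~0UL;
}

static void area_free(unsigned long offset, unsigned long nr)
{
        memset(&map[offset], 0, nr);    /* counterpart of iommu_area_free() */
}

int main(void)
{
        /* e.g. a device whose dma_get_seg_boundary() spans 16 pages */
        unsigned long boundary = 16;
        unsigned long a = area_alloc(0, 10, boundary);  /* fits in [0,16)      */
        unsigned long b = area_alloc(0, 10, boundary);  /* must skip up to 16  */

        printf("a=%lu b=%lu\n", a, b);
        if (a != ~0UL)
                area_free(a, 10);
        if (b != ~0UL)
                area_free(b, 10);
        return 0;
}

The second allocation cannot start at page 7 even though pages 10..15 are free, because the mapping would straddle the 16-page boundary; that is precisely the constraint the open-coded find_next_zero_string() never enforced.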
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index 5552d23d23c..a82473d192a 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -13,7 +13,6 @@
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14 14
15int iommu_merge __read_mostly = 0; 15int iommu_merge __read_mostly = 0;
16EXPORT_SYMBOL(iommu_merge);
17 16
18dma_addr_t bad_dma_address __read_mostly; 17dma_addr_t bad_dma_address __read_mostly;
19EXPORT_SYMBOL(bad_dma_address); 18EXPORT_SYMBOL(bad_dma_address);
@@ -230,7 +229,7 @@ EXPORT_SYMBOL(dma_set_mask);
230 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 229 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
231 * documentation. 230 * documentation.
232 */ 231 */
233__init int iommu_setup(char *p) 232static __init int iommu_setup(char *p)
234{ 233{
235 iommu_merge = 1; 234 iommu_merge = 1;
236 235
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 06bcba53604..65f6acb025c 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * Dynamic DMA mapping support for AMD Hammer. 2 * Dynamic DMA mapping support for AMD Hammer.
3 * 3 *
4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. 4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
5 * This allows to use PCI devices that only support 32bit addresses on systems 5 * This allows to use PCI devices that only support 32bit addresses on systems
6 * with more than 4GB. 6 * with more than 4GB.
7 * 7 *
8 * See Documentation/DMA-mapping.txt for the interface specification. 8 * See Documentation/DMA-mapping.txt for the interface specification.
9 * 9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs. 10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 * Subject to the GNU General Public License v2 only. 11 * Subject to the GNU General Public License v2 only.
12 */ 12 */
@@ -25,6 +25,7 @@
25#include <linux/bitops.h> 25#include <linux/bitops.h>
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h>
28#include <asm/atomic.h> 29#include <asm/atomic.h>
29#include <asm/io.h> 30#include <asm/io.h>
30#include <asm/mtrr.h> 31#include <asm/mtrr.h>
@@ -37,23 +38,26 @@
37#include <asm/k8.h> 38#include <asm/k8.h>
38 39
39static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 40static unsigned long iommu_bus_base; /* GART remapping area (physical) */
40static unsigned long iommu_size; /* size of remapping area bytes */ 41static unsigned long iommu_size; /* size of remapping area bytes */
41static unsigned long iommu_pages; /* .. and in pages */ 42static unsigned long iommu_pages; /* .. and in pages */
42 43
43static u32 *iommu_gatt_base; /* Remapping table */ 44static u32 *iommu_gatt_base; /* Remapping table */
44 45
45/* If this is disabled the IOMMU will use an optimized flushing strategy 46/*
46 of only flushing when an mapping is reused. With it true the GART is flushed 47 * If this is disabled the IOMMU will use an optimized flushing strategy
47 for every mapping. Problem is that doing the lazy flush seems to trigger 48 * of only flushing when an mapping is reused. With it true the GART is
48 bugs with some popular PCI cards, in particular 3ware (but has been also 49 * flushed for every mapping. Problem is that doing the lazy flush seems
49 also seen with Qlogic at least). */ 50 * to trigger bugs with some popular PCI cards, in particular 3ware (but
51 * has been also also seen with Qlogic at least).
52 */
50int iommu_fullflush = 1; 53int iommu_fullflush = 1;
51 54
52/* Allocation bitmap for the remapping area */ 55/* Allocation bitmap for the remapping area: */
53static DEFINE_SPINLOCK(iommu_bitmap_lock); 56static DEFINE_SPINLOCK(iommu_bitmap_lock);
54static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ 57/* Guarded by iommu_bitmap_lock: */
58static unsigned long *iommu_gart_bitmap;
55 59
56static u32 gart_unmapped_entry; 60static u32 gart_unmapped_entry;
57 61
58#define GPTE_VALID 1 62#define GPTE_VALID 1
59#define GPTE_COHERENT 2 63#define GPTE_COHERENT 2
@@ -61,10 +65,10 @@ static u32 gart_unmapped_entry;
61 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) 65 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
62#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) 66#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
63 67
64#define to_pages(addr,size) \ 68#define to_pages(addr, size) \
65 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) 69 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
66 70
67#define EMERGENCY_PAGES 32 /* = 128KB */ 71#define EMERGENCY_PAGES 32 /* = 128KB */
68 72
69#ifdef CONFIG_AGP 73#ifdef CONFIG_AGP
70#define AGPEXTERN extern 74#define AGPEXTERN extern
@@ -77,130 +81,159 @@ AGPEXTERN int agp_memory_reserved;
77AGPEXTERN __u32 *agp_gatt_table; 81AGPEXTERN __u32 *agp_gatt_table;
78 82
79static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 83static unsigned long next_bit; /* protected by iommu_bitmap_lock */
80static int need_flush; /* global flush state. set for each gart wrap */ 84static int need_flush; /* global flush state. set for each gart wrap */
81 85
82static unsigned long alloc_iommu(int size) 86static unsigned long alloc_iommu(struct device *dev, int size)
83{ 87{
84 unsigned long offset, flags; 88 unsigned long offset, flags;
89 unsigned long boundary_size;
90 unsigned long base_index;
91
92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
93 PAGE_SIZE) >> PAGE_SHIFT;
94 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
95 PAGE_SIZE) >> PAGE_SHIFT;
85 96
86 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
87 offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
99 size, base_index, boundary_size, 0);
88 if (offset == -1) { 100 if (offset == -1) {
89 need_flush = 1; 101 need_flush = 1;
90 offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
103 size, base_index, boundary_size, 0);
91 } 104 }
92 if (offset != -1) { 105 if (offset != -1) {
93 set_bit_string(iommu_gart_bitmap, offset, size); 106 set_bit_string(iommu_gart_bitmap, offset, size);
94 next_bit = offset+size; 107 next_bit = offset+size;
95 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
96 next_bit = 0; 109 next_bit = 0;
97 need_flush = 1; 110 need_flush = 1;
98 } 111 }
99 } 112 }
100 if (iommu_fullflush) 113 if (iommu_fullflush)
101 need_flush = 1; 114 need_flush = 1;
102 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
116
103 return offset; 117 return offset;
104} 118}
105 119
106static void free_iommu(unsigned long offset, int size) 120static void free_iommu(unsigned long offset, int size)
107{ 121{
108 unsigned long flags; 122 unsigned long flags;
123
109 spin_lock_irqsave(&iommu_bitmap_lock, flags); 124 spin_lock_irqsave(&iommu_bitmap_lock, flags);
110 __clear_bit_string(iommu_gart_bitmap, offset, size); 125 iommu_area_free(iommu_gart_bitmap, offset, size);
111 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 126 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
112} 127}
113 128
114/* 129/*
115 * Use global flush state to avoid races with multiple flushers. 130 * Use global flush state to avoid races with multiple flushers.
116 */ 131 */
117static void flush_gart(void) 132static void flush_gart(void)
118{ 133{
119 unsigned long flags; 134 unsigned long flags;
135
120 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
121 if (need_flush) { 137 if (need_flush) {
122 k8_flush_garts(); 138 k8_flush_garts();
123 need_flush = 0; 139 need_flush = 0;
124 } 140 }
125 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
126} 142}
127 143
128#ifdef CONFIG_IOMMU_LEAK 144#ifdef CONFIG_IOMMU_LEAK
129 145
130#define SET_LEAK(x) if (iommu_leak_tab) \ 146#define SET_LEAK(x) \
131 iommu_leak_tab[x] = __builtin_return_address(0); 147 do { \
132#define CLEAR_LEAK(x) if (iommu_leak_tab) \ 148 if (iommu_leak_tab) \
133 iommu_leak_tab[x] = NULL; 149 iommu_leak_tab[x] = __builtin_return_address(0);\
150 } while (0)
151
152#define CLEAR_LEAK(x) \
153 do { \
154 if (iommu_leak_tab) \
155 iommu_leak_tab[x] = NULL; \
156 } while (0)
134 157
135/* Debugging aid for drivers that don't free their IOMMU tables */ 158/* Debugging aid for drivers that don't free their IOMMU tables */
136static void **iommu_leak_tab; 159static void **iommu_leak_tab;
137static int leak_trace; 160static int leak_trace;
138static int iommu_leak_pages = 20; 161static int iommu_leak_pages = 20;
162
139static void dump_leak(void) 163static void dump_leak(void)
140{ 164{
141 int i; 165 int i;
142 static int dump; 166 static int dump;
143 if (dump || !iommu_leak_tab) return; 167
168 if (dump || !iommu_leak_tab)
169 return;
144 dump = 1; 170 dump = 1;
145 show_stack(NULL,NULL); 171 show_stack(NULL, NULL);
146 /* Very crude. dump some from the end of the table too */ 172
147 printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); 173 /* Very crude. dump some from the end of the table too */
148 for (i = 0; i < iommu_leak_pages; i+=2) { 174 printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
149 printk("%lu: ", iommu_pages-i); 175 iommu_leak_pages);
150 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); 176 for (i = 0; i < iommu_leak_pages; i += 2) {
151 printk("%c", (i+1)%2 == 0 ? '\n' : ' '); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
152 } 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0);
153 printk("\n"); 179 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
180 }
181 printk(KERN_DEBUG "\n");
154} 182}
155#else 183#else
156#define SET_LEAK(x) 184# define SET_LEAK(x)
157#define CLEAR_LEAK(x) 185# define CLEAR_LEAK(x)
158#endif 186#endif
159 187
160static void iommu_full(struct device *dev, size_t size, int dir) 188static void iommu_full(struct device *dev, size_t size, int dir)
161{ 189{
162 /* 190 /*
163 * Ran out of IOMMU space for this operation. This is very bad. 191 * Ran out of IOMMU space for this operation. This is very bad.
164 * Unfortunately the drivers cannot handle this operation properly. 192 * Unfortunately the drivers cannot handle this operation properly.
165 * Return some non mapped prereserved space in the aperture and 193 * Return some non mapped prereserved space in the aperture and
166 * let the Northbridge deal with it. This will result in garbage 194 * let the Northbridge deal with it. This will result in garbage
167 * in the IO operation. When the size exceeds the prereserved space 195 * in the IO operation. When the size exceeds the prereserved space
168 * memory corruption will occur or random memory will be DMAed 196 * memory corruption will occur or random memory will be DMAed
169 * out. Hopefully no network devices use single mappings that big. 197 * out. Hopefully no network devices use single mappings that big.
170 */ 198 */
171 199
172 printk(KERN_ERR 200 printk(KERN_ERR
173 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", 201 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
174 size, dev->bus_id); 202 size, dev->bus_id);
175 203
176 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 204 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
177 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 205 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
178 panic("PCI-DMA: Memory would be corrupted\n"); 206 panic("PCI-DMA: Memory would be corrupted\n");
179 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) 207 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
180 panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); 208 panic(KERN_ERR
181 } 209 "PCI-DMA: Random memory would be DMAed\n");
182 210 }
183#ifdef CONFIG_IOMMU_LEAK 211#ifdef CONFIG_IOMMU_LEAK
184 dump_leak(); 212 dump_leak();
185#endif 213#endif
186} 214}
187 215
188static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) 216static inline int
189{ 217need_iommu(struct device *dev, unsigned long addr, size_t size)
218{
190 u64 mask = *dev->dma_mask; 219 u64 mask = *dev->dma_mask;
191 int high = addr + size > mask; 220 int high = addr + size > mask;
192 int mmu = high; 221 int mmu = high;
193 if (force_iommu) 222
194 mmu = 1; 223 if (force_iommu)
195 return mmu; 224 mmu = 1;
225
226 return mmu;
196} 227}
197 228
198static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 229static inline int
199{ 230nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
231{
200 u64 mask = *dev->dma_mask; 232 u64 mask = *dev->dma_mask;
201 int high = addr + size > mask; 233 int high = addr + size > mask;
202 int mmu = high; 234 int mmu = high;
203 return mmu; 235
236 return mmu;
204} 237}
205 238
206/* Map a single continuous physical area into the IOMMU. 239/* Map a single continuous physical area into the IOMMU.
@@ -208,13 +241,14 @@ static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t
208 */ 241 */
209static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 242static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
210 size_t size, int dir) 243 size_t size, int dir)
211{ 244{
212 unsigned long npages = to_pages(phys_mem, size); 245 unsigned long npages = to_pages(phys_mem, size);
213 unsigned long iommu_page = alloc_iommu(npages); 246 unsigned long iommu_page = alloc_iommu(dev, npages);
214 int i; 247 int i;
248
215 if (iommu_page == -1) { 249 if (iommu_page == -1) {
216 if (!nonforced_iommu(dev, phys_mem, size)) 250 if (!nonforced_iommu(dev, phys_mem, size))
217 return phys_mem; 251 return phys_mem;
218 if (panic_on_overflow) 252 if (panic_on_overflow)
219 panic("dma_map_area overflow %lu bytes\n", size); 253 panic("dma_map_area overflow %lu bytes\n", size);
220 iommu_full(dev, size, dir); 254 iommu_full(dev, size, dir);
@@ -229,35 +263,39 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
229 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 263 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
230} 264}
231 265
232static dma_addr_t gart_map_simple(struct device *dev, char *buf, 266static dma_addr_t
233 size_t size, int dir) 267gart_map_simple(struct device *dev, char *buf, size_t size, int dir)
234{ 268{
235 dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); 269 dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
270
236 flush_gart(); 271 flush_gart();
272
237 return map; 273 return map;
238} 274}
239 275
240/* Map a single area into the IOMMU */ 276/* Map a single area into the IOMMU */
241static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) 277static dma_addr_t
278gart_map_single(struct device *dev, void *addr, size_t size, int dir)
242{ 279{
243 unsigned long phys_mem, bus; 280 unsigned long phys_mem, bus;
244 281
245 if (!dev) 282 if (!dev)
246 dev = &fallback_dev; 283 dev = &fallback_dev;
247 284
248 phys_mem = virt_to_phys(addr); 285 phys_mem = virt_to_phys(addr);
249 if (!need_iommu(dev, phys_mem, size)) 286 if (!need_iommu(dev, phys_mem, size))
250 return phys_mem; 287 return phys_mem;
251 288
252 bus = gart_map_simple(dev, addr, size, dir); 289 bus = gart_map_simple(dev, addr, size, dir);
253 return bus; 290
291 return bus;
254} 292}
255 293
256/* 294/*
257 * Free a DMA mapping. 295 * Free a DMA mapping.
258 */ 296 */
259static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, 297static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
260 size_t size, int direction) 298 size_t size, int direction)
261{ 299{
262 unsigned long iommu_page; 300 unsigned long iommu_page;
263 int npages; 301 int npages;
@@ -266,6 +304,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
266 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || 304 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
267 dma_addr >= iommu_bus_base + iommu_size) 305 dma_addr >= iommu_bus_base + iommu_size)
268 return; 306 return;
307
269 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 308 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
270 npages = to_pages(dma_addr, size); 309 npages = to_pages(dma_addr, size);
271 for (i = 0; i < npages; i++) { 310 for (i = 0; i < npages; i++) {
@@ -278,7 +317,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
278/* 317/*
279 * Wrapper for pci_unmap_single working with scatterlists. 318 * Wrapper for pci_unmap_single working with scatterlists.
280 */ 319 */
281static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) 320static void
321gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
282{ 322{
283 struct scatterlist *s; 323 struct scatterlist *s;
284 int i; 324 int i;
@@ -303,12 +343,13 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
303 343
304 for_each_sg(sg, s, nents, i) { 344 for_each_sg(sg, s, nents, i) {
305 unsigned long addr = sg_phys(s); 345 unsigned long addr = sg_phys(s);
306 if (nonforced_iommu(dev, addr, s->length)) { 346
347 if (nonforced_iommu(dev, addr, s->length)) {
307 addr = dma_map_area(dev, addr, s->length, dir); 348 addr = dma_map_area(dev, addr, s->length, dir);
308 if (addr == bad_dma_address) { 349 if (addr == bad_dma_address) {
309 if (i > 0) 350 if (i > 0)
310 gart_unmap_sg(dev, sg, i, dir); 351 gart_unmap_sg(dev, sg, i, dir);
311 nents = 0; 352 nents = 0;
312 sg[0].dma_length = 0; 353 sg[0].dma_length = 0;
313 break; 354 break;
314 } 355 }
@@ -317,15 +358,17 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
317 s->dma_length = s->length; 358 s->dma_length = s->length;
318 } 359 }
319 flush_gart(); 360 flush_gart();
361
320 return nents; 362 return nents;
321} 363}
322 364
323/* Map multiple scatterlist entries continuous into the first. */ 365/* Map multiple scatterlist entries continuous into the first. */
324static int __dma_map_cont(struct scatterlist *start, int nelems, 366static int __dma_map_cont(struct device *dev, struct scatterlist *start,
325 struct scatterlist *sout, unsigned long pages) 367 int nelems, struct scatterlist *sout,
368 unsigned long pages)
326{ 369{
327 unsigned long iommu_start = alloc_iommu(pages); 370 unsigned long iommu_start = alloc_iommu(dev, pages);
328 unsigned long iommu_page = iommu_start; 371 unsigned long iommu_page = iommu_start;
329 struct scatterlist *s; 372 struct scatterlist *s;
330 int i; 373 int i;
331 374
@@ -335,32 +378,33 @@ static int __dma_map_cont(struct scatterlist *start, int nelems,
335 for_each_sg(start, s, nelems, i) { 378 for_each_sg(start, s, nelems, i) {
336 unsigned long pages, addr; 379 unsigned long pages, addr;
337 unsigned long phys_addr = s->dma_address; 380 unsigned long phys_addr = s->dma_address;
338 381
339 BUG_ON(s != start && s->offset); 382 BUG_ON(s != start && s->offset);
340 if (s == start) { 383 if (s == start) {
341 sout->dma_address = iommu_bus_base; 384 sout->dma_address = iommu_bus_base;
342 sout->dma_address += iommu_page*PAGE_SIZE + s->offset; 385 sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
343 sout->dma_length = s->length; 386 sout->dma_length = s->length;
344 } else { 387 } else {
345 sout->dma_length += s->length; 388 sout->dma_length += s->length;
346 } 389 }
347 390
348 addr = phys_addr; 391 addr = phys_addr;
349 pages = to_pages(s->offset, s->length); 392 pages = to_pages(s->offset, s->length);
350 while (pages--) { 393 while (pages--) {
351 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 394 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
352 SET_LEAK(iommu_page); 395 SET_LEAK(iommu_page);
353 addr += PAGE_SIZE; 396 addr += PAGE_SIZE;
354 iommu_page++; 397 iommu_page++;
355 } 398 }
356 } 399 }
357 BUG_ON(iommu_page - iommu_start != pages); 400 BUG_ON(iommu_page - iommu_start != pages);
401
358 return 0; 402 return 0;
359} 403}
360 404
361static inline int dma_map_cont(struct scatterlist *start, int nelems, 405static inline int
362 struct scatterlist *sout, 406dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
363 unsigned long pages, int need) 407 struct scatterlist *sout, unsigned long pages, int need)
364{ 408{
365 if (!need) { 409 if (!need) {
366 BUG_ON(nelems != 1); 410 BUG_ON(nelems != 1);
@@ -368,24 +412,23 @@ static inline int dma_map_cont(struct scatterlist *start, int nelems,
368 sout->dma_length = start->length; 412 sout->dma_length = start->length;
369 return 0; 413 return 0;
370 } 414 }
371 return __dma_map_cont(start, nelems, sout, pages); 415 return __dma_map_cont(dev, start, nelems, sout, pages);
372} 416}
373 417
374/* 418/*
375 * DMA map all entries in a scatterlist. 419 * DMA map all entries in a scatterlist.
376 * Merge chunks that have page aligned sizes into a continuous mapping. 420 * Merge chunks that have page aligned sizes into a continuous mapping.
377 */ 421 */
378static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, 422static int
379 int dir) 423gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
380{ 424{
381 int i;
382 int out;
383 int start;
384 unsigned long pages = 0;
385 int need = 0, nextneed;
386 struct scatterlist *s, *ps, *start_sg, *sgmap; 425 struct scatterlist *s, *ps, *start_sg, *sgmap;
426 int need = 0, nextneed, i, out, start;
427 unsigned long pages = 0;
428 unsigned int seg_size;
429 unsigned int max_seg_size;
387 430
388 if (nents == 0) 431 if (nents == 0)
389 return 0; 432 return 0;
390 433
391 if (!dev) 434 if (!dev)
@@ -394,24 +437,32 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
394 out = 0; 437 out = 0;
395 start = 0; 438 start = 0;
396 start_sg = sgmap = sg; 439 start_sg = sgmap = sg;
440 seg_size = 0;
441 max_seg_size = dma_get_max_seg_size(dev);
397 ps = NULL; /* shut up gcc */ 442 ps = NULL; /* shut up gcc */
398 for_each_sg(sg, s, nents, i) { 443 for_each_sg(sg, s, nents, i) {
399 dma_addr_t addr = sg_phys(s); 444 dma_addr_t addr = sg_phys(s);
445
400 s->dma_address = addr; 446 s->dma_address = addr;
401 BUG_ON(s->length == 0); 447 BUG_ON(s->length == 0);
402 448
403 nextneed = need_iommu(dev, addr, s->length); 449 nextneed = need_iommu(dev, addr, s->length);
404 450
405 /* Handle the previous not yet processed entries */ 451 /* Handle the previous not yet processed entries */
406 if (i > start) { 452 if (i > start) {
407 /* Can only merge when the last chunk ends on a page 453 /*
408 boundary and the new one doesn't have an offset. */ 454 * Can only merge when the last chunk ends on a
455 * page boundary and the new one doesn't have an
456 * offset.
457 */
409 if (!iommu_merge || !nextneed || !need || s->offset || 458 if (!iommu_merge || !nextneed || !need || s->offset ||
459 (s->length + seg_size > max_seg_size) ||
410 (ps->offset + ps->length) % PAGE_SIZE) { 460 (ps->offset + ps->length) % PAGE_SIZE) {
411 if (dma_map_cont(start_sg, i - start, sgmap, 461 if (dma_map_cont(dev, start_sg, i - start,
412 pages, need) < 0) 462 sgmap, pages, need) < 0)
413 goto error; 463 goto error;
414 out++; 464 out++;
465 seg_size = 0;
415 sgmap = sg_next(sgmap); 466 sgmap = sg_next(sgmap);
416 pages = 0; 467 pages = 0;
417 start = i; 468 start = i;
@@ -419,11 +470,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
419 } 470 }
420 } 471 }
421 472
473 seg_size += s->length;
422 need = nextneed; 474 need = nextneed;
423 pages += to_pages(s->offset, s->length); 475 pages += to_pages(s->offset, s->length);
424 ps = s; 476 ps = s;
425 } 477 }
426 if (dma_map_cont(start_sg, i - start, sgmap, pages, need) < 0) 478 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
427 goto error; 479 goto error;
428 out++; 480 out++;
429 flush_gart(); 481 flush_gart();
@@ -436,6 +488,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
436error: 488error:
437 flush_gart(); 489 flush_gart();
438 gart_unmap_sg(dev, sg, out, dir); 490 gart_unmap_sg(dev, sg, out, dir);
491
439 /* When it was forced or merged try again in a dumb way */ 492 /* When it was forced or merged try again in a dumb way */
440 if (force_iommu || iommu_merge) { 493 if (force_iommu || iommu_merge) {
441 out = dma_map_sg_nonforce(dev, sg, nents, dir); 494 out = dma_map_sg_nonforce(dev, sg, nents, dir);
@@ -444,64 +497,68 @@ error:
444 } 497 }
445 if (panic_on_overflow) 498 if (panic_on_overflow)
446 panic("dma_map_sg: overflow on %lu pages\n", pages); 499 panic("dma_map_sg: overflow on %lu pages\n", pages);
500
447 iommu_full(dev, pages << PAGE_SHIFT, dir); 501 iommu_full(dev, pages << PAGE_SHIFT, dir);
448 for_each_sg(sg, s, nents, i) 502 for_each_sg(sg, s, nents, i)
449 s->dma_address = bad_dma_address; 503 s->dma_address = bad_dma_address;
450 return 0; 504 return 0;
451} 505}
452 506
453static int no_agp; 507static int no_agp;
454 508
455static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 509static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
456{ 510{
457 unsigned long a; 511 unsigned long a;
458 if (!iommu_size) { 512
459 iommu_size = aper_size; 513 if (!iommu_size) {
460 if (!no_agp) 514 iommu_size = aper_size;
461 iommu_size /= 2; 515 if (!no_agp)
462 } 516 iommu_size /= 2;
463 517 }
464 a = aper + iommu_size; 518
465 iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; 519 a = aper + iommu_size;
466 520 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
467 if (iommu_size < 64*1024*1024) 521
522 if (iommu_size < 64*1024*1024) {
468 printk(KERN_WARNING 523 printk(KERN_WARNING
469 "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); 524 "PCI-DMA: Warning: Small IOMMU %luMB."
470 525 " Consider increasing the AGP aperture in BIOS\n",
526 iommu_size >> 20);
527 }
528
471 return iommu_size; 529 return iommu_size;
472} 530}
473 531
474static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) 532static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
475{ 533{
476 unsigned aper_size = 0, aper_base_32; 534 unsigned aper_size = 0, aper_base_32, aper_order;
477 u64 aper_base; 535 u64 aper_base;
478 unsigned aper_order;
479 536
480 pci_read_config_dword(dev, 0x94, &aper_base_32); 537 pci_read_config_dword(dev, 0x94, &aper_base_32);
481 pci_read_config_dword(dev, 0x90, &aper_order); 538 pci_read_config_dword(dev, 0x90, &aper_order);
482 aper_order = (aper_order >> 1) & 7; 539 aper_order = (aper_order >> 1) & 7;
483 540
484 aper_base = aper_base_32 & 0x7fff; 541 aper_base = aper_base_32 & 0x7fff;
485 aper_base <<= 25; 542 aper_base <<= 25;
486 543
487 aper_size = (32 * 1024 * 1024) << aper_order; 544 aper_size = (32 * 1024 * 1024) << aper_order;
488 if (aper_base + aper_size > 0x100000000UL || !aper_size) 545 if (aper_base + aper_size > 0x100000000UL || !aper_size)
489 aper_base = 0; 546 aper_base = 0;
490 547
491 *size = aper_size; 548 *size = aper_size;
492 return aper_base; 549 return aper_base;
493} 550}
494 551
495/* 552/*
496 * Private Northbridge GATT initialization in case we cannot use the 553 * Private Northbridge GATT initialization in case we cannot use the
497 * AGP driver for some reason. 554 * AGP driver for some reason.
498 */ 555 */
499static __init int init_k8_gatt(struct agp_kern_info *info) 556static __init int init_k8_gatt(struct agp_kern_info *info)
500{ 557{
558 unsigned aper_size, gatt_size, new_aper_size;
559 unsigned aper_base, new_aper_base;
501 struct pci_dev *dev; 560 struct pci_dev *dev;
502 void *gatt; 561 void *gatt;
503 unsigned aper_base, new_aper_base;
504 unsigned aper_size, gatt_size, new_aper_size;
505 int i; 562 int i;
506 563
507 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 564 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
@@ -509,75 +566,75 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
509 dev = NULL; 566 dev = NULL;
510 for (i = 0; i < num_k8_northbridges; i++) { 567 for (i = 0; i < num_k8_northbridges; i++) {
511 dev = k8_northbridges[i]; 568 dev = k8_northbridges[i];
512 new_aper_base = read_aperture(dev, &new_aper_size); 569 new_aper_base = read_aperture(dev, &new_aper_size);
513 if (!new_aper_base) 570 if (!new_aper_base)
514 goto nommu; 571 goto nommu;
515 572
516 if (!aper_base) { 573 if (!aper_base) {
517 aper_size = new_aper_size; 574 aper_size = new_aper_size;
518 aper_base = new_aper_base; 575 aper_base = new_aper_base;
519 } 576 }
520 if (aper_size != new_aper_size || aper_base != new_aper_base) 577 if (aper_size != new_aper_size || aper_base != new_aper_base)
521 goto nommu; 578 goto nommu;
522 } 579 }
523 if (!aper_base) 580 if (!aper_base)
524 goto nommu; 581 goto nommu;
525 info->aper_base = aper_base; 582 info->aper_base = aper_base;
526 info->aper_size = aper_size>>20; 583 info->aper_size = aper_size >> 20;
527 584
528 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 585 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
529 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 586 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
530 if (!gatt) 587 if (!gatt)
531 panic("Cannot allocate GATT table"); 588 panic("Cannot allocate GATT table");
532 if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) 589 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
533 panic("Could not set GART PTEs to uncacheable pages"); 590 panic("Could not set GART PTEs to uncacheable pages");
534 global_flush_tlb();
535 591
536 memset(gatt, 0, gatt_size); 592 memset(gatt, 0, gatt_size);
537 agp_gatt_table = gatt; 593 agp_gatt_table = gatt;
538 594
539 for (i = 0; i < num_k8_northbridges; i++) { 595 for (i = 0; i < num_k8_northbridges; i++) {
540 u32 ctl; 596 u32 gatt_reg;
541 u32 gatt_reg; 597 u32 ctl;
542 598
543 dev = k8_northbridges[i]; 599 dev = k8_northbridges[i];
544 gatt_reg = __pa(gatt) >> 12; 600 gatt_reg = __pa(gatt) >> 12;
545 gatt_reg <<= 4; 601 gatt_reg <<= 4;
546 pci_write_config_dword(dev, 0x98, gatt_reg); 602 pci_write_config_dword(dev, 0x98, gatt_reg);
547 pci_read_config_dword(dev, 0x90, &ctl); 603 pci_read_config_dword(dev, 0x90, &ctl);
548 604
549 ctl |= 1; 605 ctl |= 1;
550 ctl &= ~((1<<4) | (1<<5)); 606 ctl &= ~((1<<4) | (1<<5));
551 607
552 pci_write_config_dword(dev, 0x90, ctl); 608 pci_write_config_dword(dev, 0x90, ctl);
553 } 609 }
554 flush_gart(); 610 flush_gart();
555 611
556 printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); 612 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
613 aper_base, aper_size>>10);
557 return 0; 614 return 0;
558 615
559 nommu: 616 nommu:
560 /* Should not happen anymore */ 617 /* Should not happen anymore */
561 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 618 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
562 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); 619 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
563 return -1; 620 return -1;
564} 621}
565 622
566extern int agp_amd64_init(void); 623extern int agp_amd64_init(void);
567 624
568static const struct dma_mapping_ops gart_dma_ops = { 625static const struct dma_mapping_ops gart_dma_ops = {
569 .mapping_error = NULL, 626 .mapping_error = NULL,
570 .map_single = gart_map_single, 627 .map_single = gart_map_single,
571 .map_simple = gart_map_simple, 628 .map_simple = gart_map_simple,
572 .unmap_single = gart_unmap_single, 629 .unmap_single = gart_unmap_single,
573 .sync_single_for_cpu = NULL, 630 .sync_single_for_cpu = NULL,
574 .sync_single_for_device = NULL, 631 .sync_single_for_device = NULL,
575 .sync_single_range_for_cpu = NULL, 632 .sync_single_range_for_cpu = NULL,
576 .sync_single_range_for_device = NULL, 633 .sync_single_range_for_device = NULL,
577 .sync_sg_for_cpu = NULL, 634 .sync_sg_for_cpu = NULL,
578 .sync_sg_for_device = NULL, 635 .sync_sg_for_device = NULL,
579 .map_sg = gart_map_sg, 636 .map_sg = gart_map_sg,
580 .unmap_sg = gart_unmap_sg, 637 .unmap_sg = gart_unmap_sg,
581}; 638};
582 639
583void gart_iommu_shutdown(void) 640void gart_iommu_shutdown(void)
@@ -588,23 +645,23 @@ void gart_iommu_shutdown(void)
588 if (no_agp && (dma_ops != &gart_dma_ops)) 645 if (no_agp && (dma_ops != &gart_dma_ops))
589 return; 646 return;
590 647
591 for (i = 0; i < num_k8_northbridges; i++) { 648 for (i = 0; i < num_k8_northbridges; i++) {
592 u32 ctl; 649 u32 ctl;
593 650
594 dev = k8_northbridges[i]; 651 dev = k8_northbridges[i];
595 pci_read_config_dword(dev, 0x90, &ctl); 652 pci_read_config_dword(dev, 0x90, &ctl);
596 653
597 ctl &= ~1; 654 ctl &= ~1;
598 655
599 pci_write_config_dword(dev, 0x90, ctl); 656 pci_write_config_dword(dev, 0x90, ctl);
600 } 657 }
601} 658}
602 659
603void __init gart_iommu_init(void) 660void __init gart_iommu_init(void)
604{ 661{
605 struct agp_kern_info info; 662 struct agp_kern_info info;
606 unsigned long aper_size;
607 unsigned long iommu_start; 663 unsigned long iommu_start;
664 unsigned long aper_size;
608 unsigned long scratch; 665 unsigned long scratch;
609 long i; 666 long i;
610 667
@@ -614,14 +671,14 @@ void __init gart_iommu_init(void)
614 } 671 }
615 672
616#ifndef CONFIG_AGP_AMD64 673#ifndef CONFIG_AGP_AMD64
617 no_agp = 1; 674 no_agp = 1;
618#else 675#else
619 /* Makefile puts PCI initialization via subsys_initcall first. */ 676 /* Makefile puts PCI initialization via subsys_initcall first. */
620 /* Add other K8 AGP bridge drivers here */ 677 /* Add other K8 AGP bridge drivers here */
621 no_agp = no_agp || 678 no_agp = no_agp ||
622 (agp_amd64_init() < 0) || 679 (agp_amd64_init() < 0) ||
623 (agp_copy_info(agp_bridge, &info) < 0); 680 (agp_copy_info(agp_bridge, &info) < 0);
624#endif 681#endif
625 682
626 if (swiotlb) 683 if (swiotlb)
627 return; 684 return;
@@ -643,77 +700,79 @@ void __init gart_iommu_init(void)
643 } 700 }
644 701
645 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 702 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
646 aper_size = info.aper_size * 1024 * 1024; 703 aper_size = info.aper_size * 1024 * 1024;
647 iommu_size = check_iommu_size(info.aper_base, aper_size); 704 iommu_size = check_iommu_size(info.aper_base, aper_size);
648 iommu_pages = iommu_size >> PAGE_SHIFT; 705 iommu_pages = iommu_size >> PAGE_SHIFT;
649 706
650 iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, 707 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL,
651 get_order(iommu_pages/8)); 708 get_order(iommu_pages/8));
652 if (!iommu_gart_bitmap) 709 if (!iommu_gart_bitmap)
653 panic("Cannot allocate iommu bitmap\n"); 710 panic("Cannot allocate iommu bitmap\n");
654 memset(iommu_gart_bitmap, 0, iommu_pages/8); 711 memset(iommu_gart_bitmap, 0, iommu_pages/8);
655 712
656#ifdef CONFIG_IOMMU_LEAK 713#ifdef CONFIG_IOMMU_LEAK
657 if (leak_trace) { 714 if (leak_trace) {
658 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 715 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
659 get_order(iommu_pages*sizeof(void *))); 716 get_order(iommu_pages*sizeof(void *)));
660 if (iommu_leak_tab) 717 if (iommu_leak_tab)
661 memset(iommu_leak_tab, 0, iommu_pages * 8); 718 memset(iommu_leak_tab, 0, iommu_pages * 8);
662 else 719 else
663 printk("PCI-DMA: Cannot allocate leak trace area\n"); 720 printk(KERN_DEBUG
664 } 721 "PCI-DMA: Cannot allocate leak trace area\n");
722 }
665#endif 723#endif
666 724
667 /* 725 /*
668 * Out of IOMMU space handling. 726 * Out of IOMMU space handling.
669 * Reserve some invalid pages at the beginning of the GART. 727 * Reserve some invalid pages at the beginning of the GART.
670 */ 728 */
671 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 729 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
672 730
673 agp_memory_reserved = iommu_size; 731 agp_memory_reserved = iommu_size;
674 printk(KERN_INFO 732 printk(KERN_INFO
675 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", 733 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
676 iommu_size>>20); 734 iommu_size >> 20);
677 735
678 iommu_start = aper_size - iommu_size; 736 iommu_start = aper_size - iommu_size;
679 iommu_bus_base = info.aper_base + iommu_start; 737 iommu_bus_base = info.aper_base + iommu_start;
680 bad_dma_address = iommu_bus_base; 738 bad_dma_address = iommu_bus_base;
681 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 739 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
682 740
683 /* 741 /*
684 * Unmap the IOMMU part of the GART. The alias of the page is 742 * Unmap the IOMMU part of the GART. The alias of the page is
685 * always mapped with cache enabled and there is no full cache 743 * always mapped with cache enabled and there is no full cache
686 * coherency across the GART remapping. The unmapping avoids 744 * coherency across the GART remapping. The unmapping avoids
687 * automatic prefetches from the CPU allocating cache lines in 745 * automatic prefetches from the CPU allocating cache lines in
688 * there. All CPU accesses are done via the direct mapping to 746 * there. All CPU accesses are done via the direct mapping to
689 * the backing memory. The GART address is only used by PCI 747 * the backing memory. The GART address is only used by PCI
690 * devices. 748 * devices.
691 */ 749 */
692 clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); 750 set_memory_np((unsigned long)__va(iommu_bus_base),
751 iommu_size >> PAGE_SHIFT);
693 752
694 /* 753 /*
695 * Try to workaround a bug (thanks to BenH) 754 * Try to workaround a bug (thanks to BenH)
696 * Set unmapped entries to a scratch page instead of 0. 755 * Set unmapped entries to a scratch page instead of 0.
697 * Any prefetches that hit unmapped entries won't get an bus abort 756 * Any prefetches that hit unmapped entries won't get an bus abort
698 * then. 757 * then.
699 */ 758 */
700 scratch = get_zeroed_page(GFP_KERNEL); 759 scratch = get_zeroed_page(GFP_KERNEL);
701 if (!scratch) 760 if (!scratch)
702 panic("Cannot allocate iommu scratch page"); 761 panic("Cannot allocate iommu scratch page");
703 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); 762 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
704 for (i = EMERGENCY_PAGES; i < iommu_pages; i++) 763 for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
705 iommu_gatt_base[i] = gart_unmapped_entry; 764 iommu_gatt_base[i] = gart_unmapped_entry;
706 765
707 flush_gart(); 766 flush_gart();
708 dma_ops = &gart_dma_ops; 767 dma_ops = &gart_dma_ops;
709} 768}
710 769
711void __init gart_parse_options(char *p) 770void __init gart_parse_options(char *p)
712{ 771{
713 int arg; 772 int arg;
714 773
715#ifdef CONFIG_IOMMU_LEAK 774#ifdef CONFIG_IOMMU_LEAK
716 if (!strncmp(p,"leak",4)) { 775 if (!strncmp(p, "leak", 4)) {
717 leak_trace = 1; 776 leak_trace = 1;
718 p += 4; 777 p += 4;
719 if (*p == '=') ++p; 778 if (*p == '=') ++p;
@@ -723,18 +782,18 @@ void __init gart_parse_options(char *p)
723#endif 782#endif
724 if (isdigit(*p) && get_option(&p, &arg)) 783 if (isdigit(*p) && get_option(&p, &arg))
725 iommu_size = arg; 784 iommu_size = arg;
726 if (!strncmp(p, "fullflush",8)) 785 if (!strncmp(p, "fullflush", 8))
727 iommu_fullflush = 1; 786 iommu_fullflush = 1;
728 if (!strncmp(p, "nofullflush",11)) 787 if (!strncmp(p, "nofullflush", 11))
729 iommu_fullflush = 0; 788 iommu_fullflush = 0;
730 if (!strncmp(p,"noagp",5)) 789 if (!strncmp(p, "noagp", 5))
731 no_agp = 1; 790 no_agp = 1;
732 if (!strncmp(p, "noaperture",10)) 791 if (!strncmp(p, "noaperture", 10))
733 fix_aperture = 0; 792 fix_aperture = 0;
734 /* duplicated from pci-dma.c */ 793 /* duplicated from pci-dma.c */
735 if (!strncmp(p,"force",5)) 794 if (!strncmp(p, "force", 5))
736 gart_iommu_aperture_allowed = 1; 795 gart_iommu_aperture_allowed = 1;
737 if (!strncmp(p,"allowed",7)) 796 if (!strncmp(p, "allowed", 7))
738 gart_iommu_aperture_allowed = 1; 797 gart_iommu_aperture_allowed = 1;
739 if (!strncmp(p, "memaper", 7)) { 798 if (!strncmp(p, "memaper", 7)) {
740 fallback_aper_force = 1; 799 fallback_aper_force = 1;
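
Besides the same conversion to iommu_area_alloc()/iommu_area_free(), the GART path above now passes the device down so allocations respect dma_get_seg_boundary(), and gart_map_sg() tracks seg_size so that merged scatterlist segments never exceed dma_get_max_seg_size(dev). A compressed userspace model of that merge decision follows; the struct and helper names are simplified stand-ins, not the kernel's scatterlist types.

#include <stdio.h>

#define PAGE_SIZE 4096UL

struct seg {
        unsigned long offset;   /* offset into its first page */
        unsigned long length;
};

/*
 * Decide whether entry s may be merged into the mapping that currently ends
 * with entry ps and has accumulated seg_size bytes, mirroring the condition
 * added to gart_map_sg() (which splits the mapping when this fails).
 */
static int can_merge(const struct seg *ps, const struct seg *s,
                     unsigned long seg_size, unsigned long max_seg_size,
                     int iommu_merge, int need, int nextneed)
{
        if (!iommu_merge || !nextneed || !need)
                return 0;
        if (s->offset)                                  /* new chunk not page aligned  */
                return 0;
        if (s->length + seg_size > max_seg_size)        /* would exceed device limit   */
                return 0;
        if ((ps->offset + ps->length) % PAGE_SIZE)      /* previous chunk ends mid-page */
                return 0;
        return 1;
}

int main(void)
{
        struct seg prev = { 0, PAGE_SIZE };             /* ends on a page boundary */
        struct seg next = { 0, 3 * PAGE_SIZE };

        /* e.g. a device advertising a 64 KB maximum segment size */
        printf("merge? %d\n", can_merge(&prev, &next, PAGE_SIZE, 65536, 1, 1, 1));
        printf("merge? %d\n", can_merge(&prev, &next, 63 * 1024, 65536, 1, 1, 1));
        return 0;
}

The second call refuses to merge because 63 KB already accumulated plus a 12 KB chunk would exceed the 64 KB limit; before this patch the length check simply did not exist, so devices with small maximum segment sizes could be handed oversized merged mappings.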
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 102866d729a..82a0a674a00 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -10,7 +10,6 @@
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
12int swiotlb __read_mostly; 12int swiotlb __read_mostly;
13EXPORT_SYMBOL(swiotlb);
14 13
15const struct dma_mapping_ops swiotlb_dma_ops = { 14const struct dma_mapping_ops swiotlb_dma_ops = {
16 .mapping_error = swiotlb_dma_mapping_error, 15 .mapping_error = swiotlb_dma_mapping_error,
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
index ae8f91214f1..b112406f199 100644
--- a/arch/x86/kernel/pmtimer_64.c
+++ b/arch/x86/kernel/pmtimer_64.c
@@ -19,13 +19,13 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/cpumask.h> 21#include <linux/cpumask.h>
22#include <linux/acpi_pmtmr.h>
23
22#include <asm/io.h> 24#include <asm/io.h>
23#include <asm/proto.h> 25#include <asm/proto.h>
24#include <asm/msr.h> 26#include <asm/msr.h>
25#include <asm/vsyscall.h> 27#include <asm/vsyscall.h>
26 28
27#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
28
29static inline u32 cyc2us(u32 cycles) 29static inline u32 cyc2us(u32 cycles)
30{ 30{
31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond. 31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
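
The pmtimer change only replaces the file-local ACPI_PM_MASK with the shared definition from <linux/acpi_pmtmr.h>; the conversion itself is untouched. For reference, the PM timer ticks at 3.579545 MHz, so converting cycles to microseconds means dividing by 3.579545, and a common fixed-point approximation is (cycles * 286) >> 10, which is about 0.024% low. Whether cyc2us() uses exactly this constant is not visible in the hunk above, so treat the snippet as a worked example of the arithmetic only.

#include <stdio.h>

/* Fixed-point approximation of cycles / 3.579545: 1/3.579545 ~= 286/1024. */
static unsigned int cyc2us_approx(unsigned int cycles)
{
        return (cycles * 286U) >> 10;
}

int main(void)
{
        /* 3,579,545 PM-timer cycles are exactly one second = 1,000,000 us. */
        printf("%u\n", cyc2us_approx(3579545));  /* prints 999755, ~0.024% low */
        return 0;
}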
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 9663c2a7483..dabdbeff1f7 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,6 +55,7 @@
55 55
56#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
57#include <asm/cpu.h> 57#include <asm/cpu.h>
58#include <asm/kdebug.h>
58 59
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60 61
@@ -74,7 +75,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
74 */ 75 */
75unsigned long thread_saved_pc(struct task_struct *tsk) 76unsigned long thread_saved_pc(struct task_struct *tsk)
76{ 77{
77 return ((unsigned long *)tsk->thread.esp)[3]; 78 return ((unsigned long *)tsk->thread.sp)[3];
78} 79}
79 80
80/* 81/*
@@ -113,10 +114,19 @@ void default_idle(void)
113 smp_mb(); 114 smp_mb();
114 115
115 local_irq_disable(); 116 local_irq_disable();
116 if (!need_resched()) 117 if (!need_resched()) {
118 ktime_t t0, t1;
119 u64 t0n, t1n;
120
121 t0 = ktime_get();
122 t0n = ktime_to_ns(t0);
117 safe_halt(); /* enables interrupts racelessly */ 123 safe_halt(); /* enables interrupts racelessly */
118 else 124 local_irq_disable();
119 local_irq_enable(); 125 t1 = ktime_get();
126 t1n = ktime_to_ns(t1);
127 sched_clock_idle_wakeup_event(t1n - t0n);
128 }
129 local_irq_enable();
120 current_thread_info()->status |= TS_POLLING; 130 current_thread_info()->status |= TS_POLLING;
121 } else { 131 } else {
122 /* loop is done by the caller */ 132 /* loop is done by the caller */
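
The default_idle() change above brackets safe_halt() with ktime_get() and feeds the halted interval, in nanoseconds, to sched_clock_idle_wakeup_event() so the scheduler clock can account for the time spent halted. A rough userspace analogue of that bookkeeping, timing a blocking wait against CLOCK_MONOTONIC, is sketched below; the sleep merely stands in for the halt and nothing here is kernel API.

#include <stdio.h>
#include <time.h>

/* Nanoseconds elapsed across a blocking "idle" period, shaped like the
 * t0/t1 ktime bookkeeping added to default_idle(). */
static long long idle_block_ns(void)
{
        struct timespec t0, t1;
        struct timespec delay = { 0, 10 * 1000 * 1000 };        /* 10 ms */

        clock_gettime(CLOCK_MONOTONIC, &t0);    /* t0 = ktime_get()        */
        nanosleep(&delay, NULL);                /* stands in for safe_halt() */
        clock_gettime(CLOCK_MONOTONIC, &t1);    /* t1 = ktime_get()        */

        return (t1.tv_sec - t0.tv_sec) * 1000000000LL +
               (t1.tv_nsec - t0.tv_nsec);
}

int main(void)
{
        /* The kernel passes this delta to sched_clock_idle_wakeup_event(). */
        printf("idle for ~%lld ns\n", idle_block_ns());
        return 0;
}

Note that the patch also re-disables interrupts after safe_halt() before reading the second timestamp, so the measured window covers only the halted period, not the interrupt handler that woke the CPU.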
@@ -132,7 +142,7 @@ EXPORT_SYMBOL(default_idle);
132 * to poll the ->work.need_resched flag instead of waiting for the 142 * to poll the ->work.need_resched flag instead of waiting for the
133 * cross-CPU IPI to arrive. Use this option with caution. 143 * cross-CPU IPI to arrive. Use this option with caution.
134 */ 144 */
135static void poll_idle (void) 145static void poll_idle(void)
136{ 146{
137 cpu_relax(); 147 cpu_relax();
138} 148}
@@ -188,6 +198,9 @@ void cpu_idle(void)
188 rmb(); 198 rmb();
189 idle = pm_idle; 199 idle = pm_idle;
190 200
201 if (rcu_pending(cpu))
202 rcu_check_callbacks(cpu, 0);
203
191 if (!idle) 204 if (!idle)
192 idle = default_idle; 205 idle = default_idle;
193 206
@@ -204,6 +217,10 @@ void cpu_idle(void)
204 } 217 }
205} 218}
206 219
220static void do_nothing(void *unused)
221{
222}
223
207void cpu_idle_wait(void) 224void cpu_idle_wait(void)
208{ 225{
209 unsigned int cpu, this_cpu = get_cpu(); 226 unsigned int cpu, this_cpu = get_cpu();
@@ -228,6 +245,13 @@ void cpu_idle_wait(void)
228 cpu_clear(cpu, map); 245 cpu_clear(cpu, map);
229 } 246 }
230 cpus_and(map, map, cpu_online_map); 247 cpus_and(map, map, cpu_online_map);
248 /*
249 * We waited 1 sec, if a CPU still did not call idle
250 * it may be because it is in idle and not waking up
251 * because it has nothing to do.
252 * Give all the remaining CPUS a kick.
253 */
254 smp_call_function_mask(map, do_nothing, NULL, 0);
231 } while (!cpus_empty(map)); 255 } while (!cpus_empty(map));
232 256
233 set_cpus_allowed(current, tmp); 257 set_cpus_allowed(current, tmp);
@@ -244,13 +268,13 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
244 * New with Core Duo processors, MWAIT can take some hints based on CPU 268 * New with Core Duo processors, MWAIT can take some hints based on CPU
245 * capability. 269 * capability.
246 */ 270 */
247void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) 271void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
248{ 272{
249 if (!need_resched()) { 273 if (!need_resched()) {
250 __monitor((void *)&current_thread_info()->flags, 0, 0); 274 __monitor((void *)&current_thread_info()->flags, 0, 0);
251 smp_mb(); 275 smp_mb();
252 if (!need_resched()) 276 if (!need_resched())
253 __mwait(eax, ecx); 277 __mwait(ax, cx);
254 } 278 }
255} 279}
256 280
@@ -261,19 +285,37 @@ static void mwait_idle(void)
261 mwait_idle_with_hints(0, 0); 285 mwait_idle_with_hints(0, 0);
262} 286}
263 287
288static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
289{
290 if (force_mwait)
291 return 1;
292 /* Any C1 states supported? */
293 return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
294}
295
264void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 296void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
265{ 297{
266 if (cpu_has(c, X86_FEATURE_MWAIT)) { 298 static int selected;
267 printk("monitor/mwait feature present.\n"); 299
300 if (selected)
301 return;
302#ifdef CONFIG_X86_SMP
303 if (pm_idle == poll_idle && smp_num_siblings > 1) {
304 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
305 " performance may degrade.\n");
306 }
307#endif
308 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
268 /* 309 /*
269 * Skip, if setup has overridden idle. 310 * Skip, if setup has overridden idle.
270 * One CPU supports mwait => All CPUs supports mwait 311 * One CPU supports mwait => All CPUs supports mwait
271 */ 312 */
272 if (!pm_idle) { 313 if (!pm_idle) {
273 printk("using mwait in idle threads.\n"); 314 printk(KERN_INFO "using mwait in idle threads.\n");
274 pm_idle = mwait_idle; 315 pm_idle = mwait_idle;
275 } 316 }
276 } 317 }
318 selected = 1;
277} 319}
278 320
279static int __init idle_setup(char *str) 321static int __init idle_setup(char *str)
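
The new mwait_usable() helper above gates MWAIT-based idle on CPUID leaf 5 reporting at least one C1 sub-state in EDX bits 4..7, unless force_mwait overrides it. The same bits can be inspected from userspace with GCC's <cpuid.h>; the snippet mirrors the check in the patch and assumes a GCC/Clang toolchain.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int c1_substates;

        /* CPUID leaf 5 (MONITOR/MWAIT); fails on CPUs without that leaf. */
        if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx)) {
                puts("CPUID leaf 5 not supported");
                return 1;
        }

        /* Same test as mwait_usable(): EDX bits 4..7 = C1 MWAIT sub-states. */
        c1_substates = (edx >> 4) & 0xf;
        printf("C1 MWAIT sub-states: %u -> %s\n", c1_substates,
               c1_substates > 0 ? "mwait idle usable" : "fall back to halt");
        return 0;
}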
@@ -281,10 +323,6 @@ static int __init idle_setup(char *str)
281 if (!strcmp(str, "poll")) { 323 if (!strcmp(str, "poll")) {
282 printk("using polling idle threads.\n"); 324 printk("using polling idle threads.\n");
283 pm_idle = poll_idle; 325 pm_idle = poll_idle;
284#ifdef CONFIG_X86_SMP
285 if (smp_num_siblings > 1)
286 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
287#endif
288 } else if (!strcmp(str, "mwait")) 326 } else if (!strcmp(str, "mwait"))
289 force_mwait = 1; 327 force_mwait = 1;
290 else 328 else
@@ -299,15 +337,15 @@ void __show_registers(struct pt_regs *regs, int all)
299{ 337{
300 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 338 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
301 unsigned long d0, d1, d2, d3, d6, d7; 339 unsigned long d0, d1, d2, d3, d6, d7;
302 unsigned long esp; 340 unsigned long sp;
303 unsigned short ss, gs; 341 unsigned short ss, gs;
304 342
305 if (user_mode_vm(regs)) { 343 if (user_mode_vm(regs)) {
306 esp = regs->esp; 344 sp = regs->sp;
307 ss = regs->xss & 0xffff; 345 ss = regs->ss & 0xffff;
308 savesegment(gs, gs); 346 savesegment(gs, gs);
309 } else { 347 } else {
310 esp = (unsigned long) (&regs->esp); 348 sp = (unsigned long) (&regs->sp);
311 savesegment(ss, ss); 349 savesegment(ss, ss);
312 savesegment(gs, gs); 350 savesegment(gs, gs);
313 } 351 }
@@ -320,17 +358,17 @@ void __show_registers(struct pt_regs *regs, int all)
320 init_utsname()->version); 358 init_utsname()->version);
321 359
322 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 360 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
323 0xffff & regs->xcs, regs->eip, regs->eflags, 361 0xffff & regs->cs, regs->ip, regs->flags,
324 smp_processor_id()); 362 smp_processor_id());
325 print_symbol("EIP is at %s\n", regs->eip); 363 print_symbol("EIP is at %s\n", regs->ip);
326 364
327 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 365 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
328 regs->eax, regs->ebx, regs->ecx, regs->edx); 366 regs->ax, regs->bx, regs->cx, regs->dx);
329 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 367 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
330 regs->esi, regs->edi, regs->ebp, esp); 368 regs->si, regs->di, regs->bp, sp);
331 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 369 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
332 regs->xds & 0xffff, regs->xes & 0xffff, 370 regs->ds & 0xffff, regs->es & 0xffff,
333 regs->xfs & 0xffff, gs, ss); 371 regs->fs & 0xffff, gs, ss);
334 372
335 if (!all) 373 if (!all)
336 return; 374 return;
@@ -358,12 +396,12 @@ void __show_registers(struct pt_regs *regs, int all)
358void show_regs(struct pt_regs *regs) 396void show_regs(struct pt_regs *regs)
359{ 397{
360 __show_registers(regs, 1); 398 __show_registers(regs, 1);
361 show_trace(NULL, regs, &regs->esp); 399 show_trace(NULL, regs, &regs->sp, regs->bp);
362} 400}
363 401
364/* 402/*
365 * This gets run with %ebx containing the 403 * This gets run with %bx containing the
366 * function to call, and %edx containing 404 * function to call, and %dx containing
367 * the "args". 405 * the "args".
368 */ 406 */
369extern void kernel_thread_helper(void); 407extern void kernel_thread_helper(void);
@@ -377,16 +415,16 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
377 415
378 memset(&regs, 0, sizeof(regs)); 416 memset(&regs, 0, sizeof(regs));
379 417
380 regs.ebx = (unsigned long) fn; 418 regs.bx = (unsigned long) fn;
381 regs.edx = (unsigned long) arg; 419 regs.dx = (unsigned long) arg;
382 420
383 regs.xds = __USER_DS; 421 regs.ds = __USER_DS;
384 regs.xes = __USER_DS; 422 regs.es = __USER_DS;
385 regs.xfs = __KERNEL_PERCPU; 423 regs.fs = __KERNEL_PERCPU;
386 regs.orig_eax = -1; 424 regs.orig_ax = -1;
387 regs.eip = (unsigned long) kernel_thread_helper; 425 regs.ip = (unsigned long) kernel_thread_helper;
388 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 426 regs.cs = __KERNEL_CS | get_kernel_rpl();
389 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; 427 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
390 428
391 /* Ok, create the new process.. */ 429 /* Ok, create the new process.. */
392 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); 430 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
@@ -424,7 +462,12 @@ void flush_thread(void)
424{ 462{
425 struct task_struct *tsk = current; 463 struct task_struct *tsk = current;
426 464
427 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); 465 tsk->thread.debugreg0 = 0;
466 tsk->thread.debugreg1 = 0;
467 tsk->thread.debugreg2 = 0;
468 tsk->thread.debugreg3 = 0;
469 tsk->thread.debugreg6 = 0;
470 tsk->thread.debugreg7 = 0;
428 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 471 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
429 clear_tsk_thread_flag(tsk, TIF_DEBUG); 472 clear_tsk_thread_flag(tsk, TIF_DEBUG);
430 /* 473 /*
@@ -449,7 +492,7 @@ void prepare_to_copy(struct task_struct *tsk)
449 unlazy_fpu(tsk); 492 unlazy_fpu(tsk);
450} 493}
451 494
452int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, 495int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
453 unsigned long unused, 496 unsigned long unused,
454 struct task_struct * p, struct pt_regs * regs) 497 struct task_struct * p, struct pt_regs * regs)
455{ 498{
@@ -459,15 +502,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
459 502
460 childregs = task_pt_regs(p); 503 childregs = task_pt_regs(p);
461 *childregs = *regs; 504 *childregs = *regs;
462 childregs->eax = 0; 505 childregs->ax = 0;
463 childregs->esp = esp; 506 childregs->sp = sp;
464 507
465 p->thread.esp = (unsigned long) childregs; 508 p->thread.sp = (unsigned long) childregs;
466 p->thread.esp0 = (unsigned long) (childregs+1); 509 p->thread.sp0 = (unsigned long) (childregs+1);
467 510
468 p->thread.eip = (unsigned long) ret_from_fork; 511 p->thread.ip = (unsigned long) ret_from_fork;
469 512
470 savesegment(gs,p->thread.gs); 513 savesegment(gs, p->thread.gs);
471 514
472 tsk = current; 515 tsk = current;
473 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 516 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -480,32 +523,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
480 set_tsk_thread_flag(p, TIF_IO_BITMAP); 523 set_tsk_thread_flag(p, TIF_IO_BITMAP);
481 } 524 }
482 525
526 err = 0;
527
483 /* 528 /*
484 * Set a new TLS for the child thread? 529 * Set a new TLS for the child thread?
485 */ 530 */
486 if (clone_flags & CLONE_SETTLS) { 531 if (clone_flags & CLONE_SETTLS)
487 struct desc_struct *desc; 532 err = do_set_thread_area(p, -1,
488 struct user_desc info; 533 (struct user_desc __user *)childregs->si, 0);
489 int idx;
490
491 err = -EFAULT;
492 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
493 goto out;
494 err = -EINVAL;
495 if (LDT_empty(&info))
496 goto out;
497
498 idx = info.entry_number;
499 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
500 goto out;
501
502 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
503 desc->a = LDT_entry_a(&info);
504 desc->b = LDT_entry_b(&info);
505 }
506 534
507 err = 0;
508 out:
509 if (err && p->thread.io_bitmap_ptr) { 535 if (err && p->thread.io_bitmap_ptr) {
510 kfree(p->thread.io_bitmap_ptr); 536 kfree(p->thread.io_bitmap_ptr);
511 p->thread.io_bitmap_max = 0; 537 p->thread.io_bitmap_max = 0;
@@ -518,62 +544,52 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
518 */ 544 */
519void dump_thread(struct pt_regs * regs, struct user * dump) 545void dump_thread(struct pt_regs * regs, struct user * dump)
520{ 546{
521 int i; 547 u16 gs;
522 548
523/* changed the size calculations - should hopefully work better. lbt */ 549/* changed the size calculations - should hopefully work better. lbt */
524 dump->magic = CMAGIC; 550 dump->magic = CMAGIC;
525 dump->start_code = 0; 551 dump->start_code = 0;
526 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); 552 dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
527 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; 553 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
528 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; 554 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
529 dump->u_dsize -= dump->u_tsize; 555 dump->u_dsize -= dump->u_tsize;
530 dump->u_ssize = 0; 556 dump->u_ssize = 0;
531 for (i = 0; i < 8; i++) 557 dump->u_debugreg[0] = current->thread.debugreg0;
532 dump->u_debugreg[i] = current->thread.debugreg[i]; 558 dump->u_debugreg[1] = current->thread.debugreg1;
559 dump->u_debugreg[2] = current->thread.debugreg2;
560 dump->u_debugreg[3] = current->thread.debugreg3;
561 dump->u_debugreg[4] = 0;
562 dump->u_debugreg[5] = 0;
563 dump->u_debugreg[6] = current->thread.debugreg6;
564 dump->u_debugreg[7] = current->thread.debugreg7;
533 565
534 if (dump->start_stack < TASK_SIZE) 566 if (dump->start_stack < TASK_SIZE)
535 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; 567 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
536 568
537 dump->regs.ebx = regs->ebx; 569 dump->regs.bx = regs->bx;
538 dump->regs.ecx = regs->ecx; 570 dump->regs.cx = regs->cx;
539 dump->regs.edx = regs->edx; 571 dump->regs.dx = regs->dx;
540 dump->regs.esi = regs->esi; 572 dump->regs.si = regs->si;
541 dump->regs.edi = regs->edi; 573 dump->regs.di = regs->di;
542 dump->regs.ebp = regs->ebp; 574 dump->regs.bp = regs->bp;
543 dump->regs.eax = regs->eax; 575 dump->regs.ax = regs->ax;
544 dump->regs.ds = regs->xds; 576 dump->regs.ds = (u16)regs->ds;
545 dump->regs.es = regs->xes; 577 dump->regs.es = (u16)regs->es;
546 dump->regs.fs = regs->xfs; 578 dump->regs.fs = (u16)regs->fs;
547 savesegment(gs,dump->regs.gs); 579 savesegment(gs,gs);
548 dump->regs.orig_eax = regs->orig_eax; 580 dump->regs.orig_ax = regs->orig_ax;
549 dump->regs.eip = regs->eip; 581 dump->regs.ip = regs->ip;
550 dump->regs.cs = regs->xcs; 582 dump->regs.cs = (u16)regs->cs;
551 dump->regs.eflags = regs->eflags; 583 dump->regs.flags = regs->flags;
552 dump->regs.esp = regs->esp; 584 dump->regs.sp = regs->sp;
553 dump->regs.ss = regs->xss; 585 dump->regs.ss = (u16)regs->ss;
554 586
555 dump->u_fpvalid = dump_fpu (regs, &dump->i387); 587 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
556} 588}
557EXPORT_SYMBOL(dump_thread); 589EXPORT_SYMBOL(dump_thread);
558 590
559/*
560 * Capture the user space registers if the task is not running (in user space)
561 */
562int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
563{
564 struct pt_regs ptregs = *task_pt_regs(tsk);
565 ptregs.xcs &= 0xffff;
566 ptregs.xds &= 0xffff;
567 ptregs.xes &= 0xffff;
568 ptregs.xss &= 0xffff;
569
570 elf_core_copy_regs(regs, &ptregs);
571
572 return 1;
573}
574
575#ifdef CONFIG_SECCOMP 591#ifdef CONFIG_SECCOMP
576void hard_disable_TSC(void) 592static void hard_disable_TSC(void)
577{ 593{
578 write_cr4(read_cr4() | X86_CR4_TSD); 594 write_cr4(read_cr4() | X86_CR4_TSD);
579} 595}
@@ -588,7 +604,7 @@ void disable_TSC(void)
588 hard_disable_TSC(); 604 hard_disable_TSC();
589 preempt_enable(); 605 preempt_enable();
590} 606}
591void hard_enable_TSC(void) 607static void hard_enable_TSC(void)
592{ 608{
593 write_cr4(read_cr4() & ~X86_CR4_TSD); 609 write_cr4(read_cr4() & ~X86_CR4_TSD);
594} 610}
@@ -598,18 +614,32 @@ static noinline void
598__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 614__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
599 struct tss_struct *tss) 615 struct tss_struct *tss)
600{ 616{
601 struct thread_struct *next; 617 struct thread_struct *prev, *next;
618 unsigned long debugctl;
602 619
620 prev = &prev_p->thread;
603 next = &next_p->thread; 621 next = &next_p->thread;
604 622
623 debugctl = prev->debugctlmsr;
624 if (next->ds_area_msr != prev->ds_area_msr) {
625 /* we clear debugctl to make sure DS
626 * is not in use when we change it */
627 debugctl = 0;
628 wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
629 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
630 }
631
632 if (next->debugctlmsr != debugctl)
633 wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
634
605 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 635 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
606 set_debugreg(next->debugreg[0], 0); 636 set_debugreg(next->debugreg0, 0);
607 set_debugreg(next->debugreg[1], 1); 637 set_debugreg(next->debugreg1, 1);
608 set_debugreg(next->debugreg[2], 2); 638 set_debugreg(next->debugreg2, 2);
609 set_debugreg(next->debugreg[3], 3); 639 set_debugreg(next->debugreg3, 3);
610 /* no 4 and 5 */ 640 /* no 4 and 5 */
611 set_debugreg(next->debugreg[6], 6); 641 set_debugreg(next->debugreg6, 6);
612 set_debugreg(next->debugreg[7], 7); 642 set_debugreg(next->debugreg7, 7);
613 } 643 }
614 644
615#ifdef CONFIG_SECCOMP 645#ifdef CONFIG_SECCOMP
@@ -623,6 +653,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
623 } 653 }
624#endif 654#endif
625 655
656 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
657 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
658
659 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
660 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
661
662
626 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 663 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
627 /* 664 /*
628 * Disable the bitmap via an invalid offset. We still cache 665 * Disable the bitmap via an invalid offset. We still cache
@@ -676,11 +713,11 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
676 * More important, however, is the fact that this allows us much 713 * More important, however, is the fact that this allows us much
677 * more flexibility. 714 * more flexibility.
678 * 715 *
679 * The return value (in %eax) will be the "prev" task after 716 * The return value (in %ax) will be the "prev" task after
680 * the task-switch, and shows up in ret_from_fork in entry.S, 717 * the task-switch, and shows up in ret_from_fork in entry.S,
681 * for example. 718 * for example.
682 */ 719 */
683struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 720struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
684{ 721{
685 struct thread_struct *prev = &prev_p->thread, 722 struct thread_struct *prev = &prev_p->thread,
686 *next = &next_p->thread; 723 *next = &next_p->thread;
@@ -699,7 +736,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
699 /* 736 /*
700 * Reload esp0. 737 * Reload esp0.
701 */ 738 */
702 load_esp0(tss, next); 739 load_sp0(tss, next);
703 740
704 /* 741 /*
705 * Save away %gs. No need to save %fs, as it was saved on the 742 * Save away %gs. No need to save %fs, as it was saved on the
@@ -763,7 +800,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
763 800
764asmlinkage int sys_fork(struct pt_regs regs) 801asmlinkage int sys_fork(struct pt_regs regs)
765{ 802{
766 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 803 return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
767} 804}
768 805
769asmlinkage int sys_clone(struct pt_regs regs) 806asmlinkage int sys_clone(struct pt_regs regs)
@@ -772,12 +809,12 @@ asmlinkage int sys_clone(struct pt_regs regs)
772 unsigned long newsp; 809 unsigned long newsp;
773 int __user *parent_tidptr, *child_tidptr; 810 int __user *parent_tidptr, *child_tidptr;
774 811
775 clone_flags = regs.ebx; 812 clone_flags = regs.bx;
776 newsp = regs.ecx; 813 newsp = regs.cx;
777 parent_tidptr = (int __user *)regs.edx; 814 parent_tidptr = (int __user *)regs.dx;
778 child_tidptr = (int __user *)regs.edi; 815 child_tidptr = (int __user *)regs.di;
779 if (!newsp) 816 if (!newsp)
780 newsp = regs.esp; 817 newsp = regs.sp;
781 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); 818 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
782} 819}
783 820
@@ -793,7 +830,7 @@ asmlinkage int sys_clone(struct pt_regs regs)
793 */ 830 */
794asmlinkage int sys_vfork(struct pt_regs regs) 831asmlinkage int sys_vfork(struct pt_regs regs)
795{ 832{
796 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 833 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
797} 834}
798 835
799/* 836/*
@@ -804,18 +841,15 @@ asmlinkage int sys_execve(struct pt_regs regs)
804 int error; 841 int error;
805 char * filename; 842 char * filename;
806 843
807 filename = getname((char __user *) regs.ebx); 844 filename = getname((char __user *) regs.bx);
808 error = PTR_ERR(filename); 845 error = PTR_ERR(filename);
809 if (IS_ERR(filename)) 846 if (IS_ERR(filename))
810 goto out; 847 goto out;
811 error = do_execve(filename, 848 error = do_execve(filename,
812 (char __user * __user *) regs.ecx, 849 (char __user * __user *) regs.cx,
813 (char __user * __user *) regs.edx, 850 (char __user * __user *) regs.dx,
814 &regs); 851 &regs);
815 if (error == 0) { 852 if (error == 0) {
816 task_lock(current);
817 current->ptrace &= ~PT_DTRACE;
818 task_unlock(current);
819 /* Make sure we don't return using sysenter.. */ 853 /* Make sure we don't return using sysenter.. */
820 set_thread_flag(TIF_IRET); 854 set_thread_flag(TIF_IRET);
821 } 855 }
@@ -829,145 +863,37 @@ out:
829 863
830unsigned long get_wchan(struct task_struct *p) 864unsigned long get_wchan(struct task_struct *p)
831{ 865{
832 unsigned long ebp, esp, eip; 866 unsigned long bp, sp, ip;
833 unsigned long stack_page; 867 unsigned long stack_page;
834 int count = 0; 868 int count = 0;
835 if (!p || p == current || p->state == TASK_RUNNING) 869 if (!p || p == current || p->state == TASK_RUNNING)
836 return 0; 870 return 0;
837 stack_page = (unsigned long)task_stack_page(p); 871 stack_page = (unsigned long)task_stack_page(p);
838 esp = p->thread.esp; 872 sp = p->thread.sp;
839 if (!stack_page || esp < stack_page || esp > top_esp+stack_page) 873 if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
840 return 0; 874 return 0;
841 /* include/asm-i386/system.h:switch_to() pushes ebp last. */ 875 /* include/asm-i386/system.h:switch_to() pushes bp last. */
842 ebp = *(unsigned long *) esp; 876 bp = *(unsigned long *) sp;
843 do { 877 do {
844 if (ebp < stack_page || ebp > top_ebp+stack_page) 878 if (bp < stack_page || bp > top_ebp+stack_page)
845 return 0; 879 return 0;
846 eip = *(unsigned long *) (ebp+4); 880 ip = *(unsigned long *) (bp+4);
847 if (!in_sched_functions(eip)) 881 if (!in_sched_functions(ip))
848 return eip; 882 return ip;
849 ebp = *(unsigned long *) ebp; 883 bp = *(unsigned long *) bp;
850 } while (count++ < 16); 884 } while (count++ < 16);
851 return 0; 885 return 0;
852} 886}
853 887
854/*
855 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
856 */
857static int get_free_idx(void)
858{
859 struct thread_struct *t = &current->thread;
860 int idx;
861
862 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
863 if (desc_empty(t->tls_array + idx))
864 return idx + GDT_ENTRY_TLS_MIN;
865 return -ESRCH;
866}
867
868/*
869 * Set a given TLS descriptor:
870 */
871asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
872{
873 struct thread_struct *t = &current->thread;
874 struct user_desc info;
875 struct desc_struct *desc;
876 int cpu, idx;
877
878 if (copy_from_user(&info, u_info, sizeof(info)))
879 return -EFAULT;
880 idx = info.entry_number;
881
882 /*
883 * index -1 means the kernel should try to find and
884 * allocate an empty descriptor:
885 */
886 if (idx == -1) {
887 idx = get_free_idx();
888 if (idx < 0)
889 return idx;
890 if (put_user(idx, &u_info->entry_number))
891 return -EFAULT;
892 }
893
894 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
895 return -EINVAL;
896
897 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
898
899 /*
900 * We must not get preempted while modifying the TLS.
901 */
902 cpu = get_cpu();
903
904 if (LDT_empty(&info)) {
905 desc->a = 0;
906 desc->b = 0;
907 } else {
908 desc->a = LDT_entry_a(&info);
909 desc->b = LDT_entry_b(&info);
910 }
911 load_TLS(t, cpu);
912
913 put_cpu();
914
915 return 0;
916}
917
918/*
919 * Get the current Thread-Local Storage area:
920 */
921
922#define GET_BASE(desc) ( \
923 (((desc)->a >> 16) & 0x0000ffff) | \
924 (((desc)->b << 16) & 0x00ff0000) | \
925 ( (desc)->b & 0xff000000) )
926
927#define GET_LIMIT(desc) ( \
928 ((desc)->a & 0x0ffff) | \
929 ((desc)->b & 0xf0000) )
930
931#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
932#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
933#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
934#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
935#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
936#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
937
938asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
939{
940 struct user_desc info;
941 struct desc_struct *desc;
942 int idx;
943
944 if (get_user(idx, &u_info->entry_number))
945 return -EFAULT;
946 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
947 return -EINVAL;
948
949 memset(&info, 0, sizeof(info));
950
951 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
952
953 info.entry_number = idx;
954 info.base_addr = GET_BASE(desc);
955 info.limit = GET_LIMIT(desc);
956 info.seg_32bit = GET_32BIT(desc);
957 info.contents = GET_CONTENTS(desc);
958 info.read_exec_only = !GET_WRITABLE(desc);
959 info.limit_in_pages = GET_LIMIT_PAGES(desc);
960 info.seg_not_present = !GET_PRESENT(desc);
961 info.useable = GET_USEABLE(desc);
962
963 if (copy_to_user(u_info, &info, sizeof(info)))
964 return -EFAULT;
965 return 0;
966}
967
968unsigned long arch_align_stack(unsigned long sp) 888unsigned long arch_align_stack(unsigned long sp)
969{ 889{
970 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) 890 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
971 sp -= get_random_int() % 8192; 891 sp -= get_random_int() % 8192;
972 return sp & ~0xf; 892 return sp & ~0xf;
973} 893}
894
895unsigned long arch_randomize_brk(struct mm_struct *mm)
896{
897 unsigned long range_end = mm->brk + 0x02000000;
898 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
899}
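
A note for readers of the process_32.c hunks above: the removed GET_BASE() macro reassembles a segment base that the GDT descriptor format scatters across its two 32-bit words, which is exactly the job get_desc_base() and do_set_thread_area() take over after this patch. The standalone sketch below reproduces only that bit shuffling outside the kernel; struct fake_desc, pack_base() and the sample base value are invented here purely for illustration and are not kernel interfaces.

/*
 * Standalone sketch (not part of the patch): how a 32-bit base is scattered
 * into the two descriptor words and how the removed GET_BASE() macro puts
 * it back together. pack_base() is a hypothetical helper for this example;
 * the kernel uses LDT_entry_a()/LDT_entry_b() and, after this patch,
 * fill_ldt() and get_desc_base().
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct fake_desc {
	uint32_t a;	/* limit 15..0, base 15..0           */
	uint32_t b;	/* base 23..16, flags, base 31..24   */
};

/* Hypothetical packer: scatter a 32-bit base into the descriptor words. */
static void pack_base(struct fake_desc *d, uint32_t base)
{
	d->a = (d->a & 0x0000ffff) | (base << 16);
	d->b = (d->b & 0x00ffff00) | ((base >> 16) & 0xff) | (base & 0xff000000);
}

/* The same bit shuffling the removed GET_BASE() macro performed. */
static uint32_t get_base(const struct fake_desc *d)
{
	return ((d->a >> 16) & 0x0000ffff) |
	       ((d->b << 16) & 0x00ff0000) |
	       ( d->b        & 0xff000000);
}

int main(void)
{
	struct fake_desc d = { .a = 0x0000ffff, .b = 0x00cff300 };	/* limit/flags only */
	uint32_t base = 0xb7f12000;	/* arbitrary example TLS base */

	pack_base(&d, base);
	assert(get_base(&d) == base);	/* the base round-trips through the descriptor */
	printf("base round-trips as %#x\n", (unsigned)get_base(&d));
	return 0;
}
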
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6309b275cb9..137a86171c3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Pentium III FXSR, SSE support 4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000 5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 * 6 *
7 * X86-64 port 7 * X86-64 port
8 * Andi Kleen. 8 * Andi Kleen.
9 * 9 *
@@ -19,19 +19,19 @@
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/fs.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/mm.h> 24#include <linux/mm.h>
24#include <linux/fs.h>
25#include <linux/elfcore.h> 25#include <linux/elfcore.h>
26#include <linux/smp.h> 26#include <linux/smp.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/user.h> 28#include <linux/user.h>
29#include <linux/module.h>
30#include <linux/a.out.h> 29#include <linux/a.out.h>
31#include <linux/interrupt.h> 30#include <linux/interrupt.h>
31#include <linux/utsname.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/module.h>
33#include <linux/ptrace.h> 34#include <linux/ptrace.h>
34#include <linux/utsname.h>
35#include <linux/random.h> 35#include <linux/random.h>
36#include <linux/notifier.h> 36#include <linux/notifier.h>
37#include <linux/kprobes.h> 37#include <linux/kprobes.h>
@@ -72,13 +72,6 @@ void idle_notifier_register(struct notifier_block *n)
72{ 72{
73 atomic_notifier_chain_register(&idle_notifier, n); 73 atomic_notifier_chain_register(&idle_notifier, n);
74} 74}
75EXPORT_SYMBOL_GPL(idle_notifier_register);
76
77void idle_notifier_unregister(struct notifier_block *n)
78{
79 atomic_notifier_chain_unregister(&idle_notifier, n);
80}
81EXPORT_SYMBOL(idle_notifier_unregister);
82 75
83void enter_idle(void) 76void enter_idle(void)
84{ 77{
@@ -106,7 +99,7 @@ void exit_idle(void)
106 * We use this if we don't have any better 99 * We use this if we don't have any better
107 * idle routine.. 100 * idle routine..
108 */ 101 */
109static void default_idle(void) 102void default_idle(void)
110{ 103{
111 current_thread_info()->status &= ~TS_POLLING; 104 current_thread_info()->status &= ~TS_POLLING;
112 /* 105 /*
@@ -116,11 +109,18 @@ static void default_idle(void)
116 smp_mb(); 109 smp_mb();
117 local_irq_disable(); 110 local_irq_disable();
118 if (!need_resched()) { 111 if (!need_resched()) {
119 /* Enables interrupts one instruction before HLT. 112 ktime_t t0, t1;
120 x86 special cases this so there is no race. */ 113 u64 t0n, t1n;
121 safe_halt(); 114
122 } else 115 t0 = ktime_get();
123 local_irq_enable(); 116 t0n = ktime_to_ns(t0);
117 safe_halt(); /* enables interrupts racelessly */
118 local_irq_disable();
119 t1 = ktime_get();
120 t1n = ktime_to_ns(t1);
121 sched_clock_idle_wakeup_event(t1n - t0n);
122 }
123 local_irq_enable();
124 current_thread_info()->status |= TS_POLLING; 124 current_thread_info()->status |= TS_POLLING;
125} 125}
126 126
@@ -129,43 +129,12 @@ static void default_idle(void)
129 * to poll the ->need_resched flag instead of waiting for the 129 * to poll the ->need_resched flag instead of waiting for the
130 * cross-CPU IPI to arrive. Use this option with caution. 130 * cross-CPU IPI to arrive. Use this option with caution.
131 */ 131 */
132static void poll_idle (void) 132static void poll_idle(void)
133{ 133{
134 local_irq_enable(); 134 local_irq_enable();
135 cpu_relax(); 135 cpu_relax();
136} 136}
137 137
138void cpu_idle_wait(void)
139{
140 unsigned int cpu, this_cpu = get_cpu();
141 cpumask_t map, tmp = current->cpus_allowed;
142
143 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
144 put_cpu();
145
146 cpus_clear(map);
147 for_each_online_cpu(cpu) {
148 per_cpu(cpu_idle_state, cpu) = 1;
149 cpu_set(cpu, map);
150 }
151
152 __get_cpu_var(cpu_idle_state) = 0;
153
154 wmb();
155 do {
156 ssleep(1);
157 for_each_online_cpu(cpu) {
158 if (cpu_isset(cpu, map) &&
159 !per_cpu(cpu_idle_state, cpu))
160 cpu_clear(cpu, map);
161 }
162 cpus_and(map, map, cpu_online_map);
163 } while (!cpus_empty(map));
164
165 set_cpus_allowed(current, tmp);
166}
167EXPORT_SYMBOL_GPL(cpu_idle_wait);
168
169#ifdef CONFIG_HOTPLUG_CPU 138#ifdef CONFIG_HOTPLUG_CPU
170DECLARE_PER_CPU(int, cpu_state); 139DECLARE_PER_CPU(int, cpu_state);
171 140
@@ -196,19 +165,18 @@ static inline void play_dead(void)
196 * low exit latency (ie sit in a loop waiting for 165 * low exit latency (ie sit in a loop waiting for
197 * somebody to say that they'd like to reschedule) 166 * somebody to say that they'd like to reschedule)
198 */ 167 */
199void cpu_idle (void) 168void cpu_idle(void)
200{ 169{
201 current_thread_info()->status |= TS_POLLING; 170 current_thread_info()->status |= TS_POLLING;
202 /* endless idle loop with no priority at all */ 171 /* endless idle loop with no priority at all */
203 while (1) { 172 while (1) {
173 tick_nohz_stop_sched_tick();
204 while (!need_resched()) { 174 while (!need_resched()) {
205 void (*idle)(void); 175 void (*idle)(void);
206 176
207 if (__get_cpu_var(cpu_idle_state)) 177 if (__get_cpu_var(cpu_idle_state))
208 __get_cpu_var(cpu_idle_state) = 0; 178 __get_cpu_var(cpu_idle_state) = 0;
209 179
210 tick_nohz_stop_sched_tick();
211
212 rmb(); 180 rmb();
213 idle = pm_idle; 181 idle = pm_idle;
214 if (!idle) 182 if (!idle)
@@ -236,6 +204,47 @@ void cpu_idle (void)
236 } 204 }
237} 205}
238 206
207static void do_nothing(void *unused)
208{
209}
210
211void cpu_idle_wait(void)
212{
213 unsigned int cpu, this_cpu = get_cpu();
214 cpumask_t map, tmp = current->cpus_allowed;
215
216 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
217 put_cpu();
218
219 cpus_clear(map);
220 for_each_online_cpu(cpu) {
221 per_cpu(cpu_idle_state, cpu) = 1;
222 cpu_set(cpu, map);
223 }
224
225 __get_cpu_var(cpu_idle_state) = 0;
226
227 wmb();
228 do {
229 ssleep(1);
230 for_each_online_cpu(cpu) {
231 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
232 cpu_clear(cpu, map);
233 }
234 cpus_and(map, map, cpu_online_map);
235 /*
236 * We waited 1 sec, if a CPU still did not call idle
237 * it may be because it is in idle and not waking up
238 * because it has nothing to do.
239 * Give all the remaining CPUS a kick.
240 */
241 smp_call_function_mask(map, do_nothing, 0, 0);
242 } while (!cpus_empty(map));
243
244 set_cpus_allowed(current, tmp);
245}
246EXPORT_SYMBOL_GPL(cpu_idle_wait);
247
239/* 248/*
240 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 249 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
241 * which can obviate IPI to trigger checking of need_resched. 250 * which can obviate IPI to trigger checking of need_resched.
@@ -246,13 +255,13 @@ void cpu_idle (void)
246 * New with Core Duo processors, MWAIT can take some hints based on CPU 255 * New with Core Duo processors, MWAIT can take some hints based on CPU
247 * capability. 256 * capability.
248 */ 257 */
249void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) 258void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
250{ 259{
251 if (!need_resched()) { 260 if (!need_resched()) {
252 __monitor((void *)&current_thread_info()->flags, 0, 0); 261 __monitor((void *)&current_thread_info()->flags, 0, 0);
253 smp_mb(); 262 smp_mb();
254 if (!need_resched()) 263 if (!need_resched())
255 __mwait(eax, ecx); 264 __mwait(ax, cx);
256 } 265 }
257} 266}
258 267
@@ -271,25 +280,41 @@ static void mwait_idle(void)
271 } 280 }
272} 281}
273 282
283
284static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
285{
286 if (force_mwait)
287 return 1;
288 /* Any C1 states supported? */
289 return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
290}
291
274void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 292void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
275{ 293{
276 static int printed; 294 static int selected;
277 if (cpu_has(c, X86_FEATURE_MWAIT)) { 295
296 if (selected)
297 return;
298#ifdef CONFIG_X86_SMP
299 if (pm_idle == poll_idle && smp_num_siblings > 1) {
300 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
301 " performance may degrade.\n");
302 }
303#endif
304 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
278 /* 305 /*
279 * Skip, if setup has overridden idle. 306 * Skip, if setup has overridden idle.
280 * One CPU supports mwait => All CPUs supports mwait 307 * One CPU supports mwait => All CPUs supports mwait
281 */ 308 */
282 if (!pm_idle) { 309 if (!pm_idle) {
283 if (!printed) { 310 printk(KERN_INFO "using mwait in idle threads.\n");
284 printk(KERN_INFO "using mwait in idle threads.\n");
285 printed = 1;
286 }
287 pm_idle = mwait_idle; 311 pm_idle = mwait_idle;
288 } 312 }
289 } 313 }
314 selected = 1;
290} 315}
291 316
292static int __init idle_setup (char *str) 317static int __init idle_setup(char *str)
293{ 318{
294 if (!strcmp(str, "poll")) { 319 if (!strcmp(str, "poll")) {
295 printk("using polling idle threads.\n"); 320 printk("using polling idle threads.\n");
@@ -304,13 +329,13 @@ static int __init idle_setup (char *str)
304} 329}
305early_param("idle", idle_setup); 330early_param("idle", idle_setup);
306 331
307/* Prints also some state that isn't saved in the pt_regs */ 332/* Prints also some state that isn't saved in the pt_regs */
308void __show_regs(struct pt_regs * regs) 333void __show_regs(struct pt_regs * regs)
309{ 334{
310 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 335 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
311 unsigned long d0, d1, d2, d3, d6, d7; 336 unsigned long d0, d1, d2, d3, d6, d7;
312 unsigned int fsindex,gsindex; 337 unsigned int fsindex, gsindex;
313 unsigned int ds,cs,es; 338 unsigned int ds, cs, es;
314 339
315 printk("\n"); 340 printk("\n");
316 print_modules(); 341 print_modules();
@@ -319,16 +344,16 @@ void __show_regs(struct pt_regs * regs)
319 init_utsname()->release, 344 init_utsname()->release,
320 (int)strcspn(init_utsname()->version, " "), 345 (int)strcspn(init_utsname()->version, " "),
321 init_utsname()->version); 346 init_utsname()->version);
322 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); 347 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
323 printk_address(regs->rip); 348 printk_address(regs->ip, 1);
324 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, 349 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
325 regs->eflags); 350 regs->flags);
326 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 351 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
327 regs->rax, regs->rbx, regs->rcx); 352 regs->ax, regs->bx, regs->cx);
328 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 353 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
329 regs->rdx, regs->rsi, regs->rdi); 354 regs->dx, regs->si, regs->di);
330 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 355 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
331 regs->rbp, regs->r8, regs->r9); 356 regs->bp, regs->r8, regs->r9);
332 printk("R10: %016lx R11: %016lx R12: %016lx\n", 357 printk("R10: %016lx R11: %016lx R12: %016lx\n",
333 regs->r10, regs->r11, regs->r12); 358 regs->r10, regs->r11, regs->r12);
334 printk("R13: %016lx R14: %016lx R15: %016lx\n", 359 printk("R13: %016lx R14: %016lx R15: %016lx\n",
@@ -368,7 +393,7 @@ void show_regs(struct pt_regs *regs)
368{ 393{
369 printk("CPU %d:", smp_processor_id()); 394 printk("CPU %d:", smp_processor_id());
370 __show_regs(regs); 395 __show_regs(regs);
371 show_trace(NULL, regs, (void *)(regs + 1)); 396 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
372} 397}
373 398
374/* 399/*
@@ -379,7 +404,7 @@ void exit_thread(void)
379 struct task_struct *me = current; 404 struct task_struct *me = current;
380 struct thread_struct *t = &me->thread; 405 struct thread_struct *t = &me->thread;
381 406
382 if (me->thread.io_bitmap_ptr) { 407 if (me->thread.io_bitmap_ptr) {
383 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 408 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
384 409
385 kfree(t->io_bitmap_ptr); 410 kfree(t->io_bitmap_ptr);
@@ -415,7 +440,7 @@ void flush_thread(void)
415 tsk->thread.debugreg3 = 0; 440 tsk->thread.debugreg3 = 0;
416 tsk->thread.debugreg6 = 0; 441 tsk->thread.debugreg6 = 0;
417 tsk->thread.debugreg7 = 0; 442 tsk->thread.debugreg7 = 0;
418 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 443 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
419 /* 444 /*
420 * Forget coprocessor state.. 445 * Forget coprocessor state..
421 */ 446 */
@@ -438,26 +463,21 @@ void release_thread(struct task_struct *dead_task)
438 463
439static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) 464static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
440{ 465{
441 struct user_desc ud = { 466 struct user_desc ud = {
442 .base_addr = addr, 467 .base_addr = addr,
443 .limit = 0xfffff, 468 .limit = 0xfffff,
444 .seg_32bit = 1, 469 .seg_32bit = 1,
445 .limit_in_pages = 1, 470 .limit_in_pages = 1,
446 .useable = 1, 471 .useable = 1,
447 }; 472 };
448 struct n_desc_struct *desc = (void *)t->thread.tls_array; 473 struct desc_struct *desc = t->thread.tls_array;
449 desc += tls; 474 desc += tls;
450 desc->a = LDT_entry_a(&ud); 475 fill_ldt(desc, &ud);
451 desc->b = LDT_entry_b(&ud);
452} 476}
453 477
454static inline u32 read_32bit_tls(struct task_struct *t, int tls) 478static inline u32 read_32bit_tls(struct task_struct *t, int tls)
455{ 479{
456 struct desc_struct *desc = (void *)t->thread.tls_array; 480 return get_desc_base(&t->thread.tls_array[tls]);
457 desc += tls;
458 return desc->base0 |
459 (((u32)desc->base1) << 16) |
460 (((u32)desc->base2) << 24);
461} 481}
462 482
463/* 483/*
@@ -469,7 +489,7 @@ void prepare_to_copy(struct task_struct *tsk)
469 unlazy_fpu(tsk); 489 unlazy_fpu(tsk);
470} 490}
471 491
472int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 492int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
473 unsigned long unused, 493 unsigned long unused,
474 struct task_struct * p, struct pt_regs * regs) 494 struct task_struct * p, struct pt_regs * regs)
475{ 495{
@@ -481,14 +501,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
481 (THREAD_SIZE + task_stack_page(p))) - 1; 501 (THREAD_SIZE + task_stack_page(p))) - 1;
482 *childregs = *regs; 502 *childregs = *regs;
483 503
484 childregs->rax = 0; 504 childregs->ax = 0;
485 childregs->rsp = rsp; 505 childregs->sp = sp;
486 if (rsp == ~0UL) 506 if (sp == ~0UL)
487 childregs->rsp = (unsigned long)childregs; 507 childregs->sp = (unsigned long)childregs;
488 508
489 p->thread.rsp = (unsigned long) childregs; 509 p->thread.sp = (unsigned long) childregs;
490 p->thread.rsp0 = (unsigned long) (childregs+1); 510 p->thread.sp0 = (unsigned long) (childregs+1);
491 p->thread.userrsp = me->thread.userrsp; 511 p->thread.usersp = me->thread.usersp;
492 512
493 set_tsk_thread_flag(p, TIF_FORK); 513 set_tsk_thread_flag(p, TIF_FORK);
494 514
@@ -509,7 +529,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
509 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, 529 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
510 IO_BITMAP_BYTES); 530 IO_BITMAP_BYTES);
511 set_tsk_thread_flag(p, TIF_IO_BITMAP); 531 set_tsk_thread_flag(p, TIF_IO_BITMAP);
512 } 532 }
513 533
514 /* 534 /*
515 * Set a new TLS for the child thread? 535 * Set a new TLS for the child thread?
@@ -517,7 +537,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
517 if (clone_flags & CLONE_SETTLS) { 537 if (clone_flags & CLONE_SETTLS) {
518#ifdef CONFIG_IA32_EMULATION 538#ifdef CONFIG_IA32_EMULATION
519 if (test_thread_flag(TIF_IA32)) 539 if (test_thread_flag(TIF_IA32))
520 err = ia32_child_tls(p, childregs); 540 err = do_set_thread_area(p, -1,
541 (struct user_desc __user *)childregs->si, 0);
521 else 542 else
522#endif 543#endif
523 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 544 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
@@ -536,17 +557,30 @@ out:
536/* 557/*
537 * This special macro can be used to load a debugging register 558 * This special macro can be used to load a debugging register
538 */ 559 */
539#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) 560#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
540 561
541static inline void __switch_to_xtra(struct task_struct *prev_p, 562static inline void __switch_to_xtra(struct task_struct *prev_p,
542 struct task_struct *next_p, 563 struct task_struct *next_p,
543 struct tss_struct *tss) 564 struct tss_struct *tss)
544{ 565{
545 struct thread_struct *prev, *next; 566 struct thread_struct *prev, *next;
567 unsigned long debugctl;
546 568
547 prev = &prev_p->thread, 569 prev = &prev_p->thread,
548 next = &next_p->thread; 570 next = &next_p->thread;
549 571
572 debugctl = prev->debugctlmsr;
573 if (next->ds_area_msr != prev->ds_area_msr) {
574 /* we clear debugctl to make sure DS
575 * is not in use when we change it */
576 debugctl = 0;
577 wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
578 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
579 }
580
581 if (next->debugctlmsr != debugctl)
582 wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
583
550 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 584 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
551 loaddebug(next, 0); 585 loaddebug(next, 0);
552 loaddebug(next, 1); 586 loaddebug(next, 1);
@@ -570,12 +604,18 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
570 */ 604 */
571 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 605 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
572 } 606 }
607
608 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
609 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
610
611 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
612 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
573} 613}
574 614
575/* 615/*
576 * switch_to(x,y) should switch tasks from x to y. 616 * switch_to(x,y) should switch tasks from x to y.
577 * 617 *
578 * This could still be optimized: 618 * This could still be optimized:
579 * - fold all the options into a flag word and test it with a single test. 619 * - fold all the options into a flag word and test it with a single test.
580 * - could test fs/gs bitsliced 620 * - could test fs/gs bitsliced
581 * 621 *
@@ -586,7 +626,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586{ 626{
587 struct thread_struct *prev = &prev_p->thread, 627 struct thread_struct *prev = &prev_p->thread,
588 *next = &next_p->thread; 628 *next = &next_p->thread;
589 int cpu = smp_processor_id(); 629 int cpu = smp_processor_id();
590 struct tss_struct *tss = &per_cpu(init_tss, cpu); 630 struct tss_struct *tss = &per_cpu(init_tss, cpu);
591 631
592 /* we're going to use this soon, after a few expensive things */ 632 /* we're going to use this soon, after a few expensive things */
@@ -596,7 +636,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
596 /* 636 /*
597 * Reload esp0, LDT and the page table pointer: 637 * Reload esp0, LDT and the page table pointer:
598 */ 638 */
599 tss->rsp0 = next->rsp0; 639 load_sp0(tss, next);
600 640
601 /* 641 /*
602 * Switch DS and ES. 642 * Switch DS and ES.
@@ -655,8 +695,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
655 /* 695 /*
656 * Switch the PDA and FPU contexts. 696 * Switch the PDA and FPU contexts.
657 */ 697 */
658 prev->userrsp = read_pda(oldrsp); 698 prev->usersp = read_pda(oldrsp);
659 write_pda(oldrsp, next->userrsp); 699 write_pda(oldrsp, next->usersp);
660 write_pda(pcurrent, next_p); 700 write_pda(pcurrent, next_p);
661 701
662 write_pda(kernelstack, 702 write_pda(kernelstack,
@@ -673,8 +713,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
673 /* 713 /*
674 * Now maybe reload the debug registers and handle I/O bitmaps 714 * Now maybe reload the debug registers and handle I/O bitmaps
675 */ 715 */
676 if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) 716 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
677 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) 717 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
678 __switch_to_xtra(prev_p, next_p, tss); 718 __switch_to_xtra(prev_p, next_p, tss);
679 719
680 /* If the task has used fpu the last 5 timeslices, just do a full 720 /* If the task has used fpu the last 5 timeslices, just do a full
@@ -689,7 +729,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
689/* 729/*
690 * sys_execve() executes a new program. 730 * sys_execve() executes a new program.
691 */ 731 */
692asmlinkage 732asmlinkage
693long sys_execve(char __user *name, char __user * __user *argv, 733long sys_execve(char __user *name, char __user * __user *argv,
694 char __user * __user *envp, struct pt_regs regs) 734 char __user * __user *envp, struct pt_regs regs)
695{ 735{
@@ -701,11 +741,6 @@ long sys_execve(char __user *name, char __user * __user *argv,
701 if (IS_ERR(filename)) 741 if (IS_ERR(filename))
702 return error; 742 return error;
703 error = do_execve(filename, argv, envp, &regs); 743 error = do_execve(filename, argv, envp, &regs);
704 if (error == 0) {
705 task_lock(current);
706 current->ptrace &= ~PT_DTRACE;
707 task_unlock(current);
708 }
709 putname(filename); 744 putname(filename);
710 return error; 745 return error;
711} 746}
@@ -715,18 +750,18 @@ void set_personality_64bit(void)
715 /* inherit personality from parent */ 750 /* inherit personality from parent */
716 751
717 /* Make sure to be in 64bit mode */ 752 /* Make sure to be in 64bit mode */
718 clear_thread_flag(TIF_IA32); 753 clear_thread_flag(TIF_IA32);
719 754
720 /* TBD: overwrites user setup. Should have two bits. 755 /* TBD: overwrites user setup. Should have two bits.
721 But 64bit processes have always behaved this way, 756 But 64bit processes have always behaved this way,
722 so it's not too bad. The main problem is just that 757 so it's not too bad. The main problem is just that
723 32bit childs are affected again. */ 758 32bit childs are affected again. */
724 current->personality &= ~READ_IMPLIES_EXEC; 759 current->personality &= ~READ_IMPLIES_EXEC;
725} 760}
726 761
727asmlinkage long sys_fork(struct pt_regs *regs) 762asmlinkage long sys_fork(struct pt_regs *regs)
728{ 763{
729 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); 764 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
730} 765}
731 766
732asmlinkage long 767asmlinkage long
@@ -734,7 +769,7 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
734 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) 769 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
735{ 770{
736 if (!newsp) 771 if (!newsp)
737 newsp = regs->rsp; 772 newsp = regs->sp;
738 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 773 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
739} 774}
740 775
@@ -750,29 +785,29 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
750 */ 785 */
751asmlinkage long sys_vfork(struct pt_regs *regs) 786asmlinkage long sys_vfork(struct pt_regs *regs)
752{ 787{
753 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, 788 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
754 NULL, NULL); 789 NULL, NULL);
755} 790}
756 791
757unsigned long get_wchan(struct task_struct *p) 792unsigned long get_wchan(struct task_struct *p)
758{ 793{
759 unsigned long stack; 794 unsigned long stack;
760 u64 fp,rip; 795 u64 fp,ip;
761 int count = 0; 796 int count = 0;
762 797
763 if (!p || p == current || p->state==TASK_RUNNING) 798 if (!p || p == current || p->state==TASK_RUNNING)
764 return 0; 799 return 0;
765 stack = (unsigned long)task_stack_page(p); 800 stack = (unsigned long)task_stack_page(p);
766 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) 801 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
767 return 0; 802 return 0;
768 fp = *(u64 *)(p->thread.rsp); 803 fp = *(u64 *)(p->thread.sp);
769 do { 804 do {
770 if (fp < (unsigned long)stack || 805 if (fp < (unsigned long)stack ||
771 fp > (unsigned long)stack+THREAD_SIZE) 806 fp > (unsigned long)stack+THREAD_SIZE)
772 return 0; 807 return 0;
773 rip = *(u64 *)(fp+8); 808 ip = *(u64 *)(fp+8);
774 if (!in_sched_functions(rip)) 809 if (!in_sched_functions(ip))
775 return rip; 810 return ip;
776 fp = *(u64 *)fp; 811 fp = *(u64 *)fp;
777 } while (count++ < 16); 812 } while (count++ < 16);
778 return 0; 813 return 0;
@@ -813,19 +848,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
813 /* Not strictly needed for fs, but do it for symmetry 848 /* Not strictly needed for fs, but do it for symmetry
814 with gs */ 849 with gs */
815 if (addr >= TASK_SIZE_OF(task)) 850 if (addr >= TASK_SIZE_OF(task))
816 return -EPERM; 851 return -EPERM;
817 cpu = get_cpu(); 852 cpu = get_cpu();
818 /* handle small bases via the GDT because that's faster to 853 /* handle small bases via the GDT because that's faster to
819 switch. */ 854 switch. */
820 if (addr <= 0xffffffff) { 855 if (addr <= 0xffffffff) {
821 set_32bit_tls(task, FS_TLS, addr); 856 set_32bit_tls(task, FS_TLS, addr);
822 if (doit) { 857 if (doit) {
823 load_TLS(&task->thread, cpu); 858 load_TLS(&task->thread, cpu);
824 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); 859 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
825 } 860 }
826 task->thread.fsindex = FS_TLS_SEL; 861 task->thread.fsindex = FS_TLS_SEL;
827 task->thread.fs = 0; 862 task->thread.fs = 0;
828 } else { 863 } else {
829 task->thread.fsindex = 0; 864 task->thread.fsindex = 0;
830 task->thread.fs = addr; 865 task->thread.fs = addr;
831 if (doit) { 866 if (doit) {
@@ -837,24 +872,24 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
837 } 872 }
838 put_cpu(); 873 put_cpu();
839 break; 874 break;
840 case ARCH_GET_FS: { 875 case ARCH_GET_FS: {
841 unsigned long base; 876 unsigned long base;
842 if (task->thread.fsindex == FS_TLS_SEL) 877 if (task->thread.fsindex == FS_TLS_SEL)
843 base = read_32bit_tls(task, FS_TLS); 878 base = read_32bit_tls(task, FS_TLS);
844 else if (doit) 879 else if (doit)
845 rdmsrl(MSR_FS_BASE, base); 880 rdmsrl(MSR_FS_BASE, base);
846 else 881 else
847 base = task->thread.fs; 882 base = task->thread.fs;
848 ret = put_user(base, (unsigned long __user *)addr); 883 ret = put_user(base, (unsigned long __user *)addr);
849 break; 884 break;
850 } 885 }
851 case ARCH_GET_GS: { 886 case ARCH_GET_GS: {
852 unsigned long base; 887 unsigned long base;
853 unsigned gsindex; 888 unsigned gsindex;
854 if (task->thread.gsindex == GS_TLS_SEL) 889 if (task->thread.gsindex == GS_TLS_SEL)
855 base = read_32bit_tls(task, GS_TLS); 890 base = read_32bit_tls(task, GS_TLS);
856 else if (doit) { 891 else if (doit) {
857 asm("movl %%gs,%0" : "=r" (gsindex)); 892 asm("movl %%gs,%0" : "=r" (gsindex));
858 if (gsindex) 893 if (gsindex)
859 rdmsrl(MSR_KERNEL_GS_BASE, base); 894 rdmsrl(MSR_KERNEL_GS_BASE, base);
860 else 895 else
@@ -862,39 +897,21 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
862 } 897 }
863 else 898 else
864 base = task->thread.gs; 899 base = task->thread.gs;
865 ret = put_user(base, (unsigned long __user *)addr); 900 ret = put_user(base, (unsigned long __user *)addr);
866 break; 901 break;
867 } 902 }
868 903
869 default: 904 default:
870 ret = -EINVAL; 905 ret = -EINVAL;
871 break; 906 break;
872 } 907 }
873 908
874 return ret; 909 return ret;
875} 910}
876 911
877long sys_arch_prctl(int code, unsigned long addr) 912long sys_arch_prctl(int code, unsigned long addr)
878{ 913{
879 return do_arch_prctl(current, code, addr); 914 return do_arch_prctl(current, code, addr);
880}
881
882/*
883 * Capture the user space registers if the task is not running (in user space)
884 */
885int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
886{
887 struct pt_regs *pp, ptregs;
888
889 pp = task_pt_regs(tsk);
890
891 ptregs = *pp;
892 ptregs.cs &= 0xffff;
893 ptregs.ss &= 0xffff;
894
895 elf_core_copy_regs(regs, &ptregs);
896
897 return 1;
898} 915}
899 916
900unsigned long arch_align_stack(unsigned long sp) 917unsigned long arch_align_stack(unsigned long sp)
@@ -903,3 +920,9 @@ unsigned long arch_align_stack(unsigned long sp)
903 sp -= get_random_int() % 8192; 920 sp -= get_random_int() % 8192;
904 return sp & ~0xf; 921 return sp & ~0xf;
905} 922}
923
924unsigned long arch_randomize_brk(struct mm_struct *mm)
925{
926 unsigned long range_end = mm->brk + 0x02000000;
927 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
928}
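
A note for the get_wchan() hunk above: the walk starts from the saved frame pointer stored at p->thread.sp, reads the return address one word above each saved frame pointer, and follows the chain until it leaves the task's stack or finds an address outside the scheduler. The standalone sketch below mimics that walk on a fabricated stack; the fake frame layout, the address constants and in_sched_functions_stub() are invented for illustration, whereas the kernel bounds the walk with task_stack_page() and in_sched_functions().

/*
 * Standalone sketch (not part of the patch) of the frame-pointer walk that
 * get_wchan() performs: *(thread.sp) is the topmost saved frame pointer,
 * *(fp + 8) is the return address of that frame, *fp is the caller's saved
 * frame pointer. Everything here is simulated in an ordinary array.
 */
#include <stdint.h>
#include <stdio.h>

#define STACK_WORDS 64

/* Stand-in for in_sched_functions(): pretend this range is the scheduler. */
static int in_sched_functions_stub(uint64_t ip)
{
	return ip >= 0x1000 && ip < 0x2000;
}

int main(void)
{
	uint64_t stack[STACK_WORDS] = { 0 };
	uint64_t lo = (uint64_t)(uintptr_t)stack;
	uint64_t hi = lo + sizeof(stack);

	/* Fake frame 1 (a scheduler function): saved fp -> frame 2, ret inside scheduler. */
	stack[20] = (uint64_t)(uintptr_t)&stack[40];
	stack[21] = 0x1500;
	/* Fake frame 2 (the blocked caller): ret outside the scheduler range. */
	stack[40] = 0;
	stack[41] = 0x4242;

	/* What p->thread.sp would reference: the topmost saved frame pointer. */
	stack[10] = (uint64_t)(uintptr_t)&stack[20];
	uint64_t sp = (uint64_t)(uintptr_t)&stack[10];

	uint64_t fp = *(uint64_t *)(uintptr_t)sp;
	uint64_t wchan = 0;
	int count = 0;

	do {
		if (fp < lo || fp > hi - 16)
			break;				/* left the (fake) stack */
		uint64_t ip = *(uint64_t *)(uintptr_t)(fp + 8);
		if (!in_sched_functions_stub(ip)) {
			wchan = ip;			/* first non-scheduler return address */
			break;
		}
		fp = *(uint64_t *)(uintptr_t)fp;	/* follow the saved frame pointer */
	} while (count++ < 16);

	printf("wchan = %#llx\n", (unsigned long long)wchan);	/* expect 0x4242 */
	return 0;
}
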
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
new file mode 100644
index 00000000000..702c33efea8
--- /dev/null
+++ b/arch/x86/kernel/ptrace.c
@@ -0,0 +1,1566 @@
1/* By Ross Biro 1/23/92 */
2/*
3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * BTS tracing
7 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/errno.h>
15#include <linux/ptrace.h>
16#include <linux/regset.h>
17#include <linux/user.h>
18#include <linux/elf.h>
19#include <linux/security.h>
20#include <linux/audit.h>
21#include <linux/seccomp.h>
22#include <linux/signal.h>
23
24#include <asm/uaccess.h>
25#include <asm/pgtable.h>
26#include <asm/system.h>
27#include <asm/processor.h>
28#include <asm/i387.h>
29#include <asm/debugreg.h>
30#include <asm/ldt.h>
31#include <asm/desc.h>
32#include <asm/prctl.h>
33#include <asm/proto.h>
34#include <asm/ds.h>
35
36#include "tls.h"
37
38enum x86_regset {
39 REGSET_GENERAL,
40 REGSET_FP,
41 REGSET_XFP,
42 REGSET_TLS,
43};
44
45/*
46 * does not yet catch signals sent when the child dies.
47 * in exit.c or in signal.c.
48 */
49
50/*
51 * Determines which flags the user has access to [1 = access, 0 = no access].
52 */
53#define FLAG_MASK_32 ((unsigned long) \
54 (X86_EFLAGS_CF | X86_EFLAGS_PF | \
55 X86_EFLAGS_AF | X86_EFLAGS_ZF | \
56 X86_EFLAGS_SF | X86_EFLAGS_TF | \
57 X86_EFLAGS_DF | X86_EFLAGS_OF | \
58 X86_EFLAGS_RF | X86_EFLAGS_AC))
59
60/*
61 * Determines whether a value may be installed in a segment register.
62 */
63static inline bool invalid_selector(u16 value)
64{
65 return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL);
66}
67
68#ifdef CONFIG_X86_32
69
70#define FLAG_MASK FLAG_MASK_32
71
72static long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
73{
74 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
75 regno >>= 2;
76 if (regno > FS)
77 --regno;
78 return &regs->bx + regno;
79}
80
81static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
82{
83 /*
84 * Returning the value truncates it to 16 bits.
85 */
86 unsigned int retval;
87 if (offset != offsetof(struct user_regs_struct, gs))
88 retval = *pt_regs_access(task_pt_regs(task), offset);
89 else {
90 retval = task->thread.gs;
91 if (task == current)
92 savesegment(gs, retval);
93 }
94 return retval;
95}
96
97static int set_segment_reg(struct task_struct *task,
98 unsigned long offset, u16 value)
99{
100 /*
101 * The value argument was already truncated to 16 bits.
102 */
103 if (invalid_selector(value))
104 return -EIO;
105
106 /*
107 * For %cs and %ss we cannot permit a null selector.
108 * We can permit a bogus selector as long as it has USER_RPL.
109 * Null selectors are fine for other segment registers, but
110 * we will never get back to user mode with invalid %cs or %ss
111 * and will take the trap in iret instead. Much code relies
112 * on user_mode() to distinguish a user trap frame (which can
113 * safely use invalid selectors) from a kernel trap frame.
114 */
115 switch (offset) {
116 case offsetof(struct user_regs_struct, cs):
117 case offsetof(struct user_regs_struct, ss):
118 if (unlikely(value == 0))
119 return -EIO;
120
121 default:
122 *pt_regs_access(task_pt_regs(task), offset) = value;
123 break;
124
125 case offsetof(struct user_regs_struct, gs):
126 task->thread.gs = value;
127 if (task == current)
128 /*
129 * The user-mode %gs is not affected by
130 * kernel entry, so we must update the CPU.
131 */
132 loadsegment(gs, value);
133 }
134
135 return 0;
136}
137
138static unsigned long debugreg_addr_limit(struct task_struct *task)
139{
140 return TASK_SIZE - 3;
141}
142
143#else /* CONFIG_X86_64 */
144
145#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
146
147static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset)
148{
149 BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0);
150 return &regs->r15 + (offset / sizeof(regs->r15));
151}
152
153static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
154{
155 /*
156 * Returning the value truncates it to 16 bits.
157 */
158 unsigned int seg;
159
160 switch (offset) {
161 case offsetof(struct user_regs_struct, fs):
162 if (task == current) {
163 /* Older gas can't assemble movq %?s,%r?? */
164 asm("movl %%fs,%0" : "=r" (seg));
165 return seg;
166 }
167 return task->thread.fsindex;
168 case offsetof(struct user_regs_struct, gs):
169 if (task == current) {
170 asm("movl %%gs,%0" : "=r" (seg));
171 return seg;
172 }
173 return task->thread.gsindex;
174 case offsetof(struct user_regs_struct, ds):
175 if (task == current) {
176 asm("movl %%ds,%0" : "=r" (seg));
177 return seg;
178 }
179 return task->thread.ds;
180 case offsetof(struct user_regs_struct, es):
181 if (task == current) {
182 asm("movl %%es,%0" : "=r" (seg));
183 return seg;
184 }
185 return task->thread.es;
186
187 case offsetof(struct user_regs_struct, cs):
188 case offsetof(struct user_regs_struct, ss):
189 break;
190 }
191 return *pt_regs_access(task_pt_regs(task), offset);
192}
193
194static int set_segment_reg(struct task_struct *task,
195 unsigned long offset, u16 value)
196{
197 /*
198 * The value argument was already truncated to 16 bits.
199 */
200 if (invalid_selector(value))
201 return -EIO;
202
203 switch (offset) {
204 case offsetof(struct user_regs_struct,fs):
205 /*
206 * If this is setting fs as for normal 64-bit use but
207 * setting fs_base has implicitly changed it, leave it.
208 */
209 if ((value == FS_TLS_SEL && task->thread.fsindex == 0 &&
210 task->thread.fs != 0) ||
211 (value == 0 && task->thread.fsindex == FS_TLS_SEL &&
212 task->thread.fs == 0))
213 break;
214 task->thread.fsindex = value;
215 if (task == current)
216 loadsegment(fs, task->thread.fsindex);
217 break;
218 case offsetof(struct user_regs_struct,gs):
219 /*
220 * If this is setting gs as for normal 64-bit use but
221 * setting gs_base has implicitly changed it, leave it.
222 */
223 if ((value == GS_TLS_SEL && task->thread.gsindex == 0 &&
224 task->thread.gs != 0) ||
225 (value == 0 && task->thread.gsindex == GS_TLS_SEL &&
226 task->thread.gs == 0))
227 break;
228 task->thread.gsindex = value;
229 if (task == current)
230 load_gs_index(task->thread.gsindex);
231 break;
232 case offsetof(struct user_regs_struct,ds):
233 task->thread.ds = value;
234 if (task == current)
235 loadsegment(ds, task->thread.ds);
236 break;
237 case offsetof(struct user_regs_struct,es):
238 task->thread.es = value;
239 if (task == current)
240 loadsegment(es, task->thread.es);
241 break;
242
243 /*
244 * Can't actually change these in 64-bit mode.
245 */
246 case offsetof(struct user_regs_struct,cs):
247 if (unlikely(value == 0))
248 return -EIO;
249#ifdef CONFIG_IA32_EMULATION
250 if (test_tsk_thread_flag(task, TIF_IA32))
251 task_pt_regs(task)->cs = value;
252#endif
253 break;
254 case offsetof(struct user_regs_struct,ss):
255 if (unlikely(value == 0))
256 return -EIO;
257#ifdef CONFIG_IA32_EMULATION
258 if (test_tsk_thread_flag(task, TIF_IA32))
259 task_pt_regs(task)->ss = value;
260#endif
261 break;
262 }
263
264 return 0;
265}
266
267static unsigned long debugreg_addr_limit(struct task_struct *task)
268{
269#ifdef CONFIG_IA32_EMULATION
270 if (test_tsk_thread_flag(task, TIF_IA32))
271 return IA32_PAGE_OFFSET - 3;
272#endif
273 return TASK_SIZE64 - 7;
274}
275
276#endif /* CONFIG_X86_32 */
277
278static unsigned long get_flags(struct task_struct *task)
279{
280 unsigned long retval = task_pt_regs(task)->flags;
281
282 /*
283 * If the debugger set TF, hide it from the readout.
284 */
285 if (test_tsk_thread_flag(task, TIF_FORCED_TF))
286 retval &= ~X86_EFLAGS_TF;
287
288 return retval;
289}
290
291static int set_flags(struct task_struct *task, unsigned long value)
292{
293 struct pt_regs *regs = task_pt_regs(task);
294
295 /*
296 * If the user value contains TF, mark that
297 * it was not "us" (the debugger) that set it.
298 * If not, make sure it stays set if we had.
299 */
300 if (value & X86_EFLAGS_TF)
301 clear_tsk_thread_flag(task, TIF_FORCED_TF);
302 else if (test_tsk_thread_flag(task, TIF_FORCED_TF))
303 value |= X86_EFLAGS_TF;
304
305 regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK);
306
307 return 0;
308}
309
310static int putreg(struct task_struct *child,
311 unsigned long offset, unsigned long value)
312{
313 switch (offset) {
314 case offsetof(struct user_regs_struct, cs):
315 case offsetof(struct user_regs_struct, ds):
316 case offsetof(struct user_regs_struct, es):
317 case offsetof(struct user_regs_struct, fs):
318 case offsetof(struct user_regs_struct, gs):
319 case offsetof(struct user_regs_struct, ss):
320 return set_segment_reg(child, offset, value);
321
322 case offsetof(struct user_regs_struct, flags):
323 return set_flags(child, value);
324
325#ifdef CONFIG_X86_64
326 case offsetof(struct user_regs_struct,fs_base):
327 if (value >= TASK_SIZE_OF(child))
328 return -EIO;
329 /*
330 * When changing the segment base, use do_arch_prctl
331 * to set either thread.fs or thread.fsindex and the
332 * corresponding GDT slot.
333 */
334 if (child->thread.fs != value)
335 return do_arch_prctl(child, ARCH_SET_FS, value);
336 return 0;
337 case offsetof(struct user_regs_struct,gs_base):
338 /*
339 * Exactly the same here as the %fs handling above.
340 */
341 if (value >= TASK_SIZE_OF(child))
342 return -EIO;
343 if (child->thread.gs != value)
344 return do_arch_prctl(child, ARCH_SET_GS, value);
345 return 0;
346#endif
347 }
348
349 *pt_regs_access(task_pt_regs(child), offset) = value;
350 return 0;
351}
352
353static unsigned long getreg(struct task_struct *task, unsigned long offset)
354{
355 switch (offset) {
356 case offsetof(struct user_regs_struct, cs):
357 case offsetof(struct user_regs_struct, ds):
358 case offsetof(struct user_regs_struct, es):
359 case offsetof(struct user_regs_struct, fs):
360 case offsetof(struct user_regs_struct, gs):
361 case offsetof(struct user_regs_struct, ss):
362 return get_segment_reg(task, offset);
363
364 case offsetof(struct user_regs_struct, flags):
365 return get_flags(task);
366
367#ifdef CONFIG_X86_64
368 case offsetof(struct user_regs_struct, fs_base): {
369 /*
370 * do_arch_prctl may have used a GDT slot instead of
371 * the MSR. To userland, it appears the same either
372 * way, except the %fs segment selector might not be 0.
373 */
374 unsigned int seg = task->thread.fsindex;
375 if (task->thread.fs != 0)
376 return task->thread.fs;
377 if (task == current)
378 asm("movl %%fs,%0" : "=r" (seg));
379 if (seg != FS_TLS_SEL)
380 return 0;
381 return get_desc_base(&task->thread.tls_array[FS_TLS]);
382 }
383 case offsetof(struct user_regs_struct, gs_base): {
384 /*
385 * Exactly the same here as the %fs handling above.
386 */
387 unsigned int seg = task->thread.gsindex;
388 if (task->thread.gs != 0)
389 return task->thread.gs;
390 if (task == current)
391 asm("movl %%gs,%0" : "=r" (seg));
392 if (seg != GS_TLS_SEL)
393 return 0;
394 return get_desc_base(&task->thread.tls_array[GS_TLS]);
395 }
396#endif
397 }
398
399 return *pt_regs_access(task_pt_regs(task), offset);
400}
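/*
 * A minimal tracer-side sketch of the fs_base read path above: reading a
 * stopped, attached tracee's %fs base through PTRACE_PEEKUSER (userspace
 * code, assuming the glibc <sys/ptrace.h> and <sys/user.h> definitions).
 */
#include <errno.h>
#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

static long peek_fs_base(pid_t pid)
{
	errno = 0;
	return ptrace(PTRACE_PEEKUSER, pid,
		      (void *)offsetof(struct user_regs_struct, fs_base), NULL);
}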
401
402static int genregs_get(struct task_struct *target,
403 const struct user_regset *regset,
404 unsigned int pos, unsigned int count,
405 void *kbuf, void __user *ubuf)
406{
407 if (kbuf) {
408 unsigned long *k = kbuf;
409 while (count > 0) {
410 *k++ = getreg(target, pos);
411 count -= sizeof(*k);
412 pos += sizeof(*k);
413 }
414 } else {
415 unsigned long __user *u = ubuf;
416 while (count > 0) {
417 if (__put_user(getreg(target, pos), u++))
418 return -EFAULT;
419 count -= sizeof(*u);
420 pos += sizeof(*u);
421 }
422 }
423
424 return 0;
425}
426
427static int genregs_set(struct task_struct *target,
428 const struct user_regset *regset,
429 unsigned int pos, unsigned int count,
430 const void *kbuf, const void __user *ubuf)
431{
432 int ret = 0;
433 if (kbuf) {
434 const unsigned long *k = kbuf;
435 while (count > 0 && !ret) {
436 ret = putreg(target, pos, *k++);
437 count -= sizeof(*k);
438 pos += sizeof(*k);
439 }
440 } else {
441 const unsigned long __user *u = ubuf;
442 while (count > 0 && !ret) {
443 unsigned long word;
444 ret = __get_user(word, u++);
445 if (ret)
446 break;
447 ret = putreg(target, pos, word);
448 count -= sizeof(*u);
449 pos += sizeof(*u);
450 }
451 }
452 return ret;
453}
454
455/*
456 * This function is trivial and will be inlined by the compiler.
457 * Having it separates the implementation details of debug
458 * registers from the interface details of ptrace.
459 */
460static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
461{
462 switch (n) {
463 case 0: return child->thread.debugreg0;
464 case 1: return child->thread.debugreg1;
465 case 2: return child->thread.debugreg2;
466 case 3: return child->thread.debugreg3;
467 case 6: return child->thread.debugreg6;
468 case 7: return child->thread.debugreg7;
469 }
470 return 0;
471}
472
473static int ptrace_set_debugreg(struct task_struct *child,
474 int n, unsigned long data)
475{
476 int i;
477
478 if (unlikely(n == 4 || n == 5))
479 return -EIO;
480
481 if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
482 return -EIO;
483
484 switch (n) {
485 case 0: child->thread.debugreg0 = data; break;
486 case 1: child->thread.debugreg1 = data; break;
487 case 2: child->thread.debugreg2 = data; break;
488 case 3: child->thread.debugreg3 = data; break;
489
490 case 6:
491 if ((data & ~0xffffffffUL) != 0)
492 return -EIO;
493 child->thread.debugreg6 = data;
494 break;
495
496 case 7:
497 /*
498		 * Sanity-check data. Take one half-byte at a time with
499 * check = (val >> (16 + 4*i)) & 0xf. It contains the
500 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
501 * 2 and 3 are LENi. Given a list of invalid values,
502 * we do mask |= 1 << invalid_value, so that
503 * (mask >> check) & 1 is a correct test for invalid
504 * values.
505 *
506 * R/Wi contains the type of the breakpoint /
507 * watchpoint, LENi contains the length of the watched
508 * data in the watchpoint case.
509 *
510 * The invalid values are:
511 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
512 * - R/Wi == 0x10 (break on I/O reads or writes), so
513 * mask |= 0x4444.
514 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
515 * 0x1110.
516 *
517 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
518 *
519 * See the Intel Manual "System Programming Guide",
520 * 15.2.4
521 *
522 * Note that LENi == 0x10 is defined on x86_64 in long
523 * mode (i.e. even for 32-bit userspace software, but
524		 * 64-bit kernel), so the x86_64 mask value is 0x5554.
525 * See the AMD manual no. 24593 (AMD64 System Programming)
526 */
527#ifdef CONFIG_X86_32
528#define DR7_MASK 0x5f54
529#else
530#define DR7_MASK 0x5554
531#endif
532 data &= ~DR_CONTROL_RESERVED;
533 for (i = 0; i < 4; i++)
534 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
535 return -EIO;
536 child->thread.debugreg7 = data;
537 if (data)
538 set_tsk_thread_flag(child, TIF_DEBUG);
539 else
540 clear_tsk_thread_flag(child, TIF_DEBUG);
541 break;
542 }
543
544 return 0;
545}
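/*
 * The DR7 nibble check above, pulled out as a sketch: for breakpoint i,
 * (data >> (16 + 4*i)) & 0xf holds R/Wi in bits 0-1 and LENi in bits 2-3,
 * and DR7_MASK has a 1 at every invalid nibble value.  For example,
 * dr7_nibble_ok(0x3) is true (R/W=11, LEN=00: 1-byte data watchpoint) and
 * dr7_nibble_ok(0x2) is false (R/W=10: break on I/O, rejected here).
 */
static inline int dr7_nibble_ok(unsigned int nibble)
{
	return !((DR7_MASK >> (nibble & 0xf)) & 1);
}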
546
547static int ptrace_bts_get_size(struct task_struct *child)
548{
549 if (!child->thread.ds_area_msr)
550 return -ENXIO;
551
552 return ds_get_bts_index((void *)child->thread.ds_area_msr);
553}
554
555static int ptrace_bts_read_record(struct task_struct *child,
556 long index,
557 struct bts_struct __user *out)
558{
559 struct bts_struct ret;
560 int retval;
561 int bts_end;
562 int bts_index;
563
564 if (!child->thread.ds_area_msr)
565 return -ENXIO;
566
567 if (index < 0)
568 return -EINVAL;
569
570 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
571 if (bts_end <= index)
572 return -EINVAL;
573
574 /* translate the ptrace bts index into the ds bts index */
575 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr);
576 bts_index -= (index + 1);
577 if (bts_index < 0)
578 bts_index += bts_end;
579
580 retval = ds_read_bts((void *)child->thread.ds_area_msr,
581 bts_index, &ret);
582 if (retval < 0)
583 return retval;
584
585 if (copy_to_user(out, &ret, sizeof(ret)))
586 return -EFAULT;
587
588 return sizeof(ret);
589}
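/*
 * The ptrace-to-DS index translation above as a standalone sketch: with
 * bts_end == 64 ring slots and a DS write index of 10, ptrace index 0 maps
 * to DS slot 9 (the newest record) and ptrace index 12 wraps to DS slot 61.
 */
static int ptrace_bts_index_to_ds(int ptrace_index, int ds_index, int bts_end)
{
	int i = ds_index - (ptrace_index + 1);

	return (i < 0) ? i + bts_end : i;
}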
590
591static int ptrace_bts_write_record(struct task_struct *child,
592 const struct bts_struct *in)
593{
594 int retval;
595
596 if (!child->thread.ds_area_msr)
597 return -ENXIO;
598
599 retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
600 if (retval)
601 return retval;
602
603 return sizeof(*in);
604}
605
606static int ptrace_bts_clear(struct task_struct *child)
607{
608 if (!child->thread.ds_area_msr)
609 return -ENXIO;
610
611 return ds_clear((void *)child->thread.ds_area_msr);
612}
613
614static int ptrace_bts_drain(struct task_struct *child,
615 long size,
616 struct bts_struct __user *out)
617{
618 int end, i;
619 void *ds = (void *)child->thread.ds_area_msr;
620
621 if (!ds)
622 return -ENXIO;
623
624 end = ds_get_bts_index(ds);
625 if (end <= 0)
626 return end;
627
628 if (size < (end * sizeof(struct bts_struct)))
629 return -EIO;
630
631 for (i = 0; i < end; i++, out++) {
632 struct bts_struct ret;
633 int retval;
634
635 retval = ds_read_bts(ds, i, &ret);
636 if (retval < 0)
637 return retval;
638
639 if (copy_to_user(out, &ret, sizeof(ret)))
640 return -EFAULT;
641 }
642
643 ds_clear(ds);
644
645 return end;
646}
647
648static int ptrace_bts_realloc(struct task_struct *child,
649 int size, int reduce_size)
650{
651 unsigned long rlim, vm;
652 int ret, old_size;
653
654 if (size < 0)
655 return -EINVAL;
656
657 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
658 if (old_size < 0)
659 return old_size;
660
661 ret = ds_free((void **)&child->thread.ds_area_msr);
662 if (ret < 0)
663 goto out;
664
665 size >>= PAGE_SHIFT;
666 old_size >>= PAGE_SHIFT;
667
668 current->mm->total_vm -= old_size;
669 current->mm->locked_vm -= old_size;
670
671 if (size == 0)
672 goto out;
673
674 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
675 vm = current->mm->total_vm + size;
676 if (rlim < vm) {
677 ret = -ENOMEM;
678
679 if (!reduce_size)
680 goto out;
681
682 size = rlim - current->mm->total_vm;
683 if (size <= 0)
684 goto out;
685 }
686
687 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
688 vm = current->mm->locked_vm + size;
689 if (rlim < vm) {
690 ret = -ENOMEM;
691
692 if (!reduce_size)
693 goto out;
694
695 size = rlim - current->mm->locked_vm;
696 if (size <= 0)
697 goto out;
698 }
699
700 ret = ds_allocate((void **)&child->thread.ds_area_msr,
701 size << PAGE_SHIFT);
702 if (ret < 0)
703 goto out;
704
705 current->mm->total_vm += size;
706 current->mm->locked_vm += size;
707
708out:
709 if (child->thread.ds_area_msr)
710 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
711 else
712 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
713
714 return ret;
715}
716
717static int ptrace_bts_config(struct task_struct *child,
718 long cfg_size,
719 const struct ptrace_bts_config __user *ucfg)
720{
721 struct ptrace_bts_config cfg;
722 int bts_size, ret = 0;
723 void *ds;
724
725 if (cfg_size < sizeof(cfg))
726 return -EIO;
727
728 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
729 return -EFAULT;
730
731 if ((int)cfg.size < 0)
732 return -EINVAL;
733
734 bts_size = 0;
735 ds = (void *)child->thread.ds_area_msr;
736 if (ds) {
737 bts_size = ds_get_bts_size(ds);
738 if (bts_size < 0)
739 return bts_size;
740 }
741 cfg.size = PAGE_ALIGN(cfg.size);
742
743 if (bts_size != cfg.size) {
744 ret = ptrace_bts_realloc(child, cfg.size,
745 cfg.flags & PTRACE_BTS_O_CUT_SIZE);
746 if (ret < 0)
747 goto errout;
748
749 ds = (void *)child->thread.ds_area_msr;
750 }
751
752 if (cfg.flags & PTRACE_BTS_O_SIGNAL)
753 ret = ds_set_overflow(ds, DS_O_SIGNAL);
754 else
755 ret = ds_set_overflow(ds, DS_O_WRAP);
756 if (ret < 0)
757 goto errout;
758
759 if (cfg.flags & PTRACE_BTS_O_TRACE)
760 child->thread.debugctlmsr |= ds_debugctl_mask();
761 else
762 child->thread.debugctlmsr &= ~ds_debugctl_mask();
763
764 if (cfg.flags & PTRACE_BTS_O_SCHED)
765 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
766 else
767 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
768
769 ret = sizeof(cfg);
770
771out:
772 if (child->thread.debugctlmsr)
773 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
774 else
775 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
776
777 return ret;
778
779errout:
780 child->thread.debugctlmsr &= ~ds_debugctl_mask();
781 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
782 goto out;
783}
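/*
 * Tracer-side sketch of ptrace_bts_config(): the config struct travels in
 * the ptrace "addr" argument and its size in "data".  Userspace code,
 * assuming the PTRACE_BTS_* constants and struct ptrace_bts_config exported
 * by this series; error handling omitted, "pid" is an attached, stopped
 * tracee.
 */
static void enable_bts(pid_t pid)
{
	struct ptrace_bts_config cfg = {
		.size  = 4096,	/* rounded up to whole pages by the kernel */
		.flags = PTRACE_BTS_O_TRACE | PTRACE_BTS_O_SCHED,
	};

	ptrace(PTRACE_BTS_CONFIG, pid, &cfg, (void *)sizeof(cfg));
}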
784
785static int ptrace_bts_status(struct task_struct *child,
786 long cfg_size,
787 struct ptrace_bts_config __user *ucfg)
788{
789 void *ds = (void *)child->thread.ds_area_msr;
790 struct ptrace_bts_config cfg;
791
792 if (cfg_size < sizeof(cfg))
793 return -EIO;
794
795 memset(&cfg, 0, sizeof(cfg));
796
797 if (ds) {
798 cfg.size = ds_get_bts_size(ds);
799
800 if (ds_get_overflow(ds) == DS_O_SIGNAL)
801 cfg.flags |= PTRACE_BTS_O_SIGNAL;
802
803 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
804 child->thread.debugctlmsr & ds_debugctl_mask())
805 cfg.flags |= PTRACE_BTS_O_TRACE;
806
807 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
808 cfg.flags |= PTRACE_BTS_O_SCHED;
809 }
810
811 cfg.bts_size = sizeof(struct bts_struct);
812
813 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
814 return -EFAULT;
815
816 return sizeof(cfg);
817}
818
819void ptrace_bts_take_timestamp(struct task_struct *tsk,
820 enum bts_qualifier qualifier)
821{
822 struct bts_struct rec = {
823 .qualifier = qualifier,
824 .variant.jiffies = jiffies_64
825 };
826
827 ptrace_bts_write_record(tsk, &rec);
828}
829
830/*
831 * Called by kernel/ptrace.c when detaching..
832 *
833 * Make sure the single step bit is not set.
834 */
835void ptrace_disable(struct task_struct *child)
836{
837 user_disable_single_step(child);
838#ifdef TIF_SYSCALL_EMU
839 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
840#endif
841 if (child->thread.ds_area_msr) {
842 ptrace_bts_realloc(child, 0, 0);
843 child->thread.debugctlmsr &= ~ds_debugctl_mask();
844 if (!child->thread.debugctlmsr)
845 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
846 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
847 }
848}
849
850#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
851static const struct user_regset_view user_x86_32_view; /* Initialized below. */
852#endif
853
854long arch_ptrace(struct task_struct *child, long request, long addr, long data)
855{
856 int ret;
857 unsigned long __user *datap = (unsigned long __user *)data;
858
859 switch (request) {
860 /* read the word at location addr in the USER area. */
861 case PTRACE_PEEKUSR: {
862 unsigned long tmp;
863
864 ret = -EIO;
865 if ((addr & (sizeof(data) - 1)) || addr < 0 ||
866 addr >= sizeof(struct user))
867 break;
868
869 tmp = 0; /* Default return condition */
870 if (addr < sizeof(struct user_regs_struct))
871 tmp = getreg(child, addr);
872 else if (addr >= offsetof(struct user, u_debugreg[0]) &&
873 addr <= offsetof(struct user, u_debugreg[7])) {
874 addr -= offsetof(struct user, u_debugreg[0]);
875 tmp = ptrace_get_debugreg(child, addr / sizeof(data));
876 }
877 ret = put_user(tmp, datap);
878 break;
879 }
880
881 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
882 ret = -EIO;
883 if ((addr & (sizeof(data) - 1)) || addr < 0 ||
884 addr >= sizeof(struct user))
885 break;
886
887 if (addr < sizeof(struct user_regs_struct))
888 ret = putreg(child, addr, data);
889 else if (addr >= offsetof(struct user, u_debugreg[0]) &&
890 addr <= offsetof(struct user, u_debugreg[7])) {
891 addr -= offsetof(struct user, u_debugreg[0]);
892 ret = ptrace_set_debugreg(child,
893 addr / sizeof(data), data);
894 }
895 break;
896
897 case PTRACE_GETREGS: /* Get all gp regs from the child. */
898 return copy_regset_to_user(child,
899 task_user_regset_view(current),
900 REGSET_GENERAL,
901 0, sizeof(struct user_regs_struct),
902 datap);
903
904 case PTRACE_SETREGS: /* Set all gp regs in the child. */
905 return copy_regset_from_user(child,
906 task_user_regset_view(current),
907 REGSET_GENERAL,
908 0, sizeof(struct user_regs_struct),
909 datap);
910
911 case PTRACE_GETFPREGS: /* Get the child FPU state. */
912 return copy_regset_to_user(child,
913 task_user_regset_view(current),
914 REGSET_FP,
915 0, sizeof(struct user_i387_struct),
916 datap);
917
918 case PTRACE_SETFPREGS: /* Set the child FPU state. */
919 return copy_regset_from_user(child,
920 task_user_regset_view(current),
921 REGSET_FP,
922 0, sizeof(struct user_i387_struct),
923 datap);
924
925#ifdef CONFIG_X86_32
926 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
927 return copy_regset_to_user(child, &user_x86_32_view,
928 REGSET_XFP,
929 0, sizeof(struct user_fxsr_struct),
930 datap);
931
932 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
933 return copy_regset_from_user(child, &user_x86_32_view,
934 REGSET_XFP,
935 0, sizeof(struct user_fxsr_struct),
936 datap);
937#endif
938
939#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
940 case PTRACE_GET_THREAD_AREA:
941 if (addr < 0)
942 return -EIO;
943 ret = do_get_thread_area(child, addr,
944 (struct user_desc __user *) data);
945 break;
946
947 case PTRACE_SET_THREAD_AREA:
948 if (addr < 0)
949 return -EIO;
950 ret = do_set_thread_area(child, addr,
951 (struct user_desc __user *) data, 0);
952 break;
953#endif
954
955#ifdef CONFIG_X86_64
956 /* normal 64bit interface to access TLS data.
957 Works just like arch_prctl, except that the arguments
958 are reversed. */
959 case PTRACE_ARCH_PRCTL:
960 ret = do_arch_prctl(child, data, addr);
961 break;
962#endif
963
964 case PTRACE_BTS_CONFIG:
965 ret = ptrace_bts_config
966 (child, data, (struct ptrace_bts_config __user *)addr);
967 break;
968
969 case PTRACE_BTS_STATUS:
970 ret = ptrace_bts_status
971 (child, data, (struct ptrace_bts_config __user *)addr);
972 break;
973
974 case PTRACE_BTS_SIZE:
975 ret = ptrace_bts_get_size(child);
976 break;
977
978 case PTRACE_BTS_GET:
979 ret = ptrace_bts_read_record
980 (child, data, (struct bts_struct __user *) addr);
981 break;
982
983 case PTRACE_BTS_CLEAR:
984 ret = ptrace_bts_clear(child);
985 break;
986
987 case PTRACE_BTS_DRAIN:
988 ret = ptrace_bts_drain
989 (child, data, (struct bts_struct __user *) addr);
990 break;
991
992 default:
993 ret = ptrace_request(child, request, addr, data);
994 break;
995 }
996
997 return ret;
998}
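/*
 * Tracer-side sketch of the u_debugreg path in arch_ptrace() above: program
 * DR0 with an address and DR7 with a 4-byte write watchpoint on breakpoint 0
 * (L0=1, R/W0=01, LEN0=11), which passes the DR7_MASK check in
 * ptrace_set_debugreg().  Userspace code, assuming glibc's <sys/ptrace.h>
 * and <sys/user.h>; error handling omitted, "pid" is an attached, stopped
 * tracee.
 */
static void set_write_watchpoint(pid_t pid, unsigned long addr)
{
	unsigned long dr7 = 0x1UL | (0x1UL << 16) | (0x3UL << 18);

	ptrace(PTRACE_POKEUSER, pid,
	       (void *)offsetof(struct user, u_debugreg[0]), (void *)addr);
	ptrace(PTRACE_POKEUSER, pid,
	       (void *)offsetof(struct user, u_debugreg[7]), (void *)dr7);
}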
999
1000#ifdef CONFIG_IA32_EMULATION
1001
1002#include <linux/compat.h>
1003#include <linux/syscalls.h>
1004#include <asm/ia32.h>
1005#include <asm/user32.h>
1006
1007#define R32(l,q) \
1008 case offsetof(struct user32, regs.l): \
1009 regs->q = value; break
1010
1011#define SEG32(rs) \
1012 case offsetof(struct user32, regs.rs): \
1013 return set_segment_reg(child, \
1014 offsetof(struct user_regs_struct, rs), \
1015 value); \
1016 break
1017
1018static int putreg32(struct task_struct *child, unsigned regno, u32 value)
1019{
1020 struct pt_regs *regs = task_pt_regs(child);
1021
1022 switch (regno) {
1023
1024 SEG32(cs);
1025 SEG32(ds);
1026 SEG32(es);
1027 SEG32(fs);
1028 SEG32(gs);
1029 SEG32(ss);
1030
1031 R32(ebx, bx);
1032 R32(ecx, cx);
1033 R32(edx, dx);
1034 R32(edi, di);
1035 R32(esi, si);
1036 R32(ebp, bp);
1037 R32(eax, ax);
1038 R32(orig_eax, orig_ax);
1039 R32(eip, ip);
1040 R32(esp, sp);
1041
1042 case offsetof(struct user32, regs.eflags):
1043 return set_flags(child, value);
1044
1045 case offsetof(struct user32, u_debugreg[0]) ...
1046 offsetof(struct user32, u_debugreg[7]):
1047 regno -= offsetof(struct user32, u_debugreg[0]);
1048 return ptrace_set_debugreg(child, regno / 4, value);
1049
1050 default:
1051 if (regno > sizeof(struct user32) || (regno & 3))
1052 return -EIO;
1053
1054 /*
1055 * Other dummy fields in the virtual user structure
1056 * are ignored
1057 */
1058 break;
1059 }
1060 return 0;
1061}
1062
1063#undef R32
1064#undef SEG32
1065
1066#define R32(l,q) \
1067 case offsetof(struct user32, regs.l): \
1068 *val = regs->q; break
1069
1070#define SEG32(rs) \
1071 case offsetof(struct user32, regs.rs): \
1072 *val = get_segment_reg(child, \
1073 offsetof(struct user_regs_struct, rs)); \
1074 break
1075
1076static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
1077{
1078 struct pt_regs *regs = task_pt_regs(child);
1079
1080 switch (regno) {
1081
1082 SEG32(ds);
1083 SEG32(es);
1084 SEG32(fs);
1085 SEG32(gs);
1086
1087 R32(cs, cs);
1088 R32(ss, ss);
1089 R32(ebx, bx);
1090 R32(ecx, cx);
1091 R32(edx, dx);
1092 R32(edi, di);
1093 R32(esi, si);
1094 R32(ebp, bp);
1095 R32(eax, ax);
1096 R32(orig_eax, orig_ax);
1097 R32(eip, ip);
1098 R32(esp, sp);
1099
1100 case offsetof(struct user32, regs.eflags):
1101 *val = get_flags(child);
1102 break;
1103
1104 case offsetof(struct user32, u_debugreg[0]) ...
1105 offsetof(struct user32, u_debugreg[7]):
1106 regno -= offsetof(struct user32, u_debugreg[0]);
1107 *val = ptrace_get_debugreg(child, regno / 4);
1108 break;
1109
1110 default:
1111 if (regno > sizeof(struct user32) || (regno & 3))
1112 return -EIO;
1113
1114 /*
1115 * Other dummy fields in the virtual user structure
1116 * are ignored
1117 */
1118 *val = 0;
1119 break;
1120 }
1121 return 0;
1122}
1123
1124#undef R32
1125#undef SEG32
1126
1127static int genregs32_get(struct task_struct *target,
1128 const struct user_regset *regset,
1129 unsigned int pos, unsigned int count,
1130 void *kbuf, void __user *ubuf)
1131{
1132 if (kbuf) {
1133 compat_ulong_t *k = kbuf;
1134 while (count > 0) {
1135 getreg32(target, pos, k++);
1136 count -= sizeof(*k);
1137 pos += sizeof(*k);
1138 }
1139 } else {
1140 compat_ulong_t __user *u = ubuf;
1141 while (count > 0) {
1142 compat_ulong_t word;
1143 getreg32(target, pos, &word);
1144 if (__put_user(word, u++))
1145 return -EFAULT;
1146 count -= sizeof(*u);
1147 pos += sizeof(*u);
1148 }
1149 }
1150
1151 return 0;
1152}
1153
1154static int genregs32_set(struct task_struct *target,
1155 const struct user_regset *regset,
1156 unsigned int pos, unsigned int count,
1157 const void *kbuf, const void __user *ubuf)
1158{
1159 int ret = 0;
1160 if (kbuf) {
1161 const compat_ulong_t *k = kbuf;
1162 while (count > 0 && !ret) {
1163 ret = putreg(target, pos, *k++);
1164 count -= sizeof(*k);
1165 pos += sizeof(*k);
1166 }
1167 } else {
1168 const compat_ulong_t __user *u = ubuf;
1169 while (count > 0 && !ret) {
1170 compat_ulong_t word;
1171 ret = __get_user(word, u++);
1172 if (ret)
1173 break;
1174 ret = putreg(target, pos, word);
1175 count -= sizeof(*u);
1176 pos += sizeof(*u);
1177 }
1178 }
1179 return ret;
1180}
1181
1182static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
1183{
1184 siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
1185 compat_siginfo_t __user *si32 = compat_ptr(data);
1186 siginfo_t ssi;
1187 int ret;
1188
1189 if (request == PTRACE_SETSIGINFO) {
1190 memset(&ssi, 0, sizeof(siginfo_t));
1191 ret = copy_siginfo_from_user32(&ssi, si32);
1192 if (ret)
1193 return ret;
1194 if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
1195 return -EFAULT;
1196 }
1197 ret = sys_ptrace(request, pid, addr, (unsigned long)si);
1198 if (ret)
1199 return ret;
1200 if (request == PTRACE_GETSIGINFO) {
1201 if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
1202 return -EFAULT;
1203 ret = copy_siginfo_to_user32(si32, &ssi);
1204 }
1205 return ret;
1206}
1207
1208asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
1209{
1210 struct task_struct *child;
1211 struct pt_regs *childregs;
1212 void __user *datap = compat_ptr(data);
1213 int ret;
1214 __u32 val;
1215
1216 switch (request) {
1217 case PTRACE_TRACEME:
1218 case PTRACE_ATTACH:
1219 case PTRACE_KILL:
1220 case PTRACE_CONT:
1221 case PTRACE_SINGLESTEP:
1222 case PTRACE_SINGLEBLOCK:
1223 case PTRACE_DETACH:
1224 case PTRACE_SYSCALL:
1225 case PTRACE_OLDSETOPTIONS:
1226 case PTRACE_SETOPTIONS:
1227 case PTRACE_SET_THREAD_AREA:
1228 case PTRACE_GET_THREAD_AREA:
1229 case PTRACE_BTS_CONFIG:
1230 case PTRACE_BTS_STATUS:
1231 case PTRACE_BTS_SIZE:
1232 case PTRACE_BTS_GET:
1233 case PTRACE_BTS_CLEAR:
1234 case PTRACE_BTS_DRAIN:
1235 return sys_ptrace(request, pid, addr, data);
1236
1237 default:
1238 return -EINVAL;
1239
1240 case PTRACE_PEEKTEXT:
1241 case PTRACE_PEEKDATA:
1242 case PTRACE_POKEDATA:
1243 case PTRACE_POKETEXT:
1244 case PTRACE_POKEUSR:
1245 case PTRACE_PEEKUSR:
1246 case PTRACE_GETREGS:
1247 case PTRACE_SETREGS:
1248 case PTRACE_SETFPREGS:
1249 case PTRACE_GETFPREGS:
1250 case PTRACE_SETFPXREGS:
1251 case PTRACE_GETFPXREGS:
1252 case PTRACE_GETEVENTMSG:
1253 break;
1254
1255 case PTRACE_SETSIGINFO:
1256 case PTRACE_GETSIGINFO:
1257 return ptrace32_siginfo(request, pid, addr, data);
1258 }
1259
1260 child = ptrace_get_task_struct(pid);
1261 if (IS_ERR(child))
1262 return PTR_ERR(child);
1263
1264 ret = ptrace_check_attach(child, request == PTRACE_KILL);
1265 if (ret < 0)
1266 goto out;
1267
1268 childregs = task_pt_regs(child);
1269
1270 switch (request) {
1271 case PTRACE_PEEKUSR:
1272 ret = getreg32(child, addr, &val);
1273 if (ret == 0)
1274 ret = put_user(val, (__u32 __user *)datap);
1275 break;
1276
1277 case PTRACE_POKEUSR:
1278 ret = putreg32(child, addr, data);
1279 break;
1280
1281 case PTRACE_GETREGS: /* Get all gp regs from the child. */
1282 return copy_regset_to_user(child, &user_x86_32_view,
1283 REGSET_GENERAL,
1284 0, sizeof(struct user_regs_struct32),
1285 datap);
1286
1287 case PTRACE_SETREGS: /* Set all gp regs in the child. */
1288 return copy_regset_from_user(child, &user_x86_32_view,
1289 REGSET_GENERAL, 0,
1290 sizeof(struct user_regs_struct32),
1291 datap);
1292
1293 case PTRACE_GETFPREGS: /* Get the child FPU state. */
1294 return copy_regset_to_user(child, &user_x86_32_view,
1295 REGSET_FP, 0,
1296 sizeof(struct user_i387_ia32_struct),
1297 datap);
1298
1299 case PTRACE_SETFPREGS: /* Set the child FPU state. */
1300 return copy_regset_from_user(
1301 child, &user_x86_32_view, REGSET_FP,
1302 0, sizeof(struct user_i387_ia32_struct), datap);
1303
1304 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
1305 return copy_regset_to_user(child, &user_x86_32_view,
1306 REGSET_XFP, 0,
1307 sizeof(struct user32_fxsr_struct),
1308 datap);
1309
1310 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
1311 return copy_regset_from_user(child, &user_x86_32_view,
1312 REGSET_XFP, 0,
1313 sizeof(struct user32_fxsr_struct),
1314 datap);
1315
1316 default:
1317 return compat_ptrace_request(child, request, addr, data);
1318 }
1319
1320 out:
1321 put_task_struct(child);
1322 return ret;
1323}
1324
1325#endif /* CONFIG_IA32_EMULATION */
1326
1327#ifdef CONFIG_X86_64
1328
1329static const struct user_regset x86_64_regsets[] = {
1330 [REGSET_GENERAL] = {
1331 .core_note_type = NT_PRSTATUS,
1332 .n = sizeof(struct user_regs_struct) / sizeof(long),
1333 .size = sizeof(long), .align = sizeof(long),
1334 .get = genregs_get, .set = genregs_set
1335 },
1336 [REGSET_FP] = {
1337 .core_note_type = NT_PRFPREG,
1338 .n = sizeof(struct user_i387_struct) / sizeof(long),
1339 .size = sizeof(long), .align = sizeof(long),
1340 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1341 },
1342};
1343
1344static const struct user_regset_view user_x86_64_view = {
1345 .name = "x86_64", .e_machine = EM_X86_64,
1346 .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets)
1347};
1348
1349#else /* CONFIG_X86_32 */
1350
1351#define user_regs_struct32 user_regs_struct
1352#define genregs32_get genregs_get
1353#define genregs32_set genregs_set
1354
1355#endif /* CONFIG_X86_64 */
1356
1357#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1358static const struct user_regset x86_32_regsets[] = {
1359 [REGSET_GENERAL] = {
1360 .core_note_type = NT_PRSTATUS,
1361 .n = sizeof(struct user_regs_struct32) / sizeof(u32),
1362 .size = sizeof(u32), .align = sizeof(u32),
1363 .get = genregs32_get, .set = genregs32_set
1364 },
1365 [REGSET_FP] = {
1366 .core_note_type = NT_PRFPREG,
1367 .n = sizeof(struct user_i387_struct) / sizeof(u32),
1368 .size = sizeof(u32), .align = sizeof(u32),
1369 .active = fpregs_active, .get = fpregs_get, .set = fpregs_set
1370 },
1371 [REGSET_XFP] = {
1372 .core_note_type = NT_PRXFPREG,
1373 .n = sizeof(struct user_i387_struct) / sizeof(u32),
1374 .size = sizeof(u32), .align = sizeof(u32),
1375 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1376 },
1377 [REGSET_TLS] = {
1378 .core_note_type = NT_386_TLS,
1379 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
1380 .size = sizeof(struct user_desc),
1381 .align = sizeof(struct user_desc),
1382 .active = regset_tls_active,
1383 .get = regset_tls_get, .set = regset_tls_set
1384 },
1385};
1386
1387static const struct user_regset_view user_x86_32_view = {
1388 .name = "i386", .e_machine = EM_386,
1389 .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets)
1390};
1391#endif
1392
1393const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1394{
1395#ifdef CONFIG_IA32_EMULATION
1396 if (test_tsk_thread_flag(task, TIF_IA32))
1397#endif
1398#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1399 return &user_x86_32_view;
1400#endif
1401#ifdef CONFIG_X86_64
1402 return &user_x86_64_view;
1403#endif
1404}
1405
1406#ifdef CONFIG_X86_32
1407
1408void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1409{
1410 struct siginfo info;
1411
1412 tsk->thread.trap_no = 1;
1413 tsk->thread.error_code = error_code;
1414
1415 memset(&info, 0, sizeof(info));
1416 info.si_signo = SIGTRAP;
1417 info.si_code = TRAP_BRKPT;
1418
1419 /* User-mode ip? */
1420 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
1421
1422 /* Send us the fake SIGTRAP */
1423 force_sig_info(SIGTRAP, &info, tsk);
1424}
1425
1426/* notification of system call entry/exit
1427 * - triggered by current->work.syscall_trace
1428 */
1429__attribute__((regparm(3)))
1430int do_syscall_trace(struct pt_regs *regs, int entryexit)
1431{
1432 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1433 /*
1434 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1435 * interception
1436 */
1437 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1438 int ret = 0;
1439
1440 /* do the secure computing check first */
1441 if (!entryexit)
1442 secure_computing(regs->orig_ax);
1443
1444 if (unlikely(current->audit_context)) {
1445 if (entryexit)
1446 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1447 regs->ax);
1448 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1449 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1450 * not used, entry.S will call us only on syscall exit, not
1451 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1452 * calling send_sigtrap() on syscall entry.
1453 *
1454 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1455		 * is_singlestep is false, despite its name, so we will still do
1456 * the correct thing.
1457 */
1458 else if (is_singlestep)
1459 goto out;
1460 }
1461
1462 if (!(current->ptrace & PT_PTRACED))
1463 goto out;
1464
1465 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1466 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1467 * here. We have to check this and return */
1468 if (is_sysemu && entryexit)
1469 return 0;
1470
1471 /* Fake a debug trap */
1472 if (is_singlestep)
1473 send_sigtrap(current, regs, 0);
1474
1475 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1476 goto out;
1477
1478 /* the 0x80 provides a way for the tracing parent to distinguish
1479 between a syscall stop and SIGTRAP delivery */
1480 /* Note that the debugger could change the result of test_thread_flag!*/
1481 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1482
1483 /*
1484 * this isn't the same as continuing with a signal, but it will do
1485 * for normal use. strace only continues with a signal if the
1486 * stopping signal is not SIGTRAP. -brl
1487 */
1488 if (current->exit_code) {
1489 send_sig(current->exit_code, current, 1);
1490 current->exit_code = 0;
1491 }
1492 ret = is_sysemu;
1493out:
1494 if (unlikely(current->audit_context) && !entryexit)
1495 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1496 regs->bx, regs->cx, regs->dx, regs->si);
1497 if (ret == 0)
1498 return 0;
1499
1500 regs->orig_ax = -1; /* force skip of syscall restarting */
1501 if (unlikely(current->audit_context))
1502 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1503 return 1;
1504}
1505
1506#else /* CONFIG_X86_64 */
1507
1508static void syscall_trace(struct pt_regs *regs)
1509{
1510
1511#if 0
1512 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1513 current->comm,
1514 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1515 current_thread_info()->flags, current->ptrace);
1516#endif
1517
1518 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1519 ? 0x80 : 0));
1520 /*
1521 * this isn't the same as continuing with a signal, but it will do
1522 * for normal use. strace only continues with a signal if the
1523 * stopping signal is not SIGTRAP. -brl
1524 */
1525 if (current->exit_code) {
1526 send_sig(current->exit_code, current, 1);
1527 current->exit_code = 0;
1528 }
1529}
1530
1531asmlinkage void syscall_trace_enter(struct pt_regs *regs)
1532{
1533 /* do the secure computing check first */
1534 secure_computing(regs->orig_ax);
1535
1536 if (test_thread_flag(TIF_SYSCALL_TRACE)
1537 && (current->ptrace & PT_PTRACED))
1538 syscall_trace(regs);
1539
1540 if (unlikely(current->audit_context)) {
1541 if (test_thread_flag(TIF_IA32)) {
1542 audit_syscall_entry(AUDIT_ARCH_I386,
1543 regs->orig_ax,
1544 regs->bx, regs->cx,
1545 regs->dx, regs->si);
1546 } else {
1547 audit_syscall_entry(AUDIT_ARCH_X86_64,
1548 regs->orig_ax,
1549 regs->di, regs->si,
1550 regs->dx, regs->r10);
1551 }
1552 }
1553}
1554
1555asmlinkage void syscall_trace_leave(struct pt_regs *regs)
1556{
1557 if (unlikely(current->audit_context))
1558 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1559
1560 if ((test_thread_flag(TIF_SYSCALL_TRACE)
1561 || test_thread_flag(TIF_SINGLESTEP))
1562 && (current->ptrace & PT_PTRACED))
1563 syscall_trace(regs);
1564}
1565
1566#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
deleted file mode 100644
index ff5431cc03e..00000000000
--- a/arch/x86/kernel/ptrace_32.c
+++ /dev/null
@@ -1,717 +0,0 @@
1/* By Ross Biro 1/23/92 */
2/*
3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 */
6
7#include <linux/kernel.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/smp.h>
11#include <linux/errno.h>
12#include <linux/ptrace.h>
13#include <linux/user.h>
14#include <linux/security.h>
15#include <linux/audit.h>
16#include <linux/seccomp.h>
17#include <linux/signal.h>
18
19#include <asm/uaccess.h>
20#include <asm/pgtable.h>
21#include <asm/system.h>
22#include <asm/processor.h>
23#include <asm/i387.h>
24#include <asm/debugreg.h>
25#include <asm/ldt.h>
26#include <asm/desc.h>
27
28/*
29 * does not yet catch signals sent when the child dies.
30 * in exit.c or in signal.c.
31 */
32
33/*
34 * Determines which flags the user has access to [1 = access, 0 = no access].
35 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
36 * Also masks reserved bits (31-22, 15, 5, 3, 1).
37 */
38#define FLAG_MASK 0x00050dd5
39
40/* sets the trap flag. */
41#define TRAP_FLAG 0x100
42
43/*
44 * Offset of eflags on child stack..
45 */
46#define EFL_OFFSET offsetof(struct pt_regs, eflags)
47
48static inline struct pt_regs *get_child_regs(struct task_struct *task)
49{
50 void *stack_top = (void *)task->thread.esp0;
51 return stack_top - sizeof(struct pt_regs);
52}
53
54/*
55 * This routine will get a word off of the processes privileged stack.
56 * the offset is bytes into the pt_regs structure on the stack.
57 * This routine assumes that all the privileged stacks are in our
58 * data space.
59 */
60static inline int get_stack_long(struct task_struct *task, int offset)
61{
62 unsigned char *stack;
63
64 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
65 stack += offset;
66 return (*((int *)stack));
67}
68
69/*
70 * This routine will put a word on the processes privileged stack.
71 * the offset is bytes into the pt_regs structure on the stack.
72 * This routine assumes that all the privileged stacks are in our
73 * data space.
74 */
75static inline int put_stack_long(struct task_struct *task, int offset,
76 unsigned long data)
77{
78 unsigned char * stack;
79
80 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
81 stack += offset;
82 *(unsigned long *) stack = data;
83 return 0;
84}
85
86static int putreg(struct task_struct *child,
87 unsigned long regno, unsigned long value)
88{
89 switch (regno >> 2) {
90 case GS:
91 if (value && (value & 3) != 3)
92 return -EIO;
93 child->thread.gs = value;
94 return 0;
95 case DS:
96 case ES:
97 case FS:
98 if (value && (value & 3) != 3)
99 return -EIO;
100 value &= 0xffff;
101 break;
102 case SS:
103 case CS:
104 if ((value & 3) != 3)
105 return -EIO;
106 value &= 0xffff;
107 break;
108 case EFL:
109 value &= FLAG_MASK;
110 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
111 break;
112 }
113 if (regno > FS*4)
114 regno -= 1*4;
115 put_stack_long(child, regno, value);
116 return 0;
117}
118
119static unsigned long getreg(struct task_struct *child,
120 unsigned long regno)
121{
122 unsigned long retval = ~0UL;
123
124 switch (regno >> 2) {
125 case GS:
126 retval = child->thread.gs;
127 break;
128 case DS:
129 case ES:
130 case FS:
131 case SS:
132 case CS:
133 retval = 0xffff;
134 /* fall through */
135 default:
136 if (regno > FS*4)
137 regno -= 1*4;
138 retval &= get_stack_long(child, regno);
139 }
140 return retval;
141}
142
143#define LDT_SEGMENT 4
144
145static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
146{
147 unsigned long addr, seg;
148
149 addr = regs->eip;
150 seg = regs->xcs & 0xffff;
151 if (regs->eflags & VM_MASK) {
152 addr = (addr & 0xffff) + (seg << 4);
153 return addr;
154 }
155
156 /*
157 * We'll assume that the code segments in the GDT
158 * are all zero-based. That is largely true: the
159 * TLS segments are used for data, and the PNPBIOS
160 * and APM bios ones we just ignore here.
161 */
162 if (seg & LDT_SEGMENT) {
163 u32 *desc;
164 unsigned long base;
165
166 seg &= ~7UL;
167
168 mutex_lock(&child->mm->context.lock);
169 if (unlikely((seg >> 3) >= child->mm->context.size))
170 addr = -1L; /* bogus selector, access would fault */
171 else {
172 desc = child->mm->context.ldt + seg;
173 base = ((desc[0] >> 16) |
174 ((desc[1] & 0xff) << 16) |
175 (desc[1] & 0xff000000));
176
177 /* 16-bit code segment? */
178 if (!((desc[1] >> 22) & 1))
179 addr &= 0xffff;
180 addr += base;
181 }
182 mutex_unlock(&child->mm->context.lock);
183 }
184 return addr;
185}
186
187static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
188{
189 int i, copied;
190 unsigned char opcode[15];
191 unsigned long addr = convert_eip_to_linear(child, regs);
192
193 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
194 for (i = 0; i < copied; i++) {
195 switch (opcode[i]) {
196 /* popf and iret */
197 case 0x9d: case 0xcf:
198 return 1;
199 /* opcode and address size prefixes */
200 case 0x66: case 0x67:
201 continue;
202 /* irrelevant prefixes (segment overrides and repeats) */
203 case 0x26: case 0x2e:
204 case 0x36: case 0x3e:
205 case 0x64: case 0x65:
206 case 0xf0: case 0xf2: case 0xf3:
207 continue;
208
209 /*
210 * pushf: NOTE! We should probably not let
211 * the user see the TF bit being set. But
212 * it's more pain than it's worth to avoid
213 * it, and a debugger could emulate this
214 * all in user space if it _really_ cares.
215 */
216 case 0x9c:
217 default:
218 return 0;
219 }
220 }
221 return 0;
222}
223
224static void set_singlestep(struct task_struct *child)
225{
226 struct pt_regs *regs = get_child_regs(child);
227
228 /*
229 * Always set TIF_SINGLESTEP - this guarantees that
230 * we single-step system calls etc.. This will also
231 * cause us to set TF when returning to user mode.
232 */
233 set_tsk_thread_flag(child, TIF_SINGLESTEP);
234
235 /*
236 * If TF was already set, don't do anything else
237 */
238 if (regs->eflags & TRAP_FLAG)
239 return;
240
241 /* Set TF on the kernel stack.. */
242 regs->eflags |= TRAP_FLAG;
243
244 /*
245 * ..but if TF is changed by the instruction we will trace,
246 * don't mark it as being "us" that set it, so that we
247 * won't clear it by hand later.
248 */
249 if (is_setting_trap_flag(child, regs))
250 return;
251
252 child->ptrace |= PT_DTRACE;
253}
254
255static void clear_singlestep(struct task_struct *child)
256{
257 /* Always clear TIF_SINGLESTEP... */
258 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
259
260 /* But touch TF only if it was set by us.. */
261 if (child->ptrace & PT_DTRACE) {
262 struct pt_regs *regs = get_child_regs(child);
263 regs->eflags &= ~TRAP_FLAG;
264 child->ptrace &= ~PT_DTRACE;
265 }
266}
267
268/*
269 * Called by kernel/ptrace.c when detaching..
270 *
271 * Make sure the single step bit is not set.
272 */
273void ptrace_disable(struct task_struct *child)
274{
275 clear_singlestep(child);
276 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
277}
278
279/*
280 * Perform get_thread_area on behalf of the traced child.
281 */
282static int
283ptrace_get_thread_area(struct task_struct *child,
284 int idx, struct user_desc __user *user_desc)
285{
286 struct user_desc info;
287 struct desc_struct *desc;
288
289/*
290 * Get the current Thread-Local Storage area:
291 */
292
293#define GET_BASE(desc) ( \
294 (((desc)->a >> 16) & 0x0000ffff) | \
295 (((desc)->b << 16) & 0x00ff0000) | \
296 ( (desc)->b & 0xff000000) )
297
298#define GET_LIMIT(desc) ( \
299 ((desc)->a & 0x0ffff) | \
300 ((desc)->b & 0xf0000) )
301
302#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
303#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
304#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
305#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
306#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
307#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
308
309 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
310 return -EINVAL;
311
312 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
313
314 info.entry_number = idx;
315 info.base_addr = GET_BASE(desc);
316 info.limit = GET_LIMIT(desc);
317 info.seg_32bit = GET_32BIT(desc);
318 info.contents = GET_CONTENTS(desc);
319 info.read_exec_only = !GET_WRITABLE(desc);
320 info.limit_in_pages = GET_LIMIT_PAGES(desc);
321 info.seg_not_present = !GET_PRESENT(desc);
322 info.useable = GET_USEABLE(desc);
323
324 if (copy_to_user(user_desc, &info, sizeof(info)))
325 return -EFAULT;
326
327 return 0;
328}
329
330/*
331 * Perform set_thread_area on behalf of the traced child.
332 */
333static int
334ptrace_set_thread_area(struct task_struct *child,
335 int idx, struct user_desc __user *user_desc)
336{
337 struct user_desc info;
338 struct desc_struct *desc;
339
340 if (copy_from_user(&info, user_desc, sizeof(info)))
341 return -EFAULT;
342
343 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
344 return -EINVAL;
345
346 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
347 if (LDT_empty(&info)) {
348 desc->a = 0;
349 desc->b = 0;
350 } else {
351 desc->a = LDT_entry_a(&info);
352 desc->b = LDT_entry_b(&info);
353 }
354
355 return 0;
356}
357
358long arch_ptrace(struct task_struct *child, long request, long addr, long data)
359{
360 struct user * dummy = NULL;
361 int i, ret;
362 unsigned long __user *datap = (unsigned long __user *)data;
363
364 switch (request) {
365 /* when I and D space are separate, these will need to be fixed. */
366 case PTRACE_PEEKTEXT: /* read word at location addr. */
367 case PTRACE_PEEKDATA:
368 ret = generic_ptrace_peekdata(child, addr, data);
369 break;
370
371 /* read the word at location addr in the USER area. */
372 case PTRACE_PEEKUSR: {
373 unsigned long tmp;
374
375 ret = -EIO;
376 if ((addr & 3) || addr < 0 ||
377 addr > sizeof(struct user) - 3)
378 break;
379
380 tmp = 0; /* Default return condition */
381 if(addr < FRAME_SIZE*sizeof(long))
382 tmp = getreg(child, addr);
383 if(addr >= (long) &dummy->u_debugreg[0] &&
384 addr <= (long) &dummy->u_debugreg[7]){
385 addr -= (long) &dummy->u_debugreg[0];
386 addr = addr >> 2;
387 tmp = child->thread.debugreg[addr];
388 }
389 ret = put_user(tmp, datap);
390 break;
391 }
392
393 /* when I and D space are separate, this will have to be fixed. */
394 case PTRACE_POKETEXT: /* write the word at location addr. */
395 case PTRACE_POKEDATA:
396 ret = generic_ptrace_pokedata(child, addr, data);
397 break;
398
399 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
400 ret = -EIO;
401 if ((addr & 3) || addr < 0 ||
402 addr > sizeof(struct user) - 3)
403 break;
404
405 if (addr < FRAME_SIZE*sizeof(long)) {
406 ret = putreg(child, addr, data);
407 break;
408 }
409 /* We need to be very careful here. We implicitly
410 want to modify a portion of the task_struct, and we
411 have to be selective about what portions we allow someone
412 to modify. */
413
414 ret = -EIO;
415 if(addr >= (long) &dummy->u_debugreg[0] &&
416 addr <= (long) &dummy->u_debugreg[7]){
417
418 if(addr == (long) &dummy->u_debugreg[4]) break;
419 if(addr == (long) &dummy->u_debugreg[5]) break;
420 if(addr < (long) &dummy->u_debugreg[4] &&
421 ((unsigned long) data) >= TASK_SIZE-3) break;
422
423 /* Sanity-check data. Take one half-byte at once with
424 * check = (val >> (16 + 4*i)) & 0xf. It contains the
425 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
426 * 2 and 3 are LENi. Given a list of invalid values,
427 * we do mask |= 1 << invalid_value, so that
428 * (mask >> check) & 1 is a correct test for invalid
429 * values.
430 *
431 * R/Wi contains the type of the breakpoint /
432 * watchpoint, LENi contains the length of the watched
433 * data in the watchpoint case.
434 *
435 * The invalid values are:
436 * - LENi == 0x10 (undefined), so mask |= 0x0f00.
437 * - R/Wi == 0x10 (break on I/O reads or writes), so
438 * mask |= 0x4444.
439 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
440 * 0x1110.
441 *
442 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
443 *
444 * See the Intel Manual "System Programming Guide",
445 * 15.2.4
446 *
447 * Note that LENi == 0x10 is defined on x86_64 in long
448 * mode (i.e. even for 32-bit userspace software, but
449			 * 64-bit kernel), so the x86_64 mask value is 0x5554.
450 * See the AMD manual no. 24593 (AMD64 System
451 * Programming)*/
452
453 if(addr == (long) &dummy->u_debugreg[7]) {
454 data &= ~DR_CONTROL_RESERVED;
455 for(i=0; i<4; i++)
456 if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
457 goto out_tsk;
458 if (data)
459 set_tsk_thread_flag(child, TIF_DEBUG);
460 else
461 clear_tsk_thread_flag(child, TIF_DEBUG);
462 }
463 addr -= (long) &dummy->u_debugreg;
464 addr = addr >> 2;
465 child->thread.debugreg[addr] = data;
466 ret = 0;
467 }
468 break;
469
470 case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
471 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
472 case PTRACE_CONT: /* restart after signal. */
473 ret = -EIO;
474 if (!valid_signal(data))
475 break;
476 if (request == PTRACE_SYSEMU) {
477 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
478 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
479 } else if (request == PTRACE_SYSCALL) {
480 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
481 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
482 } else {
483 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
484 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
485 }
486 child->exit_code = data;
487 /* make sure the single step bit is not set. */
488 clear_singlestep(child);
489 wake_up_process(child);
490 ret = 0;
491 break;
492
493/*
494 * make the child exit. Best I can do is send it a sigkill.
495 * perhaps it should be put in the status that it wants to
496 * exit.
497 */
498 case PTRACE_KILL:
499 ret = 0;
500 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
501 break;
502 child->exit_code = SIGKILL;
503 /* make sure the single step bit is not set. */
504 clear_singlestep(child);
505 wake_up_process(child);
506 break;
507
508 case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
509 case PTRACE_SINGLESTEP: /* set the trap flag. */
510 ret = -EIO;
511 if (!valid_signal(data))
512 break;
513
514 if (request == PTRACE_SYSEMU_SINGLESTEP)
515 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
516 else
517 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
518
519 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
520 set_singlestep(child);
521 child->exit_code = data;
522 /* give it a chance to run. */
523 wake_up_process(child);
524 ret = 0;
525 break;
526
527 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
528 if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
529 ret = -EIO;
530 break;
531 }
532 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
533 __put_user(getreg(child, i), datap);
534 datap++;
535 }
536 ret = 0;
537 break;
538 }
539
540 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
541 unsigned long tmp;
542 if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
543 ret = -EIO;
544 break;
545 }
546 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
547 __get_user(tmp, datap);
548 putreg(child, i, tmp);
549 datap++;
550 }
551 ret = 0;
552 break;
553 }
554
555 case PTRACE_GETFPREGS: { /* Get the child FPU state. */
556 if (!access_ok(VERIFY_WRITE, datap,
557 sizeof(struct user_i387_struct))) {
558 ret = -EIO;
559 break;
560 }
561 ret = 0;
562 if (!tsk_used_math(child))
563 init_fpu(child);
564 get_fpregs((struct user_i387_struct __user *)data, child);
565 break;
566 }
567
568 case PTRACE_SETFPREGS: { /* Set the child FPU state. */
569 if (!access_ok(VERIFY_READ, datap,
570 sizeof(struct user_i387_struct))) {
571 ret = -EIO;
572 break;
573 }
574 set_stopped_child_used_math(child);
575 set_fpregs(child, (struct user_i387_struct __user *)data);
576 ret = 0;
577 break;
578 }
579
580 case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
581 if (!access_ok(VERIFY_WRITE, datap,
582 sizeof(struct user_fxsr_struct))) {
583 ret = -EIO;
584 break;
585 }
586 if (!tsk_used_math(child))
587 init_fpu(child);
588 ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
589 break;
590 }
591
592 case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
593 if (!access_ok(VERIFY_READ, datap,
594 sizeof(struct user_fxsr_struct))) {
595 ret = -EIO;
596 break;
597 }
598 set_stopped_child_used_math(child);
599 ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
600 break;
601 }
602
603 case PTRACE_GET_THREAD_AREA:
604 ret = ptrace_get_thread_area(child, addr,
605 (struct user_desc __user *) data);
606 break;
607
608 case PTRACE_SET_THREAD_AREA:
609 ret = ptrace_set_thread_area(child, addr,
610 (struct user_desc __user *) data);
611 break;
612
613 default:
614 ret = ptrace_request(child, request, addr, data);
615 break;
616 }
617 out_tsk:
618 return ret;
619}
620
621void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
622{
623 struct siginfo info;
624
625 tsk->thread.trap_no = 1;
626 tsk->thread.error_code = error_code;
627
628 memset(&info, 0, sizeof(info));
629 info.si_signo = SIGTRAP;
630 info.si_code = TRAP_BRKPT;
631
632 /* User-mode eip? */
633 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;
634
635 /* Send us the fake SIGTRAP */
636 force_sig_info(SIGTRAP, &info, tsk);
637}
638
639/* notification of system call entry/exit
640 * - triggered by current->work.syscall_trace
641 */
642__attribute__((regparm(3)))
643int do_syscall_trace(struct pt_regs *regs, int entryexit)
644{
645 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
646 /*
647 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
648 * interception
649 */
650 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
651 int ret = 0;
652
653 /* do the secure computing check first */
654 if (!entryexit)
655 secure_computing(regs->orig_eax);
656
657 if (unlikely(current->audit_context)) {
658 if (entryexit)
659 audit_syscall_exit(AUDITSC_RESULT(regs->eax),
660 regs->eax);
661 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
662 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
663 * not used, entry.S will call us only on syscall exit, not
664 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
665 * calling send_sigtrap() on syscall entry.
666 *
667 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
668		 * is_singlestep is false, despite its name, so we will still do
669 * the correct thing.
670 */
671 else if (is_singlestep)
672 goto out;
673 }
674
675 if (!(current->ptrace & PT_PTRACED))
676 goto out;
677
678 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
679 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
680 * here. We have to check this and return */
681 if (is_sysemu && entryexit)
682 return 0;
683
684 /* Fake a debug trap */
685 if (is_singlestep)
686 send_sigtrap(current, regs, 0);
687
688 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
689 goto out;
690
691 /* the 0x80 provides a way for the tracing parent to distinguish
692 between a syscall stop and SIGTRAP delivery */
693 /* Note that the debugger could change the result of test_thread_flag!*/
694 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
695
696 /*
697 * this isn't the same as continuing with a signal, but it will do
698 * for normal use. strace only continues with a signal if the
699 * stopping signal is not SIGTRAP. -brl
700 */
701 if (current->exit_code) {
702 send_sig(current->exit_code, current, 1);
703 current->exit_code = 0;
704 }
705 ret = is_sysemu;
706out:
707 if (unlikely(current->audit_context) && !entryexit)
708 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
709 regs->ebx, regs->ecx, regs->edx, regs->esi);
710 if (ret == 0)
711 return 0;
712
713 regs->orig_eax = -1; /* force skip of syscall restarting */
714 if (unlikely(current->audit_context))
715 audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
716 return 1;
717}
diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c
deleted file mode 100644
index 607085f3f08..00000000000
--- a/arch/x86/kernel/ptrace_64.c
+++ /dev/null
@@ -1,621 +0,0 @@
1/* By Ross Biro 1/23/92 */
2/*
3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * x86-64 port 2000-2002 Andi Kleen
7 */
8
9#include <linux/kernel.h>
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/errno.h>
14#include <linux/ptrace.h>
15#include <linux/user.h>
16#include <linux/security.h>
17#include <linux/audit.h>
18#include <linux/seccomp.h>
19#include <linux/signal.h>
20
21#include <asm/uaccess.h>
22#include <asm/pgtable.h>
23#include <asm/system.h>
24#include <asm/processor.h>
25#include <asm/i387.h>
26#include <asm/debugreg.h>
27#include <asm/ldt.h>
28#include <asm/desc.h>
29#include <asm/proto.h>
30#include <asm/ia32.h>
31
32/*
33 * does not yet catch signals sent when the child dies.
34 * in exit.c or in signal.c.
35 */
36
37/*
38 * Determines which flags the user has access to [1 = access, 0 = no access].
39 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
40 * Also masks reserved bits (63-22, 15, 5, 3, 1).
41 */
42#define FLAG_MASK 0x54dd5UL
43
44/* sets the trap flag. */
45#define TRAP_FLAG 0x100UL
46
47/*
48 * eflags and offset of eflags on child stack..
49 */
50#define EFLAGS offsetof(struct pt_regs, eflags)
51#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
52
53/*
54 * this routine will get a word off of the processes privileged stack.
55 * the offset is how far from the base addr as stored in the TSS.
56 * this routine assumes that all the privileged stacks are in our
57 * data space.
58 */
59static inline unsigned long get_stack_long(struct task_struct *task, int offset)
60{
61 unsigned char *stack;
62
63 stack = (unsigned char *)task->thread.rsp0;
64 stack += offset;
65 return (*((unsigned long *)stack));
66}
67
68/*
69 * this routine will put a word on the processes privileged stack.
70 * the offset is how far from the base addr as stored in the TSS.
71 * this routine assumes that all the privileged stacks are in our
72 * data space.
73 */
74static inline long put_stack_long(struct task_struct *task, int offset,
75 unsigned long data)
76{
77 unsigned char * stack;
78
79 stack = (unsigned char *) task->thread.rsp0;
80 stack += offset;
81 *(unsigned long *) stack = data;
82 return 0;
83}
84
85#define LDT_SEGMENT 4
86
87unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
88{
89 unsigned long addr, seg;
90
91 addr = regs->rip;
92 seg = regs->cs & 0xffff;
93
94 /*
95 * We'll assume that the code segments in the GDT
96 * are all zero-based. That is largely true: the
97 * TLS segments are used for data, and the PNPBIOS
98 * and APM bios ones we just ignore here.
99 */
100 if (seg & LDT_SEGMENT) {
101 u32 *desc;
102 unsigned long base;
103
104 seg &= ~7UL;
105
106 mutex_lock(&child->mm->context.lock);
107 if (unlikely((seg >> 3) >= child->mm->context.size))
108 addr = -1L; /* bogus selector, access would fault */
109 else {
110 desc = child->mm->context.ldt + seg;
111 base = ((desc[0] >> 16) |
112 ((desc[1] & 0xff) << 16) |
113 (desc[1] & 0xff000000));
114
115 /* 16-bit code segment? */
116 if (!((desc[1] >> 22) & 1))
117 addr &= 0xffff;
118 addr += base;
119 }
120 mutex_unlock(&child->mm->context.lock);
121 }
122
123 return addr;
124}
125
126static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
127{
128 int i, copied;
129 unsigned char opcode[15];
130 unsigned long addr = convert_rip_to_linear(child, regs);
131
132 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
133 for (i = 0; i < copied; i++) {
134 switch (opcode[i]) {
135 /* popf and iret */
136 case 0x9d: case 0xcf:
137 return 1;
138
139 /* CHECKME: 64 65 */
140
141 /* opcode and address size prefixes */
142 case 0x66: case 0x67:
143 continue;
144 /* irrelevant prefixes (segment overrides and repeats) */
145 case 0x26: case 0x2e:
146 case 0x36: case 0x3e:
147 case 0x64: case 0x65:
148 case 0xf2: case 0xf3:
149 continue;
150
151 case 0x40 ... 0x4f:
152 if (regs->cs != __USER_CS)
153 /* 32-bit mode: register increment */
154 return 0;
155 /* 64-bit mode: REX prefix */
156 continue;
157
158 /* CHECKME: f2, f3 */
159
160 /*
161 * pushf: NOTE! We should probably not let
162 * the user see the TF bit being set. But
163 * it's more pain than it's worth to avoid
164 * it, and a debugger could emulate this
165 * all in user space if it _really_ cares.
166 */
167 case 0x9c:
168 default:
169 return 0;
170 }
171 }
172 return 0;
173}
174
175static void set_singlestep(struct task_struct *child)
176{
177 struct pt_regs *regs = task_pt_regs(child);
178
179 /*
180 * Always set TIF_SINGLESTEP - this guarantees that
181 * we single-step system calls etc.. This will also
182 * cause us to set TF when returning to user mode.
183 */
184 set_tsk_thread_flag(child, TIF_SINGLESTEP);
185
186 /*
187 * If TF was already set, don't do anything else
188 */
189 if (regs->eflags & TRAP_FLAG)
190 return;
191
192 /* Set TF on the kernel stack.. */
193 regs->eflags |= TRAP_FLAG;
194
195 /*
196 * ..but if TF is changed by the instruction we will trace,
197 * don't mark it as being "us" that set it, so that we
198 * won't clear it by hand later.
199 */
200 if (is_setting_trap_flag(child, regs))
201 return;
202
203 child->ptrace |= PT_DTRACE;
204}
205
206static void clear_singlestep(struct task_struct *child)
207{
208 /* Always clear TIF_SINGLESTEP... */
209 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
210
211 /* But touch TF only if it was set by us.. */
212 if (child->ptrace & PT_DTRACE) {
213 struct pt_regs *regs = task_pt_regs(child);
214 regs->eflags &= ~TRAP_FLAG;
215 child->ptrace &= ~PT_DTRACE;
216 }
217}
218
219/*
220 * Called by kernel/ptrace.c when detaching..
221 *
222 * Make sure the single step bit is not set.
223 */
224void ptrace_disable(struct task_struct *child)
225{
226 clear_singlestep(child);
227}
228
229static int putreg(struct task_struct *child,
230 unsigned long regno, unsigned long value)
231{
232 unsigned long tmp;
233
234 switch (regno) {
235 case offsetof(struct user_regs_struct,fs):
236 if (value && (value & 3) != 3)
237 return -EIO;
238 child->thread.fsindex = value & 0xffff;
239 return 0;
240 case offsetof(struct user_regs_struct,gs):
241 if (value && (value & 3) != 3)
242 return -EIO;
243 child->thread.gsindex = value & 0xffff;
244 return 0;
245 case offsetof(struct user_regs_struct,ds):
246 if (value && (value & 3) != 3)
247 return -EIO;
248 child->thread.ds = value & 0xffff;
249 return 0;
250 case offsetof(struct user_regs_struct,es):
251 if (value && (value & 3) != 3)
252 return -EIO;
253 child->thread.es = value & 0xffff;
254 return 0;
255 case offsetof(struct user_regs_struct,ss):
256 if ((value & 3) != 3)
257 return -EIO;
258 value &= 0xffff;
259 return 0;
260 case offsetof(struct user_regs_struct,fs_base):
261 if (value >= TASK_SIZE_OF(child))
262 return -EIO;
263 child->thread.fs = value;
264 return 0;
265 case offsetof(struct user_regs_struct,gs_base):
266 if (value >= TASK_SIZE_OF(child))
267 return -EIO;
268 child->thread.gs = value;
269 return 0;
270 case offsetof(struct user_regs_struct, eflags):
271 value &= FLAG_MASK;
272 tmp = get_stack_long(child, EFL_OFFSET);
273 tmp &= ~FLAG_MASK;
274 value |= tmp;
275 break;
276 case offsetof(struct user_regs_struct,cs):
277 if ((value & 3) != 3)
278 return -EIO;
279 value &= 0xffff;
280 break;
281 }
282 put_stack_long(child, regno - sizeof(struct pt_regs), value);
283 return 0;
284}
285
286static unsigned long getreg(struct task_struct *child, unsigned long regno)
287{
288 unsigned long val;
289 switch (regno) {
290 case offsetof(struct user_regs_struct, fs):
291 return child->thread.fsindex;
292 case offsetof(struct user_regs_struct, gs):
293 return child->thread.gsindex;
294 case offsetof(struct user_regs_struct, ds):
295 return child->thread.ds;
296 case offsetof(struct user_regs_struct, es):
297 return child->thread.es;
298 case offsetof(struct user_regs_struct, fs_base):
299 return child->thread.fs;
300 case offsetof(struct user_regs_struct, gs_base):
301 return child->thread.gs;
302 default:
303 regno = regno - sizeof(struct pt_regs);
304 val = get_stack_long(child, regno);
305 if (test_tsk_thread_flag(child, TIF_IA32))
306 val &= 0xffffffff;
307 return val;
308 }
309
310}
311
312long arch_ptrace(struct task_struct *child, long request, long addr, long data)
313{
314 long i, ret;
315 unsigned ui;
316
317 switch (request) {
318 /* when I and D space are separate, these will need to be fixed. */
319 case PTRACE_PEEKTEXT: /* read word at location addr. */
320 case PTRACE_PEEKDATA:
321 ret = generic_ptrace_peekdata(child, addr, data);
322 break;
323
324 /* read the word at location addr in the USER area. */
325 case PTRACE_PEEKUSR: {
326 unsigned long tmp;
327
328 ret = -EIO;
329 if ((addr & 7) ||
330 addr > sizeof(struct user) - 7)
331 break;
332
333 switch (addr) {
334 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
335 tmp = getreg(child, addr);
336 break;
337 case offsetof(struct user, u_debugreg[0]):
338 tmp = child->thread.debugreg0;
339 break;
340 case offsetof(struct user, u_debugreg[1]):
341 tmp = child->thread.debugreg1;
342 break;
343 case offsetof(struct user, u_debugreg[2]):
344 tmp = child->thread.debugreg2;
345 break;
346 case offsetof(struct user, u_debugreg[3]):
347 tmp = child->thread.debugreg3;
348 break;
349 case offsetof(struct user, u_debugreg[6]):
350 tmp = child->thread.debugreg6;
351 break;
352 case offsetof(struct user, u_debugreg[7]):
353 tmp = child->thread.debugreg7;
354 break;
355 default:
356 tmp = 0;
357 break;
358 }
359 ret = put_user(tmp,(unsigned long __user *) data);
360 break;
361 }
362
363 /* when I and D space are separate, this will have to be fixed. */
364 case PTRACE_POKETEXT: /* write the word at location addr. */
365 case PTRACE_POKEDATA:
366 ret = generic_ptrace_pokedata(child, addr, data);
367 break;
368
369 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
370 {
371 int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
372 ret = -EIO;
373 if ((addr & 7) ||
374 addr > sizeof(struct user) - 7)
375 break;
376
377 switch (addr) {
378 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
379 ret = putreg(child, addr, data);
380 break;
381 /* Disallow setting a breakpoint in the vsyscall area */
382 case offsetof(struct user, u_debugreg[0]):
383 if (data >= TASK_SIZE_OF(child) - dsize) break;
384 child->thread.debugreg0 = data;
385 ret = 0;
386 break;
387 case offsetof(struct user, u_debugreg[1]):
388 if (data >= TASK_SIZE_OF(child) - dsize) break;
389 child->thread.debugreg1 = data;
390 ret = 0;
391 break;
392 case offsetof(struct user, u_debugreg[2]):
393 if (data >= TASK_SIZE_OF(child) - dsize) break;
394 child->thread.debugreg2 = data;
395 ret = 0;
396 break;
397 case offsetof(struct user, u_debugreg[3]):
398 if (data >= TASK_SIZE_OF(child) - dsize) break;
399 child->thread.debugreg3 = data;
400 ret = 0;
401 break;
402 case offsetof(struct user, u_debugreg[6]):
403 if (data >> 32)
404 break;
405 child->thread.debugreg6 = data;
406 ret = 0;
407 break;
408 case offsetof(struct user, u_debugreg[7]):
409 /* See arch/i386/kernel/ptrace.c for an explanation of
410 * this awkward check. */
411 data &= ~DR_CONTROL_RESERVED;
412 for(i=0; i<4; i++)
413 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
414 break;
415 if (i == 4) {
416 child->thread.debugreg7 = data;
417 if (data)
418 set_tsk_thread_flag(child, TIF_DEBUG);
419 else
420 clear_tsk_thread_flag(child, TIF_DEBUG);
421 ret = 0;
422 }
423 break;
424 }
425 break;
426 }
427 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
428 case PTRACE_CONT: /* restart after signal. */
429
430 ret = -EIO;
431 if (!valid_signal(data))
432 break;
433 if (request == PTRACE_SYSCALL)
434 set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
435 else
436 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
437 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
438 child->exit_code = data;
439 /* make sure the single step bit is not set. */
440 clear_singlestep(child);
441 wake_up_process(child);
442 ret = 0;
443 break;
444
445#ifdef CONFIG_IA32_EMULATION
446 /* This only makes sense with 32-bit programs. Allow a
447 64-bit debugger to fully examine them too. Don't use
448 it against 64-bit processes; use
449 PTRACE_ARCH_PRCTL instead. */
450 case PTRACE_SET_THREAD_AREA: {
451 struct user_desc __user *p;
452 int old;
453 p = (struct user_desc __user *)data;
454 get_user(old, &p->entry_number);
455 put_user(addr, &p->entry_number);
456 ret = do_set_thread_area(&child->thread, p);
457 put_user(old, &p->entry_number);
458 break;
459 case PTRACE_GET_THREAD_AREA:
460 p = (struct user_desc __user *)data;
461 get_user(old, &p->entry_number);
462 put_user(addr, &p->entry_number);
463 ret = do_get_thread_area(&child->thread, p);
464 put_user(old, &p->entry_number);
465 break;
466 }
467#endif
468 /* Normal 64-bit interface to access TLS data.
469 Works just like arch_prctl, except that the arguments
470 are reversed. */
471 case PTRACE_ARCH_PRCTL:
472 ret = do_arch_prctl(child, data, addr);
473 break;
474
475/*
476 * Make the child exit. The best we can do is send it a SIGKILL;
477 * perhaps it should be recorded in the status that it wants to
478 * exit.
479 */
480 case PTRACE_KILL:
481 ret = 0;
482 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
483 break;
484 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
485 child->exit_code = SIGKILL;
486 /* make sure the single step bit is not set. */
487 clear_singlestep(child);
488 wake_up_process(child);
489 break;
490
491 case PTRACE_SINGLESTEP: /* set the trap flag. */
492 ret = -EIO;
493 if (!valid_signal(data))
494 break;
495 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
496 set_singlestep(child);
497 child->exit_code = data;
498 /* give it a chance to run. */
499 wake_up_process(child);
500 ret = 0;
501 break;
502
503 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
504 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
505 sizeof(struct user_regs_struct))) {
506 ret = -EIO;
507 break;
508 }
509 ret = 0;
510 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
511 ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
512 data += sizeof(long);
513 }
514 break;
515 }
516
517 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
518 unsigned long tmp;
519 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
520 sizeof(struct user_regs_struct))) {
521 ret = -EIO;
522 break;
523 }
524 ret = 0;
525 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
526 ret = __get_user(tmp, (unsigned long __user *) data);
527 if (ret)
528 break;
529 ret = putreg(child, ui, tmp);
530 if (ret)
531 break;
532 data += sizeof(long);
533 }
534 break;
535 }
536
537 case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
538 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
539 sizeof(struct user_i387_struct))) {
540 ret = -EIO;
541 break;
542 }
543 ret = get_fpregs((struct user_i387_struct __user *)data, child);
544 break;
545 }
546
547 case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
548 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
549 sizeof(struct user_i387_struct))) {
550 ret = -EIO;
551 break;
552 }
553 set_stopped_child_used_math(child);
554 ret = set_fpregs(child, (struct user_i387_struct __user *)data);
555 break;
556 }
557
558 default:
559 ret = ptrace_request(child, request, addr, data);
560 break;
561 }
562 return ret;
563}
564
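For context, a hypothetical user-space tracer would reach the PTRACE_PEEKUSR branch above roughly like this (sketch only; error handling via errno is omitted, and glibc spells the request PTRACE_PEEKUSER):

#include <stdio.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/user.h>

/* Read debug status register 6 of an already-stopped tracee. */
static long read_dr6(pid_t child)
{
	long dr6 = ptrace(PTRACE_PEEKUSER, child,
			  (void *)offsetof(struct user, u_debugreg[6]), NULL);
	printf("dr6 = %#lx\n", dr6);
	return dr6;
}
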
565static void syscall_trace(struct pt_regs *regs)
566{
567
568#if 0
569 printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
570 current->comm,
571 regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
572 current_thread_info()->flags, current->ptrace);
573#endif
574
575 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
576 ? 0x80 : 0));
577 /*
578 * this isn't the same as continuing with a signal, but it will do
579 * for normal use. strace only continues with a signal if the
580 * stopping signal is not SIGTRAP. -brl
581 */
582 if (current->exit_code) {
583 send_sig(current->exit_code, current, 1);
584 current->exit_code = 0;
585 }
586}
587
588asmlinkage void syscall_trace_enter(struct pt_regs *regs)
589{
590 /* do the secure computing check first */
591 secure_computing(regs->orig_rax);
592
593 if (test_thread_flag(TIF_SYSCALL_TRACE)
594 && (current->ptrace & PT_PTRACED))
595 syscall_trace(regs);
596
597 if (unlikely(current->audit_context)) {
598 if (test_thread_flag(TIF_IA32)) {
599 audit_syscall_entry(AUDIT_ARCH_I386,
600 regs->orig_rax,
601 regs->rbx, regs->rcx,
602 regs->rdx, regs->rsi);
603 } else {
604 audit_syscall_entry(AUDIT_ARCH_X86_64,
605 regs->orig_rax,
606 regs->rdi, regs->rsi,
607 regs->rdx, regs->r10);
608 }
609 }
610}
611
612asmlinkage void syscall_trace_leave(struct pt_regs *regs)
613{
614 if (unlikely(current->audit_context))
615 audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
616
617 if ((test_thread_flag(TIF_SYSCALL_TRACE)
618 || test_thread_flag(TIF_SINGLESTEP))
619 && (current->ptrace & PT_PTRACED))
620 syscall_trace(regs);
621}
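A matching user-space sketch of the syscall-stop notification above (hypothetical tracer code, not part of the kernel): with PTRACE_O_TRACESYSGOOD set, syscall stops report SIGTRAP | 0x80 and can be told apart from ordinary SIGTRAP stops.

#include <signal.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

static void trace_syscalls(pid_t child)
{
	int status;

	waitpid(child, &status, 0);		/* initial attach/exec stop */
	ptrace(PTRACE_SETOPTIONS, child, 0, PTRACE_O_TRACESYSGOOD);

	for (;;) {
		ptrace(PTRACE_SYSCALL, child, 0, 0);	/* run to next syscall boundary */
		if (waitpid(child, &status, 0) < 0 || WIFEXITED(status))
			break;
		if (WSTOPSIG(status) == (SIGTRAP | 0x80)) {
			/* syscall entry or exit: inspect registers here */
		}
	}
}
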
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index fab30e13483..6ba33ca8715 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -30,8 +30,8 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); 30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
31 31
32 if (!(word & (1 << 13))) { 32 if (!(word & (1 << 13))) {
33 printk(KERN_INFO "Intel E7520/7320/7525 detected. " 33 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
34 "Disabling irq balancing and affinity\n"); 34 "disabling irq balancing and affinity\n");
35#ifdef CONFIG_IRQBALANCE 35#ifdef CONFIG_IRQBALANCE
36 irqbalance_disable(""); 36 irqbalance_disable("");
37#endif 37#endif
@@ -104,14 +104,16 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
104 pci_read_config_dword(dev, 0xF0, &rcba); 104 pci_read_config_dword(dev, 0xF0, &rcba);
105 rcba &= 0xFFFFC000; 105 rcba &= 0xFFFFC000;
106 if (rcba == 0) { 106 if (rcba == 0) {
107 printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n"); 107 dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
108 "cannot force enable HPET\n");
108 return; 109 return;
109 } 110 }
110 111
111 /* use bits 31:14, 16 kB aligned */ 112 /* use bits 31:14, 16 kB aligned */
112 rcba_base = ioremap_nocache(rcba, 0x4000); 113 rcba_base = ioremap_nocache(rcba, 0x4000);
113 if (rcba_base == NULL) { 114 if (rcba_base == NULL) {
114 printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n"); 115 dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
116 "cannot force enable HPET\n");
115 return; 117 return;
116 } 118 }
117 119
@@ -122,8 +124,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
122 /* HPET is enabled in HPTC. Just not reported by BIOS */ 124 /* HPET is enabled in HPTC. Just not reported by BIOS */
123 val = val & 0x3; 125 val = val & 0x3;
124 force_hpet_address = 0xFED00000 | (val << 12); 126 force_hpet_address = 0xFED00000 | (val << 12);
125 printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", 127 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
126 force_hpet_address); 128 "0x%lx\n", force_hpet_address);
127 iounmap(rcba_base); 129 iounmap(rcba_base);
128 return; 130 return;
129 } 131 }
@@ -142,11 +144,12 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
142 if (err) { 144 if (err) {
143 force_hpet_address = 0; 145 force_hpet_address = 0;
144 iounmap(rcba_base); 146 iounmap(rcba_base);
145 printk(KERN_DEBUG "Failed to force enable HPET\n"); 147 dev_printk(KERN_DEBUG, &dev->dev,
148 "Failed to force enable HPET\n");
146 } else { 149 } else {
147 force_hpet_resume_type = ICH_FORCE_HPET_RESUME; 150 force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
148 printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", 151 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
149 force_hpet_address); 152 "0x%lx\n", force_hpet_address);
150 } 153 }
151} 154}
152 155
@@ -162,6 +165,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
162 ich_force_enable_hpet); 165 ich_force_enable_hpet);
163DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, 166DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
164 ich_force_enable_hpet); 167 ich_force_enable_hpet);
168DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
169 ich_force_enable_hpet);
165 170
166 171
167static struct pci_dev *cached_dev; 172static struct pci_dev *cached_dev;
@@ -206,8 +211,8 @@ static void old_ich_force_enable_hpet(struct pci_dev *dev)
206 if (val & 0x4) { 211 if (val & 0x4) {
207 val &= 0x3; 212 val &= 0x3;
208 force_hpet_address = 0xFED00000 | (val << 12); 213 force_hpet_address = 0xFED00000 | (val << 12);
209 printk(KERN_DEBUG "HPET at base address 0x%lx\n", 214 dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
210 force_hpet_address); 215 force_hpet_address);
211 return; 216 return;
212 } 217 }
213 218
@@ -227,14 +232,14 @@ static void old_ich_force_enable_hpet(struct pci_dev *dev)
227 /* HPET is enabled in HPTC. Just not reported by BIOS */ 232 /* HPET is enabled in HPTC. Just not reported by BIOS */
228 val &= 0x3; 233 val &= 0x3;
229 force_hpet_address = 0xFED00000 | (val << 12); 234 force_hpet_address = 0xFED00000 | (val << 12);
230 printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", 235 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
231 force_hpet_address); 236 "0x%lx\n", force_hpet_address);
232 cached_dev = dev; 237 cached_dev = dev;
233 force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; 238 force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
234 return; 239 return;
235 } 240 }
236 241
237 printk(KERN_DEBUG "Failed to force enable HPET\n"); 242 dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
238} 243}
239 244
240/* 245/*
@@ -292,8 +297,8 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev)
292 */ 297 */
293 if (val & 0x80) { 298 if (val & 0x80) {
294 force_hpet_address = (val & ~0x3ff); 299 force_hpet_address = (val & ~0x3ff);
295 printk(KERN_DEBUG "HPET at base address 0x%lx\n", 300 dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
296 force_hpet_address); 301 force_hpet_address);
297 return; 302 return;
298 } 303 }
299 304
@@ -307,14 +312,14 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev)
307 pci_read_config_dword(dev, 0x68, &val); 312 pci_read_config_dword(dev, 0x68, &val);
308 if (val & 0x80) { 313 if (val & 0x80) {
309 force_hpet_address = (val & ~0x3ff); 314 force_hpet_address = (val & ~0x3ff);
310 printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", 315 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
311 force_hpet_address); 316 "0x%lx\n", force_hpet_address);
312 cached_dev = dev; 317 cached_dev = dev;
313 force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; 318 force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
314 return; 319 return;
315 } 320 }
316 321
317 printk(KERN_DEBUG "Failed to force enable HPET\n"); 322 dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
318} 323}
319 324
320DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, 325DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
@@ -342,7 +347,7 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
342 pci_read_config_dword(dev, 0x44, &val); 347 pci_read_config_dword(dev, 0x44, &val);
343 force_hpet_address = val & 0xfffffffe; 348 force_hpet_address = val & 0xfffffffe;
344 force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; 349 force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
345 printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", 350 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
346 force_hpet_address); 351 force_hpet_address);
347 cached_dev = dev; 352 cached_dev = dev;
348 return; 353 return;
@@ -375,19 +380,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367,
375void force_hpet_resume(void) 380void force_hpet_resume(void)
376{ 381{
377 switch (force_hpet_resume_type) { 382 switch (force_hpet_resume_type) {
378 case ICH_FORCE_HPET_RESUME: 383 case ICH_FORCE_HPET_RESUME:
379 return ich_force_hpet_resume(); 384 ich_force_hpet_resume();
380 385 return;
381 case OLD_ICH_FORCE_HPET_RESUME: 386 case OLD_ICH_FORCE_HPET_RESUME:
382 return old_ich_force_hpet_resume(); 387 old_ich_force_hpet_resume();
383 388 return;
384 case VT8237_FORCE_HPET_RESUME: 389 case VT8237_FORCE_HPET_RESUME:
385 return vt8237_force_hpet_resume(); 390 vt8237_force_hpet_resume();
386 391 return;
387 case NVIDIA_FORCE_HPET_RESUME: 392 case NVIDIA_FORCE_HPET_RESUME:
388 return nvidia_force_hpet_resume(); 393 nvidia_force_hpet_resume();
389 394 return;
390 default: 395 default:
391 break; 396 break;
392 } 397 }
393} 398}
diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot.c
index bb1a0f889c5..5818dc28167 100644
--- a/arch/x86/kernel/reboot_32.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,64 +1,94 @@
1#include <linux/mm.h>
2#include <linux/module.h> 1#include <linux/module.h>
3#include <linux/delay.h>
4#include <linux/init.h> 2#include <linux/init.h>
5#include <linux/interrupt.h>
6#include <linux/mc146818rtc.h>
7#include <linux/efi.h>
8#include <linux/dmi.h>
9#include <linux/ctype.h>
10#include <linux/pm.h>
11#include <linux/reboot.h> 3#include <linux/reboot.h>
12#include <asm/uaccess.h> 4#include <linux/init.h>
5#include <linux/pm.h>
6#include <linux/efi.h>
7#include <acpi/reboot.h>
8#include <asm/io.h>
13#include <asm/apic.h> 9#include <asm/apic.h>
14#include <asm/hpet.h>
15#include <asm/desc.h> 10#include <asm/desc.h>
16#include "mach_reboot.h" 11#include <asm/hpet.h>
17#include <asm/reboot_fixups.h> 12#include <asm/reboot_fixups.h>
18#include <asm/reboot.h> 13#include <asm/reboot.h>
19 14
15#ifdef CONFIG_X86_32
16# include <linux/dmi.h>
17# include <linux/ctype.h>
18# include <linux/mc146818rtc.h>
19# include <asm/pgtable.h>
20#else
21# include <asm/iommu.h>
22#endif
23
20/* 24/*
21 * Power off function, if any 25 * Power off function, if any
22 */ 26 */
23void (*pm_power_off)(void); 27void (*pm_power_off)(void);
24EXPORT_SYMBOL(pm_power_off); 28EXPORT_SYMBOL(pm_power_off);
25 29
30static long no_idt[3];
26static int reboot_mode; 31static int reboot_mode;
27static int reboot_thru_bios; 32enum reboot_type reboot_type = BOOT_KBD;
33int reboot_force;
28 34
29#ifdef CONFIG_SMP 35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
30static int reboot_cpu = -1; 36static int reboot_cpu = -1;
31#endif 37#endif
38
39/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | a[cpi] | e[fi] [, [w]arm | [c]old]
40 warm Don't set the cold reboot flag
41 cold Set the cold reboot flag
42 bios Reboot by jumping through the BIOS (only for X86_32)
43 smp Reboot by executing reset on BSP or other CPU (only for X86_32)
44 triple Force a triple fault (init)
45 kbd Use the keyboard controller. cold reset (default)
46 acpi Use the RESET_REG in the FADT
47 efi Use efi reset_system runtime service
48 force Avoid anything that could hang.
49 */
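Only the first character of each comma-separated word matters to the parser below, so these two illustrative kernel command lines are equivalent: reboot=acpi,cold,force and reboot=a,c,f. Both select the FADT RESET_REG method, set the cold-reboot flag, and make machine_restart() skip the orderly shutdown.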
32static int __init reboot_setup(char *str) 50static int __init reboot_setup(char *str)
33{ 51{
34 while(1) { 52 for (;;) {
35 switch (*str) { 53 switch (*str) {
36 case 'w': /* "warm" reboot (no memory testing etc) */ 54 case 'w':
37 reboot_mode = 0x1234; 55 reboot_mode = 0x1234;
38 break; 56 break;
39 case 'c': /* "cold" reboot (with memory testing etc) */ 57
40 reboot_mode = 0x0; 58 case 'c':
41 break; 59 reboot_mode = 0;
42 case 'b': /* "bios" reboot by jumping through the BIOS */
43 reboot_thru_bios = 1;
44 break;
45 case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
46 reboot_thru_bios = 0;
47 break; 60 break;
61
62#ifdef CONFIG_X86_32
48#ifdef CONFIG_SMP 63#ifdef CONFIG_SMP
49 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ 64 case 's':
50 if (isdigit(*(str+1))) { 65 if (isdigit(*(str+1))) {
51 reboot_cpu = (int) (*(str+1) - '0'); 66 reboot_cpu = (int) (*(str+1) - '0');
52 if (isdigit(*(str+2))) 67 if (isdigit(*(str+2)))
53 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); 68 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
54 } 69 }
55 /* we will leave sorting out the final value 70 /* we will leave sorting out the final value
56 when we are ready to reboot, since we might not 71 when we are ready to reboot, since we might not
57 have set up boot_cpu_id or smp_num_cpu */ 72 have set up boot_cpu_id or smp_num_cpu */
58 break; 73 break;
74#endif /* CONFIG_SMP */
75
76 case 'b':
59#endif 77#endif
78 case 'a':
79 case 'k':
80 case 't':
81 case 'e':
82 reboot_type = *str;
83 break;
84
85 case 'f':
86 reboot_force = 1;
87 break;
60 } 88 }
61 if((str = strchr(str,',')) != NULL) 89
90 str = strchr(str, ',');
91 if (str)
62 str++; 92 str++;
63 else 93 else
64 break; 94 break;
@@ -68,18 +98,21 @@ static int __init reboot_setup(char *str)
68 98
69__setup("reboot=", reboot_setup); 99__setup("reboot=", reboot_setup);
70 100
101
102#ifdef CONFIG_X86_32
71/* 103/*
72 * Reboot options and system auto-detection code provided by 104 * Reboot options and system auto-detection code provided by
73 * Dell Inc. so their systems "just work". :-) 105 * Dell Inc. so their systems "just work". :-)
74 */ 106 */
75 107
76/* 108/*
77 * Some machines require the "reboot=b" commandline option, this quirk makes that automatic. 109 * Some machines require the "reboot=b" commandline option,
110 * this quirk makes that automatic.
78 */ 111 */
79static int __init set_bios_reboot(const struct dmi_system_id *d) 112static int __init set_bios_reboot(const struct dmi_system_id *d)
80{ 113{
81 if (!reboot_thru_bios) { 114 if (reboot_type != BOOT_BIOS) {
82 reboot_thru_bios = 1; 115 reboot_type = BOOT_BIOS;
83 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); 116 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
84 } 117 }
85 return 0; 118 return 0;
@@ -143,7 +176,6 @@ static int __init reboot_init(void)
143 dmi_check_system(reboot_dmi_table); 176 dmi_check_system(reboot_dmi_table);
144 return 0; 177 return 0;
145} 178}
146
147core_initcall(reboot_init); 179core_initcall(reboot_init);
148 180
149/* The following code and data reboots the machine by switching to real 181/* The following code and data reboots the machine by switching to real
@@ -152,7 +184,6 @@ core_initcall(reboot_init);
152 controller to pulse the CPU reset line, which is more thorough, but 184 controller to pulse the CPU reset line, which is more thorough, but
153 doesn't work with at least one type of 486 motherboard. It is easy 185 doesn't work with at least one type of 486 motherboard. It is easy
154 to stop this code working; hence the copious comments. */ 186 to stop this code working; hence the copious comments. */
155
156static unsigned long long 187static unsigned long long
157real_mode_gdt_entries [3] = 188real_mode_gdt_entries [3] =
158{ 189{
@@ -161,11 +192,9 @@ real_mode_gdt_entries [3] =
161 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ 192 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
162}; 193};
163 194
164static struct Xgt_desc_struct 195static struct desc_ptr
165real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, 196real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
166real_mode_idt = { 0x3ff, 0 }, 197real_mode_idt = { 0x3ff, 0 };
167no_idt = { 0, 0 };
168
169 198
170/* This is 16-bit protected mode code to disable paging and the cache, 199/* This is 16-bit protected mode code to disable paging and the cache,
171 switch to real mode and jump to the BIOS reset code. 200 switch to real mode and jump to the BIOS reset code.
@@ -185,7 +214,6 @@ no_idt = { 0, 0 };
185 214
186 More could be done here to set up the registers as if a CPU reset had 215 More could be done here to set up the registers as if a CPU reset had
187 occurred; hopefully real BIOSs don't assume much. */ 216 occurred; hopefully real BIOSs don't assume much. */
188
189static unsigned char real_mode_switch [] = 217static unsigned char real_mode_switch [] =
190{ 218{
191 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 219 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
@@ -223,7 +251,6 @@ void machine_real_restart(unsigned char *code, int length)
223 `outb_p' is needed instead of just `outb'. Use it to be on the 251 `outb_p' is needed instead of just `outb'. Use it to be on the
224 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) 252 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
225 */ 253 */
226
227 spin_lock(&rtc_lock); 254 spin_lock(&rtc_lock);
228 CMOS_WRITE(0x00, 0x8f); 255 CMOS_WRITE(0x00, 0x8f);
229 spin_unlock(&rtc_lock); 256 spin_unlock(&rtc_lock);
@@ -231,9 +258,8 @@ void machine_real_restart(unsigned char *code, int length)
231 /* Remap the kernel at virtual address zero, as well as offset zero 258 /* Remap the kernel at virtual address zero, as well as offset zero
232 from the kernel segment. This assumes the kernel segment starts at 259 from the kernel segment. This assumes the kernel segment starts at
233 virtual address PAGE_OFFSET. */ 260 virtual address PAGE_OFFSET. */
234 261 memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
235 memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 262 sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
236 sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
237 263
238 /* 264 /*
239 * Use `swapper_pg_dir' as our page directory. 265 * Use `swapper_pg_dir' as our page directory.
@@ -245,7 +271,6 @@ void machine_real_restart(unsigned char *code, int length)
245 boot)". This seems like a fairly standard thing that gets set by 271 boot)". This seems like a fairly standard thing that gets set by
246 REBOOT.COM programs, and the previous reset routine did this 272 REBOOT.COM programs, and the previous reset routine did this
247 too. */ 273 too. */
248
249 *((unsigned short *)0x472) = reboot_mode; 274 *((unsigned short *)0x472) = reboot_mode;
250 275
251 /* For the switch to real mode, copy some code to low memory. It has 276 /* For the switch to real mode, copy some code to low memory. It has
@@ -253,19 +278,16 @@ void machine_real_restart(unsigned char *code, int length)
253 has to have the same physical and virtual address, because it turns 278 has to have the same physical and virtual address, because it turns
254 off paging. Copy it near the end of the first page, out of the way 279 off paging. Copy it near the end of the first page, out of the way
255 of BIOS variables. */ 280 of BIOS variables. */
256 281 memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100),
257 memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
258 real_mode_switch, sizeof (real_mode_switch)); 282 real_mode_switch, sizeof (real_mode_switch));
259 memcpy ((void *) (0x1000 - 100), code, length); 283 memcpy((void *)(0x1000 - 100), code, length);
260 284
261 /* Set up the IDT for real mode. */ 285 /* Set up the IDT for real mode. */
262
263 load_idt(&real_mode_idt); 286 load_idt(&real_mode_idt);
264 287
265 /* Set up a GDT from which we can load segment descriptors for real 288 /* Set up a GDT from which we can load segment descriptors for real
266 mode. The GDT is not used in real mode; it is just needed here to 289 mode. The GDT is not used in real mode; it is just needed here to
267 prepare the descriptors. */ 290 prepare the descriptors. */
268
269 load_gdt(&real_mode_gdt); 291 load_gdt(&real_mode_gdt);
270 292
271 /* Load the data segment registers, and thus the descriptors ready for 293 /* Load the data segment registers, and thus the descriptors ready for
@@ -273,7 +295,6 @@ void machine_real_restart(unsigned char *code, int length)
273 selector value being loaded here. This is so that the segment 295 selector value being loaded here. This is so that the segment
274 registers don't have to be reloaded after switching to real mode: 296 registers don't have to be reloaded after switching to real mode:
275 the values are consistent for real mode operation already. */ 297 the values are consistent for real mode operation already. */
276
277 __asm__ __volatile__ ("movl $0x0010,%%eax\n" 298 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
278 "\tmovl %%eax,%%ds\n" 299 "\tmovl %%eax,%%ds\n"
279 "\tmovl %%eax,%%es\n" 300 "\tmovl %%eax,%%es\n"
@@ -284,130 +305,147 @@ void machine_real_restart(unsigned char *code, int length)
284 /* Jump to the 16-bit code that we copied earlier. It disables paging 305 /* Jump to the 16-bit code that we copied earlier. It disables paging
285 and the cache, switches to real mode, and jumps to the BIOS reset 306 and the cache, switches to real mode, and jumps to the BIOS reset
286 entry point. */ 307 entry point. */
287
288 __asm__ __volatile__ ("ljmp $0x0008,%0" 308 __asm__ __volatile__ ("ljmp $0x0008,%0"
289 : 309 :
290 : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); 310 : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
291} 311}
292#ifdef CONFIG_APM_MODULE 312#ifdef CONFIG_APM_MODULE
293EXPORT_SYMBOL(machine_real_restart); 313EXPORT_SYMBOL(machine_real_restart);
294#endif 314#endif
295 315
296static void native_machine_shutdown(void) 316#endif /* CONFIG_X86_32 */
317
318static inline void kb_wait(void)
319{
320 int i;
321
322 for (i = 0; i < 0x10000; i++) {
323 if ((inb(0x64) & 0x02) == 0)
324 break;
325 udelay(2);
326 }
327}
328
329void machine_emergency_restart(void)
330{
331 int i;
332
333 /* Tell the BIOS if we want cold or warm reboot */
334 *((unsigned short *)__va(0x472)) = reboot_mode;
335
336 for (;;) {
337 /* Could also try the reset bit in the Hammer NB */
338 switch (reboot_type) {
339 case BOOT_KBD:
340 for (i = 0; i < 10; i++) {
341 kb_wait();
342 udelay(50);
343 outb(0xfe, 0x64); /* pulse reset low */
344 udelay(50);
345 }
346
347 case BOOT_TRIPLE:
348 load_idt((const struct desc_ptr *)&no_idt);
349 __asm__ __volatile__("int3");
350
351 reboot_type = BOOT_KBD;
352 break;
353
354#ifdef CONFIG_X86_32
355 case BOOT_BIOS:
356 machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
357
358 reboot_type = BOOT_KBD;
359 break;
360#endif
361
362 case BOOT_ACPI:
363 acpi_reboot();
364 reboot_type = BOOT_KBD;
365 break;
366
367
368 case BOOT_EFI:
369 if (efi_enabled)
370 efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD,
371 EFI_SUCCESS, 0, NULL);
372
373 reboot_type = BOOT_KBD;
374 break;
375 }
376 }
377}
378
379void machine_shutdown(void)
297{ 380{
381 /* Stop the cpus and apics */
298#ifdef CONFIG_SMP 382#ifdef CONFIG_SMP
299 int reboot_cpu_id; 383 int reboot_cpu_id;
300 384
301 /* The boot cpu is always logical cpu 0 */ 385 /* The boot cpu is always logical cpu 0 */
302 reboot_cpu_id = 0; 386 reboot_cpu_id = 0;
303 387
388#ifdef CONFIG_X86_32
304 /* See if there has been given a command line override */ 389 /* See if there has been given a command line override */
305 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && 390 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
306 cpu_isset(reboot_cpu, cpu_online_map)) { 391 cpu_isset(reboot_cpu, cpu_online_map))
307 reboot_cpu_id = reboot_cpu; 392 reboot_cpu_id = reboot_cpu;
308 } 393#endif
309 394
310 /* Make certain the cpu I'm rebooting on is online */ 395 /* Make certain the cpu I'm about to reboot on is online */
311 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { 396 if (!cpu_isset(reboot_cpu_id, cpu_online_map))
312 reboot_cpu_id = smp_processor_id(); 397 reboot_cpu_id = smp_processor_id();
313 }
314 398
315 /* Make certain I only run on the appropriate processor */ 399 /* Make certain I only run on the appropriate processor */
316 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); 400 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
317 401
318 /* O.K. Now that I'm on the appropriate processor, stop 402 /* O.K Now that I'm on the appropriate processor,
319 * all of the others, and disable their local APICs. 403 * stop all of the others.
320 */ 404 */
321
322 smp_send_stop(); 405 smp_send_stop();
323#endif /* CONFIG_SMP */ 406#endif
324 407
325 lapic_shutdown(); 408 lapic_shutdown();
326 409
327#ifdef CONFIG_X86_IO_APIC 410#ifdef CONFIG_X86_IO_APIC
328 disable_IO_APIC(); 411 disable_IO_APIC();
329#endif 412#endif
413
330#ifdef CONFIG_HPET_TIMER 414#ifdef CONFIG_HPET_TIMER
331 hpet_disable(); 415 hpet_disable();
332#endif 416#endif
333}
334 417
335void __attribute__((weak)) mach_reboot_fixups(void) 418#ifdef CONFIG_X86_64
336{ 419 pci_iommu_shutdown();
420#endif
337} 421}
338 422
339static void native_machine_emergency_restart(void) 423void machine_restart(char *__unused)
340{ 424{
341 if (!reboot_thru_bios) { 425 printk("machine restart\n");
342 if (efi_enabled) {
343 efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
344 load_idt(&no_idt);
345 __asm__ __volatile__("int3");
346 }
347 /* rebooting needs to touch the page at absolute addr 0 */
348 *((unsigned short *)__va(0x472)) = reboot_mode;
349 for (;;) {
350 mach_reboot_fixups(); /* for board specific fixups */
351 mach_reboot();
352 /* That didn't work - force a triple fault.. */
353 load_idt(&no_idt);
354 __asm__ __volatile__("int3");
355 }
356 }
357 if (efi_enabled)
358 efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
359 426
360 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 427 if (!reboot_force)
361} 428 machine_shutdown();
362
363static void native_machine_restart(char * __unused)
364{
365 machine_shutdown();
366 machine_emergency_restart(); 429 machine_emergency_restart();
367} 430}
368 431
369static void native_machine_halt(void) 432void machine_halt(void)
370{ 433{
371} 434}
372 435
373static void native_machine_power_off(void) 436void machine_power_off(void)
374{ 437{
375 if (pm_power_off) { 438 if (pm_power_off) {
376 machine_shutdown(); 439 if (!reboot_force)
440 machine_shutdown();
377 pm_power_off(); 441 pm_power_off();
378 } 442 }
379} 443}
380 444
381
382struct machine_ops machine_ops = { 445struct machine_ops machine_ops = {
383 .power_off = native_machine_power_off, 446 .power_off = machine_power_off,
384 .shutdown = native_machine_shutdown, 447 .shutdown = machine_shutdown,
385 .emergency_restart = native_machine_emergency_restart, 448 .emergency_restart = machine_emergency_restart,
386 .restart = native_machine_restart, 449 .restart = machine_restart,
387 .halt = native_machine_halt, 450 .halt = machine_halt
388}; 451};
389
390void machine_power_off(void)
391{
392 machine_ops.power_off();
393}
394
395void machine_shutdown(void)
396{
397 machine_ops.shutdown();
398}
399
400void machine_emergency_restart(void)
401{
402 machine_ops.emergency_restart();
403}
404
405void machine_restart(char *cmd)
406{
407 machine_ops.restart(cmd);
408}
409
410void machine_halt(void)
411{
412 machine_ops.halt();
413}
diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c
deleted file mode 100644
index 53620a92a8f..00000000000
--- a/arch/x86/kernel/reboot_64.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/* Various gunk just to reboot the machine. */
2#include <linux/module.h>
3#include <linux/reboot.h>
4#include <linux/init.h>
5#include <linux/smp.h>
6#include <linux/kernel.h>
7#include <linux/ctype.h>
8#include <linux/string.h>
9#include <linux/pm.h>
10#include <linux/kdebug.h>
11#include <linux/sched.h>
12#include <asm/io.h>
13#include <asm/delay.h>
14#include <asm/desc.h>
15#include <asm/hw_irq.h>
16#include <asm/system.h>
17#include <asm/pgtable.h>
18#include <asm/tlbflush.h>
19#include <asm/apic.h>
20#include <asm/hpet.h>
21#include <asm/gart.h>
22
23/*
24 * Power off function, if any
25 */
26void (*pm_power_off)(void);
27EXPORT_SYMBOL(pm_power_off);
28
29static long no_idt[3];
30static enum {
31 BOOT_TRIPLE = 't',
32 BOOT_KBD = 'k'
33} reboot_type = BOOT_KBD;
34static int reboot_mode = 0;
35int reboot_force;
36
37/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
38 warm Don't set the cold reboot flag
39 cold Set the cold reboot flag
40 triple Force a triple fault (init)
41 kbd Use the keyboard controller. cold reset (default)
42 force Avoid anything that could hang.
43 */
44static int __init reboot_setup(char *str)
45{
46 for (;;) {
47 switch (*str) {
48 case 'w':
49 reboot_mode = 0x1234;
50 break;
51
52 case 'c':
53 reboot_mode = 0;
54 break;
55
56 case 't':
57 case 'b':
58 case 'k':
59 reboot_type = *str;
60 break;
61 case 'f':
62 reboot_force = 1;
63 break;
64 }
65 if((str = strchr(str,',')) != NULL)
66 str++;
67 else
68 break;
69 }
70 return 1;
71}
72
73__setup("reboot=", reboot_setup);
74
75static inline void kb_wait(void)
76{
77 int i;
78
79 for (i=0; i<0x10000; i++)
80 if ((inb_p(0x64) & 0x02) == 0)
81 break;
82}
83
84void machine_shutdown(void)
85{
86 unsigned long flags;
87
88 /* Stop the cpus and apics */
89#ifdef CONFIG_SMP
90 int reboot_cpu_id;
91
92 /* The boot cpu is always logical cpu 0 */
93 reboot_cpu_id = 0;
94
95 /* Make certain the cpu I'm about to reboot on is online */
96 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
97 reboot_cpu_id = smp_processor_id();
98 }
99
100 /* Make certain I only run on the appropriate processor */
101 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
102
103 /* O.K Now that I'm on the appropriate processor,
104 * stop all of the others.
105 */
106 smp_send_stop();
107#endif
108
109 local_irq_save(flags);
110
111#ifndef CONFIG_SMP
112 disable_local_APIC();
113#endif
114
115 disable_IO_APIC();
116
117#ifdef CONFIG_HPET_TIMER
118 hpet_disable();
119#endif
120 local_irq_restore(flags);
121
122 pci_iommu_shutdown();
123}
124
125void machine_emergency_restart(void)
126{
127 int i;
128
129 /* Tell the BIOS if we want cold or warm reboot */
130 *((unsigned short *)__va(0x472)) = reboot_mode;
131
132 for (;;) {
133 /* Could also try the reset bit in the Hammer NB */
134 switch (reboot_type) {
135 case BOOT_KBD:
136 for (i=0; i<10; i++) {
137 kb_wait();
138 udelay(50);
139 outb(0xfe,0x64); /* pulse reset low */
140 udelay(50);
141 }
142
143 case BOOT_TRIPLE:
144 load_idt((const struct desc_ptr *)&no_idt);
145 __asm__ __volatile__("int3");
146
147 reboot_type = BOOT_KBD;
148 break;
149 }
150 }
151}
152
153void machine_restart(char * __unused)
154{
155 printk("machine restart\n");
156
157 if (!reboot_force) {
158 machine_shutdown();
159 }
160 machine_emergency_restart();
161}
162
163void machine_halt(void)
164{
165}
166
167void machine_power_off(void)
168{
169 if (pm_power_off) {
170 if (!reboot_force) {
171 machine_shutdown();
172 }
173 pm_power_off();
174 }
175}
176
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index f452726c0fe..dec0b5ec25c 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -30,6 +30,19 @@ static void cs5536_warm_reset(struct pci_dev *dev)
30 udelay(50); /* shouldn't get here but be safe and spin a while */ 30 udelay(50); /* shouldn't get here but be safe and spin a while */
31} 31}
32 32
33static void rdc321x_reset(struct pci_dev *dev)
34{
35 unsigned i;
36 /* Voluntary reset the watchdog timer */
37 outl(0x80003840, 0xCF8);
38 /* Generate a CPU reset on next tick */
39 i = inl(0xCFC);
40 /* Use the minimum timer resolution */
41 i |= 0x1600;
42 outl(i, 0xCFC);
43 outb(1, 0x92);
44}
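The magic constant used above is a type-1 PCI configuration address; a small sketch (illustrative, not part of the fixup) shows how it is composed:

/* 0x80003840 = enable bit | bus 0 | device 7 | function 0 | register 0x40 */
static unsigned int pci_conf1_addr(unsigned int bus, unsigned int dev,
				   unsigned int fn, unsigned int reg)
{
	return 0x80000000u | (bus << 16) | (dev << 11) | (fn << 8) | (reg & 0xfc);
}
/* pci_conf1_addr(0, 7, 0, 0x40) == 0x80003840 */
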
45
33struct device_fixup { 46struct device_fixup {
34 unsigned int vendor; 47 unsigned int vendor;
35 unsigned int device; 48 unsigned int device;
@@ -40,6 +53,7 @@ static struct device_fixup fixups_table[] = {
40{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, 53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
41{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, 54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
42{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, 55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
56{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
43}; 57};
44 58
45/* 59/*
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
new file mode 100644
index 00000000000..eb9b1a198f5
--- /dev/null
+++ b/arch/x86/kernel/rtc.c
@@ -0,0 +1,204 @@
1/*
2 * RTC related functions
3 */
4#include <linux/acpi.h>
5#include <linux/bcd.h>
6#include <linux/mc146818rtc.h>
7
8#include <asm/time.h>
9#include <asm/vsyscall.h>
10
11#ifdef CONFIG_X86_32
12# define CMOS_YEARS_OFFS 1900
13/*
14 * This is a special lock that is owned by the CPU and holds the index
15 * register we are working with. It is required for NMI access to the
16 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
17 */
18volatile unsigned long cmos_lock = 0;
19EXPORT_SYMBOL(cmos_lock);
20#else
21/*
22 * x86-64 systems have only existed since 2002.
23 * This will work up to Dec 31, 2100
24 */
25# define CMOS_YEARS_OFFS 2000
26#endif
27
28DEFINE_SPINLOCK(rtc_lock);
29EXPORT_SYMBOL(rtc_lock);
30
31/*
32 * In order to set the CMOS clock precisely, set_rtc_mmss has to be
33 * called 500 ms after the second nowtime has started, because when
34 * nowtime is written into the registers of the CMOS clock, it will
35 * jump to the next second precisely 500 ms later. Check the Motorola
36 * MC146818A or Dallas DS12887 data sheet for details.
37 *
38 * BUG: This routine does not handle hour overflow properly; it just
39 * sets the minutes. Usually you'll only notice that after reboot!
40 */
41int mach_set_rtc_mmss(unsigned long nowtime)
42{
43 int retval = 0;
44 int real_seconds, real_minutes, cmos_minutes;
45 unsigned char save_control, save_freq_select;
46
47 /* tell the clock it's being set */
48 save_control = CMOS_READ(RTC_CONTROL);
49 CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
50
51 /* stop and reset prescaler */
52 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
53 CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
54
55 cmos_minutes = CMOS_READ(RTC_MINUTES);
56 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
57 BCD_TO_BIN(cmos_minutes);
58
59 /*
60 * since we're only adjusting minutes and seconds,
61 * don't interfere with hour overflow. This avoids
62 * messing with unknown time zones but requires your
63 * RTC not to be off by more than 15 minutes
64 */
65 real_seconds = nowtime % 60;
66 real_minutes = nowtime / 60;
67 /* correct for half hour time zone */
68 if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
69 real_minutes += 30;
70 real_minutes %= 60;
71
72 if (abs(real_minutes - cmos_minutes) < 30) {
73 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
74 BIN_TO_BCD(real_seconds);
75 BIN_TO_BCD(real_minutes);
76 }
77 CMOS_WRITE(real_seconds,RTC_SECONDS);
78 CMOS_WRITE(real_minutes,RTC_MINUTES);
79 } else {
80 printk(KERN_WARNING
81 "set_rtc_mmss: can't update from %d to %d\n",
82 cmos_minutes, real_minutes);
83 retval = -1;
84 }
85
86 /* The following flags have to be released exactly in this order,
87 * otherwise the DS12887 (popular MC146818A clone with integrated
88 * battery and quartz) will not reset the oscillator and will not
89 * update precisely 500 ms later. You won't find this mentioned in
90 * the Dallas Semiconductor data sheets, but who believes data
91 * sheets anyway ... -- Markus Kuhn
92 */
93 CMOS_WRITE(save_control, RTC_CONTROL);
94 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
95
96 return retval;
97}
98
99unsigned long mach_get_cmos_time(void)
100{
101 unsigned int year, mon, day, hour, min, sec, century = 0;
102
103 /*
104 * If UIP is clear, then we have >= 244 microseconds before
105 * RTC registers will be updated. Spec sheet says that this
106 * is the reliable way to read RTC registers. If UIP is set
107 * then the register access might be invalid.
108 */
109 while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
110 cpu_relax();
111
112 sec = CMOS_READ(RTC_SECONDS);
113 min = CMOS_READ(RTC_MINUTES);
114 hour = CMOS_READ(RTC_HOURS);
115 day = CMOS_READ(RTC_DAY_OF_MONTH);
116 mon = CMOS_READ(RTC_MONTH);
117 year = CMOS_READ(RTC_YEAR);
118
119#if defined(CONFIG_ACPI) && defined(CONFIG_X86_64)
120 /* CHECKME: Is this really 64bit only ??? */
121 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
122 acpi_gbl_FADT.century)
123 century = CMOS_READ(acpi_gbl_FADT.century);
124#endif
125
126 if (RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)) {
127 BCD_TO_BIN(sec);
128 BCD_TO_BIN(min);
129 BCD_TO_BIN(hour);
130 BCD_TO_BIN(day);
131 BCD_TO_BIN(mon);
132 BCD_TO_BIN(year);
133 }
134
135 if (century) {
136 BCD_TO_BIN(century);
137 year += century * 100;
138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
139 } else {
140 year += CMOS_YEARS_OFFS;
141 if (year < 1970)
142 year += 100;
143 }
144
145 return mktime(year, mon, day, hour, min, sec);
146}
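When the RTC is in BCD mode, each register read above is converted by BCD_TO_BIN. A sketch of that conversion (illustrative, not the kernel macro): a minutes register reading 0x47 means 47 decimal.

static unsigned int bcd_to_bin(unsigned int val)
{
	return (val & 0x0f) + (val >> 4) * 10;	/* 0x47 -> 7 + 4*10 = 47 */
}
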
147
148/* Routines for accessing the CMOS RAM/RTC. */
149unsigned char rtc_cmos_read(unsigned char addr)
150{
151 unsigned char val;
152
153 lock_cmos_prefix(addr);
154 outb_p(addr, RTC_PORT(0));
155 val = inb_p(RTC_PORT(1));
156 lock_cmos_suffix(addr);
157 return val;
158}
159EXPORT_SYMBOL(rtc_cmos_read);
160
161void rtc_cmos_write(unsigned char val, unsigned char addr)
162{
163 lock_cmos_prefix(addr);
164 outb_p(addr, RTC_PORT(0));
165 outb_p(val, RTC_PORT(1));
166 lock_cmos_suffix(addr);
167}
168EXPORT_SYMBOL(rtc_cmos_write);
169
170static int set_rtc_mmss(unsigned long nowtime)
171{
172 int retval;
173 unsigned long flags;
174
175 spin_lock_irqsave(&rtc_lock, flags);
176 retval = set_wallclock(nowtime);
177 spin_unlock_irqrestore(&rtc_lock, flags);
178
179 return retval;
180}
181
182/* not static: needed by APM */
183unsigned long read_persistent_clock(void)
184{
185 unsigned long retval, flags;
186
187 spin_lock_irqsave(&rtc_lock, flags);
188 retval = get_wallclock();
189 spin_unlock_irqrestore(&rtc_lock, flags);
190
191 return retval;
192}
193
194int update_persistent_clock(struct timespec now)
195{
196 return set_rtc_mmss(now.tv_sec);
197}
198
199unsigned long long native_read_tsc(void)
200{
201 return __native_read_tsc();
202}
203EXPORT_SYMBOL(native_read_tsc);
204
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c
index 87bc159d29d..7e004acbe52 100644
--- a/arch/x86/kernel/scx200_32.c
+++ b/arch/x86/kernel/scx200_32.c
@@ -65,7 +65,7 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
65 base = pci_resource_start(pdev, 0); 65 base = pci_resource_start(pdev, 0);
66 printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); 66 printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
67 67
68 if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) { 68 if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) {
69 printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); 69 printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
70 return -EBUSY; 70 return -EBUSY;
71 } 71 }
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index 3558ac78c92..309366f8f60 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -24,7 +24,11 @@
24#include <asm/sections.h> 24#include <asm/sections.h>
25#include <asm/setup.h> 25#include <asm/setup.h>
26 26
27#ifndef CONFIG_DEBUG_BOOT_PARAMS
27struct boot_params __initdata boot_params; 28struct boot_params __initdata boot_params;
29#else
30struct boot_params boot_params;
31#endif
28 32
29cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 33cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30 34
@@ -37,6 +41,8 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
37char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); 41char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
38 42
39unsigned long __supported_pte_mask __read_mostly = ~0UL; 43unsigned long __supported_pte_mask __read_mostly = ~0UL;
44EXPORT_SYMBOL_GPL(__supported_pte_mask);
45
40static int do_not_nx __cpuinitdata = 0; 46static int do_not_nx __cpuinitdata = 0;
41 47
42/* noexec=on|off 48/* noexec=on|off
@@ -80,6 +86,43 @@ static int __init nonx32_setup(char *str)
80__setup("noexec32=", nonx32_setup); 86__setup("noexec32=", nonx32_setup);
81 87
82/* 88/*
89 * Copy data used in early init routines from the initial arrays to the
90 * per cpu data areas. These arrays then become expendable and the
91 * *_early_ptr's are zeroed indicating that the static arrays are gone.
92 */
93static void __init setup_per_cpu_maps(void)
94{
95 int cpu;
96
97 for_each_possible_cpu(cpu) {
98#ifdef CONFIG_SMP
99 if (per_cpu_offset(cpu)) {
100#endif
101 per_cpu(x86_cpu_to_apicid, cpu) =
102 x86_cpu_to_apicid_init[cpu];
103 per_cpu(x86_bios_cpu_apicid, cpu) =
104 x86_bios_cpu_apicid_init[cpu];
105#ifdef CONFIG_NUMA
106 per_cpu(x86_cpu_to_node_map, cpu) =
107 x86_cpu_to_node_map_init[cpu];
108#endif
109#ifdef CONFIG_SMP
110 }
111 else
112 printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
113 cpu);
114#endif
115 }
116
117 /* indicate the early static arrays will soon be gone */
118 x86_cpu_to_apicid_early_ptr = NULL;
119 x86_bios_cpu_apicid_early_ptr = NULL;
120#ifdef CONFIG_NUMA
121 x86_cpu_to_node_map_early_ptr = NULL;
122#endif
123}
124
125/*
83 * Great future plan: 126 * Great future plan:
84 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 127 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
85 * Always point %gs to its beginning 128 * Always point %gs to its beginning
@@ -100,18 +143,21 @@ void __init setup_per_cpu_areas(void)
100 for_each_cpu_mask (i, cpu_possible_map) { 143 for_each_cpu_mask (i, cpu_possible_map) {
101 char *ptr; 144 char *ptr;
102 145
103 if (!NODE_DATA(cpu_to_node(i))) { 146 if (!NODE_DATA(early_cpu_to_node(i))) {
104 printk("cpu with no node %d, num_online_nodes %d\n", 147 printk("cpu with no node %d, num_online_nodes %d\n",
105 i, num_online_nodes()); 148 i, num_online_nodes());
106 ptr = alloc_bootmem_pages(size); 149 ptr = alloc_bootmem_pages(size);
107 } else { 150 } else {
108 ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); 151 ptr = alloc_bootmem_pages_node(NODE_DATA(early_cpu_to_node(i)), size);
109 } 152 }
110 if (!ptr) 153 if (!ptr)
111 panic("Cannot allocate cpu data for CPU %d\n", i); 154 panic("Cannot allocate cpu data for CPU %d\n", i);
112 cpu_pda(i)->data_offset = ptr - __per_cpu_start; 155 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
113 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 156 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
114 } 157 }
158
159 /* setup percpu data maps early */
160 setup_per_cpu_maps();
115} 161}
116 162
117void pda_init(int cpu) 163void pda_init(int cpu)
@@ -169,7 +215,8 @@ void syscall_init(void)
169#endif 215#endif
170 216
171 /* Flags to clear on syscall */ 217 /* Flags to clear on syscall */
172 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 218 wrmsrl(MSR_SYSCALL_MASK,
219 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
173} 220}
174 221
175void __cpuinit check_efer(void) 222void __cpuinit check_efer(void)
@@ -227,7 +274,7 @@ void __cpuinit cpu_init (void)
227 * and set up the GDT descriptor: 274 * and set up the GDT descriptor:
228 */ 275 */
229 if (cpu) 276 if (cpu)
230 memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); 277 memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
231 278
232 cpu_gdt_descr[cpu].size = GDT_SIZE; 279 cpu_gdt_descr[cpu].size = GDT_SIZE;
233 load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]); 280 load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]);
@@ -257,10 +304,10 @@ void __cpuinit cpu_init (void)
257 v, cpu); 304 v, cpu);
258 } 305 }
259 estacks += PAGE_SIZE << order[v]; 306 estacks += PAGE_SIZE << order[v];
260 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; 307 orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
261 } 308 }
262 309
263 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); 310 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
264 /* 311 /*
265 * <= is required because the CPU will access up to 312 * <= is required because the CPU will access up to
266 * 8 bits beyond the end of the IO permission bitmap. 313 * 8 bits beyond the end of the IO permission bitmap.
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 9c24b45b513..62adc5f20be 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -44,9 +44,12 @@
44#include <linux/crash_dump.h> 44#include <linux/crash_dump.h>
45#include <linux/dmi.h> 45#include <linux/dmi.h>
46#include <linux/pfn.h> 46#include <linux/pfn.h>
47#include <linux/pci.h>
48#include <linux/init_ohci1394_dma.h>
47 49
48#include <video/edid.h> 50#include <video/edid.h>
49 51
52#include <asm/mtrr.h>
50#include <asm/apic.h> 53#include <asm/apic.h>
51#include <asm/e820.h> 54#include <asm/e820.h>
52#include <asm/mpspec.h> 55#include <asm/mpspec.h>
@@ -67,14 +70,83 @@
67 address, and must not be in the .bss segment! */ 70 address, and must not be in the .bss segment! */
68unsigned long init_pg_tables_end __initdata = ~0UL; 71unsigned long init_pg_tables_end __initdata = ~0UL;
69 72
70int disable_pse __cpuinitdata = 0;
71
72/* 73/*
73 * Machine setup.. 74 * Machine setup..
74 */ 75 */
75extern struct resource code_resource; 76static struct resource data_resource = {
76extern struct resource data_resource; 77 .name = "Kernel data",
77extern struct resource bss_resource; 78 .start = 0,
79 .end = 0,
80 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
81};
82
83static struct resource code_resource = {
84 .name = "Kernel code",
85 .start = 0,
86 .end = 0,
87 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
88};
89
90static struct resource bss_resource = {
91 .name = "Kernel bss",
92 .start = 0,
93 .end = 0,
94 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
95};
96
97static struct resource video_ram_resource = {
98 .name = "Video RAM area",
99 .start = 0xa0000,
100 .end = 0xbffff,
101 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
102};
103
104static struct resource standard_io_resources[] = { {
105 .name = "dma1",
106 .start = 0x0000,
107 .end = 0x001f,
108 .flags = IORESOURCE_BUSY | IORESOURCE_IO
109}, {
110 .name = "pic1",
111 .start = 0x0020,
112 .end = 0x0021,
113 .flags = IORESOURCE_BUSY | IORESOURCE_IO
114}, {
115 .name = "timer0",
116 .start = 0x0040,
117 .end = 0x0043,
118 .flags = IORESOURCE_BUSY | IORESOURCE_IO
119}, {
120 .name = "timer1",
121 .start = 0x0050,
122 .end = 0x0053,
123 .flags = IORESOURCE_BUSY | IORESOURCE_IO
124}, {
125 .name = "keyboard",
126 .start = 0x0060,
127 .end = 0x006f,
128 .flags = IORESOURCE_BUSY | IORESOURCE_IO
129}, {
130 .name = "dma page reg",
131 .start = 0x0080,
132 .end = 0x008f,
133 .flags = IORESOURCE_BUSY | IORESOURCE_IO
134}, {
135 .name = "pic2",
136 .start = 0x00a0,
137 .end = 0x00a1,
138 .flags = IORESOURCE_BUSY | IORESOURCE_IO
139}, {
140 .name = "dma2",
141 .start = 0x00c0,
142 .end = 0x00df,
143 .flags = IORESOURCE_BUSY | IORESOURCE_IO
144}, {
145 .name = "fpu",
146 .start = 0x00f0,
147 .end = 0x00ff,
148 .flags = IORESOURCE_BUSY | IORESOURCE_IO
149} };
78 150
79/* cpu data as detected by the assembly code in head.S */ 151/* cpu data as detected by the assembly code in head.S */
80struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; 152struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -116,13 +188,17 @@ extern int root_mountflags;
116 188
117unsigned long saved_videomode; 189unsigned long saved_videomode;
118 190
119#define RAMDISK_IMAGE_START_MASK 0x07FF 191#define RAMDISK_IMAGE_START_MASK 0x07FF
120#define RAMDISK_PROMPT_FLAG 0x8000 192#define RAMDISK_PROMPT_FLAG 0x8000
121#define RAMDISK_LOAD_FLAG 0x4000 193#define RAMDISK_LOAD_FLAG 0x4000
122 194
123static char __initdata command_line[COMMAND_LINE_SIZE]; 195static char __initdata command_line[COMMAND_LINE_SIZE];
124 196
197#ifndef CONFIG_DEBUG_BOOT_PARAMS
125struct boot_params __initdata boot_params; 198struct boot_params __initdata boot_params;
199#else
200struct boot_params boot_params;
201#endif
126 202
127#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 203#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
128struct edd edd; 204struct edd edd;
@@ -166,8 +242,7 @@ static int __init parse_mem(char *arg)
166 return -EINVAL; 242 return -EINVAL;
167 243
168 if (strcmp(arg, "nopentium") == 0) { 244 if (strcmp(arg, "nopentium") == 0) {
169 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); 245 setup_clear_cpu_cap(X86_FEATURE_PSE);
170 disable_pse = 1;
171 } else { 246 } else {
172 /* If the user specifies memory size, we 247 /* If the user specifies memory size, we
173 * limit the BIOS-provided memory map to 248 * limit the BIOS-provided memory map to
@@ -176,7 +251,7 @@ static int __init parse_mem(char *arg)
176 * trim the existing memory map. 251 * trim the existing memory map.
177 */ 252 */
178 unsigned long long mem_size; 253 unsigned long long mem_size;
179 254
180 mem_size = memparse(arg, &arg); 255 mem_size = memparse(arg, &arg);
181 limit_regions(mem_size); 256 limit_regions(mem_size);
182 user_defined_memmap = 1; 257 user_defined_memmap = 1;
@@ -315,7 +390,7 @@ static void __init reserve_ebda_region(void)
315 unsigned int addr; 390 unsigned int addr;
316 addr = get_bios_ebda(); 391 addr = get_bios_ebda();
317 if (addr) 392 if (addr)
318 reserve_bootmem(addr, PAGE_SIZE); 393 reserve_bootmem(addr, PAGE_SIZE);
319} 394}
320 395
321#ifndef CONFIG_NEED_MULTIPLE_NODES 396#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -420,6 +495,100 @@ static inline void __init reserve_crashkernel(void)
420{} 495{}
421#endif 496#endif
422 497
498#ifdef CONFIG_BLK_DEV_INITRD
499
500static bool do_relocate_initrd = false;
501
502static void __init reserve_initrd(void)
503{
504 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
505 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
506 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
507 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
508 unsigned long ramdisk_here;
509
510 initrd_start = 0;
511
512 if (!boot_params.hdr.type_of_loader ||
513 !ramdisk_image || !ramdisk_size)
514 return; /* No initrd provided by bootloader */
515
516 if (ramdisk_end < ramdisk_image) {
517 printk(KERN_ERR "initrd wraps around end of memory, "
518 "disabling initrd\n");
519 return;
520 }
521 if (ramdisk_size >= end_of_lowmem/2) {
522 printk(KERN_ERR "initrd too large to handle, "
523 "disabling initrd\n");
524 return;
525 }
526 if (ramdisk_end <= end_of_lowmem) {
527 /* All in lowmem, easy case */
528 reserve_bootmem(ramdisk_image, ramdisk_size);
529 initrd_start = ramdisk_image + PAGE_OFFSET;
530 initrd_end = initrd_start+ramdisk_size;
531 return;
532 }
533
534 /* We need to move the initrd down into lowmem */
535 ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
536
537 /* Note: this includes all the lowmem currently occupied by
 538 the initrd; we rely on that fact to keep the data intact. */
539 reserve_bootmem(ramdisk_here, ramdisk_size);
540 initrd_start = ramdisk_here + PAGE_OFFSET;
541 initrd_end = initrd_start + ramdisk_size;
542
543 do_relocate_initrd = true;
544}
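
The early returns in reserve_initrd() above encode three cases: an image whose end wraps past the address range is rejected, an image that would consume half of lowmem or more is rejected, and an image already entirely below end_of_lowmem is reserved in place; only the remaining case schedules a relocation. A hedged userspace sketch of just that decision logic, with invented numbers:

    #include <stdio.h>

    /* mirrors the checks in reserve_initrd(); all values are illustrative */
    static const char *classify(unsigned long image, unsigned long size,
                                unsigned long end_of_lowmem)
    {
        unsigned long end = image + size;

        if (end < image)
            return "wraps around - disable";
        if (size >= end_of_lowmem / 2)
            return "too large - disable";
        if (end <= end_of_lowmem)
            return "all in lowmem - reserve in place";
        return "straddles lowmem - reserve a target and relocate later";
    }

    int main(void)
    {
        unsigned long lowmem = 0x38000000UL;   /* ~896 MB, a typical 32-bit lowmem limit */

        printf("%s\n", classify(0x37f00000UL, 0x00080000UL, lowmem));
        printf("%s\n", classify(0x37f00000UL, 0x00200000UL, lowmem));
        return 0;
    }
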
545
546#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
547
548static void __init relocate_initrd(void)
549{
550 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
551 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
552 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
553 unsigned long ramdisk_here;
554 unsigned long slop, clen, mapaddr;
555 char *p, *q;
556
557 if (!do_relocate_initrd)
558 return;
559
560 ramdisk_here = initrd_start - PAGE_OFFSET;
561
562 q = (char *)initrd_start;
563
564 /* Copy any lowmem portion of the initrd */
565 if (ramdisk_image < end_of_lowmem) {
566 clen = end_of_lowmem - ramdisk_image;
567 p = (char *)__va(ramdisk_image);
568 memcpy(q, p, clen);
569 q += clen;
570 ramdisk_image += clen;
571 ramdisk_size -= clen;
572 }
573
574 /* Copy the highmem portion of the initrd */
575 while (ramdisk_size) {
576 slop = ramdisk_image & ~PAGE_MASK;
577 clen = ramdisk_size;
578 if (clen > MAX_MAP_CHUNK-slop)
579 clen = MAX_MAP_CHUNK-slop;
580 mapaddr = ramdisk_image & PAGE_MASK;
581 p = early_ioremap(mapaddr, clen+slop);
582 memcpy(q, p+slop, clen);
583 early_iounmap(p, clen+slop);
584 q += clen;
585 ramdisk_image += clen;
586 ramdisk_size -= clen;
587 }
588}
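
relocate_initrd() above copies the highmem part through a bounded temporary mapping, so each pass maps a page-aligned base, skips the sub-page "slop", and copies at most MAX_MAP_CHUNK - slop bytes. A small sketch of that arithmetic alone (PAGE_SIZE, the chunk size and the addresses are invented, and printing stands in for the early_ioremap()/memcpy()/early_iounmap() sequence):

    #include <stdio.h>

    #define PAGE_SIZE     4096UL
    #define PAGE_MASK     (~(PAGE_SIZE - 1))
    #define MAX_MAP_CHUNK (16 * PAGE_SIZE)   /* pretend mapping window */

    int main(void)
    {
        unsigned long image = 0x38001200UL;  /* deliberately not page aligned */
        unsigned long size  = 3 * MAX_MAP_CHUNK;

        while (size) {
            unsigned long slop = image & ~PAGE_MASK;   /* offset within the first page */
            unsigned long clen = size;
            unsigned long mapaddr;

            if (clen > MAX_MAP_CHUNK - slop)
                clen = MAX_MAP_CHUNK - slop;
            mapaddr = image & PAGE_MASK;               /* page-aligned base to map */

            printf("map %#lx len %#lx, copy from +%#lx len %#lx\n",
                   mapaddr, clen + slop, slop, clen);

            image += clen;
            size  -= clen;
        }
        return 0;
    }
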
589
590#endif /* CONFIG_BLK_DEV_INITRD */
591
423void __init setup_bootmem_allocator(void) 592void __init setup_bootmem_allocator(void)
424{ 593{
425 unsigned long bootmap_size; 594 unsigned long bootmap_size;
@@ -475,26 +644,10 @@ void __init setup_bootmem_allocator(void)
475 */ 644 */
476 find_smp_config(); 645 find_smp_config();
477#endif 646#endif
478 numa_kva_reserve();
479#ifdef CONFIG_BLK_DEV_INITRD 647#ifdef CONFIG_BLK_DEV_INITRD
480 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 648 reserve_initrd();
481 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
482 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
483 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
484 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
485
486 if (ramdisk_end <= end_of_lowmem) {
487 reserve_bootmem(ramdisk_image, ramdisk_size);
488 initrd_start = ramdisk_image + PAGE_OFFSET;
489 initrd_end = initrd_start+ramdisk_size;
490 } else {
491 printk(KERN_ERR "initrd extends beyond end of memory "
492 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
493 ramdisk_end, end_of_lowmem);
494 initrd_start = 0;
495 }
496 }
497#endif 649#endif
650 numa_kva_reserve();
498 reserve_crashkernel(); 651 reserve_crashkernel();
499} 652}
500 653
@@ -545,17 +698,11 @@ void __init setup_arch(char **cmdline_p)
545 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 698 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
546 pre_setup_arch_hook(); 699 pre_setup_arch_hook();
547 early_cpu_init(); 700 early_cpu_init();
701 early_ioremap_init();
548 702
549 /*
550 * FIXME: This isn't an official loader_type right
551 * now but does currently work with elilo.
552 * If we were configured as an EFI kernel, check to make
553 * sure that we were loaded correctly from elilo and that
554 * the system table is valid. If not, then initialize normally.
555 */
556#ifdef CONFIG_EFI 703#ifdef CONFIG_EFI
557 if ((boot_params.hdr.type_of_loader == 0x50) && 704 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
558 boot_params.efi_info.efi_systab) 705 "EL32", 4))
559 efi_enabled = 1; 706 efi_enabled = 1;
560#endif 707#endif
561 708
@@ -579,12 +726,9 @@ void __init setup_arch(char **cmdline_p)
579 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); 726 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
580#endif 727#endif
581 ARCH_SETUP 728 ARCH_SETUP
582 if (efi_enabled) 729
583 efi_init(); 730 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
584 else { 731 print_memory_map(memory_setup());
585 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
586 print_memory_map(memory_setup());
587 }
588 732
589 copy_edd(); 733 copy_edd();
590 734
@@ -612,8 +756,16 @@ void __init setup_arch(char **cmdline_p)
612 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 756 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
613 *cmdline_p = command_line; 757 *cmdline_p = command_line;
614 758
759 if (efi_enabled)
760 efi_init();
761
615 max_low_pfn = setup_memory(); 762 max_low_pfn = setup_memory();
616 763
764 /* update e820 for memory not covered by WB MTRRs */
765 mtrr_bp_init();
766 if (mtrr_trim_uncached_memory(max_pfn))
767 max_low_pfn = setup_memory();
768
617#ifdef CONFIG_VMI 769#ifdef CONFIG_VMI
618 /* 770 /*
619 * Must be after max_low_pfn is determined, and before kernel 771 * Must be after max_low_pfn is determined, and before kernel
@@ -636,6 +788,16 @@ void __init setup_arch(char **cmdline_p)
636 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ 788 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
637#endif 789#endif
638 paging_init(); 790 paging_init();
791
792 /*
793 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
794 */
795
796#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
797 if (init_ohci1394_dma_early)
798 init_ohci1394_dma_on_all_controllers();
799#endif
800
639 remapped_pgdat_init(); 801 remapped_pgdat_init();
640 sparse_init(); 802 sparse_init();
641 zone_sizes_init(); 803 zone_sizes_init();
@@ -644,15 +806,19 @@ void __init setup_arch(char **cmdline_p)
644 * NOTE: at this point the bootmem allocator is fully available. 806 * NOTE: at this point the bootmem allocator is fully available.
645 */ 807 */
646 808
809#ifdef CONFIG_BLK_DEV_INITRD
810 relocate_initrd();
811#endif
812
647 paravirt_post_allocator_init(); 813 paravirt_post_allocator_init();
648 814
649 dmi_scan_machine(); 815 dmi_scan_machine();
650 816
817 io_delay_init();
818
651#ifdef CONFIG_X86_GENERICARCH 819#ifdef CONFIG_X86_GENERICARCH
652 generic_apic_probe(); 820 generic_apic_probe();
653#endif 821#endif
654 if (efi_enabled)
655 efi_map_memmap();
656 822
657#ifdef CONFIG_ACPI 823#ifdef CONFIG_ACPI
658 /* 824 /*
@@ -661,9 +827,7 @@ void __init setup_arch(char **cmdline_p)
661 acpi_boot_table_init(); 827 acpi_boot_table_init();
662#endif 828#endif
663 829
664#ifdef CONFIG_PCI
665 early_quirks(); 830 early_quirks();
666#endif
667 831
668#ifdef CONFIG_ACPI 832#ifdef CONFIG_ACPI
669 acpi_boot_init(); 833 acpi_boot_init();
@@ -692,3 +856,26 @@ void __init setup_arch(char **cmdline_p)
692#endif 856#endif
693#endif 857#endif
694} 858}
859
860/*
861 * Request address space for all standard resources
862 *
863 * This is called just before pcibios_init(), which is also a
864 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
865 */
866static int __init request_standard_resources(void)
867{
868 int i;
869
870 printk(KERN_INFO "Setting up standard PCI resources\n");
871 init_iomem_resources(&code_resource, &data_resource, &bss_resource);
872
873 request_resource(&iomem_resource, &video_ram_resource);
874
875 /* request I/O space for devices used on all i[345]86 PCs */
876 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
877 request_resource(&ioport_resource, &standard_io_resources[i]);
878 return 0;
879}
880
881subsys_initcall(request_standard_resources);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 30d94d1d5f5..c8939dfddfb 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -30,6 +30,7 @@
30#include <linux/crash_dump.h> 30#include <linux/crash_dump.h>
31#include <linux/root_dev.h> 31#include <linux/root_dev.h>
32#include <linux/pci.h> 32#include <linux/pci.h>
33#include <linux/efi.h>
33#include <linux/acpi.h> 34#include <linux/acpi.h>
34#include <linux/kallsyms.h> 35#include <linux/kallsyms.h>
35#include <linux/edd.h> 36#include <linux/edd.h>
@@ -39,10 +40,13 @@
39#include <linux/dmi.h> 40#include <linux/dmi.h>
40#include <linux/dma-mapping.h> 41#include <linux/dma-mapping.h>
41#include <linux/ctype.h> 42#include <linux/ctype.h>
43#include <linux/uaccess.h>
44#include <linux/init_ohci1394_dma.h>
42 45
43#include <asm/mtrr.h> 46#include <asm/mtrr.h>
44#include <asm/uaccess.h> 47#include <asm/uaccess.h>
45#include <asm/system.h> 48#include <asm/system.h>
49#include <asm/vsyscall.h>
46#include <asm/io.h> 50#include <asm/io.h>
47#include <asm/smp.h> 51#include <asm/smp.h>
48#include <asm/msr.h> 52#include <asm/msr.h>
@@ -50,6 +54,7 @@
50#include <video/edid.h> 54#include <video/edid.h>
51#include <asm/e820.h> 55#include <asm/e820.h>
52#include <asm/dma.h> 56#include <asm/dma.h>
57#include <asm/gart.h>
53#include <asm/mpspec.h> 58#include <asm/mpspec.h>
54#include <asm/mmu_context.h> 59#include <asm/mmu_context.h>
55#include <asm/proto.h> 60#include <asm/proto.h>
@@ -59,6 +64,15 @@
59#include <asm/sections.h> 64#include <asm/sections.h>
60#include <asm/dmi.h> 65#include <asm/dmi.h>
61#include <asm/cacheflush.h> 66#include <asm/cacheflush.h>
67#include <asm/mce.h>
68#include <asm/ds.h>
69#include <asm/topology.h>
70
71#ifdef CONFIG_PARAVIRT
72#include <asm/paravirt.h>
73#else
74#define ARCH_SETUP
75#endif
62 76
63/* 77/*
64 * Machine setup.. 78 * Machine setup..
@@ -67,6 +81,8 @@
67struct cpuinfo_x86 boot_cpu_data __read_mostly; 81struct cpuinfo_x86 boot_cpu_data __read_mostly;
68EXPORT_SYMBOL(boot_cpu_data); 82EXPORT_SYMBOL(boot_cpu_data);
69 83
84__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
85
70unsigned long mmu_cr4_features; 86unsigned long mmu_cr4_features;
71 87
72/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 88/* Boot loader ID as an integer, for the benefit of proc_dointvec */
@@ -76,7 +92,7 @@ unsigned long saved_video_mode;
76 92
77int force_mwait __cpuinitdata; 93int force_mwait __cpuinitdata;
78 94
79/* 95/*
80 * Early DMI memory 96 * Early DMI memory
81 */ 97 */
82int dmi_alloc_index; 98int dmi_alloc_index;
@@ -122,25 +138,27 @@ struct resource standard_io_resources[] = {
122 138
123#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) 139#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
124 140
125struct resource data_resource = { 141static struct resource data_resource = {
126 .name = "Kernel data", 142 .name = "Kernel data",
127 .start = 0, 143 .start = 0,
128 .end = 0, 144 .end = 0,
129 .flags = IORESOURCE_RAM, 145 .flags = IORESOURCE_RAM,
130}; 146};
131struct resource code_resource = { 147static struct resource code_resource = {
132 .name = "Kernel code", 148 .name = "Kernel code",
133 .start = 0, 149 .start = 0,
134 .end = 0, 150 .end = 0,
135 .flags = IORESOURCE_RAM, 151 .flags = IORESOURCE_RAM,
136}; 152};
137struct resource bss_resource = { 153static struct resource bss_resource = {
138 .name = "Kernel bss", 154 .name = "Kernel bss",
139 .start = 0, 155 .start = 0,
140 .end = 0, 156 .end = 0,
141 .flags = IORESOURCE_RAM, 157 .flags = IORESOURCE_RAM,
142}; 158};
143 159
160static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
161
144#ifdef CONFIG_PROC_VMCORE 162#ifdef CONFIG_PROC_VMCORE
145/* elfcorehdr= specifies the location of elf core header 163/* elfcorehdr= specifies the location of elf core header
146 * stored by the crashed kernel. This option will be passed 164 * stored by the crashed kernel. This option will be passed
@@ -164,14 +182,15 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
164 unsigned long bootmap_size, bootmap; 182 unsigned long bootmap_size, bootmap;
165 183
166 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; 184 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
167 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); 185 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
186 PAGE_SIZE);
168 if (bootmap == -1L) 187 if (bootmap == -1L)
169 panic("Cannot find bootmem map of size %ld\n",bootmap_size); 188 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
170 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); 189 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
171 e820_register_active_regions(0, start_pfn, end_pfn); 190 e820_register_active_regions(0, start_pfn, end_pfn);
172 free_bootmem_with_active_regions(0, end_pfn); 191 free_bootmem_with_active_regions(0, end_pfn);
173 reserve_bootmem(bootmap, bootmap_size); 192 reserve_bootmem(bootmap, bootmap_size);
174} 193}
175#endif 194#endif
176 195
177#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 196#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
@@ -205,7 +224,8 @@ static void __init reserve_crashkernel(void)
205 unsigned long long crash_size, crash_base; 224 unsigned long long crash_size, crash_base;
206 int ret; 225 int ret;
207 226
208 free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; 227 free_mem =
228 ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
209 229
210 ret = parse_crashkernel(boot_command_line, free_mem, 230 ret = parse_crashkernel(boot_command_line, free_mem,
211 &crash_size, &crash_base); 231 &crash_size, &crash_base);
@@ -229,33 +249,21 @@ static inline void __init reserve_crashkernel(void)
229{} 249{}
230#endif 250#endif
231 251
232#define EBDA_ADDR_POINTER 0x40E 252/* Overridden in paravirt.c if CONFIG_PARAVIRT */
233 253void __attribute__((weak)) __init memory_setup(void)
234unsigned __initdata ebda_addr;
235unsigned __initdata ebda_size;
236
237static void discover_ebda(void)
238{ 254{
239 /* 255 machine_specific_memory_setup();
240 * there is a real-mode segmented pointer pointing to the
241 * 4K EBDA area at 0x40E
242 */
243 ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
244 ebda_addr <<= 4;
245
246 ebda_size = *(unsigned short *)__va(ebda_addr);
247
248 /* Round EBDA up to pages */
249 if (ebda_size == 0)
250 ebda_size = 1;
251 ebda_size <<= 10;
252 ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
253 if (ebda_size > 64*1024)
254 ebda_size = 64*1024;
255} 256}
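
The replacement above also introduces a pattern worth calling out: memory_setup() is defined weak so that, with CONFIG_PARAVIRT, a strong definition in paravirt.c silently takes its place at link time, which is why the loader-specific EBDA discovery could simply drop out here. A two-file sketch of the weak-symbol mechanism (file names and strings are illustrative; this relies on the GCC/ELF weak attribute, nothing x86-specific):

    /* default.c */
    #include <stdio.h>

    void __attribute__((weak)) memory_setup(void)   /* weak default, as in the hunk above */
    {
        puts("generic memory_setup()");
    }

    int main(void)
    {
        memory_setup();
        return 0;
    }

    /* override.c - linking this in replaces the weak default, much as paravirt.c would */
    #include <stdio.h>

    void memory_setup(void)
    {
        puts("paravirt memory_setup()");
    }

Built as "cc default.c" the weak body runs; built as "cc default.c override.c" the strong definition wins.
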
256 257
258/*
259 * setup_arch - architecture-specific boot-time initializations
260 *
261 * Note: On x86_64, fixmaps are ready for use even before this is called.
262 */
257void __init setup_arch(char **cmdline_p) 263void __init setup_arch(char **cmdline_p)
258{ 264{
265 unsigned i;
266
259 printk(KERN_INFO "Command line: %s\n", boot_command_line); 267 printk(KERN_INFO "Command line: %s\n", boot_command_line);
260 268
261 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 269 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -269,7 +277,15 @@ void __init setup_arch(char **cmdline_p)
269 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); 277 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
270 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); 278 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
271#endif 279#endif
272 setup_memory_region(); 280#ifdef CONFIG_EFI
281 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
282 "EL64", 4))
283 efi_enabled = 1;
284#endif
285
286 ARCH_SETUP
287
288 memory_setup();
273 copy_edd(); 289 copy_edd();
274 290
275 if (!boot_params.hdr.root_flags) 291 if (!boot_params.hdr.root_flags)
@@ -293,27 +309,47 @@ void __init setup_arch(char **cmdline_p)
293 309
294 parse_early_param(); 310 parse_early_param();
295 311
312#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
313 if (init_ohci1394_dma_early)
314 init_ohci1394_dma_on_all_controllers();
315#endif
316
296 finish_e820_parsing(); 317 finish_e820_parsing();
297 318
319 early_gart_iommu_check();
320
298 e820_register_active_regions(0, 0, -1UL); 321 e820_register_active_regions(0, 0, -1UL);
299 /* 322 /*
300 * partially used pages are not usable - thus 323 * partially used pages are not usable - thus
301 * we are rounding upwards: 324 * we are rounding upwards:
302 */ 325 */
303 end_pfn = e820_end_of_ram(); 326 end_pfn = e820_end_of_ram();
327 /* update e820 for memory not covered by WB MTRRs */
328 mtrr_bp_init();
329 if (mtrr_trim_uncached_memory(end_pfn)) {
330 e820_register_active_regions(0, 0, -1UL);
331 end_pfn = e820_end_of_ram();
332 }
333
304 num_physpages = end_pfn; 334 num_physpages = end_pfn;
305 335
306 check_efer(); 336 check_efer();
307 337
308 discover_ebda();
309
310 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); 338 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
339 if (efi_enabled)
340 efi_init();
311 341
312 dmi_scan_machine(); 342 dmi_scan_machine();
313 343
344 io_delay_init();
345
314#ifdef CONFIG_SMP 346#ifdef CONFIG_SMP
315 /* setup to use the static apicid table during kernel startup */ 347 /* setup to use the early static init tables during kernel startup */
316 x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init; 348 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
349 x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
350#ifdef CONFIG_NUMA
351 x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
352#endif
317#endif 353#endif
318 354
319#ifdef CONFIG_ACPI 355#ifdef CONFIG_ACPI
@@ -340,48 +376,26 @@ void __init setup_arch(char **cmdline_p)
340#endif 376#endif
341 377
342#ifdef CONFIG_NUMA 378#ifdef CONFIG_NUMA
343 numa_initmem_init(0, end_pfn); 379 numa_initmem_init(0, end_pfn);
344#else 380#else
345 contig_initmem_init(0, end_pfn); 381 contig_initmem_init(0, end_pfn);
346#endif 382#endif
347 383
348 /* Reserve direct mapping */ 384 early_res_to_bootmem();
349 reserve_bootmem_generic(table_start << PAGE_SHIFT,
350 (table_end - table_start) << PAGE_SHIFT);
351
352 /* reserve kernel */
353 reserve_bootmem_generic(__pa_symbol(&_text),
354 __pa_symbol(&_end) - __pa_symbol(&_text));
355 385
386#ifdef CONFIG_ACPI_SLEEP
356 /* 387 /*
357 * reserve physical page 0 - it's a special BIOS page on many boxes, 388 * Reserve low memory region for sleep support.
358 * enabling clean reboots, SMP operation, laptop functions.
359 */ 389 */
360 reserve_bootmem_generic(0, PAGE_SIZE); 390 acpi_reserve_bootmem();
361
362 /* reserve ebda region */
363 if (ebda_addr)
364 reserve_bootmem_generic(ebda_addr, ebda_size);
365#ifdef CONFIG_NUMA
366 /* reserve nodemap region */
367 if (nodemap_addr)
368 reserve_bootmem_generic(nodemap_addr, nodemap_size);
369#endif 391#endif
370 392
371#ifdef CONFIG_SMP 393 if (efi_enabled)
372 /* Reserve SMP trampoline */ 394 efi_reserve_bootmem();
373 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
374#endif
375 395
376#ifdef CONFIG_ACPI_SLEEP
377 /* 396 /*
378 * Reserve low memory region for sleep support. 397 * Find and reserve possible boot-time SMP configuration:
379 */ 398 */
380 acpi_reserve_bootmem();
381#endif
382 /*
383 * Find and reserve possible boot-time SMP configuration:
384 */
385 find_smp_config(); 399 find_smp_config();
386#ifdef CONFIG_BLK_DEV_INITRD 400#ifdef CONFIG_BLK_DEV_INITRD
387 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 401 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
@@ -395,6 +409,8 @@ void __init setup_arch(char **cmdline_p)
395 initrd_start = ramdisk_image + PAGE_OFFSET; 409 initrd_start = ramdisk_image + PAGE_OFFSET;
396 initrd_end = initrd_start+ramdisk_size; 410 initrd_end = initrd_start+ramdisk_size;
397 } else { 411 } else {
412 /* Assumes everything on node 0 */
413 free_bootmem(ramdisk_image, ramdisk_size);
398 printk(KERN_ERR "initrd extends beyond end of memory " 414 printk(KERN_ERR "initrd extends beyond end of memory "
399 "(0x%08lx > 0x%08lx)\ndisabling initrd\n", 415 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
400 ramdisk_end, end_of_mem); 416 ramdisk_end, end_of_mem);
@@ -404,17 +420,10 @@ void __init setup_arch(char **cmdline_p)
404#endif 420#endif
405 reserve_crashkernel(); 421 reserve_crashkernel();
406 paging_init(); 422 paging_init();
423 map_vsyscall();
407 424
408#ifdef CONFIG_PCI
409 early_quirks(); 425 early_quirks();
410#endif
411 426
412 /*
413 * set this early, so we dont allocate cpu0
414 * if MADT list doesnt list BSP first
415 * mpparse.c/MP_processor_info() allocates logical cpu numbers.
416 */
417 cpu_set(0, cpu_present_map);
418#ifdef CONFIG_ACPI 427#ifdef CONFIG_ACPI
419 /* 428 /*
420 * Read APIC and some other early information from ACPI tables. 429 * Read APIC and some other early information from ACPI tables.
@@ -430,25 +439,24 @@ void __init setup_arch(char **cmdline_p)
430 if (smp_found_config) 439 if (smp_found_config)
431 get_smp_config(); 440 get_smp_config();
432 init_apic_mappings(); 441 init_apic_mappings();
442 ioapic_init_mappings();
433 443
434 /* 444 /*
435 * We trust e820 completely. No explicit ROM probing in memory. 445 * We trust e820 completely. No explicit ROM probing in memory.
436 */ 446 */
437 e820_reserve_resources(); 447 e820_reserve_resources(&code_resource, &data_resource, &bss_resource);
438 e820_mark_nosave_regions(); 448 e820_mark_nosave_regions();
439 449
440 {
441 unsigned i;
442 /* request I/O space for devices used on all i[345]86 PCs */ 450 /* request I/O space for devices used on all i[345]86 PCs */
443 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) 451 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
444 request_resource(&ioport_resource, &standard_io_resources[i]); 452 request_resource(&ioport_resource, &standard_io_resources[i]);
445 }
446 453
447 e820_setup_gap(); 454 e820_setup_gap();
448 455
449#ifdef CONFIG_VT 456#ifdef CONFIG_VT
450#if defined(CONFIG_VGA_CONSOLE) 457#if defined(CONFIG_VGA_CONSOLE)
451 conswitchp = &vga_con; 458 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
459 conswitchp = &vga_con;
452#elif defined(CONFIG_DUMMY_CONSOLE) 460#elif defined(CONFIG_DUMMY_CONSOLE)
453 conswitchp = &dummy_con; 461 conswitchp = &dummy_con;
454#endif 462#endif
@@ -479,9 +487,10 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
479 487
480 if (n >= 0x80000005) { 488 if (n >= 0x80000005) {
481 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 489 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
482 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 490 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
483 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 491 "D cache %dK (%d bytes/line)\n",
484 c->x86_cache_size=(ecx>>24)+(edx>>24); 492 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
493 c->x86_cache_size = (ecx>>24) + (edx>>24);
485 /* On K8 L1 TLB is inclusive, so don't count it */ 494 /* On K8 L1 TLB is inclusive, so don't count it */
486 c->x86_tlbsize = 0; 495 c->x86_tlbsize = 0;
487 } 496 }
@@ -495,11 +504,8 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
495 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 504 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
496 c->x86_cache_size, ecx & 0xFF); 505 c->x86_cache_size, ecx & 0xFF);
497 } 506 }
498
499 if (n >= 0x80000007)
500 cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
501 if (n >= 0x80000008) { 507 if (n >= 0x80000008) {
502 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); 508 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
503 c->x86_virt_bits = (eax >> 8) & 0xff; 509 c->x86_virt_bits = (eax >> 8) & 0xff;
504 c->x86_phys_bits = eax & 0xff; 510 c->x86_phys_bits = eax & 0xff;
505 } 511 }
@@ -508,14 +514,15 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
508#ifdef CONFIG_NUMA 514#ifdef CONFIG_NUMA
509static int nearby_node(int apicid) 515static int nearby_node(int apicid)
510{ 516{
511 int i; 517 int i, node;
518
512 for (i = apicid - 1; i >= 0; i--) { 519 for (i = apicid - 1; i >= 0; i--) {
513 int node = apicid_to_node[i]; 520 node = apicid_to_node[i];
514 if (node != NUMA_NO_NODE && node_online(node)) 521 if (node != NUMA_NO_NODE && node_online(node))
515 return node; 522 return node;
516 } 523 }
517 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 524 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
518 int node = apicid_to_node[i]; 525 node = apicid_to_node[i];
519 if (node != NUMA_NO_NODE && node_online(node)) 526 if (node != NUMA_NO_NODE && node_online(node))
520 return node; 527 return node;
521 } 528 }
@@ -527,7 +534,7 @@ static int nearby_node(int apicid)
 527 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores. 534

528 * Assumes number of cores is a power of two. 535 * Assumes number of cores is a power of two.
529 */ 536 */
530static void __init amd_detect_cmp(struct cpuinfo_x86 *c) 537static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
531{ 538{
532#ifdef CONFIG_SMP 539#ifdef CONFIG_SMP
533 unsigned bits; 540 unsigned bits;
@@ -536,7 +543,54 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
536 int node = 0; 543 int node = 0;
537 unsigned apicid = hard_smp_processor_id(); 544 unsigned apicid = hard_smp_processor_id();
538#endif 545#endif
539 unsigned ecx = cpuid_ecx(0x80000008); 546 bits = c->x86_coreid_bits;
547
548 /* Low order bits define the core id (index of core in socket) */
549 c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
550 /* Convert the APIC ID into the socket ID */
551 c->phys_proc_id = phys_pkg_id(bits);
552
553#ifdef CONFIG_NUMA
554 node = c->phys_proc_id;
555 if (apicid_to_node[apicid] != NUMA_NO_NODE)
556 node = apicid_to_node[apicid];
557 if (!node_online(node)) {
558 /* Two possibilities here:
559 - The CPU is missing memory and no node was created.
560 In that case try picking one from a nearby CPU
561 - The APIC IDs differ from the HyperTransport node IDs
562 which the K8 northbridge parsing fills in.
563 Assume they are all increased by a constant offset,
564 but in the same order as the HT nodeids.
565 If that doesn't result in a usable node fall back to the
566 path for the previous case. */
567
568 int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
569
570 if (ht_nodeid >= 0 &&
571 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
572 node = apicid_to_node[ht_nodeid];
573 /* Pick a nearby node */
574 if (!node_online(node))
575 node = nearby_node(apicid);
576 }
577 numa_set_node(cpu, node);
578
579 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
580#endif
581#endif
582}
583
584static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
585{
586#ifdef CONFIG_SMP
587 unsigned bits, ecx;
588
589 /* Multi core CPU? */
590 if (c->extended_cpuid_level < 0x80000008)
591 return;
592
593 ecx = cpuid_ecx(0x80000008);
540 594
541 c->x86_max_cores = (ecx & 0xff) + 1; 595 c->x86_max_cores = (ecx & 0xff) + 1;
542 596
@@ -549,37 +603,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
549 bits++; 603 bits++;
550 } 604 }
551 605
552 /* Low order bits define the core id (index of core in socket) */ 606 c->x86_coreid_bits = bits;
553 c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
554 /* Convert the APIC ID into the socket ID */
555 c->phys_proc_id = phys_pkg_id(bits);
556
557#ifdef CONFIG_NUMA
558 node = c->phys_proc_id;
559 if (apicid_to_node[apicid] != NUMA_NO_NODE)
560 node = apicid_to_node[apicid];
561 if (!node_online(node)) {
562 /* Two possibilities here:
563 - The CPU is missing memory and no node was created.
564 In that case try picking one from a nearby CPU
565 - The APIC IDs differ from the HyperTransport node IDs
566 which the K8 northbridge parsing fills in.
567 Assume they are all increased by a constant offset,
568 but in the same order as the HT nodeids.
569 If that doesn't result in a usable node fall back to the
570 path for the previous case. */
571 int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
572 if (ht_nodeid >= 0 &&
573 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
574 node = apicid_to_node[ht_nodeid];
575 /* Pick a nearby node */
576 if (!node_online(node))
577 node = nearby_node(apicid);
578 }
579 numa_set_node(cpu, node);
580 607
581 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
582#endif
583#endif 608#endif
584} 609}
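
Taken together with amd_detect_cmp() further up, the new early_init_amd_mc() splits the initial APIC ID into core and package parts: CPUID 0x8000_0008 ECX gives the core count in bits [7:0] and, on newer parts, the core-ID width in bits [15:12] (field positions as I understand the AMD CPUID documentation; treat them as an assumption), falling back to rounding the core count up to a power of two. A worked example with invented register values:

    #include <stdio.h>

    int main(void)
    {
        unsigned int ecx = 0x00003005;          /* pretend CPUID 0x80000008 ECX */
        unsigned int apicid = 0x1a;             /* pretend initial APIC ID (0b11010) */
        unsigned int max_cores, bits, core_id, pkg_id;

        max_cores = (ecx & 0xff) + 1;           /* 6 cores */
        bits = (ecx >> 12) & 0xf;               /* ApicIdCoreIdSize field: 3 */
        if (bits == 0)                          /* older parts: derive from the core count */
            while ((1u << bits) < max_cores)
                bits++;

        core_id = apicid & ((1u << bits) - 1);  /* low bits index the core in the socket */
        pkg_id  = apicid >> bits;               /* the rest identifies the socket */

        printf("cores=%u coreid_bits=%u -> core %u of package %u\n",
               max_cores, bits, core_id, pkg_id);
        return 0;
    }
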
585 610
@@ -595,8 +620,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
595/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ 620/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
596static __cpuinit int amd_apic_timer_broken(void) 621static __cpuinit int amd_apic_timer_broken(void)
597{ 622{
598 u32 lo, hi; 623 u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
599 u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 624
600 switch (eax & CPUID_XFAM) { 625 switch (eax & CPUID_XFAM) {
601 case CPUID_XFAM_K8: 626 case CPUID_XFAM_K8:
602 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) 627 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
@@ -614,6 +639,15 @@ static __cpuinit int amd_apic_timer_broken(void)
614 return 0; 639 return 0;
615} 640}
616 641
642static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
643{
644 early_init_amd_mc(c);
645
646 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
647 if (c->x86_power & (1<<8))
648 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
649}
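
early_init_amd() above keys X86_FEATURE_CONSTANT_TSC off bit 8 of the power-management word (CPUID 0x8000_0007 EDX, the value stored into c->x86_power by the early path added below). The same probe can be run standalone, assuming an x86 build and GCC/clang's <cpuid.h>:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
            puts("extended leaf 0x80000007 not available");
            return 1;
        }
        printf("constant/invariant TSC: %s\n", (edx & (1u << 8)) ? "yes" : "no");
        return 0;
    }
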
650
617static void __cpuinit init_amd(struct cpuinfo_x86 *c) 651static void __cpuinit init_amd(struct cpuinfo_x86 *c)
618{ 652{
619 unsigned level; 653 unsigned level;
@@ -624,7 +658,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
624 /* 658 /*
625 * Disable TLB flush filter by setting HWCR.FFDIS on K8 659 * Disable TLB flush filter by setting HWCR.FFDIS on K8
626 * bit 6 of msr C001_0015 660 * bit 6 of msr C001_0015
627 * 661 *
628 * Errata 63 for SH-B3 steppings 662 * Errata 63 for SH-B3 steppings
629 * Errata 122 for all steppings (F+ have it disabled by default) 663 * Errata 122 for all steppings (F+ have it disabled by default)
630 */ 664 */
@@ -637,35 +671,32 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
637 671
638 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 672 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
639 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ 673 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
640 clear_bit(0*32+31, &c->x86_capability); 674 clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
641 675
642 /* On C+ stepping K8 rep microcode works well for copy/memset */ 676 /* On C+ stepping K8 rep microcode works well for copy/memset */
643 level = cpuid_eax(1); 677 level = cpuid_eax(1);
644 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) 678 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
645 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 679 level >= 0x0f58))
680 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
646 if (c->x86 == 0x10 || c->x86 == 0x11) 681 if (c->x86 == 0x10 || c->x86 == 0x11)
647 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 682 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
648 683
649 /* Enable workaround for FXSAVE leak */ 684 /* Enable workaround for FXSAVE leak */
650 if (c->x86 >= 6) 685 if (c->x86 >= 6)
651 set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); 686 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
652 687
653 level = get_model_name(c); 688 level = get_model_name(c);
654 if (!level) { 689 if (!level) {
655 switch (c->x86) { 690 switch (c->x86) {
656 case 15: 691 case 15:
657 /* Should distinguish Models here, but this is only 692 /* Should distinguish Models here, but this is only
658 a fallback anyways. */ 693 a fallback anyways. */
659 strcpy(c->x86_model_id, "Hammer"); 694 strcpy(c->x86_model_id, "Hammer");
660 break; 695 break;
661 } 696 }
662 } 697 }
663 display_cacheinfo(c); 698 display_cacheinfo(c);
664 699
665 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
666 if (c->x86_power & (1<<8))
667 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
668
669 /* Multi core CPU? */ 700 /* Multi core CPU? */
670 if (c->extended_cpuid_level >= 0x80000008) 701 if (c->extended_cpuid_level >= 0x80000008)
671 amd_detect_cmp(c); 702 amd_detect_cmp(c);
@@ -677,41 +708,38 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
677 num_cache_leaves = 3; 708 num_cache_leaves = 3;
678 709
679 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) 710 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
680 set_bit(X86_FEATURE_K8, &c->x86_capability); 711 set_cpu_cap(c, X86_FEATURE_K8);
681
682 /* RDTSC can be speculated around */
683 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
684 712
685 /* Family 10 doesn't support C states in MWAIT so don't use it */ 713 /* MFENCE stops RDTSC speculation */
686 if (c->x86 == 0x10 && !force_mwait) 714 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
687 clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
688 715
689 if (amd_apic_timer_broken()) 716 if (amd_apic_timer_broken())
690 disable_apic_timer = 1; 717 disable_apic_timer = 1;
691} 718}
692 719
693static void __cpuinit detect_ht(struct cpuinfo_x86 *c) 720void __cpuinit detect_ht(struct cpuinfo_x86 *c)
694{ 721{
695#ifdef CONFIG_SMP 722#ifdef CONFIG_SMP
696 u32 eax, ebx, ecx, edx; 723 u32 eax, ebx, ecx, edx;
697 int index_msb, core_bits; 724 int index_msb, core_bits;
698 725
699 cpuid(1, &eax, &ebx, &ecx, &edx); 726 cpuid(1, &eax, &ebx, &ecx, &edx);
700 727
701 728
702 if (!cpu_has(c, X86_FEATURE_HT)) 729 if (!cpu_has(c, X86_FEATURE_HT))
703 return; 730 return;
704 if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) 731 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
705 goto out; 732 goto out;
706 733
707 smp_num_siblings = (ebx & 0xff0000) >> 16; 734 smp_num_siblings = (ebx & 0xff0000) >> 16;
708 735
709 if (smp_num_siblings == 1) { 736 if (smp_num_siblings == 1) {
710 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 737 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
711 } else if (smp_num_siblings > 1 ) { 738 } else if (smp_num_siblings > 1) {
712 739
713 if (smp_num_siblings > NR_CPUS) { 740 if (smp_num_siblings > NR_CPUS) {
714 printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); 741 printk(KERN_WARNING "CPU: Unsupported number of "
742 "siblings %d", smp_num_siblings);
715 smp_num_siblings = 1; 743 smp_num_siblings = 1;
716 return; 744 return;
717 } 745 }
@@ -721,7 +749,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
721 749
722 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 750 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
723 751
724 index_msb = get_count_order(smp_num_siblings) ; 752 index_msb = get_count_order(smp_num_siblings);
725 753
726 core_bits = get_count_order(c->x86_max_cores); 754 core_bits = get_count_order(c->x86_max_cores);
727 755
@@ -730,8 +758,10 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
730 } 758 }
731out: 759out:
732 if ((c->x86_max_cores * smp_num_siblings) > 1) { 760 if ((c->x86_max_cores * smp_num_siblings) > 1) {
733 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); 761 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
734 printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); 762 c->phys_proc_id);
763 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
764 c->cpu_core_id);
735 } 765 }
736 766
737#endif 767#endif
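
detect_ht() above carves the initial APIC ID into thread, core and package fields using two power-of-two widths: get_count_order() of the threads per core and of the cores per package. A worked userspace rendition of the same decomposition (ceil_log2 stands in for get_count_order; the counts and APIC ID are invented):

    #include <stdio.h>

    static unsigned int ceil_log2(unsigned int n)   /* stand-in for get_count_order() */
    {
        unsigned int order = 0;

        while ((1u << order) < n)
            order++;
        return order;
    }

    int main(void)
    {
        unsigned int siblings_per_pkg = 8;   /* logical CPUs per package, CPUID.1 EBX[23:16] */
        unsigned int cores_per_pkg = 4;
        unsigned int apicid = 0x2d;          /* 0b101101 */

        unsigned int smt_per_core = siblings_per_pkg / cores_per_pkg;
        unsigned int smt_bits  = ceil_log2(smt_per_core);
        unsigned int core_bits = ceil_log2(cores_per_pkg);

        unsigned int smt_id  = apicid & ((1u << smt_bits) - 1);
        unsigned int core_id = (apicid >> smt_bits) & ((1u << core_bits) - 1);
        unsigned int pkg_id  = apicid >> (smt_bits + core_bits);

        printf("APIC %#x -> package %u, core %u, thread %u\n",
               apicid, pkg_id, core_id, smt_id);
        return 0;
    }
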
@@ -773,28 +803,39 @@ static void srat_detect_node(void)
773#endif 803#endif
774} 804}
775 805
806static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
807{
808 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
809 (c->x86 == 0x6 && c->x86_model >= 0x0e))
810 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
811}
812
776static void __cpuinit init_intel(struct cpuinfo_x86 *c) 813static void __cpuinit init_intel(struct cpuinfo_x86 *c)
777{ 814{
778 /* Cache sizes */ 815 /* Cache sizes */
779 unsigned n; 816 unsigned n;
780 817
781 init_intel_cacheinfo(c); 818 init_intel_cacheinfo(c);
782 if (c->cpuid_level > 9 ) { 819 if (c->cpuid_level > 9) {
783 unsigned eax = cpuid_eax(10); 820 unsigned eax = cpuid_eax(10);
784 /* Check for version and the number of counters */ 821 /* Check for version and the number of counters */
785 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) 822 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
786 set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); 823 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
787 } 824 }
788 825
789 if (cpu_has_ds) { 826 if (cpu_has_ds) {
790 unsigned int l1, l2; 827 unsigned int l1, l2;
791 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); 828 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
792 if (!(l1 & (1<<11))) 829 if (!(l1 & (1<<11)))
793 set_bit(X86_FEATURE_BTS, c->x86_capability); 830 set_cpu_cap(c, X86_FEATURE_BTS);
794 if (!(l1 & (1<<12))) 831 if (!(l1 & (1<<12)))
795 set_bit(X86_FEATURE_PEBS, c->x86_capability); 832 set_cpu_cap(c, X86_FEATURE_PEBS);
796 } 833 }
797 834
835
836 if (cpu_has_bts)
837 ds_init_intel(c);
838
798 n = c->extended_cpuid_level; 839 n = c->extended_cpuid_level;
799 if (n >= 0x80000008) { 840 if (n >= 0x80000008) {
800 unsigned eax = cpuid_eax(0x80000008); 841 unsigned eax = cpuid_eax(0x80000008);
@@ -811,14 +852,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
811 c->x86_cache_alignment = c->x86_clflush_size * 2; 852 c->x86_cache_alignment = c->x86_clflush_size * 2;
812 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 853 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
813 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 854 (c->x86 == 0x6 && c->x86_model >= 0x0e))
814 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); 855 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
815 if (c->x86 == 6) 856 if (c->x86 == 6)
816 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 857 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
817 if (c->x86 == 15) 858 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
818 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); 859 c->x86_max_cores = intel_num_cpu_cores(c);
819 else
820 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
821 c->x86_max_cores = intel_num_cpu_cores(c);
822 860
823 srat_detect_node(); 861 srat_detect_node();
824} 862}
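
The ARCH_PERFMON test near the top of init_intel() accepts architectural perfmon only when CPUID leaf 10 reports a nonzero version in EAX[7:0] and more than one general-purpose counter in EAX[15:8]. A standalone decode of the same leaf, again assuming <cpuid.h> on an x86 build:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(10, &eax, &ebx, &ecx, &edx)) {
            puts("CPUID leaf 10 not available");
            return 1;
        }
        printf("perfmon version %u, %u general-purpose counters -> arch_perfmon %s\n",
               eax & 0xff, (eax >> 8) & 0xff,
               ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) ? "usable" : "not flagged");
        return 0;
    }
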
@@ -835,18 +873,12 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
835 c->x86_vendor = X86_VENDOR_UNKNOWN; 873 c->x86_vendor = X86_VENDOR_UNKNOWN;
836} 874}
837 875
838struct cpu_model_info {
839 int vendor;
840 int family;
841 char *model_names[16];
842};
843
844/* Do some early cpuid on the boot CPU to get some parameter that are 876/* Do some early cpuid on the boot CPU to get some parameter that are
845 needed before check_bugs. Everything advanced is in identify_cpu 877 needed before check_bugs. Everything advanced is in identify_cpu
846 below. */ 878 below. */
847void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) 879static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
848{ 880{
849 u32 tfms; 881 u32 tfms, xlvl;
850 882
851 c->loops_per_jiffy = loops_per_jiffy; 883 c->loops_per_jiffy = loops_per_jiffy;
852 c->x86_cache_size = -1; 884 c->x86_cache_size = -1;
@@ -857,6 +889,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
857 c->x86_clflush_size = 64; 889 c->x86_clflush_size = 64;
858 c->x86_cache_alignment = c->x86_clflush_size; 890 c->x86_cache_alignment = c->x86_clflush_size;
859 c->x86_max_cores = 1; 891 c->x86_max_cores = 1;
892 c->x86_coreid_bits = 0;
860 c->extended_cpuid_level = 0; 893 c->extended_cpuid_level = 0;
861 memset(&c->x86_capability, 0, sizeof c->x86_capability); 894 memset(&c->x86_capability, 0, sizeof c->x86_capability);
862 895
@@ -865,7 +898,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
865 (unsigned int *)&c->x86_vendor_id[0], 898 (unsigned int *)&c->x86_vendor_id[0],
866 (unsigned int *)&c->x86_vendor_id[8], 899 (unsigned int *)&c->x86_vendor_id[8],
867 (unsigned int *)&c->x86_vendor_id[4]); 900 (unsigned int *)&c->x86_vendor_id[4]);
868 901
869 get_cpu_vendor(c); 902 get_cpu_vendor(c);
870 903
871 /* Initialize the standard set of capabilities */ 904 /* Initialize the standard set of capabilities */
@@ -883,7 +916,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
883 c->x86 += (tfms >> 20) & 0xff; 916 c->x86 += (tfms >> 20) & 0xff;
884 if (c->x86 >= 0x6) 917 if (c->x86 >= 0x6)
885 c->x86_model += ((tfms >> 16) & 0xF) << 4; 918 c->x86_model += ((tfms >> 16) & 0xF) << 4;
886 if (c->x86_capability[0] & (1<<19)) 919 if (c->x86_capability[0] & (1<<19))
887 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 920 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
888 } else { 921 } else {
889 /* Have CPUID level 0 only - unheard of */ 922 /* Have CPUID level 0 only - unheard of */
@@ -893,18 +926,6 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
893#ifdef CONFIG_SMP 926#ifdef CONFIG_SMP
894 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; 927 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
895#endif 928#endif
896}
897
898/*
899 * This does the hard work of actually picking apart the CPU stuff...
900 */
901void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
902{
903 int i;
904 u32 xlvl;
905
906 early_identify_cpu(c);
907
908 /* AMD-defined flags: level 0x80000001 */ 929 /* AMD-defined flags: level 0x80000001 */
909 xlvl = cpuid_eax(0x80000000); 930 xlvl = cpuid_eax(0x80000000);
910 c->extended_cpuid_level = xlvl; 931 c->extended_cpuid_level = xlvl;
@@ -925,6 +946,30 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
925 c->x86_capability[2] = cpuid_edx(0x80860001); 946 c->x86_capability[2] = cpuid_edx(0x80860001);
926 } 947 }
927 948
949 c->extended_cpuid_level = cpuid_eax(0x80000000);
950 if (c->extended_cpuid_level >= 0x80000007)
951 c->x86_power = cpuid_edx(0x80000007);
952
953 switch (c->x86_vendor) {
954 case X86_VENDOR_AMD:
955 early_init_amd(c);
956 break;
957 case X86_VENDOR_INTEL:
958 early_init_intel(c);
959 break;
960 }
961
962}
963
964/*
965 * This does the hard work of actually picking apart the CPU stuff...
966 */
967void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
968{
969 int i;
970
971 early_identify_cpu(c);
972
928 init_scattered_cpuid_features(c); 973 init_scattered_cpuid_features(c);
929 974
930 c->apicid = phys_pkg_id(0); 975 c->apicid = phys_pkg_id(0);
@@ -954,8 +999,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
954 break; 999 break;
955 } 1000 }
956 1001
957 select_idle_routine(c); 1002 detect_ht(c);
958 detect_ht(c);
959 1003
960 /* 1004 /*
961 * On SMP, boot_cpu_data holds the common feature set between 1005 * On SMP, boot_cpu_data holds the common feature set between
@@ -965,31 +1009,55 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
965 */ 1009 */
966 if (c != &boot_cpu_data) { 1010 if (c != &boot_cpu_data) {
967 /* AND the already accumulated flags with these */ 1011 /* AND the already accumulated flags with these */
968 for (i = 0 ; i < NCAPINTS ; i++) 1012 for (i = 0; i < NCAPINTS; i++)
969 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 1013 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
970 } 1014 }
971 1015
 1016 /* Clear all flags overridden by options */
1017 for (i = 0; i < NCAPINTS; i++)
1018 c->x86_capability[i] ^= cleared_cpu_caps[i];
1019
972#ifdef CONFIG_X86_MCE 1020#ifdef CONFIG_X86_MCE
973 mcheck_init(c); 1021 mcheck_init(c);
974#endif 1022#endif
1023 select_idle_routine(c);
1024
975 if (c != &boot_cpu_data) 1025 if (c != &boot_cpu_data)
976 mtrr_ap_init(); 1026 mtrr_ap_init();
977#ifdef CONFIG_NUMA 1027#ifdef CONFIG_NUMA
978 numa_add_cpu(smp_processor_id()); 1028 numa_add_cpu(smp_processor_id());
979#endif 1029#endif
1030
980} 1031}
981 1032
1033static __init int setup_noclflush(char *arg)
1034{
1035 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1036 return 1;
1037}
1038__setup("noclflush", setup_noclflush);
982 1039
983void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 1040void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
984{ 1041{
985 if (c->x86_model_id[0]) 1042 if (c->x86_model_id[0])
986 printk("%s", c->x86_model_id); 1043 printk(KERN_INFO "%s", c->x86_model_id);
987 1044
988 if (c->x86_mask || c->cpuid_level >= 0) 1045 if (c->x86_mask || c->cpuid_level >= 0)
989 printk(" stepping %02x\n", c->x86_mask); 1046 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
990 else 1047 else
991 printk("\n"); 1048 printk(KERN_CONT "\n");
1049}
1050
1051static __init int setup_disablecpuid(char *arg)
1052{
1053 int bit;
1054 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1055 setup_clear_cpu_cap(bit);
1056 else
1057 return 0;
1058 return 1;
992} 1059}
1060__setup("clearcpuid=", setup_disablecpuid);
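
The two __setup() handlers above add boot-time switches: noclflush masks the CLFLUSH feature outright, and clearcpuid=<bit> hides any single capability bit as long as the index stays below NCAPINTS*32. A userspace sketch of the clearcpuid parse-and-bounds-check step (get_option() and setup_clear_cpu_cap() are kernel helpers; strtol and a local bitmap stand in for them here):

    #include <stdio.h>
    #include <stdlib.h>

    #define NCAPINTS 8   /* illustrative; matches the kernel's capability word count */

    static unsigned int cleared_caps[NCAPINTS];

    static int parse_clearcpuid(const char *arg)   /* mirrors setup_disablecpuid() */
    {
        char *end;
        long bit = strtol(arg, &end, 10);

        if (end == arg || bit < 0 || bit >= NCAPINTS * 32)
            return 0;                              /* reject out-of-range input */
        cleared_caps[bit / 32] |= 1u << (bit % 32);
        return 1;
    }

    int main(void)
    {
        printf("clearcpuid=154 -> %d, clearcpuid=999 -> %d\n",
               parse_clearcpuid("154"), parse_clearcpuid("999"));
        return 0;
    }
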
993 1061
994/* 1062/*
995 * Get CPU information for use by the procfs. 1063 * Get CPU information for use by the procfs.
@@ -998,116 +1066,41 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
998static int show_cpuinfo(struct seq_file *m, void *v) 1066static int show_cpuinfo(struct seq_file *m, void *v)
999{ 1067{
1000 struct cpuinfo_x86 *c = v; 1068 struct cpuinfo_x86 *c = v;
1001 int cpu = 0; 1069 int cpu = 0, i;
1002
1003 /*
1004 * These flag bits must match the definitions in <asm/cpufeature.h>.
1005 * NULL means this bit is undefined or reserved; either way it doesn't
1006 * have meaning as far as Linux is concerned. Note that it's important
1007 * to realize there is a difference between this table and CPUID -- if
1008 * applications want to get the raw CPUID data, they should access
1009 * /dev/cpu/<cpu_nr>/cpuid instead.
1010 */
1011 static const char *const x86_cap_flags[] = {
1012 /* Intel-defined */
1013 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
1014 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
1015 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
1016 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
1017
1018 /* AMD-defined */
1019 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1020 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
1021 NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
1022 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
1023 "3dnowext", "3dnow",
1024
1025 /* Transmeta-defined */
1026 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
1027 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1028 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1029 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1030
1031 /* Other (Linux-defined) */
1032 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
1033 NULL, NULL, NULL, NULL,
1034 "constant_tsc", "up", NULL, "arch_perfmon",
1035 "pebs", "bts", NULL, "sync_rdtsc",
1036 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1037 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1038
1039 /* Intel-defined (#2) */
1040 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
1041 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
1042 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
1043 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1044
1045 /* VIA/Cyrix/Centaur-defined */
1046 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
1047 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
1048 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1049 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1050
1051 /* AMD-defined (#2) */
1052 "lahf_lm", "cmp_legacy", "svm", "extapic",
1053 "cr8_legacy", "abm", "sse4a", "misalignsse",
1054 "3dnowprefetch", "osvw", "ibs", "sse5",
1055 "skinit", "wdt", NULL, NULL,
1056 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1057 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1058
1059 /* Auxiliary (Linux-defined) */
1060 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1061 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1062 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1063 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1064 };
1065 static const char *const x86_power_flags[] = {
1066 "ts", /* temperature sensor */
1067 "fid", /* frequency id control */
1068 "vid", /* voltage id control */
1069 "ttp", /* thermal trip */
1070 "tm",
1071 "stc",
1072 "100mhzsteps",
1073 "hwpstate",
1074 "", /* tsc invariant mapped to constant_tsc */
1075 /* nothing */
1076 };
1077
1078 1070
1079#ifdef CONFIG_SMP 1071#ifdef CONFIG_SMP
1080 cpu = c->cpu_index; 1072 cpu = c->cpu_index;
1081#endif 1073#endif
1082 1074
1083 seq_printf(m,"processor\t: %u\n" 1075 seq_printf(m, "processor\t: %u\n"
1084 "vendor_id\t: %s\n" 1076 "vendor_id\t: %s\n"
1085 "cpu family\t: %d\n" 1077 "cpu family\t: %d\n"
1086 "model\t\t: %d\n" 1078 "model\t\t: %d\n"
1087 "model name\t: %s\n", 1079 "model name\t: %s\n",
1088 (unsigned)cpu, 1080 (unsigned)cpu,
1089 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", 1081 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
1090 c->x86, 1082 c->x86,
1091 (int)c->x86_model, 1083 (int)c->x86_model,
1092 c->x86_model_id[0] ? c->x86_model_id : "unknown"); 1084 c->x86_model_id[0] ? c->x86_model_id : "unknown");
1093 1085
1094 if (c->x86_mask || c->cpuid_level >= 0) 1086 if (c->x86_mask || c->cpuid_level >= 0)
1095 seq_printf(m, "stepping\t: %d\n", c->x86_mask); 1087 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
1096 else 1088 else
1097 seq_printf(m, "stepping\t: unknown\n"); 1089 seq_printf(m, "stepping\t: unknown\n");
1098 1090
1099 if (cpu_has(c,X86_FEATURE_TSC)) { 1091 if (cpu_has(c, X86_FEATURE_TSC)) {
1100 unsigned int freq = cpufreq_quick_get((unsigned)cpu); 1092 unsigned int freq = cpufreq_quick_get((unsigned)cpu);
1093
1101 if (!freq) 1094 if (!freq)
1102 freq = cpu_khz; 1095 freq = cpu_khz;
1103 seq_printf(m, "cpu MHz\t\t: %u.%03u\n", 1096 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
1104 freq / 1000, (freq % 1000)); 1097 freq / 1000, (freq % 1000));
1105 } 1098 }
1106 1099
1107 /* Cache size */ 1100 /* Cache size */
1108 if (c->x86_cache_size >= 0) 1101 if (c->x86_cache_size >= 0)
1109 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); 1102 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
1110 1103
1111#ifdef CONFIG_SMP 1104#ifdef CONFIG_SMP
1112 if (smp_num_siblings * c->x86_max_cores > 1) { 1105 if (smp_num_siblings * c->x86_max_cores > 1) {
1113 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); 1106 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
@@ -1116,48 +1109,43 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1116 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); 1109 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
1117 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); 1110 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
1118 } 1111 }
1119#endif 1112#endif
1120 1113
1121 seq_printf(m, 1114 seq_printf(m,
1122 "fpu\t\t: yes\n" 1115 "fpu\t\t: yes\n"
1123 "fpu_exception\t: yes\n" 1116 "fpu_exception\t: yes\n"
1124 "cpuid level\t: %d\n" 1117 "cpuid level\t: %d\n"
1125 "wp\t\t: yes\n" 1118 "wp\t\t: yes\n"
1126 "flags\t\t:", 1119 "flags\t\t:",
1127 c->cpuid_level); 1120 c->cpuid_level);
1128 1121
1129 { 1122 for (i = 0; i < 32*NCAPINTS; i++)
1130 int i; 1123 if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
1131 for ( i = 0 ; i < 32*NCAPINTS ; i++ ) 1124 seq_printf(m, " %s", x86_cap_flags[i]);
1132 if (cpu_has(c, i) && x86_cap_flags[i] != NULL) 1125
1133 seq_printf(m, " %s", x86_cap_flags[i]);
1134 }
1135
1136 seq_printf(m, "\nbogomips\t: %lu.%02lu\n", 1126 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
1137 c->loops_per_jiffy/(500000/HZ), 1127 c->loops_per_jiffy/(500000/HZ),
1138 (c->loops_per_jiffy/(5000/HZ)) % 100); 1128 (c->loops_per_jiffy/(5000/HZ)) % 100);
1139 1129
1140 if (c->x86_tlbsize > 0) 1130 if (c->x86_tlbsize > 0)
1141 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); 1131 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
1142 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); 1132 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
1143 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); 1133 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
1144 1134
1145 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 1135 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
1146 c->x86_phys_bits, c->x86_virt_bits); 1136 c->x86_phys_bits, c->x86_virt_bits);
1147 1137
1148 seq_printf(m, "power management:"); 1138 seq_printf(m, "power management:");
1149 { 1139 for (i = 0; i < 32; i++) {
1150 unsigned i; 1140 if (c->x86_power & (1 << i)) {
1151 for (i = 0; i < 32; i++) 1141 if (i < ARRAY_SIZE(x86_power_flags) &&
1152 if (c->x86_power & (1 << i)) { 1142 x86_power_flags[i])
1153 if (i < ARRAY_SIZE(x86_power_flags) && 1143 seq_printf(m, "%s%s",
1154 x86_power_flags[i]) 1144 x86_power_flags[i][0]?" ":"",
1155 seq_printf(m, "%s%s", 1145 x86_power_flags[i]);
1156 x86_power_flags[i][0]?" ":"", 1146 else
1157 x86_power_flags[i]); 1147 seq_printf(m, " [%d]", i);
1158 else 1148 }
1159 seq_printf(m, " [%d]", i);
1160 }
1161 } 1149 }
1162 1150
1163 seq_printf(m, "\n\n"); 1151 seq_printf(m, "\n\n");
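
The hunk above flattens the capability-flags loop: every bit of the 32*NCAPINTS feature bitmap is tested with cpu_has() and printed only when a name exists in x86_cap_flags[]. Below is a minimal userspace sketch of the same walk; cap_flags, caps and has_cap are made-up stand-ins for the kernel's tables, not part of the patch.

#include <stdio.h>

#define NCAPINTS 1      /* assumption: a single 32-bit capability word */

static const char *cap_flags[32 * NCAPINTS] = {
        [0] = "fpu", [4] = "tsc", [6] = "pae", [25] = "sse",
};
static unsigned int caps[NCAPINTS] = { (1u << 0) | (1u << 4) | (1u << 25) };

static int has_cap(unsigned int bit)
{
        return caps[bit / 32] & (1u << (bit % 32));
}

int main(void)
{
        unsigned int i;

        printf("flags\t\t:");
        for (i = 0; i < 32 * NCAPINTS; i++)
                if (has_cap(i) && cap_flags[i] != NULL)
                        printf(" %s", cap_flags[i]);
        printf("\n");
        return 0;
}

The power-management block a few lines further up follows the same pattern, except that a set bit without a name is printed as its index in brackets.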
@@ -1184,8 +1172,8 @@ static void c_stop(struct seq_file *m, void *v)
1184{ 1172{
1185} 1173}
1186 1174
1187struct seq_operations cpuinfo_op = { 1175const struct seq_operations cpuinfo_op = {
1188 .start =c_start, 1176 .start = c_start,
1189 .next = c_next, 1177 .next = c_next,
1190 .stop = c_stop, 1178 .stop = c_stop,
1191 .show = show_cpuinfo, 1179 .show = show_cpuinfo,
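
The bogomips line printed by show_cpuinfo() above is plain fixed-point arithmetic: the integer part is loops_per_jiffy * HZ / 500000, and the two decimals come from the /(5000/HZ) % 100 term. A small worked example with assumed values (HZ = 250, loops_per_jiffy = 4,000,000 are illustrative, not taken from the patch):

#include <stdio.h>

#define HZ 250                                          /* example tick rate */

int main(void)
{
        unsigned long loops_per_jiffy = 4000000UL;      /* example calibration */

        printf("bogomips\t: %lu.%02lu\n",
               loops_per_jiffy / (500000 / HZ),
               (loops_per_jiffy / (5000 / HZ)) % 100);
        return 0;
}

With these numbers the sketch prints "bogomips : 2000.00".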
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 9bdd83022f5..caee1f002fe 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -23,6 +23,7 @@
23#include <asm/ucontext.h> 23#include <asm/ucontext.h>
24#include <asm/uaccess.h> 24#include <asm/uaccess.h>
25#include <asm/i387.h> 25#include <asm/i387.h>
26#include <asm/vdso.h>
26#include "sigframe_32.h" 27#include "sigframe_32.h"
27 28
28#define DEBUG_SIG 0 29#define DEBUG_SIG 0
@@ -81,14 +82,14 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
81} 82}
82 83
83asmlinkage int 84asmlinkage int
84sys_sigaltstack(unsigned long ebx) 85sys_sigaltstack(unsigned long bx)
85{ 86{
86 /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */ 87 /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
87 struct pt_regs *regs = (struct pt_regs *)&ebx; 88 struct pt_regs *regs = (struct pt_regs *)&bx;
88 const stack_t __user *uss = (const stack_t __user *)ebx; 89 const stack_t __user *uss = (const stack_t __user *)bx;
89 stack_t __user *uoss = (stack_t __user *)regs->ecx; 90 stack_t __user *uoss = (stack_t __user *)regs->cx;
90 91
91 return do_sigaltstack(uss, uoss, regs->esp); 92 return do_sigaltstack(uss, uoss, regs->sp);
92} 93}
93 94
94 95
@@ -109,12 +110,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
109#define COPY_SEG(seg) \ 110#define COPY_SEG(seg) \
110 { unsigned short tmp; \ 111 { unsigned short tmp; \
111 err |= __get_user(tmp, &sc->seg); \ 112 err |= __get_user(tmp, &sc->seg); \
112 regs->x##seg = tmp; } 113 regs->seg = tmp; }
113 114
114#define COPY_SEG_STRICT(seg) \ 115#define COPY_SEG_STRICT(seg) \
115 { unsigned short tmp; \ 116 { unsigned short tmp; \
116 err |= __get_user(tmp, &sc->seg); \ 117 err |= __get_user(tmp, &sc->seg); \
117 regs->x##seg = tmp|3; } 118 regs->seg = tmp|3; }
118 119
119#define GET_SEG(seg) \ 120#define GET_SEG(seg) \
120 { unsigned short tmp; \ 121 { unsigned short tmp; \
@@ -130,22 +131,22 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
130 COPY_SEG(fs); 131 COPY_SEG(fs);
131 COPY_SEG(es); 132 COPY_SEG(es);
132 COPY_SEG(ds); 133 COPY_SEG(ds);
133 COPY(edi); 134 COPY(di);
134 COPY(esi); 135 COPY(si);
135 COPY(ebp); 136 COPY(bp);
136 COPY(esp); 137 COPY(sp);
137 COPY(ebx); 138 COPY(bx);
138 COPY(edx); 139 COPY(dx);
139 COPY(ecx); 140 COPY(cx);
140 COPY(eip); 141 COPY(ip);
141 COPY_SEG_STRICT(cs); 142 COPY_SEG_STRICT(cs);
142 COPY_SEG_STRICT(ss); 143 COPY_SEG_STRICT(ss);
143 144
144 { 145 {
145 unsigned int tmpflags; 146 unsigned int tmpflags;
146 err |= __get_user(tmpflags, &sc->eflags); 147 err |= __get_user(tmpflags, &sc->flags);
147 regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 148 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
148 regs->orig_eax = -1; /* disable syscall checks */ 149 regs->orig_ax = -1; /* disable syscall checks */
149 } 150 }
150 151
151 { 152 {
@@ -164,7 +165,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
164 } 165 }
165 } 166 }
166 167
167 err |= __get_user(*peax, &sc->eax); 168 err |= __get_user(*peax, &sc->ax);
168 return err; 169 return err;
169 170
170badframe: 171badframe:
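
The restore_sigcontext() hunks above work because the pt_regs rename gives the kernel and the sigcontext the same short field names (di, si, ip, ...), so COPY() and COPY_SEG() no longer need the old "e"/"x" prefix pasting. A self-contained sketch of that macro pattern, with simplified stand-in structs rather than the real ones:

#include <stdio.h>

struct sigcontext { long di, si, ip; };
struct pt_regs    { long di, si, ip; };

#define COPY(x) (regs->x = sc->x)

int main(void)
{
        struct sigcontext sctx = { .di = 1, .si = 2, .ip = 0x400000 };
        struct pt_regs r = { 0 };
        struct sigcontext *sc = &sctx;
        struct pt_regs *regs = &r;

        COPY(di); COPY(si); COPY(ip);
        printf("di=%ld si=%ld ip=%#lx\n", regs->di, regs->si,
               (unsigned long)regs->ip);
        return 0;
}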
@@ -174,9 +175,9 @@ badframe:
174asmlinkage int sys_sigreturn(unsigned long __unused) 175asmlinkage int sys_sigreturn(unsigned long __unused)
175{ 176{
176 struct pt_regs *regs = (struct pt_regs *) &__unused; 177 struct pt_regs *regs = (struct pt_regs *) &__unused;
177 struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8); 178 struct sigframe __user *frame = (struct sigframe __user *)(regs->sp - 8);
178 sigset_t set; 179 sigset_t set;
179 int eax; 180 int ax;
180 181
181 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 182 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
182 goto badframe; 183 goto badframe;
@@ -192,17 +193,20 @@ asmlinkage int sys_sigreturn(unsigned long __unused)
192 recalc_sigpending(); 193 recalc_sigpending();
193 spin_unlock_irq(&current->sighand->siglock); 194 spin_unlock_irq(&current->sighand->siglock);
194 195
195 if (restore_sigcontext(regs, &frame->sc, &eax)) 196 if (restore_sigcontext(regs, &frame->sc, &ax))
196 goto badframe; 197 goto badframe;
197 return eax; 198 return ax;
198 199
199badframe: 200badframe:
200 if (show_unhandled_signals && printk_ratelimit()) 201 if (show_unhandled_signals && printk_ratelimit()) {
201 printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" 202 printk("%s%s[%d] bad frame in sigreturn frame:%p ip:%lx"
202 " esp:%lx oeax:%lx\n", 203 " sp:%lx oeax:%lx",
203 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 204 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
204 current->comm, task_pid_nr(current), frame, regs->eip, 205 current->comm, task_pid_nr(current), frame, regs->ip,
205 regs->esp, regs->orig_eax); 206 regs->sp, regs->orig_ax);
207 print_vma_addr(" in ", regs->ip);
208 printk("\n");
209 }
206 210
207 force_sig(SIGSEGV, current); 211 force_sig(SIGSEGV, current);
208 return 0; 212 return 0;
@@ -211,9 +215,9 @@ badframe:
211asmlinkage int sys_rt_sigreturn(unsigned long __unused) 215asmlinkage int sys_rt_sigreturn(unsigned long __unused)
212{ 216{
213 struct pt_regs *regs = (struct pt_regs *) &__unused; 217 struct pt_regs *regs = (struct pt_regs *) &__unused;
214 struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4); 218 struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->sp - 4);
215 sigset_t set; 219 sigset_t set;
216 int eax; 220 int ax;
217 221
218 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 222 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
219 goto badframe; 223 goto badframe;
@@ -226,13 +230,13 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
226 recalc_sigpending(); 230 recalc_sigpending();
227 spin_unlock_irq(&current->sighand->siglock); 231 spin_unlock_irq(&current->sighand->siglock);
228 232
229 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) 233 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
230 goto badframe; 234 goto badframe;
231 235
232 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT) 236 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
233 goto badframe; 237 goto badframe;
234 238
235 return eax; 239 return ax;
236 240
237badframe: 241badframe:
238 force_sig(SIGSEGV, current); 242 force_sig(SIGSEGV, current);
@@ -249,27 +253,27 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
249{ 253{
250 int tmp, err = 0; 254 int tmp, err = 0;
251 255
252 err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); 256 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
253 savesegment(gs, tmp); 257 savesegment(gs, tmp);
254 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 258 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
255 259
256 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); 260 err |= __put_user(regs->es, (unsigned int __user *)&sc->es);
257 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); 261 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
258 err |= __put_user(regs->edi, &sc->edi); 262 err |= __put_user(regs->di, &sc->di);
259 err |= __put_user(regs->esi, &sc->esi); 263 err |= __put_user(regs->si, &sc->si);
260 err |= __put_user(regs->ebp, &sc->ebp); 264 err |= __put_user(regs->bp, &sc->bp);
261 err |= __put_user(regs->esp, &sc->esp); 265 err |= __put_user(regs->sp, &sc->sp);
262 err |= __put_user(regs->ebx, &sc->ebx); 266 err |= __put_user(regs->bx, &sc->bx);
263 err |= __put_user(regs->edx, &sc->edx); 267 err |= __put_user(regs->dx, &sc->dx);
264 err |= __put_user(regs->ecx, &sc->ecx); 268 err |= __put_user(regs->cx, &sc->cx);
265 err |= __put_user(regs->eax, &sc->eax); 269 err |= __put_user(regs->ax, &sc->ax);
266 err |= __put_user(current->thread.trap_no, &sc->trapno); 270 err |= __put_user(current->thread.trap_no, &sc->trapno);
267 err |= __put_user(current->thread.error_code, &sc->err); 271 err |= __put_user(current->thread.error_code, &sc->err);
268 err |= __put_user(regs->eip, &sc->eip); 272 err |= __put_user(regs->ip, &sc->ip);
269 err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); 273 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
270 err |= __put_user(regs->eflags, &sc->eflags); 274 err |= __put_user(regs->flags, &sc->flags);
271 err |= __put_user(regs->esp, &sc->esp_at_signal); 275 err |= __put_user(regs->sp, &sc->sp_at_signal);
272 err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); 276 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
273 277
274 tmp = save_i387(fpstate); 278 tmp = save_i387(fpstate);
275 if (tmp < 0) 279 if (tmp < 0)
@@ -290,29 +294,36 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
290static inline void __user * 294static inline void __user *
291get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) 295get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
292{ 296{
293 unsigned long esp; 297 unsigned long sp;
294 298
295 /* Default to using normal stack */ 299 /* Default to using normal stack */
296 esp = regs->esp; 300 sp = regs->sp;
301
302 /*
303 * If we are on the alternate signal stack and would overflow it, don't.
304 * Return an always-bogus address instead so we will die with SIGSEGV.
305 */
306 if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
307 return (void __user *) -1L;
297 308
298 /* This is the X/Open sanctioned signal stack switching. */ 309 /* This is the X/Open sanctioned signal stack switching. */
299 if (ka->sa.sa_flags & SA_ONSTACK) { 310 if (ka->sa.sa_flags & SA_ONSTACK) {
300 if (sas_ss_flags(esp) == 0) 311 if (sas_ss_flags(sp) == 0)
301 esp = current->sas_ss_sp + current->sas_ss_size; 312 sp = current->sas_ss_sp + current->sas_ss_size;
302 } 313 }
303 314
304 /* This is the legacy signal stack switching. */ 315 /* This is the legacy signal stack switching. */
305 else if ((regs->xss & 0xffff) != __USER_DS && 316 else if ((regs->ss & 0xffff) != __USER_DS &&
306 !(ka->sa.sa_flags & SA_RESTORER) && 317 !(ka->sa.sa_flags & SA_RESTORER) &&
307 ka->sa.sa_restorer) { 318 ka->sa.sa_restorer) {
308 esp = (unsigned long) ka->sa.sa_restorer; 319 sp = (unsigned long) ka->sa.sa_restorer;
309 } 320 }
310 321
311 esp -= frame_size; 322 sp -= frame_size;
312 /* Align the stack pointer according to the i386 ABI, 323 /* Align the stack pointer according to the i386 ABI,
313 * i.e. so that on function entry ((sp + 4) & 15) == 0. */ 324 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
314 esp = ((esp + 4) & -16ul) - 4; 325 sp = ((sp + 4) & -16ul) - 4;
315 return (void __user *) esp; 326 return (void __user *) sp;
316} 327}
317 328
318/* These symbols are defined with the addresses in the vsyscall page. 329/* These symbols are defined with the addresses in the vsyscall page.
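
get_sigframe() above now refuses to overflow the alternate signal stack and then aligns the frame per the i386 ABI, so that ((sp + 4) & 15) == 0 on handler entry. A worked example of that alignment arithmetic, using an arbitrary example stack pointer and frame size:

#include <stdio.h>

int main(void)
{
        unsigned long sp = 0xbfffe123UL;        /* example user stack pointer */
        unsigned long frame_size = 0x2e0;       /* example sigframe size */

        sp -= frame_size;
        sp = ((sp + 4) & -16UL) - 4;

        printf("frame at %#lx, (sp + 4) %% 16 = %lu\n", sp, (sp + 4) % 16);
        return 0;
}

For this input the frame lands at 0xbfffde3c, and (sp + 4) is 16-byte aligned as the comment in the hunk requires.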
@@ -355,9 +366,9 @@ static int setup_frame(int sig, struct k_sigaction *ka,
355 } 366 }
356 367
357 if (current->binfmt->hasvdso) 368 if (current->binfmt->hasvdso)
358 restorer = (void *)VDSO_SYM(&__kernel_sigreturn); 369 restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
359 else 370 else
360 restorer = (void *)&frame->retcode; 371 restorer = &frame->retcode;
361 if (ka->sa.sa_flags & SA_RESTORER) 372 if (ka->sa.sa_flags & SA_RESTORER)
362 restorer = ka->sa.sa_restorer; 373 restorer = ka->sa.sa_restorer;
363 374
@@ -379,16 +390,16 @@ static int setup_frame(int sig, struct k_sigaction *ka,
379 goto give_sigsegv; 390 goto give_sigsegv;
380 391
381 /* Set up registers for signal handler */ 392 /* Set up registers for signal handler */
382 regs->esp = (unsigned long) frame; 393 regs->sp = (unsigned long) frame;
383 regs->eip = (unsigned long) ka->sa.sa_handler; 394 regs->ip = (unsigned long) ka->sa.sa_handler;
384 regs->eax = (unsigned long) sig; 395 regs->ax = (unsigned long) sig;
385 regs->edx = (unsigned long) 0; 396 regs->dx = (unsigned long) 0;
386 regs->ecx = (unsigned long) 0; 397 regs->cx = (unsigned long) 0;
387 398
388 regs->xds = __USER_DS; 399 regs->ds = __USER_DS;
389 regs->xes = __USER_DS; 400 regs->es = __USER_DS;
390 regs->xss = __USER_DS; 401 regs->ss = __USER_DS;
391 regs->xcs = __USER_CS; 402 regs->cs = __USER_CS;
392 403
393 /* 404 /*
394 * Clear TF when entering the signal handler, but 405 * Clear TF when entering the signal handler, but
@@ -396,13 +407,13 @@ static int setup_frame(int sig, struct k_sigaction *ka,
396 * The tracer may want to single-step inside the 407 * The tracer may want to single-step inside the
397 * handler too. 408 * handler too.
398 */ 409 */
399 regs->eflags &= ~TF_MASK; 410 regs->flags &= ~TF_MASK;
400 if (test_thread_flag(TIF_SINGLESTEP)) 411 if (test_thread_flag(TIF_SINGLESTEP))
401 ptrace_notify(SIGTRAP); 412 ptrace_notify(SIGTRAP);
402 413
403#if DEBUG_SIG 414#if DEBUG_SIG
404 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 415 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
405 current->comm, current->pid, frame, regs->eip, frame->pretcode); 416 current->comm, current->pid, frame, regs->ip, frame->pretcode);
406#endif 417#endif
407 418
408 return 0; 419 return 0;
@@ -442,7 +453,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
442 err |= __put_user(0, &frame->uc.uc_flags); 453 err |= __put_user(0, &frame->uc.uc_flags);
443 err |= __put_user(0, &frame->uc.uc_link); 454 err |= __put_user(0, &frame->uc.uc_link);
444 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 455 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
445 err |= __put_user(sas_ss_flags(regs->esp), 456 err |= __put_user(sas_ss_flags(regs->sp),
446 &frame->uc.uc_stack.ss_flags); 457 &frame->uc.uc_stack.ss_flags);
447 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 458 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
448 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 459 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
@@ -452,13 +463,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
452 goto give_sigsegv; 463 goto give_sigsegv;
453 464
454 /* Set up to return from userspace. */ 465 /* Set up to return from userspace. */
455 restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); 466 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
456 if (ka->sa.sa_flags & SA_RESTORER) 467 if (ka->sa.sa_flags & SA_RESTORER)
457 restorer = ka->sa.sa_restorer; 468 restorer = ka->sa.sa_restorer;
458 err |= __put_user(restorer, &frame->pretcode); 469 err |= __put_user(restorer, &frame->pretcode);
459 470
460 /* 471 /*
461 * This is movl $,%eax ; int $0x80 472 * This is movl $,%ax ; int $0x80
462 * 473 *
463 * WE DO NOT USE IT ANY MORE! It's only left here for historical 474 * WE DO NOT USE IT ANY MORE! It's only left here for historical
464 * reasons and because gdb uses it as a signature to notice 475 * reasons and because gdb uses it as a signature to notice
@@ -472,16 +483,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
472 goto give_sigsegv; 483 goto give_sigsegv;
473 484
474 /* Set up registers for signal handler */ 485 /* Set up registers for signal handler */
475 regs->esp = (unsigned long) frame; 486 regs->sp = (unsigned long) frame;
476 regs->eip = (unsigned long) ka->sa.sa_handler; 487 regs->ip = (unsigned long) ka->sa.sa_handler;
477 regs->eax = (unsigned long) usig; 488 regs->ax = (unsigned long) usig;
478 regs->edx = (unsigned long) &frame->info; 489 regs->dx = (unsigned long) &frame->info;
479 regs->ecx = (unsigned long) &frame->uc; 490 regs->cx = (unsigned long) &frame->uc;
480 491
481 regs->xds = __USER_DS; 492 regs->ds = __USER_DS;
482 regs->xes = __USER_DS; 493 regs->es = __USER_DS;
483 regs->xss = __USER_DS; 494 regs->ss = __USER_DS;
484 regs->xcs = __USER_CS; 495 regs->cs = __USER_CS;
485 496
486 /* 497 /*
487 * Clear TF when entering the signal handler, but 498 * Clear TF when entering the signal handler, but
@@ -489,13 +500,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
489 * The tracer may want to single-step inside the 500 * The tracer may want to single-step inside the
490 * handler too. 501 * handler too.
491 */ 502 */
492 regs->eflags &= ~TF_MASK; 503 regs->flags &= ~TF_MASK;
493 if (test_thread_flag(TIF_SINGLESTEP)) 504 if (test_thread_flag(TIF_SINGLESTEP))
494 ptrace_notify(SIGTRAP); 505 ptrace_notify(SIGTRAP);
495 506
496#if DEBUG_SIG 507#if DEBUG_SIG
497 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 508 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
498 current->comm, current->pid, frame, regs->eip, frame->pretcode); 509 current->comm, current->pid, frame, regs->ip, frame->pretcode);
499#endif 510#endif
500 511
501 return 0; 512 return 0;
@@ -516,35 +527,33 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
516 int ret; 527 int ret;
517 528
518 /* Are we from a system call? */ 529 /* Are we from a system call? */
519 if (regs->orig_eax >= 0) { 530 if (regs->orig_ax >= 0) {
520 /* If so, check system call restarting.. */ 531 /* If so, check system call restarting.. */
521 switch (regs->eax) { 532 switch (regs->ax) {
522 case -ERESTART_RESTARTBLOCK: 533 case -ERESTART_RESTARTBLOCK:
523 case -ERESTARTNOHAND: 534 case -ERESTARTNOHAND:
524 regs->eax = -EINTR; 535 regs->ax = -EINTR;
525 break; 536 break;
526 537
527 case -ERESTARTSYS: 538 case -ERESTARTSYS:
528 if (!(ka->sa.sa_flags & SA_RESTART)) { 539 if (!(ka->sa.sa_flags & SA_RESTART)) {
529 regs->eax = -EINTR; 540 regs->ax = -EINTR;
530 break; 541 break;
531 } 542 }
532 /* fallthrough */ 543 /* fallthrough */
533 case -ERESTARTNOINTR: 544 case -ERESTARTNOINTR:
534 regs->eax = regs->orig_eax; 545 regs->ax = regs->orig_ax;
535 regs->eip -= 2; 546 regs->ip -= 2;
536 } 547 }
537 } 548 }
538 549
539 /* 550 /*
540 * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so 551 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
541 * that register information in the sigcontext is correct. 552 * flag so that register information in the sigcontext is correct.
542 */ 553 */
543 if (unlikely(regs->eflags & TF_MASK) 554 if (unlikely(regs->flags & X86_EFLAGS_TF) &&
544 && likely(current->ptrace & PT_DTRACE)) { 555 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
545 current->ptrace &= ~PT_DTRACE; 556 regs->flags &= ~X86_EFLAGS_TF;
546 regs->eflags &= ~TF_MASK;
547 }
548 557
549 /* Set up the stack frame */ 558 /* Set up the stack frame */
550 if (ka->sa.sa_flags & SA_SIGINFO) 559 if (ka->sa.sa_flags & SA_SIGINFO)
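
The restart logic in handle_signal() above re-arms an interrupted system call by restoring orig_ax and stepping ip back two bytes, the length of the "int $0x80" trap instruction, so the call executes again once the handler returns; the non-restartable cases report -EINTR instead. A runnable sketch of the same decision, with a stand-in struct in place of pt_regs (the ERESTART* and SA_RESTART constants are the usual Linux values):

#include <stdio.h>

#define EINTR                   4
#define ERESTARTSYS             512
#define ERESTARTNOINTR          513
#define ERESTARTNOHAND          514
#define ERESTART_RESTARTBLOCK   516
#define SA_RESTART              0x10000000

struct regs { long ax, orig_ax, ip; };

static void check_restart(struct regs *r, unsigned long sa_flags)
{
        switch (r->ax) {
        case -ERESTART_RESTARTBLOCK:
        case -ERESTARTNOHAND:
                r->ax = -EINTR;
                break;
        case -ERESTARTSYS:
                if (!(sa_flags & SA_RESTART)) {
                        r->ax = -EINTR;
                        break;
                }
                /* fallthrough */
        case -ERESTARTNOINTR:
                r->ax = r->orig_ax;
                r->ip -= 2;             /* back up over "int $0x80" */
        }
}

int main(void)
{
        struct regs r = { .ax = -ERESTARTSYS, .orig_ax = 4, .ip = 0x8048100 };

        check_restart(&r, SA_RESTART);
        printf("ax=%ld ip=%#lx\n", r.ax, (unsigned long)r.ip);
        return 0;
}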
@@ -569,7 +578,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
569 * want to handle. Thus you cannot kill init even with a SIGKILL even by 578 * want to handle. Thus you cannot kill init even with a SIGKILL even by
570 * mistake. 579 * mistake.
571 */ 580 */
572static void fastcall do_signal(struct pt_regs *regs) 581static void do_signal(struct pt_regs *regs)
573{ 582{
574 siginfo_t info; 583 siginfo_t info;
575 int signr; 584 int signr;
@@ -599,8 +608,8 @@ static void fastcall do_signal(struct pt_regs *regs)
599 * have been cleared if the watchpoint triggered 608 * have been cleared if the watchpoint triggered
600 * inside the kernel. 609 * inside the kernel.
601 */ 610 */
602 if (unlikely(current->thread.debugreg[7])) 611 if (unlikely(current->thread.debugreg7))
603 set_debugreg(current->thread.debugreg[7], 7); 612 set_debugreg(current->thread.debugreg7, 7);
604 613
605 /* Whee! Actually deliver the signal. */ 614 /* Whee! Actually deliver the signal. */
606 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 615 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
@@ -616,19 +625,19 @@ static void fastcall do_signal(struct pt_regs *regs)
616 } 625 }
617 626
618 /* Did we come from a system call? */ 627 /* Did we come from a system call? */
619 if (regs->orig_eax >= 0) { 628 if (regs->orig_ax >= 0) {
620 /* Restart the system call - no handlers present */ 629 /* Restart the system call - no handlers present */
621 switch (regs->eax) { 630 switch (regs->ax) {
622 case -ERESTARTNOHAND: 631 case -ERESTARTNOHAND:
623 case -ERESTARTSYS: 632 case -ERESTARTSYS:
624 case -ERESTARTNOINTR: 633 case -ERESTARTNOINTR:
625 regs->eax = regs->orig_eax; 634 regs->ax = regs->orig_ax;
626 regs->eip -= 2; 635 regs->ip -= 2;
627 break; 636 break;
628 637
629 case -ERESTART_RESTARTBLOCK: 638 case -ERESTART_RESTARTBLOCK:
630 regs->eax = __NR_restart_syscall; 639 regs->ax = __NR_restart_syscall;
631 regs->eip -= 2; 640 regs->ip -= 2;
632 break; 641 break;
633 } 642 }
634 } 643 }
@@ -651,13 +660,16 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
651{ 660{
652 /* Pending single-step? */ 661 /* Pending single-step? */
653 if (thread_info_flags & _TIF_SINGLESTEP) { 662 if (thread_info_flags & _TIF_SINGLESTEP) {
654 regs->eflags |= TF_MASK; 663 regs->flags |= TF_MASK;
655 clear_thread_flag(TIF_SINGLESTEP); 664 clear_thread_flag(TIF_SINGLESTEP);
656 } 665 }
657 666
658 /* deal with pending signal delivery */ 667 /* deal with pending signal delivery */
659 if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) 668 if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
660 do_signal(regs); 669 do_signal(regs);
670
671 if (thread_info_flags & _TIF_HRTICK_RESCHED)
672 hrtick_resched();
661 673
662 clear_thread_flag(TIF_IRET); 674 clear_thread_flag(TIF_IRET);
663} 675}
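
do_notify_resume() above is a flag dispatcher: each TIF_* bit set on the thread requests one piece of work before the return to user mode, and the patch adds the _TIF_HRTICK_RESCHED hook alongside single-step and signal delivery. A toy model of that dispatch; the flag values here are arbitrary stand-ins, not the kernel's:

#include <stdio.h>

#define TIF_SINGLESTEP          (1u << 0)       /* stand-in values */
#define TIF_SIGPENDING          (1u << 1)
#define TIF_RESTORE_SIGMASK     (1u << 2)
#define TIF_HRTICK_RESCHED      (1u << 3)

static void notify_resume(unsigned int flags)
{
        if (flags & TIF_SINGLESTEP)
                printf("re-set EFLAGS.TF for the pending single-step\n");
        if (flags & (TIF_SIGPENDING | TIF_RESTORE_SIGMASK))
                printf("deliver pending signals\n");
        if (flags & TIF_HRTICK_RESCHED)
                printf("re-arm the high-resolution tick\n");
}

int main(void)
{
        notify_resume(TIF_SIGPENDING | TIF_HRTICK_RESCHED);
        return 0;
}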
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index ab086b0357f..7347bb14e30 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -39,7 +39,7 @@ asmlinkage long
39sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 39sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
40 struct pt_regs *regs) 40 struct pt_regs *regs)
41{ 41{
42 return do_sigaltstack(uss, uoss, regs->rsp); 42 return do_sigaltstack(uss, uoss, regs->sp);
43} 43}
44 44
45 45
@@ -64,8 +64,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
64 64
65#define COPY(x) err |= __get_user(regs->x, &sc->x) 65#define COPY(x) err |= __get_user(regs->x, &sc->x)
66 66
67 COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); 67 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
68 COPY(rdx); COPY(rcx); COPY(rip); 68 COPY(dx); COPY(cx); COPY(ip);
69 COPY(r8); 69 COPY(r8);
70 COPY(r9); 70 COPY(r9);
71 COPY(r10); 71 COPY(r10);
@@ -86,9 +86,9 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
86 86
87 { 87 {
88 unsigned int tmpflags; 88 unsigned int tmpflags;
89 err |= __get_user(tmpflags, &sc->eflags); 89 err |= __get_user(tmpflags, &sc->flags);
90 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); 90 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
91 regs->orig_rax = -1; /* disable syscall checks */ 91 regs->orig_ax = -1; /* disable syscall checks */
92 } 92 }
93 93
94 { 94 {
@@ -108,7 +108,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
108 } 108 }
109 } 109 }
110 110
111 err |= __get_user(*prax, &sc->rax); 111 err |= __get_user(*prax, &sc->ax);
112 return err; 112 return err;
113 113
114badframe: 114badframe:
@@ -119,9 +119,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
119{ 119{
120 struct rt_sigframe __user *frame; 120 struct rt_sigframe __user *frame;
121 sigset_t set; 121 sigset_t set;
122 unsigned long eax; 122 unsigned long ax;
123 123
124 frame = (struct rt_sigframe __user *)(regs->rsp - 8); 124 frame = (struct rt_sigframe __user *)(regs->sp - 8);
125 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { 125 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
126 goto badframe; 126 goto badframe;
127 } 127 }
@@ -135,17 +135,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
135 recalc_sigpending(); 135 recalc_sigpending();
136 spin_unlock_irq(&current->sighand->siglock); 136 spin_unlock_irq(&current->sighand->siglock);
137 137
138 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) 138 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
139 goto badframe; 139 goto badframe;
140 140
141#ifdef DEBUG_SIG 141#ifdef DEBUG_SIG
142 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); 142 printk("%d sigreturn ip:%lx sp:%lx frame:%p ax:%lx\n",current->pid,regs->ip,regs->sp,frame,ax);
143#endif 143#endif
144 144
145 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) 145 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
146 goto badframe; 146 goto badframe;
147 147
148 return eax; 148 return ax;
149 149
150badframe: 150badframe:
151 signal_fault(regs,frame,"sigreturn"); 151 signal_fault(regs,frame,"sigreturn");
@@ -165,14 +165,14 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
165 err |= __put_user(0, &sc->gs); 165 err |= __put_user(0, &sc->gs);
166 err |= __put_user(0, &sc->fs); 166 err |= __put_user(0, &sc->fs);
167 167
168 err |= __put_user(regs->rdi, &sc->rdi); 168 err |= __put_user(regs->di, &sc->di);
169 err |= __put_user(regs->rsi, &sc->rsi); 169 err |= __put_user(regs->si, &sc->si);
170 err |= __put_user(regs->rbp, &sc->rbp); 170 err |= __put_user(regs->bp, &sc->bp);
171 err |= __put_user(regs->rsp, &sc->rsp); 171 err |= __put_user(regs->sp, &sc->sp);
172 err |= __put_user(regs->rbx, &sc->rbx); 172 err |= __put_user(regs->bx, &sc->bx);
173 err |= __put_user(regs->rdx, &sc->rdx); 173 err |= __put_user(regs->dx, &sc->dx);
174 err |= __put_user(regs->rcx, &sc->rcx); 174 err |= __put_user(regs->cx, &sc->cx);
175 err |= __put_user(regs->rax, &sc->rax); 175 err |= __put_user(regs->ax, &sc->ax);
176 err |= __put_user(regs->r8, &sc->r8); 176 err |= __put_user(regs->r8, &sc->r8);
177 err |= __put_user(regs->r9, &sc->r9); 177 err |= __put_user(regs->r9, &sc->r9);
178 err |= __put_user(regs->r10, &sc->r10); 178 err |= __put_user(regs->r10, &sc->r10);
@@ -183,8 +183,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
183 err |= __put_user(regs->r15, &sc->r15); 183 err |= __put_user(regs->r15, &sc->r15);
184 err |= __put_user(me->thread.trap_no, &sc->trapno); 184 err |= __put_user(me->thread.trap_no, &sc->trapno);
185 err |= __put_user(me->thread.error_code, &sc->err); 185 err |= __put_user(me->thread.error_code, &sc->err);
186 err |= __put_user(regs->rip, &sc->rip); 186 err |= __put_user(regs->ip, &sc->ip);
187 err |= __put_user(regs->eflags, &sc->eflags); 187 err |= __put_user(regs->flags, &sc->flags);
188 err |= __put_user(mask, &sc->oldmask); 188 err |= __put_user(mask, &sc->oldmask);
189 err |= __put_user(me->thread.cr2, &sc->cr2); 189 err |= __put_user(me->thread.cr2, &sc->cr2);
190 190
@@ -198,18 +198,18 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
198static void __user * 198static void __user *
199get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) 199get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
200{ 200{
201 unsigned long rsp; 201 unsigned long sp;
202 202
203 /* Default to using normal stack - redzone*/ 203 /* Default to using normal stack - redzone*/
204 rsp = regs->rsp - 128; 204 sp = regs->sp - 128;
205 205
206 /* This is the X/Open sanctioned signal stack switching. */ 206 /* This is the X/Open sanctioned signal stack switching. */
207 if (ka->sa.sa_flags & SA_ONSTACK) { 207 if (ka->sa.sa_flags & SA_ONSTACK) {
208 if (sas_ss_flags(rsp) == 0) 208 if (sas_ss_flags(sp) == 0)
209 rsp = current->sas_ss_sp + current->sas_ss_size; 209 sp = current->sas_ss_sp + current->sas_ss_size;
210 } 210 }
211 211
212 return (void __user *)round_down(rsp - size, 16); 212 return (void __user *)round_down(sp - size, 16);
213} 213}
214 214
215static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 215static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
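
get_stack() above starts 128 bytes below the interrupted stack pointer because the x86-64 ABI grants user code a red zone of that size which the kernel must not clobber, then rounds the frame down to 16 bytes. A sketch of that arithmetic with example numbers; round_down() is redefined locally so the example stands alone:

#include <stdio.h>

#define round_down(x, y)  ((x) & ~((unsigned long)(y) - 1))

int main(void)
{
        unsigned long sp = 0x7fffffffe350UL;    /* example user stack pointer */
        unsigned long size = 0x440;             /* example rt_sigframe size */
        unsigned long frame;

        sp -= 128;                              /* skip the ABI red zone */
        frame = round_down(sp - size, 16);

        printf("frame at %#lx (frame %% 16 = %lu)\n", frame, frame % 16);
        return 0;
}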
@@ -246,7 +246,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
246 err |= __put_user(0, &frame->uc.uc_flags); 246 err |= __put_user(0, &frame->uc.uc_flags);
247 err |= __put_user(0, &frame->uc.uc_link); 247 err |= __put_user(0, &frame->uc.uc_link);
248 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 248 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
249 err |= __put_user(sas_ss_flags(regs->rsp), 249 err |= __put_user(sas_ss_flags(regs->sp),
250 &frame->uc.uc_stack.ss_flags); 250 &frame->uc.uc_stack.ss_flags);
251 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 251 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
252 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); 252 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
@@ -271,21 +271,21 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
271 goto give_sigsegv; 271 goto give_sigsegv;
272 272
273#ifdef DEBUG_SIG 273#ifdef DEBUG_SIG
274 printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); 274 printk("%d old ip %lx old sp %lx old ax %lx\n", current->pid,regs->ip,regs->sp,regs->ax);
275#endif 275#endif
276 276
277 /* Set up registers for signal handler */ 277 /* Set up registers for signal handler */
278 regs->rdi = sig; 278 regs->di = sig;
279 /* In case the signal handler was declared without prototypes */ 279 /* In case the signal handler was declared without prototypes */
280 regs->rax = 0; 280 regs->ax = 0;
281 281
282 /* This also works for non SA_SIGINFO handlers because they expect the 282 /* This also works for non SA_SIGINFO handlers because they expect the
283 next argument after the signal number on the stack. */ 283 next argument after the signal number on the stack. */
284 regs->rsi = (unsigned long)&frame->info; 284 regs->si = (unsigned long)&frame->info;
285 regs->rdx = (unsigned long)&frame->uc; 285 regs->dx = (unsigned long)&frame->uc;
286 regs->rip = (unsigned long) ka->sa.sa_handler; 286 regs->ip = (unsigned long) ka->sa.sa_handler;
287 287
288 regs->rsp = (unsigned long)frame; 288 regs->sp = (unsigned long)frame;
289 289
290 /* Set up the CS register to run signal handlers in 64-bit mode, 290 /* Set up the CS register to run signal handlers in 64-bit mode,
291 even if the handler happens to be interrupting 32-bit code. */ 291 even if the handler happens to be interrupting 32-bit code. */
@@ -295,12 +295,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
295 see include/asm-x86_64/uaccess.h for details. */ 295 see include/asm-x86_64/uaccess.h for details. */
296 set_fs(USER_DS); 296 set_fs(USER_DS);
297 297
298 regs->eflags &= ~TF_MASK; 298 regs->flags &= ~X86_EFLAGS_TF;
299 if (test_thread_flag(TIF_SINGLESTEP)) 299 if (test_thread_flag(TIF_SINGLESTEP))
300 ptrace_notify(SIGTRAP); 300 ptrace_notify(SIGTRAP);
301#ifdef DEBUG_SIG 301#ifdef DEBUG_SIG
302 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", 302 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
303 current->comm, current->pid, frame, regs->rip, frame->pretcode); 303 current->comm, current->pid, frame, regs->ip, frame->pretcode);
304#endif 304#endif
305 305
306 return 0; 306 return 0;
@@ -321,44 +321,40 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
321 int ret; 321 int ret;
322 322
323#ifdef DEBUG_SIG 323#ifdef DEBUG_SIG
324 printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", 324 printk("handle_signal pid:%d sig:%lu ip:%lx sp:%lx regs=%p\n",
325 current->pid, sig, 325 current->pid, sig,
326 regs->rip, regs->rsp, regs); 326 regs->ip, regs->sp, regs);
327#endif 327#endif
328 328
329 /* Are we from a system call? */ 329 /* Are we from a system call? */
330 if ((long)regs->orig_rax >= 0) { 330 if ((long)regs->orig_ax >= 0) {
331 /* If so, check system call restarting.. */ 331 /* If so, check system call restarting.. */
332 switch (regs->rax) { 332 switch (regs->ax) {
333 case -ERESTART_RESTARTBLOCK: 333 case -ERESTART_RESTARTBLOCK:
334 case -ERESTARTNOHAND: 334 case -ERESTARTNOHAND:
335 regs->rax = -EINTR; 335 regs->ax = -EINTR;
336 break; 336 break;
337 337
338 case -ERESTARTSYS: 338 case -ERESTARTSYS:
339 if (!(ka->sa.sa_flags & SA_RESTART)) { 339 if (!(ka->sa.sa_flags & SA_RESTART)) {
340 regs->rax = -EINTR; 340 regs->ax = -EINTR;
341 break; 341 break;
342 } 342 }
343 /* fallthrough */ 343 /* fallthrough */
344 case -ERESTARTNOINTR: 344 case -ERESTARTNOINTR:
345 regs->rax = regs->orig_rax; 345 regs->ax = regs->orig_ax;
346 regs->rip -= 2; 346 regs->ip -= 2;
347 break; 347 break;
348 } 348 }
349 } 349 }
350 350
351 /* 351 /*
352 * If TF is set due to a debugger (PT_DTRACE), clear the TF 352 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
353 * flag so that register information in the sigcontext is 353 * flag so that register information in the sigcontext is correct.
354 * correct.
355 */ 354 */
356 if (unlikely(regs->eflags & TF_MASK)) { 355 if (unlikely(regs->flags & X86_EFLAGS_TF) &&
357 if (likely(current->ptrace & PT_DTRACE)) { 356 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
358 current->ptrace &= ~PT_DTRACE; 357 regs->flags &= ~X86_EFLAGS_TF;
359 regs->eflags &= ~TF_MASK;
360 }
361 }
362 358
363#ifdef CONFIG_IA32_EMULATION 359#ifdef CONFIG_IA32_EMULATION
364 if (test_thread_flag(TIF_IA32)) { 360 if (test_thread_flag(TIF_IA32)) {
@@ -430,21 +426,21 @@ static void do_signal(struct pt_regs *regs)
430 } 426 }
431 427
432 /* Did we come from a system call? */ 428 /* Did we come from a system call? */
433 if ((long)regs->orig_rax >= 0) { 429 if ((long)regs->orig_ax >= 0) {
434 /* Restart the system call - no handlers present */ 430 /* Restart the system call - no handlers present */
435 long res = regs->rax; 431 long res = regs->ax;
436 switch (res) { 432 switch (res) {
437 case -ERESTARTNOHAND: 433 case -ERESTARTNOHAND:
438 case -ERESTARTSYS: 434 case -ERESTARTSYS:
439 case -ERESTARTNOINTR: 435 case -ERESTARTNOINTR:
440 regs->rax = regs->orig_rax; 436 regs->ax = regs->orig_ax;
441 regs->rip -= 2; 437 regs->ip -= 2;
442 break; 438 break;
443 case -ERESTART_RESTARTBLOCK: 439 case -ERESTART_RESTARTBLOCK:
444 regs->rax = test_thread_flag(TIF_IA32) ? 440 regs->ax = test_thread_flag(TIF_IA32) ?
445 __NR_ia32_restart_syscall : 441 __NR_ia32_restart_syscall :
446 __NR_restart_syscall; 442 __NR_restart_syscall;
447 regs->rip -= 2; 443 regs->ip -= 2;
448 break; 444 break;
449 } 445 }
450 } 446 }
@@ -461,13 +457,13 @@ void
461do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 457do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
462{ 458{
463#ifdef DEBUG_SIG 459#ifdef DEBUG_SIG
464 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", 460 printk("do_notify_resume flags:%x ip:%lx sp:%lx caller:%p pending:%x\n",
465 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); 461 thread_info_flags, regs->ip, regs->sp, __builtin_return_address(0),signal_pending(current));
466#endif 462#endif
467 463
468 /* Pending single-step? */ 464 /* Pending single-step? */
469 if (thread_info_flags & _TIF_SINGLESTEP) { 465 if (thread_info_flags & _TIF_SINGLESTEP) {
470 regs->eflags |= TF_MASK; 466 regs->flags |= X86_EFLAGS_TF;
471 clear_thread_flag(TIF_SINGLESTEP); 467 clear_thread_flag(TIF_SINGLESTEP);
472 } 468 }
473 469
@@ -480,14 +476,20 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
480 /* deal with pending signal delivery */ 476 /* deal with pending signal delivery */
481 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) 477 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
482 do_signal(regs); 478 do_signal(regs);
479
480 if (thread_info_flags & _TIF_HRTICK_RESCHED)
481 hrtick_resched();
483} 482}
484 483
485void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 484void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
486{ 485{
487 struct task_struct *me = current; 486 struct task_struct *me = current;
488 if (show_unhandled_signals && printk_ratelimit()) 487 if (show_unhandled_signals && printk_ratelimit()) {
489 printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", 488 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
490 me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); 489 me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
490 print_vma_addr(" in ", regs->ip);
491 printk("\n");
492 }
491 493
492 force_sig(SIGSEGV, me); 494 force_sig(SIGSEGV, me);
493} 495}
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
index fcaa026eb80..dc0cde9d16f 100644
--- a/arch/x86/kernel/smp_32.c
+++ b/arch/x86/kernel/smp_32.c
@@ -159,7 +159,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
159 apic_write_around(APIC_ICR, cfg); 159 apic_write_around(APIC_ICR, cfg);
160} 160}
161 161
162void fastcall send_IPI_self(int vector) 162void send_IPI_self(int vector)
163{ 163{
164 __send_IPI_shortcut(APIC_DEST_SELF, vector); 164 __send_IPI_shortcut(APIC_DEST_SELF, vector);
165} 165}
@@ -223,7 +223,7 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
223 */ 223 */
224 224
225 local_irq_save(flags); 225 local_irq_save(flags);
226 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { 226 for_each_possible_cpu(query_cpu) {
227 if (cpu_isset(query_cpu, mask)) { 227 if (cpu_isset(query_cpu, mask)) {
228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), 228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
229 vector); 229 vector);
@@ -256,13 +256,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
256 * We need to reload %cr3 since the page tables may be going 256 * We need to reload %cr3 since the page tables may be going
257 * away from under us.. 257 * away from under us..
258 */ 258 */
259void leave_mm(unsigned long cpu) 259void leave_mm(int cpu)
260{ 260{
261 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) 261 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
262 BUG(); 262 BUG();
263 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); 263 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
264 load_cr3(swapper_pg_dir); 264 load_cr3(swapper_pg_dir);
265} 265}
266EXPORT_SYMBOL_GPL(leave_mm);
266 267
267/* 268/*
268 * 269 *
@@ -310,7 +311,7 @@ void leave_mm(unsigned long cpu)
310 * 2) Leave the mm if we are in the lazy tlb mode. 311 * 2) Leave the mm if we are in the lazy tlb mode.
311 */ 312 */
312 313
313fastcall void smp_invalidate_interrupt(struct pt_regs *regs) 314void smp_invalidate_interrupt(struct pt_regs *regs)
314{ 315{
315 unsigned long cpu; 316 unsigned long cpu;
316 317
@@ -638,13 +639,13 @@ static void native_smp_send_stop(void)
638 * all the work is done automatically when 639 * all the work is done automatically when
639 * we return from the interrupt. 640 * we return from the interrupt.
640 */ 641 */
641fastcall void smp_reschedule_interrupt(struct pt_regs *regs) 642void smp_reschedule_interrupt(struct pt_regs *regs)
642{ 643{
643 ack_APIC_irq(); 644 ack_APIC_irq();
644 __get_cpu_var(irq_stat).irq_resched_count++; 645 __get_cpu_var(irq_stat).irq_resched_count++;
645} 646}
646 647
647fastcall void smp_call_function_interrupt(struct pt_regs *regs) 648void smp_call_function_interrupt(struct pt_regs *regs)
648{ 649{
649 void (*func) (void *info) = call_data->func; 650 void (*func) (void *info) = call_data->func;
650 void *info = call_data->info; 651 void *info = call_data->info;
@@ -675,7 +676,7 @@ static int convert_apicid_to_cpu(int apic_id)
675{ 676{
676 int i; 677 int i;
677 678
678 for (i = 0; i < NR_CPUS; i++) { 679 for_each_possible_cpu(i) {
679 if (per_cpu(x86_cpu_to_apicid, i) == apic_id) 680 if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
680 return i; 681 return i;
681 } 682 }
diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c
index 03fa6ed559c..2fd74b06db6 100644
--- a/arch/x86/kernel/smp_64.c
+++ b/arch/x86/kernel/smp_64.c
@@ -29,7 +29,7 @@
29#include <asm/idle.h> 29#include <asm/idle.h>
30 30
31/* 31/*
32 * Smarter SMP flushing macros. 32 * Smarter SMP flushing macros.
33 * c/o Linus Torvalds. 33 * c/o Linus Torvalds.
34 * 34 *
35 * These mean you can really definitely utterly forget about 35 * These mean you can really definitely utterly forget about
@@ -37,15 +37,15 @@
37 * 37 *
38 * Optimizations Manfred Spraul <manfred@colorfullife.com> 38 * Optimizations Manfred Spraul <manfred@colorfullife.com>
39 * 39 *
40 * More scalable flush, from Andi Kleen 40 * More scalable flush, from Andi Kleen
41 * 41 *
42 * To avoid global state use 8 different call vectors. 42 * To avoid global state use 8 different call vectors.
43 * Each CPU uses a specific vector to trigger flushes on other 43 * Each CPU uses a specific vector to trigger flushes on other
44 * CPUs. Depending on the received vector the target CPUs look into 44 * CPUs. Depending on the received vector the target CPUs look into
45 * the right per cpu variable for the flush data. 45 * the right per cpu variable for the flush data.
46 * 46 *
47 * With more than 8 CPUs they are hashed to the 8 available 47 * With more than 8 CPUs they are hashed to the 8 available
48 * vectors. The limited global vector space forces us to this right now. 48 * vectors. The limited global vector space forces us to this right now.
49 * In future when interrupts are split into per CPU domains this could be 49 * In future when interrupts are split into per CPU domains this could be
50 * fixed, at the cost of triggering multiple IPIs in some cases. 50 * fixed, at the cost of triggering multiple IPIs in some cases.
51 */ 51 */
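
The comment above describes how the eight invalidate vectors are shared: a sender picks vector smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS, and the interrupt handler recovers the sender's slot from ~orig_ax, which the entry code fills with the negated vector. A small sketch of that mapping; the 0xf0 vector base is an assumption for the sketch, not taken from this diff:

#include <stdio.h>

#define INVALIDATE_TLB_VECTOR_START     0xf0    /* assumed base vector */
#define NUM_INVALIDATE_TLB_VECTORS      8

int main(void)
{
        int nr_cpus = 16, cpu;                  /* example CPU count */

        for (cpu = 0; cpu < nr_cpus; cpu++) {
                int sender = cpu % NUM_INVALIDATE_TLB_VECTORS;
                int vector = INVALIDATE_TLB_VECTOR_START + sender;
                long orig_ax = ~(long)vector;   /* entry code stores ~vector */
                int slot = ~orig_ax - INVALIDATE_TLB_VECTOR_START;

                printf("cpu %2d -> vector 0x%x -> flush_state[%d]\n",
                       cpu, vector, slot);
        }
        return 0;
}

With more than eight CPUs several senders hash to the same slot, which is why each flush_state entry carries its own tlbstate_lock.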
@@ -55,7 +55,6 @@ union smp_flush_state {
55 cpumask_t flush_cpumask; 55 cpumask_t flush_cpumask;
56 struct mm_struct *flush_mm; 56 struct mm_struct *flush_mm;
57 unsigned long flush_va; 57 unsigned long flush_va;
58#define FLUSH_ALL -1ULL
59 spinlock_t tlbstate_lock; 58 spinlock_t tlbstate_lock;
60 }; 59 };
61 char pad[SMP_CACHE_BYTES]; 60 char pad[SMP_CACHE_BYTES];
@@ -67,16 +66,17 @@ union smp_flush_state {
67static DEFINE_PER_CPU(union smp_flush_state, flush_state); 66static DEFINE_PER_CPU(union smp_flush_state, flush_state);
68 67
69/* 68/*
70 * We cannot call mmdrop() because we are in interrupt context, 69 * We cannot call mmdrop() because we are in interrupt context,
71 * instead update mm->cpu_vm_mask. 70 * instead update mm->cpu_vm_mask.
72 */ 71 */
73static inline void leave_mm(int cpu) 72void leave_mm(int cpu)
74{ 73{
75 if (read_pda(mmu_state) == TLBSTATE_OK) 74 if (read_pda(mmu_state) == TLBSTATE_OK)
76 BUG(); 75 BUG();
77 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); 76 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
78 load_cr3(swapper_pg_dir); 77 load_cr3(swapper_pg_dir);
79} 78}
79EXPORT_SYMBOL_GPL(leave_mm);
80 80
81/* 81/*
82 * 82 *
@@ -85,25 +85,25 @@ static inline void leave_mm(int cpu)
85 * 1) switch_mm() either 1a) or 1b) 85 * 1) switch_mm() either 1a) or 1b)
86 * 1a) thread switch to a different mm 86 * 1a) thread switch to a different mm
87 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); 87 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
88 * Stop ipi delivery for the old mm. This is not synchronized with 88 * Stop ipi delivery for the old mm. This is not synchronized with
89 * the other cpus, but smp_invalidate_interrupt ignore flush ipis 89 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
90 * for the wrong mm, and in the worst case we perform a superfluous 90 * for the wrong mm, and in the worst case we perform a superfluous
91 * tlb flush. 91 * tlb flush.
92 * 1a2) set cpu mmu_state to TLBSTATE_OK 92 * 1a2) set cpu mmu_state to TLBSTATE_OK
93 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 93 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
94 * was in lazy tlb mode. 94 * was in lazy tlb mode.
95 * 1a3) update cpu active_mm 95 * 1a3) update cpu active_mm
96 * Now cpu0 accepts tlb flushes for the new mm. 96 * Now cpu0 accepts tlb flushes for the new mm.
97 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); 97 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
98 * Now the other cpus will send tlb flush ipis. 98 * Now the other cpus will send tlb flush ipis.
99 * 1a4) change cr3. 99 * 1a4) change cr3.
100 * 1b) thread switch without mm change 100 * 1b) thread switch without mm change
101 * cpu active_mm is correct, cpu0 already handles 101 * cpu active_mm is correct, cpu0 already handles
102 * flush ipis. 102 * flush ipis.
103 * 1b1) set cpu mmu_state to TLBSTATE_OK 103 * 1b1) set cpu mmu_state to TLBSTATE_OK
104 * 1b2) test_and_set the cpu bit in cpu_vm_mask. 104 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
105 * Atomically set the bit [other cpus will start sending flush ipis], 105 * Atomically set the bit [other cpus will start sending flush ipis],
106 * and test the bit. 106 * and test the bit.
107 * 1b3) if the bit was 0: leave_mm was called, flush the tlb. 107 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
108 * 2) switch %%esp, ie current 108 * 2) switch %%esp, ie current
109 * 109 *
@@ -137,12 +137,12 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
137 * orig_rax contains the negated interrupt vector. 137 * orig_rax contains the negated interrupt vector.
138 * Use that to determine where the sender put the data. 138 * Use that to determine where the sender put the data.
139 */ 139 */
140 sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; 140 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
141 f = &per_cpu(flush_state, sender); 141 f = &per_cpu(flush_state, sender);
142 142
143 if (!cpu_isset(cpu, f->flush_cpumask)) 143 if (!cpu_isset(cpu, f->flush_cpumask))
144 goto out; 144 goto out;
145 /* 145 /*
146 * This was a BUG() but until someone can quote me the 146 * This was a BUG() but until someone can quote me the
147 * line from the intel manual that guarantees an IPI to 147 * line from the intel manual that guarantees an IPI to
148 * multiple CPUs is retried _only_ on the erroring CPUs 148 * multiple CPUs is retried _only_ on the erroring CPUs
@@ -150,10 +150,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
150 * 150 *
151 * BUG(); 151 * BUG();
152 */ 152 */
153 153
154 if (f->flush_mm == read_pda(active_mm)) { 154 if (f->flush_mm == read_pda(active_mm)) {
155 if (read_pda(mmu_state) == TLBSTATE_OK) { 155 if (read_pda(mmu_state) == TLBSTATE_OK) {
156 if (f->flush_va == FLUSH_ALL) 156 if (f->flush_va == TLB_FLUSH_ALL)
157 local_flush_tlb(); 157 local_flush_tlb();
158 else 158 else
159 __flush_tlb_one(f->flush_va); 159 __flush_tlb_one(f->flush_va);
@@ -166,19 +166,22 @@ out:
166 add_pda(irq_tlb_count, 1); 166 add_pda(irq_tlb_count, 1);
167} 167}
168 168
169static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 169void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
170 unsigned long va) 170 unsigned long va)
171{ 171{
172 int sender; 172 int sender;
173 union smp_flush_state *f; 173 union smp_flush_state *f;
174 cpumask_t cpumask = *cpumaskp;
174 175
175 /* Caller has disabled preemption */ 176 /* Caller has disabled preemption */
176 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 177 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
177 f = &per_cpu(flush_state, sender); 178 f = &per_cpu(flush_state, sender);
178 179
179 /* Could avoid this lock when 180 /*
180 num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is 181 * Could avoid this lock when
181 probably not worth checking this for a cache-hot lock. */ 182 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
183 * probably not worth checking this for a cache-hot lock.
184 */
182 spin_lock(&f->tlbstate_lock); 185 spin_lock(&f->tlbstate_lock);
183 186
184 f->flush_mm = mm; 187 f->flush_mm = mm;
@@ -202,14 +205,14 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
202int __cpuinit init_smp_flush(void) 205int __cpuinit init_smp_flush(void)
203{ 206{
204 int i; 207 int i;
208
205 for_each_cpu_mask(i, cpu_possible_map) { 209 for_each_cpu_mask(i, cpu_possible_map) {
206 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); 210 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
207 } 211 }
208 return 0; 212 return 0;
209} 213}
210
211core_initcall(init_smp_flush); 214core_initcall(init_smp_flush);
212 215
213void flush_tlb_current_task(void) 216void flush_tlb_current_task(void)
214{ 217{
215 struct mm_struct *mm = current->mm; 218 struct mm_struct *mm = current->mm;
@@ -221,10 +224,9 @@ void flush_tlb_current_task(void)
221 224
222 local_flush_tlb(); 225 local_flush_tlb();
223 if (!cpus_empty(cpu_mask)) 226 if (!cpus_empty(cpu_mask))
224 flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 227 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
225 preempt_enable(); 228 preempt_enable();
226} 229}
227EXPORT_SYMBOL(flush_tlb_current_task);
228 230
229void flush_tlb_mm (struct mm_struct * mm) 231void flush_tlb_mm (struct mm_struct * mm)
230{ 232{
@@ -241,11 +243,10 @@ void flush_tlb_mm (struct mm_struct * mm)
241 leave_mm(smp_processor_id()); 243 leave_mm(smp_processor_id());
242 } 244 }
243 if (!cpus_empty(cpu_mask)) 245 if (!cpus_empty(cpu_mask))
244 flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 246 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
245 247
246 preempt_enable(); 248 preempt_enable();
247} 249}
248EXPORT_SYMBOL(flush_tlb_mm);
249 250
250void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) 251void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
251{ 252{
@@ -259,8 +260,8 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
259 if (current->active_mm == mm) { 260 if (current->active_mm == mm) {
260 if(current->mm) 261 if(current->mm)
261 __flush_tlb_one(va); 262 __flush_tlb_one(va);
262 else 263 else
263 leave_mm(smp_processor_id()); 264 leave_mm(smp_processor_id());
264 } 265 }
265 266
266 if (!cpus_empty(cpu_mask)) 267 if (!cpus_empty(cpu_mask))
@@ -268,7 +269,6 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
268 269
269 preempt_enable(); 270 preempt_enable();
270} 271}
271EXPORT_SYMBOL(flush_tlb_page);
272 272
273static void do_flush_tlb_all(void* info) 273static void do_flush_tlb_all(void* info)
274{ 274{
@@ -325,11 +325,9 @@ void unlock_ipi_call_lock(void)
325 * this function sends a 'generic call function' IPI to all other CPU 325 * this function sends a 'generic call function' IPI to all other CPU
326 * of the system defined in the mask. 326 * of the system defined in the mask.
327 */ 327 */
328 328static int __smp_call_function_mask(cpumask_t mask,
329static int 329 void (*func)(void *), void *info,
330__smp_call_function_mask(cpumask_t mask, 330 int wait)
331 void (*func)(void *), void *info,
332 int wait)
333{ 331{
334 struct call_data_struct data; 332 struct call_data_struct data;
335 cpumask_t allbutself; 333 cpumask_t allbutself;
@@ -417,11 +415,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
417 */ 415 */
418 416
419int smp_call_function_single (int cpu, void (*func) (void *info), void *info, 417int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
420 int nonatomic, int wait) 418 int nonatomic, int wait)
421{ 419{
422 /* prevent preemption and reschedule on another processor */ 420 /* prevent preemption and reschedule on another processor */
423 int ret; 421 int ret, me = get_cpu();
424 int me = get_cpu();
425 422
426 /* Can deadlock when called with interrupts disabled */ 423 /* Can deadlock when called with interrupts disabled */
427 WARN_ON(irqs_disabled()); 424 WARN_ON(irqs_disabled());
@@ -471,9 +468,9 @@ static void stop_this_cpu(void *dummy)
471 */ 468 */
472 cpu_clear(smp_processor_id(), cpu_online_map); 469 cpu_clear(smp_processor_id(), cpu_online_map);
473 disable_local_APIC(); 470 disable_local_APIC();
474 for (;;) 471 for (;;)
475 halt(); 472 halt();
476} 473}
477 474
478void smp_send_stop(void) 475void smp_send_stop(void)
479{ 476{
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c
index 4ea80cbe52e..579b9b740c7 100644
--- a/arch/x86/kernel/smpboot_32.c
+++ b/arch/x86/kernel/smpboot_32.c
@@ -83,7 +83,6 @@ EXPORT_SYMBOL(cpu_online_map);
83 83
84cpumask_t cpu_callin_map; 84cpumask_t cpu_callin_map;
85cpumask_t cpu_callout_map; 85cpumask_t cpu_callout_map;
86EXPORT_SYMBOL(cpu_callout_map);
87cpumask_t cpu_possible_map; 86cpumask_t cpu_possible_map;
88EXPORT_SYMBOL(cpu_possible_map); 87EXPORT_SYMBOL(cpu_possible_map);
89static cpumask_t smp_commenced_mask; 88static cpumask_t smp_commenced_mask;
@@ -92,15 +91,10 @@ static cpumask_t smp_commenced_mask;
92DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 91DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
93EXPORT_PER_CPU_SYMBOL(cpu_info); 92EXPORT_PER_CPU_SYMBOL(cpu_info);
94 93
95/* 94/* which logical CPU number maps to which CPU (physical APIC ID) */
96 * The following static array is used during kernel startup
97 * and the x86_cpu_to_apicid_ptr contains the address of the
98 * array during this time. Is it zeroed when the per_cpu
99 * data area is removed.
100 */
101u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata = 95u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
102 { [0 ... NR_CPUS-1] = BAD_APICID }; 96 { [0 ... NR_CPUS-1] = BAD_APICID };
103void *x86_cpu_to_apicid_ptr; 97void *x86_cpu_to_apicid_early_ptr;
104DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; 98DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
105EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); 99EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
106 100
@@ -113,7 +107,6 @@ u8 apicid_2_node[MAX_APICID];
113extern const unsigned char trampoline_data []; 107extern const unsigned char trampoline_data [];
114extern const unsigned char trampoline_end []; 108extern const unsigned char trampoline_end [];
115static unsigned char *trampoline_base; 109static unsigned char *trampoline_base;
116static int trampoline_exec;
117 110
118static void map_cpu_to_logical_apicid(void); 111static void map_cpu_to_logical_apicid(void);
119 112
@@ -138,17 +131,13 @@ static unsigned long __cpuinit setup_trampoline(void)
138 */ 131 */
139void __init smp_alloc_memory(void) 132void __init smp_alloc_memory(void)
140{ 133{
141 trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); 134 trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
142 /* 135 /*
143 * Has to be in very low memory so we can execute 136 * Has to be in very low memory so we can execute
144 * real-mode AP code. 137 * real-mode AP code.
145 */ 138 */
146 if (__pa(trampoline_base) >= 0x9F000) 139 if (__pa(trampoline_base) >= 0x9F000)
147 BUG(); 140 BUG();
148 /*
149 * Make the SMP trampoline executable:
150 */
151 trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
152} 141}
153 142
154/* 143/*
@@ -213,8 +202,6 @@ valid_k7:
213 ; 202 ;
214} 203}
215 204
216extern void calibrate_delay(void);
217
218static atomic_t init_deasserted; 205static atomic_t init_deasserted;
219 206
220static void __cpuinit smp_callin(void) 207static void __cpuinit smp_callin(void)
@@ -405,7 +392,7 @@ static void __cpuinit start_secondary(void *unused)
405 setup_secondary_clock(); 392 setup_secondary_clock();
406 if (nmi_watchdog == NMI_IO_APIC) { 393 if (nmi_watchdog == NMI_IO_APIC) {
407 disable_8259A_irq(0); 394 disable_8259A_irq(0);
408 enable_NMI_through_LVT0(NULL); 395 enable_NMI_through_LVT0();
409 enable_8259A_irq(0); 396 enable_8259A_irq(0);
410 } 397 }
411 /* 398 /*
@@ -448,38 +435,38 @@ void __devinit initialize_secondary(void)
448{ 435{
449 /* 436 /*
450 * We don't actually need to load the full TSS, 437 * We don't actually need to load the full TSS,
451 * basically just the stack pointer and the eip. 438 * basically just the stack pointer and the ip.
452 */ 439 */
453 440
454 asm volatile( 441 asm volatile(
455 "movl %0,%%esp\n\t" 442 "movl %0,%%esp\n\t"
456 "jmp *%1" 443 "jmp *%1"
457 : 444 :
458 :"m" (current->thread.esp),"m" (current->thread.eip)); 445 :"m" (current->thread.sp),"m" (current->thread.ip));
459} 446}
460 447
461/* Static state in head.S used to set up a CPU */ 448/* Static state in head.S used to set up a CPU */
462extern struct { 449extern struct {
463 void * esp; 450 void * sp;
464 unsigned short ss; 451 unsigned short ss;
465} stack_start; 452} stack_start;
466 453
467#ifdef CONFIG_NUMA 454#ifdef CONFIG_NUMA
468 455
469/* which logical CPUs are on which nodes */ 456/* which logical CPUs are on which nodes */
470cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = 457cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly =
471 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; 458 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
472EXPORT_SYMBOL(node_2_cpu_mask); 459EXPORT_SYMBOL(node_to_cpumask_map);
473/* which node each logical CPU is on */ 460/* which node each logical CPU is on */
474int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; 461int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
475EXPORT_SYMBOL(cpu_2_node); 462EXPORT_SYMBOL(cpu_to_node_map);
476 463
477/* set up a mapping between cpu and node. */ 464/* set up a mapping between cpu and node. */
478static inline void map_cpu_to_node(int cpu, int node) 465static inline void map_cpu_to_node(int cpu, int node)
479{ 466{
480 printk("Mapping cpu %d to node %d\n", cpu, node); 467 printk("Mapping cpu %d to node %d\n", cpu, node);
481 cpu_set(cpu, node_2_cpu_mask[node]); 468 cpu_set(cpu, node_to_cpumask_map[node]);
482 cpu_2_node[cpu] = node; 469 cpu_to_node_map[cpu] = node;
483} 470}
484 471
485/* undo a mapping between cpu and node. */ 472/* undo a mapping between cpu and node. */
@@ -489,8 +476,8 @@ static inline void unmap_cpu_to_node(int cpu)
489 476
490 printk("Unmapping cpu %d from all nodes\n", cpu); 477 printk("Unmapping cpu %d from all nodes\n", cpu);
491 for (node = 0; node < MAX_NUMNODES; node ++) 478 for (node = 0; node < MAX_NUMNODES; node ++)
492 cpu_clear(cpu, node_2_cpu_mask[node]); 479 cpu_clear(cpu, node_to_cpumask_map[node]);
493 cpu_2_node[cpu] = 0; 480 cpu_to_node_map[cpu] = 0;
494} 481}
495#else /* !CONFIG_NUMA */ 482#else /* !CONFIG_NUMA */
496 483
@@ -668,7 +655,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
668 * target processor state. 655 * target processor state.
669 */ 656 */
670 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 657 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
671 (unsigned long) stack_start.esp); 658 (unsigned long) stack_start.sp);
672 659
673 /* 660 /*
674 * Run STARTUP IPI loop. 661 * Run STARTUP IPI loop.
@@ -754,7 +741,7 @@ static inline struct task_struct * __cpuinit alloc_idle_task(int cpu)
754	/* initialize thread_struct. we really want to avoid destroying 741	/* initialize thread_struct. we really want to avoid destroying
755	 * idle thread 742	 * idle thread
756 */ 743 */
757 idle->thread.esp = (unsigned long)task_pt_regs(idle); 744 idle->thread.sp = (unsigned long)task_pt_regs(idle);
758 init_idle(idle, cpu); 745 init_idle(idle, cpu);
759 return idle; 746 return idle;
760 } 747 }
@@ -799,7 +786,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
799 per_cpu(current_task, cpu) = idle; 786 per_cpu(current_task, cpu) = idle;
800 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 787 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
801 788
802 idle->thread.eip = (unsigned long) start_secondary; 789 idle->thread.ip = (unsigned long) start_secondary;
803 /* start_eip had better be page-aligned! */ 790 /* start_eip had better be page-aligned! */
804 start_eip = setup_trampoline(); 791 start_eip = setup_trampoline();
805 792
@@ -807,9 +794,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
807 alternatives_smp_switch(1); 794 alternatives_smp_switch(1);
808 795
809 /* So we see what's up */ 796 /* So we see what's up */
810 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); 797 printk("Booting processor %d/%d ip %lx\n", cpu, apicid, start_eip);
811 /* Stack for startup_32 can be just as for start_secondary onwards */ 798 /* Stack for startup_32 can be just as for start_secondary onwards */
812 stack_start.esp = (void *) idle->thread.esp; 799 stack_start.sp = (void *) idle->thread.sp;
813 800
814 irq_ctx_init(cpu); 801 irq_ctx_init(cpu);
815 802
@@ -1091,7 +1078,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1091 * Allow the user to impress friends. 1078 * Allow the user to impress friends.
1092 */ 1079 */
1093 Dprintk("Before bogomips.\n"); 1080 Dprintk("Before bogomips.\n");
1094 for (cpu = 0; cpu < NR_CPUS; cpu++) 1081 for_each_possible_cpu(cpu)
1095 if (cpu_isset(cpu, cpu_callout_map)) 1082 if (cpu_isset(cpu, cpu_callout_map))
1096 bogosum += cpu_data(cpu).loops_per_jiffy; 1083 bogosum += cpu_data(cpu).loops_per_jiffy;
1097 printk(KERN_INFO 1084 printk(KERN_INFO
@@ -1122,7 +1109,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1122 * construct cpu_sibling_map, so that we can tell sibling CPUs 1109 * construct cpu_sibling_map, so that we can tell sibling CPUs
1123 * efficiently. 1110 * efficiently.
1124 */ 1111 */
1125 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1112 for_each_possible_cpu(cpu) {
1126 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 1113 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1127 cpus_clear(per_cpu(cpu_core_map, cpu)); 1114 cpus_clear(per_cpu(cpu_core_map, cpu));
1128 } 1115 }
@@ -1296,12 +1283,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1296 setup_ioapic_dest(); 1283 setup_ioapic_dest();
1297#endif 1284#endif
1298 zap_low_mappings(); 1285 zap_low_mappings();
1299#ifndef CONFIG_HOTPLUG_CPU
1300 /*
1301 * Disable executability of the SMP trampoline:
1302 */
1303 set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
1304#endif
1305} 1286}
1306 1287
1307void __init smp_intr_init(void) 1288void __init smp_intr_init(void)
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
index aaf4e129121..d53bd6fcb42 100644
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -65,7 +65,7 @@ int smp_num_siblings = 1;
65EXPORT_SYMBOL(smp_num_siblings); 65EXPORT_SYMBOL(smp_num_siblings);
66 66
67/* Last level cache ID of each logical CPU */ 67/* Last level cache ID of each logical CPU */
68DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID; 68DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
69 69
70/* Bitmask of currently online CPUs */ 70/* Bitmask of currently online CPUs */
71cpumask_t cpu_online_map __read_mostly; 71cpumask_t cpu_online_map __read_mostly;
@@ -78,8 +78,6 @@ EXPORT_SYMBOL(cpu_online_map);
78 */ 78 */
79cpumask_t cpu_callin_map; 79cpumask_t cpu_callin_map;
80cpumask_t cpu_callout_map; 80cpumask_t cpu_callout_map;
81EXPORT_SYMBOL(cpu_callout_map);
82
83cpumask_t cpu_possible_map; 81cpumask_t cpu_possible_map;
84EXPORT_SYMBOL(cpu_possible_map); 82EXPORT_SYMBOL(cpu_possible_map);
85 83
@@ -113,10 +111,20 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
113 * a new thread. Also avoids complicated thread destroy functionality 111 * a new thread. Also avoids complicated thread destroy functionality
114 * for idle threads. 112 * for idle threads.
115 */ 113 */
114#ifdef CONFIG_HOTPLUG_CPU
115/*
116 * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
117 * removed after init for !CONFIG_HOTPLUG_CPU.
118 */
119static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
120#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
121#define set_idle_for_cpu(x,p) (per_cpu(idle_thread_array, x) = (p))
122#else
116struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; 123struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
117
118#define get_idle_for_cpu(x) (idle_thread_array[(x)]) 124#define get_idle_for_cpu(x) (idle_thread_array[(x)])
119#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) 125#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
126#endif
127
120 128
121/* 129/*
122 * Currently trivial. Write the real->protected mode 130 * Currently trivial. Write the real->protected mode
@@ -212,6 +220,7 @@ void __cpuinit smp_callin(void)
212 220
213 Dprintk("CALLIN, before setup_local_APIC().\n"); 221 Dprintk("CALLIN, before setup_local_APIC().\n");
214 setup_local_APIC(); 222 setup_local_APIC();
223 end_local_APIC_setup();
215 224
216 /* 225 /*
217 * Get our bogomips. 226 * Get our bogomips.
@@ -338,7 +347,7 @@ void __cpuinit start_secondary(void)
338 347
339 if (nmi_watchdog == NMI_IO_APIC) { 348 if (nmi_watchdog == NMI_IO_APIC) {
340 disable_8259A_irq(0); 349 disable_8259A_irq(0);
341 enable_NMI_through_LVT0(NULL); 350 enable_NMI_through_LVT0();
342 enable_8259A_irq(0); 351 enable_8259A_irq(0);
343 } 352 }
344 353
@@ -370,7 +379,7 @@ void __cpuinit start_secondary(void)
370 379
371 unlock_ipi_call_lock(); 380 unlock_ipi_call_lock();
372 381
373 setup_secondary_APIC_clock(); 382 setup_secondary_clock();
374 383
375 cpu_idle(); 384 cpu_idle();
376} 385}
@@ -384,19 +393,20 @@ static void inquire_remote_apic(int apicid)
384 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 393 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
385 char *names[] = { "ID", "VERSION", "SPIV" }; 394 char *names[] = { "ID", "VERSION", "SPIV" };
386 int timeout; 395 int timeout;
387 unsigned int status; 396 u32 status;
388 397
389 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); 398 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
390 399
391 for (i = 0; i < ARRAY_SIZE(regs); i++) { 400 for (i = 0; i < ARRAY_SIZE(regs); i++) {
392 printk("... APIC #%d %s: ", apicid, names[i]); 401 printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
393 402
394 /* 403 /*
395 * Wait for idle. 404 * Wait for idle.
396 */ 405 */
397 status = safe_apic_wait_icr_idle(); 406 status = safe_apic_wait_icr_idle();
398 if (status) 407 if (status)
399 printk("a previous APIC delivery may have failed\n"); 408 printk(KERN_CONT
409 "a previous APIC delivery may have failed\n");
400 410
401 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 411 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
402 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); 412 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -410,10 +420,10 @@ static void inquire_remote_apic(int apicid)
410 switch (status) { 420 switch (status) {
411 case APIC_ICR_RR_VALID: 421 case APIC_ICR_RR_VALID:
412 status = apic_read(APIC_RRR); 422 status = apic_read(APIC_RRR);
413 printk("%08x\n", status); 423 printk(KERN_CONT "%08x\n", status);
414 break; 424 break;
415 default: 425 default:
416 printk("failed\n"); 426 printk(KERN_CONT "failed\n");
417 } 427 }
418 } 428 }
419} 429}
@@ -466,7 +476,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
466 */ 476 */
467 Dprintk("#startup loops: %d.\n", num_starts); 477 Dprintk("#startup loops: %d.\n", num_starts);
468 478
469 maxlvt = get_maxlvt(); 479 maxlvt = lapic_get_maxlvt();
470 480
471 for (j = 1; j <= num_starts; j++) { 481 for (j = 1; j <= num_starts; j++) {
472 Dprintk("Sending STARTUP #%d.\n",j); 482 Dprintk("Sending STARTUP #%d.\n",j);
@@ -577,7 +587,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
577 c_idle.idle = get_idle_for_cpu(cpu); 587 c_idle.idle = get_idle_for_cpu(cpu);
578 588
579 if (c_idle.idle) { 589 if (c_idle.idle) {
580 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) 590 c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
581 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); 591 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
582 init_idle(c_idle.idle, cpu); 592 init_idle(c_idle.idle, cpu);
583 goto do_rest; 593 goto do_rest;
@@ -613,8 +623,8 @@ do_rest:
613 623
614 start_rip = setup_trampoline(); 624 start_rip = setup_trampoline();
615 625
616 init_rsp = c_idle.idle->thread.rsp; 626 init_rsp = c_idle.idle->thread.sp;
617 per_cpu(init_tss,cpu).rsp0 = init_rsp; 627 load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
618 initial_code = start_secondary; 628 initial_code = start_secondary;
619 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 629 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
620 630
@@ -691,7 +701,7 @@ do_rest:
691 } 701 }
692 if (boot_error) { 702 if (boot_error) {
693 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ 703 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
694 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ 704 clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */
695 clear_node_cpumask(cpu); /* was set by numa_add_cpu */ 705 clear_node_cpumask(cpu); /* was set by numa_add_cpu */
696 cpu_clear(cpu, cpu_present_map); 706 cpu_clear(cpu, cpu_present_map);
697 cpu_clear(cpu, cpu_possible_map); 707 cpu_clear(cpu, cpu_possible_map);
@@ -841,24 +851,16 @@ static int __init smp_sanity_check(unsigned max_cpus)
841 return 0; 851 return 0;
842} 852}
843 853
844/* 854static void __init smp_cpu_index_default(void)
845 * Copy apicid's found by MP_processor_info from initial array to the per cpu
846 * data area. The x86_cpu_to_apicid_init array is then expendable and the
847 * x86_cpu_to_apicid_ptr is zeroed indicating that the static array is no
848 * longer available.
849 */
850void __init smp_set_apicids(void)
851{ 855{
852 int cpu; 856 int i;
857 struct cpuinfo_x86 *c;
853 858
854 for_each_cpu_mask(cpu, cpu_possible_map) { 859 for_each_cpu_mask(i, cpu_possible_map) {
855 if (per_cpu_offset(cpu)) 860 c = &cpu_data(i);
856 per_cpu(x86_cpu_to_apicid, cpu) = 861 /* mark all to hotplug */
857 x86_cpu_to_apicid_init[cpu]; 862 c->cpu_index = NR_CPUS;
858 } 863 }
859
860 /* indicate the static array will be going away soon */
861 x86_cpu_to_apicid_ptr = NULL;
862} 864}
863 865
864/* 866/*
@@ -868,9 +870,9 @@ void __init smp_set_apicids(void)
868void __init smp_prepare_cpus(unsigned int max_cpus) 870void __init smp_prepare_cpus(unsigned int max_cpus)
869{ 871{
870 nmi_watchdog_default(); 872 nmi_watchdog_default();
873 smp_cpu_index_default();
871 current_cpu_data = boot_cpu_data; 874 current_cpu_data = boot_cpu_data;
872 current_thread_info()->cpu = 0; /* needed? */ 875 current_thread_info()->cpu = 0; /* needed? */
873 smp_set_apicids();
874 set_cpu_sibling_map(0); 876 set_cpu_sibling_map(0);
875 877
876 if (smp_sanity_check(max_cpus) < 0) { 878 if (smp_sanity_check(max_cpus) < 0) {
@@ -885,6 +887,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
885 */ 887 */
886 setup_local_APIC(); 888 setup_local_APIC();
887 889
890 /*
891 * Enable IO APIC before setting up error vector
892 */
893 if (!skip_ioapic_setup && nr_ioapics)
894 enable_IO_APIC();
895 end_local_APIC_setup();
896
888 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { 897 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
889 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 898 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
890 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); 899 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
@@ -903,7 +912,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
903 * Set up local APIC timer on boot CPU. 912 * Set up local APIC timer on boot CPU.
904 */ 913 */
905 914
906 setup_boot_APIC_clock(); 915 setup_boot_clock();
907} 916}
908 917
909/* 918/*
@@ -912,7 +921,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
912void __init smp_prepare_boot_cpu(void) 921void __init smp_prepare_boot_cpu(void)
913{ 922{
914 int me = smp_processor_id(); 923 int me = smp_processor_id();
915 cpu_set(me, cpu_online_map); 924 /* already set me in cpu_online_map in boot_cpu_init() */
916 cpu_set(me, cpu_callout_map); 925 cpu_set(me, cpu_callout_map);
917 per_cpu(cpu_state, me) = CPU_ONLINE; 926 per_cpu(cpu_state, me) = CPU_ONLINE;
918} 927}
@@ -1010,13 +1019,13 @@ static void remove_siblinginfo(int cpu)
1010 cpu_clear(cpu, cpu_sibling_setup_map); 1019 cpu_clear(cpu, cpu_sibling_setup_map);
1011} 1020}
1012 1021
1013void remove_cpu_from_maps(void) 1022static void __ref remove_cpu_from_maps(void)
1014{ 1023{
1015 int cpu = smp_processor_id(); 1024 int cpu = smp_processor_id();
1016 1025
1017 cpu_clear(cpu, cpu_callout_map); 1026 cpu_clear(cpu, cpu_callout_map);
1018 cpu_clear(cpu, cpu_callin_map); 1027 cpu_clear(cpu, cpu_callin_map);
1019 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ 1028 clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */
1020 clear_node_cpumask(cpu); 1029 clear_node_cpumask(cpu);
1021} 1030}
1022 1031
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
index bbfe85a0f69..8bc38af29ae 100644
--- a/arch/x86/kernel/smpcommon_32.c
+++ b/arch/x86/kernel/smpcommon_32.c
@@ -14,10 +14,11 @@ __cpuinit void init_gdt(int cpu)
14{ 14{
15 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 15 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
16 16
17 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, 17 pack_descriptor(&gdt[GDT_ENTRY_PERCPU],
18 (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
19 __per_cpu_offset[cpu], 0xFFFFF, 18 __per_cpu_offset[cpu], 0xFFFFF,
20 0x80 | DESCTYPE_S | 0x2, 0x8); 19 0x2 | DESCTYPE_S, 0x8);
20
21 gdt[GDT_ENTRY_PERCPU].s = 1;
21 22
22 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; 23 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
23 per_cpu(cpu_number, cpu) = cpu; 24 per_cpu(cpu_number, cpu) = cpu;
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 2a8713ec0f9..b72e61359c3 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -57,8 +57,6 @@ static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
57static int num_memory_chunks; /* total number of memory chunks */ 57static int num_memory_chunks; /* total number of memory chunks */
58static u8 __initdata apicid_to_pxm[MAX_APICID]; 58static u8 __initdata apicid_to_pxm[MAX_APICID];
59 59
60extern void * boot_ioremap(unsigned long, unsigned long);
61
62/* Identify CPU proximity domains */ 60/* Identify CPU proximity domains */
63static void __init parse_cpu_affinity_structure(char *p) 61static void __init parse_cpu_affinity_structure(char *p)
64{ 62{
@@ -276,7 +274,7 @@ int __init get_memcfg_from_srat(void)
276 int tables = 0; 274 int tables = 0;
277 int i = 0; 275 int i = 0;
278 276
279 rsdp_address = acpi_find_rsdp(); 277 rsdp_address = acpi_os_get_root_pointer();
280 if (!rsdp_address) { 278 if (!rsdp_address) {
281 printk("%s: System description tables not found\n", 279 printk("%s: System description tables not found\n",
282 __FUNCTION__); 280 __FUNCTION__);
@@ -299,7 +297,7 @@ int __init get_memcfg_from_srat(void)
299 } 297 }
300 298
301 rsdt = (struct acpi_table_rsdt *) 299 rsdt = (struct acpi_table_rsdt *)
302 boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); 300 early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
303 301
304 if (!rsdt) { 302 if (!rsdt) {
305 printk(KERN_WARNING 303 printk(KERN_WARNING
@@ -339,11 +337,11 @@ int __init get_memcfg_from_srat(void)
339 for (i = 0; i < tables; i++) { 337 for (i = 0; i < tables; i++) {
340 /* Map in header, then map in full table length. */ 338 /* Map in header, then map in full table length. */
341 header = (struct acpi_table_header *) 339 header = (struct acpi_table_header *)
342 boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); 340 early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
343 if (!header) 341 if (!header)
344 break; 342 break;
345 header = (struct acpi_table_header *) 343 header = (struct acpi_table_header *)
346 boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); 344 early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
347 if (!header) 345 if (!header)
348 break; 346 break;
349 347
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 6fa6cf036c7..02f0f61f5b1 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -22,9 +22,23 @@ static int save_stack_stack(void *data, char *name)
22 return -1; 22 return -1;
23} 23}
24 24
25static void save_stack_address(void *data, unsigned long addr) 25static void save_stack_address(void *data, unsigned long addr, int reliable)
26{
27 struct stack_trace *trace = data;
28 if (trace->skip > 0) {
29 trace->skip--;
30 return;
31 }
32 if (trace->nr_entries < trace->max_entries)
33 trace->entries[trace->nr_entries++] = addr;
34}
35
36static void
37save_stack_address_nosched(void *data, unsigned long addr, int reliable)
26{ 38{
27 struct stack_trace *trace = (struct stack_trace *)data; 39 struct stack_trace *trace = (struct stack_trace *)data;
40 if (in_sched_functions(addr))
41 return;
28 if (trace->skip > 0) { 42 if (trace->skip > 0) {
29 trace->skip--; 43 trace->skip--;
30 return; 44 return;
@@ -40,13 +54,26 @@ static const struct stacktrace_ops save_stack_ops = {
40 .address = save_stack_address, 54 .address = save_stack_address,
41}; 55};
42 56
57static const struct stacktrace_ops save_stack_ops_nosched = {
58 .warning = save_stack_warning,
59 .warning_symbol = save_stack_warning_symbol,
60 .stack = save_stack_stack,
61 .address = save_stack_address_nosched,
62};
63
43/* 64/*
44 * Save stack-backtrace addresses into a stack_trace buffer. 65 * Save stack-backtrace addresses into a stack_trace buffer.
45 */ 66 */
46void save_stack_trace(struct stack_trace *trace) 67void save_stack_trace(struct stack_trace *trace)
47{ 68{
48 dump_trace(current, NULL, NULL, &save_stack_ops, trace); 69 dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
70 if (trace->nr_entries < trace->max_entries)
71 trace->entries[trace->nr_entries++] = ULONG_MAX;
72}
73
74void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
75{
76 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
49 if (trace->nr_entries < trace->max_entries) 77 if (trace->nr_entries < trace->max_entries)
50 trace->entries[trace->nr_entries++] = ULONG_MAX; 78 trace->entries[trace->nr_entries++] = ULONG_MAX;
51} 79}
52EXPORT_SYMBOL(save_stack_trace);
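
For reference, a minimal usage sketch of the interface touched above (not part of the patch): the caller supplies the entry buffer and limits in struct stack_trace, and save_stack_trace() fills it in, terminating short traces with ULONG_MAX as shown in the hunk. The wrapper function and buffer size below are illustrative only.

	#include <linux/kernel.h>
	#include <linux/stacktrace.h>

	static void demo_dump_backtrace(void)
	{
		unsigned long entries[16];
		struct stack_trace trace = {
			.max_entries	= ARRAY_SIZE(entries),
			.entries	= entries,
			.skip		= 1,	/* drop demo_dump_backtrace() itself */
		};
		unsigned int i;

		save_stack_trace(&trace);

		for (i = 0; i < trace.nr_entries; i++) {
			if (entries[i] == ULONG_MAX)	/* terminator added by save_stack_trace() */
				break;
			printk(KERN_DEBUG "frame %u: [<%08lx>]\n", i, entries[i]);
		}
	}
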
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
new file mode 100644
index 00000000000..2ef1a5f8d67
--- /dev/null
+++ b/arch/x86/kernel/step.c
@@ -0,0 +1,203 @@
1/*
2 * x86 single-step support code, common to 32-bit and 64-bit.
3 */
4#include <linux/sched.h>
5#include <linux/mm.h>
6#include <linux/ptrace.h>
7
8unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
9{
10 unsigned long addr, seg;
11
12 addr = regs->ip;
13 seg = regs->cs & 0xffff;
14 if (v8086_mode(regs)) {
15 addr = (addr & 0xffff) + (seg << 4);
16 return addr;
17 }
18
19 /*
20 * We'll assume that the code segments in the GDT
21 * are all zero-based. That is largely true: the
22 * TLS segments are used for data, and the PNPBIOS
23 * and APM bios ones we just ignore here.
24 */
25 if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
26 u32 *desc;
27 unsigned long base;
28
29 seg &= ~7UL;
30
31 mutex_lock(&child->mm->context.lock);
32 if (unlikely((seg >> 3) >= child->mm->context.size))
33 addr = -1L; /* bogus selector, access would fault */
34 else {
35 desc = child->mm->context.ldt + seg;
36 base = ((desc[0] >> 16) |
37 ((desc[1] & 0xff) << 16) |
38 (desc[1] & 0xff000000));
39
40 /* 16-bit code segment? */
41 if (!((desc[1] >> 22) & 1))
42 addr &= 0xffff;
43 addr += base;
44 }
45 mutex_unlock(&child->mm->context.lock);
46 }
47
48 return addr;
49}
50
51static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
52{
53 int i, copied;
54 unsigned char opcode[15];
55 unsigned long addr = convert_ip_to_linear(child, regs);
56
57 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
58 for (i = 0; i < copied; i++) {
59 switch (opcode[i]) {
60 /* popf and iret */
61 case 0x9d: case 0xcf:
62 return 1;
63
64 /* CHECKME: 64 65 */
65
66 /* opcode and address size prefixes */
67 case 0x66: case 0x67:
68 continue;
69 /* irrelevant prefixes (segment overrides and repeats) */
70 case 0x26: case 0x2e:
71 case 0x36: case 0x3e:
72 case 0x64: case 0x65:
73 case 0xf0: case 0xf2: case 0xf3:
74 continue;
75
76#ifdef CONFIG_X86_64
77 case 0x40 ... 0x4f:
78 if (regs->cs != __USER_CS)
79 /* 32-bit mode: register increment */
80 return 0;
81 /* 64-bit mode: REX prefix */
82 continue;
83#endif
84
85 /* CHECKME: f2, f3 */
86
87 /*
88 * pushf: NOTE! We should probably not let
89 * the user see the TF bit being set. But
90 * it's more pain than it's worth to avoid
91 * it, and a debugger could emulate this
92 * all in user space if it _really_ cares.
93 */
94 case 0x9c:
95 default:
96 return 0;
97 }
98 }
99 return 0;
100}
101
102/*
103 * Enable single-stepping. Return nonzero if user mode is not using TF itself.
104 */
105static int enable_single_step(struct task_struct *child)
106{
107 struct pt_regs *regs = task_pt_regs(child);
108
109 /*
110 * Always set TIF_SINGLESTEP - this guarantees that
111 * we single-step system calls etc.. This will also
112 * cause us to set TF when returning to user mode.
113 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115
116 /*
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121
122 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF;
124
125 /*
126 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later.
129 */
130 if (is_setting_trap_flag(child, regs))
131 return 0;
132
133 set_tsk_thread_flag(child, TIF_FORCED_TF);
134
135 return 1;
136}
137
138/*
139 * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
140 */
141static void write_debugctlmsr(struct task_struct *child, unsigned long val)
142{
143 child->thread.debugctlmsr = val;
144
145 if (child != current)
146 return;
147
148 wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
149}
150
151/*
152 * Enable single or block step.
153 */
154static void enable_step(struct task_struct *child, bool block)
155{
156 /*
157 * Make sure block stepping (BTF) is not enabled unless it should be.
158 * Note that we don't try to worry about any is_setting_trap_flag()
159 * instructions after the first when using block stepping.
160	 * So no one should try to use debugger block stepping in a program
161 * that uses user-mode single stepping itself.
162 */
163 if (enable_single_step(child) && block) {
164 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
165 write_debugctlmsr(child,
166 child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
167 } else {
168 write_debugctlmsr(child,
169 child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR);
170
171 if (!child->thread.debugctlmsr)
172 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
173 }
174}
175
176void user_enable_single_step(struct task_struct *child)
177{
178 enable_step(child, 0);
179}
180
181void user_enable_block_step(struct task_struct *child)
182{
183 enable_step(child, 1);
184}
185
186void user_disable_single_step(struct task_struct *child)
187{
188 /*
189 * Make sure block stepping (BTF) is disabled.
190 */
191 write_debugctlmsr(child,
192 child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR);
193
194 if (!child->thread.debugctlmsr)
195 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
196
197 /* Always clear TIF_SINGLESTEP... */
198 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
199
200 /* But touch TF only if it was set by us.. */
201 if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
202 task_pt_regs(child)->flags &= ~X86_EFLAGS_TF;
203}
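
The new step.c sits behind the ptrace resume path; presumably PTRACE_SINGLESTEP (and PTRACE_SINGLEBLOCK, via the arch ptrace code) is what ends up calling user_enable_single_step()/user_enable_block_step() above. A rough user-space sketch of driving single-step, purely for illustration (the traced program and the step counting are arbitrary):

	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		pid_t child = fork();

		if (child == 0) {
			/* child: request tracing, then run a short program */
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			execlp("true", "true", (char *)NULL);
			_exit(1);
		}

		int status;
		long steps = 0;

		waitpid(child, &status, 0);	/* initial stop after exec */
		while (WIFSTOPPED(status)) {
			if (ptrace(PTRACE_SINGLESTEP, child, NULL, NULL) < 0)
				break;
			waitpid(child, &status, 0);
			steps++;
		}
		printf("single-stepped %ld instructions\n", steps);
		return 0;
	}
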
diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c
index 2e5efaaf880..7ac7130022f 100644
--- a/arch/x86/kernel/suspend_64.c
+++ b/arch/x86/kernel/suspend_64.c
@@ -17,9 +17,26 @@
17/* References to section boundaries */ 17/* References to section boundaries */
18extern const void __nosave_begin, __nosave_end; 18extern const void __nosave_begin, __nosave_end;
19 19
20static void fix_processor_context(void);
21
20struct saved_context saved_context; 22struct saved_context saved_context;
21 23
22void __save_processor_state(struct saved_context *ctxt) 24/**
25 * __save_processor_state - save CPU registers before creating a
26 * hibernation image and before restoring the memory state from it
27 * @ctxt - structure to store the registers contents in
28 *
29 * NOTE: If there is a CPU register the modification of which by the
30 * boot kernel (ie. the kernel used for loading the hibernation image)
31 * might affect the operations of the restored target kernel (ie. the one
32 * saved in the hibernation image), then its contents must be saved by this
33 * function. In other words, if kernel A is hibernated and different
34 * kernel B is used for loading the hibernation image into memory, the
35 * kernel A's __save_processor_state() function must save all registers
36 * needed by kernel A, so that it can operate correctly after the resume
37 * regardless of what kernel B does in the meantime.
38 */
39static void __save_processor_state(struct saved_context *ctxt)
23{ 40{
24 kernel_fpu_begin(); 41 kernel_fpu_begin();
25 42
@@ -69,7 +86,12 @@ static void do_fpu_end(void)
69 kernel_fpu_end(); 86 kernel_fpu_end();
70} 87}
71 88
72void __restore_processor_state(struct saved_context *ctxt) 89/**
90 * __restore_processor_state - restore the contents of CPU registers saved
91 * by __save_processor_state()
92 * @ctxt - structure to load the registers contents from
93 */
94static void __restore_processor_state(struct saved_context *ctxt)
73{ 95{
74 /* 96 /*
75 * control registers 97 * control registers
@@ -113,14 +135,19 @@ void restore_processor_state(void)
113 __restore_processor_state(&saved_context); 135 __restore_processor_state(&saved_context);
114} 136}
115 137
116void fix_processor_context(void) 138static void fix_processor_context(void)
117{ 139{
118 int cpu = smp_processor_id(); 140 int cpu = smp_processor_id();
119 struct tss_struct *t = &per_cpu(init_tss, cpu); 141 struct tss_struct *t = &per_cpu(init_tss, cpu);
120 142
121 set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ 143 /*
144 * This just modifies memory; should not be necessary. But... This
145 * is necessary, because 386 hardware has concept of busy TSS or some
146 * similar stupidity.
147 */
148 set_tss_desc(cpu, t);
122 149
123 cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9; 150 get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
124 151
125 syscall_init(); /* This sets MSR_*STAR and related */ 152 syscall_init(); /* This sets MSR_*STAR and related */
126 load_TR_desc(); /* This does ltr */ 153 load_TR_desc(); /* This does ltr */
@@ -138,7 +165,6 @@ void fix_processor_context(void)
138 loaddebug(&current->thread, 6); 165 loaddebug(&current->thread, 6);
139 loaddebug(&current->thread, 7); 166 loaddebug(&current->thread, 7);
140 } 167 }
141
142} 168}
143 169
144#ifdef CONFIG_HIBERNATION 170#ifdef CONFIG_HIBERNATION
diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S
index 72f952103e5..aeb9a4d7681 100644
--- a/arch/x86/kernel/suspend_asm_64.S
+++ b/arch/x86/kernel/suspend_asm_64.S
@@ -18,13 +18,13 @@
18 18
19ENTRY(swsusp_arch_suspend) 19ENTRY(swsusp_arch_suspend)
20 movq $saved_context, %rax 20 movq $saved_context, %rax
21 movq %rsp, pt_regs_rsp(%rax) 21 movq %rsp, pt_regs_sp(%rax)
22 movq %rbp, pt_regs_rbp(%rax) 22 movq %rbp, pt_regs_bp(%rax)
23 movq %rsi, pt_regs_rsi(%rax) 23 movq %rsi, pt_regs_si(%rax)
24 movq %rdi, pt_regs_rdi(%rax) 24 movq %rdi, pt_regs_di(%rax)
25 movq %rbx, pt_regs_rbx(%rax) 25 movq %rbx, pt_regs_bx(%rax)
26 movq %rcx, pt_regs_rcx(%rax) 26 movq %rcx, pt_regs_cx(%rax)
27 movq %rdx, pt_regs_rdx(%rax) 27 movq %rdx, pt_regs_dx(%rax)
28 movq %r8, pt_regs_r8(%rax) 28 movq %r8, pt_regs_r8(%rax)
29 movq %r9, pt_regs_r9(%rax) 29 movq %r9, pt_regs_r9(%rax)
30 movq %r10, pt_regs_r10(%rax) 30 movq %r10, pt_regs_r10(%rax)
@@ -34,7 +34,7 @@ ENTRY(swsusp_arch_suspend)
34 movq %r14, pt_regs_r14(%rax) 34 movq %r14, pt_regs_r14(%rax)
35 movq %r15, pt_regs_r15(%rax) 35 movq %r15, pt_regs_r15(%rax)
36 pushfq 36 pushfq
37 popq pt_regs_eflags(%rax) 37 popq pt_regs_flags(%rax)
38 38
39 /* save the address of restore_registers */ 39 /* save the address of restore_registers */
40 movq $restore_registers, %rax 40 movq $restore_registers, %rax
@@ -115,13 +115,13 @@ ENTRY(restore_registers)
115 115
116 /* We don't restore %rax, it must be 0 anyway */ 116 /* We don't restore %rax, it must be 0 anyway */
117 movq $saved_context, %rax 117 movq $saved_context, %rax
118 movq pt_regs_rsp(%rax), %rsp 118 movq pt_regs_sp(%rax), %rsp
119 movq pt_regs_rbp(%rax), %rbp 119 movq pt_regs_bp(%rax), %rbp
120 movq pt_regs_rsi(%rax), %rsi 120 movq pt_regs_si(%rax), %rsi
121 movq pt_regs_rdi(%rax), %rdi 121 movq pt_regs_di(%rax), %rdi
122 movq pt_regs_rbx(%rax), %rbx 122 movq pt_regs_bx(%rax), %rbx
123 movq pt_regs_rcx(%rax), %rcx 123 movq pt_regs_cx(%rax), %rcx
124 movq pt_regs_rdx(%rax), %rdx 124 movq pt_regs_dx(%rax), %rdx
125 movq pt_regs_r8(%rax), %r8 125 movq pt_regs_r8(%rax), %r8
126 movq pt_regs_r9(%rax), %r9 126 movq pt_regs_r9(%rax), %r9
127 movq pt_regs_r10(%rax), %r10 127 movq pt_regs_r10(%rax), %r10
@@ -130,7 +130,7 @@ ENTRY(restore_registers)
130 movq pt_regs_r13(%rax), %r13 130 movq pt_regs_r13(%rax), %r13
131 movq pt_regs_r14(%rax), %r14 131 movq pt_regs_r14(%rax), %r14
132 movq pt_regs_r15(%rax), %r15 132 movq pt_regs_r15(%rax), %r15
133 pushq pt_regs_eflags(%rax) 133 pushq pt_regs_flags(%rax)
134 popfq 134 popfq
135 135
136 xorq %rax, %rax 136 xorq %rax, %rax
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 907942ee6e7..bd802a5e1aa 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -12,6 +12,7 @@
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/ia32.h> 18#include <asm/ia32.h>
@@ -65,6 +66,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
65 unsigned long *end) 66 unsigned long *end)
66{ 67{
67 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { 68 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
69 unsigned long new_begin;
68 /* This is usually used needed to map code in small 70 /* This is usually used needed to map code in small
69 model, so it needs to be in the first 31bit. Limit 71 model, so it needs to be in the first 31bit. Limit
70 it to that. This means we need to move the 72 it to that. This means we need to move the
@@ -74,6 +76,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
74 of playground for now. -AK */ 76 of playground for now. -AK */
75 *begin = 0x40000000; 77 *begin = 0x40000000;
76 *end = 0x80000000; 78 *end = 0x80000000;
79 if (current->flags & PF_RANDOMIZE) {
80 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
81 if (new_begin)
82 *begin = new_begin;
83 }
77 } else { 84 } else {
78 *begin = TASK_UNMAPPED_BASE; 85 *begin = TASK_UNMAPPED_BASE;
79 *end = TASK_SIZE; 86 *end = TASK_SIZE;
@@ -143,6 +150,97 @@ full_search:
143 } 150 }
144} 151}
145 152
153
154unsigned long
155arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
156 const unsigned long len, const unsigned long pgoff,
157 const unsigned long flags)
158{
159 struct vm_area_struct *vma;
160 struct mm_struct *mm = current->mm;
161 unsigned long addr = addr0;
162
163 /* requested length too big for entire address space */
164 if (len > TASK_SIZE)
165 return -ENOMEM;
166
167 if (flags & MAP_FIXED)
168 return addr;
169
170	/* for MAP_32BIT mappings we force the legacy mmap base */
171 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT))
172 goto bottomup;
173
174 /* requesting a specific address */
175 if (addr) {
176 addr = PAGE_ALIGN(addr);
177 vma = find_vma(mm, addr);
178 if (TASK_SIZE - len >= addr &&
179 (!vma || addr + len <= vma->vm_start))
180 return addr;
181 }
182
183 /* check if free_area_cache is useful for us */
184 if (len <= mm->cached_hole_size) {
185 mm->cached_hole_size = 0;
186 mm->free_area_cache = mm->mmap_base;
187 }
188
189 /* either no address requested or can't fit in requested address hole */
190 addr = mm->free_area_cache;
191
192 /* make sure it can fit in the remaining address space */
193 if (addr > len) {
194 vma = find_vma(mm, addr-len);
195 if (!vma || addr <= vma->vm_start)
196 /* remember the address as a hint for next time */
197 return (mm->free_area_cache = addr-len);
198 }
199
200 if (mm->mmap_base < len)
201 goto bottomup;
202
203 addr = mm->mmap_base-len;
204
205 do {
206 /*
207 * Lookup failure means no vma is above this address,
208 * else if new region fits below vma->vm_start,
209 * return with success:
210 */
211 vma = find_vma(mm, addr);
212 if (!vma || addr+len <= vma->vm_start)
213 /* remember the address as a hint for next time */
214 return (mm->free_area_cache = addr);
215
216 /* remember the largest hole we saw so far */
217 if (addr + mm->cached_hole_size < vma->vm_start)
218 mm->cached_hole_size = vma->vm_start - addr;
219
220 /* try just below the current vma->vm_start */
221 addr = vma->vm_start-len;
222 } while (len < vma->vm_start);
223
224bottomup:
225 /*
226 * A failed mmap() very likely causes application failure,
227 * so fall back to the bottom-up function here. This scenario
228 * can happen with large stack limits and large mmap()
229 * allocations.
230 */
231 mm->cached_hole_size = ~0UL;
232 mm->free_area_cache = TASK_UNMAPPED_BASE;
233 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
234 /*
235 * Restore the topdown base:
236 */
237 mm->free_area_cache = mm->mmap_base;
238 mm->cached_hole_size = ~0UL;
239
240 return addr;
241}
242
243
146asmlinkage long sys_uname(struct new_utsname __user * name) 244asmlinkage long sys_uname(struct new_utsname __user * name)
147{ 245{
148 int err; 246 int err;
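
The effect of the top-down allocator added above is easiest to see from user space: with the default layout, successive anonymous mmap()s are handed out from just below mm->mmap_base and work downwards, while MAP_32BIT requests take the bottom-up path in the low 31-bit window set up by find_start_end(). A small sketch (not part of the patch; the observed addresses naturally depend on the process actually using the top-down layout and on randomization):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		void *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		void *b = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		/* forced into the legacy low 31-bit window by find_start_end() */
		void *c = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);

		printf("anon #1:   %p\nanon #2:   %p\nMAP_32BIT: %p\n", a, b, c);
		return 0;
	}
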
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8344c70adf6..adff5562f5f 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -321,6 +321,8 @@ ENTRY(sys_call_table)
321 .long sys_epoll_pwait 321 .long sys_epoll_pwait
322 .long sys_utimensat /* 320 */ 322 .long sys_utimensat /* 320 */
323 .long sys_signalfd 323 .long sys_signalfd
324 .long sys_timerfd 324 .long sys_timerfd_create
325 .long sys_eventfd 325 .long sys_eventfd
326 .long sys_fallocate 326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
new file mode 100644
index 00000000000..10b8a6f69f8
--- /dev/null
+++ b/arch/x86/kernel/test_nx.c
@@ -0,0 +1,173 @@
1/*
2 * test_nx.c: functional test for NX functionality
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/module.h>
13#include <linux/sort.h>
14#include <asm/uaccess.h>
15#include <asm/asm.h>
16
17extern int rodata_test_data;
18
19/*
20 * This file checks 4 things:
21 * 1) Check if the stack is not executable
22 * 2) Check if kmalloc memory is not executable
23 * 3) Check if the .rodata section is not executable
24 * 4) Check if the .data section of a module is not executable
25 *
26 * To do this, the test code tries to execute memory in stack/kmalloc/etc,
27 * and then checks if the expected trap happens.
28 *
29 * Sadly, this implies having a dynamic exception handling table entry.
30 * ... which can be done (and will make Rusty cry)... but it can only
31 * be done in a stand-alone module with only 1 entry total.
32 * (otherwise we'd have to sort and that's just too messy)
33 */
34
35
36
37/*
38 * We want to set up an exception handling point on our stack,
39 * which means a variable value. This function is rather dirty
40 * and walks the exception table of the module, looking for a magic
41 * marker and replacing it with a specific function.
42 */
43static void fudze_exception_table(void *marker, void *new)
44{
45 struct module *mod = THIS_MODULE;
46 struct exception_table_entry *extable;
47
48 /*
49 * Note: This module has only 1 exception table entry,
50 * so searching and sorting is not needed. If that changes,
51 * this would be the place to search and re-sort the exception
52 * table.
53 */
54 if (mod->num_exentries > 1) {
55 printk(KERN_ERR "test_nx: too many exception table entries!\n");
56 printk(KERN_ERR "test_nx: test results are not reliable.\n");
57 return;
58 }
59 extable = (struct exception_table_entry *)mod->extable;
60 extable[0].insn = (unsigned long)new;
61}
62
63
64/*
65 * exception tables get their symbols translated so we need
66 * to use a fake function to put in there, which we can then
67 * replace at runtime.
68 */
69void foo_label(void);
70
71/*
72 * returns 0 for not-executable, negative for executable
73 *
74 * Note: we cannot allow this function to be inlined, because
75 * that would give us more than 1 exception table entry.
76 * This in turn would break the assumptions above.
77 */
78static noinline int test_address(void *address)
79{
80 unsigned long result;
81
82 /* Set up an exception table entry for our address */
83 fudze_exception_table(&foo_label, address);
84 result = 1;
85 asm volatile(
86 "foo_label:\n"
87 "0: call *%[fake_code]\n"
88 "1:\n"
89 ".section .fixup,\"ax\"\n"
90 "2: mov %[zero], %[rslt]\n"
91 " ret\n"
92 ".previous\n"
93 _ASM_EXTABLE(0b,2b)
94 : [rslt] "=r" (result)
95 : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result)
96 );
97 /* change the exception table back for the next round */
98 fudze_exception_table(address, &foo_label);
99
100 if (result)
101 return -ENODEV;
102 return 0;
103}
104
105static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */
106
107static int test_NX(void)
108{
109 int ret = 0;
110 /* 0xC3 is the opcode for "ret" */
111 char stackcode[] = {0xC3, 0x90, 0 };
112 char *heap;
113
114 test_data = 0xC3;
115
116 printk(KERN_INFO "Testing NX protection\n");
117
118 /* Test 1: check if the stack is not executable */
119 if (test_address(&stackcode)) {
120 printk(KERN_ERR "test_nx: stack was executable\n");
121 ret = -ENODEV;
122 }
123
124
125 /* Test 2: Check if the heap is executable */
126 heap = kmalloc(64, GFP_KERNEL);
127 if (!heap)
128 return -ENOMEM;
129 heap[0] = 0xC3; /* opcode for "ret" */
130
131 if (test_address(heap)) {
132 printk(KERN_ERR "test_nx: heap was executable\n");
133 ret = -ENODEV;
134 }
135 kfree(heap);
136
137 /*
138 * The following 2 tests currently fail, this needs to get fixed
139 * Until then, don't run them to avoid too many people getting scared
140 * by the error message
141 */
142
143#ifdef CONFIG_DEBUG_RODATA
144 /* Test 3: Check if the .rodata section is executable */
145 if (rodata_test_data != 0xC3) {
146 printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
147 ret = -ENODEV;
148 } else if (test_address(&rodata_test_data)) {
149 printk(KERN_ERR "test_nx: .rodata section is executable\n");
150 ret = -ENODEV;
151 }
152#endif
153
154#if 0
155 /* Test 4: Check if the .data section of a module is executable */
156 if (test_address(&test_data)) {
157 printk(KERN_ERR "test_nx: .data section is executable\n");
158 ret = -ENODEV;
159 }
160
161#endif
162 return 0;
163}
164
165static void test_exit(void)
166{
167}
168
169module_init(test_NX);
170module_exit(test_exit);
171MODULE_LICENSE("GPL");
172MODULE_DESCRIPTION("Testcase for the NX infrastructure");
173MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
new file mode 100644
index 00000000000..4c163772000
--- /dev/null
+++ b/arch/x86/kernel/test_rodata.c
@@ -0,0 +1,86 @@
1/*
2 * test_rodata.c: functional test for mark_rodata_ro function
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/module.h>
13#include <asm/sections.h>
14extern int rodata_test_data;
15
16int rodata_test(void)
17{
18 unsigned long result;
19 unsigned long start, end;
20
21 /* test 1: read the value */
22 /* If this test fails, some previous testrun has clobbered the state */
23 if (!rodata_test_data) {
24 printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
25 return -ENODEV;
26 }
27
28 /* test 2: write to the variable; this should fault */
29 /*
30 * If this test fails, we managed to overwrite the data
31 *
32 * This is written in assembly to be able to catch the
33 * exception that is supposed to happen in the correct
34 * case
35 */
36
37 result = 1;
38 asm volatile(
39 "0: mov %[zero],(%[rodata_test])\n"
40 " mov %[zero], %[rslt]\n"
41 "1:\n"
42 ".section .fixup,\"ax\"\n"
43 "2: jmp 1b\n"
44 ".previous\n"
45 ".section __ex_table,\"a\"\n"
46 " .align 16\n"
47#ifdef CONFIG_X86_32
48 " .long 0b,2b\n"
49#else
50 " .quad 0b,2b\n"
51#endif
52 ".previous"
53 : [rslt] "=r" (result)
54 : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
55 );
56
57
58 if (!result) {
59 printk(KERN_ERR "rodata_test: test data was not read only\n");
60 return -ENODEV;
61 }
62
63 /* test 3: check the value hasn't changed */
64 /* If this test fails, we managed to overwrite the data */
65 if (!rodata_test_data) {
66 printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n");
67 return -ENODEV;
68 }
69 /* test 4: check if the rodata section is 4Kb aligned */
70 start = (unsigned long)__start_rodata;
71 end = (unsigned long)__end_rodata;
72 if (start & (PAGE_SIZE - 1)) {
73 printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
74 return -ENODEV;
75 }
76 if (end & (PAGE_SIZE - 1)) {
77 printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
78 return -ENODEV;
79 }
80
81 return 0;
82}
83
84MODULE_LICENSE("GPL");
85MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
86MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 8a322c96bc2..1a89e93f3f1 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -28,98 +28,20 @@
28 * serialize accesses to xtime/lost_ticks). 28 * serialize accesses to xtime/lost_ticks).
29 */ 29 */
30 30
31#include <linux/errno.h> 31#include <linux/init.h>
32#include <linux/sched.h>
33#include <linux/kernel.h>
34#include <linux/param.h>
35#include <linux/string.h>
36#include <linux/mm.h>
37#include <linux/interrupt.h> 32#include <linux/interrupt.h>
38#include <linux/time.h> 33#include <linux/time.h>
39#include <linux/delay.h>
40#include <linux/init.h>
41#include <linux/smp.h>
42#include <linux/module.h>
43#include <linux/sysdev.h>
44#include <linux/bcd.h>
45#include <linux/efi.h>
46#include <linux/mca.h> 34#include <linux/mca.h>
47 35
48#include <asm/io.h>
49#include <asm/smp.h>
50#include <asm/irq.h>
51#include <asm/msr.h>
52#include <asm/delay.h>
53#include <asm/mpspec.h>
54#include <asm/uaccess.h>
55#include <asm/processor.h>
56#include <asm/timer.h>
57#include <asm/time.h>
58
59#include "mach_time.h"
60
61#include <linux/timex.h>
62
63#include <asm/hpet.h>
64
65#include <asm/arch_hooks.h> 36#include <asm/arch_hooks.h>
66 37#include <asm/hpet.h>
67#include "io_ports.h" 38#include <asm/time.h>
68
69#include <asm/i8259.h>
70 39
71#include "do_timer.h" 40#include "do_timer.h"
72 41
73unsigned int cpu_khz; /* Detected as we calibrate the TSC */ 42unsigned int cpu_khz; /* Detected as we calibrate the TSC */
74EXPORT_SYMBOL(cpu_khz); 43EXPORT_SYMBOL(cpu_khz);
75 44
76DEFINE_SPINLOCK(rtc_lock);
77EXPORT_SYMBOL(rtc_lock);
78
79/*
80 * This is a special lock that is owned by the CPU and holds the index
81 * register we are working with. It is required for NMI access to the
82 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
83 */
84volatile unsigned long cmos_lock = 0;
85EXPORT_SYMBOL(cmos_lock);
86
87/* Routines for accessing the CMOS RAM/RTC. */
88unsigned char rtc_cmos_read(unsigned char addr)
89{
90 unsigned char val;
91 lock_cmos_prefix(addr);
92 outb_p(addr, RTC_PORT(0));
93 val = inb_p(RTC_PORT(1));
94 lock_cmos_suffix(addr);
95 return val;
96}
97EXPORT_SYMBOL(rtc_cmos_read);
98
99void rtc_cmos_write(unsigned char val, unsigned char addr)
100{
101 lock_cmos_prefix(addr);
102 outb_p(addr, RTC_PORT(0));
103 outb_p(val, RTC_PORT(1));
104 lock_cmos_suffix(addr);
105}
106EXPORT_SYMBOL(rtc_cmos_write);
107
108static int set_rtc_mmss(unsigned long nowtime)
109{
110 int retval;
111 unsigned long flags;
112
113 /* gets recalled with irq locally disabled */
114 /* XXX - does irqsave resolve this? -johnstul */
115 spin_lock_irqsave(&rtc_lock, flags);
116 retval = set_wallclock(nowtime);
117 spin_unlock_irqrestore(&rtc_lock, flags);
118
119 return retval;
120}
121
122
123int timer_ack; 45int timer_ack;
124 46
125unsigned long profile_pc(struct pt_regs *regs) 47unsigned long profile_pc(struct pt_regs *regs)
@@ -127,17 +49,17 @@ unsigned long profile_pc(struct pt_regs *regs)
127 unsigned long pc = instruction_pointer(regs); 49 unsigned long pc = instruction_pointer(regs);
128 50
129#ifdef CONFIG_SMP 51#ifdef CONFIG_SMP
130 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) && 52 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) &&
131 in_lock_functions(pc)) { 53 in_lock_functions(pc)) {
132#ifdef CONFIG_FRAME_POINTER 54#ifdef CONFIG_FRAME_POINTER
133 return *(unsigned long *)(regs->ebp + 4); 55 return *(unsigned long *)(regs->bp + 4);
134#else 56#else
135 unsigned long *sp = (unsigned long *)&regs->esp; 57 unsigned long *sp = (unsigned long *)&regs->sp;
136 58
137 /* Return address is either directly at stack pointer 59 /* Return address is either directly at stack pointer
138 or above a saved eflags. Eflags has bits 22-31 zero, 60 or above a saved flags. Eflags has bits 22-31 zero,
139 kernel addresses don't. */ 61 kernel addresses don't. */
140 if (sp[0] >> 22) 62 if (sp[0] >> 22)
141 return sp[0]; 63 return sp[0];
142 if (sp[1] >> 22) 64 if (sp[1] >> 22)
143 return sp[1]; 65 return sp[1];
@@ -193,26 +115,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
193 return IRQ_HANDLED; 115 return IRQ_HANDLED;
194} 116}
195 117
196/* not static: needed by APM */
197unsigned long read_persistent_clock(void)
198{
199 unsigned long retval;
200 unsigned long flags;
201
202 spin_lock_irqsave(&rtc_lock, flags);
203
204 retval = get_wallclock();
205
206 spin_unlock_irqrestore(&rtc_lock, flags);
207
208 return retval;
209}
210
211int update_persistent_clock(struct timespec now)
212{
213 return set_rtc_mmss(now.tv_sec);
214}
215
216extern void (*late_time_init)(void); 118extern void (*late_time_init)(void);
217/* Duplicate of time_init() below, with hpet_enable part added */ 119/* Duplicate of time_init() below, with hpet_enable part added */
218void __init hpet_time_init(void) 120void __init hpet_time_init(void)
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index 368b1942b39..0380795121a 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -11,43 +11,18 @@
11 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c 11 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
12 */ 12 */
13 13
14#include <linux/kernel.h> 14#include <linux/clockchips.h>
15#include <linux/sched.h>
16#include <linux/interrupt.h>
17#include <linux/init.h> 15#include <linux/init.h>
18#include <linux/mc146818rtc.h> 16#include <linux/interrupt.h>
19#include <linux/time.h>
20#include <linux/ioport.h>
21#include <linux/module.h> 17#include <linux/module.h>
22#include <linux/device.h> 18#include <linux/time.h>
23#include <linux/sysdev.h>
24#include <linux/bcd.h>
25#include <linux/notifier.h>
26#include <linux/cpu.h>
27#include <linux/kallsyms.h>
28#include <linux/acpi.h>
29#include <linux/clockchips.h>
30 19
31#ifdef CONFIG_ACPI
32#include <acpi/achware.h> /* for PM timer frequency */
33#include <acpi/acpi_bus.h>
34#endif
35#include <asm/i8253.h> 20#include <asm/i8253.h>
36#include <asm/pgtable.h>
37#include <asm/vsyscall.h>
38#include <asm/timex.h>
39#include <asm/proto.h>
40#include <asm/hpet.h>
41#include <asm/sections.h>
42#include <linux/hpet.h>
43#include <asm/apic.h>
44#include <asm/hpet.h> 21#include <asm/hpet.h>
45#include <asm/mpspec.h>
46#include <asm/nmi.h> 22#include <asm/nmi.h>
47#include <asm/vgtod.h> 23#include <asm/vgtod.h>
48 24#include <asm/time.h>
49DEFINE_SPINLOCK(rtc_lock); 25#include <asm/timer.h>
50EXPORT_SYMBOL(rtc_lock);
51 26
52volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 27volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
53 28
@@ -56,10 +31,10 @@ unsigned long profile_pc(struct pt_regs *regs)
56 unsigned long pc = instruction_pointer(regs); 31 unsigned long pc = instruction_pointer(regs);
57 32
58 /* Assume the lock function has either no stack frame or a copy 33 /* Assume the lock function has either no stack frame or a copy
59 of eflags from PUSHF 34 of flags from PUSHF
60 Eflags always has bits 22 and up cleared unlike kernel addresses. */ 35 Eflags always has bits 22 and up cleared unlike kernel addresses. */
61 if (!user_mode(regs) && in_lock_functions(pc)) { 36 if (!user_mode(regs) && in_lock_functions(pc)) {
62 unsigned long *sp = (unsigned long *)regs->rsp; 37 unsigned long *sp = (unsigned long *)regs->sp;
63 if (sp[0] >> 22) 38 if (sp[0] >> 22)
64 return sp[0]; 39 return sp[0];
65 if (sp[1] >> 22) 40 if (sp[1] >> 22)
@@ -69,82 +44,6 @@ unsigned long profile_pc(struct pt_regs *regs)
69} 44}
70EXPORT_SYMBOL(profile_pc); 45EXPORT_SYMBOL(profile_pc);
71 46
72/*
73 * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
74 * ms after the second nowtime has started, because when nowtime is written
75 * into the registers of the CMOS clock, it will jump to the next second
76 * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
77 * sheet for details.
78 */
79
80static int set_rtc_mmss(unsigned long nowtime)
81{
82 int retval = 0;
83 int real_seconds, real_minutes, cmos_minutes;
84 unsigned char control, freq_select;
85 unsigned long flags;
86
87/*
88 * set_rtc_mmss is called when irqs are enabled, so disable irqs here
89 */
90 spin_lock_irqsave(&rtc_lock, flags);
91/*
92 * Tell the clock it's being set and stop it.
93 */
94 control = CMOS_READ(RTC_CONTROL);
95 CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
96
97 freq_select = CMOS_READ(RTC_FREQ_SELECT);
98 CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
99
100 cmos_minutes = CMOS_READ(RTC_MINUTES);
101 BCD_TO_BIN(cmos_minutes);
102
103/*
104 * since we're only adjusting minutes and seconds, don't interfere with hour
105 * overflow. This avoids messing with unknown time zones but requires your RTC
106 * not to be off by more than 15 minutes. Since we're calling it only when
107 * our clock is externally synchronized using NTP, this shouldn't be a problem.
108 */
109
110 real_seconds = nowtime % 60;
111 real_minutes = nowtime / 60;
112 if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
113 real_minutes += 30; /* correct for half hour time zone */
114 real_minutes %= 60;
115
116 if (abs(real_minutes - cmos_minutes) >= 30) {
117 printk(KERN_WARNING "time.c: can't update CMOS clock "
118 "from %d to %d\n", cmos_minutes, real_minutes);
119 retval = -1;
120 } else {
121 BIN_TO_BCD(real_seconds);
122 BIN_TO_BCD(real_minutes);
123 CMOS_WRITE(real_seconds, RTC_SECONDS);
124 CMOS_WRITE(real_minutes, RTC_MINUTES);
125 }
126
127/*
128 * The following flags have to be released exactly in this order, otherwise the
129 * DS12887 (popular MC146818A clone with integrated battery and quartz) will
130 * not reset the oscillator and will not update precisely 500 ms later. You
131 * won't find this mentioned in the Dallas Semiconductor data sheets, but who
132 * believes data sheets anyway ... -- Markus Kuhn
133 */
134
135 CMOS_WRITE(control, RTC_CONTROL);
136 CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
137
138 spin_unlock_irqrestore(&rtc_lock, flags);
139
140 return retval;
141}
142
143int update_persistent_clock(struct timespec now)
144{
145 return set_rtc_mmss(now.tv_sec);
146}
147
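The removed set_rtc_mmss() only rewrites the RTC seconds and minutes, so it has to tolerate RTCs kept on half-hour-offset local time while refusing to paper over larger drift. That adjustment is easy to check in isolation; the sketch below copies the arithmetic into a standalone program (the helper name adjust_minutes is made up for the illustration).

#include <stdio.h>
#include <stdlib.h>

/* Mirror of the minutes-only adjustment above: if the RTC appears to run on
 * a half-hour timezone offset, shift the target by 30 minutes so only the
 * sub-half-hour drift is corrected.  Returns -1 when the RTC is too far off
 * to fix by touching minutes/seconds alone. */
static int adjust_minutes(int real_minutes, int cmos_minutes)
{
	if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
		real_minutes += 30;	/* correct for half hour time zone */
	real_minutes %= 60;

	if (abs(real_minutes - cmos_minutes) >= 30)
		return -1;		/* would have to touch the hours */
	return real_minutes;
}

int main(void)
{
	/* system time says :02, the RTC says :32 -> half-hour zone, keep :32 */
	printf("%d\n", adjust_minutes(2, 32));	/* prints 32 */
	/* an ordinary 3 minute drift is corrected directly */
	printf("%d\n", adjust_minutes(10, 7));	/* prints 10 */
	return 0;
}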
148static irqreturn_t timer_event_interrupt(int irq, void *dev_id) 47static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
149{ 48{
150 add_pda(irq0_irqs, 1); 49 add_pda(irq0_irqs, 1);
@@ -154,67 +53,10 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
154 return IRQ_HANDLED; 53 return IRQ_HANDLED;
155} 54}
156 55
157unsigned long read_persistent_clock(void)
158{
159 unsigned int year, mon, day, hour, min, sec;
160 unsigned long flags;
161 unsigned century = 0;
162
163 spin_lock_irqsave(&rtc_lock, flags);
164 /*
165 * if UIP is clear, then we have >= 244 microseconds before RTC
166 * registers will be updated. Spec sheet says that this is the
167 * reliable way to read RTC - registers invalid (off bus) during update
168 */
169 while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
170 cpu_relax();
171
172
173 /* now read all RTC registers while stable with interrupts disabled */
174 sec = CMOS_READ(RTC_SECONDS);
175 min = CMOS_READ(RTC_MINUTES);
176 hour = CMOS_READ(RTC_HOURS);
177 day = CMOS_READ(RTC_DAY_OF_MONTH);
178 mon = CMOS_READ(RTC_MONTH);
179 year = CMOS_READ(RTC_YEAR);
180#ifdef CONFIG_ACPI
181 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
182 acpi_gbl_FADT.century)
183 century = CMOS_READ(acpi_gbl_FADT.century);
184#endif
185 spin_unlock_irqrestore(&rtc_lock, flags);
186
187 /*
188 * We know that x86-64 always uses BCD format, no need to check the
189 * config register.
190 */
191
192 BCD_TO_BIN(sec);
193 BCD_TO_BIN(min);
194 BCD_TO_BIN(hour);
195 BCD_TO_BIN(day);
196 BCD_TO_BIN(mon);
197 BCD_TO_BIN(year);
198
199 if (century) {
200 BCD_TO_BIN(century);
201 year += century * 100;
202 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
203 } else {
204 /*
 205 * x86-64 systems have only existed since 2002.
206 * This will work up to Dec 31, 2100
207 */
208 year += 2000;
209 }
210
211 return mktime(year, mon, day, hour, min, sec);
212}
213
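The RTC registers read above hold BCD values, one decimal digit per nibble, which is what the BCD_TO_BIN()/BIN_TO_BCD() calls convert. A self-contained sketch of the two conversions, assuming the conventional definitions rather than the kernel macros:

#include <stdio.h>

/* Classic BCD helpers: 0x59 in BCD means decimal 59, and vice versa. */
static unsigned int bcd_to_bin(unsigned int val)
{
	return (val & 0x0f) + (val >> 4) * 10;
}

static unsigned int bin_to_bcd(unsigned int val)
{
	return ((val / 10) << 4) | (val % 10);
}

int main(void)
{
	printf("0x%02x -> %u\n", 0x59, bcd_to_bin(0x59));	/* 59 */
	printf("%u -> 0x%02x\n", 37, bin_to_bcd(37));		/* 0x37 */
	return 0;
}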
214/* calibrate_cpu is used on systems with fixed rate TSCs to determine 56/* calibrate_cpu is used on systems with fixed rate TSCs to determine
215 * processor frequency */ 57 * processor frequency */
216#define TICK_COUNT 100000000 58#define TICK_COUNT 100000000
217static unsigned int __init tsc_calibrate_cpu_khz(void) 59unsigned long __init native_calculate_cpu_khz(void)
218{ 60{
219 int tsc_start, tsc_now; 61 int tsc_start, tsc_now;
220 int i, no_ctr_free; 62 int i, no_ctr_free;
@@ -241,7 +83,7 @@ static unsigned int __init tsc_calibrate_cpu_khz(void)
241 rdtscl(tsc_start); 83 rdtscl(tsc_start);
242 do { 84 do {
243 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); 85 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
244 tsc_now = get_cycles_sync(); 86 tsc_now = get_cycles();
245 } while ((tsc_now - tsc_start) < TICK_COUNT); 87 } while ((tsc_now - tsc_start) < TICK_COUNT);
246 88
247 local_irq_restore(flags); 89 local_irq_restore(flags);
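The calibration loop above runs a CPU-clocked performance counter against the TSC for TICK_COUNT cycles to derive the core frequency on fixed-rate-TSC parts. The same idea can be approximated from user space by timing the TSC against a wall-clock reference; the sketch below is only an analogy (it substitutes clock_gettime() for the perf counter, and frequency scaling will skew the result), not the kernel path.

#include <stdio.h>
#include <stdint.h>
#include <time.h>

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	struct timespec delay = { 0, 100 * 1000 * 1000 };	/* 100 ms */
	struct timespec t0, t1;
	uint64_t c0, c1;
	double ns, khz;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	c0 = rdtsc();
	nanosleep(&delay, NULL);
	c1 = rdtsc();
	clock_gettime(CLOCK_MONOTONIC, &t1);

	ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
	khz = (double)(c1 - c0) / ns * 1e6;	/* cycles/ns = GHz; x1e6 gives kHz */
	printf("approx TSC rate: %.0f kHz\n", khz);
	return 0;
}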
@@ -264,20 +106,22 @@ static struct irqaction irq0 = {
264 .name = "timer" 106 .name = "timer"
265}; 107};
266 108
267void __init time_init(void) 109void __init hpet_time_init(void)
268{ 110{
269 if (!hpet_enable()) 111 if (!hpet_enable())
270 setup_pit_timer(); 112 setup_pit_timer();
271 113
272 setup_irq(0, &irq0); 114 setup_irq(0, &irq0);
115}
273 116
117void __init time_init(void)
118{
274 tsc_calibrate(); 119 tsc_calibrate();
275 120
276 cpu_khz = tsc_khz; 121 cpu_khz = tsc_khz;
277 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && 122 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
278 boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 123 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
279 boot_cpu_data.x86 == 16) 124 cpu_khz = calculate_cpu_khz();
280 cpu_khz = tsc_calibrate_cpu_khz();
281 125
282 if (unsynchronized_tsc()) 126 if (unsynchronized_tsc())
283 mark_tsc_unstable("TSCs unsynchronized"); 127 mark_tsc_unstable("TSCs unsynchronized");
@@ -290,4 +134,5 @@ void __init time_init(void)
290 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", 134 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
291 cpu_khz / 1000, cpu_khz % 1000); 135 cpu_khz / 1000, cpu_khz % 1000);
292 init_tsc_clocksource(); 136 init_tsc_clocksource();
137 late_time_init = choose_time_init();
293} 138}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
new file mode 100644
index 00000000000..6dfd4e76661
--- /dev/null
+++ b/arch/x86/kernel/tls.c
@@ -0,0 +1,213 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/sched.h>
4#include <linux/user.h>
5#include <linux/regset.h>
6
7#include <asm/uaccess.h>
8#include <asm/desc.h>
9#include <asm/system.h>
10#include <asm/ldt.h>
11#include <asm/processor.h>
12#include <asm/proto.h>
13
14#include "tls.h"
15
16/*
17 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
18 */
19static int get_free_idx(void)
20{
21 struct thread_struct *t = &current->thread;
22 int idx;
23
24 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
25 if (desc_empty(&t->tls_array[idx]))
26 return idx + GDT_ENTRY_TLS_MIN;
27 return -ESRCH;
28}
29
30static void set_tls_desc(struct task_struct *p, int idx,
31 const struct user_desc *info, int n)
32{
33 struct thread_struct *t = &p->thread;
34 struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN];
35 int cpu;
36
37 /*
38 * We must not get preempted while modifying the TLS.
39 */
40 cpu = get_cpu();
41
42 while (n-- > 0) {
43 if (LDT_empty(info))
44 desc->a = desc->b = 0;
45 else
46 fill_ldt(desc, info);
47 ++info;
48 ++desc;
49 }
50
51 if (t == &current->thread)
52 load_TLS(t, cpu);
53
54 put_cpu();
55}
56
57/*
58 * Set a given TLS descriptor:
59 */
60int do_set_thread_area(struct task_struct *p, int idx,
61 struct user_desc __user *u_info,
62 int can_allocate)
63{
64 struct user_desc info;
65
66 if (copy_from_user(&info, u_info, sizeof(info)))
67 return -EFAULT;
68
69 if (idx == -1)
70 idx = info.entry_number;
71
72 /*
73 * index -1 means the kernel should try to find and
74 * allocate an empty descriptor:
75 */
76 if (idx == -1 && can_allocate) {
77 idx = get_free_idx();
78 if (idx < 0)
79 return idx;
80 if (put_user(idx, &u_info->entry_number))
81 return -EFAULT;
82 }
83
84 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
85 return -EINVAL;
86
87 set_tls_desc(p, idx, &info, 1);
88
89 return 0;
90}
91
92asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
93{
94 return do_set_thread_area(current, -1, u_info, 1);
95}
96
97
98/*
99 * Get the current Thread-Local Storage area:
100 */
101
102static void fill_user_desc(struct user_desc *info, int idx,
103 const struct desc_struct *desc)
104
105{
106 memset(info, 0, sizeof(*info));
107 info->entry_number = idx;
108 info->base_addr = get_desc_base(desc);
109 info->limit = get_desc_limit(desc);
110 info->seg_32bit = desc->d;
111 info->contents = desc->type >> 2;
112 info->read_exec_only = !(desc->type & 2);
113 info->limit_in_pages = desc->g;
114 info->seg_not_present = !desc->p;
115 info->useable = desc->avl;
116#ifdef CONFIG_X86_64
117 info->lm = desc->l;
118#endif
119}
120
121int do_get_thread_area(struct task_struct *p, int idx,
122 struct user_desc __user *u_info)
123{
124 struct user_desc info;
125
126 if (idx == -1 && get_user(idx, &u_info->entry_number))
127 return -EFAULT;
128
129 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
130 return -EINVAL;
131
132 fill_user_desc(&info, idx,
133 &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]);
134
135 if (copy_to_user(u_info, &info, sizeof(info)))
136 return -EFAULT;
137 return 0;
138}
139
140asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
141{
142 return do_get_thread_area(current, -1, u_info);
143}
144
145int regset_tls_active(struct task_struct *target,
146 const struct user_regset *regset)
147{
148 struct thread_struct *t = &target->thread;
149 int n = GDT_ENTRY_TLS_ENTRIES;
150 while (n > 0 && desc_empty(&t->tls_array[n - 1]))
151 --n;
152 return n;
153}
154
155int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
156 unsigned int pos, unsigned int count,
157 void *kbuf, void __user *ubuf)
158{
159 const struct desc_struct *tls;
160
161 if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
162 (pos % sizeof(struct user_desc)) != 0 ||
163 (count % sizeof(struct user_desc)) != 0)
164 return -EINVAL;
165
166 pos /= sizeof(struct user_desc);
167 count /= sizeof(struct user_desc);
168
169 tls = &target->thread.tls_array[pos];
170
171 if (kbuf) {
172 struct user_desc *info = kbuf;
173 while (count-- > 0)
174 fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++,
175 tls++);
176 } else {
177 struct user_desc __user *u_info = ubuf;
178 while (count-- > 0) {
179 struct user_desc info;
180 fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++);
181 if (__copy_to_user(u_info++, &info, sizeof(info)))
182 return -EFAULT;
183 }
184 }
185
186 return 0;
187}
188
189int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
190 unsigned int pos, unsigned int count,
191 const void *kbuf, const void __user *ubuf)
192{
193 struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
194 const struct user_desc *info;
195
196 if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
197 (pos % sizeof(struct user_desc)) != 0 ||
198 (count % sizeof(struct user_desc)) != 0)
199 return -EINVAL;
200
201 if (kbuf)
202 info = kbuf;
203 else if (__copy_from_user(infobuf, ubuf, count))
204 return -EFAULT;
205 else
206 info = infobuf;
207
208 set_tls_desc(target,
209 GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
210 info, count / sizeof(struct user_desc));
211
212 return 0;
213}
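do_set_thread_area() treats entry_number == -1 as "pick a free GDT TLS slot and write the chosen index back", which is how the syscall is normally used from user space. A minimal sketch against the set_thread_area(2)/get_thread_area(2) interface, intended to be built as a 32-bit program (struct user_desc comes from <asm/ldt.h>; the tls_block buffer is just a stand-in TLS area):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>		/* struct user_desc */

int main(void)
{
	struct user_desc desc, check;
	static char tls_block[256];

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = -1;			/* let the kernel pick a slot */
	desc.base_addr = (unsigned long)tls_block;
	desc.limit = sizeof(tls_block) - 1;
	desc.seg_32bit = 1;
	desc.useable = 1;

	if (syscall(SYS_set_thread_area, &desc) != 0) {
		perror("set_thread_area");
		return 1;
	}
	printf("allocated GDT TLS entry %u\n", desc.entry_number);

	memset(&check, 0, sizeof(check));
	check.entry_number = desc.entry_number;
	if (syscall(SYS_get_thread_area, &check) != 0) {
		perror("get_thread_area");
		return 1;
	}
	printf("base read back from the kernel: %#x\n", check.base_addr);
	return 0;
}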
diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h
new file mode 100644
index 00000000000..2f083a2fe21
--- /dev/null
+++ b/arch/x86/kernel/tls.h
@@ -0,0 +1,21 @@
1/*
2 * Internal declarations for x86 TLS implementation functions.
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
5 *
6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions
8 * of the GNU General Public License v.2.
9 *
10 * Red Hat Author: Roland McGrath.
11 */
12
13#ifndef _ARCH_X86_KERNEL_TLS_H
14
15#include <linux/regset.h>
16
17extern user_regset_active_fn regset_tls_active;
18extern user_regset_get_fn regset_tls_get;
19extern user_regset_set_fn regset_tls_set;
20
21#endif /* _ARCH_X86_KERNEL_TLS_H */
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e16d675eb8..e6757aaa202 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -31,9 +31,10 @@
31#include <linux/mmzone.h> 31#include <linux/mmzone.h>
32#include <asm/cpu.h> 32#include <asm/cpu.h>
33 33
34static struct i386_cpu cpu_devices[NR_CPUS]; 34static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
35 35
36int __cpuinit arch_register_cpu(int num) 36#ifdef CONFIG_HOTPLUG_CPU
37int arch_register_cpu(int num)
37{ 38{
38 /* 39 /*
39 * CPU0 cannot be offlined due to several 40 * CPU0 cannot be offlined due to several
@@ -44,21 +45,22 @@ int __cpuinit arch_register_cpu(int num)
44 * Also certain PCI quirks require not to enable hotplug control 45 * Also certain PCI quirks require not to enable hotplug control
45 * for all CPU's. 46 * for all CPU's.
46 */ 47 */
47#ifdef CONFIG_HOTPLUG_CPU
48 if (num) 48 if (num)
49 cpu_devices[num].cpu.hotpluggable = 1; 49 per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
50#endif 50 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
51
52 return register_cpu(&cpu_devices[num].cpu, num);
53} 51}
52EXPORT_SYMBOL(arch_register_cpu);
54 53
55#ifdef CONFIG_HOTPLUG_CPU
56void arch_unregister_cpu(int num) 54void arch_unregister_cpu(int num)
57{ 55{
58 return unregister_cpu(&cpu_devices[num].cpu); 56 return unregister_cpu(&per_cpu(cpu_devices, num).cpu);
59} 57}
60EXPORT_SYMBOL(arch_register_cpu);
61EXPORT_SYMBOL(arch_unregister_cpu); 58EXPORT_SYMBOL(arch_unregister_cpu);
59#else
60static int __init arch_register_cpu(int num)
61{
62 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
63}
62#endif /*CONFIG_HOTPLUG_CPU*/ 64#endif /*CONFIG_HOTPLUG_CPU*/
63 65
64static int __init topology_init(void) 66static int __init topology_init(void)
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 9bcc1c6aca3..64580679861 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -11,12 +11,7 @@
11 * trampoline page to make our stack and everything else 11 * trampoline page to make our stack and everything else
12 * is a mystery. 12 * is a mystery.
13 * 13 *
14 * In fact we don't actually need a stack so we don't 14 * We jump into arch/x86/kernel/head_32.S.
15 * set one up.
16 *
17 * We jump into the boot/compressed/head.S code. So you'd
18 * better be running a compressed kernel image or you
19 * won't get very far.
20 * 15 *
21 * On entry to trampoline_data, the processor is in real mode 16 * On entry to trampoline_data, the processor is in real mode
22 * with 16-bit addressing and 16-bit data. CS has some value 17 * with 16-bit addressing and 16-bit data. CS has some value
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index e30b67c6a9f..4aedd0bcee4 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -10,9 +10,6 @@
10 * trampoline page to make our stack and everything else 10 * trampoline page to make our stack and everything else
11 * is a mystery. 11 * is a mystery.
12 * 12 *
13 * In fact we don't actually need a stack so we don't
14 * set one up.
15 *
16 * On entry to trampoline_data, the processor is in real mode 13 * On entry to trampoline_data, the processor is in real mode
17 * with 16-bit addressing and 16-bit data. CS has some value 14 * with 16-bit addressing and 16-bit data. CS has some value
18 * and IP is zero. Thus, data addresses need to be absolute 15 * and IP is zero. Thus, data addresses need to be absolute
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index c88bbffcaa0..b22c01e05a1 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -76,7 +76,8 @@ char ignore_fpu_irq = 0;
76 * F0 0F bug workaround.. We have a special link segment 76 * F0 0F bug workaround.. We have a special link segment
77 * for this. 77 * for this.
78 */ 78 */
79struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; 79gate_desc idt_table[256]
80 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
80 81
81asmlinkage void divide_error(void); 82asmlinkage void divide_error(void);
82asmlinkage void debug(void); 83asmlinkage void debug(void);
@@ -101,6 +102,34 @@ asmlinkage void machine_check(void);
101int kstack_depth_to_print = 24; 102int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64; 103static unsigned int code_bytes = 64;
103 104
105void printk_address(unsigned long address, int reliable)
106{
107#ifdef CONFIG_KALLSYMS
108 unsigned long offset = 0, symsize;
109 const char *symname;
110 char *modname;
111 char *delim = ":";
112 char namebuf[128];
113 char reliab[4] = "";
114
115 symname = kallsyms_lookup(address, &symsize, &offset,
116 &modname, namebuf);
117 if (!symname) {
118 printk(" [<%08lx>]\n", address);
119 return;
120 }
121 if (!reliable)
122 strcpy(reliab, "? ");
123
124 if (!modname)
125 modname = delim = "";
126 printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
127 address, reliab, delim, modname, delim, symname, offset, symsize);
128#else
129 printk(" [<%08lx>]\n", address);
130#endif
131}
132
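The new printk_address() resolves an address through kallsyms_lookup() and prints the symbol, module, offset and size, prefixing "? " when the entry is unreliable. A rough user-space analog can be built on dladdr(); it is only an approximation (dladdr has no module/size fields, the main executable's own symbols resolve only when linked with -rdynamic, and the program needs -ldl):

#define _GNU_SOURCE
#include <stdio.h>
#include <dlfcn.h>

/* Rough user-space analog of printk_address(): addr -> "symbol+0xoffset". */
static void print_address(unsigned long addr, int reliable)
{
	Dl_info info;

	if (dladdr((void *)addr, &info) && info.dli_sname && info.dli_saddr)
		printf(" [<%0*lx>] %s%s+0x%lx\n", (int)(2 * sizeof(long)),
		       addr, reliable ? "" : "? ", info.dli_sname,
		       addr - (unsigned long)info.dli_saddr);
	else
		printf(" [<%0*lx>]\n", (int)(2 * sizeof(long)), addr);
}

int main(void)
{
	print_address((unsigned long)main + 5, 1);	/* needs -rdynamic */
	print_address((unsigned long)printf, 0);	/* libc symbol, marked "? " */
	return 0;
}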
104static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) 133static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
105{ 134{
106 return p > (void *)tinfo && 135 return p > (void *)tinfo &&
@@ -114,48 +143,35 @@ struct stack_frame {
114}; 143};
115 144
116static inline unsigned long print_context_stack(struct thread_info *tinfo, 145static inline unsigned long print_context_stack(struct thread_info *tinfo,
117 unsigned long *stack, unsigned long ebp, 146 unsigned long *stack, unsigned long bp,
118 const struct stacktrace_ops *ops, void *data) 147 const struct stacktrace_ops *ops, void *data)
119{ 148{
120#ifdef CONFIG_FRAME_POINTER 149 struct stack_frame *frame = (struct stack_frame *)bp;
121 struct stack_frame *frame = (struct stack_frame *)ebp;
122 while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
123 struct stack_frame *next;
124 unsigned long addr;
125 150
126 addr = frame->return_address;
127 ops->address(data, addr);
128 /*
129 * break out of recursive entries (such as
130 * end_of_stack_stop_unwind_function). Also,
131 * we can never allow a frame pointer to
132 * move downwards!
133 */
134 next = frame->next_frame;
135 if (next <= frame)
136 break;
137 frame = next;
138 }
139#else
140 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { 151 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
141 unsigned long addr; 152 unsigned long addr;
142 153
143 addr = *stack++; 154 addr = *stack;
144 if (__kernel_text_address(addr)) 155 if (__kernel_text_address(addr)) {
145 ops->address(data, addr); 156 if ((unsigned long) stack == bp + 4) {
157 ops->address(data, addr, 1);
158 frame = frame->next_frame;
159 bp = (unsigned long) frame;
160 } else {
161 ops->address(data, addr, bp == 0);
162 }
163 }
164 stack++;
146 } 165 }
147#endif 166 return bp;
148 return ebp;
149} 167}
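The rewritten print_context_stack() only trusts a word as a return address when it sits directly above the saved frame pointer (bp + 4 here, bp + 8 in the 64-bit version) and then follows next_frame; everything else is reported as unreliable and later printed with a "?". The same frame-pointer walk can be sketched in user space, assuming the program is compiled with -O0 -fno-omit-frame-pointer so frames actually look like the struct below:

#include <stdio.h>

/* Layout of a saved frame on x86 when frame pointers are in use:
 * [ebp] = caller's ebp, [ebp+4] = return address (rbp+8 on x86-64). */
struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

static void show_backtrace(void)
{
	struct stack_frame *frame = __builtin_frame_address(0);
	int depth;

	for (depth = 0; frame && depth < 16; depth++) {
		printf(" [<%0*lx>]\n", (int)(2 * sizeof(long)),
		       frame->return_address);
		/* never allow the frame pointer to move downwards */
		if (frame->next_frame <= frame)
			break;
		frame = frame->next_frame;
	}
}

static void level2(void) { show_backtrace(); }
static void level1(void) { level2(); }

int main(void)
{
	level1();
	return 0;
}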
150 168
151#define MSG(msg) ops->warning(data, msg) 169#define MSG(msg) ops->warning(data, msg)
152 170
153void dump_trace(struct task_struct *task, struct pt_regs *regs, 171void dump_trace(struct task_struct *task, struct pt_regs *regs,
154 unsigned long *stack, 172 unsigned long *stack, unsigned long bp,
155 const struct stacktrace_ops *ops, void *data) 173 const struct stacktrace_ops *ops, void *data)
156{ 174{
157 unsigned long ebp = 0;
158
159 if (!task) 175 if (!task)
160 task = current; 176 task = current;
161 177
@@ -163,17 +179,17 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
163 unsigned long dummy; 179 unsigned long dummy;
164 stack = &dummy; 180 stack = &dummy;
165 if (task != current) 181 if (task != current)
166 stack = (unsigned long *)task->thread.esp; 182 stack = (unsigned long *)task->thread.sp;
167 } 183 }
168 184
169#ifdef CONFIG_FRAME_POINTER 185#ifdef CONFIG_FRAME_POINTER
170 if (!ebp) { 186 if (!bp) {
171 if (task == current) { 187 if (task == current) {
172 /* Grab ebp right from our regs */ 188 /* Grab bp right from our regs */
173 asm ("movl %%ebp, %0" : "=r" (ebp) : ); 189 asm ("movl %%ebp, %0" : "=r" (bp) : );
174 } else { 190 } else {
175 /* ebp is the last reg pushed by switch_to */ 191 /* bp is the last reg pushed by switch_to */
176 ebp = *(unsigned long *) task->thread.esp; 192 bp = *(unsigned long *) task->thread.sp;
177 } 193 }
178 } 194 }
179#endif 195#endif
@@ -182,7 +198,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
182 struct thread_info *context; 198 struct thread_info *context;
183 context = (struct thread_info *) 199 context = (struct thread_info *)
184 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 200 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
185 ebp = print_context_stack(context, stack, ebp, ops, data); 201 bp = print_context_stack(context, stack, bp, ops, data);
186 /* Should be after the line below, but somewhere 202 /* Should be after the line below, but somewhere
187 in early boot context comes out corrupted and we 203 in early boot context comes out corrupted and we
188 can't reference it -AK */ 204 can't reference it -AK */
@@ -217,9 +233,11 @@ static int print_trace_stack(void *data, char *name)
217/* 233/*
218 * Print one address/symbol entries per line. 234 * Print one address/symbol entries per line.
219 */ 235 */
220static void print_trace_address(void *data, unsigned long addr) 236static void print_trace_address(void *data, unsigned long addr, int reliable)
221{ 237{
222 printk("%s [<%08lx>] ", (char *)data, addr); 238 printk("%s [<%08lx>] ", (char *)data, addr);
239 if (!reliable)
240 printk("? ");
223 print_symbol("%s\n", addr); 241 print_symbol("%s\n", addr);
224 touch_nmi_watchdog(); 242 touch_nmi_watchdog();
225} 243}
@@ -233,32 +251,32 @@ static const struct stacktrace_ops print_trace_ops = {
233 251
234static void 252static void
235show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 253show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
236 unsigned long * stack, char *log_lvl) 254 unsigned long *stack, unsigned long bp, char *log_lvl)
237{ 255{
238 dump_trace(task, regs, stack, &print_trace_ops, log_lvl); 256 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
239 printk("%s =======================\n", log_lvl); 257 printk("%s =======================\n", log_lvl);
240} 258}
241 259
242void show_trace(struct task_struct *task, struct pt_regs *regs, 260void show_trace(struct task_struct *task, struct pt_regs *regs,
243 unsigned long * stack) 261 unsigned long *stack, unsigned long bp)
244{ 262{
245 show_trace_log_lvl(task, regs, stack, ""); 263 show_trace_log_lvl(task, regs, stack, bp, "");
246} 264}
247 265
248static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 266static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
249 unsigned long *esp, char *log_lvl) 267 unsigned long *sp, unsigned long bp, char *log_lvl)
250{ 268{
251 unsigned long *stack; 269 unsigned long *stack;
252 int i; 270 int i;
253 271
254 if (esp == NULL) { 272 if (sp == NULL) {
255 if (task) 273 if (task)
256 esp = (unsigned long*)task->thread.esp; 274 sp = (unsigned long*)task->thread.sp;
257 else 275 else
258 esp = (unsigned long *)&esp; 276 sp = (unsigned long *)&sp;
259 } 277 }
260 278
261 stack = esp; 279 stack = sp;
262 for(i = 0; i < kstack_depth_to_print; i++) { 280 for(i = 0; i < kstack_depth_to_print; i++) {
263 if (kstack_end(stack)) 281 if (kstack_end(stack))
264 break; 282 break;
@@ -267,13 +285,13 @@ static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
267 printk("%08lx ", *stack++); 285 printk("%08lx ", *stack++);
268 } 286 }
269 printk("\n%sCall Trace:\n", log_lvl); 287 printk("\n%sCall Trace:\n", log_lvl);
270 show_trace_log_lvl(task, regs, esp, log_lvl); 288 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
271} 289}
272 290
273void show_stack(struct task_struct *task, unsigned long *esp) 291void show_stack(struct task_struct *task, unsigned long *sp)
274{ 292{
275 printk(" "); 293 printk(" ");
276 show_stack_log_lvl(task, NULL, esp, ""); 294 show_stack_log_lvl(task, NULL, sp, 0, "");
277} 295}
278 296
279/* 297/*
@@ -282,13 +300,19 @@ void show_stack(struct task_struct *task, unsigned long *esp)
282void dump_stack(void) 300void dump_stack(void)
283{ 301{
284 unsigned long stack; 302 unsigned long stack;
303 unsigned long bp = 0;
304
305#ifdef CONFIG_FRAME_POINTER
306 if (!bp)
307 asm("movl %%ebp, %0" : "=r" (bp):);
308#endif
285 309
286 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 310 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
287 current->pid, current->comm, print_tainted(), 311 current->pid, current->comm, print_tainted(),
288 init_utsname()->release, 312 init_utsname()->release,
289 (int)strcspn(init_utsname()->version, " "), 313 (int)strcspn(init_utsname()->version, " "),
290 init_utsname()->version); 314 init_utsname()->version);
291 show_trace(current, NULL, &stack); 315 show_trace(current, NULL, &stack, bp);
292} 316}
293 317
294EXPORT_SYMBOL(dump_stack); 318EXPORT_SYMBOL(dump_stack);
@@ -307,30 +331,30 @@ void show_registers(struct pt_regs *regs)
307 * time of the fault.. 331 * time of the fault..
308 */ 332 */
309 if (!user_mode_vm(regs)) { 333 if (!user_mode_vm(regs)) {
310 u8 *eip; 334 u8 *ip;
311 unsigned int code_prologue = code_bytes * 43 / 64; 335 unsigned int code_prologue = code_bytes * 43 / 64;
312 unsigned int code_len = code_bytes; 336 unsigned int code_len = code_bytes;
313 unsigned char c; 337 unsigned char c;
314 338
315 printk("\n" KERN_EMERG "Stack: "); 339 printk("\n" KERN_EMERG "Stack: ");
316 show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG); 340 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
317 341
318 printk(KERN_EMERG "Code: "); 342 printk(KERN_EMERG "Code: ");
319 343
320 eip = (u8 *)regs->eip - code_prologue; 344 ip = (u8 *)regs->ip - code_prologue;
321 if (eip < (u8 *)PAGE_OFFSET || 345 if (ip < (u8 *)PAGE_OFFSET ||
322 probe_kernel_address(eip, c)) { 346 probe_kernel_address(ip, c)) {
323 /* try starting at EIP */ 347 /* try starting at EIP */
324 eip = (u8 *)regs->eip; 348 ip = (u8 *)regs->ip;
325 code_len = code_len - code_prologue + 1; 349 code_len = code_len - code_prologue + 1;
326 } 350 }
327 for (i = 0; i < code_len; i++, eip++) { 351 for (i = 0; i < code_len; i++, ip++) {
328 if (eip < (u8 *)PAGE_OFFSET || 352 if (ip < (u8 *)PAGE_OFFSET ||
329 probe_kernel_address(eip, c)) { 353 probe_kernel_address(ip, c)) {
330 printk(" Bad EIP value."); 354 printk(" Bad EIP value.");
331 break; 355 break;
332 } 356 }
333 if (eip == (u8 *)regs->eip) 357 if (ip == (u8 *)regs->ip)
334 printk("<%02x> ", c); 358 printk("<%02x> ", c);
335 else 359 else
336 printk("%02x ", c); 360 printk("%02x ", c);
@@ -339,18 +363,57 @@ void show_registers(struct pt_regs *regs)
339 printk("\n"); 363 printk("\n");
340} 364}
341 365
342int is_valid_bugaddr(unsigned long eip) 366int is_valid_bugaddr(unsigned long ip)
343{ 367{
344 unsigned short ud2; 368 unsigned short ud2;
345 369
346 if (eip < PAGE_OFFSET) 370 if (ip < PAGE_OFFSET)
347 return 0; 371 return 0;
348 if (probe_kernel_address((unsigned short *)eip, ud2)) 372 if (probe_kernel_address((unsigned short *)ip, ud2))
349 return 0; 373 return 0;
350 374
351 return ud2 == 0x0b0f; 375 return ud2 == 0x0b0f;
352} 376}
353 377
378static int die_counter;
379
380int __kprobes __die(const char * str, struct pt_regs * regs, long err)
381{
382 unsigned long sp;
383 unsigned short ss;
384
385 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
386#ifdef CONFIG_PREEMPT
387 printk("PREEMPT ");
388#endif
389#ifdef CONFIG_SMP
390 printk("SMP ");
391#endif
392#ifdef CONFIG_DEBUG_PAGEALLOC
393 printk("DEBUG_PAGEALLOC");
394#endif
395 printk("\n");
396
397 if (notify_die(DIE_OOPS, str, regs, err,
398 current->thread.trap_no, SIGSEGV) !=
399 NOTIFY_STOP) {
400 show_registers(regs);
401 /* Executive summary in case the oops scrolled away */
402 sp = (unsigned long) (&regs->sp);
403 savesegment(ss, ss);
404 if (user_mode(regs)) {
405 sp = regs->sp;
406 ss = regs->ss & 0xffff;
407 }
408 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
409 print_symbol("%s", regs->ip);
410 printk(" SS:ESP %04x:%08lx\n", ss, sp);
411 return 0;
412 } else {
413 return 1;
414 }
415}
416
354/* 417/*
355 * This is gone through when something in the kernel has done something bad and 418 * This is gone through when something in the kernel has done something bad and
356 * is about to be terminated. 419 * is about to be terminated.
@@ -366,7 +429,6 @@ void die(const char * str, struct pt_regs * regs, long err)
366 .lock_owner = -1, 429 .lock_owner = -1,
367 .lock_owner_depth = 0 430 .lock_owner_depth = 0
368 }; 431 };
369 static int die_counter;
370 unsigned long flags; 432 unsigned long flags;
371 433
372 oops_enter(); 434 oops_enter();
@@ -382,43 +444,13 @@ void die(const char * str, struct pt_regs * regs, long err)
382 raw_local_irq_save(flags); 444 raw_local_irq_save(flags);
383 445
384 if (++die.lock_owner_depth < 3) { 446 if (++die.lock_owner_depth < 3) {
385 unsigned long esp; 447 report_bug(regs->ip, regs);
386 unsigned short ss;
387
388 report_bug(regs->eip, regs);
389 448
390 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, 449 if (__die(str, regs, err))
391 ++die_counter);
392#ifdef CONFIG_PREEMPT
393 printk("PREEMPT ");
394#endif
395#ifdef CONFIG_SMP
396 printk("SMP ");
397#endif
398#ifdef CONFIG_DEBUG_PAGEALLOC
399 printk("DEBUG_PAGEALLOC");
400#endif
401 printk("\n");
402
403 if (notify_die(DIE_OOPS, str, regs, err,
404 current->thread.trap_no, SIGSEGV) !=
405 NOTIFY_STOP) {
406 show_registers(regs);
407 /* Executive summary in case the oops scrolled away */
408 esp = (unsigned long) (&regs->esp);
409 savesegment(ss, ss);
410 if (user_mode(regs)) {
411 esp = regs->esp;
412 ss = regs->xss & 0xffff;
413 }
414 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
415 print_symbol("%s", regs->eip);
416 printk(" SS:ESP %04x:%08lx\n", ss, esp);
417 }
418 else
419 regs = NULL; 450 regs = NULL;
420 } else 451 } else {
421 printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); 452 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
453 }
422 454
423 bust_spinlocks(0); 455 bust_spinlocks(0);
424 die.lock_owner = -1; 456 die.lock_owner = -1;
@@ -454,7 +486,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
454{ 486{
455 struct task_struct *tsk = current; 487 struct task_struct *tsk = current;
456 488
457 if (regs->eflags & VM_MASK) { 489 if (regs->flags & VM_MASK) {
458 if (vm86) 490 if (vm86)
459 goto vm86_trap; 491 goto vm86_trap;
460 goto trap_signal; 492 goto trap_signal;
@@ -500,7 +532,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
500} 532}
501 533
502#define DO_ERROR(trapnr, signr, str, name) \ 534#define DO_ERROR(trapnr, signr, str, name) \
503fastcall void do_##name(struct pt_regs * regs, long error_code) \ 535void do_##name(struct pt_regs * regs, long error_code) \
504{ \ 536{ \
505 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 537 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
506 == NOTIFY_STOP) \ 538 == NOTIFY_STOP) \
@@ -509,7 +541,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
509} 541}
510 542
511#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ 543#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
512fastcall void do_##name(struct pt_regs * regs, long error_code) \ 544void do_##name(struct pt_regs * regs, long error_code) \
513{ \ 545{ \
514 siginfo_t info; \ 546 siginfo_t info; \
515 if (irq) \ 547 if (irq) \
@@ -525,7 +557,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
525} 557}
526 558
527#define DO_VM86_ERROR(trapnr, signr, str, name) \ 559#define DO_VM86_ERROR(trapnr, signr, str, name) \
528fastcall void do_##name(struct pt_regs * regs, long error_code) \ 560void do_##name(struct pt_regs * regs, long error_code) \
529{ \ 561{ \
530 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 562 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
531 == NOTIFY_STOP) \ 563 == NOTIFY_STOP) \
@@ -534,26 +566,27 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
534} 566}
535 567
536#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 568#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
537fastcall void do_##name(struct pt_regs * regs, long error_code) \ 569void do_##name(struct pt_regs * regs, long error_code) \
538{ \ 570{ \
539 siginfo_t info; \ 571 siginfo_t info; \
540 info.si_signo = signr; \ 572 info.si_signo = signr; \
541 info.si_errno = 0; \ 573 info.si_errno = 0; \
542 info.si_code = sicode; \ 574 info.si_code = sicode; \
543 info.si_addr = (void __user *)siaddr; \ 575 info.si_addr = (void __user *)siaddr; \
576 trace_hardirqs_fixup(); \
544 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 577 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
545 == NOTIFY_STOP) \ 578 == NOTIFY_STOP) \
546 return; \ 579 return; \
547 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ 580 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
548} 581}
549 582
550DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) 583DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
551#ifndef CONFIG_KPROBES 584#ifndef CONFIG_KPROBES
552DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) 585DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
553#endif 586#endif
554DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) 587DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
555DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) 588DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
556DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) 589DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
557DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 590DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
558DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 591DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
559DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 592DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
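With fastcall dropped, these DO_* macros stamp out one ordinary C handler per trap. Expanding the first invocation by hand, using the DO_VM86_ERROR_INFO body shown above, gives roughly the following (it relies on the kernel definitions in this file and is not compilable on its own):

/* Hand expansion of:
 *   DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error,
 *                       FPE_INTDIV, regs->ip)
 */
void do_divide_error(struct pt_regs *regs, long error_code)
{
	siginfo_t info;

	info.si_signo = SIGFPE;
	info.si_errno = 0;
	info.si_code = FPE_INTDIV;
	info.si_addr = (void __user *)regs->ip;
	trace_hardirqs_fixup();
	if (notify_die(DIE_TRAP, "divide error", regs, error_code, 0, SIGFPE)
			== NOTIFY_STOP)
		return;
	do_trap(0, SIGFPE, "divide error", 1, regs, error_code, &info);
}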
@@ -561,7 +594,7 @@ DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
561DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) 594DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
562DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) 595DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
563 596
564fastcall void __kprobes do_general_protection(struct pt_regs * regs, 597void __kprobes do_general_protection(struct pt_regs * regs,
565 long error_code) 598 long error_code)
566{ 599{
567 int cpu = get_cpu(); 600 int cpu = get_cpu();
@@ -595,7 +628,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
595 } 628 }
596 put_cpu(); 629 put_cpu();
597 630
598 if (regs->eflags & VM_MASK) 631 if (regs->flags & VM_MASK)
599 goto gp_in_vm86; 632 goto gp_in_vm86;
600 633
601 if (!user_mode(regs)) 634 if (!user_mode(regs))
@@ -604,11 +637,14 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
604 current->thread.error_code = error_code; 637 current->thread.error_code = error_code;
605 current->thread.trap_no = 13; 638 current->thread.trap_no = 13;
606 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && 639 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
607 printk_ratelimit()) 640 printk_ratelimit()) {
608 printk(KERN_INFO 641 printk(KERN_INFO
609 "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", 642 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
610 current->comm, task_pid_nr(current), 643 current->comm, task_pid_nr(current),
611 regs->eip, regs->esp, error_code); 644 regs->ip, regs->sp, error_code);
645 print_vma_addr(" in ", regs->ip);
646 printk("\n");
647 }
612 648
613 force_sig(SIGSEGV, current); 649 force_sig(SIGSEGV, current);
614 return; 650 return;
@@ -704,8 +740,8 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
704 */ 740 */
705 bust_spinlocks(1); 741 bust_spinlocks(1);
706 printk(KERN_EMERG "%s", msg); 742 printk(KERN_EMERG "%s", msg);
707 printk(" on CPU%d, eip %08lx, registers:\n", 743 printk(" on CPU%d, ip %08lx, registers:\n",
708 smp_processor_id(), regs->eip); 744 smp_processor_id(), regs->ip);
709 show_registers(regs); 745 show_registers(regs);
710 console_silent(); 746 console_silent();
711 spin_unlock(&nmi_print_lock); 747 spin_unlock(&nmi_print_lock);
@@ -762,7 +798,7 @@ static __kprobes void default_do_nmi(struct pt_regs * regs)
762 798
763static int ignore_nmis; 799static int ignore_nmis;
764 800
765fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) 801__kprobes void do_nmi(struct pt_regs * regs, long error_code)
766{ 802{
767 int cpu; 803 int cpu;
768 804
@@ -791,7 +827,7 @@ void restart_nmi(void)
791} 827}
792 828
793#ifdef CONFIG_KPROBES 829#ifdef CONFIG_KPROBES
794fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) 830void __kprobes do_int3(struct pt_regs *regs, long error_code)
795{ 831{
796 trace_hardirqs_fixup(); 832 trace_hardirqs_fixup();
797 833
@@ -827,7 +863,7 @@ fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
827 * find every occurrence of the TF bit that could be saved away even 863 * find every occurrence of the TF bit that could be saved away even
828 * by user code) 864 * by user code)
829 */ 865 */
830fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) 866void __kprobes do_debug(struct pt_regs * regs, long error_code)
831{ 867{
832 unsigned int condition; 868 unsigned int condition;
833 struct task_struct *tsk = current; 869 struct task_struct *tsk = current;
@@ -836,24 +872,30 @@ fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
836 872
837 get_debugreg(condition, 6); 873 get_debugreg(condition, 6);
838 874
875 /*
876 * The processor cleared BTF, so don't mark that we need it set.
877 */
878 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
879 tsk->thread.debugctlmsr = 0;
880
839 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 881 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
840 SIGTRAP) == NOTIFY_STOP) 882 SIGTRAP) == NOTIFY_STOP)
841 return; 883 return;
842 /* It's safe to allow irq's after DR6 has been saved */ 884 /* It's safe to allow irq's after DR6 has been saved */
843 if (regs->eflags & X86_EFLAGS_IF) 885 if (regs->flags & X86_EFLAGS_IF)
844 local_irq_enable(); 886 local_irq_enable();
845 887
846 /* Mask out spurious debug traps due to lazy DR7 setting */ 888 /* Mask out spurious debug traps due to lazy DR7 setting */
847 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 889 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
848 if (!tsk->thread.debugreg[7]) 890 if (!tsk->thread.debugreg7)
849 goto clear_dr7; 891 goto clear_dr7;
850 } 892 }
851 893
852 if (regs->eflags & VM_MASK) 894 if (regs->flags & VM_MASK)
853 goto debug_vm86; 895 goto debug_vm86;
854 896
855 /* Save debug status register where ptrace can see it */ 897 /* Save debug status register where ptrace can see it */
856 tsk->thread.debugreg[6] = condition; 898 tsk->thread.debugreg6 = condition;
857 899
858 /* 900 /*
859 * Single-stepping through TF: make sure we ignore any events in 901 * Single-stepping through TF: make sure we ignore any events in
@@ -885,7 +927,7 @@ debug_vm86:
885 927
886clear_TF_reenable: 928clear_TF_reenable:
887 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 929 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
888 regs->eflags &= ~TF_MASK; 930 regs->flags &= ~TF_MASK;
889 return; 931 return;
890} 932}
891 933
@@ -894,7 +936,7 @@ clear_TF_reenable:
894 * the correct behaviour even in the presence of the asynchronous 936 * the correct behaviour even in the presence of the asynchronous
895 * IRQ13 behaviour 937 * IRQ13 behaviour
896 */ 938 */
897void math_error(void __user *eip) 939void math_error(void __user *ip)
898{ 940{
899 struct task_struct * task; 941 struct task_struct * task;
900 siginfo_t info; 942 siginfo_t info;
@@ -910,7 +952,7 @@ void math_error(void __user *eip)
910 info.si_signo = SIGFPE; 952 info.si_signo = SIGFPE;
911 info.si_errno = 0; 953 info.si_errno = 0;
912 info.si_code = __SI_FAULT; 954 info.si_code = __SI_FAULT;
913 info.si_addr = eip; 955 info.si_addr = ip;
914 /* 956 /*
915 * (~cwd & swd) will mask out exceptions that are not set to unmasked 957 * (~cwd & swd) will mask out exceptions that are not set to unmasked
916 * status. 0x3f is the exception bits in these regs, 0x200 is the 958 * status. 0x3f is the exception bits in these regs, 0x200 is the
@@ -953,13 +995,13 @@ void math_error(void __user *eip)
953 force_sig_info(SIGFPE, &info, task); 995 force_sig_info(SIGFPE, &info, task);
954} 996}
955 997
956fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) 998void do_coprocessor_error(struct pt_regs * regs, long error_code)
957{ 999{
958 ignore_fpu_irq = 1; 1000 ignore_fpu_irq = 1;
959 math_error((void __user *)regs->eip); 1001 math_error((void __user *)regs->ip);
960} 1002}
961 1003
962static void simd_math_error(void __user *eip) 1004static void simd_math_error(void __user *ip)
963{ 1005{
964 struct task_struct * task; 1006 struct task_struct * task;
965 siginfo_t info; 1007 siginfo_t info;
@@ -975,7 +1017,7 @@ static void simd_math_error(void __user *eip)
975 info.si_signo = SIGFPE; 1017 info.si_signo = SIGFPE;
976 info.si_errno = 0; 1018 info.si_errno = 0;
977 info.si_code = __SI_FAULT; 1019 info.si_code = __SI_FAULT;
978 info.si_addr = eip; 1020 info.si_addr = ip;
979 /* 1021 /*
980 * The SIMD FPU exceptions are handled a little differently, as there 1022 * The SIMD FPU exceptions are handled a little differently, as there
981 * is only a single status/control register. Thus, to determine which 1023 * is only a single status/control register. Thus, to determine which
@@ -1007,19 +1049,19 @@ static void simd_math_error(void __user *eip)
1007 force_sig_info(SIGFPE, &info, task); 1049 force_sig_info(SIGFPE, &info, task);
1008} 1050}
1009 1051
1010fastcall void do_simd_coprocessor_error(struct pt_regs * regs, 1052void do_simd_coprocessor_error(struct pt_regs * regs,
1011 long error_code) 1053 long error_code)
1012{ 1054{
1013 if (cpu_has_xmm) { 1055 if (cpu_has_xmm) {
1014 /* Handle SIMD FPU exceptions on PIII+ processors. */ 1056 /* Handle SIMD FPU exceptions on PIII+ processors. */
1015 ignore_fpu_irq = 1; 1057 ignore_fpu_irq = 1;
1016 simd_math_error((void __user *)regs->eip); 1058 simd_math_error((void __user *)regs->ip);
1017 } else { 1059 } else {
1018 /* 1060 /*
1019 * Handle strange cache flush from user space exception 1061 * Handle strange cache flush from user space exception
1020 * in all other cases. This is undocumented behaviour. 1062 * in all other cases. This is undocumented behaviour.
1021 */ 1063 */
1022 if (regs->eflags & VM_MASK) { 1064 if (regs->flags & VM_MASK) {
1023 handle_vm86_fault((struct kernel_vm86_regs *)regs, 1065 handle_vm86_fault((struct kernel_vm86_regs *)regs,
1024 error_code); 1066 error_code);
1025 return; 1067 return;
@@ -1031,7 +1073,7 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
1031 } 1073 }
1032} 1074}
1033 1075
1034fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, 1076void do_spurious_interrupt_bug(struct pt_regs * regs,
1035 long error_code) 1077 long error_code)
1036{ 1078{
1037#if 0 1079#if 0
@@ -1040,7 +1082,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1040#endif 1082#endif
1041} 1083}
1042 1084
1043fastcall unsigned long patch_espfix_desc(unsigned long uesp, 1085unsigned long patch_espfix_desc(unsigned long uesp,
1044 unsigned long kesp) 1086 unsigned long kesp)
1045{ 1087{
1046 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; 1088 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
@@ -1094,51 +1136,17 @@ asmlinkage void math_emulate(long arg)
1094 1136
1095#endif /* CONFIG_MATH_EMULATION */ 1137#endif /* CONFIG_MATH_EMULATION */
1096 1138
1097/*
1098 * This needs to use 'idt_table' rather than 'idt', and
1099 * thus use the _nonmapped_ version of the IDT, as the
1100 * Pentium F0 0F bugfix can have resulted in the mapped
1101 * IDT being write-protected.
1102 */
1103void set_intr_gate(unsigned int n, void *addr)
1104{
1105 _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
1106}
1107
1108/*
1109 * This routine sets up an interrupt gate at directory privilege level 3.
1110 */
1111static inline void set_system_intr_gate(unsigned int n, void *addr)
1112{
1113 _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
1114}
1115
1116static void __init set_trap_gate(unsigned int n, void *addr)
1117{
1118 _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
1119}
1120
1121static void __init set_system_gate(unsigned int n, void *addr)
1122{
1123 _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
1124}
1125
1126static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
1127{
1128 _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
1129}
1130
1131 1139
1132void __init trap_init(void) 1140void __init trap_init(void)
1133{ 1141{
1134 int i; 1142 int i;
1135 1143
1136#ifdef CONFIG_EISA 1144#ifdef CONFIG_EISA
1137 void __iomem *p = ioremap(0x0FFFD9, 4); 1145 void __iomem *p = early_ioremap(0x0FFFD9, 4);
1138 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { 1146 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
1139 EISA_bus = 1; 1147 EISA_bus = 1;
1140 } 1148 }
1141 iounmap(p); 1149 early_iounmap(p, 4);
1142#endif 1150#endif
1143 1151
1144#ifdef CONFIG_X86_LOCAL_APIC 1152#ifdef CONFIG_X86_LOCAL_APIC
@@ -1168,17 +1176,12 @@ void __init trap_init(void)
1168#endif 1176#endif
1169 set_trap_gate(19,&simd_coprocessor_error); 1177 set_trap_gate(19,&simd_coprocessor_error);
1170 1178
1179 /*
1180 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
1181 * Generate a build-time error if the alignment is wrong.
1182 */
1183 BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
1171 if (cpu_has_fxsr) { 1184 if (cpu_has_fxsr) {
1172 /*
1173 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
1174 * Generates a compile-time "error: zero width for bit-field" if
1175 * the alignment is wrong.
1176 */
1177 struct fxsrAlignAssert {
1178 int _:!(offsetof(struct task_struct,
1179 thread.i387.fxsave) & 15);
1180 };
1181
1182 printk(KERN_INFO "Enabling fast FPU save and restore... "); 1185 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1183 set_in_cr4(X86_CR4_OSFXSR); 1186 set_in_cr4(X86_CR4_OSFXSR);
1184 printk("done.\n"); 1187 printk("done.\n");
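The fxsave alignment check moves out of the cpu_has_fxsr branch and switches from the local zero-width-bit-field trick to BUILD_BUG_ON(). Both idioms turn the invariant into a compile-time failure; the standalone sketch below shows them side by side, with BUILD_BUG_ON defined via a negative-size array in the style kernels of this era used (that definition is an assumption here, it is not part of the patch):

#include <stddef.h>

struct fxsave_area { char bytes[512]; };

struct demo_thread {
	char pad[16];	/* change to char pad[8] and both checks fail to compile */
	struct fxsave_area fxsave;
};

/* Kernel-style compile-time assert: sizeof() of a negative-size array
 * fails to compile when the condition is true. */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))

/* Older trick used by the removed code: a named zero-width bit-field is
 * rejected by the compiler, so the width must evaluate to non-zero. */
struct fxsr_align_assert {
	int _:!(offsetof(struct demo_thread, fxsave) & 15);
};

int main(void)
{
	BUILD_BUG_ON(offsetof(struct demo_thread, fxsave) & 15);
	return 0;
}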
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index d11525ad81b..efc66df728b 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -74,22 +74,24 @@ asmlinkage void alignment_check(void);
74asmlinkage void machine_check(void); 74asmlinkage void machine_check(void);
75asmlinkage void spurious_interrupt_bug(void); 75asmlinkage void spurious_interrupt_bug(void);
76 76
77static unsigned int code_bytes = 64;
78
77static inline void conditional_sti(struct pt_regs *regs) 79static inline void conditional_sti(struct pt_regs *regs)
78{ 80{
79 if (regs->eflags & X86_EFLAGS_IF) 81 if (regs->flags & X86_EFLAGS_IF)
80 local_irq_enable(); 82 local_irq_enable();
81} 83}
82 84
83static inline void preempt_conditional_sti(struct pt_regs *regs) 85static inline void preempt_conditional_sti(struct pt_regs *regs)
84{ 86{
85 preempt_disable(); 87 preempt_disable();
86 if (regs->eflags & X86_EFLAGS_IF) 88 if (regs->flags & X86_EFLAGS_IF)
87 local_irq_enable(); 89 local_irq_enable();
88} 90}
89 91
90static inline void preempt_conditional_cli(struct pt_regs *regs) 92static inline void preempt_conditional_cli(struct pt_regs *regs)
91{ 93{
92 if (regs->eflags & X86_EFLAGS_IF) 94 if (regs->flags & X86_EFLAGS_IF)
93 local_irq_disable(); 95 local_irq_disable();
94 /* Make sure to not schedule here because we could be running 96 /* Make sure to not schedule here because we could be running
95 on an exception stack. */ 97 on an exception stack. */
@@ -98,14 +100,15 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
98 100
99int kstack_depth_to_print = 12; 101int kstack_depth_to_print = 12;
100 102
101#ifdef CONFIG_KALLSYMS 103void printk_address(unsigned long address, int reliable)
102void printk_address(unsigned long address)
103{ 104{
105#ifdef CONFIG_KALLSYMS
104 unsigned long offset = 0, symsize; 106 unsigned long offset = 0, symsize;
105 const char *symname; 107 const char *symname;
106 char *modname; 108 char *modname;
107 char *delim = ":"; 109 char *delim = ":";
108 char namebuf[128]; 110 char namebuf[KSYM_NAME_LEN];
111 char reliab[4] = "";
109 112
110 symname = kallsyms_lookup(address, &symsize, &offset, 113 symname = kallsyms_lookup(address, &symsize, &offset,
111 &modname, namebuf); 114 &modname, namebuf);
@@ -113,17 +116,17 @@ void printk_address(unsigned long address)
113 printk(" [<%016lx>]\n", address); 116 printk(" [<%016lx>]\n", address);
114 return; 117 return;
115 } 118 }
119 if (!reliable)
120 strcpy(reliab, "? ");
121
116 if (!modname) 122 if (!modname)
117 modname = delim = ""; 123 modname = delim = "";
118 printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", 124 printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
119 address, delim, modname, delim, symname, offset, symsize); 125 address, reliab, delim, modname, delim, symname, offset, symsize);
120}
121#else 126#else
122void printk_address(unsigned long address)
123{
124 printk(" [<%016lx>]\n", address); 127 printk(" [<%016lx>]\n", address);
125}
126#endif 128#endif
129}
127 130
128static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 131static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
129 unsigned *usedp, char **idp) 132 unsigned *usedp, char **idp)
@@ -208,14 +211,53 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
208 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 211 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
209 */ 212 */
210 213
211static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) 214static inline int valid_stack_ptr(struct thread_info *tinfo,
215 void *p, unsigned int size, void *end)
216{
217 void *t = tinfo;
218 if (end) {
219 if (p < end && p >= (end-THREAD_SIZE))
220 return 1;
221 else
222 return 0;
223 }
224 return p > t && p < t + THREAD_SIZE - size;
225}
226
227/* The form of the top of the frame on the stack */
228struct stack_frame {
229 struct stack_frame *next_frame;
230 unsigned long return_address;
231};
232
233
234static inline unsigned long print_context_stack(struct thread_info *tinfo,
235 unsigned long *stack, unsigned long bp,
236 const struct stacktrace_ops *ops, void *data,
237 unsigned long *end)
212{ 238{
213 void *t = (void *)tinfo; 239 struct stack_frame *frame = (struct stack_frame *)bp;
214 return p > t && p < t + THREAD_SIZE - 3; 240
241 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
242 unsigned long addr;
243
244 addr = *stack;
245 if (__kernel_text_address(addr)) {
246 if ((unsigned long) stack == bp + 8) {
247 ops->address(data, addr, 1);
248 frame = frame->next_frame;
249 bp = (unsigned long) frame;
250 } else {
251 ops->address(data, addr, bp == 0);
252 }
253 }
254 stack++;
255 }
256 return bp;
215} 257}
216 258
217void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 259void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
218 unsigned long *stack, 260 unsigned long *stack, unsigned long bp,
219 const struct stacktrace_ops *ops, void *data) 261 const struct stacktrace_ops *ops, void *data)
220{ 262{
221 const unsigned cpu = get_cpu(); 263 const unsigned cpu = get_cpu();
@@ -225,36 +267,28 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
225 267
226 if (!tsk) 268 if (!tsk)
227 tsk = current; 269 tsk = current;
270 tinfo = task_thread_info(tsk);
228 271
229 if (!stack) { 272 if (!stack) {
230 unsigned long dummy; 273 unsigned long dummy;
231 stack = &dummy; 274 stack = &dummy;
232 if (tsk && tsk != current) 275 if (tsk && tsk != current)
233 stack = (unsigned long *)tsk->thread.rsp; 276 stack = (unsigned long *)tsk->thread.sp;
234 } 277 }
235 278
236 /* 279#ifdef CONFIG_FRAME_POINTER
237 * Print function call entries within a stack. 'cond' is the 280 if (!bp) {
238 * "end of stackframe" condition, that the 'stack++' 281 if (tsk == current) {
239 * iteration will eventually trigger. 282 /* Grab bp right from our regs */
240 */ 283 asm("movq %%rbp, %0" : "=r" (bp):);
241#define HANDLE_STACK(cond) \ 284 } else {
242 do while (cond) { \ 285 /* bp is the last reg pushed by switch_to */
243 unsigned long addr = *stack++; \ 286 bp = *(unsigned long *) tsk->thread.sp;
244 /* Use unlocked access here because except for NMIs \ 287 }
245 we should be already protected against module unloads */ \ 288 }
246 if (__kernel_text_address(addr)) { \ 289#endif
247 /* \ 290
248 * If the address is either in the text segment of the \ 291
249 * kernel, or in the region which contains vmalloc'ed \
250 * memory, it *may* be the address of a calling \
251 * routine; if so, print it so that someone tracing \
252 * down the cause of the crash will be able to figure \
253 * out the call path that was taken. \
254 */ \
255 ops->address(data, addr); \
256 } \
257 } while (0)
258 292
259 /* 293 /*
260 * Print function call entries in all stacks, starting at the 294 * Print function call entries in all stacks, starting at the
@@ -270,7 +304,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
270 if (estack_end) { 304 if (estack_end) {
271 if (ops->stack(data, id) < 0) 305 if (ops->stack(data, id) < 0)
272 break; 306 break;
273 HANDLE_STACK (stack < estack_end); 307
308 bp = print_context_stack(tinfo, stack, bp, ops,
309 data, estack_end);
274 ops->stack(data, "<EOE>"); 310 ops->stack(data, "<EOE>");
275 /* 311 /*
276 * We link to the next stack via the 312 * We link to the next stack via the
@@ -288,7 +324,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
288 if (stack >= irqstack && stack < irqstack_end) { 324 if (stack >= irqstack && stack < irqstack_end) {
289 if (ops->stack(data, "IRQ") < 0) 325 if (ops->stack(data, "IRQ") < 0)
290 break; 326 break;
291 HANDLE_STACK (stack < irqstack_end); 327 bp = print_context_stack(tinfo, stack, bp,
328 ops, data, irqstack_end);
292 /* 329 /*
293 * We link to the next stack (which would be 330 * We link to the next stack (which would be
294 * the process stack normally) the last 331 * the process stack normally) the last
@@ -306,9 +343,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
306 /* 343 /*
307 * This handles the process stack: 344 * This handles the process stack:
308 */ 345 */
309 tinfo = task_thread_info(tsk); 346 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
310 HANDLE_STACK (valid_stack_ptr(tinfo, stack));
311#undef HANDLE_STACK
312 put_cpu(); 347 put_cpu();
313} 348}
314EXPORT_SYMBOL(dump_trace); 349EXPORT_SYMBOL(dump_trace);
@@ -331,10 +366,10 @@ static int print_trace_stack(void *data, char *name)
331 return 0; 366 return 0;
332} 367}
333 368
334static void print_trace_address(void *data, unsigned long addr) 369static void print_trace_address(void *data, unsigned long addr, int reliable)
335{ 370{
336 touch_nmi_watchdog(); 371 touch_nmi_watchdog();
337 printk_address(addr); 372 printk_address(addr, reliable);
338} 373}
339 374
340static const struct stacktrace_ops print_trace_ops = { 375static const struct stacktrace_ops print_trace_ops = {
@@ -345,15 +380,17 @@ static const struct stacktrace_ops print_trace_ops = {
345}; 380};
346 381
347void 382void
348show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) 383show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
384 unsigned long bp)
349{ 385{
350 printk("\nCall Trace:\n"); 386 printk("\nCall Trace:\n");
351 dump_trace(tsk, regs, stack, &print_trace_ops, NULL); 387 dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
352 printk("\n"); 388 printk("\n");
353} 389}
354 390
355static void 391static void
356_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) 392_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
393 unsigned long bp)
357{ 394{
358 unsigned long *stack; 395 unsigned long *stack;
359 int i; 396 int i;
@@ -364,14 +401,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
364 // debugging aid: "show_stack(NULL, NULL);" prints the 401 // debugging aid: "show_stack(NULL, NULL);" prints the
365 // back trace for this cpu. 402 // back trace for this cpu.
366 403
367 if (rsp == NULL) { 404 if (sp == NULL) {
368 if (tsk) 405 if (tsk)
369 rsp = (unsigned long *)tsk->thread.rsp; 406 sp = (unsigned long *)tsk->thread.sp;
370 else 407 else
371 rsp = (unsigned long *)&rsp; 408 sp = (unsigned long *)&sp;
372 } 409 }
373 410
374 stack = rsp; 411 stack = sp;
375 for(i=0; i < kstack_depth_to_print; i++) { 412 for(i=0; i < kstack_depth_to_print; i++) {
376 if (stack >= irqstack && stack <= irqstack_end) { 413 if (stack >= irqstack && stack <= irqstack_end) {
377 if (stack == irqstack_end) { 414 if (stack == irqstack_end) {
@@ -387,12 +424,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
387 printk(" %016lx", *stack++); 424 printk(" %016lx", *stack++);
388 touch_nmi_watchdog(); 425 touch_nmi_watchdog();
389 } 426 }
390 show_trace(tsk, regs, rsp); 427 show_trace(tsk, regs, sp, bp);
391} 428}
392 429
393void show_stack(struct task_struct *tsk, unsigned long * rsp) 430void show_stack(struct task_struct *tsk, unsigned long * sp)
394{ 431{
395 _show_stack(tsk, NULL, rsp); 432 _show_stack(tsk, NULL, sp, 0);
396} 433}
397 434
398/* 435/*
@@ -401,13 +438,19 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp)
401void dump_stack(void) 438void dump_stack(void)
402{ 439{
403 unsigned long dummy; 440 unsigned long dummy;
441 unsigned long bp = 0;
442
443#ifdef CONFIG_FRAME_POINTER
444 if (!bp)
445 asm("movq %%rbp, %0" : "=r" (bp):);
446#endif
404 447
405 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 448 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
406 current->pid, current->comm, print_tainted(), 449 current->pid, current->comm, print_tainted(),
407 init_utsname()->release, 450 init_utsname()->release,
408 (int)strcspn(init_utsname()->version, " "), 451 (int)strcspn(init_utsname()->version, " "),
409 init_utsname()->version); 452 init_utsname()->version);
410 show_trace(NULL, NULL, &dummy); 453 show_trace(NULL, NULL, &dummy, bp);
411} 454}
412 455
413EXPORT_SYMBOL(dump_stack); 456EXPORT_SYMBOL(dump_stack);
@@ -415,12 +458,15 @@ EXPORT_SYMBOL(dump_stack);
415void show_registers(struct pt_regs *regs) 458void show_registers(struct pt_regs *regs)
416{ 459{
417 int i; 460 int i;
418 int in_kernel = !user_mode(regs); 461 unsigned long sp;
419 unsigned long rsp;
420 const int cpu = smp_processor_id(); 462 const int cpu = smp_processor_id();
421 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 463 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
464 u8 *ip;
465 unsigned int code_prologue = code_bytes * 43 / 64;
466 unsigned int code_len = code_bytes;
422 467
423 rsp = regs->rsp; 468 sp = regs->sp;
469 ip = (u8 *) regs->ip - code_prologue;
424 printk("CPU %d ", cpu); 470 printk("CPU %d ", cpu);
425 __show_regs(regs); 471 __show_regs(regs);
426 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 472 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -430,45 +476,43 @@ void show_registers(struct pt_regs *regs)
430 * When in-kernel, we also print out the stack and code at the 476 * When in-kernel, we also print out the stack and code at the
431 * time of the fault.. 477 * time of the fault..
432 */ 478 */
433 if (in_kernel) { 479 if (!user_mode(regs)) {
480 unsigned char c;
434 printk("Stack: "); 481 printk("Stack: ");
435 _show_stack(NULL, regs, (unsigned long*)rsp); 482 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
436 483 printk("\n");
437 printk("\nCode: "); 484
438 if (regs->rip < PAGE_OFFSET) 485 printk(KERN_EMERG "Code: ");
439 goto bad; 486 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
440 487 /* try starting at RIP */
441 for (i=0; i<20; i++) { 488 ip = (u8 *) regs->ip;
442 unsigned char c; 489 code_len = code_len - code_prologue + 1;
443 if (__get_user(c, &((unsigned char*)regs->rip)[i])) { 490 }
444bad: 491 for (i = 0; i < code_len; i++, ip++) {
492 if (ip < (u8 *)PAGE_OFFSET ||
493 probe_kernel_address(ip, c)) {
445 printk(" Bad RIP value."); 494 printk(" Bad RIP value.");
446 break; 495 break;
447 } 496 }
448 printk("%02x ", c); 497 if (ip == (u8 *)regs->ip)
498 printk("<%02x> ", c);
499 else
500 printk("%02x ", c);
449 } 501 }
450 } 502 }
451 printk("\n"); 503 printk("\n");
452} 504}
453 505
454int is_valid_bugaddr(unsigned long rip) 506int is_valid_bugaddr(unsigned long ip)
455{ 507{
456 unsigned short ud2; 508 unsigned short ud2;
457 509
458 if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) 510 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
459 return 0; 511 return 0;
460 512
461 return ud2 == 0x0b0f; 513 return ud2 == 0x0b0f;
462} 514}
463 515
464#ifdef CONFIG_BUG
465void out_of_line_bug(void)
466{
467 BUG();
468}
469EXPORT_SYMBOL(out_of_line_bug);
470#endif
471
472static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; 516static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
473static int die_owner = -1; 517static int die_owner = -1;
474static unsigned int die_nest_count; 518static unsigned int die_nest_count;
@@ -496,7 +540,7 @@ unsigned __kprobes long oops_begin(void)
496 return flags; 540 return flags;
497} 541}
498 542
499void __kprobes oops_end(unsigned long flags) 543void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
500{ 544{
501 die_owner = -1; 545 die_owner = -1;
502 bust_spinlocks(0); 546 bust_spinlocks(0);
@@ -505,12 +549,17 @@ void __kprobes oops_end(unsigned long flags)
505 /* Nest count reaches zero, release the lock. */ 549 /* Nest count reaches zero, release the lock. */
506 __raw_spin_unlock(&die_lock); 550 __raw_spin_unlock(&die_lock);
507 raw_local_irq_restore(flags); 551 raw_local_irq_restore(flags);
552 if (!regs) {
553 oops_exit();
554 return;
555 }
508 if (panic_on_oops) 556 if (panic_on_oops)
509 panic("Fatal exception"); 557 panic("Fatal exception");
510 oops_exit(); 558 oops_exit();
559 do_exit(signr);
511} 560}
512 561
513void __kprobes __die(const char * str, struct pt_regs * regs, long err) 562int __kprobes __die(const char * str, struct pt_regs * regs, long err)
514{ 563{
515 static int die_counter; 564 static int die_counter;
516 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); 565 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
@@ -524,15 +573,17 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err)
524 printk("DEBUG_PAGEALLOC"); 573 printk("DEBUG_PAGEALLOC");
525#endif 574#endif
526 printk("\n"); 575 printk("\n");
527 notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); 576 if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
577 return 1;
528 show_registers(regs); 578 show_registers(regs);
529 add_taint(TAINT_DIE); 579 add_taint(TAINT_DIE);
530 /* Executive summary in case the oops scrolled away */ 580 /* Executive summary in case the oops scrolled away */
531 printk(KERN_ALERT "RIP "); 581 printk(KERN_ALERT "RIP ");
532 printk_address(regs->rip); 582 printk_address(regs->ip, 1);
533 printk(" RSP <%016lx>\n", regs->rsp); 583 printk(" RSP <%016lx>\n", regs->sp);
534 if (kexec_should_crash(current)) 584 if (kexec_should_crash(current))
535 crash_kexec(regs); 585 crash_kexec(regs);
586 return 0;
536} 587}
537 588
538void die(const char * str, struct pt_regs * regs, long err) 589void die(const char * str, struct pt_regs * regs, long err)
@@ -540,11 +591,11 @@ void die(const char * str, struct pt_regs * regs, long err)
540 unsigned long flags = oops_begin(); 591 unsigned long flags = oops_begin();
541 592
542 if (!user_mode(regs)) 593 if (!user_mode(regs))
543 report_bug(regs->rip, regs); 594 report_bug(regs->ip, regs);
544 595
545 __die(str, regs, err); 596 if (__die(str, regs, err))
546 oops_end(flags); 597 regs = NULL;
547 do_exit(SIGSEGV); 598 oops_end(flags, regs, SIGSEGV);
548} 599}
549 600
550void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) 601void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
@@ -561,10 +612,10 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
561 crash_kexec(regs); 612 crash_kexec(regs);
562 if (do_panic || panic_on_oops) 613 if (do_panic || panic_on_oops)
563 panic("Non maskable interrupt"); 614 panic("Non maskable interrupt");
564 oops_end(flags); 615 oops_end(flags, NULL, SIGBUS);
565 nmi_exit(); 616 nmi_exit();
566 local_irq_enable(); 617 local_irq_enable();
567 do_exit(SIGSEGV); 618 do_exit(SIGBUS);
568} 619}
569 620
570static void __kprobes do_trap(int trapnr, int signr, char *str, 621static void __kprobes do_trap(int trapnr, int signr, char *str,
@@ -588,11 +639,14 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
588 tsk->thread.trap_no = trapnr; 639 tsk->thread.trap_no = trapnr;
589 640
590 if (show_unhandled_signals && unhandled_signal(tsk, signr) && 641 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
591 printk_ratelimit()) 642 printk_ratelimit()) {
592 printk(KERN_INFO 643 printk(KERN_INFO
593 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", 644 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
594 tsk->comm, tsk->pid, str, 645 tsk->comm, tsk->pid, str,
595 regs->rip, regs->rsp, error_code); 646 regs->ip, regs->sp, error_code);
647 print_vma_addr(" in ", regs->ip);
648 printk("\n");
649 }
596 650
597 if (info) 651 if (info)
598 force_sig_info(signr, info, tsk); 652 force_sig_info(signr, info, tsk);
@@ -602,19 +656,12 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
602 } 656 }
603 657
604 658
605 /* kernel trap */ 659 if (!fixup_exception(regs)) {
606 { 660 tsk->thread.error_code = error_code;
607 const struct exception_table_entry *fixup; 661 tsk->thread.trap_no = trapnr;
608 fixup = search_exception_tables(regs->rip); 662 die(str, regs, error_code);
609 if (fixup)
610 regs->rip = fixup->fixup;
611 else {
612 tsk->thread.error_code = error_code;
613 tsk->thread.trap_no = trapnr;
614 die(str, regs, error_code);
615 }
616 return;
617 } 663 }
664 return;
618} 665}
619 666
620#define DO_ERROR(trapnr, signr, str, name) \ 667#define DO_ERROR(trapnr, signr, str, name) \
@@ -635,6 +682,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
635 info.si_errno = 0; \ 682 info.si_errno = 0; \
636 info.si_code = sicode; \ 683 info.si_code = sicode; \
637 info.si_addr = (void __user *)siaddr; \ 684 info.si_addr = (void __user *)siaddr; \
685 trace_hardirqs_fixup(); \
638 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 686 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
639 == NOTIFY_STOP) \ 687 == NOTIFY_STOP) \
640 return; \ 688 return; \
@@ -642,10 +690,10 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
642 do_trap(trapnr, signr, str, regs, error_code, &info); \ 690 do_trap(trapnr, signr, str, regs, error_code, &info); \
643} 691}
644 692
645DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) 693DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
646DO_ERROR( 4, SIGSEGV, "overflow", overflow) 694DO_ERROR( 4, SIGSEGV, "overflow", overflow)
647DO_ERROR( 5, SIGSEGV, "bounds", bounds) 695DO_ERROR( 5, SIGSEGV, "bounds", bounds)
648DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) 696DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
649DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) 697DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
650DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 698DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
651DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 699DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
@@ -693,32 +741,28 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
693 tsk->thread.trap_no = 13; 741 tsk->thread.trap_no = 13;
694 742
695 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 743 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
696 printk_ratelimit()) 744 printk_ratelimit()) {
697 printk(KERN_INFO 745 printk(KERN_INFO
698 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", 746 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
699 tsk->comm, tsk->pid, 747 tsk->comm, tsk->pid,
700 regs->rip, regs->rsp, error_code); 748 regs->ip, regs->sp, error_code);
749 print_vma_addr(" in ", regs->ip);
750 printk("\n");
751 }
701 752
702 force_sig(SIGSEGV, tsk); 753 force_sig(SIGSEGV, tsk);
703 return; 754 return;
704 } 755 }
705 756
706 /* kernel gp */ 757 if (fixup_exception(regs))
707 { 758 return;
708 const struct exception_table_entry *fixup;
709 fixup = search_exception_tables(regs->rip);
710 if (fixup) {
711 regs->rip = fixup->fixup;
712 return;
713 }
714 759
715 tsk->thread.error_code = error_code; 760 tsk->thread.error_code = error_code;
716 tsk->thread.trap_no = 13; 761 tsk->thread.trap_no = 13;
717 if (notify_die(DIE_GPF, "general protection fault", regs, 762 if (notify_die(DIE_GPF, "general protection fault", regs,
718 error_code, 13, SIGSEGV) == NOTIFY_STOP) 763 error_code, 13, SIGSEGV) == NOTIFY_STOP)
719 return; 764 return;
720 die("general protection fault", regs, error_code); 765 die("general protection fault", regs, error_code);
721 }
722} 766}
723 767
724static __kprobes void 768static __kprobes void
@@ -831,15 +875,15 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
831{ 875{
832 struct pt_regs *regs = eregs; 876 struct pt_regs *regs = eregs;
833 /* Did already sync */ 877 /* Did already sync */
834 if (eregs == (struct pt_regs *)eregs->rsp) 878 if (eregs == (struct pt_regs *)eregs->sp)
835 ; 879 ;
836 /* Exception from user space */ 880 /* Exception from user space */
837 else if (user_mode(eregs)) 881 else if (user_mode(eregs))
838 regs = task_pt_regs(current); 882 regs = task_pt_regs(current);
839 /* Exception from kernel and interrupts are enabled. Move to 883 /* Exception from kernel and interrupts are enabled. Move to
840 kernel process stack. */ 884 kernel process stack. */
841 else if (eregs->eflags & X86_EFLAGS_IF) 885 else if (eregs->flags & X86_EFLAGS_IF)
842 regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); 886 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
843 if (eregs != regs) 887 if (eregs != regs)
844 *regs = *eregs; 888 *regs = *eregs;
845 return regs; 889 return regs;
@@ -857,6 +901,12 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
857 901
858 get_debugreg(condition, 6); 902 get_debugreg(condition, 6);
859 903
904 /*
905 * The processor cleared BTF, so don't mark that we need it set.
906 */
907 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
908 tsk->thread.debugctlmsr = 0;
909
860 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 910 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
861 SIGTRAP) == NOTIFY_STOP) 911 SIGTRAP) == NOTIFY_STOP)
862 return; 912 return;
@@ -872,27 +922,14 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
872 922
873 tsk->thread.debugreg6 = condition; 923 tsk->thread.debugreg6 = condition;
874 924
875 /* Mask out spurious TF errors due to lazy TF clearing */ 925
926 /*
927 * Single-stepping through TF: make sure we ignore any events in
928 * kernel space (but re-enable TF when returning to user mode).
929 */
876 if (condition & DR_STEP) { 930 if (condition & DR_STEP) {
877 /*
878 * The TF error should be masked out only if the current
879 * process is not traced and if the TRAP flag has been set
880 * previously by a tracing process (condition detected by
881 * the PT_DTRACE flag); remember that the i386 TRAP flag
882 * can be modified by the process itself in user mode,
883 * allowing programs to debug themselves without the ptrace()
884 * interface.
885 */
886 if (!user_mode(regs)) 931 if (!user_mode(regs))
887 goto clear_TF_reenable; 932 goto clear_TF_reenable;
888 /*
889 * Was the TF flag set by a debugger? If so, clear it now,
890 * so that register information is correct.
891 */
892 if (tsk->ptrace & PT_DTRACE) {
893 regs->eflags &= ~TF_MASK;
894 tsk->ptrace &= ~PT_DTRACE;
895 }
896 } 933 }
897 934
898 /* Ok, finally something we can handle */ 935 /* Ok, finally something we can handle */
@@ -901,7 +938,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
901 info.si_signo = SIGTRAP; 938 info.si_signo = SIGTRAP;
902 info.si_errno = 0; 939 info.si_errno = 0;
903 info.si_code = TRAP_BRKPT; 940 info.si_code = TRAP_BRKPT;
904 info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; 941 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
905 force_sig_info(SIGTRAP, &info, tsk); 942 force_sig_info(SIGTRAP, &info, tsk);
906 943
907clear_dr7: 944clear_dr7:
@@ -911,18 +948,15 @@ clear_dr7:
911 948
912clear_TF_reenable: 949clear_TF_reenable:
913 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 950 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
914 regs->eflags &= ~TF_MASK; 951 regs->flags &= ~X86_EFLAGS_TF;
915 preempt_conditional_cli(regs); 952 preempt_conditional_cli(regs);
916} 953}
917 954
918static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) 955static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
919{ 956{
920 const struct exception_table_entry *fixup; 957 if (fixup_exception(regs))
921 fixup = search_exception_tables(regs->rip);
922 if (fixup) {
923 regs->rip = fixup->fixup;
924 return 1; 958 return 1;
925 } 959
926 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); 960 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
927 /* Illegal floating point operation in the kernel */ 961 /* Illegal floating point operation in the kernel */
928 current->thread.trap_no = trapnr; 962 current->thread.trap_no = trapnr;
@@ -937,7 +971,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
937 */ 971 */
938asmlinkage void do_coprocessor_error(struct pt_regs *regs) 972asmlinkage void do_coprocessor_error(struct pt_regs *regs)
939{ 973{
940 void __user *rip = (void __user *)(regs->rip); 974 void __user *ip = (void __user *)(regs->ip);
941 struct task_struct * task; 975 struct task_struct * task;
942 siginfo_t info; 976 siginfo_t info;
943 unsigned short cwd, swd; 977 unsigned short cwd, swd;
@@ -957,7 +991,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
957 info.si_signo = SIGFPE; 991 info.si_signo = SIGFPE;
958 info.si_errno = 0; 992 info.si_errno = 0;
959 info.si_code = __SI_FAULT; 993 info.si_code = __SI_FAULT;
960 info.si_addr = rip; 994 info.si_addr = ip;
961 /* 995 /*
962 * (~cwd & swd) will mask out exceptions that are not set to unmasked 996 * (~cwd & swd) will mask out exceptions that are not set to unmasked
963 * status. 0x3f is the exception bits in these regs, 0x200 is the 997 * status. 0x3f is the exception bits in these regs, 0x200 is the
@@ -1006,7 +1040,7 @@ asmlinkage void bad_intr(void)
1006 1040
1007asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) 1041asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1008{ 1042{
1009 void __user *rip = (void __user *)(regs->rip); 1043 void __user *ip = (void __user *)(regs->ip);
1010 struct task_struct * task; 1044 struct task_struct * task;
1011 siginfo_t info; 1045 siginfo_t info;
1012 unsigned short mxcsr; 1046 unsigned short mxcsr;
@@ -1026,7 +1060,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1026 info.si_signo = SIGFPE; 1060 info.si_signo = SIGFPE;
1027 info.si_errno = 0; 1061 info.si_errno = 0;
1028 info.si_code = __SI_FAULT; 1062 info.si_code = __SI_FAULT;
1029 info.si_addr = rip; 1063 info.si_addr = ip;
1030 /* 1064 /*
1031 * The SIMD FPU exceptions are handled a little differently, as there 1065 * The SIMD FPU exceptions are handled a little differently, as there
1032 * is only a single status/control register. Thus, to determine which 1066 * is only a single status/control register. Thus, to determine which
@@ -1088,6 +1122,7 @@ asmlinkage void math_state_restore(void)
1088 task_thread_info(me)->status |= TS_USEDFPU; 1122 task_thread_info(me)->status |= TS_USEDFPU;
1089 me->fpu_counter++; 1123 me->fpu_counter++;
1090} 1124}
1125EXPORT_SYMBOL_GPL(math_state_restore);
1091 1126
1092void __init trap_init(void) 1127void __init trap_init(void)
1093{ 1128{
@@ -1143,3 +1178,14 @@ static int __init kstack_setup(char *s)
1143 return 0; 1178 return 0;
1144} 1179}
1145early_param("kstack", kstack_setup); 1180early_param("kstack", kstack_setup);
1181
1182
1183static int __init code_bytes_setup(char *s)
1184{
1185 code_bytes = simple_strtoul(s, NULL, 0);
1186 if (code_bytes > 8192)
1187 code_bytes = 8192;
1188
1189 return 1;
1190}
1191__setup("code_bytes=", code_bytes_setup);
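
The show_registers() rework above sizes the "Code:" dump from the new code_bytes knob, placing roughly two thirds of the dumped bytes before the faulting RIP (code_prologue = code_bytes * 43 / 64) and bracketing the byte at RIP itself. A minimal user-space sketch of that arithmetic follows; the default of 64 for code_bytes is an assumption, since the variable's definition is not part of this hunk:

#include <stdio.h>

int main(void)
{
	/* 64 is an assumed default for code_bytes; the code_bytes=
	 * parameter overrides it via code_bytes_setup() above. */
	unsigned int code_bytes = 64;
	unsigned int code_prologue = code_bytes * 43 / 64;

	printf("bytes dumped before RIP:  %u\n", code_prologue);                /* 43 */
	printf("bytes dumped from RIP on: %u\n", code_bytes - code_prologue);   /* 21 */
	return 0;
}

If the prologue start turns out to be unreadable, the loop in show_registers() falls back to dumping from RIP itself with a correspondingly shorter length, as the hunk shows.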
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 9ebc0dab66b..43517e324be 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -5,6 +5,7 @@
5#include <linux/jiffies.h> 5#include <linux/jiffies.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/dmi.h> 7#include <linux/dmi.h>
8#include <linux/percpu.h>
8 9
9#include <asm/delay.h> 10#include <asm/delay.h>
10#include <asm/tsc.h> 11#include <asm/tsc.h>
@@ -23,8 +24,6 @@ static int tsc_enabled;
23unsigned int tsc_khz; 24unsigned int tsc_khz;
24EXPORT_SYMBOL_GPL(tsc_khz); 25EXPORT_SYMBOL_GPL(tsc_khz);
25 26
26int tsc_disable;
27
28#ifdef CONFIG_X86_TSC 27#ifdef CONFIG_X86_TSC
29static int __init tsc_setup(char *str) 28static int __init tsc_setup(char *str)
30{ 29{
@@ -39,8 +38,7 @@ static int __init tsc_setup(char *str)
39 */ 38 */
40static int __init tsc_setup(char *str) 39static int __init tsc_setup(char *str)
41{ 40{
42 tsc_disable = 1; 41 setup_clear_cpu_cap(X86_FEATURE_TSC);
43
44 return 1; 42 return 1;
45} 43}
46#endif 44#endif
@@ -80,13 +78,31 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
80 * 78 *
81 * -johnstul@us.ibm.com "math is hard, lets go shopping!" 79 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
82 */ 80 */
83unsigned long cyc2ns_scale __read_mostly;
84 81
85#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 82DEFINE_PER_CPU(unsigned long, cyc2ns);
86 83
87static inline void set_cyc2ns_scale(unsigned long cpu_khz) 84static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
88{ 85{
89 cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; 86 unsigned long flags, prev_scale, *scale;
87 unsigned long long tsc_now, ns_now;
88
89 local_irq_save(flags);
90 sched_clock_idle_sleep_event();
91
92 scale = &per_cpu(cyc2ns, cpu);
93
94 rdtscll(tsc_now);
95 ns_now = __cycles_2_ns(tsc_now);
96
97 prev_scale = *scale;
98 if (cpu_khz)
99 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
100
101 /*
102 * Start smoothly with the new frequency:
103 */
104 sched_clock_idle_wakeup_event(0);
105 local_irq_restore(flags);
90} 106}
91 107
92/* 108/*
@@ -239,7 +255,9 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
239 ref_freq, freq->new); 255 ref_freq, freq->new);
240 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { 256 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
241 tsc_khz = cpu_khz; 257 tsc_khz = cpu_khz;
242 set_cyc2ns_scale(cpu_khz); 258 preempt_disable();
259 set_cyc2ns_scale(cpu_khz, smp_processor_id());
260 preempt_enable();
243 /* 261 /*
244 * TSC based sched_clock turns 262 * TSC based sched_clock turns
245 * to junk w/ cpufreq 263 * to junk w/ cpufreq
@@ -333,6 +351,11 @@ __cpuinit int unsynchronized_tsc(void)
333{ 351{
334 if (!cpu_has_tsc || tsc_unstable) 352 if (!cpu_has_tsc || tsc_unstable)
335 return 1; 353 return 1;
354
355 /* Anything with constant TSC should be synchronized */
356 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
357 return 0;
358
336 /* 359 /*
337 * Intel systems are normally all synchronized. 360 * Intel systems are normally all synchronized.
338 * Exceptions must mark TSC as unstable: 361 * Exceptions must mark TSC as unstable:
@@ -367,7 +390,9 @@ static inline void check_geode_tsc_reliable(void) { }
367 390
368void __init tsc_init(void) 391void __init tsc_init(void)
369{ 392{
370 if (!cpu_has_tsc || tsc_disable) 393 int cpu;
394
395 if (!cpu_has_tsc)
371 goto out_no_tsc; 396 goto out_no_tsc;
372 397
373 cpu_khz = calculate_cpu_khz(); 398 cpu_khz = calculate_cpu_khz();
@@ -380,7 +405,15 @@ void __init tsc_init(void)
380 (unsigned long)cpu_khz / 1000, 405 (unsigned long)cpu_khz / 1000,
381 (unsigned long)cpu_khz % 1000); 406 (unsigned long)cpu_khz % 1000);
382 407
383 set_cyc2ns_scale(cpu_khz); 408 /*
409 * Secondary CPUs do not run through tsc_init(), so set up
410 * all the scale factors for all CPUs, assuming the same
411 * speed as the bootup CPU. (cpufreq notifiers will fix this
412 * up if their speed diverges)
413 */
414 for_each_possible_cpu(cpu)
415 set_cyc2ns_scale(cpu_khz, cpu);
416
384 use_tsc_delay(); 417 use_tsc_delay();
385 418
386 /* Check and install the TSC clocksource */ 419 /* Check and install the TSC clocksource */
@@ -403,10 +436,5 @@ void __init tsc_init(void)
403 return; 436 return;
404 437
405out_no_tsc: 438out_no_tsc:
406 /* 439 setup_clear_cpu_cap(X86_FEATURE_TSC);
407 * Set the tsc_disable flag if there's no TSC support, this
408 * makes it a fast flag for the kernel to see whether it
409 * should be using the TSC.
410 */
411 tsc_disable = 1;
412} 440}
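
The per-CPU cyc2ns scale introduced above keeps the existing fixed-point scheme — scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz, with CYC2NS_SCALE_FACTOR = 10 — but stores one scale per CPU so the cpufreq notifier can retune each core independently. A minimal sketch of the conversion a consumer such as sched_clock() performs with that scale; cycles_to_ns() and compute_scale() are illustrative names, not kernel helpers:

#define CYC2NS_SCALE_FACTOR	10		/* 2^10, as in tsc_32.c */
#define NSEC_PER_MSEC		1000000UL

/* scale as computed in set_cyc2ns_scale() above */
static unsigned long compute_scale(unsigned long cpu_khz)
{
	return (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz;
}

/* fixed-point cycles -> nanoseconds */
static unsigned long long cycles_to_ns(unsigned long long cyc, unsigned long scale)
{
	return (cyc * scale) >> CYC2NS_SCALE_FACTOR;
}

For a 2 GHz CPU (cpu_khz = 2000000) the scale comes out to 512, so cycles_to_ns() reduces to cyc / 2 — half a nanosecond per cycle, as expected.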
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index 9c70af45b42..947554ddabb 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -10,6 +10,7 @@
10 10
11#include <asm/hpet.h> 11#include <asm/hpet.h>
12#include <asm/timex.h> 12#include <asm/timex.h>
13#include <asm/timer.h>
13 14
14static int notsc __initdata = 0; 15static int notsc __initdata = 0;
15 16
@@ -18,19 +19,51 @@ EXPORT_SYMBOL(cpu_khz);
18unsigned int tsc_khz; 19unsigned int tsc_khz;
19EXPORT_SYMBOL(tsc_khz); 20EXPORT_SYMBOL(tsc_khz);
20 21
21static unsigned int cyc2ns_scale __read_mostly; 22/* Accelerators for sched_clock()
23 * convert from cycles(64bits) => nanoseconds (64bits)
24 * basic equation:
25 * ns = cycles / (freq / ns_per_sec)
26 * ns = cycles * (ns_per_sec / freq)
27 * ns = cycles * (10^9 / (cpu_khz * 10^3))
28 * ns = cycles * (10^6 / cpu_khz)
29 *
30 * Then we use scaling math (suggested by george@mvista.com) to get:
31 * ns = cycles * (10^6 * SC / cpu_khz) / SC
32 * ns = cycles * cyc2ns_scale / SC
33 *
34 * And since SC is a constant power of two, we can convert the div
35 * into a shift.
36 *
37 * We can use khz divisor instead of mhz to keep a better precision, since
38 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
39 * (mathieu.desnoyers@polymtl.ca)
40 *
41 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
42 */
43DEFINE_PER_CPU(unsigned long, cyc2ns);
22 44
23static inline void set_cyc2ns_scale(unsigned long khz) 45static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
24{ 46{
25 cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; 47 unsigned long flags, prev_scale, *scale;
26} 48 unsigned long long tsc_now, ns_now;
27 49
28static unsigned long long cycles_2_ns(unsigned long long cyc) 50 local_irq_save(flags);
29{ 51 sched_clock_idle_sleep_event();
30 return (cyc * cyc2ns_scale) >> NS_SCALE; 52
53 scale = &per_cpu(cyc2ns, cpu);
54
55 rdtscll(tsc_now);
56 ns_now = __cycles_2_ns(tsc_now);
57
58 prev_scale = *scale;
59 if (cpu_khz)
60 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
61
62 sched_clock_idle_wakeup_event(0);
63 local_irq_restore(flags);
31} 64}
32 65
33unsigned long long sched_clock(void) 66unsigned long long native_sched_clock(void)
34{ 67{
35 unsigned long a = 0; 68 unsigned long a = 0;
36 69
@@ -44,12 +77,27 @@ unsigned long long sched_clock(void)
44 return cycles_2_ns(a); 77 return cycles_2_ns(a);
45} 78}
46 79
80/* We need to define a real function for sched_clock, to override the
81 weak default version */
82#ifdef CONFIG_PARAVIRT
83unsigned long long sched_clock(void)
84{
85 return paravirt_sched_clock();
86}
87#else
88unsigned long long
89sched_clock(void) __attribute__((alias("native_sched_clock")));
90#endif
91
92
47static int tsc_unstable; 93static int tsc_unstable;
48 94
49inline int check_tsc_unstable(void) 95int check_tsc_unstable(void)
50{ 96{
51 return tsc_unstable; 97 return tsc_unstable;
52} 98}
99EXPORT_SYMBOL_GPL(check_tsc_unstable);
100
53#ifdef CONFIG_CPU_FREQ 101#ifdef CONFIG_CPU_FREQ
54 102
55/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency 103/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -100,7 +148,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
100 mark_tsc_unstable("cpufreq changes"); 148 mark_tsc_unstable("cpufreq changes");
101 } 149 }
102 150
103 set_cyc2ns_scale(tsc_khz_ref); 151 preempt_disable();
152 set_cyc2ns_scale(tsc_khz_ref, smp_processor_id());
153 preempt_enable();
104 154
105 return 0; 155 return 0;
106} 156}
@@ -133,12 +183,12 @@ static unsigned long __init tsc_read_refs(unsigned long *pm,
133 int i; 183 int i;
134 184
135 for (i = 0; i < MAX_RETRIES; i++) { 185 for (i = 0; i < MAX_RETRIES; i++) {
136 t1 = get_cycles_sync(); 186 t1 = get_cycles();
137 if (hpet) 187 if (hpet)
138 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 188 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
139 else 189 else
140 *pm = acpi_pm_read_early(); 190 *pm = acpi_pm_read_early();
141 t2 = get_cycles_sync(); 191 t2 = get_cycles();
142 if ((t2 - t1) < SMI_TRESHOLD) 192 if ((t2 - t1) < SMI_TRESHOLD)
143 return t2; 193 return t2;
144 } 194 }
@@ -151,7 +201,7 @@ static unsigned long __init tsc_read_refs(unsigned long *pm,
151void __init tsc_calibrate(void) 201void __init tsc_calibrate(void)
152{ 202{
153 unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; 203 unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2;
154 int hpet = is_hpet_enabled(); 204 int hpet = is_hpet_enabled(), cpu;
155 205
156 local_irq_save(flags); 206 local_irq_save(flags);
157 207
@@ -162,9 +212,9 @@ void __init tsc_calibrate(void)
162 outb(0xb0, 0x43); 212 outb(0xb0, 0x43);
163 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 213 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
164 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 214 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
165 tr1 = get_cycles_sync(); 215 tr1 = get_cycles();
166 while ((inb(0x61) & 0x20) == 0); 216 while ((inb(0x61) & 0x20) == 0);
167 tr2 = get_cycles_sync(); 217 tr2 = get_cycles();
168 218
169 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 219 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
170 220
@@ -206,7 +256,9 @@ void __init tsc_calibrate(void)
206 } 256 }
207 257
208 tsc_khz = tsc2 / tsc1; 258 tsc_khz = tsc2 / tsc1;
209 set_cyc2ns_scale(tsc_khz); 259
260 for_each_possible_cpu(cpu)
261 set_cyc2ns_scale(tsc_khz, cpu);
210} 262}
211 263
212/* 264/*
@@ -222,17 +274,9 @@ __cpuinit int unsynchronized_tsc(void)
222 if (apic_is_clustered_box()) 274 if (apic_is_clustered_box())
223 return 1; 275 return 1;
224#endif 276#endif
225 /* Most intel systems have synchronized TSCs except for 277
226 multi node systems */ 278 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
227 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
228#ifdef CONFIG_ACPI
229 /* But TSC doesn't tick in C3 so don't use it there */
230 if (acpi_gbl_FADT.header.length > 0 &&
231 acpi_gbl_FADT.C3latency < 1000)
232 return 1;
233#endif
234 return 0; 279 return 0;
235 }
236 280
237 /* Assume multi socket systems are not synchronized */ 281 /* Assume multi socket systems are not synchronized */
238 return num_present_cpus() > 1; 282 return num_present_cpus() > 1;
@@ -250,13 +294,13 @@ __setup("notsc", notsc_setup);
250/* clock source code: */ 294/* clock source code: */
251static cycle_t read_tsc(void) 295static cycle_t read_tsc(void)
252{ 296{
253 cycle_t ret = (cycle_t)get_cycles_sync(); 297 cycle_t ret = (cycle_t)get_cycles();
254 return ret; 298 return ret;
255} 299}
256 300
257static cycle_t __vsyscall_fn vread_tsc(void) 301static cycle_t __vsyscall_fn vread_tsc(void)
258{ 302{
259 cycle_t ret = (cycle_t)get_cycles_sync(); 303 cycle_t ret = (cycle_t)vget_cycles();
260 return ret; 304 return ret;
261} 305}
262 306
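
One detail worth noting in the tsc_64.c hunk above: when CONFIG_PARAVIRT is not set, sched_clock() is not a wrapper around native_sched_clock() but a straight symbol alias of it, so no extra call is paid. A tiny stand-alone illustration of that GCC alias pattern, with made-up names:

/* native_impl() stands in for native_sched_clock(), public_clock() for
 * sched_clock(); both names are illustrative only. */
unsigned long long native_impl(void)
{
	return 42ULL;	/* would be the rdtsc-based implementation */
}

unsigned long long public_clock(void)
	__attribute__((alias("native_impl")));

Both symbols resolve to the same code, so callers of public_clock() land in native_impl() without an intermediate jump.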
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9125efe66a0..0577825cf89 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -46,7 +46,7 @@ static __cpuinit void check_tsc_warp(void)
46 cycles_t start, now, prev, end; 46 cycles_t start, now, prev, end;
47 int i; 47 int i;
48 48
49 start = get_cycles_sync(); 49 start = get_cycles();
50 /* 50 /*
51 * The measurement runs for 20 msecs: 51 * The measurement runs for 20 msecs:
52 */ 52 */
@@ -61,18 +61,18 @@ static __cpuinit void check_tsc_warp(void)
61 */ 61 */
62 __raw_spin_lock(&sync_lock); 62 __raw_spin_lock(&sync_lock);
63 prev = last_tsc; 63 prev = last_tsc;
64 now = get_cycles_sync(); 64 now = get_cycles();
65 last_tsc = now; 65 last_tsc = now;
66 __raw_spin_unlock(&sync_lock); 66 __raw_spin_unlock(&sync_lock);
67 67
68 /* 68 /*
69 * Be nice every now and then (and also check whether 69 * Be nice every now and then (and also check whether
70 * measurement is done [we also insert a 100 million 70 * measurement is done [we also insert a 10 million
71 * loops safety exit, so we dont lock up in case the 71 * loops safety exit, so we dont lock up in case the
72 * TSC readout is totally broken]): 72 * TSC readout is totally broken]):
73 */ 73 */
74 if (unlikely(!(i & 7))) { 74 if (unlikely(!(i & 7))) {
75 if (now > end || i > 100000000) 75 if (now > end || i > 10000000)
76 break; 76 break;
77 cpu_relax(); 77 cpu_relax();
78 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
@@ -87,7 +87,11 @@ static __cpuinit void check_tsc_warp(void)
87 nr_warps++; 87 nr_warps++;
88 __raw_spin_unlock(&sync_lock); 88 __raw_spin_unlock(&sync_lock);
89 } 89 }
90 90 }
91 if (!(now-start)) {
92 printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
93 now-start, end-start);
94 WARN_ON(1);
91 } 95 }
92} 96}
93 97
@@ -129,24 +133,24 @@ void __cpuinit check_tsc_sync_source(int cpu)
129 while (atomic_read(&stop_count) != cpus-1) 133 while (atomic_read(&stop_count) != cpus-1)
130 cpu_relax(); 134 cpu_relax();
131 135
132 /*
133 * Reset it - just in case we boot another CPU later:
134 */
135 atomic_set(&start_count, 0);
136
137 if (nr_warps) { 136 if (nr_warps) {
138 printk("\n"); 137 printk("\n");
139 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," 138 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
140 " turning off TSC clock.\n", max_warp); 139 " turning off TSC clock.\n", max_warp);
141 mark_tsc_unstable("check_tsc_sync_source failed"); 140 mark_tsc_unstable("check_tsc_sync_source failed");
142 nr_warps = 0;
143 max_warp = 0;
144 last_tsc = 0;
145 } else { 141 } else {
146 printk(" passed.\n"); 142 printk(" passed.\n");
147 } 143 }
148 144
149 /* 145 /*
146 * Reset it - just in case we boot another CPU later:
147 */
148 atomic_set(&start_count, 0);
149 nr_warps = 0;
150 max_warp = 0;
151 last_tsc = 0;
152
153 /*
150 * Let the target continue with the bootup: 154 * Let the target continue with the bootup:
151 */ 155 */
152 atomic_inc(&stop_count); 156 atomic_inc(&stop_count);
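
The check_tsc_warp() changes above leave the core idea intact: the two CPUs take turns under sync_lock, each publishing its TSC reading and comparing it against the value the other CPU stored last; time appearing to run backwards across CPUs counts as a warp. A minimal single-threaded model of that comparison, with the spinlock and the actual TSC read elided (names are illustrative):

static unsigned long long last_tsc, max_warp;
static int nr_warps;

/* One pass of the comparison; 'now' would be the current CPU's TSC,
 * read while holding sync_lock in the real code. */
static void warp_check_once(unsigned long long now)
{
	unsigned long long prev = last_tsc;	/* published by the other CPU */

	last_tsc = now;
	if (prev > now) {			/* TSC went backwards */
		unsigned long long delta = prev - now;

		if (delta > max_warp)
			max_warp = delta;
		nr_warps++;
	}
}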
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 157e4bedd3c..738c2104df3 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -70,10 +70,10 @@
70/* 70/*
71 * 8- and 16-bit register defines.. 71 * 8- and 16-bit register defines..
72 */ 72 */
73#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) 73#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0])
74#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) 74#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1])
75#define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) 75#define IP(regs) (*(unsigned short *)&((regs)->pt.ip))
76#define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) 76#define SP(regs) (*(unsigned short *)&((regs)->pt.sp))
77 77
78/* 78/*
79 * virtual flags (16 and 32-bit versions) 79 * virtual flags (16 and 32-bit versions)
@@ -93,12 +93,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
93{ 93{
94 int ret = 0; 94 int ret = 0;
95 95
96 /* kernel_vm86_regs is missing xgs, so copy everything up to 96 /* kernel_vm86_regs is missing gs, so copy everything up to
97 (but not including) orig_eax, and then rest including orig_eax. */ 97 (but not including) orig_eax, and then rest including orig_eax. */
98 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); 98 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
99 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax, 99 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
100 sizeof(struct kernel_vm86_regs) - 100 sizeof(struct kernel_vm86_regs) -
101 offsetof(struct kernel_vm86_regs, pt.orig_eax)); 101 offsetof(struct kernel_vm86_regs, pt.orig_ax));
102 102
103 return ret; 103 return ret;
104} 104}
@@ -110,18 +110,17 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
110{ 110{
111 int ret = 0; 111 int ret = 0;
112 112
113 /* copy eax-xfs inclusive */ 113 /* copy ax-fs inclusive */
114 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); 114 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
115 /* copy orig_eax-__gsh+extra */ 115 /* copy orig_ax-__gsh+extra */
116 ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax, 116 ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
117 sizeof(struct kernel_vm86_regs) - 117 sizeof(struct kernel_vm86_regs) -
118 offsetof(struct kernel_vm86_regs, pt.orig_eax) + 118 offsetof(struct kernel_vm86_regs, pt.orig_ax) +
119 extra); 119 extra);
120 return ret; 120 return ret;
121} 121}
122 122
123struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); 123struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs)
124struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
125{ 124{
126 struct tss_struct *tss; 125 struct tss_struct *tss;
127 struct pt_regs *ret; 126 struct pt_regs *ret;
@@ -138,7 +137,7 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
138 printk("no vm86_info: BAD\n"); 137 printk("no vm86_info: BAD\n");
139 do_exit(SIGSEGV); 138 do_exit(SIGSEGV);
140 } 139 }
141 set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); 140 set_flags(regs->pt.flags, VEFLAGS, VIF_MASK | current->thread.v86mask);
142 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs); 141 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
143 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); 142 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
144 if (tmp) { 143 if (tmp) {
@@ -147,15 +146,15 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
147 } 146 }
148 147
149 tss = &per_cpu(init_tss, get_cpu()); 148 tss = &per_cpu(init_tss, get_cpu());
150 current->thread.esp0 = current->thread.saved_esp0; 149 current->thread.sp0 = current->thread.saved_sp0;
151 current->thread.sysenter_cs = __KERNEL_CS; 150 current->thread.sysenter_cs = __KERNEL_CS;
152 load_esp0(tss, &current->thread); 151 load_sp0(tss, &current->thread);
153 current->thread.saved_esp0 = 0; 152 current->thread.saved_sp0 = 0;
154 put_cpu(); 153 put_cpu();
155 154
156 ret = KVM86->regs32; 155 ret = KVM86->regs32;
157 156
158 ret->xfs = current->thread.saved_fs; 157 ret->fs = current->thread.saved_fs;
159 loadsegment(gs, current->thread.saved_gs); 158 loadsegment(gs, current->thread.saved_gs);
160 159
161 return ret; 160 return ret;
@@ -197,7 +196,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
197 196
198asmlinkage int sys_vm86old(struct pt_regs regs) 197asmlinkage int sys_vm86old(struct pt_regs regs)
199{ 198{
200 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx; 199 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx;
201 struct kernel_vm86_struct info; /* declare this _on top_, 200 struct kernel_vm86_struct info; /* declare this _on top_,
202 * this avoids wasting of stack space. 201 * this avoids wasting of stack space.
203 * This remains on the stack until we 202 * This remains on the stack until we
@@ -207,7 +206,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
207 int tmp, ret = -EPERM; 206 int tmp, ret = -EPERM;
208 207
209 tsk = current; 208 tsk = current;
210 if (tsk->thread.saved_esp0) 209 if (tsk->thread.saved_sp0)
211 goto out; 210 goto out;
212 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 211 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
213 offsetof(struct kernel_vm86_struct, vm86plus) - 212 offsetof(struct kernel_vm86_struct, vm86plus) -
@@ -237,12 +236,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)
237 struct vm86plus_struct __user *v86; 236 struct vm86plus_struct __user *v86;
238 237
239 tsk = current; 238 tsk = current;
240 switch (regs.ebx) { 239 switch (regs.bx) {
241 case VM86_REQUEST_IRQ: 240 case VM86_REQUEST_IRQ:
242 case VM86_FREE_IRQ: 241 case VM86_FREE_IRQ:
243 case VM86_GET_IRQ_BITS: 242 case VM86_GET_IRQ_BITS:
244 case VM86_GET_AND_RESET_IRQ: 243 case VM86_GET_AND_RESET_IRQ:
245 ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx); 244 ret = do_vm86_irq_handling(regs.bx, (int)regs.cx);
246 goto out; 245 goto out;
247 case VM86_PLUS_INSTALL_CHECK: 246 case VM86_PLUS_INSTALL_CHECK:
248 /* NOTE: on old vm86 stuff this will return the error 247 /* NOTE: on old vm86 stuff this will return the error
@@ -256,9 +255,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
256 255
257 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ 256 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
258 ret = -EPERM; 257 ret = -EPERM;
259 if (tsk->thread.saved_esp0) 258 if (tsk->thread.saved_sp0)
260 goto out; 259 goto out;
261 v86 = (struct vm86plus_struct __user *)regs.ecx; 260 v86 = (struct vm86plus_struct __user *)regs.cx;
262 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 261 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
263 offsetof(struct kernel_vm86_struct, regs32) - 262 offsetof(struct kernel_vm86_struct, regs32) -
264 sizeof(info.regs)); 263 sizeof(info.regs));
@@ -281,23 +280,23 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
281/* 280/*
282 * make sure the vm86() system call doesn't try to do anything silly 281 * make sure the vm86() system call doesn't try to do anything silly
283 */ 282 */
284 info->regs.pt.xds = 0; 283 info->regs.pt.ds = 0;
285 info->regs.pt.xes = 0; 284 info->regs.pt.es = 0;
286 info->regs.pt.xfs = 0; 285 info->regs.pt.fs = 0;
287 286
288/* we are clearing gs later just before "jmp resume_userspace", 287/* we are clearing gs later just before "jmp resume_userspace",
289 * because it is not saved/restored. 288 * because it is not saved/restored.
290 */ 289 */
291 290
292/* 291/*
293 * The eflags register is also special: we cannot trust that the user 292 * The flags register is also special: we cannot trust that the user
294 * has set it up safely, so this makes sure interrupt etc flags are 293 * has set it up safely, so this makes sure interrupt etc flags are
295 * inherited from protected mode. 294 * inherited from protected mode.
296 */ 295 */
297 VEFLAGS = info->regs.pt.eflags; 296 VEFLAGS = info->regs.pt.flags;
298 info->regs.pt.eflags &= SAFE_MASK; 297 info->regs.pt.flags &= SAFE_MASK;
299 info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; 298 info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
300 info->regs.pt.eflags |= VM_MASK; 299 info->regs.pt.flags |= VM_MASK;
301 300
302 switch (info->cpu_type) { 301 switch (info->cpu_type) {
303 case CPU_286: 302 case CPU_286:
@@ -315,18 +314,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
315 } 314 }
316 315
317/* 316/*
318 * Save old state, set default return value (%eax) to 0 317 * Save old state, set default return value (%ax) to 0
319 */ 318 */
320 info->regs32->eax = 0; 319 info->regs32->ax = 0;
321 tsk->thread.saved_esp0 = tsk->thread.esp0; 320 tsk->thread.saved_sp0 = tsk->thread.sp0;
322 tsk->thread.saved_fs = info->regs32->xfs; 321 tsk->thread.saved_fs = info->regs32->fs;
323 savesegment(gs, tsk->thread.saved_gs); 322 savesegment(gs, tsk->thread.saved_gs);
324 323
325 tss = &per_cpu(init_tss, get_cpu()); 324 tss = &per_cpu(init_tss, get_cpu());
326 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; 325 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
327 if (cpu_has_sep) 326 if (cpu_has_sep)
328 tsk->thread.sysenter_cs = 0; 327 tsk->thread.sysenter_cs = 0;
329 load_esp0(tss, &tsk->thread); 328 load_sp0(tss, &tsk->thread);
330 put_cpu(); 329 put_cpu();
331 330
332 tsk->thread.screen_bitmap = info->screen_bitmap; 331 tsk->thread.screen_bitmap = info->screen_bitmap;
@@ -352,7 +351,7 @@ static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
352 struct pt_regs * regs32; 351 struct pt_regs * regs32;
353 352
354 regs32 = save_v86_state(regs16); 353 regs32 = save_v86_state(regs16);
355 regs32->eax = retval; 354 regs32->ax = retval;
356 __asm__ __volatile__("movl %0,%%esp\n\t" 355 __asm__ __volatile__("movl %0,%%esp\n\t"
357 "movl %1,%%ebp\n\t" 356 "movl %1,%%ebp\n\t"
358 "jmp resume_userspace" 357 "jmp resume_userspace"
@@ -373,30 +372,30 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
373 372
374static inline void clear_TF(struct kernel_vm86_regs * regs) 373static inline void clear_TF(struct kernel_vm86_regs * regs)
375{ 374{
376 regs->pt.eflags &= ~TF_MASK; 375 regs->pt.flags &= ~TF_MASK;
377} 376}
378 377
379static inline void clear_AC(struct kernel_vm86_regs * regs) 378static inline void clear_AC(struct kernel_vm86_regs * regs)
380{ 379{
381 regs->pt.eflags &= ~AC_MASK; 380 regs->pt.flags &= ~AC_MASK;
382} 381}
383 382
384/* It is correct to call set_IF(regs) from the set_vflags_* 383/* It is correct to call set_IF(regs) from the set_vflags_*
385 * functions. However someone forgot to call clear_IF(regs) 384 * functions. However someone forgot to call clear_IF(regs)
386 * in the opposite case. 385 * in the opposite case.
387 * After the command sequence CLI PUSHF STI POPF you should 386 * After the command sequence CLI PUSHF STI POPF you should
388 * end up with interrups disabled, but you ended up with 387 * end up with interrupts disabled, but you ended up with
389 * interrupts enabled. 388 * interrupts enabled.
390 * ( I was testing my own changes, but the only bug I 389 * ( I was testing my own changes, but the only bug I
391 * could find was in a function I had not changed. ) 390 * could find was in a function I had not changed. )
392 * [KD] 391 * [KD]
393 */ 392 */
394 393
395static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) 394static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs * regs)
396{ 395{
397 set_flags(VEFLAGS, eflags, current->thread.v86mask); 396 set_flags(VEFLAGS, flags, current->thread.v86mask);
398 set_flags(regs->pt.eflags, eflags, SAFE_MASK); 397 set_flags(regs->pt.flags, flags, SAFE_MASK);
399 if (eflags & IF_MASK) 398 if (flags & IF_MASK)
400 set_IF(regs); 399 set_IF(regs);
401 else 400 else
402 clear_IF(regs); 401 clear_IF(regs);
@@ -405,7 +404,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
405static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) 404static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
406{ 405{
407 set_flags(VFLAGS, flags, current->thread.v86mask); 406 set_flags(VFLAGS, flags, current->thread.v86mask);
408 set_flags(regs->pt.eflags, flags, SAFE_MASK); 407 set_flags(regs->pt.flags, flags, SAFE_MASK);
409 if (flags & IF_MASK) 408 if (flags & IF_MASK)
410 set_IF(regs); 409 set_IF(regs);
411 else 410 else
@@ -414,7 +413,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
414 413
415static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) 414static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
416{ 415{
417 unsigned long flags = regs->pt.eflags & RETURN_MASK; 416 unsigned long flags = regs->pt.flags & RETURN_MASK;
418 417
419 if (VEFLAGS & VIF_MASK) 418 if (VEFLAGS & VIF_MASK)
420 flags |= IF_MASK; 419 flags |= IF_MASK;
@@ -518,7 +517,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
518 unsigned long __user *intr_ptr; 517 unsigned long __user *intr_ptr;
519 unsigned long segoffs; 518 unsigned long segoffs;
520 519
521 if (regs->pt.xcs == BIOSSEG) 520 if (regs->pt.cs == BIOSSEG)
522 goto cannot_handle; 521 goto cannot_handle;
523 if (is_revectored(i, &KVM86->int_revectored)) 522 if (is_revectored(i, &KVM86->int_revectored))
524 goto cannot_handle; 523 goto cannot_handle;
@@ -530,9 +529,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
530 if ((segoffs >> 16) == BIOSSEG) 529 if ((segoffs >> 16) == BIOSSEG)
531 goto cannot_handle; 530 goto cannot_handle;
532 pushw(ssp, sp, get_vflags(regs), cannot_handle); 531 pushw(ssp, sp, get_vflags(regs), cannot_handle);
533 pushw(ssp, sp, regs->pt.xcs, cannot_handle); 532 pushw(ssp, sp, regs->pt.cs, cannot_handle);
534 pushw(ssp, sp, IP(regs), cannot_handle); 533 pushw(ssp, sp, IP(regs), cannot_handle);
535 regs->pt.xcs = segoffs >> 16; 534 regs->pt.cs = segoffs >> 16;
536 SP(regs) -= 6; 535 SP(regs) -= 6;
537 IP(regs) = segoffs & 0xffff; 536 IP(regs) = segoffs & 0xffff;
538 clear_TF(regs); 537 clear_TF(regs);
@@ -549,7 +548,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
549 if (VMPI.is_vm86pus) { 548 if (VMPI.is_vm86pus) {
550 if ( (trapno==3) || (trapno==1) ) 549 if ( (trapno==3) || (trapno==1) )
551 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 550 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
552 do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); 551 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
553 return 0; 552 return 0;
554 } 553 }
555 if (trapno !=1) 554 if (trapno !=1)
@@ -585,10 +584,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
585 handle_vm86_trap(regs, 0, 1); \ 584 handle_vm86_trap(regs, 0, 1); \
586 return; } while (0) 585 return; } while (0)
587 586
588 orig_flags = *(unsigned short *)&regs->pt.eflags; 587 orig_flags = *(unsigned short *)&regs->pt.flags;
589 588
590 csp = (unsigned char __user *) (regs->pt.xcs << 4); 589 csp = (unsigned char __user *) (regs->pt.cs << 4);
591 ssp = (unsigned char __user *) (regs->pt.xss << 4); 590 ssp = (unsigned char __user *) (regs->pt.ss << 4);
592 sp = SP(regs); 591 sp = SP(regs);
593 ip = IP(regs); 592 ip = IP(regs);
594 593
@@ -675,7 +674,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
675 SP(regs) += 6; 674 SP(regs) += 6;
676 } 675 }
677 IP(regs) = newip; 676 IP(regs) = newip;
678 regs->pt.xcs = newcs; 677 regs->pt.cs = newcs;
679 CHECK_IF_IN_TRAP; 678 CHECK_IF_IN_TRAP;
680 if (data32) { 679 if (data32) {
681 set_vflags_long(newflags, regs); 680 set_vflags_long(newflags, regs);
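
The AL()/AH()/IP()/SP() macros renamed above rely on x86 being little endian: the low byte of the saved ax field sits at offset 0 and the high byte at offset 1, so a byte-pointer cast yields the classic 8-bit register views. A tiny user-space demonstration of that aliasing (little-endian assumed):

#include <stdio.h>

int main(void)
{
	unsigned long ax = 0x1234;
	unsigned char *b = (unsigned char *)&ax;

	/* On little-endian x86, b[0] plays the role of AL and b[1] of AH */
	printf("AL=%02x AH=%02x\n", b[0], b[1]);	/* prints AL=34 AH=12 */
	return 0;
}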
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index f02bad68aba..12affe1f9bc 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -62,7 +62,10 @@ static struct {
62 void (*cpuid)(void /* non-c */); 62 void (*cpuid)(void /* non-c */);
63 void (*_set_ldt)(u32 selector); 63 void (*_set_ldt)(u32 selector);
64 void (*set_tr)(u32 selector); 64 void (*set_tr)(u32 selector);
65 void (*set_kernel_stack)(u32 selector, u32 esp0); 65 void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
66 void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
67 void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
68 void (*set_kernel_stack)(u32 selector, u32 sp0);
66 void (*allocate_page)(u32, u32, u32, u32, u32); 69 void (*allocate_page)(u32, u32, u32, u32, u32);
67 void (*release_page)(u32, u32); 70 void (*release_page)(u32, u32);
68 void (*set_pte)(pte_t, pte_t *, unsigned); 71 void (*set_pte)(pte_t, pte_t *, unsigned);
@@ -88,13 +91,13 @@ struct vmi_timer_ops vmi_timer_ops;
88#define IRQ_PATCH_DISABLE 5 91#define IRQ_PATCH_DISABLE 5
89 92
90static inline void patch_offset(void *insnbuf, 93static inline void patch_offset(void *insnbuf,
91 unsigned long eip, unsigned long dest) 94 unsigned long ip, unsigned long dest)
92{ 95{
93 *(unsigned long *)(insnbuf+1) = dest-eip-5; 96 *(unsigned long *)(insnbuf+1) = dest-ip-5;
94} 97}
95 98
96static unsigned patch_internal(int call, unsigned len, void *insnbuf, 99static unsigned patch_internal(int call, unsigned len, void *insnbuf,
97 unsigned long eip) 100 unsigned long ip)
98{ 101{
99 u64 reloc; 102 u64 reloc;
100 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; 103 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
@@ -103,13 +106,13 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf,
103 case VMI_RELOCATION_CALL_REL: 106 case VMI_RELOCATION_CALL_REL:
104 BUG_ON(len < 5); 107 BUG_ON(len < 5);
105 *(char *)insnbuf = MNEM_CALL; 108 *(char *)insnbuf = MNEM_CALL;
106 patch_offset(insnbuf, eip, (unsigned long)rel->eip); 109 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
107 return 5; 110 return 5;
108 111
109 case VMI_RELOCATION_JUMP_REL: 112 case VMI_RELOCATION_JUMP_REL:
110 BUG_ON(len < 5); 113 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_JMP; 114 *(char *)insnbuf = MNEM_JMP;
112 patch_offset(insnbuf, eip, (unsigned long)rel->eip); 115 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
113 return 5; 116 return 5;
114 117
115 case VMI_RELOCATION_NOP: 118 case VMI_RELOCATION_NOP:
@@ -131,25 +134,25 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf,
131 * sequence. The callee does nop padding for us. 134 * sequence. The callee does nop padding for us.
132 */ 135 */
133static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, 136static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
134 unsigned long eip, unsigned len) 137 unsigned long ip, unsigned len)
135{ 138{
136 switch (type) { 139 switch (type) {
137 case PARAVIRT_PATCH(pv_irq_ops.irq_disable): 140 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
138 return patch_internal(VMI_CALL_DisableInterrupts, len, 141 return patch_internal(VMI_CALL_DisableInterrupts, len,
139 insns, eip); 142 insns, ip);
140 case PARAVIRT_PATCH(pv_irq_ops.irq_enable): 143 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len, 144 return patch_internal(VMI_CALL_EnableInterrupts, len,
142 insns, eip); 145 insns, ip);
143 case PARAVIRT_PATCH(pv_irq_ops.restore_fl): 146 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
144 return patch_internal(VMI_CALL_SetInterruptMask, len, 147 return patch_internal(VMI_CALL_SetInterruptMask, len,
145 insns, eip); 148 insns, ip);
146 case PARAVIRT_PATCH(pv_irq_ops.save_fl): 149 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
147 return patch_internal(VMI_CALL_GetInterruptMask, len, 150 return patch_internal(VMI_CALL_GetInterruptMask, len,
148 insns, eip); 151 insns, ip);
149 case PARAVIRT_PATCH(pv_cpu_ops.iret): 152 case PARAVIRT_PATCH(pv_cpu_ops.iret):
150 return patch_internal(VMI_CALL_IRET, len, insns, eip); 153 return patch_internal(VMI_CALL_IRET, len, insns, ip);
151 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): 154 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret):
152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); 155 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
153 default: 156 default:
154 break; 157 break;
155 } 158 }
@@ -157,36 +160,36 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
157} 160}
158 161
159/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ 162/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
160static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, 163static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
161 unsigned int *ecx, unsigned int *edx) 164 unsigned int *cx, unsigned int *dx)
162{ 165{
163 int override = 0; 166 int override = 0;
164 if (*eax == 1) 167 if (*ax == 1)
165 override = 1; 168 override = 1;
166 asm volatile ("call *%6" 169 asm volatile ("call *%6"
167 : "=a" (*eax), 170 : "=a" (*ax),
168 "=b" (*ebx), 171 "=b" (*bx),
169 "=c" (*ecx), 172 "=c" (*cx),
170 "=d" (*edx) 173 "=d" (*dx)
171 : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); 174 : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
172 if (override) { 175 if (override) {
173 if (disable_pse) 176 if (disable_pse)
174 *edx &= ~X86_FEATURE_PSE; 177 *dx &= ~X86_FEATURE_PSE;
175 if (disable_pge) 178 if (disable_pge)
176 *edx &= ~X86_FEATURE_PGE; 179 *dx &= ~X86_FEATURE_PGE;
177 if (disable_sep) 180 if (disable_sep)
178 *edx &= ~X86_FEATURE_SEP; 181 *dx &= ~X86_FEATURE_SEP;
179 if (disable_tsc) 182 if (disable_tsc)
180 *edx &= ~X86_FEATURE_TSC; 183 *dx &= ~X86_FEATURE_TSC;
181 if (disable_mtrr) 184 if (disable_mtrr)
182 *edx &= ~X86_FEATURE_MTRR; 185 *dx &= ~X86_FEATURE_MTRR;
183 } 186 }
184} 187}
185 188
186static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) 189static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
187{ 190{
188 if (gdt[nr].a != new->a || gdt[nr].b != new->b) 191 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
189 write_gdt_entry(gdt, nr, new->a, new->b); 192 write_gdt_entry(gdt, nr, new, 0);
190} 193}
191 194
192static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) 195static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
@@ -200,12 +203,12 @@ static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
200static void vmi_set_ldt(const void *addr, unsigned entries) 203static void vmi_set_ldt(const void *addr, unsigned entries)
201{ 204{
202 unsigned cpu = smp_processor_id(); 205 unsigned cpu = smp_processor_id();
203 u32 low, high; 206 struct desc_struct desc;
204 207
205 pack_descriptor(&low, &high, (unsigned long)addr, 208 pack_descriptor(&desc, (unsigned long)addr,
206 entries * sizeof(struct desc_struct) - 1, 209 entries * sizeof(struct desc_struct) - 1,
207 DESCTYPE_LDT, 0); 210 DESC_LDT, 0);
208 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high); 211 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
209 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); 212 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
210} 213}
211 214
@@ -214,17 +217,37 @@ static void vmi_set_tr(void)
214 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); 217 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
215} 218}
216 219
217static void vmi_load_esp0(struct tss_struct *tss, 220static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
221{
222 u32 *idt_entry = (u32 *)g;
223 vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
224}
225
226static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
227 const void *desc, int type)
228{
229 u32 *gdt_entry = (u32 *)desc;
230 vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
231}
232
233static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
234 const void *desc)
235{
236 u32 *ldt_entry = (u32 *)desc;
237 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
238}
239
240static void vmi_load_sp0(struct tss_struct *tss,
218 struct thread_struct *thread) 241 struct thread_struct *thread)
219{ 242{
220 tss->x86_tss.esp0 = thread->esp0; 243 tss->x86_tss.sp0 = thread->sp0;
221 244
222 /* This can only happen when SEP is enabled, no need to test "SEP"arately */ 245 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
223 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { 246 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
224 tss->x86_tss.ss1 = thread->sysenter_cs; 247 tss->x86_tss.ss1 = thread->sysenter_cs;
225 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); 248 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
226 } 249 }
227 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0); 250 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
228} 251}
229 252
230static void vmi_flush_tlb_user(void) 253static void vmi_flush_tlb_user(void)
@@ -375,7 +398,7 @@ static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
375 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 398 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
376} 399}
377 400
378static void vmi_allocate_pd(u32 pfn) 401static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
379{ 402{
380 /* 403 /*
381 * This call comes in very early, before mem_map is setup. 404 * This call comes in very early, before mem_map is setup.
@@ -452,7 +475,7 @@ static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep
452static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) 475static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
453{ 476{
454#ifdef CONFIG_X86_PAE 477#ifdef CONFIG_X86_PAE
455 const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 }; 478 const pte_t pte = { .pte = pmdval.pmd };
456 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); 479 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
457#else 480#else
458 const pte_t pte = { pmdval.pud.pgd.pgd }; 481 const pte_t pte = { pmdval.pud.pgd.pgd };
@@ -485,21 +508,21 @@ static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t
485static void vmi_set_pud(pud_t *pudp, pud_t pudval) 508static void vmi_set_pud(pud_t *pudp, pud_t pudval)
486{ 509{
487 /* Um, eww */ 510 /* Um, eww */
488 const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 }; 511 const pte_t pte = { .pte = pudval.pgd.pgd };
489 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); 512 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
490 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); 513 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
491} 514}
492 515
493static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 516static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
494{ 517{
495 const pte_t pte = { 0 }; 518 const pte_t pte = { .pte = 0 };
496 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); 519 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
497 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 520 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
498} 521}
499 522
500static void vmi_pmd_clear(pmd_t *pmd) 523static void vmi_pmd_clear(pmd_t *pmd)
501{ 524{
502 const pte_t pte = { 0 }; 525 const pte_t pte = { .pte = 0 };
503 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); 526 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
504 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); 527 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
505} 528}
@@ -790,10 +813,13 @@ static inline int __init activate_vmi(void)
790 para_fill(pv_cpu_ops.store_idt, GetIDT); 813 para_fill(pv_cpu_ops.store_idt, GetIDT);
791 para_fill(pv_cpu_ops.store_tr, GetTR); 814 para_fill(pv_cpu_ops.store_tr, GetTR);
792 pv_cpu_ops.load_tls = vmi_load_tls; 815 pv_cpu_ops.load_tls = vmi_load_tls;
793 para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry); 816 para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
794 para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry); 817 write_ldt_entry, WriteLDTEntry);
795 para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry); 818 para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
796 para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); 819 write_gdt_entry, WriteGDTEntry);
820 para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
821 write_idt_entry, WriteIDTEntry);
822 para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
797 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); 823 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
798 para_fill(pv_cpu_ops.io_delay, IODelay); 824 para_fill(pv_cpu_ops.io_delay, IODelay);
799 825
@@ -870,7 +896,7 @@ static inline int __init activate_vmi(void)
870 * the backend. They are performance critical anyway, so requiring 896 * the backend. They are performance critical anyway, so requiring
871 * a patch is not a big problem. 897 * a patch is not a big problem.
872 */ 898 */
873 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; 899 pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0;
874 pv_cpu_ops.iret = (void *)0xbadbab0; 900 pv_cpu_ops.iret = (void *)0xbadbab0;
875 901
876#ifdef CONFIG_SMP 902#ifdef CONFIG_SMP
@@ -963,19 +989,19 @@ static int __init parse_vmi(char *arg)
963 return -EINVAL; 989 return -EINVAL;
964 990
965 if (!strcmp(arg, "disable_pge")) { 991 if (!strcmp(arg, "disable_pge")) {
966 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); 992 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
967 disable_pge = 1; 993 disable_pge = 1;
968 } else if (!strcmp(arg, "disable_pse")) { 994 } else if (!strcmp(arg, "disable_pse")) {
969 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); 995 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
970 disable_pse = 1; 996 disable_pse = 1;
971 } else if (!strcmp(arg, "disable_sep")) { 997 } else if (!strcmp(arg, "disable_sep")) {
972 clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); 998 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
973 disable_sep = 1; 999 disable_sep = 1;
974 } else if (!strcmp(arg, "disable_tsc")) { 1000 } else if (!strcmp(arg, "disable_tsc")) {
975 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); 1001 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
976 disable_tsc = 1; 1002 disable_tsc = 1;
977 } else if (!strcmp(arg, "disable_mtrr")) { 1003 } else if (!strcmp(arg, "disable_mtrr")) {
978 clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability); 1004 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
979 disable_mtrr = 1; 1005 disable_mtrr = 1;
980 } else if (!strcmp(arg, "disable_timer")) { 1006 } else if (!strcmp(arg, "disable_timer")) {
981 disable_vmi_timer = 1; 1007 disable_vmi_timer = 1;
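The pte hunks above replace positional initializers such as { pmdval.pmd, pmdval.pmd >> 32 } with the designated form { .pte = pmdval.pmd }. As a quick illustration of why naming the member is the safer idiom (a standalone userspace sketch with a stand-in type, not the kernel's pte_t):

	#include <stdint.h>
	#include <stdio.h>

	/* stand-in for a unified 64-bit pte, as used after the change */
	typedef struct { uint64_t pte; } my_pte_t;

	int main(void)
	{
		uint64_t pmdval = 0x0123456789abcdefULL;

		/* names the member explicitly instead of relying on field
		 * order, so the initializer does not silently break when the
		 * struct layout changes */
		my_pte_t pte = { .pte = pmdval };

		printf("pte = %#llx\n", (unsigned long long)pte.pte);
		return 0;
	}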
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index b1b5ab08b26..a2b030780aa 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -35,7 +35,6 @@
35#include <asm/i8253.h> 35#include <asm/i8253.h>
36 36
37#include <irq_vectors.h> 37#include <irq_vectors.h>
38#include "io_ports.h"
39 38
40#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) 39#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
41#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) 40#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
@@ -238,7 +237,7 @@ static void __devinit vmi_time_init_clockevent(void)
238void __init vmi_time_init(void) 237void __init vmi_time_init(void)
239{ 238{
 240	/* Disable PIT: BIOSes start PIT CH0 with 18.2hz periodic. */	 239	/* Disable PIT: BIOSes start PIT CH0 with 18.2hz periodic. */
241 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ 240 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
242 241
243 vmi_time_init_clockevent(); 242 vmi_time_init_clockevent();
244 setup_irq(0, &vmi_clock_action); 243 setup_irq(0, &vmi_clock_action);
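The PIT disable above writes control word 0x3a, which the comment decodes as "binary, mode 5, LSB/MSB, ch 0". A worked decode of that byte (a small standalone sketch assuming the standard 8254 control-word layout, not code from this patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned char cw = 0x3a;                      /* 0b00111010 */

		printf("channel      = %u\n", (cw >> 6) & 3); /* 0 -> counter 0           */
		printf("access mode  = %u\n", (cw >> 4) & 3); /* 3 -> LSB then MSB        */
		printf("operate mode = %u\n", (cw >> 1) & 7); /* 5 -> hw-triggered strobe */
		printf("BCD          = %u\n",  cw       & 1); /* 0 -> binary counting     */
		return 0;
	}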
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 7d72cce0052..f1148ac8abe 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -8,12 +8,6 @@
8 * put it inside the section definition. 8 * put it inside the section definition.
9 */ 9 */
10 10
11/* Don't define absolute symbols until and unless you know that symbol
12 * value is should remain constant even if kernel image is relocated
13 * at run time. Absolute symbols are not relocated. If symbol value should
14 * change if kernel is relocated, make the symbol section relative and
15 * put it inside the section definition.
16 */
17#define LOAD_OFFSET __PAGE_OFFSET 11#define LOAD_OFFSET __PAGE_OFFSET
18 12
19#include <asm-generic/vmlinux.lds.h> 13#include <asm-generic/vmlinux.lds.h>
@@ -44,6 +38,8 @@ SECTIONS
44 38
45 /* read-only */ 39 /* read-only */
46 .text : AT(ADDR(.text) - LOAD_OFFSET) { 40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 . = ALIGN(4096); /* not really needed, already page aligned */
42 *(.text.page_aligned)
47 TEXT_TEXT 43 TEXT_TEXT
48 SCHED_TEXT 44 SCHED_TEXT
49 LOCK_TEXT 45 LOCK_TEXT
@@ -131,10 +127,12 @@ SECTIONS
131 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 127 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
132 __init_begin = .; 128 __init_begin = .;
133 _sinittext = .; 129 _sinittext = .;
134 *(.init.text) 130 INIT_TEXT
135 _einittext = .; 131 _einittext = .;
136 } 132 }
137 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } 133 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
134 INIT_DATA
135 }
138 . = ALIGN(16); 136 . = ALIGN(16);
139 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { 137 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
140 __setup_start = .; 138 __setup_start = .;
@@ -169,8 +167,12 @@ SECTIONS
169 } 167 }
 170 /* .exit.text is discarded at runtime, not link time, to deal with references 168 /* .exit.text is discarded at runtime, not link time, to deal with references
171 from .altinstructions and .eh_frame */ 169 from .altinstructions and .eh_frame */
172 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } 170 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
173 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } 171 EXIT_TEXT
172 }
173 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
174 EXIT_DATA
175 }
174#if defined(CONFIG_BLK_DEV_INITRD) 176#if defined(CONFIG_BLK_DEV_INITRD)
175 . = ALIGN(4096); 177 . = ALIGN(4096);
176 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { 178 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
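The 32-bit linker script now pulls *(.text.page_aligned) in at a 4096-byte boundary at the start of .text. Objects land in such an input section via the usual section/aligned attribute pair; a userspace analog (a sketch with a hypothetical blob, not the kernel's macro):

	#include <stdint.h>
	#include <stdio.h>

	/* hypothetical page-aligned object, placed in a named input section */
	__attribute__((section(".data.page_aligned"), aligned(4096)))
	static uint8_t bootstrap_page[4096];

	int main(void)
	{
		printf("placed at %p, page aligned: %d\n",
		       (void *)bootstrap_page,
		       ((uintptr_t)bootstrap_page & 0xfff) == 0);
		return 0;
	}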
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index ba8ea97abd2..0992b9946c6 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -37,16 +37,15 @@ SECTIONS
37 KPROBES_TEXT 37 KPROBES_TEXT
38 *(.fixup) 38 *(.fixup)
39 *(.gnu.warning) 39 *(.gnu.warning)
40 } :text = 0x9090 40 _etext = .; /* End of text section */
41 /* out-of-line lock text */ 41 } :text = 0x9090
42 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
43
44 _etext = .; /* End of text section */
45 42
46 . = ALIGN(16); /* Exception table */ 43 . = ALIGN(16); /* Exception table */
47 __start___ex_table = .; 44 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
48 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } 45 __start___ex_table = .;
49 __stop___ex_table = .; 46 *(__ex_table)
47 __stop___ex_table = .;
48 }
50 49
51 NOTES :text :note 50 NOTES :text :note
52 51
@@ -155,12 +154,15 @@ SECTIONS
155 __init_begin = .; 154 __init_begin = .;
156 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 155 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
157 _sinittext = .; 156 _sinittext = .;
158 *(.init.text) 157 INIT_TEXT
159 _einittext = .; 158 _einittext = .;
160 } 159 }
161 __initdata_begin = .; 160 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
162 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } 161 __initdata_begin = .;
163 __initdata_end = .; 162 INIT_DATA
163 __initdata_end = .;
164 }
165
164 . = ALIGN(16); 166 . = ALIGN(16);
165 __setup_start = .; 167 __setup_start = .;
166 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } 168 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
@@ -176,6 +178,14 @@ SECTIONS
176 } 178 }
177 __con_initcall_end = .; 179 __con_initcall_end = .;
178 SECURITY_INIT 180 SECURITY_INIT
181
182 . = ALIGN(8);
183 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
184 __parainstructions = .;
185 *(.parainstructions)
186 __parainstructions_end = .;
187 }
188
179 . = ALIGN(8); 189 . = ALIGN(8);
180 __alt_instructions = .; 190 __alt_instructions = .;
181 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 191 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
@@ -187,8 +197,12 @@ SECTIONS
187 } 197 }
 188 /* .exit.text is discarded at runtime, not link time, to deal with references 198 /* .exit.text is discarded at runtime, not link time, to deal with references
189 from .altinstructions and .eh_frame */ 199 from .altinstructions and .eh_frame */
190 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } 200 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
191 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } 201 EXIT_TEXT
202 }
203 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
204 EXIT_DATA
205 }
192 206
193/* vdso blob that is mapped into user space */ 207/* vdso blob that is mapped into user space */
194 vdso_start = . ; 208 vdso_start = . ;
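The 64-bit script now brackets the paravirt patch table with __parainstructions / __parainstructions_end, the same "array collected by the linker between two symbols" pattern used for .altinstructions. A userspace analog of that pattern (a sketch relying on GNU ld's automatic __start_<sec>/__stop_<sec> symbols, which only exist for section names that are valid C identifiers — a toolchain assumption, not something taken from this linker script):

	#include <stdio.h>

	struct entry { const char *name; };

	#define REGISTER(n) \
		static const struct entry entry_##n \
		__attribute__((used, section("mytable"))) = { #n }

	REGISTER(foo);
	REGISTER(bar);

	extern const struct entry __start_mytable[], __stop_mytable[];

	int main(void)
	{
		const struct entry *e;

		/* walk everything the linker collected into section "mytable" */
		for (e = __start_mytable; e < __stop_mytable; e++)
			printf("entry: %s\n", e->name);
		return 0;
	}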
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 414caf0c5f9..d971210a6d3 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -25,21 +25,24 @@ static int __init vsmp_init(void)
25 return 0; 25 return 0;
26 26
27 /* Check if we are running on a ScaleMP vSMP box */ 27 /* Check if we are running on a ScaleMP vSMP box */
28 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || 28 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) !=
29 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) 29 PCI_VENDOR_ID_SCALEMP) ||
30 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) !=
31 PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
30 return 0; 32 return 0;
31 33
32 /* set vSMP magic bits to indicate vSMP capable kernel */ 34 /* set vSMP magic bits to indicate vSMP capable kernel */
33 address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8); 35 address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
34 cap = readl(address); 36 cap = readl(address);
35 ctl = readl(address + 4); 37 ctl = readl(address + 4);
36 printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); 38 printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
39 cap, ctl);
37 if (cap & ctl & (1 << 4)) { 40 if (cap & ctl & (1 << 4)) {
38 /* Turn on vSMP IRQ fastpath handling (see system.h) */ 41 /* Turn on vSMP IRQ fastpath handling (see system.h) */
39 ctl &= ~(1 << 4); 42 ctl &= ~(1 << 4);
40 writel(ctl, address + 4); 43 writel(ctl, address + 4);
41 ctl = readl(address + 4); 44 ctl = readl(address + 4);
42 printk("vSMP CTL: control set to:0x%08x\n", ctl); 45 printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
43 } 46 }
44 47
45 iounmap(address); 48 iounmap(address);
diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S
deleted file mode 100644
index a5ab3dc4fd2..00000000000
--- a/arch/x86/kernel/vsyscall_32.S
+++ /dev/null
@@ -1,15 +0,0 @@
1#include <linux/init.h>
2
3__INITDATA
4
5 .globl vsyscall_int80_start, vsyscall_int80_end
6vsyscall_int80_start:
7 .incbin "arch/x86/kernel/vsyscall-int80_32.so"
8vsyscall_int80_end:
9
10 .globl vsyscall_sysenter_start, vsyscall_sysenter_end
11vsyscall_sysenter_start:
12 .incbin "arch/x86/kernel/vsyscall-sysenter_32.so"
13vsyscall_sysenter_end:
14
15__FINIT
diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S
deleted file mode 100644
index 4a8b0ed9b8f..00000000000
--- a/arch/x86/kernel/vsyscall_32.lds.S
+++ /dev/null
@@ -1,67 +0,0 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address, and with only one read-only
4 * segment (that fits in one page). This script controls its layout.
5 */
6#include <asm/asm-offsets.h>
7
8SECTIONS
9{
10 . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
11
12 .hash : { *(.hash) } :text
13 .gnu.hash : { *(.gnu.hash) }
14 .dynsym : { *(.dynsym) }
15 .dynstr : { *(.dynstr) }
16 .gnu.version : { *(.gnu.version) }
17 .gnu.version_d : { *(.gnu.version_d) }
18 .gnu.version_r : { *(.gnu.version_r) }
19
20 /* This linker script is used both with -r and with -shared.
21 For the layouts to match, we need to skip more than enough
22 space for the dynamic symbol table et al. If this amount
23 is insufficient, ld -shared will barf. Just increase it here. */
24 . = VDSO_PRELINK_asm + 0x400;
25
26 .text : { *(.text) } :text =0x90909090
27 .note : { *(.note.*) } :text :note
28 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
29 .eh_frame : { KEEP (*(.eh_frame)) } :text
30 .dynamic : { *(.dynamic) } :text :dynamic
31 .useless : {
32 *(.got.plt) *(.got)
33 *(.data .data.* .gnu.linkonce.d.*)
34 *(.dynbss)
35 *(.bss .bss.* .gnu.linkonce.b.*)
36 } :text
37}
38
39/*
40 * We must supply the ELF program headers explicitly to get just one
41 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
42 */
43PHDRS
44{
45 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
46 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
47 note PT_NOTE FLAGS(4); /* PF_R */
48 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
49}
50
51/*
52 * This controls what symbols we export from the DSO.
53 */
54VERSION
55{
56 LINUX_2.5 {
57 global:
58 __kernel_vsyscall;
59 __kernel_sigreturn;
60 __kernel_rt_sigreturn;
61
62 local: *;
63 };
64}
65
66/* The ELF entry point can be used to set the AT_SYSINFO value. */
67ENTRY(__kernel_vsyscall);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index ad4005c6d4a..3f824277458 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -43,7 +43,7 @@
43#include <asm/vgtod.h> 43#include <asm/vgtod.h>
44 44
45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
46#define __syscall_clobber "r11","rcx","memory" 46#define __syscall_clobber "r11","cx","memory"
47#define __pa_vsymbol(x) \ 47#define __pa_vsymbol(x) \
48 ({unsigned long v; \ 48 ({unsigned long v; \
49 extern char __vsyscall_0; \ 49 extern char __vsyscall_0; \
@@ -190,7 +190,7 @@ time_t __vsyscall(1) vtime(time_t *t)
190long __vsyscall(2) 190long __vsyscall(2)
191vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) 191vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
192{ 192{
193 unsigned int dummy, p; 193 unsigned int p;
194 unsigned long j = 0; 194 unsigned long j = 0;
195 195
196 /* Fast cache - only recompute value once per jiffies and avoid 196 /* Fast cache - only recompute value once per jiffies and avoid
@@ -205,7 +205,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
205 p = tcache->blob[1]; 205 p = tcache->blob[1];
206 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { 206 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
207 /* Load per CPU data from RDTSCP */ 207 /* Load per CPU data from RDTSCP */
208 rdtscp(dummy, dummy, p); 208 native_read_tscp(&p);
209 } else { 209 } else {
210 /* Load per CPU data from GDT */ 210 /* Load per CPU data from GDT */
211 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); 211 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
@@ -297,7 +297,7 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
297 /* Store cpu number in limit so that it can be loaded quickly 297 /* Store cpu number in limit so that it can be loaded quickly
298 in user space in vgetcpu. 298 in user space in vgetcpu.
299 12 bits for the CPU and 8 bits for the node. */ 299 12 bits for the CPU and 8 bits for the node. */
300 d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); 300 d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU);
301 *d = 0x0f40000000000ULL; 301 *d = 0x0f40000000000ULL;
302 *d |= cpu; 302 *d |= cpu;
303 *d |= (node & 0xf) << 12; 303 *d |= (node & 0xf) << 12;
@@ -319,7 +319,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
319 return NOTIFY_DONE; 319 return NOTIFY_DONE;
320} 320}
321 321
322static void __init map_vsyscall(void) 322void __init map_vsyscall(void)
323{ 323{
324 extern char __vsyscall_0; 324 extern char __vsyscall_0;
325 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); 325 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
@@ -335,7 +335,6 @@ static int __init vsyscall_init(void)
335 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); 335 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
336 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); 336 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
337 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); 337 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
338 map_vsyscall();
339#ifdef CONFIG_SYSCTL 338#ifdef CONFIG_SYSCTL
340 register_sysctl_table(kernel_root_table2); 339 register_sysctl_table(kernel_root_table2);
341#endif 340#endif
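vsyscall_set_cpu() above packs the CPU number into the low 12 bits of a spare GDT descriptor's limit and the node into the bits above it, so vgetcpu() can fetch both with a single unprivileged lsl. The pack/unpack arithmetic in isolation (a standalone sketch that just mirrors the shifts shown, not kernel code):

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned long cpu = 5, node = 3, limit = 0;

		limit |= cpu;                 /* low 12 bits: CPU number         */
		limit |= (node & 0xf) << 12;  /* bits above: node, as set above  */

		assert((limit & 0xfff) == cpu);
		assert((limit >> 12) == node);
		printf("cpu=%lu node=%lu\n", limit & 0xfff, limit >> 12);
		return 0;
	}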
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 77c25b30763..a66e9c1a053 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -8,6 +8,7 @@
8#include <asm/processor.h> 8#include <asm/processor.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <asm/pgtable.h> 10#include <asm/pgtable.h>
11#include <asm/desc.h>
11 12
12EXPORT_SYMBOL(kernel_thread); 13EXPORT_SYMBOL(kernel_thread);
13 14
@@ -34,13 +35,6 @@ EXPORT_SYMBOL(__copy_from_user_inatomic);
34EXPORT_SYMBOL(copy_page); 35EXPORT_SYMBOL(copy_page);
35EXPORT_SYMBOL(clear_page); 36EXPORT_SYMBOL(clear_page);
36 37
37#ifdef CONFIG_SMP
38extern void __write_lock_failed(rwlock_t *rw);
39extern void __read_lock_failed(rwlock_t *rw);
40EXPORT_SYMBOL(__write_lock_failed);
41EXPORT_SYMBOL(__read_lock_failed);
42#endif
43
44/* Export string functions. We normally rely on gcc builtin for most of these, 38/* Export string functions. We normally rely on gcc builtin for most of these,
45 but gcc sometimes decides not to inline them. */ 39 but gcc sometimes decides not to inline them. */
46#undef memcpy 40#undef memcpy
@@ -60,3 +54,8 @@ EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 54EXPORT_SYMBOL(load_gs_index);
61 55
62EXPORT_SYMBOL(_proxy_pda); 56EXPORT_SYMBOL(_proxy_pda);
57
58#ifdef CONFIG_PARAVIRT
59/* Virtualized guests may want to use it */
60EXPORT_SYMBOL_GPL(cpu_gdt_descr);
61#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
new file mode 100644
index 00000000000..41962e793c0
--- /dev/null
+++ b/arch/x86/kvm/Kconfig
@@ -0,0 +1,58 @@
1#
2# KVM configuration
3#
4config HAVE_KVM
5 bool
6
7menuconfig VIRTUALIZATION
8 bool "Virtualization"
9 depends on HAVE_KVM || X86
10 default y
11 ---help---
12 Say Y here to get to see options for using your Linux host to run other
13 operating systems inside virtual machines (guests).
14 This option alone does not add any kernel code.
15
16 If you say N, all options in this submenu will be skipped and disabled.
17
18if VIRTUALIZATION
19
20config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM && EXPERIMENTAL
23 select PREEMPT_NOTIFIERS
24 select ANON_INODES
25 ---help---
26 Support hosting fully virtualized guest machines using hardware
27 virtualization extensions. You will need a fairly recent
28 processor equipped with virtualization extensions. You will also
29 need to select one or more of the processor modules below.
30
31 This module provides access to the hardware capabilities through
32 a character device node named /dev/kvm.
33
34 To compile this as a module, choose M here: the module
35 will be called kvm.
36
37 If unsure, say N.
38
39config KVM_INTEL
40 tristate "KVM for Intel processors support"
41 depends on KVM
42 ---help---
43 Provides support for KVM on Intel processors equipped with the VT
44 extensions.
45
46config KVM_AMD
47 tristate "KVM for AMD processors support"
48 depends on KVM
49 ---help---
50 Provides support for KVM on AMD processors equipped with the AMD-V
51 (SVM) extensions.
52
53# OK, it's a little counter-intuitive to do this, but it puts it neatly under
54# the virtualization menu.
55source drivers/lguest/Kconfig
56source drivers/virtio/Kconfig
57
58endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
new file mode 100644
index 00000000000..ffdd0b31078
--- /dev/null
+++ b/arch/x86/kvm/Makefile
@@ -0,0 +1,14 @@
1#
2# Makefile for Kernel-based Virtual Machine module
3#
4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
6
7EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
8
9kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
10obj-$(CONFIG_KVM) += kvm.o
11kvm-intel-objs = vmx.o
12obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
13kvm-amd-objs = svm.o
14obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
new file mode 100644
index 00000000000..ab29cf2def4
--- /dev/null
+++ b/arch/x86/kvm/i8259.c
@@ -0,0 +1,450 @@
1/*
2 * 8259 interrupt controller emulation
3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 * Authors:
25 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
26 * Port from Qemu.
27 */
28#include <linux/mm.h>
29#include "irq.h"
30
31#include <linux/kvm_host.h>
32
33/*
34 * set irq level. If an edge is detected, then the IRR is set to 1
35 */
36static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
37{
38 int mask;
39 mask = 1 << irq;
40 if (s->elcr & mask) /* level triggered */
41 if (level) {
42 s->irr |= mask;
43 s->last_irr |= mask;
44 } else {
45 s->irr &= ~mask;
46 s->last_irr &= ~mask;
47 }
48 else /* edge triggered */
49 if (level) {
50 if ((s->last_irr & mask) == 0)
51 s->irr |= mask;
52 s->last_irr |= mask;
53 } else
54 s->last_irr &= ~mask;
55}
56
57/*
58 * return the highest priority found in mask (highest = smallest
59 * number). Return 8 if no irq
60 */
61static inline int get_priority(struct kvm_kpic_state *s, int mask)
62{
63 int priority;
64 if (mask == 0)
65 return 8;
66 priority = 0;
67 while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
68 priority++;
69 return priority;
70}
71
72/*
 73 * return the interrupt the PIC wants to deliver. return -1 if none
74 */
75static int pic_get_irq(struct kvm_kpic_state *s)
76{
77 int mask, cur_priority, priority;
78
79 mask = s->irr & ~s->imr;
80 priority = get_priority(s, mask);
81 if (priority == 8)
82 return -1;
83 /*
84 * compute current priority. If special fully nested mode on the
85 * master, the IRQ coming from the slave is not taken into account
86 * for the priority computation.
87 */
88 mask = s->isr;
89 if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
90 mask &= ~(1 << 2);
91 cur_priority = get_priority(s, mask);
92 if (priority < cur_priority)
93 /*
94 * higher priority found: an irq should be generated
95 */
96 return (priority + s->priority_add) & 7;
97 else
98 return -1;
99}
100
101/*
102 * raise irq to CPU if necessary. must be called every time the active
103 * irq may change
104 */
105static void pic_update_irq(struct kvm_pic *s)
106{
107 int irq2, irq;
108
109 irq2 = pic_get_irq(&s->pics[1]);
110 if (irq2 >= 0) {
111 /*
112 * if irq request by slave pic, signal master PIC
113 */
114 pic_set_irq1(&s->pics[0], 2, 1);
115 pic_set_irq1(&s->pics[0], 2, 0);
116 }
117 irq = pic_get_irq(&s->pics[0]);
118 if (irq >= 0)
119 s->irq_request(s->irq_request_opaque, 1);
120 else
121 s->irq_request(s->irq_request_opaque, 0);
122}
123
124void kvm_pic_update_irq(struct kvm_pic *s)
125{
126 pic_update_irq(s);
127}
128
129void kvm_pic_set_irq(void *opaque, int irq, int level)
130{
131 struct kvm_pic *s = opaque;
132
133 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
134 pic_update_irq(s);
135}
136
137/*
138 * acknowledge interrupt 'irq'
139 */
140static inline void pic_intack(struct kvm_kpic_state *s, int irq)
141{
142 if (s->auto_eoi) {
143 if (s->rotate_on_auto_eoi)
144 s->priority_add = (irq + 1) & 7;
145 } else
146 s->isr |= (1 << irq);
147 /*
148 * We don't clear a level sensitive interrupt here
149 */
150 if (!(s->elcr & (1 << irq)))
151 s->irr &= ~(1 << irq);
152}
153
154int kvm_pic_read_irq(struct kvm_pic *s)
155{
156 int irq, irq2, intno;
157
158 irq = pic_get_irq(&s->pics[0]);
159 if (irq >= 0) {
160 pic_intack(&s->pics[0], irq);
161 if (irq == 2) {
162 irq2 = pic_get_irq(&s->pics[1]);
163 if (irq2 >= 0)
164 pic_intack(&s->pics[1], irq2);
165 else
166 /*
167 * spurious IRQ on slave controller
168 */
169 irq2 = 7;
170 intno = s->pics[1].irq_base + irq2;
171 irq = irq2 + 8;
172 } else
173 intno = s->pics[0].irq_base + irq;
174 } else {
175 /*
176 * spurious IRQ on host controller
177 */
178 irq = 7;
179 intno = s->pics[0].irq_base + irq;
180 }
181 pic_update_irq(s);
182
183 return intno;
184}
185
186void kvm_pic_reset(struct kvm_kpic_state *s)
187{
188 s->last_irr = 0;
189 s->irr = 0;
190 s->imr = 0;
191 s->isr = 0;
192 s->priority_add = 0;
193 s->irq_base = 0;
194 s->read_reg_select = 0;
195 s->poll = 0;
196 s->special_mask = 0;
197 s->init_state = 0;
198 s->auto_eoi = 0;
199 s->rotate_on_auto_eoi = 0;
200 s->special_fully_nested_mode = 0;
201 s->init4 = 0;
202}
203
204static void pic_ioport_write(void *opaque, u32 addr, u32 val)
205{
206 struct kvm_kpic_state *s = opaque;
207 int priority, cmd, irq;
208
209 addr &= 1;
210 if (addr == 0) {
211 if (val & 0x10) {
212 kvm_pic_reset(s); /* init */
213 /*
214 * deassert a pending interrupt
215 */
216 s->pics_state->irq_request(s->pics_state->
217 irq_request_opaque, 0);
218 s->init_state = 1;
219 s->init4 = val & 1;
220 if (val & 0x02)
221 printk(KERN_ERR "single mode not supported");
222 if (val & 0x08)
223 printk(KERN_ERR
224 "level sensitive irq not supported");
225 } else if (val & 0x08) {
226 if (val & 0x04)
227 s->poll = 1;
228 if (val & 0x02)
229 s->read_reg_select = val & 1;
230 if (val & 0x40)
231 s->special_mask = (val >> 5) & 1;
232 } else {
233 cmd = val >> 5;
234 switch (cmd) {
235 case 0:
236 case 4:
237 s->rotate_on_auto_eoi = cmd >> 2;
238 break;
239 case 1: /* end of interrupt */
240 case 5:
241 priority = get_priority(s, s->isr);
242 if (priority != 8) {
243 irq = (priority + s->priority_add) & 7;
244 s->isr &= ~(1 << irq);
245 if (cmd == 5)
246 s->priority_add = (irq + 1) & 7;
247 pic_update_irq(s->pics_state);
248 }
249 break;
250 case 3:
251 irq = val & 7;
252 s->isr &= ~(1 << irq);
253 pic_update_irq(s->pics_state);
254 break;
255 case 6:
256 s->priority_add = (val + 1) & 7;
257 pic_update_irq(s->pics_state);
258 break;
259 case 7:
260 irq = val & 7;
261 s->isr &= ~(1 << irq);
262 s->priority_add = (irq + 1) & 7;
263 pic_update_irq(s->pics_state);
264 break;
265 default:
266 break; /* no operation */
267 }
268 }
269 } else
270 switch (s->init_state) {
271 case 0: /* normal mode */
272 s->imr = val;
273 pic_update_irq(s->pics_state);
274 break;
275 case 1:
276 s->irq_base = val & 0xf8;
277 s->init_state = 2;
278 break;
279 case 2:
280 if (s->init4)
281 s->init_state = 3;
282 else
283 s->init_state = 0;
284 break;
285 case 3:
286 s->special_fully_nested_mode = (val >> 4) & 1;
287 s->auto_eoi = (val >> 1) & 1;
288 s->init_state = 0;
289 break;
290 }
291}
292
293static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
294{
295 int ret;
296
297 ret = pic_get_irq(s);
298 if (ret >= 0) {
299 if (addr1 >> 7) {
300 s->pics_state->pics[0].isr &= ~(1 << 2);
301 s->pics_state->pics[0].irr &= ~(1 << 2);
302 }
303 s->irr &= ~(1 << ret);
304 s->isr &= ~(1 << ret);
305 if (addr1 >> 7 || ret != 2)
306 pic_update_irq(s->pics_state);
307 } else {
308 ret = 0x07;
309 pic_update_irq(s->pics_state);
310 }
311
312 return ret;
313}
314
315static u32 pic_ioport_read(void *opaque, u32 addr1)
316{
317 struct kvm_kpic_state *s = opaque;
318 unsigned int addr;
319 int ret;
320
321 addr = addr1;
322 addr &= 1;
323 if (s->poll) {
324 ret = pic_poll_read(s, addr1);
325 s->poll = 0;
326 } else
327 if (addr == 0)
328 if (s->read_reg_select)
329 ret = s->isr;
330 else
331 ret = s->irr;
332 else
333 ret = s->imr;
334 return ret;
335}
336
337static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
338{
339 struct kvm_kpic_state *s = opaque;
340 s->elcr = val & s->elcr_mask;
341}
342
343static u32 elcr_ioport_read(void *opaque, u32 addr1)
344{
345 struct kvm_kpic_state *s = opaque;
346 return s->elcr;
347}
348
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
350{
351 switch (addr) {
352 case 0x20:
353 case 0x21:
354 case 0xa0:
355 case 0xa1:
356 case 0x4d0:
357 case 0x4d1:
358 return 1;
359 default:
360 return 0;
361 }
362}
363
364static void picdev_write(struct kvm_io_device *this,
365 gpa_t addr, int len, const void *val)
366{
367 struct kvm_pic *s = this->private;
368 unsigned char data = *(unsigned char *)val;
369
370 if (len != 1) {
371 if (printk_ratelimit())
372 printk(KERN_ERR "PIC: non byte write\n");
373 return;
374 }
375 switch (addr) {
376 case 0x20:
377 case 0x21:
378 case 0xa0:
379 case 0xa1:
380 pic_ioport_write(&s->pics[addr >> 7], addr, data);
381 break;
382 case 0x4d0:
383 case 0x4d1:
384 elcr_ioport_write(&s->pics[addr & 1], addr, data);
385 break;
386 }
387}
388
389static void picdev_read(struct kvm_io_device *this,
390 gpa_t addr, int len, void *val)
391{
392 struct kvm_pic *s = this->private;
393 unsigned char data = 0;
394
395 if (len != 1) {
396 if (printk_ratelimit())
397 printk(KERN_ERR "PIC: non byte read\n");
398 return;
399 }
400 switch (addr) {
401 case 0x20:
402 case 0x21:
403 case 0xa0:
404 case 0xa1:
405 data = pic_ioport_read(&s->pics[addr >> 7], addr);
406 break;
407 case 0x4d0:
408 case 0x4d1:
409 data = elcr_ioport_read(&s->pics[addr & 1], addr);
410 break;
411 }
412 *(unsigned char *)val = data;
413}
414
415/*
416 * callback when PIC0 irq status changed
417 */
418static void pic_irq_request(void *opaque, int level)
419{
420 struct kvm *kvm = opaque;
421 struct kvm_vcpu *vcpu = kvm->vcpus[0];
422
423 pic_irqchip(kvm)->output = level;
424 if (vcpu)
425 kvm_vcpu_kick(vcpu);
426}
427
428struct kvm_pic *kvm_create_pic(struct kvm *kvm)
429{
430 struct kvm_pic *s;
431 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
432 if (!s)
433 return NULL;
434 s->pics[0].elcr_mask = 0xf8;
435 s->pics[1].elcr_mask = 0xde;
436 s->irq_request = pic_irq_request;
437 s->irq_request_opaque = kvm;
438 s->pics[0].pics_state = s;
439 s->pics[1].pics_state = s;
440
441 /*
442 * Initialize PIO device
443 */
444 s->dev.read = picdev_read;
445 s->dev.write = picdev_write;
446 s->dev.in_range = picdev_in_range;
447 s->dev.private = s;
448 kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
449 return s;
450}
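The i8259 model above resolves which IRQ to deliver through get_priority(): priorities rotate with priority_add, and 8 means "nothing pending". The same loop in isolation, with a couple of worked cases (a standalone sketch mirroring the function added above):

	#include <stdio.h>

	static int get_priority(int priority_add, int mask)
	{
		int priority = 0;

		if (mask == 0)
			return 8;                       /* no irq pending */
		while ((mask & (1 << ((priority + priority_add) & 7))) == 0)
			priority++;
		return priority;
	}

	int main(void)
	{
		/* IRQ 3 pending, no rotation: priority 3 */
		printf("%d\n", get_priority(0, 1 << 3));
		/* IRQ 3 pending, rotated so IRQ 4 is highest (priority_add = 4): priority 7 */
		printf("%d\n", get_priority(4, 1 << 3));
		/* nothing pending: 8 */
		printf("%d\n", get_priority(0, 0));
		return 0;
	}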
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
new file mode 100644
index 00000000000..e5714759e97
--- /dev/null
+++ b/arch/x86/kvm/irq.c
@@ -0,0 +1,78 @@
1/*
2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/kvm_host.h>
24
25#include "irq.h"
26
27/*
 28 * check if there is a pending interrupt without
29 * intack.
30 */
31int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
32{
33 struct kvm_pic *s;
34
35 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
36 if (kvm_apic_accept_pic_intr(v)) {
37 s = pic_irqchip(v->kvm); /* PIC */
38 return s->output;
39 } else
40 return 0;
41 }
42 return 1;
43}
44EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
45
46/*
47 * Read pending interrupt vector and intack.
48 */
49int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
50{
51 struct kvm_pic *s;
52 int vector;
53
54 vector = kvm_get_apic_interrupt(v); /* APIC */
55 if (vector == -1) {
56 if (kvm_apic_accept_pic_intr(v)) {
57 s = pic_irqchip(v->kvm);
58 s->output = 0; /* PIC */
59 vector = kvm_pic_read_irq(s);
60 }
61 }
62 return vector;
63}
64EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
65
66void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
67{
68 kvm_inject_apic_timer_irqs(vcpu);
69 /* TODO: PIT, RTC etc. */
70}
71EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
72
73void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
74{
75 kvm_apic_timer_intr_post(vcpu, vec);
76 /* TODO: PIT, RTC etc. */
77}
78EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 00000000000..fa5ed5d59b5
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,88 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include <linux/mm_types.h>
26#include <linux/hrtimer.h>
27#include <linux/kvm_host.h>
28
29#include "iodev.h"
30#include "ioapic.h"
31#include "lapic.h"
32
33struct kvm;
34struct kvm_vcpu;
35
36typedef void irq_request_func(void *opaque, int level);
37
38struct kvm_kpic_state {
39 u8 last_irr; /* edge detection */
40 u8 irr; /* interrupt request register */
41 u8 imr; /* interrupt mask register */
42 u8 isr; /* interrupt service register */
43 u8 priority_add; /* highest irq priority */
44 u8 irq_base;
45 u8 read_reg_select;
46 u8 poll;
47 u8 special_mask;
48 u8 init_state;
49 u8 auto_eoi;
50 u8 rotate_on_auto_eoi;
51 u8 special_fully_nested_mode;
52 u8 init4; /* true if 4 byte init */
53 u8 elcr; /* PIIX edge/trigger selection */
54 u8 elcr_mask;
55 struct kvm_pic *pics_state;
56};
57
58struct kvm_pic {
59 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
60 irq_request_func *irq_request;
61 void *irq_request_opaque;
62 int output; /* intr from master PIC */
63 struct kvm_io_device dev;
64};
65
66struct kvm_pic *kvm_create_pic(struct kvm *kvm);
67void kvm_pic_set_irq(void *opaque, int irq, int level);
68int kvm_pic_read_irq(struct kvm_pic *s);
69void kvm_pic_update_irq(struct kvm_pic *s);
70
71static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
72{
73 return kvm->arch.vpic;
74}
75
76static inline int irqchip_in_kernel(struct kvm *kvm)
77{
78 return pic_irqchip(kvm) != NULL;
79}
80
81void kvm_pic_reset(struct kvm_kpic_state *s);
82
83void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
84void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
85void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
86void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
87
88#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
new file mode 100644
index 00000000000..ecdfe97e463
--- /dev/null
+++ b/arch/x86/kvm/kvm_svm.h
@@ -0,0 +1,45 @@
1#ifndef __KVM_SVM_H
2#define __KVM_SVM_H
3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/list.h>
7#include <linux/kvm_host.h>
8#include <asm/msr.h>
9
10#include "svm.h"
11
12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64
14 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
15 MSR_FS_BASE,
16#endif
17 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
18};
19
20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
21#define NUM_DB_REGS 4
22
23struct kvm_vcpu;
24
25struct vcpu_svm {
26 struct kvm_vcpu vcpu;
27 struct vmcb *vmcb;
28 unsigned long vmcb_pa;
29 struct svm_cpu_data *svm_data;
30 uint64_t asid_generation;
31
32 unsigned long db_regs[NUM_DB_REGS];
33
34 u64 next_rip;
35
36 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
37 u64 host_gs_base;
38 unsigned long host_cr2;
39 unsigned long host_db_regs[NUM_DB_REGS];
40 unsigned long host_dr6;
41 unsigned long host_dr7;
42};
43
44#endif
45
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
new file mode 100644
index 00000000000..2cbee9479ce
--- /dev/null
+++ b/arch/x86/kvm/lapic.c
@@ -0,0 +1,1154 @@
1
2/*
3 * Local APIC virtualization
4 *
5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel
8 *
9 * Authors:
10 * Dor Laor <dor.laor@qumranet.com>
11 * Gregory Haskins <ghaskins@novell.com>
12 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
13 *
14 * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 */
19
20#include <linux/kvm_host.h>
21#include <linux/kvm.h>
22#include <linux/mm.h>
23#include <linux/highmem.h>
24#include <linux/smp.h>
25#include <linux/hrtimer.h>
26#include <linux/io.h>
27#include <linux/module.h>
28#include <asm/processor.h>
29#include <asm/msr.h>
30#include <asm/page.h>
31#include <asm/current.h>
32#include <asm/apicdef.h>
33#include <asm/atomic.h>
34#include <asm/div64.h>
35#include "irq.h"
36
37#define PRId64 "d"
38#define PRIx64 "llx"
39#define PRIu64 "u"
40#define PRIo64 "o"
41
42#define APIC_BUS_CYCLE_NS 1
43
44/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
45#define apic_debug(fmt, arg...)
46
47#define APIC_LVT_NUM 6
48/* 14 is the version for Xeon and Pentium 8.4.8*/
49#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
50#define LAPIC_MMIO_LENGTH (1 << 12)
 51/* the following defines are not in apicdef.h */
52#define APIC_SHORT_MASK 0xc0000
53#define APIC_DEST_NOSHORT 0x0
54#define APIC_DEST_MASK 0x800
55#define MAX_APIC_VECTOR 256
56
57#define VEC_POS(v) ((v) & (32 - 1))
58#define REG_POS(v) (((v) >> 5) << 4)
59
60static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
61{
62 return *((u32 *) (apic->regs + reg_off));
63}
64
65static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
66{
67 *((u32 *) (apic->regs + reg_off)) = val;
68}
69
70static inline int apic_test_and_set_vector(int vec, void *bitmap)
71{
72 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
73}
74
75static inline int apic_test_and_clear_vector(int vec, void *bitmap)
76{
77 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
78}
79
80static inline void apic_set_vector(int vec, void *bitmap)
81{
82 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
83}
84
85static inline void apic_clear_vector(int vec, void *bitmap)
86{
87 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
88}
89
90static inline int apic_hw_enabled(struct kvm_lapic *apic)
91{
92 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
93}
94
95static inline int apic_sw_enabled(struct kvm_lapic *apic)
96{
97 return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
98}
99
100static inline int apic_enabled(struct kvm_lapic *apic)
101{
102 return apic_sw_enabled(apic) && apic_hw_enabled(apic);
103}
104
105#define LVT_MASK \
106 (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
107
108#define LINT_MASK \
109 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
110 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
111
112static inline int kvm_apic_id(struct kvm_lapic *apic)
113{
114 return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
115}
116
117static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
118{
119 return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
120}
121
122static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
123{
124 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
125}
126
127static inline int apic_lvtt_period(struct kvm_lapic *apic)
128{
129 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
130}
131
132static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
133 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
134 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
135 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
136 LINT_MASK, LINT_MASK, /* LVT0-1 */
137 LVT_MASK /* LVTERR */
138};
139
140static int find_highest_vector(void *bitmap)
141{
142 u32 *word = bitmap;
143 int word_offset = MAX_APIC_VECTOR >> 5;
144
145 while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
146 continue;
147
148 if (likely(!word_offset && !word[0]))
149 return -1;
150 else
151 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
152}
153
154static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
155{
156 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
157}
158
159static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
160{
161 apic_clear_vector(vec, apic->regs + APIC_IRR);
162}
163
164static inline int apic_find_highest_irr(struct kvm_lapic *apic)
165{
166 int result;
167
168 result = find_highest_vector(apic->regs + APIC_IRR);
169 ASSERT(result == -1 || result >= 16);
170
171 return result;
172}
173
174int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
175{
176 struct kvm_lapic *apic = vcpu->arch.apic;
177 int highest_irr;
178
179 if (!apic)
180 return 0;
181 highest_irr = apic_find_highest_irr(apic);
182
183 return highest_irr;
184}
185EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
186
187int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
188{
189 struct kvm_lapic *apic = vcpu->arch.apic;
190
191 if (!apic_test_and_set_irr(vec, apic)) {
192 /* a new pending irq is set in IRR */
193 if (trig)
194 apic_set_vector(vec, apic->regs + APIC_TMR);
195 else
196 apic_clear_vector(vec, apic->regs + APIC_TMR);
197 kvm_vcpu_kick(apic->vcpu);
198 return 1;
199 }
200 return 0;
201}
202
203static inline int apic_find_highest_isr(struct kvm_lapic *apic)
204{
205 int result;
206
207 result = find_highest_vector(apic->regs + APIC_ISR);
208 ASSERT(result == -1 || result >= 16);
209
210 return result;
211}
212
213static void apic_update_ppr(struct kvm_lapic *apic)
214{
215 u32 tpr, isrv, ppr;
216 int isr;
217
218 tpr = apic_get_reg(apic, APIC_TASKPRI);
219 isr = apic_find_highest_isr(apic);
220 isrv = (isr != -1) ? isr : 0;
221
222 if ((tpr & 0xf0) >= (isrv & 0xf0))
223 ppr = tpr & 0xff;
224 else
225 ppr = isrv & 0xf0;
226
227 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
228 apic, ppr, isr, isrv);
229
230 apic_set_reg(apic, APIC_PROCPRI, ppr);
231}
232
233static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
234{
235 apic_set_reg(apic, APIC_TASKPRI, tpr);
236 apic_update_ppr(apic);
237}
238
239int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
240{
241 return kvm_apic_id(apic) == dest;
242}
243
244int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
245{
246 int result = 0;
247 u8 logical_id;
248
249 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
250
251 switch (apic_get_reg(apic, APIC_DFR)) {
252 case APIC_DFR_FLAT:
253 if (logical_id & mda)
254 result = 1;
255 break;
256 case APIC_DFR_CLUSTER:
257 if (((logical_id >> 4) == (mda >> 0x4))
258 && (logical_id & mda & 0xf))
259 result = 1;
260 break;
261 default:
262 printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
263 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
264 break;
265 }
266
267 return result;
268}
269
270static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
271 int short_hand, int dest, int dest_mode)
272{
273 int result = 0;
274 struct kvm_lapic *target = vcpu->arch.apic;
275
276 apic_debug("target %p, source %p, dest 0x%x, "
277 "dest_mode 0x%x, short_hand 0x%x",
278 target, source, dest, dest_mode, short_hand);
279
280 ASSERT(!target);
281 switch (short_hand) {
282 case APIC_DEST_NOSHORT:
283 if (dest_mode == 0) {
284 /* Physical mode. */
285 if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
286 result = 1;
287 } else
288 /* Logical mode. */
289 result = kvm_apic_match_logical_addr(target, dest);
290 break;
291 case APIC_DEST_SELF:
292 if (target == source)
293 result = 1;
294 break;
295 case APIC_DEST_ALLINC:
296 result = 1;
297 break;
298 case APIC_DEST_ALLBUT:
299 if (target != source)
300 result = 1;
301 break;
302 default:
303 printk(KERN_WARNING "Bad dest shorthand value %x\n",
304 short_hand);
305 break;
306 }
307
308 return result;
309}
310
311/*
312 * Add a pending IRQ into lapic.
313 * Return 1 if successfully added and 0 if discarded.
314 */
315static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
316 int vector, int level, int trig_mode)
317{
318 int orig_irr, result = 0;
319 struct kvm_vcpu *vcpu = apic->vcpu;
320
321 switch (delivery_mode) {
322 case APIC_DM_FIXED:
323 case APIC_DM_LOWEST:
324 /* FIXME add logic for vcpu on reset */
325 if (unlikely(!apic_enabled(apic)))
326 break;
327
328 orig_irr = apic_test_and_set_irr(vector, apic);
329 if (orig_irr && trig_mode) {
330 apic_debug("level trig mode repeatedly for vector %d",
331 vector);
332 break;
333 }
334
335 if (trig_mode) {
336 apic_debug("level trig mode for vector %d", vector);
337 apic_set_vector(vector, apic->regs + APIC_TMR);
338 } else
339 apic_clear_vector(vector, apic->regs + APIC_TMR);
340
341 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
342 kvm_vcpu_kick(vcpu);
343 else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
344 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
345 if (waitqueue_active(&vcpu->wq))
346 wake_up_interruptible(&vcpu->wq);
347 }
348
349 result = (orig_irr == 0);
350 break;
351
352 case APIC_DM_REMRD:
353 printk(KERN_DEBUG "Ignoring delivery mode 3\n");
354 break;
355
356 case APIC_DM_SMI:
357 printk(KERN_DEBUG "Ignoring guest SMI\n");
358 break;
359 case APIC_DM_NMI:
360 printk(KERN_DEBUG "Ignoring guest NMI\n");
361 break;
362
363 case APIC_DM_INIT:
364 if (level) {
365 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
366 printk(KERN_DEBUG
367 "INIT on a runnable vcpu %d\n",
368 vcpu->vcpu_id);
369 vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
370 kvm_vcpu_kick(vcpu);
371 } else {
372 printk(KERN_DEBUG
373 "Ignoring de-assert INIT to vcpu %d\n",
374 vcpu->vcpu_id);
375 }
376
377 break;
378
379 case APIC_DM_STARTUP:
380 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
381 vcpu->vcpu_id, vector);
382 if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
383 vcpu->arch.sipi_vector = vector;
384 vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
385 if (waitqueue_active(&vcpu->wq))
386 wake_up_interruptible(&vcpu->wq);
387 }
388 break;
389
390 default:
391 printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
392 delivery_mode);
393 break;
394 }
395 return result;
396}
397
398static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
399 unsigned long bitmap)
400{
401 int last;
402 int next;
403 struct kvm_lapic *apic = NULL;
404
405 last = kvm->arch.round_robin_prev_vcpu;
406 next = last;
407
408 do {
409 if (++next == KVM_MAX_VCPUS)
410 next = 0;
411 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
412 continue;
413 apic = kvm->vcpus[next]->arch.apic;
414 if (apic && apic_enabled(apic))
415 break;
416 apic = NULL;
417 } while (next != last);
418 kvm->arch.round_robin_prev_vcpu = next;
419
420 if (!apic)
421 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
422
423 return apic;
424}
425
426struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
427 unsigned long bitmap)
428{
429 struct kvm_lapic *apic;
430
431 apic = kvm_apic_round_robin(kvm, vector, bitmap);
432 if (apic)
433 return apic->vcpu;
434 return NULL;
435}
436
437static void apic_set_eoi(struct kvm_lapic *apic)
438{
439 int vector = apic_find_highest_isr(apic);
440
441 /*
 442	 * Not every EOI write has a corresponding ISR bit set;
 443	 * one example is when the kernel checks the timer in setup_IO_APIC
444 */
445 if (vector == -1)
446 return;
447
448 apic_clear_vector(vector, apic->regs + APIC_ISR);
449 apic_update_ppr(apic);
450
451 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
452 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
453}
454
455static void apic_send_ipi(struct kvm_lapic *apic)
456{
457 u32 icr_low = apic_get_reg(apic, APIC_ICR);
458 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
459
460 unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
461 unsigned int short_hand = icr_low & APIC_SHORT_MASK;
462 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
463 unsigned int level = icr_low & APIC_INT_ASSERT;
464 unsigned int dest_mode = icr_low & APIC_DEST_MASK;
465 unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
466 unsigned int vector = icr_low & APIC_VECTOR_MASK;
467
468 struct kvm_vcpu *target;
469 struct kvm_vcpu *vcpu;
470 unsigned long lpr_map = 0;
471 int i;
472
473 apic_debug("icr_high 0x%x, icr_low 0x%x, "
474 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
475 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
476 icr_high, icr_low, short_hand, dest,
477 trig_mode, level, dest_mode, delivery_mode, vector);
478
479 for (i = 0; i < KVM_MAX_VCPUS; i++) {
480 vcpu = apic->vcpu->kvm->vcpus[i];
481 if (!vcpu)
482 continue;
483
484 if (vcpu->arch.apic &&
485 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
486 if (delivery_mode == APIC_DM_LOWEST)
487 set_bit(vcpu->vcpu_id, &lpr_map);
488 else
489 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
490 vector, level, trig_mode);
491 }
492 }
493
494 if (delivery_mode == APIC_DM_LOWEST) {
495 target = kvm_get_lowest_prio_vcpu(apic->vcpu->kvm, vector, lpr_map);
496 if (target != NULL)
497 __apic_accept_irq(target->arch.apic, delivery_mode,
498 vector, level, trig_mode);
499 }
500}
501
502static u32 apic_get_tmcct(struct kvm_lapic *apic)
503{
504 u64 counter_passed;
505 ktime_t passed, now;
506 u32 tmcct;
507
508 ASSERT(apic != NULL);
509
510 now = apic->timer.dev.base->get_time();
511 tmcct = apic_get_reg(apic, APIC_TMICT);
512
513 /* if initial count is 0, current count should also be 0 */
514 if (tmcct == 0)
515 return 0;
516
517 if (unlikely(ktime_to_ns(now) <=
518 ktime_to_ns(apic->timer.last_update))) {
519 /* Wrap around */
520 passed = ktime_add(( {
521 (ktime_t) {
522 .tv64 = KTIME_MAX -
523 (apic->timer.last_update).tv64}; }
524 ), now);
525 apic_debug("time elapsed\n");
526 } else
527 passed = ktime_sub(now, apic->timer.last_update);
528
529 counter_passed = div64_64(ktime_to_ns(passed),
530 (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
531
532 if (counter_passed > tmcct) {
533 if (unlikely(!apic_lvtt_period(apic))) {
534 /* one-shot timers stick at 0 until reset */
535 tmcct = 0;
536 } else {
537 /*
538 * periodic timers reload from APIC_TMICT when they
539 * hit 0. The while loop simulates this happening N
540 * times. (counter_passed %= tmcct) would also work,
541 * but a 64-bit modulo needs do_div() on 32-bit hosts.
542 */
543 while (counter_passed > tmcct)
544 counter_passed -= tmcct;
545 tmcct -= counter_passed;
546 }
547 } else {
548 tmcct -= counter_passed;
549 }
550
551 return tmcct;
552}
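/*
 * Illustrative sketch, not part of this patch: the TMCCT arithmetic above as
 * a standalone user-space helper.  The sketch_* name and the assumed constant
 * APIC_BUS_CYCLE_NS_SKETCH are made up for illustration; the modulo mirrors
 * the while loop above, including its behaviour at exact multiples.
 */
#include <assert.h>
#include <stdint.h>

#define APIC_BUS_CYCLE_NS_SKETCH 1ULL	/* assumed bus cycle length in ns */

static uint32_t sketch_tmcct(uint32_t tmict, uint64_t elapsed_ns,
			     uint32_t divide_count, int periodic)
{
	uint64_t ticks;

	if (tmict == 0)
		return 0;			/* initial count 0 reads back as 0 */
	ticks = elapsed_ns / (APIC_BUS_CYCLE_NS_SKETCH * divide_count);
	if (ticks <= tmict)
		return tmict - (uint32_t)ticks;	/* still counting down */
	if (!periodic)
		return 0;			/* one-shot timers stick at 0 */
	ticks %= tmict;				/* periodic timers reload from TMICT */
	if (ticks == 0)
		ticks = tmict;			/* exact multiple: matches the while loop */
	return tmict - (uint32_t)ticks;
}

int main(void)
{
	assert(sketch_tmcct(1000, 250, 1, 0) == 750);	/* one-shot, 250 ticks elapsed */
	assert(sketch_tmcct(1000, 5000, 1, 0) == 0);	/* one-shot expired */
	assert(sketch_tmcct(1000, 2250, 1, 1) == 750);	/* periodic, reloaded twice */
	return 0;
}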
553
554static void __report_tpr_access(struct kvm_lapic *apic, bool write)
555{
556 struct kvm_vcpu *vcpu = apic->vcpu;
557 struct kvm_run *run = vcpu->run;
558
559 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
560 kvm_x86_ops->cache_regs(vcpu);
561 run->tpr_access.rip = vcpu->arch.rip;
562 run->tpr_access.is_write = write;
563}
564
565static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
566{
567 if (apic->vcpu->arch.tpr_access_reporting)
568 __report_tpr_access(apic, write);
569}
570
571static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
572{
573 u32 val = 0;
574
575 if (offset >= LAPIC_MMIO_LENGTH)
576 return 0;
577
578 switch (offset) {
579 case APIC_ARBPRI:
580 printk(KERN_WARNING "Access APIC ARBPRI register "
581 "which is for P6\n");
582 break;
583
584 case APIC_TMCCT: /* Timer CCR */
585 val = apic_get_tmcct(apic);
586 break;
587
588 case APIC_TASKPRI:
589 report_tpr_access(apic, false);
590 /* fall thru */
591 default:
592 apic_update_ppr(apic);
593 val = apic_get_reg(apic, offset);
594 break;
595 }
596
597 return val;
598}
599
600static void apic_mmio_read(struct kvm_io_device *this,
601 gpa_t address, int len, void *data)
602{
603 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
604 unsigned int offset = address - apic->base_address;
605 unsigned char alignment = offset & 0xf;
606 u32 result;
607
608 if ((alignment + len) > 4) {
609 printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d\n",
610 (unsigned long)address, len);
611 return;
612 }
613 result = __apic_read(apic, offset & ~0xf);
614
615 switch (len) {
616 case 1:
617 case 2:
618 case 4:
619 memcpy(data, (char *)&result + alignment, len);
620 break;
621 default:
622 printk(KERN_ERR "Local APIC read with len = %x, "
623 "should be 1,2, or 4 instead\n", len);
624 break;
625 }
626}
627
628static void update_divide_count(struct kvm_lapic *apic)
629{
630 u32 tmp1, tmp2, tdcr;
631
632 tdcr = apic_get_reg(apic, APIC_TDCR);
633 tmp1 = tdcr & 0xf;
634 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
635 apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
636
637 apic_debug("timer divide count is 0x%x\n",
638 apic->timer.divide_count);
639}
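/*
 * Illustrative sketch, not part of this patch: the TDCR bit shuffling above
 * implements the SDM divide-configuration table, where TDCR bits 0, 1 and 3
 * select the divisor and the encoding 0b111 means "divide by 1".  The
 * sketch_* helper simply restates that computation and checks it against the
 * table.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t sketch_divide_count(uint32_t tdcr)
{
	uint32_t bits = tdcr & 0xf;
	uint32_t v = (bits & 0x3) | ((bits & 0x8) >> 1);	/* 3-bit selector */

	return 0x1 << ((v + 1) & 0x7);		/* selectors 0..6 -> 2..128, 7 -> 1 */
}

int main(void)
{
	/* SDM table: selector 0..7 maps to divisors 2,4,8,16,32,64,128,1 */
	static const uint32_t expected[8] = { 2, 4, 8, 16, 32, 64, 128, 1 };
	uint32_t v;

	for (v = 0; v < 8; v++) {
		uint32_t tdcr = (v & 0x3) | ((v & 0x4) << 1);	/* selector bit 2 lives in TDCR bit 3 */

		assert(sketch_divide_count(tdcr) == expected[v]);
	}
	return 0;
}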
640
641static void start_apic_timer(struct kvm_lapic *apic)
642{
643 ktime_t now = apic->timer.dev.base->get_time();
644
645 apic->timer.last_update = now;
646
647 apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
648 APIC_BUS_CYCLE_NS * apic->timer.divide_count;
649 atomic_set(&apic->timer.pending, 0);
650 hrtimer_start(&apic->timer.dev,
651 ktime_add_ns(now, apic->timer.period),
652 HRTIMER_MODE_ABS);
653
654 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
655 PRIx64 ", "
656 "timer initial count 0x%x, period %lldns, "
657 "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
658 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
659 apic_get_reg(apic, APIC_TMICT),
660 apic->timer.period,
661 ktime_to_ns(ktime_add_ns(now,
662 apic->timer.period)));
663}
664
665static void apic_mmio_write(struct kvm_io_device *this,
666 gpa_t address, int len, const void *data)
667{
668 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
669 unsigned int offset = address - apic->base_address;
670 unsigned char alignment = offset & 0xf;
671 u32 val;
672
673 /*
674 * APIC registers are aligned on 128-bit boundaries.
675 * 32/64/128-bit registers must be accessed through 32-bit loads and stores.
676 * Refer to SDM 8.4.1.
677 */
678 if (len != 4 || alignment) {
679 if (printk_ratelimit())
680 printk(KERN_ERR "apic write: bad size=%d %lx\n",
681 len, (long)address);
682 return;
683 }
684
685 val = *(u32 *) data;
686
687 /* EOI writes are too common to be worth logging */
688 if (offset != APIC_EOI)
689 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
690 "0x%x\n", __FUNCTION__, offset, len, val);
691
692 offset &= 0xff0;
693
694 switch (offset) {
695 case APIC_ID: /* Local APIC ID */
696 apic_set_reg(apic, APIC_ID, val);
697 break;
698
699 case APIC_TASKPRI:
700 report_tpr_access(apic, true);
701 apic_set_tpr(apic, val & 0xff);
702 break;
703
704 case APIC_EOI:
705 apic_set_eoi(apic);
706 break;
707
708 case APIC_LDR:
709 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
710 break;
711
712 case APIC_DFR:
713 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
714 break;
715
716 case APIC_SPIV:
717 apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
718 if (!(val & APIC_SPIV_APIC_ENABLED)) {
719 int i;
720 u32 lvt_val;
721
722 for (i = 0; i < APIC_LVT_NUM; i++) {
723 lvt_val = apic_get_reg(apic,
724 APIC_LVTT + 0x10 * i);
725 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
726 lvt_val | APIC_LVT_MASKED);
727 }
728 atomic_set(&apic->timer.pending, 0);
729
730 }
731 break;
732
733 case APIC_ICR:
734 /* No delay here, so we always clear the pending bit */
735 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
736 apic_send_ipi(apic);
737 break;
738
739 case APIC_ICR2:
740 apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
741 break;
742
743 case APIC_LVTT:
744 case APIC_LVTTHMR:
745 case APIC_LVTPC:
746 case APIC_LVT0:
747 case APIC_LVT1:
748 case APIC_LVTERR:
749 /* TODO: Check vector */
750 if (!apic_sw_enabled(apic))
751 val |= APIC_LVT_MASKED;
752
753 val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
754 apic_set_reg(apic, offset, val);
755
756 break;
757
758 case APIC_TMICT:
759 hrtimer_cancel(&apic->timer.dev);
760 apic_set_reg(apic, APIC_TMICT, val);
761 start_apic_timer(apic);
762 return;
763
764 case APIC_TDCR:
765 if (val & 4)
766 printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
767 apic_set_reg(apic, APIC_TDCR, val);
768 update_divide_count(apic);
769 break;
770
771 default:
772 apic_debug("Local APIC Write to read-only register %x\n",
773 offset);
774 break;
775 }
776
777}
778
779static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
780{
781 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
782 int ret = 0;
783
784
785 if (apic_hw_enabled(apic) &&
786 (addr >= apic->base_address) &&
787 (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
788 ret = 1;
789
790 return ret;
791}
792
793void kvm_free_lapic(struct kvm_vcpu *vcpu)
794{
795 if (!vcpu->arch.apic)
796 return;
797
798 hrtimer_cancel(&vcpu->arch.apic->timer.dev);
799
800 if (vcpu->arch.apic->regs_page)
801 __free_page(vcpu->arch.apic->regs_page);
802
803 kfree(vcpu->arch.apic);
804}
805
806/*
807 *----------------------------------------------------------------------
808 * LAPIC interface
809 *----------------------------------------------------------------------
810 */
811
812void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
813{
814 struct kvm_lapic *apic = vcpu->arch.apic;
815
816 if (!apic)
817 return;
818 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
819 | (apic_get_reg(apic, APIC_TASKPRI) & 4));
820}
821
822u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
823{
824 struct kvm_lapic *apic = vcpu->arch.apic;
825 u64 tpr;
826
827 if (!apic)
828 return 0;
829 tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
830
831 return (tpr & 0xf0) >> 4;
832}
833EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
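/*
 * Illustrative sketch, not part of this patch: CR8 carries only the upper
 * nibble (the priority class) of the 8-bit Task Priority Register, which is
 * why kvm_lapic_set_tpr() and kvm_lapic_get_cr8() shift by four in opposite
 * directions.  The sketch_* helpers are made up for illustration and ignore
 * the low TASKPRI bit that the kernel routine additionally preserves.
 */
#include <assert.h>
#include <stdint.h>

static uint8_t sketch_tpr_from_cr8(uint64_t cr8)
{
	return (uint8_t)((cr8 & 0x0f) << 4);	/* CR8[3:0] -> TPR[7:4] */
}

static uint64_t sketch_cr8_from_tpr(uint8_t tpr)
{
	return (tpr & 0xf0) >> 4;		/* TPR[7:4] -> CR8[3:0] */
}

int main(void)
{
	uint64_t cr8;

	for (cr8 = 0; cr8 < 16; cr8++)
		assert(sketch_cr8_from_tpr(sketch_tpr_from_cr8(cr8)) == cr8);
	return 0;
}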
834
835void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
836{
837 struct kvm_lapic *apic = vcpu->arch.apic;
838
839 if (!apic) {
840 value |= MSR_IA32_APICBASE_BSP;
841 vcpu->arch.apic_base = value;
842 return;
843 }
844 if (apic->vcpu->vcpu_id)
845 value &= ~MSR_IA32_APICBASE_BSP;
846
847 vcpu->arch.apic_base = value;
848 apic->base_address = apic->vcpu->arch.apic_base &
849 MSR_IA32_APICBASE_BASE;
850
851 /* with FSB delivery interrupt, we can restart APIC functionality */
852 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
853 "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
854
855}
856
857u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
858{
859 return vcpu->arch.apic_base;
860}
861EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
862
863void kvm_lapic_reset(struct kvm_vcpu *vcpu)
864{
865 struct kvm_lapic *apic;
866 int i;
867
868 apic_debug("%s\n", __FUNCTION__);
869
870 ASSERT(vcpu);
871 apic = vcpu->arch.apic;
872 ASSERT(apic != NULL);
873
874 /* Stop the timer in case it's a reset to an active apic */
875 hrtimer_cancel(&apic->timer.dev);
876
877 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
878 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
879
880 for (i = 0; i < APIC_LVT_NUM; i++)
881 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
882 apic_set_reg(apic, APIC_LVT0,
883 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
884
885 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
886 apic_set_reg(apic, APIC_SPIV, 0xff);
887 apic_set_reg(apic, APIC_TASKPRI, 0);
888 apic_set_reg(apic, APIC_LDR, 0);
889 apic_set_reg(apic, APIC_ESR, 0);
890 apic_set_reg(apic, APIC_ICR, 0);
891 apic_set_reg(apic, APIC_ICR2, 0);
892 apic_set_reg(apic, APIC_TDCR, 0);
893 apic_set_reg(apic, APIC_TMICT, 0);
894 for (i = 0; i < 8; i++) {
895 apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
896 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
897 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
898 }
899 update_divide_count(apic);
900 atomic_set(&apic->timer.pending, 0);
901 if (vcpu->vcpu_id == 0)
902 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
903 apic_update_ppr(apic);
904
905 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
906 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
907 vcpu, kvm_apic_id(apic),
908 vcpu->arch.apic_base, apic->base_address);
909}
910EXPORT_SYMBOL_GPL(kvm_lapic_reset);
911
912int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
913{
914 struct kvm_lapic *apic = vcpu->arch.apic;
915 int ret = 0;
916
917 if (!apic)
918 return 0;
919 ret = apic_enabled(apic);
920
921 return ret;
922}
923EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
924
925/*
926 *----------------------------------------------------------------------
927 * timer interface
928 *----------------------------------------------------------------------
929 */
930
931/* TODO: make sure __apic_timer_fn runs in current pCPU */
932static int __apic_timer_fn(struct kvm_lapic *apic)
933{
934 int result = 0;
935 wait_queue_head_t *q = &apic->vcpu->wq;
936
937 atomic_inc(&apic->timer.pending);
938 if (waitqueue_active(q)) {
939 apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
940 wake_up_interruptible(q);
941 }
942 if (apic_lvtt_period(apic)) {
943 result = 1;
944 apic->timer.dev.expires = ktime_add_ns(
945 apic->timer.dev.expires,
946 apic->timer.period);
947 }
948 return result;
949}
950
951static int __inject_apic_timer_irq(struct kvm_lapic *apic)
952{
953 int vector;
954
955 vector = apic_lvt_vector(apic, APIC_LVTT);
956 return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
957}
958
959static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
960{
961 struct kvm_lapic *apic;
962 int restart_timer = 0;
963
964 apic = container_of(data, struct kvm_lapic, timer.dev);
965
966 restart_timer = __apic_timer_fn(apic);
967
968 if (restart_timer)
969 return HRTIMER_RESTART;
970 else
971 return HRTIMER_NORESTART;
972}
973
974int kvm_create_lapic(struct kvm_vcpu *vcpu)
975{
976 struct kvm_lapic *apic;
977
978 ASSERT(vcpu != NULL);
979 apic_debug("apic_init %d\n", vcpu->vcpu_id);
980
981 apic = kzalloc(sizeof(*apic), GFP_KERNEL);
982 if (!apic)
983 goto nomem;
984
985 vcpu->arch.apic = apic;
986
987 apic->regs_page = alloc_page(GFP_KERNEL);
988 if (apic->regs_page == NULL) {
989 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
990 vcpu->vcpu_id);
991 goto nomem_free_apic;
992 }
993 apic->regs = page_address(apic->regs_page);
994 memset(apic->regs, 0, PAGE_SIZE);
995 apic->vcpu = vcpu;
996
997 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
998 apic->timer.dev.function = apic_timer_fn;
999 apic->base_address = APIC_DEFAULT_PHYS_BASE;
1000 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
1001
1002 kvm_lapic_reset(vcpu);
1003 apic->dev.read = apic_mmio_read;
1004 apic->dev.write = apic_mmio_write;
1005 apic->dev.in_range = apic_mmio_range;
1006 apic->dev.private = apic;
1007
1008 return 0;
1009nomem_free_apic:
1010 kfree(apic);
1011nomem:
1012 return -ENOMEM;
1013}
1014EXPORT_SYMBOL_GPL(kvm_create_lapic);
1015
1016int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
1017{
1018 struct kvm_lapic *apic = vcpu->arch.apic;
1019 int highest_irr;
1020
1021 if (!apic || !apic_enabled(apic))
1022 return -1;
1023
1024 apic_update_ppr(apic);
1025 highest_irr = apic_find_highest_irr(apic);
1026 if ((highest_irr == -1) ||
1027 ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
1028 return -1;
1029 return highest_irr;
1030}
1031
1032int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1033{
1034 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1035 int r = 0;
1036
1037 if (vcpu->vcpu_id == 0) {
1038 if (!apic_hw_enabled(vcpu->arch.apic))
1039 r = 1;
1040 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1041 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1042 r = 1;
1043 }
1044 return r;
1045}
1046
1047void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1048{
1049 struct kvm_lapic *apic = vcpu->arch.apic;
1050
1051 if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
1052 atomic_read(&apic->timer.pending) > 0) {
1053 if (__inject_apic_timer_irq(apic))
1054 atomic_dec(&apic->timer.pending);
1055 }
1056}
1057
1058void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1059{
1060 struct kvm_lapic *apic = vcpu->arch.apic;
1061
1062 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
1063 apic->timer.last_update = ktime_add_ns(
1064 apic->timer.last_update,
1065 apic->timer.period);
1066}
1067
1068int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1069{
1070 int vector = kvm_apic_has_interrupt(vcpu);
1071 struct kvm_lapic *apic = vcpu->arch.apic;
1072
1073 if (vector == -1)
1074 return -1;
1075
1076 apic_set_vector(vector, apic->regs + APIC_ISR);
1077 apic_update_ppr(apic);
1078 apic_clear_irr(vector, apic);
1079 return vector;
1080}
1081
1082void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1083{
1084 struct kvm_lapic *apic = vcpu->arch.apic;
1085
1086 apic->base_address = vcpu->arch.apic_base &
1087 MSR_IA32_APICBASE_BASE;
1088 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1089 apic_update_ppr(apic);
1090 hrtimer_cancel(&apic->timer.dev);
1091 update_divide_count(apic);
1092 start_apic_timer(apic);
1093}
1094
1095void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1096{
1097 struct kvm_lapic *apic = vcpu->arch.apic;
1098 struct hrtimer *timer;
1099
1100 if (!apic)
1101 return;
1102
1103 timer = &apic->timer.dev;
1104 if (hrtimer_cancel(timer))
1105 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
1106}
1107
1108void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1109{
1110 u32 data;
1111 void *vapic;
1112
1113 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
1114 return;
1115
1116 vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
1117 data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
1118 kunmap_atomic(vapic, KM_USER0);
1119
1120 apic_set_tpr(vcpu->arch.apic, data & 0xff);
1121}
1122
1123void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1124{
1125 u32 data, tpr;
1126 int max_irr, max_isr;
1127 struct kvm_lapic *apic;
1128 void *vapic;
1129
1130 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
1131 return;
1132
1133 apic = vcpu->arch.apic;
1134 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1135 max_irr = apic_find_highest_irr(apic);
1136 if (max_irr < 0)
1137 max_irr = 0;
1138 max_isr = apic_find_highest_isr(apic);
1139 if (max_isr < 0)
1140 max_isr = 0;
1141 data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
1142
1143 vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
1144 *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
1145 kunmap_atomic(vapic, KM_USER0);
1146}
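/*
 * Illustrative sketch, not part of this patch: layout of the 32-bit word
 * kvm_lapic_sync_to_vapic() writes into the guest's vapic page -- the TPR in
 * byte 0, the class of the highest in-service vector in byte 1, and the
 * highest pending (IRR) vector in byte 3; byte 2 is unused.  The sketch_*
 * helper is made up for illustration.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t sketch_pack_vapic(uint8_t tpr, uint8_t max_isr, uint8_t max_irr)
{
	return (uint32_t)tpr |
	       ((uint32_t)(max_isr & 0xf0) << 8) |
	       ((uint32_t)max_irr << 24);
}

int main(void)
{
	uint32_t data = sketch_pack_vapic(0x20, 0x31, 0x51);

	assert((data & 0xff) == 0x20);		/* TPR */
	assert(((data >> 8) & 0xff) == 0x30);	/* ISR priority class only */
	assert((data >> 24) == 0x51);		/* highest pending IRR vector */
	return 0;
}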
1147
1148void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
1149{
1150 if (!irqchip_in_kernel(vcpu->kvm))
1151 return;
1152
1153 vcpu->arch.apic->vapic_addr = vapic_addr;
1154}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 00000000000..676c396c9ce
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,50 @@
1#ifndef __KVM_X86_LAPIC_H
2#define __KVM_X86_LAPIC_H
3
4#include "iodev.h"
5
6#include <linux/kvm_host.h>
7
8struct kvm_lapic {
9 unsigned long base_address;
10 struct kvm_io_device dev;
11 struct {
12 atomic_t pending;
13 s64 period; /* unit: ns */
14 u32 divide_count;
15 ktime_t last_update;
16 struct hrtimer dev;
17 } timer;
18 struct kvm_vcpu *vcpu;
19 struct page *regs_page;
20 void *regs;
21 gpa_t vapic_addr;
22 struct page *vapic_page;
23};
24int kvm_create_lapic(struct kvm_vcpu *vcpu);
25void kvm_free_lapic(struct kvm_vcpu *vcpu);
26
27int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
28int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
29int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
30void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
37int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
38
39u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
40void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
41void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
42int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
44void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
45
46void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
47void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
48void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
49
50#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644
index 00000000000..8efdcdbebb0
--- /dev/null
+++ b/arch/x86/kvm/mmu.c
@@ -0,0 +1,1885 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "mmu.h"
22
23#include <linux/kvm_host.h>
24#include <linux/types.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/module.h>
29#include <linux/swap.h>
30
31#include <asm/page.h>
32#include <asm/cmpxchg.h>
33#include <asm/io.h>
34
35#undef MMU_DEBUG
36
37#undef AUDIT
38
39#ifdef AUDIT
40static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
41#else
42static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
43#endif
44
45#ifdef MMU_DEBUG
46
47#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
48#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
49
50#else
51
52#define pgprintk(x...) do { } while (0)
53#define rmap_printk(x...) do { } while (0)
54
55#endif
56
57#if defined(MMU_DEBUG) || defined(AUDIT)
58static int dbg = 1;
59#endif
60
61#ifndef MMU_DEBUG
62#define ASSERT(x) do { } while (0)
63#else
64#define ASSERT(x) \
65 if (!(x)) { \
66 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
67 __FILE__, __LINE__, #x); \
68 }
69#endif
70
71#define PT64_PT_BITS 9
72#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
73#define PT32_PT_BITS 10
74#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
75
76#define PT_WRITABLE_SHIFT 1
77
78#define PT_PRESENT_MASK (1ULL << 0)
79#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
80#define PT_USER_MASK (1ULL << 2)
81#define PT_PWT_MASK (1ULL << 3)
82#define PT_PCD_MASK (1ULL << 4)
83#define PT_ACCESSED_MASK (1ULL << 5)
84#define PT_DIRTY_MASK (1ULL << 6)
85#define PT_PAGE_SIZE_MASK (1ULL << 7)
86#define PT_PAT_MASK (1ULL << 7)
87#define PT_GLOBAL_MASK (1ULL << 8)
88#define PT64_NX_SHIFT 63
89#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
90
91#define PT_PAT_SHIFT 7
92#define PT_DIR_PAT_SHIFT 12
93#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
94
95#define PT32_DIR_PSE36_SIZE 4
96#define PT32_DIR_PSE36_SHIFT 13
97#define PT32_DIR_PSE36_MASK \
98 (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
99
100
101#define PT_FIRST_AVAIL_BITS_SHIFT 9
102#define PT64_SECOND_AVAIL_BITS_SHIFT 52
103
104#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
105
106#define VALID_PAGE(x) ((x) != INVALID_PAGE)
107
108#define PT64_LEVEL_BITS 9
109
110#define PT64_LEVEL_SHIFT(level) \
111 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
112
113#define PT64_LEVEL_MASK(level) \
114 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
115
116#define PT64_INDEX(address, level)\
117 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
118
119
120#define PT32_LEVEL_BITS 10
121
122#define PT32_LEVEL_SHIFT(level) \
123 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
124
125#define PT32_LEVEL_MASK(level) \
126 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
127
128#define PT32_INDEX(address, level)\
129 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
130
131
132#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
133#define PT64_DIR_BASE_ADDR_MASK \
134 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
135
136#define PT32_BASE_ADDR_MASK PAGE_MASK
137#define PT32_DIR_BASE_ADDR_MASK \
138 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
139
140#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
141 | PT64_NX_MASK)
142
143#define PFERR_PRESENT_MASK (1U << 0)
144#define PFERR_WRITE_MASK (1U << 1)
145#define PFERR_USER_MASK (1U << 2)
146#define PFERR_FETCH_MASK (1U << 4)
147
148#define PT64_ROOT_LEVEL 4
149#define PT32_ROOT_LEVEL 2
150#define PT32E_ROOT_LEVEL 3
151
152#define PT_DIRECTORY_LEVEL 2
153#define PT_PAGE_TABLE_LEVEL 1
154
155#define RMAP_EXT 4
156
157#define ACC_EXEC_MASK 1
158#define ACC_WRITE_MASK PT_WRITABLE_MASK
159#define ACC_USER_MASK PT_USER_MASK
160#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
161
162struct kvm_rmap_desc {
163 u64 *shadow_ptes[RMAP_EXT];
164 struct kvm_rmap_desc *more;
165};
166
167static struct kmem_cache *pte_chain_cache;
168static struct kmem_cache *rmap_desc_cache;
169static struct kmem_cache *mmu_page_header_cache;
170
171static u64 __read_mostly shadow_trap_nonpresent_pte;
172static u64 __read_mostly shadow_notrap_nonpresent_pte;
173
174void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
175{
176 shadow_trap_nonpresent_pte = trap_pte;
177 shadow_notrap_nonpresent_pte = notrap_pte;
178}
179EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
180
181static int is_write_protection(struct kvm_vcpu *vcpu)
182{
183 return vcpu->arch.cr0 & X86_CR0_WP;
184}
185
186static int is_cpuid_PSE36(void)
187{
188 return 1;
189}
190
191static int is_nx(struct kvm_vcpu *vcpu)
192{
193 return vcpu->arch.shadow_efer & EFER_NX;
194}
195
196static int is_present_pte(unsigned long pte)
197{
198 return pte & PT_PRESENT_MASK;
199}
200
201static int is_shadow_present_pte(u64 pte)
202{
203 pte &= ~PT_SHADOW_IO_MARK;
204 return pte != shadow_trap_nonpresent_pte
205 && pte != shadow_notrap_nonpresent_pte;
206}
207
208static int is_writeble_pte(unsigned long pte)
209{
210 return pte & PT_WRITABLE_MASK;
211}
212
213static int is_dirty_pte(unsigned long pte)
214{
215 return pte & PT_DIRTY_MASK;
216}
217
218static int is_io_pte(unsigned long pte)
219{
220 return pte & PT_SHADOW_IO_MARK;
221}
222
223static int is_rmap_pte(u64 pte)
224{
225 return pte != shadow_trap_nonpresent_pte
226 && pte != shadow_notrap_nonpresent_pte;
227}
228
229static gfn_t pse36_gfn_delta(u32 gpte)
230{
231 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
232
233 return (gpte & PT32_DIR_PSE36_MASK) << shift;
234}
235
236static void set_shadow_pte(u64 *sptep, u64 spte)
237{
238#ifdef CONFIG_X86_64
239 set_64bit((unsigned long *)sptep, spte);
240#else
241 set_64bit((unsigned long long *)sptep, spte);
242#endif
243}
244
245static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
246 struct kmem_cache *base_cache, int min)
247{
248 void *obj;
249
250 if (cache->nobjs >= min)
251 return 0;
252 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
253 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
254 if (!obj)
255 return -ENOMEM;
256 cache->objects[cache->nobjs++] = obj;
257 }
258 return 0;
259}
260
261static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
262{
263 while (mc->nobjs)
264 kfree(mc->objects[--mc->nobjs]);
265}
266
267static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
268 int min)
269{
270 struct page *page;
271
272 if (cache->nobjs >= min)
273 return 0;
274 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
275 page = alloc_page(GFP_KERNEL);
276 if (!page)
277 return -ENOMEM;
278 set_page_private(page, 0);
279 cache->objects[cache->nobjs++] = page_address(page);
280 }
281 return 0;
282}
283
284static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
285{
286 while (mc->nobjs)
287 free_page((unsigned long)mc->objects[--mc->nobjs]);
288}
289
290static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
291{
292 int r;
293
294 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
295 pte_chain_cache, 4);
296 if (r)
297 goto out;
298 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
299 rmap_desc_cache, 1);
300 if (r)
301 goto out;
302 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
303 if (r)
304 goto out;
305 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
306 mmu_page_header_cache, 4);
307out:
308 return r;
309}
310
311static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
312{
313 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
314 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
315 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
316 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
317}
318
319static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
320 size_t size)
321{
322 void *p;
323
324 BUG_ON(!mc->nobjs);
325 p = mc->objects[--mc->nobjs];
326 memset(p, 0, size);
327 return p;
328}
329
330static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
331{
332 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
333 sizeof(struct kvm_pte_chain));
334}
335
336static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
337{
338 kfree(pc);
339}
340
341static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
342{
343 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
344 sizeof(struct kvm_rmap_desc));
345}
346
347static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
348{
349 kfree(rd);
350}
351
352/*
353 * Take gfn and return the reverse mapping to it.
354 * Note: gfn must be unaliased before this function is called
355 */
356
357static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
358{
359 struct kvm_memory_slot *slot;
360
361 slot = gfn_to_memslot(kvm, gfn);
362 return &slot->rmap[gfn - slot->base_gfn];
363}
364
365/*
366 * Reverse mapping data structures:
367 *
368 * If rmapp bit zero is zero, then rmapp points to the shadow page table
369 * entry that points to page_address(page).
370 *
371 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
372 * containing more mappings.
373 */
374static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
375{
376 struct kvm_mmu_page *sp;
377 struct kvm_rmap_desc *desc;
378 unsigned long *rmapp;
379 int i;
380
381 if (!is_rmap_pte(*spte))
382 return;
383 gfn = unalias_gfn(vcpu->kvm, gfn);
384 sp = page_header(__pa(spte));
385 sp->gfns[spte - sp->spt] = gfn;
386 rmapp = gfn_to_rmap(vcpu->kvm, gfn);
387 if (!*rmapp) {
388 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
389 *rmapp = (unsigned long)spte;
390 } else if (!(*rmapp & 1)) {
391 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
392 desc = mmu_alloc_rmap_desc(vcpu);
393 desc->shadow_ptes[0] = (u64 *)*rmapp;
394 desc->shadow_ptes[1] = spte;
395 *rmapp = (unsigned long)desc | 1;
396 } else {
397 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
398 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
399 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
400 desc = desc->more;
401 if (desc->shadow_ptes[RMAP_EXT-1]) {
402 desc->more = mmu_alloc_rmap_desc(vcpu);
403 desc = desc->more;
404 }
405 for (i = 0; desc->shadow_ptes[i]; ++i)
406 ;
407 desc->shadow_ptes[i] = spte;
408 }
409}
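/*
 * Illustrative sketch, not part of this patch: the rmap slot used above is a
 * tagged pointer.  With bit zero clear it holds a single spte pointer; with
 * bit zero set the remaining bits point to a chain of descriptors, as the
 * comment before rmap_add() describes.  The sketch_* names are made up for a
 * minimal user-space decoder of that encoding.
 */
#include <stdint.h>
#include <stdio.h>

struct sketch_rmap_desc {
	uint64_t *sptes[4];
	struct sketch_rmap_desc *more;
};

static void sketch_walk_rmap(unsigned long rmapp)
{
	struct sketch_rmap_desc *desc;
	int i;

	if (!rmapp)
		return;					/* no mappings at all */
	if (!(rmapp & 1)) {
		printf("single spte at %p\n", (void *)rmapp);
		return;
	}
	for (desc = (struct sketch_rmap_desc *)(rmapp & ~1ul); desc; desc = desc->more)
		for (i = 0; i < 4 && desc->sptes[i]; i++)
			printf("spte at %p\n", (void *)desc->sptes[i]);
}

int main(void)
{
	uint64_t spte = 0;
	struct sketch_rmap_desc desc = { .sptes = { &spte }, .more = NULL };

	sketch_walk_rmap((unsigned long)&spte);		/* bit zero clear: one spte */
	sketch_walk_rmap((unsigned long)&desc | 1);	/* bit zero set: a chain */
	return 0;
}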
410
411static void rmap_desc_remove_entry(unsigned long *rmapp,
412 struct kvm_rmap_desc *desc,
413 int i,
414 struct kvm_rmap_desc *prev_desc)
415{
416 int j;
417
418 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
419 ;
420 desc->shadow_ptes[i] = desc->shadow_ptes[j];
421 desc->shadow_ptes[j] = NULL;
422 if (j != 0)
423 return;
424 if (!prev_desc && !desc->more)
425 *rmapp = (unsigned long)desc->shadow_ptes[0];
426 else
427 if (prev_desc)
428 prev_desc->more = desc->more;
429 else
430 *rmapp = (unsigned long)desc->more | 1;
431 mmu_free_rmap_desc(desc);
432}
433
434static void rmap_remove(struct kvm *kvm, u64 *spte)
435{
436 struct kvm_rmap_desc *desc;
437 struct kvm_rmap_desc *prev_desc;
438 struct kvm_mmu_page *sp;
439 struct page *page;
440 unsigned long *rmapp;
441 int i;
442
443 if (!is_rmap_pte(*spte))
444 return;
445 sp = page_header(__pa(spte));
446 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
447 mark_page_accessed(page);
448 if (is_writeble_pte(*spte))
449 kvm_release_page_dirty(page);
450 else
451 kvm_release_page_clean(page);
452 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
453 if (!*rmapp) {
454 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
455 BUG();
456 } else if (!(*rmapp & 1)) {
457 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
458 if ((u64 *)*rmapp != spte) {
459 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
460 spte, *spte);
461 BUG();
462 }
463 *rmapp = 0;
464 } else {
465 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
466 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
467 prev_desc = NULL;
468 while (desc) {
469 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
470 if (desc->shadow_ptes[i] == spte) {
471 rmap_desc_remove_entry(rmapp,
472 desc, i,
473 prev_desc);
474 return;
475 }
476 prev_desc = desc;
477 desc = desc->more;
478 }
479 BUG();
480 }
481}
482
483static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
484{
485 struct kvm_rmap_desc *desc;
486 struct kvm_rmap_desc *prev_desc;
487 u64 *prev_spte;
488 int i;
489
490 if (!*rmapp)
491 return NULL;
492 else if (!(*rmapp & 1)) {
493 if (!spte)
494 return (u64 *)*rmapp;
495 return NULL;
496 }
497 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
498 prev_desc = NULL;
499 prev_spte = NULL;
500 while (desc) {
501 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
502 if (prev_spte == spte)
503 return desc->shadow_ptes[i];
504 prev_spte = desc->shadow_ptes[i];
505 }
506 desc = desc->more;
507 }
508 return NULL;
509}
510
511static void rmap_write_protect(struct kvm *kvm, u64 gfn)
512{
513 unsigned long *rmapp;
514 u64 *spte;
515 int write_protected = 0;
516
517 gfn = unalias_gfn(kvm, gfn);
518 rmapp = gfn_to_rmap(kvm, gfn);
519
520 spte = rmap_next(kvm, rmapp, NULL);
521 while (spte) {
522 BUG_ON(!spte);
523 BUG_ON(!(*spte & PT_PRESENT_MASK));
524 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
525 if (is_writeble_pte(*spte)) {
526 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
527 write_protected = 1;
528 }
529 spte = rmap_next(kvm, rmapp, spte);
530 }
531 if (write_protected)
532 kvm_flush_remote_tlbs(kvm);
533}
534
535#ifdef MMU_DEBUG
536static int is_empty_shadow_page(u64 *spt)
537{
538 u64 *pos;
539 u64 *end;
540
541 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
542 if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
543 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
544 pos, *pos);
545 return 0;
546 }
547 return 1;
548}
549#endif
550
551static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
552{
553 ASSERT(is_empty_shadow_page(sp->spt));
554 list_del(&sp->link);
555 __free_page(virt_to_page(sp->spt));
556 __free_page(virt_to_page(sp->gfns));
557 kfree(sp);
558 ++kvm->arch.n_free_mmu_pages;
559}
560
561static unsigned kvm_page_table_hashfn(gfn_t gfn)
562{
563 return gfn;
564}
565
566static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
567 u64 *parent_pte)
568{
569 struct kvm_mmu_page *sp;
570
571 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
572 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
573 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
574 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
575 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
576 ASSERT(is_empty_shadow_page(sp->spt));
577 sp->slot_bitmap = 0;
578 sp->multimapped = 0;
579 sp->parent_pte = parent_pte;
580 --vcpu->kvm->arch.n_free_mmu_pages;
581 return sp;
582}
583
584static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
585 struct kvm_mmu_page *sp, u64 *parent_pte)
586{
587 struct kvm_pte_chain *pte_chain;
588 struct hlist_node *node;
589 int i;
590
591 if (!parent_pte)
592 return;
593 if (!sp->multimapped) {
594 u64 *old = sp->parent_pte;
595
596 if (!old) {
597 sp->parent_pte = parent_pte;
598 return;
599 }
600 sp->multimapped = 1;
601 pte_chain = mmu_alloc_pte_chain(vcpu);
602 INIT_HLIST_HEAD(&sp->parent_ptes);
603 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
604 pte_chain->parent_ptes[0] = old;
605 }
606 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
607 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
608 continue;
609 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
610 if (!pte_chain->parent_ptes[i]) {
611 pte_chain->parent_ptes[i] = parent_pte;
612 return;
613 }
614 }
615 pte_chain = mmu_alloc_pte_chain(vcpu);
616 BUG_ON(!pte_chain);
617 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
618 pte_chain->parent_ptes[0] = parent_pte;
619}
620
621static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
622 u64 *parent_pte)
623{
624 struct kvm_pte_chain *pte_chain;
625 struct hlist_node *node;
626 int i;
627
628 if (!sp->multimapped) {
629 BUG_ON(sp->parent_pte != parent_pte);
630 sp->parent_pte = NULL;
631 return;
632 }
633 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
634 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
635 if (!pte_chain->parent_ptes[i])
636 break;
637 if (pte_chain->parent_ptes[i] != parent_pte)
638 continue;
639 while (i + 1 < NR_PTE_CHAIN_ENTRIES
640 && pte_chain->parent_ptes[i + 1]) {
641 pte_chain->parent_ptes[i]
642 = pte_chain->parent_ptes[i + 1];
643 ++i;
644 }
645 pte_chain->parent_ptes[i] = NULL;
646 if (i == 0) {
647 hlist_del(&pte_chain->link);
648 mmu_free_pte_chain(pte_chain);
649 if (hlist_empty(&sp->parent_ptes)) {
650 sp->multimapped = 0;
651 sp->parent_pte = NULL;
652 }
653 }
654 return;
655 }
656 BUG();
657}
658
659static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
660{
661 unsigned index;
662 struct hlist_head *bucket;
663 struct kvm_mmu_page *sp;
664 struct hlist_node *node;
665
666 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
667 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
668 bucket = &kvm->arch.mmu_page_hash[index];
669 hlist_for_each_entry(sp, node, bucket, hash_link)
670 if (sp->gfn == gfn && !sp->role.metaphysical) {
671 pgprintk("%s: found role %x\n",
672 __FUNCTION__, sp->role.word);
673 return sp;
674 }
675 return NULL;
676}
677
678static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
679 gfn_t gfn,
680 gva_t gaddr,
681 unsigned level,
682 int metaphysical,
683 unsigned access,
684 u64 *parent_pte,
685 bool *new_page)
686{
687 union kvm_mmu_page_role role;
688 unsigned index;
689 unsigned quadrant;
690 struct hlist_head *bucket;
691 struct kvm_mmu_page *sp;
692 struct hlist_node *node;
693
694 role.word = 0;
695 role.glevels = vcpu->arch.mmu.root_level;
696 role.level = level;
697 role.metaphysical = metaphysical;
698 role.access = access;
699 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
700 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
701 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
702 role.quadrant = quadrant;
703 }
704 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
705 gfn, role.word);
706 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
707 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
708 hlist_for_each_entry(sp, node, bucket, hash_link)
709 if (sp->gfn == gfn && sp->role.word == role.word) {
710 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
711 pgprintk("%s: found\n", __FUNCTION__);
712 return sp;
713 }
714 ++vcpu->kvm->stat.mmu_cache_miss;
715 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
716 if (!sp)
717 return sp;
718 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
719 sp->gfn = gfn;
720 sp->role = role;
721 hlist_add_head(&sp->hash_link, bucket);
722 vcpu->arch.mmu.prefetch_page(vcpu, sp);
723 if (!metaphysical)
724 rmap_write_protect(vcpu->kvm, gfn);
725 if (new_page)
726 *new_page = 1;
727 return sp;
728}
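/*
 * Illustrative sketch, not part of this patch: a 32-bit guest table has 1024
 * entries but a shadow table only 512, so one guest table is shadowed by
 * several shadow pages and role.quadrant records which slice a given shadow
 * page covers.  The sketch_* helper and SK_* constants are made up to restate
 * the quadrant computation from kvm_mmu_get_page() for the two guest levels.
 */
#include <assert.h>
#include <stdint.h>

#define SK_PAGE_SHIFT	12
#define SK_PT64_BITS	9	/* 512 entries per shadow table */
#define SK_PT32_BITS	10	/* 1024 entries per 32-bit guest table */

static unsigned int sketch_quadrant(uint64_t gaddr, unsigned int level)
{
	unsigned int q = gaddr >> (SK_PAGE_SHIFT + SK_PT64_BITS * level);

	return q & ((1u << ((SK_PT32_BITS - SK_PT64_BITS) * level)) - 1);
}

int main(void)
{
	/* level 1: a guest page table spans 4MB, shadowed as two 2MB halves */
	assert(sketch_quadrant(0x00100000, 1) == 0);
	assert(sketch_quadrant(0x00300000, 1) == 1);
	/* level 2: the guest page directory spans 4GB, shadowed as four 1GB quarters */
	assert(sketch_quadrant(0x40000000, 2) == 1);
	assert(sketch_quadrant(0xc0000000, 2) == 3);
	return 0;
}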
729
730static void kvm_mmu_page_unlink_children(struct kvm *kvm,
731 struct kvm_mmu_page *sp)
732{
733 unsigned i;
734 u64 *pt;
735 u64 ent;
736
737 pt = sp->spt;
738
739 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
740 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
741 if (is_shadow_present_pte(pt[i]))
742 rmap_remove(kvm, &pt[i]);
743 pt[i] = shadow_trap_nonpresent_pte;
744 }
745 kvm_flush_remote_tlbs(kvm);
746 return;
747 }
748
749 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
750 ent = pt[i];
751
752 pt[i] = shadow_trap_nonpresent_pte;
753 if (!is_shadow_present_pte(ent))
754 continue;
755 ent &= PT64_BASE_ADDR_MASK;
756 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
757 }
758 kvm_flush_remote_tlbs(kvm);
759}
760
761static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
762{
763 mmu_page_remove_parent_pte(sp, parent_pte);
764}
765
766static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
767{
768 int i;
769
770 for (i = 0; i < KVM_MAX_VCPUS; ++i)
771 if (kvm->vcpus[i])
772 kvm->vcpus[i]->arch.last_pte_updated = NULL;
773}
774
775static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
776{
777 u64 *parent_pte;
778
779 ++kvm->stat.mmu_shadow_zapped;
780 while (sp->multimapped || sp->parent_pte) {
781 if (!sp->multimapped)
782 parent_pte = sp->parent_pte;
783 else {
784 struct kvm_pte_chain *chain;
785
786 chain = container_of(sp->parent_ptes.first,
787 struct kvm_pte_chain, link);
788 parent_pte = chain->parent_ptes[0];
789 }
790 BUG_ON(!parent_pte);
791 kvm_mmu_put_page(sp, parent_pte);
792 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
793 }
794 kvm_mmu_page_unlink_children(kvm, sp);
795 if (!sp->root_count) {
796 hlist_del(&sp->hash_link);
797 kvm_mmu_free_page(kvm, sp);
798 } else
799 list_move(&sp->link, &kvm->arch.active_mmu_pages);
800 kvm_mmu_reset_last_pte_updated(kvm);
801}
802
803/*
804 * Changing the number of mmu pages allocated to the vm.
805 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
806 */
807void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
808{
809 /*
810 * If we set the number of mmu pages to be smaller than the
811 * number of active pages, we must free some mmu pages before we
812 * change the value.
813 */
814
815 if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
816 kvm_nr_mmu_pages) {
817 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
818 - kvm->arch.n_free_mmu_pages;
819
820 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
821 struct kvm_mmu_page *page;
822
823 page = container_of(kvm->arch.active_mmu_pages.prev,
824 struct kvm_mmu_page, link);
825 kvm_mmu_zap_page(kvm, page);
826 n_used_mmu_pages--;
827 }
828 kvm->arch.n_free_mmu_pages = 0;
829 }
830 else
831 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
832 - kvm->arch.n_alloc_mmu_pages;
833
834 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
835}
836
837static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
838{
839 unsigned index;
840 struct hlist_head *bucket;
841 struct kvm_mmu_page *sp;
842 struct hlist_node *node, *n;
843 int r;
844
845 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
846 r = 0;
847 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
848 bucket = &kvm->arch.mmu_page_hash[index];
849 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
850 if (sp->gfn == gfn && !sp->role.metaphysical) {
851 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
852 sp->role.word);
853 kvm_mmu_zap_page(kvm, sp);
854 r = 1;
855 }
856 return r;
857}
858
859static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
860{
861 struct kvm_mmu_page *sp;
862
863 while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
864 pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
865 kvm_mmu_zap_page(kvm, sp);
866 }
867}
868
869static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
870{
871 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
872 struct kvm_mmu_page *sp = page_header(__pa(pte));
873
874 __set_bit(slot, &sp->slot_bitmap);
875}
876
877struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
878{
879 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
880
881 if (gpa == UNMAPPED_GVA)
882 return NULL;
883 return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
884}
885
886static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
887 unsigned pt_access, unsigned pte_access,
888 int user_fault, int write_fault, int dirty,
889 int *ptwrite, gfn_t gfn, struct page *page)
890{
891 u64 spte;
892 int was_rmapped = is_rmap_pte(*shadow_pte);
893 int was_writeble = is_writeble_pte(*shadow_pte);
894
895 pgprintk("%s: spte %llx access %x write_fault %d"
896 " user_fault %d gfn %lx\n",
897 __FUNCTION__, *shadow_pte, pt_access,
898 write_fault, user_fault, gfn);
899
900 /*
901 * We don't set the accessed bit, since we sometimes want to see
902 * whether the guest actually used the pte (in order to detect
903 * demand paging).
904 */
905 spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
906 if (!dirty)
907 pte_access &= ~ACC_WRITE_MASK;
908 if (!(pte_access & ACC_EXEC_MASK))
909 spte |= PT64_NX_MASK;
910
911 spte |= PT_PRESENT_MASK;
912 if (pte_access & ACC_USER_MASK)
913 spte |= PT_USER_MASK;
914
915 if (is_error_page(page)) {
916 set_shadow_pte(shadow_pte,
917 shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
918 kvm_release_page_clean(page);
919 return;
920 }
921
922 spte |= page_to_phys(page);
923
924 if ((pte_access & ACC_WRITE_MASK)
925 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
926 struct kvm_mmu_page *shadow;
927
928 spte |= PT_WRITABLE_MASK;
929 if (user_fault) {
930 mmu_unshadow(vcpu->kvm, gfn);
931 goto unshadowed;
932 }
933
934 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
935 if (shadow) {
936 pgprintk("%s: found shadow page for %lx, marking ro\n",
937 __FUNCTION__, gfn);
938 pte_access &= ~ACC_WRITE_MASK;
939 if (is_writeble_pte(spte)) {
940 spte &= ~PT_WRITABLE_MASK;
941 kvm_x86_ops->tlb_flush(vcpu);
942 }
943 if (write_fault)
944 *ptwrite = 1;
945 }
946 }
947
948unshadowed:
949
950 if (pte_access & ACC_WRITE_MASK)
951 mark_page_dirty(vcpu->kvm, gfn);
952
953 pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
954 set_shadow_pte(shadow_pte, spte);
955 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
956 if (!was_rmapped) {
957 rmap_add(vcpu, shadow_pte, gfn);
958 if (!is_rmap_pte(*shadow_pte))
959 kvm_release_page_clean(page);
960 } else {
961 if (was_writeble)
962 kvm_release_page_dirty(page);
963 else
964 kvm_release_page_clean(page);
965 }
966 if (!ptwrite || !*ptwrite)
967 vcpu->arch.last_pte_updated = shadow_pte;
968}
969
970static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
971{
972}
973
974static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
975 gfn_t gfn, struct page *page)
976{
977 int level = PT32E_ROOT_LEVEL;
978 hpa_t table_addr = vcpu->arch.mmu.root_hpa;
979 int pt_write = 0;
980
981 for (; ; level--) {
982 u32 index = PT64_INDEX(v, level);
983 u64 *table;
984
985 ASSERT(VALID_PAGE(table_addr));
986 table = __va(table_addr);
987
988 if (level == 1) {
989 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
990 0, write, 1, &pt_write, gfn, page);
991 return pt_write || is_io_pte(table[index]);
992 }
993
994 if (table[index] == shadow_trap_nonpresent_pte) {
995 struct kvm_mmu_page *new_table;
996 gfn_t pseudo_gfn;
997
998 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
999 >> PAGE_SHIFT;
1000 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1001 v, level - 1,
1002 1, ACC_ALL, &table[index],
1003 NULL);
1004 if (!new_table) {
1005 pgprintk("nonpaging_map: ENOMEM\n");
1006 kvm_release_page_clean(page);
1007 return -ENOMEM;
1008 }
1009
1010 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
1011 | PT_WRITABLE_MASK | PT_USER_MASK;
1012 }
1013 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1014 }
1015}
1016
1017static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1018{
1019 int r;
1020
1021 struct page *page;
1022
1023 down_read(&current->mm->mmap_sem);
1024 page = gfn_to_page(vcpu->kvm, gfn);
1025
1026 spin_lock(&vcpu->kvm->mmu_lock);
1027 kvm_mmu_free_some_pages(vcpu);
1028 r = __nonpaging_map(vcpu, v, write, gfn, page);
1029 spin_unlock(&vcpu->kvm->mmu_lock);
1030
1031 up_read(&current->mm->mmap_sem);
1032
1033 return r;
1034}
1035
1036
1037static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1038 struct kvm_mmu_page *sp)
1039{
1040 int i;
1041
1042 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1043 sp->spt[i] = shadow_trap_nonpresent_pte;
1044}
1045
1046static void mmu_free_roots(struct kvm_vcpu *vcpu)
1047{
1048 int i;
1049 struct kvm_mmu_page *sp;
1050
1051 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1052 return;
1053 spin_lock(&vcpu->kvm->mmu_lock);
1054#ifdef CONFIG_X86_64
1055 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1056 hpa_t root = vcpu->arch.mmu.root_hpa;
1057
1058 sp = page_header(root);
1059 --sp->root_count;
1060 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1061 spin_unlock(&vcpu->kvm->mmu_lock);
1062 return;
1063 }
1064#endif
1065 for (i = 0; i < 4; ++i) {
1066 hpa_t root = vcpu->arch.mmu.pae_root[i];
1067
1068 if (root) {
1069 root &= PT64_BASE_ADDR_MASK;
1070 sp = page_header(root);
1071 --sp->root_count;
1072 }
1073 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1074 }
1075 spin_unlock(&vcpu->kvm->mmu_lock);
1076 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1077}
1078
1079static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1080{
1081 int i;
1082 gfn_t root_gfn;
1083 struct kvm_mmu_page *sp;
1084
1085 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1086
1087#ifdef CONFIG_X86_64
1088 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1089 hpa_t root = vcpu->arch.mmu.root_hpa;
1090
1091 ASSERT(!VALID_PAGE(root));
1092 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1093 PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
1094 root = __pa(sp->spt);
1095 ++sp->root_count;
1096 vcpu->arch.mmu.root_hpa = root;
1097 return;
1098 }
1099#endif
1100 for (i = 0; i < 4; ++i) {
1101 hpa_t root = vcpu->arch.mmu.pae_root[i];
1102
1103 ASSERT(!VALID_PAGE(root));
1104 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1105 if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1106 vcpu->arch.mmu.pae_root[i] = 0;
1107 continue;
1108 }
1109 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1110 } else if (vcpu->arch.mmu.root_level == 0)
1111 root_gfn = 0;
1112 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1113 PT32_ROOT_LEVEL, !is_paging(vcpu),
1114 ACC_ALL, NULL, NULL);
1115 root = __pa(sp->spt);
1116 ++sp->root_count;
1117 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1118 }
1119 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1120}
1121
1122static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1123{
1124 return vaddr;
1125}
1126
1127static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1128 u32 error_code)
1129{
1130 gfn_t gfn;
1131 int r;
1132
1133 pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
1134 r = mmu_topup_memory_caches(vcpu);
1135 if (r)
1136 return r;
1137
1138 ASSERT(vcpu);
1139 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1140
1141 gfn = gva >> PAGE_SHIFT;
1142
1143 return nonpaging_map(vcpu, gva & PAGE_MASK,
1144 error_code & PFERR_WRITE_MASK, gfn);
1145}
1146
1147static void nonpaging_free(struct kvm_vcpu *vcpu)
1148{
1149 mmu_free_roots(vcpu);
1150}
1151
1152static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1153{
1154 struct kvm_mmu *context = &vcpu->arch.mmu;
1155
1156 context->new_cr3 = nonpaging_new_cr3;
1157 context->page_fault = nonpaging_page_fault;
1158 context->gva_to_gpa = nonpaging_gva_to_gpa;
1159 context->free = nonpaging_free;
1160 context->prefetch_page = nonpaging_prefetch_page;
1161 context->root_level = 0;
1162 context->shadow_root_level = PT32E_ROOT_LEVEL;
1163 context->root_hpa = INVALID_PAGE;
1164 return 0;
1165}
1166
1167void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1168{
1169 ++vcpu->stat.tlb_flush;
1170 kvm_x86_ops->tlb_flush(vcpu);
1171}
1172
1173static void paging_new_cr3(struct kvm_vcpu *vcpu)
1174{
1175 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
1176 mmu_free_roots(vcpu);
1177}
1178
1179static void inject_page_fault(struct kvm_vcpu *vcpu,
1180 u64 addr,
1181 u32 err_code)
1182{
1183 kvm_inject_page_fault(vcpu, addr, err_code);
1184}
1185
1186static void paging_free(struct kvm_vcpu *vcpu)
1187{
1188 nonpaging_free(vcpu);
1189}
1190
1191#define PTTYPE 64
1192#include "paging_tmpl.h"
1193#undef PTTYPE
1194
1195#define PTTYPE 32
1196#include "paging_tmpl.h"
1197#undef PTTYPE
1198
1199static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1200{
1201 struct kvm_mmu *context = &vcpu->arch.mmu;
1202
1203 ASSERT(is_pae(vcpu));
1204 context->new_cr3 = paging_new_cr3;
1205 context->page_fault = paging64_page_fault;
1206 context->gva_to_gpa = paging64_gva_to_gpa;
1207 context->prefetch_page = paging64_prefetch_page;
1208 context->free = paging_free;
1209 context->root_level = level;
1210 context->shadow_root_level = level;
1211 context->root_hpa = INVALID_PAGE;
1212 return 0;
1213}
1214
1215static int paging64_init_context(struct kvm_vcpu *vcpu)
1216{
1217 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1218}
1219
1220static int paging32_init_context(struct kvm_vcpu *vcpu)
1221{
1222 struct kvm_mmu *context = &vcpu->arch.mmu;
1223
1224 context->new_cr3 = paging_new_cr3;
1225 context->page_fault = paging32_page_fault;
1226 context->gva_to_gpa = paging32_gva_to_gpa;
1227 context->free = paging_free;
1228 context->prefetch_page = paging32_prefetch_page;
1229 context->root_level = PT32_ROOT_LEVEL;
1230 context->shadow_root_level = PT32E_ROOT_LEVEL;
1231 context->root_hpa = INVALID_PAGE;
1232 return 0;
1233}
1234
1235static int paging32E_init_context(struct kvm_vcpu *vcpu)
1236{
1237 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1238}
1239
1240static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1241{
1242 ASSERT(vcpu);
1243 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1244
1245 if (!is_paging(vcpu))
1246 return nonpaging_init_context(vcpu);
1247 else if (is_long_mode(vcpu))
1248 return paging64_init_context(vcpu);
1249 else if (is_pae(vcpu))
1250 return paging32E_init_context(vcpu);
1251 else
1252 return paging32_init_context(vcpu);
1253}
1254
1255static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1256{
1257 ASSERT(vcpu);
1258 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
1259 vcpu->arch.mmu.free(vcpu);
1260 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1261 }
1262}
1263
1264int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1265{
1266 destroy_kvm_mmu(vcpu);
1267 return init_kvm_mmu(vcpu);
1268}
1269EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1270
1271int kvm_mmu_load(struct kvm_vcpu *vcpu)
1272{
1273 int r;
1274
1275 r = mmu_topup_memory_caches(vcpu);
1276 if (r)
1277 goto out;
1278 spin_lock(&vcpu->kvm->mmu_lock);
1279 kvm_mmu_free_some_pages(vcpu);
1280 mmu_alloc_roots(vcpu);
1281 spin_unlock(&vcpu->kvm->mmu_lock);
1282 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1283 kvm_mmu_flush_tlb(vcpu);
1284out:
1285 return r;
1286}
1287EXPORT_SYMBOL_GPL(kvm_mmu_load);
1288
1289void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1290{
1291 mmu_free_roots(vcpu);
1292}
1293
1294static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1295 struct kvm_mmu_page *sp,
1296 u64 *spte)
1297{
1298 u64 pte;
1299 struct kvm_mmu_page *child;
1300
1301 pte = *spte;
1302 if (is_shadow_present_pte(pte)) {
1303 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1304 rmap_remove(vcpu->kvm, spte);
1305 else {
1306 child = page_header(pte & PT64_BASE_ADDR_MASK);
1307 mmu_page_remove_parent_pte(child, spte);
1308 }
1309 }
1310 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1311}
1312
1313static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1314 struct kvm_mmu_page *sp,
1315 u64 *spte,
1316 const void *new, int bytes,
1317 int offset_in_pte)
1318{
1319 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1320 ++vcpu->kvm->stat.mmu_pde_zapped;
1321 return;
1322 }
1323
1324 ++vcpu->kvm->stat.mmu_pte_updated;
1325 if (sp->role.glevels == PT32_ROOT_LEVEL)
1326 paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1327 else
1328 paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1329}
1330
1331static bool need_remote_flush(u64 old, u64 new)
1332{
1333 if (!is_shadow_present_pte(old))
1334 return false;
1335 if (!is_shadow_present_pte(new))
1336 return true;
1337 if ((old ^ new) & PT64_BASE_ADDR_MASK)
1338 return true;
1339 old ^= PT64_NX_MASK;
1340 new ^= PT64_NX_MASK;
1341 return (old & ~new & PT64_PERM_MASK) != 0;
1342}
1343
1344static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
1345{
1346 if (need_remote_flush(old, new))
1347 kvm_flush_remote_tlbs(vcpu->kvm);
1348 else
1349 kvm_mmu_flush_tlb(vcpu);
1350}
1351
1352static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
1353{
1354 u64 *spte = vcpu->arch.last_pte_updated;
1355
1356 return !!(spte && (*spte & PT_ACCESSED_MASK));
1357}
1358
1359static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1360 const u8 *new, int bytes)
1361{
1362 gfn_t gfn;
1363 int r;
1364 u64 gpte = 0;
1365
1366 if (bytes != 4 && bytes != 8)
1367 return;
1368
1369 /*
1370 * Assume that the pte write is on a page table of the same type
1371 * as the current vcpu paging mode. This is nearly always true
1372 * (might be false while changing modes). Note it is verified later
1373 * by update_pte().
1374 */
1375 if (is_pae(vcpu)) {
1376 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
1377 if ((bytes == 4) && (gpa % 4 == 0)) {
1378 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
1379 if (r)
1380 return;
1381 memcpy((void *)&gpte + (gpa % 8), new, 4);
1382 } else if ((bytes == 8) && (gpa % 8 == 0)) {
1383 memcpy((void *)&gpte, new, 8);
1384 }
1385 } else {
1386 if ((bytes == 4) && (gpa % 4 == 0))
1387 memcpy((void *)&gpte, new, 4);
1388 }
1389 if (!is_present_pte(gpte))
1390 return;
1391 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1392 vcpu->arch.update_pte.gfn = gfn;
1393 vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
1394}
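/*
 * Illustration only, not kernel code: a minimal userspace sketch of the
 * merge step above, in which a PAE guest's 4-byte write is folded into the
 * containing 8-byte gpte that was read back from guest memory.  The helper
 * name merge_pae_gpte() is made up, and little-endian x86 layout is assumed.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t merge_pae_gpte(uint64_t current_gpte, uint64_t gpa,
			       const void *new, int bytes)
{
	uint64_t gpte = current_gpte;	/* full gpte read back from the guest */

	if (bytes == 4 && (gpa % 4) == 0)
		/* overwrite whichever 4-byte half the guest actually wrote */
		memcpy((uint8_t *)&gpte + (gpa % 8), new, 4);
	else if (bytes == 8 && (gpa % 8) == 0)
		memcpy(&gpte, new, 8);
	return gpte;
}

int main(void)
{
	uint32_t low_half = 0x00001007;		/* present, writable, user; frame 1 */
	uint64_t old = 0xdeadbeefULL << 32;	/* pretend the high half is already set */

	/* the guest writes only the low 4 bytes of the gpte at gpa 0x3000 */
	printf("merged gpte: %#llx\n",
	       (unsigned long long)merge_pae_gpte(old, 0x3000, &low_half, 4));
	return 0;
}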
1395
1396void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1397 const u8 *new, int bytes)
1398{
1399 gfn_t gfn = gpa >> PAGE_SHIFT;
1400 struct kvm_mmu_page *sp;
1401 struct hlist_node *node, *n;
1402 struct hlist_head *bucket;
1403 unsigned index;
1404 u64 entry;
1405 u64 *spte;
1406 unsigned offset = offset_in_page(gpa);
1407 unsigned pte_size;
1408 unsigned page_offset;
1409 unsigned misaligned;
1410 unsigned quadrant;
1411 int level;
1412 int flooded = 0;
1413 int npte;
1414
1415 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1416 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1417 spin_lock(&vcpu->kvm->mmu_lock);
1418 kvm_mmu_free_some_pages(vcpu);
1419 ++vcpu->kvm->stat.mmu_pte_write;
1420 kvm_mmu_audit(vcpu, "pre pte write");
1421 if (gfn == vcpu->arch.last_pt_write_gfn
1422 && !last_updated_pte_accessed(vcpu)) {
1423 ++vcpu->arch.last_pt_write_count;
1424 if (vcpu->arch.last_pt_write_count >= 3)
1425 flooded = 1;
1426 } else {
1427 vcpu->arch.last_pt_write_gfn = gfn;
1428 vcpu->arch.last_pt_write_count = 1;
1429 vcpu->arch.last_pte_updated = NULL;
1430 }
1431 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1432 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1433 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1434 if (sp->gfn != gfn || sp->role.metaphysical)
1435 continue;
1436 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1437 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1438 misaligned |= bytes < 4;
1439 if (misaligned || flooded) {
1440 /*
1441 * Misaligned accesses are too much trouble to fix
1442 * up; also, they usually indicate a page is not used
1443 * as a page table.
1444 *
1445 * If we're seeing too many writes to a page,
1446 * it may no longer be a page table, or we may be
1447 * forking, in which case it is better to unmap the
1448 * page.
1449 */
1450 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1451 gpa, bytes, sp->role.word);
1452 kvm_mmu_zap_page(vcpu->kvm, sp);
1453 ++vcpu->kvm->stat.mmu_flooded;
1454 continue;
1455 }
1456 page_offset = offset;
1457 level = sp->role.level;
1458 npte = 1;
1459 if (sp->role.glevels == PT32_ROOT_LEVEL) {
1460 page_offset <<= 1; /* 32->64 */
1461 /*
1462 * A 32-bit pde maps 4MB while the shadow pdes map
1463 * only 2MB. So we need to double the offset again
1464 * and zap two pdes instead of one.
1465 */
1466 if (level == PT32_ROOT_LEVEL) {
1467 page_offset &= ~7; /* kill rounding error */
1468 page_offset <<= 1;
1469 npte = 2;
1470 }
1471 quadrant = page_offset >> PAGE_SHIFT;
1472 page_offset &= ~PAGE_MASK;
1473 if (quadrant != sp->role.quadrant)
1474 continue;
1475 }
1476 spte = &sp->spt[page_offset / sizeof(*spte)];
1477 while (npte--) {
1478 entry = *spte;
1479 mmu_pte_write_zap_pte(vcpu, sp, spte);
1480 mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
1481 page_offset & (pte_size - 1));
1482 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1483 ++spte;
1484 }
1485 }
1486 kvm_mmu_audit(vcpu, "post pte write");
1487 spin_unlock(&vcpu->kvm->mmu_lock);
1488 if (vcpu->arch.update_pte.page) {
1489 kvm_release_page_clean(vcpu->arch.update_pte.page);
1490 vcpu->arch.update_pte.page = NULL;
1491 }
1492}
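/*
 * Illustration only: the "misaligned" test in kvm_mmu_pte_write() above uses
 * an XOR trick -- the first and last byte written differ in some bit at or
 * above the pte size exactly when the write straddles a pte boundary.
 * Standalone userspace sketch of just that check.
 */
#include <stdio.h>

static unsigned misaligned_write(unsigned offset, unsigned bytes,
				 unsigned pte_size)
{
	unsigned m = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);

	return m | (bytes < 4);	/* short writes are treated as misaligned too */
}

int main(void)
{
	/* 8-byte write at offset 8 with 8-byte ptes: one slot, aligned (0) */
	printf("%u\n", misaligned_write(8, 8, 8));
	/* 4-byte write at offset 6 with 8-byte ptes: crosses a slot (non-zero) */
	printf("%u\n", misaligned_write(6, 4, 8));
	return 0;
}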
1493
1494int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1495{
1496 gpa_t gpa;
1497 int r;
1498
1499 down_read(&current->mm->mmap_sem);
1500 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1501 up_read(&current->mm->mmap_sem);
1502
1503 spin_lock(&vcpu->kvm->mmu_lock);
1504 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1505 spin_unlock(&vcpu->kvm->mmu_lock);
1506 return r;
1507}
1508
1509void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1510{
1511 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1512 struct kvm_mmu_page *sp;
1513
1514 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1515 struct kvm_mmu_page, link);
1516 kvm_mmu_zap_page(vcpu->kvm, sp);
1517 ++vcpu->kvm->stat.mmu_recycled;
1518 }
1519}
1520
1521int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
1522{
1523 int r;
1524 enum emulation_result er;
1525
1526 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
1527 if (r < 0)
1528 goto out;
1529
1530 if (!r) {
1531 r = 1;
1532 goto out;
1533 }
1534
1535 r = mmu_topup_memory_caches(vcpu);
1536 if (r)
1537 goto out;
1538
1539 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
1540
1541 switch (er) {
1542 case EMULATE_DONE:
1543 return 1;
1544 case EMULATE_DO_MMIO:
1545 ++vcpu->stat.mmio_exits;
1546 return 0;
1547 case EMULATE_FAIL:
1548 kvm_report_emulation_failure(vcpu, "pagetable");
1549 return 1;
1550 default:
1551 BUG();
1552 }
1553out:
1554 return r;
1555}
1556EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1557
1558static void free_mmu_pages(struct kvm_vcpu *vcpu)
1559{
1560 struct kvm_mmu_page *sp;
1561
1562 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
1563 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1564 struct kvm_mmu_page, link);
1565 kvm_mmu_zap_page(vcpu->kvm, sp);
1566 }
1567 free_page((unsigned long)vcpu->arch.mmu.pae_root);
1568}
1569
1570static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1571{
1572 struct page *page;
1573 int i;
1574
1575 ASSERT(vcpu);
1576
1577 if (vcpu->kvm->arch.n_requested_mmu_pages)
1578 vcpu->kvm->arch.n_free_mmu_pages =
1579 vcpu->kvm->arch.n_requested_mmu_pages;
1580 else
1581 vcpu->kvm->arch.n_free_mmu_pages =
1582 vcpu->kvm->arch.n_alloc_mmu_pages;
1583 /*
1584 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1585 * Therefore we need to allocate shadow page tables in the first
1586 * 4GB of memory, which happens to fit the DMA32 zone.
1587 */
1588 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1589 if (!page)
1590 goto error_1;
1591 vcpu->arch.mmu.pae_root = page_address(page);
1592 for (i = 0; i < 4; ++i)
1593 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1594
1595 return 0;
1596
1597error_1:
1598 free_mmu_pages(vcpu);
1599 return -ENOMEM;
1600}
1601
1602int kvm_mmu_create(struct kvm_vcpu *vcpu)
1603{
1604 ASSERT(vcpu);
1605 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1606
1607 return alloc_mmu_pages(vcpu);
1608}
1609
1610int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1611{
1612 ASSERT(vcpu);
1613 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1614
1615 return init_kvm_mmu(vcpu);
1616}
1617
1618void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1619{
1620 ASSERT(vcpu);
1621
1622 destroy_kvm_mmu(vcpu);
1623 free_mmu_pages(vcpu);
1624 mmu_free_memory_caches(vcpu);
1625}
1626
1627void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1628{
1629 struct kvm_mmu_page *sp;
1630
1631 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
1632 int i;
1633 u64 *pt;
1634
1635 if (!test_bit(slot, &sp->slot_bitmap))
1636 continue;
1637
1638 pt = sp->spt;
1639 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1640 /* avoid RMW */
1641 if (pt[i] & PT_WRITABLE_MASK)
1642 pt[i] &= ~PT_WRITABLE_MASK;
1643 }
1644}
1645
1646void kvm_mmu_zap_all(struct kvm *kvm)
1647{
1648 struct kvm_mmu_page *sp, *node;
1649
1650 spin_lock(&kvm->mmu_lock);
1651 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1652 kvm_mmu_zap_page(kvm, sp);
1653 spin_unlock(&kvm->mmu_lock);
1654
1655 kvm_flush_remote_tlbs(kvm);
1656}
1657
1658void kvm_mmu_module_exit(void)
1659{
1660 if (pte_chain_cache)
1661 kmem_cache_destroy(pte_chain_cache);
1662 if (rmap_desc_cache)
1663 kmem_cache_destroy(rmap_desc_cache);
1664 if (mmu_page_header_cache)
1665 kmem_cache_destroy(mmu_page_header_cache);
1666}
1667
1668int kvm_mmu_module_init(void)
1669{
1670 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1671 sizeof(struct kvm_pte_chain),
1672 0, 0, NULL);
1673 if (!pte_chain_cache)
1674 goto nomem;
1675 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1676 sizeof(struct kvm_rmap_desc),
1677 0, 0, NULL);
1678 if (!rmap_desc_cache)
1679 goto nomem;
1680
1681 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1682 sizeof(struct kvm_mmu_page),
1683 0, 0, NULL);
1684 if (!mmu_page_header_cache)
1685 goto nomem;
1686
1687 return 0;
1688
1689nomem:
1690 kvm_mmu_module_exit();
1691 return -ENOMEM;
1692}
1693
1694/*
1695 * Calculate mmu pages needed for kvm.
1696 */
1697unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
1698{
1699 int i;
1700 unsigned int nr_mmu_pages;
1701 unsigned int nr_pages = 0;
1702
1703 for (i = 0; i < kvm->nmemslots; i++)
1704 nr_pages += kvm->memslots[i].npages;
1705
1706 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
1707 nr_mmu_pages = max(nr_mmu_pages,
1708 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
1709
1710 return nr_mmu_pages;
1711}
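/*
 * Worked example of the sizing rule above.  The constants live in a header
 * outside this file; the values of 20 per mille and a 64-page floor are
 * assumed here purely for illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned int permille = 20;	/* assumed KVM_PERMILLE_MMU_PAGES */
	unsigned int min_alloc = 64;	/* assumed KVM_MIN_ALLOC_MMU_PAGES */
	unsigned int nr_pages = 131072;	/* a 512 MB guest in 4 KB pages */
	unsigned int nr_mmu_pages = nr_pages * permille / 1000;

	if (nr_mmu_pages < min_alloc)
		nr_mmu_pages = min_alloc;
	printf("%u shadow pages\n", nr_mmu_pages);	/* prints 2621 */
	return 0;
}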
1712
1713#ifdef AUDIT
1714
1715static const char *audit_msg;
1716
1717static gva_t canonicalize(gva_t gva)
1718{
1719#ifdef CONFIG_X86_64
1720 gva = (long long)(gva << 16) >> 16;
1721#endif
1722 return gva;
1723}
1724
1725static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1726 gva_t va, int level)
1727{
1728 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1729 int i;
1730 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1731
1732 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1733 u64 ent = pt[i];
1734
1735 if (ent == shadow_trap_nonpresent_pte)
1736 continue;
1737
1738 va = canonicalize(va);
1739 if (level > 1) {
1740 if (ent == shadow_notrap_nonpresent_pte)
1741 printk(KERN_ERR "audit: (%s) nontrapping pte"
1742 " in nonleaf level: levels %d gva %lx"
1743 " level %d pte %llx\n", audit_msg,
1744 vcpu->arch.mmu.root_level, va, level, ent);
1745
1746 audit_mappings_page(vcpu, ent, va, level - 1);
1747 } else {
1748 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
1749 struct page *page = gpa_to_page(vcpu, gpa);
1750 hpa_t hpa = page_to_phys(page);
1751
1752 if (is_shadow_present_pte(ent)
1753 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1754 printk(KERN_ERR "xx audit error: (%s) levels %d"
1755 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1756 audit_msg, vcpu->arch.mmu.root_level,
1757 va, gpa, hpa, ent,
1758 is_shadow_present_pte(ent));
1759 else if (ent == shadow_notrap_nonpresent_pte
1760 && !is_error_hpa(hpa))
1761 printk(KERN_ERR "audit: (%s) notrap shadow,"
1762 " valid guest gva %lx\n", audit_msg, va);
1763 kvm_release_page_clean(page);
1764
1765 }
1766 }
1767}
1768
1769static void audit_mappings(struct kvm_vcpu *vcpu)
1770{
1771 unsigned i;
1772
1773 if (vcpu->arch.mmu.root_level == 4)
1774 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
1775 else
1776 for (i = 0; i < 4; ++i)
1777 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
1778 audit_mappings_page(vcpu,
1779 vcpu->arch.mmu.pae_root[i],
1780 i << 30,
1781 2);
1782}
1783
1784static int count_rmaps(struct kvm_vcpu *vcpu)
1785{
1786 int nmaps = 0;
1787 int i, j, k;
1788
1789 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1790 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1791 struct kvm_rmap_desc *d;
1792
1793 for (j = 0; j < m->npages; ++j) {
1794 unsigned long *rmapp = &m->rmap[j];
1795
1796 if (!*rmapp)
1797 continue;
1798 if (!(*rmapp & 1)) {
1799 ++nmaps;
1800 continue;
1801 }
1802 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
1803 while (d) {
1804 for (k = 0; k < RMAP_EXT; ++k)
1805 if (d->shadow_ptes[k])
1806 ++nmaps;
1807 else
1808 break;
1809 d = d->more;
1810 }
1811 }
1812 }
1813 return nmaps;
1814}
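/*
 * Illustration of the rmap encoding that count_rmaps() decodes above: a
 * reverse-map slot either points at a single spte directly, or, with bit 0
 * set, at a chain of descriptors each holding up to RMAP_EXT sptes.
 * Standalone userspace sketch with simplified, made-up types.
 */
#include <stdint.h>
#include <stdio.h>

#define RMAP_EXT 4

struct rmap_desc {
	uint64_t *sptes[RMAP_EXT];
	struct rmap_desc *more;
};

static int count_slot(unsigned long slot)
{
	int n = 0;

	if (!slot)
		return 0;
	if (!(slot & 1))
		return 1;		/* the slot holds a single spte pointer */
	for (struct rmap_desc *d = (struct rmap_desc *)(slot & ~1ul);
	     d; d = d->more)
		for (int k = 0; k < RMAP_EXT && d->sptes[k]; ++k)
			++n;		/* descriptors are packed from index 0 */
	return n;
}

int main(void)
{
	uint64_t a, b;
	struct rmap_desc d = { .sptes = { &a, &b } };

	printf("%d %d\n", count_slot((unsigned long)&a),	/* 1 */
	       count_slot((unsigned long)&d | 1));		/* 2 */
	return 0;
}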
1815
1816static int count_writable_mappings(struct kvm_vcpu *vcpu)
1817{
1818 int nmaps = 0;
1819 struct kvm_mmu_page *sp;
1820 int i;
1821
1822 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1823 u64 *pt = sp->spt;
1824
1825 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
1826 continue;
1827
1828 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1829 u64 ent = pt[i];
1830
1831 if (!(ent & PT_PRESENT_MASK))
1832 continue;
1833 if (!(ent & PT_WRITABLE_MASK))
1834 continue;
1835 ++nmaps;
1836 }
1837 }
1838 return nmaps;
1839}
1840
1841static void audit_rmap(struct kvm_vcpu *vcpu)
1842{
1843 int n_rmap = count_rmaps(vcpu);
1844 int n_actual = count_writable_mappings(vcpu);
1845
1846 if (n_rmap != n_actual)
1847 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1848 __FUNCTION__, audit_msg, n_rmap, n_actual);
1849}
1850
1851static void audit_write_protection(struct kvm_vcpu *vcpu)
1852{
1853 struct kvm_mmu_page *sp;
1854 struct kvm_memory_slot *slot;
1855 unsigned long *rmapp;
1856 gfn_t gfn;
1857
1858 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1859 if (sp->role.metaphysical)
1860 continue;
1861
1862 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
1863 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
1864 rmapp = &slot->rmap[gfn - slot->base_gfn];
1865 if (*rmapp)
1866 printk(KERN_ERR "%s: (%s) shadow page has writable"
1867 " mappings: gfn %lx role %x\n",
1868 __FUNCTION__, audit_msg, sp->gfn,
1869 sp->role.word);
1870 }
1871}
1872
1873static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1874{
1875 int olddbg = dbg;
1876
1877 dbg = 0;
1878 audit_msg = msg;
1879 audit_rmap(vcpu);
1880 audit_write_protection(vcpu);
1881 audit_mappings(vcpu);
1882 dbg = olddbg;
1883}
1884
1885#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 00000000000..1fce19ec7a2
--- /dev/null
+++ b/arch/x86/kvm/mmu.h
@@ -0,0 +1,44 @@
1#ifndef __KVM_X86_MMU_H
2#define __KVM_X86_MMU_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
7{
8 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
9 __kvm_mmu_free_some_pages(vcpu);
10}
11
12static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
13{
14 if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
15 return 0;
16
17 return kvm_mmu_load(vcpu);
18}
19
20static inline int is_long_mode(struct kvm_vcpu *vcpu)
21{
22#ifdef CONFIG_X86_64
23 return vcpu->arch.shadow_efer & EFER_LME;
24#else
25 return 0;
26#endif
27}
28
29static inline int is_pae(struct kvm_vcpu *vcpu)
30{
31 return vcpu->arch.cr4 & X86_CR4_PAE;
32}
33
34static inline int is_pse(struct kvm_vcpu *vcpu)
35{
36 return vcpu->arch.cr4 & X86_CR4_PSE;
37}
38
39static inline int is_paging(struct kvm_vcpu *vcpu)
40{
41 return vcpu->arch.cr0 & X86_CR0_PG;
42}
43
44#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644
index 00000000000..03ba8608fe0
--- /dev/null
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -0,0 +1,484 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg
38 #else
39 #define CMPXCHG cmpxchg64
40 #define PT_MAX_FULL_LEVELS 2
41 #endif
42#elif PTTYPE == 32
43 #define pt_element_t u32
44 #define guest_walker guest_walker32
45 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2
53 #define CMPXCHG cmpxchg
54#else
55 #error Invalid PTTYPE value
56#endif
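/*
 * Toy illustration of the FNAME() token pasting selected above: mmu.c
 * includes this header twice, once with PTTYPE 64 and once with PTTYPE 32,
 * so every FNAME(foo) expands to paging64_foo in one copy and paging32_foo
 * in the other.  Standalone sketch; walker_levels is a made-up example name.
 */
#include <stdio.h>

#define FNAME(name) paging##64_##name
static int FNAME(walker_levels)(void) { return 4; }	/* paging64_walker_levels */
#undef FNAME

#define FNAME(name) paging##32_##name
static int FNAME(walker_levels)(void) { return 2; }	/* paging32_walker_levels */
#undef FNAME

int main(void)
{
	printf("%d %d\n", paging64_walker_levels(), paging32_walker_levels());
	return 0;
}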
57
58#define gpte_to_gfn FNAME(gpte_to_gfn)
59#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
60
61/*
62 * The guest_walker structure emulates the behavior of the hardware page
63 * table walker.
64 */
65struct guest_walker {
66 int level;
67 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
68 pt_element_t ptes[PT_MAX_FULL_LEVELS];
69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
70 unsigned pt_access;
71 unsigned pte_access;
72 gfn_t gfn;
73 u32 error_code;
74};
75
76static gfn_t gpte_to_gfn(pt_element_t gpte)
77{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
79}
80
81static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
82{
83 return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
84}
85
86static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
87 gfn_t table_gfn, unsigned index,
88 pt_element_t orig_pte, pt_element_t new_pte)
89{
90 pt_element_t ret;
91 pt_element_t *table;
92 struct page *page;
93
94 page = gfn_to_page(kvm, table_gfn);
95 table = kmap_atomic(page, KM_USER0);
96
97 ret = CMPXCHG(&table[index], orig_pte, new_pte);
98
99 kunmap_atomic(table, KM_USER0);
100
101 kvm_release_page_dirty(page);
102
103 return (ret != orig_pte);
104}
105
106static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
107{
108 unsigned access;
109
110 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
111#if PTTYPE == 64
112 if (is_nx(vcpu))
113 access &= ~(gpte >> PT64_NX_SHIFT);
114#endif
115 return access;
116}
117
118/*
119 * Fetch a guest pte for a guest virtual address
120 */
121static int FNAME(walk_addr)(struct guest_walker *walker,
122 struct kvm_vcpu *vcpu, gva_t addr,
123 int write_fault, int user_fault, int fetch_fault)
124{
125 pt_element_t pte;
126 gfn_t table_gfn;
127 unsigned index, pt_access, pte_access;
128 gpa_t pte_gpa;
129
130 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
131walk:
132 walker->level = vcpu->arch.mmu.root_level;
133 pte = vcpu->arch.cr3;
134#if PTTYPE == 64
135 if (!is_long_mode(vcpu)) {
136 pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
137 if (!is_present_pte(pte))
138 goto not_present;
139 --walker->level;
140 }
141#endif
142 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
143	       (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
144
145 pt_access = ACC_ALL;
146
147 for (;;) {
148 index = PT_INDEX(addr, walker->level);
149
150 table_gfn = gpte_to_gfn(pte);
151 pte_gpa = gfn_to_gpa(table_gfn);
152 pte_gpa += index * sizeof(pt_element_t);
153 walker->table_gfn[walker->level - 1] = table_gfn;
154 walker->pte_gpa[walker->level - 1] = pte_gpa;
155 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
156 walker->level - 1, table_gfn);
157
158 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
159
160 if (!is_present_pte(pte))
161 goto not_present;
162
163 if (write_fault && !is_writeble_pte(pte))
164 if (user_fault || is_write_protection(vcpu))
165 goto access_error;
166
167 if (user_fault && !(pte & PT_USER_MASK))
168 goto access_error;
169
170#if PTTYPE == 64
171 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
172 goto access_error;
173#endif
174
175 if (!(pte & PT_ACCESSED_MASK)) {
176 mark_page_dirty(vcpu->kvm, table_gfn);
177 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
178 index, pte, pte|PT_ACCESSED_MASK))
179 goto walk;
180 pte |= PT_ACCESSED_MASK;
181 }
182
183 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
184
185 walker->ptes[walker->level - 1] = pte;
186
187 if (walker->level == PT_PAGE_TABLE_LEVEL) {
188 walker->gfn = gpte_to_gfn(pte);
189 break;
190 }
191
192 if (walker->level == PT_DIRECTORY_LEVEL
193 && (pte & PT_PAGE_SIZE_MASK)
194 && (PTTYPE == 64 || is_pse(vcpu))) {
195 walker->gfn = gpte_to_gfn_pde(pte);
196 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
197 if (PTTYPE == 32 && is_cpuid_PSE36())
198 walker->gfn += pse36_gfn_delta(pte);
199 break;
200 }
201
202 pt_access = pte_access;
203 --walker->level;
204 }
205
206 if (write_fault && !is_dirty_pte(pte)) {
207 bool ret;
208
209 mark_page_dirty(vcpu->kvm, table_gfn);
210 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
211 pte|PT_DIRTY_MASK);
212 if (ret)
213 goto walk;
214 pte |= PT_DIRTY_MASK;
215 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
216 walker->ptes[walker->level - 1] = pte;
217 }
218
219 walker->pt_access = pt_access;
220 walker->pte_access = pte_access;
221 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
222 __FUNCTION__, (u64)pte, pt_access, pte_access);
223 return 1;
224
225not_present:
226 walker->error_code = 0;
227 goto err;
228
229access_error:
230 walker->error_code = PFERR_PRESENT_MASK;
231
232err:
233 if (write_fault)
234 walker->error_code |= PFERR_WRITE_MASK;
235 if (user_fault)
236 walker->error_code |= PFERR_USER_MASK;
237 if (fetch_fault)
238 walker->error_code |= PFERR_FETCH_MASK;
239 return 0;
240}
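/*
 * Illustration of the "set the accessed/dirty bit with cmpxchg and restart
 * the walk if the gpte changed underneath us" pattern used by walk_addr()
 * above.  Userspace sketch with C11 atomics standing in for cmpxchg_gpte();
 * pte_set_flag_or_retry() and the constants are made up for this example.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static int pte_set_flag_or_retry(_Atomic uint64_t *pte, uint64_t flag)
{
	uint64_t old = atomic_load(pte);

	/* fails (returns 0) if another writer changed the pte in between */
	return atomic_compare_exchange_strong(pte, &old, old | flag);
}

int main(void)
{
	_Atomic uint64_t gpte = 0x1007;			/* present, writable, user */
	const uint64_t PT_ACCESSED = 1ull << 5;		/* x86 Accessed bit */

	while (!pte_set_flag_or_retry(&gpte, PT_ACCESSED))
		;	/* the real code does "goto walk" and redoes the whole walk */
	printf("gpte now %#llx\n", (unsigned long long)atomic_load(&gpte));
	return 0;
}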
241
242static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
243 u64 *spte, const void *pte, int bytes,
244 int offset_in_pte)
245{
246 pt_element_t gpte;
247 unsigned pte_access;
248 struct page *npage;
249
250 gpte = *(const pt_element_t *)pte;
251 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
252 if (!offset_in_pte && !is_present_pte(gpte))
253 set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
254 return;
255 }
256 if (bytes < sizeof(pt_element_t))
257 return;
258 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
259 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
260 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
261 return;
262 npage = vcpu->arch.update_pte.page;
263 if (!npage)
264 return;
265 get_page(npage);
266 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
267 gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
268}
269
270/*
271 * Fetch a shadow pte for a specific level in the paging hierarchy.
272 */
273static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
274 struct guest_walker *walker,
275 int user_fault, int write_fault, int *ptwrite,
276 struct page *page)
277{
278 hpa_t shadow_addr;
279 int level;
280 u64 *shadow_ent;
281 unsigned access = walker->pt_access;
282
283 if (!is_present_pte(walker->ptes[walker->level - 1]))
284 return NULL;
285
286 shadow_addr = vcpu->arch.mmu.root_hpa;
287 level = vcpu->arch.mmu.shadow_root_level;
288 if (level == PT32E_ROOT_LEVEL) {
289 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
290 shadow_addr &= PT64_BASE_ADDR_MASK;
291 --level;
292 }
293
294 for (; ; level--) {
295 u32 index = SHADOW_PT_INDEX(addr, level);
296 struct kvm_mmu_page *shadow_page;
297 u64 shadow_pte;
298 int metaphysical;
299 gfn_t table_gfn;
300 bool new_page = 0;
301
302 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
303 if (level == PT_PAGE_TABLE_LEVEL)
304 break;
305 if (is_shadow_present_pte(*shadow_ent)) {
306 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
307 continue;
308 }
309
310 if (level - 1 == PT_PAGE_TABLE_LEVEL
311 && walker->level == PT_DIRECTORY_LEVEL) {
312 metaphysical = 1;
313 if (!is_dirty_pte(walker->ptes[level - 1]))
314 access &= ~ACC_WRITE_MASK;
315 table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
316 } else {
317 metaphysical = 0;
318 table_gfn = walker->table_gfn[level - 2];
319 }
320 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
321 metaphysical, access,
322 shadow_ent, &new_page);
323 if (new_page && !metaphysical) {
324 int r;
325 pt_element_t curr_pte;
326 r = kvm_read_guest_atomic(vcpu->kvm,
327 walker->pte_gpa[level - 2],
328 &curr_pte, sizeof(curr_pte));
329 if (r || curr_pte != walker->ptes[level - 2]) {
330 kvm_release_page_clean(page);
331 return NULL;
332 }
333 }
334 shadow_addr = __pa(shadow_page->spt);
335 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
336 | PT_WRITABLE_MASK | PT_USER_MASK;
337 *shadow_ent = shadow_pte;
338 }
339
340 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
341 user_fault, write_fault,
342 walker->ptes[walker->level-1] & PT_DIRTY_MASK,
343 ptwrite, walker->gfn, page);
344
345 return shadow_ent;
346}
347
348/*
349 * Page fault handler. There are several causes for a page fault:
350 * - there is no shadow pte for the guest pte
351 * - write access through a shadow pte marked read only so that we can set
352 * the dirty bit
353 * - write access to a shadow pte marked read only so we can update the page
354 * dirty bitmap, when userspace requests it
355 * - mmio access; in this case we will never install a present shadow pte
356 * - normal guest page fault due to the guest pte marked not present, not
357 * writable, or not executable
358 *
359 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
360 * a negative value on error.
361 */
362static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
363 u32 error_code)
364{
365 int write_fault = error_code & PFERR_WRITE_MASK;
366 int user_fault = error_code & PFERR_USER_MASK;
367 int fetch_fault = error_code & PFERR_FETCH_MASK;
368 struct guest_walker walker;
369 u64 *shadow_pte;
370 int write_pt = 0;
371 int r;
372 struct page *page;
373
374 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
375 kvm_mmu_audit(vcpu, "pre page fault");
376
377 r = mmu_topup_memory_caches(vcpu);
378 if (r)
379 return r;
380
381 down_read(&current->mm->mmap_sem);
382 /*
383 * Look up the shadow pte for the faulting address.
384 */
385 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
386 fetch_fault);
387
388 /*
389 * The page is not mapped by the guest. Let the guest handle it.
390 */
391 if (!r) {
392 pgprintk("%s: guest page fault\n", __FUNCTION__);
393 inject_page_fault(vcpu, addr, walker.error_code);
394 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
395 up_read(&current->mm->mmap_sem);
396 return 0;
397 }
398
399 page = gfn_to_page(vcpu->kvm, walker.gfn);
400
401 spin_lock(&vcpu->kvm->mmu_lock);
402 kvm_mmu_free_some_pages(vcpu);
403 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
404 &write_pt, page);
405 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
406 shadow_pte, *shadow_pte, write_pt);
407
408 if (!write_pt)
409 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
410
411 /*
412	 * mmio: emulate if accessible, otherwise it's a guest fault.
413 */
414 if (shadow_pte && is_io_pte(*shadow_pte)) {
415 spin_unlock(&vcpu->kvm->mmu_lock);
416 up_read(&current->mm->mmap_sem);
417 return 1;
418 }
419
420 ++vcpu->stat.pf_fixed;
421 kvm_mmu_audit(vcpu, "post page fault (fixed)");
422 spin_unlock(&vcpu->kvm->mmu_lock);
423 up_read(&current->mm->mmap_sem);
424
425 return write_pt;
426}
427
428static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
429{
430 struct guest_walker walker;
431 gpa_t gpa = UNMAPPED_GVA;
432 int r;
433
434 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
435
436 if (r) {
437 gpa = gfn_to_gpa(walker.gfn);
438 gpa |= vaddr & ~PAGE_MASK;
439 }
440
441 return gpa;
442}
443
444static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
445 struct kvm_mmu_page *sp)
446{
447 int i, offset = 0, r = 0;
448 pt_element_t pt;
449
450 if (sp->role.metaphysical
451 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
452 nonpaging_prefetch_page(vcpu, sp);
453 return;
454 }
455
456 if (PTTYPE == 32)
457 offset = sp->role.quadrant << PT64_LEVEL_BITS;
458
459 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
460 gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
461 pte_gpa += (i+offset) * sizeof(pt_element_t);
462
463 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
464 sizeof(pt_element_t));
465 if (r || is_present_pte(pt))
466 sp->spt[i] = shadow_trap_nonpresent_pte;
467 else
468 sp->spt[i] = shadow_notrap_nonpresent_pte;
469 }
470}
471
472#undef pt_element_t
473#undef guest_walker
474#undef FNAME
475#undef PT_BASE_ADDR_MASK
476#undef PT_INDEX
477#undef SHADOW_PT_INDEX
478#undef PT_LEVEL_MASK
479#undef PT_DIR_BASE_ADDR_MASK
480#undef PT_LEVEL_BITS
481#undef PT_MAX_FULL_LEVELS
482#undef gpte_to_gfn
483#undef gpte_to_gfn_pde
484#undef CMPXCHG
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
new file mode 100644
index 00000000000..56fc4c87338
--- /dev/null
+++ b/arch/x86/kvm/segment_descriptor.h
@@ -0,0 +1,29 @@
1#ifndef __SEGMENT_DESCRIPTOR_H
2#define __SEGMENT_DESCRIPTOR_H
3
4struct segment_descriptor {
5 u16 limit_low;
6 u16 base_low;
7 u8 base_mid;
8 u8 type : 4;
9 u8 system : 1;
10 u8 dpl : 2;
11 u8 present : 1;
12 u8 limit_high : 4;
13 u8 avl : 1;
14 u8 long_mode : 1;
15 u8 default_op : 1;
16 u8 granularity : 1;
17 u8 base_high;
18} __attribute__((packed));
19
20#ifdef CONFIG_X86_64
21/* LDT or TSS descriptor in the GDT. 16 bytes. */
22struct segment_descriptor_64 {
23 struct segment_descriptor s;
24 u32 base_higher;
25 u32 pad_zero;
26};
27
28#endif
29#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
new file mode 100644
index 00000000000..de755cb1431
--- /dev/null
+++ b/arch/x86/kvm/svm.c
@@ -0,0 +1,1731 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * AMD SVM support
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com>
10 * Avi Kivity <avi@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16#include <linux/kvm_host.h>
17
18#include "kvm_svm.h"
19#include "irq.h"
20#include "mmu.h"
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/vmalloc.h>
25#include <linux/highmem.h>
26#include <linux/sched.h>
27
28#include <asm/desc.h>
29
30MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL");
32
33#define IOPM_ALLOC_ORDER 2
34#define MSRPM_ALLOC_ORDER 1
35
36#define DB_VECTOR 1
37#define UD_VECTOR 6
38#define GP_VECTOR 13
39
40#define DR7_GD_MASK (1 << 13)
41#define DR6_BD_MASK (1 << 13)
42
43#define SEG_TYPE_LDT 2
44#define SEG_TYPE_BUSY_TSS16 3
45
46#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2)
49
50static void kvm_reput_irq(struct vcpu_svm *svm);
51
52static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
53{
54 return container_of(vcpu, struct vcpu_svm, vcpu);
55}
56
57unsigned long iopm_base;
58unsigned long msrpm_base;
59
60struct kvm_ldttss_desc {
61 u16 limit0;
62 u16 base0;
63 unsigned base1 : 8, type : 5, dpl : 2, p : 1;
64 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
65 u32 base3;
66 u32 zero1;
67} __attribute__((packed));
68
69struct svm_cpu_data {
70 int cpu;
71
72 u64 asid_generation;
73 u32 max_asid;
74 u32 next_asid;
75 struct kvm_ldttss_desc *tss_desc;
76
77 struct page *save_area;
78};
79
80static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
81static uint32_t svm_features;
82
83struct svm_init_data {
84 int cpu;
85 int r;
86};
87
88static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
89
90#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
91#define MSRS_RANGE_SIZE 2048
92#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
93
94#define MAX_INST_SIZE 15
95
96static inline u32 svm_has(u32 feat)
97{
98 return svm_features & feat;
99}
100
101static inline u8 pop_irq(struct kvm_vcpu *vcpu)
102{
103 int word_index = __ffs(vcpu->arch.irq_summary);
104 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
105 int irq = word_index * BITS_PER_LONG + bit_index;
106
107 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
108 if (!vcpu->arch.irq_pending[word_index])
109 clear_bit(word_index, &vcpu->arch.irq_summary);
110 return irq;
111}
112
113static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
114{
115 set_bit(irq, vcpu->arch.irq_pending);
116 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
117}
118
119static inline void clgi(void)
120{
121 asm volatile (SVM_CLGI);
122}
123
124static inline void stgi(void)
125{
126 asm volatile (SVM_STGI);
127}
128
129static inline void invlpga(unsigned long addr, u32 asid)
130{
131 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
132}
133
134static inline unsigned long kvm_read_cr2(void)
135{
136 unsigned long cr2;
137
138 asm volatile ("mov %%cr2, %0" : "=r" (cr2));
139 return cr2;
140}
141
142static inline void kvm_write_cr2(unsigned long val)
143{
144 asm volatile ("mov %0, %%cr2" :: "r" (val));
145}
146
147static inline unsigned long read_dr6(void)
148{
149 unsigned long dr6;
150
151 asm volatile ("mov %%dr6, %0" : "=r" (dr6));
152 return dr6;
153}
154
155static inline void write_dr6(unsigned long val)
156{
157 asm volatile ("mov %0, %%dr6" :: "r" (val));
158}
159
160static inline unsigned long read_dr7(void)
161{
162 unsigned long dr7;
163
164 asm volatile ("mov %%dr7, %0" : "=r" (dr7));
165 return dr7;
166}
167
168static inline void write_dr7(unsigned long val)
169{
170 asm volatile ("mov %0, %%dr7" :: "r" (val));
171}
172
173static inline void force_new_asid(struct kvm_vcpu *vcpu)
174{
175 to_svm(vcpu)->asid_generation--;
176}
177
178static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
179{
180 force_new_asid(vcpu);
181}
182
183static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
184{
185 if (!(efer & EFER_LMA))
186 efer &= ~EFER_LME;
187
188 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
189 vcpu->arch.shadow_efer = efer;
190}
191
192static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
193 bool has_error_code, u32 error_code)
194{
195 struct vcpu_svm *svm = to_svm(vcpu);
196
197 svm->vmcb->control.event_inj = nr
198 | SVM_EVTINJ_VALID
199 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
200 | SVM_EVTINJ_TYPE_EXEPT;
201 svm->vmcb->control.event_inj_err = error_code;
202}
203
204static bool svm_exception_injected(struct kvm_vcpu *vcpu)
205{
206 struct vcpu_svm *svm = to_svm(vcpu);
207
208 return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
209}
210
211static int is_external_interrupt(u32 info)
212{
213 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
214 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
215}
216
217static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
218{
219 struct vcpu_svm *svm = to_svm(vcpu);
220
221 if (!svm->next_rip) {
222 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
223 return;
224 }
225 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
226 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
227 __FUNCTION__,
228 svm->vmcb->save.rip,
229 svm->next_rip);
230
231 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
232 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
233
234 vcpu->arch.interrupt_window_open = 1;
235}
236
237static int has_svm(void)
238{
239 uint32_t eax, ebx, ecx, edx;
240
241 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
242 printk(KERN_INFO "has_svm: not amd\n");
243 return 0;
244 }
245
246 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
247 if (eax < SVM_CPUID_FUNC) {
248 printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
249 return 0;
250 }
251
252 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
253 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
254 printk(KERN_DEBUG "has_svm: svm not available\n");
255 return 0;
256 }
257 return 1;
258}
259
260static void svm_hardware_disable(void *garbage)
261{
262 struct svm_cpu_data *svm_data
263 = per_cpu(svm_data, raw_smp_processor_id());
264
265 if (svm_data) {
266 uint64_t efer;
267
268 wrmsrl(MSR_VM_HSAVE_PA, 0);
269 rdmsrl(MSR_EFER, efer);
270 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
271 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
272 __free_page(svm_data->save_area);
273 kfree(svm_data);
274 }
275}
276
277static void svm_hardware_enable(void *garbage)
278{
279
280 struct svm_cpu_data *svm_data;
281 uint64_t efer;
282#ifdef CONFIG_X86_64
283 struct desc_ptr gdt_descr;
284#else
285 struct desc_ptr gdt_descr;
286#endif
287 struct desc_struct *gdt;
288 int me = raw_smp_processor_id();
289
290 if (!has_svm()) {
291 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
292 return;
293 }
294 svm_data = per_cpu(svm_data, me);
295
296 if (!svm_data) {
297 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
298 me);
299 return;
300 }
301
302 svm_data->asid_generation = 1;
303 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
304 svm_data->next_asid = svm_data->max_asid + 1;
305 svm_features = cpuid_edx(SVM_CPUID_FUNC);
306
307 asm volatile ("sgdt %0" : "=m"(gdt_descr));
308 gdt = (struct desc_struct *)gdt_descr.address;
309 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
310
311 rdmsrl(MSR_EFER, efer);
312 wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
313
314 wrmsrl(MSR_VM_HSAVE_PA,
315 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
316}
317
318static int svm_cpu_init(int cpu)
319{
320 struct svm_cpu_data *svm_data;
321 int r;
322
323 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
324 if (!svm_data)
325 return -ENOMEM;
326 svm_data->cpu = cpu;
327 svm_data->save_area = alloc_page(GFP_KERNEL);
328 r = -ENOMEM;
329 if (!svm_data->save_area)
330 goto err_1;
331
332 per_cpu(svm_data, cpu) = svm_data;
333
334 return 0;
335
336err_1:
337 kfree(svm_data);
338 return r;
339
340}
341
342static void set_msr_interception(u32 *msrpm, unsigned msr,
343 int read, int write)
344{
345 int i;
346
347 for (i = 0; i < NUM_MSR_MAPS; i++) {
348 if (msr >= msrpm_ranges[i] &&
349 msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
350 u32 msr_offset = (i * MSRS_IN_RANGE + msr -
351 msrpm_ranges[i]) * 2;
352
353 u32 *base = msrpm + (msr_offset / 32);
354 u32 msr_shift = msr_offset % 32;
355 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
356 *base = (*base & ~(0x3 << msr_shift)) |
357 (mask << msr_shift);
358 return;
359 }
360 }
361 BUG();
362}
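/*
 * Illustration of the msrpm index arithmetic in set_msr_interception()
 * above: the three MSR ranges are flattened into one permission bitmap with
 * two bits (read intercept, write intercept) per MSR.  Userspace sketch;
 * msrpm_bit_offset() is a made-up helper name.
 */
#include <stdint.h>
#include <stdio.h>

static const uint32_t ranges[] = { 0, 0xc0000000, 0xc0010000 };
#define MSRS_IN_RANGE (2048 * 8 / 2)	/* 2 KB of bitmap per range, 2 bits/MSR */

static long msrpm_bit_offset(uint32_t msr)
{
	for (unsigned int i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
		if (msr >= ranges[i] && msr < ranges[i] + MSRS_IN_RANGE)
			return ((long)i * MSRS_IN_RANGE + msr - ranges[i]) * 2;
	return -1;			/* MSR not covered by the map */
}

int main(void)
{
	/* MSR_LSTAR (0xc0000082) lands in the second range */
	printf("bit offset %ld\n", msrpm_bit_offset(0xc0000082));
	return 0;
}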
363
364static __init int svm_hardware_setup(void)
365{
366 int cpu;
367 struct page *iopm_pages;
368 struct page *msrpm_pages;
369 void *iopm_va, *msrpm_va;
370 int r;
371
372 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
373
374 if (!iopm_pages)
375 return -ENOMEM;
376
377 iopm_va = page_address(iopm_pages);
378 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
379 clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
380 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
381
382
383 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
384
385 r = -ENOMEM;
386 if (!msrpm_pages)
387 goto err_1;
388
389 msrpm_va = page_address(msrpm_pages);
390 memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
391 msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
392
393#ifdef CONFIG_X86_64
394 set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
395 set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
396 set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
397 set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
398 set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
399 set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
400#endif
401 set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
402 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
403 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
404 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
405
406 for_each_online_cpu(cpu) {
407 r = svm_cpu_init(cpu);
408 if (r)
409 goto err_2;
410 }
411 return 0;
412
413err_2:
414 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
415 msrpm_base = 0;
416err_1:
417 __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
418 iopm_base = 0;
419 return r;
420}
421
422static __exit void svm_hardware_unsetup(void)
423{
424 __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
425 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
426 iopm_base = msrpm_base = 0;
427}
428
429static void init_seg(struct vmcb_seg *seg)
430{
431 seg->selector = 0;
432 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
433 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
434 seg->limit = 0xffff;
435 seg->base = 0;
436}
437
438static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
439{
440 seg->selector = 0;
441 seg->attrib = SVM_SELECTOR_P_MASK | type;
442 seg->limit = 0xffff;
443 seg->base = 0;
444}
445
446static void init_vmcb(struct vmcb *vmcb)
447{
448 struct vmcb_control_area *control = &vmcb->control;
449 struct vmcb_save_area *save = &vmcb->save;
450
451 control->intercept_cr_read = INTERCEPT_CR0_MASK |
452 INTERCEPT_CR3_MASK |
453 INTERCEPT_CR4_MASK |
454 INTERCEPT_CR8_MASK;
455
456 control->intercept_cr_write = INTERCEPT_CR0_MASK |
457 INTERCEPT_CR3_MASK |
458 INTERCEPT_CR4_MASK |
459 INTERCEPT_CR8_MASK;
460
461 control->intercept_dr_read = INTERCEPT_DR0_MASK |
462 INTERCEPT_DR1_MASK |
463 INTERCEPT_DR2_MASK |
464 INTERCEPT_DR3_MASK;
465
466 control->intercept_dr_write = INTERCEPT_DR0_MASK |
467 INTERCEPT_DR1_MASK |
468 INTERCEPT_DR2_MASK |
469 INTERCEPT_DR3_MASK |
470 INTERCEPT_DR5_MASK |
471 INTERCEPT_DR7_MASK;
472
473 control->intercept_exceptions = (1 << PF_VECTOR) |
474 (1 << UD_VECTOR);
475
476
477 control->intercept = (1ULL << INTERCEPT_INTR) |
478 (1ULL << INTERCEPT_NMI) |
479 (1ULL << INTERCEPT_SMI) |
480 /*
481 * selective cr0 intercept bug?
482 * 0: 0f 22 d8 mov %eax,%cr3
483 * 3: 0f 20 c0 mov %cr0,%eax
484 * 6: 0d 00 00 00 80 or $0x80000000,%eax
485 * b: 0f 22 c0 mov %eax,%cr0
486 * set cr3 ->interception
487 * get cr0 ->interception
488 * set cr0 -> no interception
489 */
490 /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
491 (1ULL << INTERCEPT_CPUID) |
492 (1ULL << INTERCEPT_INVD) |
493 (1ULL << INTERCEPT_HLT) |
494 (1ULL << INTERCEPT_INVLPGA) |
495 (1ULL << INTERCEPT_IOIO_PROT) |
496 (1ULL << INTERCEPT_MSR_PROT) |
497 (1ULL << INTERCEPT_TASK_SWITCH) |
498 (1ULL << INTERCEPT_SHUTDOWN) |
499 (1ULL << INTERCEPT_VMRUN) |
500 (1ULL << INTERCEPT_VMMCALL) |
501 (1ULL << INTERCEPT_VMLOAD) |
502 (1ULL << INTERCEPT_VMSAVE) |
503 (1ULL << INTERCEPT_STGI) |
504 (1ULL << INTERCEPT_CLGI) |
505 (1ULL << INTERCEPT_SKINIT) |
506 (1ULL << INTERCEPT_WBINVD) |
507 (1ULL << INTERCEPT_MONITOR) |
508 (1ULL << INTERCEPT_MWAIT);
509
510 control->iopm_base_pa = iopm_base;
511 control->msrpm_base_pa = msrpm_base;
512 control->tsc_offset = 0;
513 control->int_ctl = V_INTR_MASKING_MASK;
514
515 init_seg(&save->es);
516 init_seg(&save->ss);
517 init_seg(&save->ds);
518 init_seg(&save->fs);
519 init_seg(&save->gs);
520
521 save->cs.selector = 0xf000;
522 /* Executable/Readable Code Segment */
523 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
524 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
525 save->cs.limit = 0xffff;
526 /*
527 * cs.base should really be 0xffff0000, but vmx can't handle that, so
528 * be consistent with it.
529 *
530 * Replace when we have real mode working for vmx.
531 */
532 save->cs.base = 0xf0000;
533
534 save->gdtr.limit = 0xffff;
535 save->idtr.limit = 0xffff;
536
537 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
538 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
539
540 save->efer = MSR_EFER_SVME_MASK;
541 save->dr6 = 0xffff0ff0;
542 save->dr7 = 0x400;
543 save->rflags = 2;
544 save->rip = 0x0000fff0;
545
546 /*
547	 * cr0 val on cpu init should be 0x60000010; we enable the cpu
548	 * cache by default.  The orderly way is to enable the cache in the BIOS.
549 */
550 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
551 save->cr4 = X86_CR4_PAE;
552 /* rdx = ?? */
553}
554
555static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
556{
557 struct vcpu_svm *svm = to_svm(vcpu);
558
559 init_vmcb(svm->vmcb);
560
561 if (vcpu->vcpu_id != 0) {
562 svm->vmcb->save.rip = 0;
563 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
564 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
565 }
566
567 return 0;
568}
569
570static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
571{
572 struct vcpu_svm *svm;
573 struct page *page;
574 int err;
575
576 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
577 if (!svm) {
578 err = -ENOMEM;
579 goto out;
580 }
581
582 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
583 if (err)
584 goto free_svm;
585
586 page = alloc_page(GFP_KERNEL);
587 if (!page) {
588 err = -ENOMEM;
589 goto uninit;
590 }
591
592 svm->vmcb = page_address(page);
593 clear_page(svm->vmcb);
594 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
595 svm->asid_generation = 0;
596 memset(svm->db_regs, 0, sizeof(svm->db_regs));
597 init_vmcb(svm->vmcb);
598
599 fx_init(&svm->vcpu);
600 svm->vcpu.fpu_active = 1;
601 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
602 if (svm->vcpu.vcpu_id == 0)
603 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
604
605 return &svm->vcpu;
606
607uninit:
608 kvm_vcpu_uninit(&svm->vcpu);
609free_svm:
610 kmem_cache_free(kvm_vcpu_cache, svm);
611out:
612 return ERR_PTR(err);
613}
614
615static void svm_free_vcpu(struct kvm_vcpu *vcpu)
616{
617 struct vcpu_svm *svm = to_svm(vcpu);
618
619 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
620 kvm_vcpu_uninit(vcpu);
621 kmem_cache_free(kvm_vcpu_cache, svm);
622}
623
624static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
625{
626 struct vcpu_svm *svm = to_svm(vcpu);
627 int i;
628
629 if (unlikely(cpu != vcpu->cpu)) {
630 u64 tsc_this, delta;
631
632 /*
633 * Make sure that the guest sees a monotonically
634 * increasing TSC.
635 */
636 rdtscll(tsc_this);
637 delta = vcpu->arch.host_tsc - tsc_this;
638 svm->vmcb->control.tsc_offset += delta;
639 vcpu->cpu = cpu;
640 kvm_migrate_apic_timer(vcpu);
641 }
642
643 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
644 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
645}
646
647static void svm_vcpu_put(struct kvm_vcpu *vcpu)
648{
649 struct vcpu_svm *svm = to_svm(vcpu);
650 int i;
651
652 ++vcpu->stat.host_state_reload;
653 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
654 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
655
656 rdtscll(vcpu->arch.host_tsc);
657}
658
659static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
660{
661}
662
663static void svm_cache_regs(struct kvm_vcpu *vcpu)
664{
665 struct vcpu_svm *svm = to_svm(vcpu);
666
667 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
668 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
669 vcpu->arch.rip = svm->vmcb->save.rip;
670}
671
672static void svm_decache_regs(struct kvm_vcpu *vcpu)
673{
674 struct vcpu_svm *svm = to_svm(vcpu);
675 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
676 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
677 svm->vmcb->save.rip = vcpu->arch.rip;
678}
679
680static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
681{
682 return to_svm(vcpu)->vmcb->save.rflags;
683}
684
685static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
686{
687 to_svm(vcpu)->vmcb->save.rflags = rflags;
688}
689
690static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
691{
692 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
693
694 switch (seg) {
695 case VCPU_SREG_CS: return &save->cs;
696 case VCPU_SREG_DS: return &save->ds;
697 case VCPU_SREG_ES: return &save->es;
698 case VCPU_SREG_FS: return &save->fs;
699 case VCPU_SREG_GS: return &save->gs;
700 case VCPU_SREG_SS: return &save->ss;
701 case VCPU_SREG_TR: return &save->tr;
702 case VCPU_SREG_LDTR: return &save->ldtr;
703 }
704 BUG();
705 return NULL;
706}
707
708static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
709{
710 struct vmcb_seg *s = svm_seg(vcpu, seg);
711
712 return s->base;
713}
714
715static void svm_get_segment(struct kvm_vcpu *vcpu,
716 struct kvm_segment *var, int seg)
717{
718 struct vmcb_seg *s = svm_seg(vcpu, seg);
719
720 var->base = s->base;
721 var->limit = s->limit;
722 var->selector = s->selector;
723 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
724 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
725 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
726 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
727 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
728 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
729 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
730 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
731 var->unusable = !var->present;
732}
733
734static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
735{
736 struct vcpu_svm *svm = to_svm(vcpu);
737
738 dt->limit = svm->vmcb->save.idtr.limit;
739 dt->base = svm->vmcb->save.idtr.base;
740}
741
742static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
743{
744 struct vcpu_svm *svm = to_svm(vcpu);
745
746 svm->vmcb->save.idtr.limit = dt->limit;
747	svm->vmcb->save.idtr.base = dt->base;
748}
749
750static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
751{
752 struct vcpu_svm *svm = to_svm(vcpu);
753
754 dt->limit = svm->vmcb->save.gdtr.limit;
755 dt->base = svm->vmcb->save.gdtr.base;
756}
757
758static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
759{
760 struct vcpu_svm *svm = to_svm(vcpu);
761
762 svm->vmcb->save.gdtr.limit = dt->limit;
763	svm->vmcb->save.gdtr.base = dt->base;
764}
765
766static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
767{
768}
769
770static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
771{
772 struct vcpu_svm *svm = to_svm(vcpu);
773
774#ifdef CONFIG_X86_64
775 if (vcpu->arch.shadow_efer & EFER_LME) {
776 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
777 vcpu->arch.shadow_efer |= EFER_LMA;
778 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
779 }
780
781 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
782 vcpu->arch.shadow_efer &= ~EFER_LMA;
783 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
784 }
785 }
786#endif
787 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
788 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
789 vcpu->fpu_active = 1;
790 }
791
792 vcpu->arch.cr0 = cr0;
793 cr0 |= X86_CR0_PG | X86_CR0_WP;
794 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
795 svm->vmcb->save.cr0 = cr0;
796}
797
798static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
799{
800 vcpu->arch.cr4 = cr4;
801 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
802}
803
804static void svm_set_segment(struct kvm_vcpu *vcpu,
805 struct kvm_segment *var, int seg)
806{
807 struct vcpu_svm *svm = to_svm(vcpu);
808 struct vmcb_seg *s = svm_seg(vcpu, seg);
809
810 s->base = var->base;
811 s->limit = var->limit;
812 s->selector = var->selector;
813 if (var->unusable)
814 s->attrib = 0;
815 else {
816 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
817 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
818 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
819 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
820 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
821 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
822 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
823 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
824 }
825 if (seg == VCPU_SREG_CS)
826 svm->vmcb->save.cpl
827 = (svm->vmcb->save.cs.attrib
828 >> SVM_SELECTOR_DPL_SHIFT) & 3;
829
830}
831
832/* FIXME:
833
834 svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
835 svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
836
837*/
838
839static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
840{
841 return -EOPNOTSUPP;
842}
843
844static int svm_get_irq(struct kvm_vcpu *vcpu)
845{
846 struct vcpu_svm *svm = to_svm(vcpu);
847 u32 exit_int_info = svm->vmcb->control.exit_int_info;
848
849 if (is_external_interrupt(exit_int_info))
850 return exit_int_info & SVM_EVTINJ_VEC_MASK;
851 return -1;
852}
853
854static void load_host_msrs(struct kvm_vcpu *vcpu)
855{
856#ifdef CONFIG_X86_64
857 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
858#endif
859}
860
861static void save_host_msrs(struct kvm_vcpu *vcpu)
862{
863#ifdef CONFIG_X86_64
864 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
865#endif
866}
867
868static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
869{
870 if (svm_data->next_asid > svm_data->max_asid) {
871 ++svm_data->asid_generation;
872 svm_data->next_asid = 1;
873 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
874 }
875
876 svm->vcpu.cpu = svm_data->cpu;
877 svm->asid_generation = svm_data->asid_generation;
878 svm->vmcb->control.asid = svm_data->next_asid++;
879}
880
881static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
882{
883 return to_svm(vcpu)->db_regs[dr];
884}
885
886static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
887 int *exception)
888{
889 struct vcpu_svm *svm = to_svm(vcpu);
890
891 *exception = 0;
892
893 if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
894 svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
895 svm->vmcb->save.dr6 |= DR6_BD_MASK;
896 *exception = DB_VECTOR;
897 return;
898 }
899
900 switch (dr) {
901 case 0 ... 3:
902 svm->db_regs[dr] = value;
903 return;
904 case 4 ... 5:
905 if (vcpu->arch.cr4 & X86_CR4_DE) {
906 *exception = UD_VECTOR;
907 return;
908 }
909 case 7: {
910 if (value & ~((1ULL << 32) - 1)) {
911 *exception = GP_VECTOR;
912 return;
913 }
914 svm->vmcb->save.dr7 = value;
915 return;
916 }
917 default:
918 printk(KERN_DEBUG "%s: unexpected dr %u\n",
919 __FUNCTION__, dr);
920 *exception = UD_VECTOR;
921 return;
922 }
923}
924
925static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
926{
927 u32 exit_int_info = svm->vmcb->control.exit_int_info;
928 struct kvm *kvm = svm->vcpu.kvm;
929 u64 fault_address;
930 u32 error_code;
931
932 if (!irqchip_in_kernel(kvm) &&
933 is_external_interrupt(exit_int_info))
934 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
935
936 fault_address = svm->vmcb->control.exit_info_2;
937 error_code = svm->vmcb->control.exit_info_1;
938 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
939}
940
941static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
942{
943 int er;
944
945 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
946 if (er != EMULATE_DONE)
947 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
948 return 1;
949}
950
951static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
952{
953 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
954 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
955 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
956 svm->vcpu.fpu_active = 1;
957
958 return 1;
959}
960
961static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
962{
963 /*
964 * VMCB is undefined after a SHUTDOWN intercept
965 * so reinitialize it.
966 */
967 clear_page(svm->vmcb);
968 init_vmcb(svm->vmcb);
969
970 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
971 return 0;
972}
973
974static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
975{
976 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
977 int size, down, in, string, rep;
978 unsigned port;
979
980 ++svm->vcpu.stat.io_exits;
981
982 svm->next_rip = svm->vmcb->control.exit_info_2;
983
984 string = (io_info & SVM_IOIO_STR_MASK) != 0;
985
986 if (string) {
987 if (emulate_instruction(&svm->vcpu,
988 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
989 return 0;
990 return 1;
991 }
992
993 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
994 port = io_info >> 16;
995 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
996 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
997 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
998
999 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1000}
1001
1002static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1003{
1004 return 1;
1005}
1006
1007static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1008{
1009 svm->next_rip = svm->vmcb->save.rip + 1;
1010 skip_emulated_instruction(&svm->vcpu);
1011 return kvm_emulate_halt(&svm->vcpu);
1012}
1013
1014static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1015{
1016 svm->next_rip = svm->vmcb->save.rip + 3;
1017 skip_emulated_instruction(&svm->vcpu);
1018 kvm_emulate_hypercall(&svm->vcpu);
1019 return 1;
1020}
1021
1022static int invalid_op_interception(struct vcpu_svm *svm,
1023 struct kvm_run *kvm_run)
1024{
1025 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1026 return 1;
1027}
1028
1029static int task_switch_interception(struct vcpu_svm *svm,
1030 struct kvm_run *kvm_run)
1031{
1032 pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
1033 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1034 return 0;
1035}
1036
1037static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1038{
1039 svm->next_rip = svm->vmcb->save.rip + 2;
1040 kvm_emulate_cpuid(&svm->vcpu);
1041 return 1;
1042}
1043
1044static int emulate_on_interception(struct vcpu_svm *svm,
1045 struct kvm_run *kvm_run)
1046{
1047 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
1048 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
1049 return 1;
1050}
1051
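/*
 * CR8 (TPR) writes are emulated.  With an in-kernel irqchip that is all
 * that is needed; otherwise exit so user space can update its TPR copy.
 */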
1052static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1053{
1054 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1055 if (irqchip_in_kernel(svm->vcpu.kvm))
1056 return 1;
1057 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1058 return 0;
1059}
1060
1061static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1062{
1063 struct vcpu_svm *svm = to_svm(vcpu);
1064
1065 switch (ecx) {
1066 case MSR_IA32_TIME_STAMP_COUNTER: {
1067 u64 tsc;
1068
1069 rdtscll(tsc);
1070 *data = svm->vmcb->control.tsc_offset + tsc;
1071 break;
1072 }
1073 case MSR_K6_STAR:
1074 *data = svm->vmcb->save.star;
1075 break;
1076#ifdef CONFIG_X86_64
1077 case MSR_LSTAR:
1078 *data = svm->vmcb->save.lstar;
1079 break;
1080 case MSR_CSTAR:
1081 *data = svm->vmcb->save.cstar;
1082 break;
1083 case MSR_KERNEL_GS_BASE:
1084 *data = svm->vmcb->save.kernel_gs_base;
1085 break;
1086 case MSR_SYSCALL_MASK:
1087 *data = svm->vmcb->save.sfmask;
1088 break;
1089#endif
1090 case MSR_IA32_SYSENTER_CS:
1091 *data = svm->vmcb->save.sysenter_cs;
1092 break;
1093 case MSR_IA32_SYSENTER_EIP:
1094 *data = svm->vmcb->save.sysenter_eip;
1095 break;
1096 case MSR_IA32_SYSENTER_ESP:
1097 *data = svm->vmcb->save.sysenter_esp;
1098 break;
1099 default:
1100 return kvm_get_msr_common(vcpu, ecx, data);
1101 }
1102 return 0;
1103}
1104
1105static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1106{
1107 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1108 u64 data;
1109
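	/* RDMSR returns the 64-bit value in EDX:EAX. */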
1110 if (svm_get_msr(&svm->vcpu, ecx, &data))
1111 kvm_inject_gp(&svm->vcpu, 0);
1112 else {
1113 svm->vmcb->save.rax = data & 0xffffffff;
1114 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1115 svm->next_rip = svm->vmcb->save.rip + 2;
1116 skip_emulated_instruction(&svm->vcpu);
1117 }
1118 return 1;
1119}
1120
1121static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1122{
1123 struct vcpu_svm *svm = to_svm(vcpu);
1124
1125 switch (ecx) {
1126 case MSR_IA32_TIME_STAMP_COUNTER: {
1127 u64 tsc;
1128
1129 rdtscll(tsc);
1130 svm->vmcb->control.tsc_offset = data - tsc;
1131 break;
1132 }
1133 case MSR_K6_STAR:
1134 svm->vmcb->save.star = data;
1135 break;
1136#ifdef CONFIG_X86_64
1137 case MSR_LSTAR:
1138 svm->vmcb->save.lstar = data;
1139 break;
1140 case MSR_CSTAR:
1141 svm->vmcb->save.cstar = data;
1142 break;
1143 case MSR_KERNEL_GS_BASE:
1144 svm->vmcb->save.kernel_gs_base = data;
1145 break;
1146 case MSR_SYSCALL_MASK:
1147 svm->vmcb->save.sfmask = data;
1148 break;
1149#endif
1150 case MSR_IA32_SYSENTER_CS:
1151 svm->vmcb->save.sysenter_cs = data;
1152 break;
1153 case MSR_IA32_SYSENTER_EIP:
1154 svm->vmcb->save.sysenter_eip = data;
1155 break;
1156 case MSR_IA32_SYSENTER_ESP:
1157 svm->vmcb->save.sysenter_esp = data;
1158 break;
1159 case MSR_K7_EVNTSEL0:
1160 case MSR_K7_EVNTSEL1:
1161 case MSR_K7_EVNTSEL2:
1162 case MSR_K7_EVNTSEL3:
1163 /*
1164 * only support writing 0 to the performance counters for now
1165 * to make Windows happy. Should be replaced by a real
1166 * performance counter emulation later.
1167 */
1168 if (data != 0)
1169 goto unhandled;
1170 break;
1171 default:
1172 unhandled:
1173 return kvm_set_msr_common(vcpu, ecx, data);
1174 }
1175 return 0;
1176}
1177
1178static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1179{
1180 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
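	/* WRMSR takes its 64-bit operand in EDX:EAX; mask each half to 32 bits. */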
1181 u64 data = (svm->vmcb->save.rax & -1u)
1182 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1183 svm->next_rip = svm->vmcb->save.rip + 2;
1184 if (svm_set_msr(&svm->vcpu, ecx, data))
1185 kvm_inject_gp(&svm->vcpu, 0);
1186 else
1187 skip_emulated_instruction(&svm->vcpu);
1188 return 1;
1189}
1190
1191static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1192{
1193 if (svm->vmcb->control.exit_info_1)
1194 return wrmsr_interception(svm, kvm_run);
1195 else
1196 return rdmsr_interception(svm, kvm_run);
1197}
1198
1199static int interrupt_window_interception(struct vcpu_svm *svm,
1200 struct kvm_run *kvm_run)
1201{
1202 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1203 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1204 /*
1205	 * If user space is waiting to inject interrupts, exit as soon as
1206 * possible
1207 */
1208 if (kvm_run->request_interrupt_window &&
1209 !svm->vcpu.arch.irq_summary) {
1210 ++svm->vcpu.stat.irq_window_exits;
1211 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1212 return 0;
1213 }
1214
1215 return 1;
1216}
1217
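/*
 * Dispatch table from SVM exit codes to handlers.  Exit codes outside the
 * table, or with a NULL entry, are reported to user space as
 * KVM_EXIT_UNKNOWN by handle_exit().
 */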
1218static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1219 struct kvm_run *kvm_run) = {
1220 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1221 [SVM_EXIT_READ_CR3] = emulate_on_interception,
1222 [SVM_EXIT_READ_CR4] = emulate_on_interception,
1223 [SVM_EXIT_READ_CR8] = emulate_on_interception,
1224 /* for now: */
1225 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
1226 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
1227 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
1228 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
1229 [SVM_EXIT_READ_DR0] = emulate_on_interception,
1230 [SVM_EXIT_READ_DR1] = emulate_on_interception,
1231 [SVM_EXIT_READ_DR2] = emulate_on_interception,
1232 [SVM_EXIT_READ_DR3] = emulate_on_interception,
1233 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
1234 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
1235 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
1236 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1237 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1238 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
1239 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
1240 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1241 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1242 [SVM_EXIT_INTR] = nop_on_interception,
1243 [SVM_EXIT_NMI] = nop_on_interception,
1244 [SVM_EXIT_SMI] = nop_on_interception,
1245 [SVM_EXIT_INIT] = nop_on_interception,
1246 [SVM_EXIT_VINTR] = interrupt_window_interception,
1247 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
1248 [SVM_EXIT_CPUID] = cpuid_interception,
1249 [SVM_EXIT_INVD] = emulate_on_interception,
1250 [SVM_EXIT_HLT] = halt_interception,
1251 [SVM_EXIT_INVLPG] = emulate_on_interception,
1252 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1253 [SVM_EXIT_IOIO] = io_interception,
1254 [SVM_EXIT_MSR] = msr_interception,
1255 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
1256 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
1257 [SVM_EXIT_VMRUN] = invalid_op_interception,
1258 [SVM_EXIT_VMMCALL] = vmmcall_interception,
1259 [SVM_EXIT_VMLOAD] = invalid_op_interception,
1260 [SVM_EXIT_VMSAVE] = invalid_op_interception,
1261 [SVM_EXIT_STGI] = invalid_op_interception,
1262 [SVM_EXIT_CLGI] = invalid_op_interception,
1263 [SVM_EXIT_SKINIT] = invalid_op_interception,
1264 [SVM_EXIT_WBINVD] = emulate_on_interception,
1265 [SVM_EXIT_MONITOR] = invalid_op_interception,
1266 [SVM_EXIT_MWAIT] = invalid_op_interception,
1267};
1268
1269
1270static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1271{
1272 struct vcpu_svm *svm = to_svm(vcpu);
1273 u32 exit_code = svm->vmcb->control.exit_code;
1274
1275 kvm_reput_irq(svm);
1276
1277 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1278 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1279 kvm_run->fail_entry.hardware_entry_failure_reason
1280 = svm->vmcb->control.exit_code;
1281 return 0;
1282 }
1283
1284 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
1285 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
1286		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
1287 "exit_code 0x%x\n",
1288 __FUNCTION__, svm->vmcb->control.exit_int_info,
1289 exit_code);
1290
1291 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1292 || !svm_exit_handlers[exit_code]) {
1293 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1294 kvm_run->hw.hardware_exit_reason = exit_code;
1295 return 0;
1296 }
1297
1298 return svm_exit_handlers[exit_code](svm, kvm_run);
1299}
1300
1301static void reload_tss(struct kvm_vcpu *vcpu)
1302{
1303 int cpu = raw_smp_processor_id();
1304
1305 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1306 svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
1307 load_TR_desc();
1308}
1309
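/*
 * Allocate a fresh ASID if the vcpu moved to another cpu or this cpu's
 * ASID generation has changed since the vcpu last ran.
 */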
1310static void pre_svm_run(struct vcpu_svm *svm)
1311{
1312 int cpu = raw_smp_processor_id();
1313
1314 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1315
1316 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
1317 if (svm->vcpu.cpu != cpu ||
1318 svm->asid_generation != svm_data->asid_generation)
1319 new_asid(svm, svm_data);
1320}
1321
1322
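/*
 * Program a virtual interrupt into the VMCB: set V_IRQ and give it the
 * highest priority so it is delivered regardless of the guest's TPR.
 */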
1323static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1324{
1325 struct vmcb_control_area *control;
1326
1327 control = &svm->vmcb->control;
1328 control->int_vector = irq;
1329 control->int_ctl &= ~V_INTR_PRIO_MASK;
1330 control->int_ctl |= V_IRQ_MASK |
1331 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1332}
1333
1334static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
1335{
1336 struct vcpu_svm *svm = to_svm(vcpu);
1337
1338 svm_inject_irq(svm, irq);
1339}
1340
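/*
 * Decide how to deliver a pending interrupt before re-entering the guest:
 * re-inject one that was cut short by the exit, inject directly if the
 * guest can take it now, or request a virtual interrupt window otherwise.
 */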
1341static void svm_intr_assist(struct kvm_vcpu *vcpu)
1342{
1343 struct vcpu_svm *svm = to_svm(vcpu);
1344 struct vmcb *vmcb = svm->vmcb;
1345 int intr_vector = -1;
1346
1347 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
1348 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
1349 intr_vector = vmcb->control.exit_int_info &
1350 SVM_EVTINJ_VEC_MASK;
1351 vmcb->control.exit_int_info = 0;
1352 svm_inject_irq(svm, intr_vector);
1353 return;
1354 }
1355
1356 if (vmcb->control.int_ctl & V_IRQ_MASK)
1357 return;
1358
1359 if (!kvm_cpu_has_interrupt(vcpu))
1360 return;
1361
1362 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
1363 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
1364 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
1365 /* unable to deliver irq, set pending irq */
1366 vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
1367 svm_inject_irq(svm, 0x0);
1368 return;
1369 }
1370 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
1371 intr_vector = kvm_cpu_get_interrupt(vcpu);
1372 svm_inject_irq(svm, intr_vector);
1373 kvm_timer_intr_post(vcpu, intr_vector);
1374}
1375
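/*
 * If a virtual interrupt was left undelivered and the irqchip lives in
 * user space, return it to the pending set; also recompute whether the
 * interrupt window is open.
 */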
1376static void kvm_reput_irq(struct vcpu_svm *svm)
1377{
1378 struct vmcb_control_area *control = &svm->vmcb->control;
1379
1380 if ((control->int_ctl & V_IRQ_MASK)
1381 && !irqchip_in_kernel(svm->vcpu.kvm)) {
1382 control->int_ctl &= ~V_IRQ_MASK;
1383 push_irq(&svm->vcpu, control->int_vector);
1384 }
1385
1386 svm->vcpu.arch.interrupt_window_open =
1387 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1388}
1389
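/* Find the lowest pending vector, clear it from the bitmap and inject it. */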
1390static void svm_do_inject_vector(struct vcpu_svm *svm)
1391{
1392 struct kvm_vcpu *vcpu = &svm->vcpu;
1393 int word_index = __ffs(vcpu->arch.irq_summary);
1394 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1395 int irq = word_index * BITS_PER_LONG + bit_index;
1396
1397 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1398 if (!vcpu->arch.irq_pending[word_index])
1399 clear_bit(word_index, &vcpu->arch.irq_summary);
1400 svm_inject_irq(svm, irq);
1401}
1402
1403static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1404 struct kvm_run *kvm_run)
1405{
1406 struct vcpu_svm *svm = to_svm(vcpu);
1407 struct vmcb_control_area *control = &svm->vmcb->control;
1408
1409 svm->vcpu.arch.interrupt_window_open =
1410 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1411 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
1412
1413 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
1414 /*
1415		 * Interrupts are enabled and not blocked by sti or mov ss: inject now.
1416 */
1417 svm_do_inject_vector(svm);
1418
1419 /*
1420 * Interrupts blocked. Wait for unblock.
1421 */
1422 if (!svm->vcpu.arch.interrupt_window_open &&
1423 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
1424 control->intercept |= 1ULL << INTERCEPT_VINTR;
1425 else
1426 control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1427}
1428
1429static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
1430{
1431 return 0;
1432}
1433
1434static void save_db_regs(unsigned long *db_regs)
1435{
1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
1437 asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
1438 asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
1439 asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
1440}
1441
1442static void load_db_regs(unsigned long *db_regs)
1443{
1444 asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
1445 asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
1446 asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
1447 asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
1448}
1449
1450static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1451{
1452 force_new_asid(vcpu);
1453}
1454
1455static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
1456{
1457}
1458
1459static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1460{
1461 struct vcpu_svm *svm = to_svm(vcpu);
1462 u16 fs_selector;
1463 u16 gs_selector;
1464 u16 ldt_selector;
1465
1466 pre_svm_run(svm);
1467
1468 save_host_msrs(vcpu);
1469 fs_selector = read_fs();
1470 gs_selector = read_gs();
1471 ldt_selector = read_ldt();
1472 svm->host_cr2 = kvm_read_cr2();
1473 svm->host_dr6 = read_dr6();
1474 svm->host_dr7 = read_dr7();
1475 svm->vmcb->save.cr2 = vcpu->arch.cr2;
1476
1477 if (svm->vmcb->save.dr7 & 0xff) {
1478 write_dr7(0);
1479 save_db_regs(svm->host_db_regs);
1480 load_db_regs(svm->db_regs);
1481 }
1482
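	/*
	 * Clear the global interrupt flag: host interrupts are held pending
	 * (despite local IRQs being enabled below) until the stgi() after
	 * the VMRUN.
	 */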
1483 clgi();
1484
1485 local_irq_enable();
1486
1487 asm volatile (
1488#ifdef CONFIG_X86_64
1489 "push %%rbp; \n\t"
1490#else
1491 "push %%ebp; \n\t"
1492#endif
1493
1494#ifdef CONFIG_X86_64
1495 "mov %c[rbx](%[svm]), %%rbx \n\t"
1496 "mov %c[rcx](%[svm]), %%rcx \n\t"
1497 "mov %c[rdx](%[svm]), %%rdx \n\t"
1498 "mov %c[rsi](%[svm]), %%rsi \n\t"
1499 "mov %c[rdi](%[svm]), %%rdi \n\t"
1500 "mov %c[rbp](%[svm]), %%rbp \n\t"
1501 "mov %c[r8](%[svm]), %%r8 \n\t"
1502 "mov %c[r9](%[svm]), %%r9 \n\t"
1503 "mov %c[r10](%[svm]), %%r10 \n\t"
1504 "mov %c[r11](%[svm]), %%r11 \n\t"
1505 "mov %c[r12](%[svm]), %%r12 \n\t"
1506 "mov %c[r13](%[svm]), %%r13 \n\t"
1507 "mov %c[r14](%[svm]), %%r14 \n\t"
1508 "mov %c[r15](%[svm]), %%r15 \n\t"
1509#else
1510 "mov %c[rbx](%[svm]), %%ebx \n\t"
1511 "mov %c[rcx](%[svm]), %%ecx \n\t"
1512 "mov %c[rdx](%[svm]), %%edx \n\t"
1513 "mov %c[rsi](%[svm]), %%esi \n\t"
1514 "mov %c[rdi](%[svm]), %%edi \n\t"
1515 "mov %c[rbp](%[svm]), %%ebp \n\t"
1516#endif
1517
1518#ifdef CONFIG_X86_64
1519 /* Enter guest mode */
1520 "push %%rax \n\t"
1521 "mov %c[vmcb](%[svm]), %%rax \n\t"
1522 SVM_VMLOAD "\n\t"
1523 SVM_VMRUN "\n\t"
1524 SVM_VMSAVE "\n\t"
1525 "pop %%rax \n\t"
1526#else
1527 /* Enter guest mode */
1528 "push %%eax \n\t"
1529 "mov %c[vmcb](%[svm]), %%eax \n\t"
1530 SVM_VMLOAD "\n\t"
1531 SVM_VMRUN "\n\t"
1532 SVM_VMSAVE "\n\t"
1533 "pop %%eax \n\t"
1534#endif
1535
1536 /* Save guest registers, load host registers */
1537#ifdef CONFIG_X86_64
1538 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1539 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1540 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1541 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1542 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1543 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1544 "mov %%r8, %c[r8](%[svm]) \n\t"
1545 "mov %%r9, %c[r9](%[svm]) \n\t"
1546 "mov %%r10, %c[r10](%[svm]) \n\t"
1547 "mov %%r11, %c[r11](%[svm]) \n\t"
1548 "mov %%r12, %c[r12](%[svm]) \n\t"
1549 "mov %%r13, %c[r13](%[svm]) \n\t"
1550 "mov %%r14, %c[r14](%[svm]) \n\t"
1551 "mov %%r15, %c[r15](%[svm]) \n\t"
1552
1553 "pop %%rbp; \n\t"
1554#else
1555 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1556 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1557 "mov %%edx, %c[rdx](%[svm]) \n\t"
1558 "mov %%esi, %c[rsi](%[svm]) \n\t"
1559 "mov %%edi, %c[rdi](%[svm]) \n\t"
1560 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1561
1562 "pop %%ebp; \n\t"
1563#endif
1564 :
1565 : [svm]"a"(svm),
1566 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1567 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
1568 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
1569 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
1570 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
1571 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
1572 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
1573#ifdef CONFIG_X86_64
1574 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
1575 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
1576 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
1577 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
1578 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
1579 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
1580 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
1581 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1582#endif
1583 : "cc", "memory"
1584#ifdef CONFIG_X86_64
1585 , "rbx", "rcx", "rdx", "rsi", "rdi"
1586 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1587#else
1588 , "ebx", "ecx", "edx" , "esi", "edi"
1589#endif
1590 );
1591
1592 if ((svm->vmcb->save.dr7 & 0xff))
1593 load_db_regs(svm->host_db_regs);
1594
1595 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1596
1597 write_dr6(svm->host_dr6);
1598 write_dr7(svm->host_dr7);
1599 kvm_write_cr2(svm->host_cr2);
1600
1601 load_fs(fs_selector);
1602 load_gs(gs_selector);
1603 load_ldt(ldt_selector);
1604 load_host_msrs(vcpu);
1605
1606 reload_tss(vcpu);
1607
1608 local_irq_disable();
1609
1610 stgi();
1611
1612 svm->next_rip = 0;
1613}
1614
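/*
 * Load a new guest CR3.  Force a fresh ASID, and hand the FPU back to the
 * host (set TS, re-arm the #NM intercept) so lazy FPU switching kicks in
 * again on the guest's next FPU access.
 */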
1615static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1616{
1617 struct vcpu_svm *svm = to_svm(vcpu);
1618
1619 svm->vmcb->save.cr3 = root;
1620 force_new_asid(vcpu);
1621
1622 if (vcpu->fpu_active) {
1623 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
1624 svm->vmcb->save.cr0 |= X86_CR0_TS;
1625 vcpu->fpu_active = 0;
1626 }
1627}
1628
1629static int is_disabled(void)
1630{
1631 u64 vm_cr;
1632
1633 rdmsrl(MSR_VM_CR, vm_cr);
1634 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
1635 return 1;
1636
1637 return 0;
1638}
1639
1640static void
1641svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1642{
1643 /*
1644 * Patch in the VMMCALL instruction:
1645 */
1646 hypercall[0] = 0x0f;
1647 hypercall[1] = 0x01;
1648 hypercall[2] = 0xd9;
1649}
1650
1651static void svm_check_processor_compat(void *rtn)
1652{
1653 *(int *)rtn = 0;
1654}
1655
1656static bool svm_cpu_has_accelerated_tpr(void)
1657{
1658 return false;
1659}
1660
1661static struct kvm_x86_ops svm_x86_ops = {
1662 .cpu_has_kvm_support = has_svm,
1663 .disabled_by_bios = is_disabled,
1664 .hardware_setup = svm_hardware_setup,
1665 .hardware_unsetup = svm_hardware_unsetup,
1666 .check_processor_compatibility = svm_check_processor_compat,
1667 .hardware_enable = svm_hardware_enable,
1668 .hardware_disable = svm_hardware_disable,
1669 .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
1670
1671 .vcpu_create = svm_create_vcpu,
1672 .vcpu_free = svm_free_vcpu,
1673 .vcpu_reset = svm_vcpu_reset,
1674
1675 .prepare_guest_switch = svm_prepare_guest_switch,
1676 .vcpu_load = svm_vcpu_load,
1677 .vcpu_put = svm_vcpu_put,
1678 .vcpu_decache = svm_vcpu_decache,
1679
1680 .set_guest_debug = svm_guest_debug,
1681 .get_msr = svm_get_msr,
1682 .set_msr = svm_set_msr,
1683 .get_segment_base = svm_get_segment_base,
1684 .get_segment = svm_get_segment,
1685 .set_segment = svm_set_segment,
1686 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
1687 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
1688 .set_cr0 = svm_set_cr0,
1689 .set_cr3 = svm_set_cr3,
1690 .set_cr4 = svm_set_cr4,
1691 .set_efer = svm_set_efer,
1692 .get_idt = svm_get_idt,
1693 .set_idt = svm_set_idt,
1694 .get_gdt = svm_get_gdt,
1695 .set_gdt = svm_set_gdt,
1696 .get_dr = svm_get_dr,
1697 .set_dr = svm_set_dr,
1698 .cache_regs = svm_cache_regs,
1699 .decache_regs = svm_decache_regs,
1700 .get_rflags = svm_get_rflags,
1701 .set_rflags = svm_set_rflags,
1702
1703 .tlb_flush = svm_flush_tlb,
1704
1705 .run = svm_vcpu_run,
1706 .handle_exit = handle_exit,
1707 .skip_emulated_instruction = skip_emulated_instruction,
1708 .patch_hypercall = svm_patch_hypercall,
1709 .get_irq = svm_get_irq,
1710 .set_irq = svm_set_irq,
1711 .queue_exception = svm_queue_exception,
1712 .exception_injected = svm_exception_injected,
1713 .inject_pending_irq = svm_intr_assist,
1714 .inject_pending_vectors = do_interrupt_requests,
1715
1716 .set_tss_addr = svm_set_tss_addr,
1717};
1718
1719static int __init svm_init(void)
1720{
1721 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
1722 THIS_MODULE);
1723}
1724
1725static void __exit svm_exit(void)
1726{
1727 kvm_exit();
1728}
1729
1730module_init(svm_init)
1731module_exit(svm_exit)
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h
new file mode 100644
index 00000000000..5fd50491b55
--- /dev/null
+++ b/arch/x86/kvm/svm.h
@@ -0,0 +1,325 @@
1#ifndef __SVM_H
2#define __SVM_H
3
4enum {
5 INTERCEPT_INTR,
6 INTERCEPT_NMI,
7 INTERCEPT_SMI,
8 INTERCEPT_INIT,
9 INTERCEPT_VINTR,
10 INTERCEPT_SELECTIVE_CR0,
11 INTERCEPT_STORE_IDTR,
12 INTERCEPT_STORE_GDTR,
13 INTERCEPT_STORE_LDTR,
14 INTERCEPT_STORE_TR,
15 INTERCEPT_LOAD_IDTR,
16 INTERCEPT_LOAD_GDTR,
17 INTERCEPT_LOAD_LDTR,
18 INTERCEPT_LOAD_TR,
19 INTERCEPT_RDTSC,
20 INTERCEPT_RDPMC,
21 INTERCEPT_PUSHF,
22 INTERCEPT_POPF,
23 INTERCEPT_CPUID,
24 INTERCEPT_RSM,
25 INTERCEPT_IRET,
26 INTERCEPT_INTn,
27 INTERCEPT_INVD,
28 INTERCEPT_PAUSE,
29 INTERCEPT_HLT,
30 INTERCEPT_INVLPG,
31 INTERCEPT_INVLPGA,
32 INTERCEPT_IOIO_PROT,
33 INTERCEPT_MSR_PROT,
34 INTERCEPT_TASK_SWITCH,
35 INTERCEPT_FERR_FREEZE,
36 INTERCEPT_SHUTDOWN,
37 INTERCEPT_VMRUN,
38 INTERCEPT_VMMCALL,
39 INTERCEPT_VMLOAD,
40 INTERCEPT_VMSAVE,
41 INTERCEPT_STGI,
42 INTERCEPT_CLGI,
43 INTERCEPT_SKINIT,
44 INTERCEPT_RDTSCP,
45 INTERCEPT_ICEBP,
46 INTERCEPT_WBINVD,
47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND,
50};
51
52
53struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read;
55 u16 intercept_cr_write;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions;
59 u64 intercept;
60 u8 reserved_1[44];
61 u64 iopm_base_pa;
62 u64 msrpm_base_pa;
63 u64 tsc_offset;
64 u32 asid;
65 u8 tlb_ctl;
66 u8 reserved_2[3];
67 u32 int_ctl;
68 u32 int_vector;
69 u32 int_state;
70 u8 reserved_3[4];
71 u32 exit_code;
72 u32 exit_code_hi;
73 u64 exit_info_1;
74 u64 exit_info_2;
75 u32 exit_int_info;
76 u32 exit_int_info_err;
77 u64 nested_ctl;
78 u8 reserved_4[16];
79 u32 event_inj;
80 u32 event_inj_err;
81 u64 nested_cr3;
82 u64 lbr_ctl;
83 u8 reserved_5[832];
84};
85
86
87#define TLB_CONTROL_DO_NOTHING 0
88#define TLB_CONTROL_FLUSH_ALL_ASID 1
89
90#define V_TPR_MASK 0x0f
91
92#define V_IRQ_SHIFT 8
93#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
94
95#define V_INTR_PRIO_SHIFT 16
96#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
97
98#define V_IGN_TPR_SHIFT 20
99#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
100
101#define V_INTR_MASKING_SHIFT 24
102#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
103
104#define SVM_INTERRUPT_SHADOW_MASK 1
105
106#define SVM_IOIO_STR_SHIFT 2
107#define SVM_IOIO_REP_SHIFT 3
108#define SVM_IOIO_SIZE_SHIFT 4
109#define SVM_IOIO_ASIZE_SHIFT 7
110
111#define SVM_IOIO_TYPE_MASK 1
112#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
113#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
114#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
115#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
116
117struct __attribute__ ((__packed__)) vmcb_seg {
118 u16 selector;
119 u16 attrib;
120 u32 limit;
121 u64 base;
122};
123
124struct __attribute__ ((__packed__)) vmcb_save_area {
125 struct vmcb_seg es;
126 struct vmcb_seg cs;
127 struct vmcb_seg ss;
128 struct vmcb_seg ds;
129 struct vmcb_seg fs;
130 struct vmcb_seg gs;
131 struct vmcb_seg gdtr;
132 struct vmcb_seg ldtr;
133 struct vmcb_seg idtr;
134 struct vmcb_seg tr;
135 u8 reserved_1[43];
136 u8 cpl;
137 u8 reserved_2[4];
138 u64 efer;
139 u8 reserved_3[112];
140 u64 cr4;
141 u64 cr3;
142 u64 cr0;
143 u64 dr7;
144 u64 dr6;
145 u64 rflags;
146 u64 rip;
147 u8 reserved_4[88];
148 u64 rsp;
149 u8 reserved_5[24];
150 u64 rax;
151 u64 star;
152 u64 lstar;
153 u64 cstar;
154 u64 sfmask;
155 u64 kernel_gs_base;
156 u64 sysenter_cs;
157 u64 sysenter_esp;
158 u64 sysenter_eip;
159 u64 cr2;
160 u8 reserved_6[32];
161 u64 g_pat;
162 u64 dbgctl;
163 u64 br_from;
164 u64 br_to;
165 u64 last_excp_from;
166 u64 last_excp_to;
167};
168
169struct __attribute__ ((__packed__)) vmcb {
170 struct vmcb_control_area control;
171 struct vmcb_save_area save;
172};
173
174#define SVM_CPUID_FEATURE_SHIFT 2
175#define SVM_CPUID_FUNC 0x8000000a
176
177#define MSR_EFER_SVME_MASK (1ULL << 12)
178#define MSR_VM_CR 0xc0010114
179#define MSR_VM_HSAVE_PA 0xc0010117ULL
180
181#define SVM_VM_CR_SVM_DISABLE 4
182
183#define SVM_SELECTOR_S_SHIFT 4
184#define SVM_SELECTOR_DPL_SHIFT 5
185#define SVM_SELECTOR_P_SHIFT 7
186#define SVM_SELECTOR_AVL_SHIFT 8
187#define SVM_SELECTOR_L_SHIFT 9
188#define SVM_SELECTOR_DB_SHIFT 10
189#define SVM_SELECTOR_G_SHIFT 11
190
191#define SVM_SELECTOR_TYPE_MASK (0xf)
192#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
193#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
194#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
195#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
196#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
197#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
198#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
199
200#define SVM_SELECTOR_WRITE_MASK (1 << 1)
201#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
202#define SVM_SELECTOR_CODE_MASK (1 << 3)
203
204#define INTERCEPT_CR0_MASK 1
205#define INTERCEPT_CR3_MASK (1 << 3)
206#define INTERCEPT_CR4_MASK (1 << 4)
207#define INTERCEPT_CR8_MASK (1 << 8)
208
209#define INTERCEPT_DR0_MASK 1
210#define INTERCEPT_DR1_MASK (1 << 1)
211#define INTERCEPT_DR2_MASK (1 << 2)
212#define INTERCEPT_DR3_MASK (1 << 3)
213#define INTERCEPT_DR4_MASK (1 << 4)
214#define INTERCEPT_DR5_MASK (1 << 5)
215#define INTERCEPT_DR6_MASK (1 << 6)
216#define INTERCEPT_DR7_MASK (1 << 7)
217
218#define SVM_EVTINJ_VEC_MASK 0xff
219
220#define SVM_EVTINJ_TYPE_SHIFT 8
221#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
222
223#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
224#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
225#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
226#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
227
228#define SVM_EVTINJ_VALID (1 << 31)
229#define SVM_EVTINJ_VALID_ERR (1 << 11)
230
231#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
232
233#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
234#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
235#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
236#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
237
238#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
239#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
240
241#define SVM_EXIT_READ_CR0 0x000
242#define SVM_EXIT_READ_CR3 0x003
243#define SVM_EXIT_READ_CR4 0x004
244#define SVM_EXIT_READ_CR8 0x008
245#define SVM_EXIT_WRITE_CR0 0x010
246#define SVM_EXIT_WRITE_CR3 0x013
247#define SVM_EXIT_WRITE_CR4 0x014
248#define SVM_EXIT_WRITE_CR8 0x018
249#define SVM_EXIT_READ_DR0 0x020
250#define SVM_EXIT_READ_DR1 0x021
251#define SVM_EXIT_READ_DR2 0x022
252#define SVM_EXIT_READ_DR3 0x023
253#define SVM_EXIT_READ_DR4 0x024
254#define SVM_EXIT_READ_DR5 0x025
255#define SVM_EXIT_READ_DR6 0x026
256#define SVM_EXIT_READ_DR7 0x027
257#define SVM_EXIT_WRITE_DR0 0x030
258#define SVM_EXIT_WRITE_DR1 0x031
259#define SVM_EXIT_WRITE_DR2 0x032
260#define SVM_EXIT_WRITE_DR3 0x033
261#define SVM_EXIT_WRITE_DR4 0x034
262#define SVM_EXIT_WRITE_DR5 0x035
263#define SVM_EXIT_WRITE_DR6 0x036
264#define SVM_EXIT_WRITE_DR7 0x037
265#define SVM_EXIT_EXCP_BASE 0x040
266#define SVM_EXIT_INTR 0x060
267#define SVM_EXIT_NMI 0x061
268#define SVM_EXIT_SMI 0x062
269#define SVM_EXIT_INIT 0x063
270#define SVM_EXIT_VINTR 0x064
271#define SVM_EXIT_CR0_SEL_WRITE 0x065
272#define SVM_EXIT_IDTR_READ 0x066
273#define SVM_EXIT_GDTR_READ 0x067
274#define SVM_EXIT_LDTR_READ 0x068
275#define SVM_EXIT_TR_READ 0x069
276#define SVM_EXIT_IDTR_WRITE 0x06a
277#define SVM_EXIT_GDTR_WRITE 0x06b
278#define SVM_EXIT_LDTR_WRITE 0x06c
279#define SVM_EXIT_TR_WRITE 0x06d
280#define SVM_EXIT_RDTSC 0x06e
281#define SVM_EXIT_RDPMC 0x06f
282#define SVM_EXIT_PUSHF 0x070
283#define SVM_EXIT_POPF 0x071
284#define SVM_EXIT_CPUID 0x072
285#define SVM_EXIT_RSM 0x073
286#define SVM_EXIT_IRET 0x074
287#define SVM_EXIT_SWINT 0x075
288#define SVM_EXIT_INVD 0x076
289#define SVM_EXIT_PAUSE 0x077
290#define SVM_EXIT_HLT 0x078
291#define SVM_EXIT_INVLPG 0x079
292#define SVM_EXIT_INVLPGA 0x07a
293#define SVM_EXIT_IOIO 0x07b
294#define SVM_EXIT_MSR 0x07c
295#define SVM_EXIT_TASK_SWITCH 0x07d
296#define SVM_EXIT_FERR_FREEZE 0x07e
297#define SVM_EXIT_SHUTDOWN 0x07f
298#define SVM_EXIT_VMRUN 0x080
299#define SVM_EXIT_VMMCALL 0x081
300#define SVM_EXIT_VMLOAD 0x082
301#define SVM_EXIT_VMSAVE 0x083
302#define SVM_EXIT_STGI 0x084
303#define SVM_EXIT_CLGI 0x085
304#define SVM_EXIT_SKINIT 0x086
305#define SVM_EXIT_RDTSCP 0x087
306#define SVM_EXIT_ICEBP 0x088
307#define SVM_EXIT_WBINVD 0x089
308#define SVM_EXIT_MONITOR 0x08a
309#define SVM_EXIT_MWAIT 0x08b
310#define SVM_EXIT_MWAIT_COND 0x08c
311#define SVM_EXIT_NPF 0x400
312
313#define SVM_EXIT_ERR -1
314
315#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
316
317#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
318#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
319#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
320#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
321#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
322#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
323
324#endif
325
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
new file mode 100644
index 00000000000..ad36447e696
--- /dev/null
+++ b/arch/x86/kvm/vmx.c
@@ -0,0 +1,2679 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "irq.h"
19#include "vmx.h"
20#include "segment_descriptor.h"
21#include "mmu.h"
22
23#include <linux/kvm_host.h>
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/sched.h>
29#include <linux/moduleparam.h>
30
31#include <asm/io.h>
32#include <asm/desc.h>
33
34MODULE_AUTHOR("Qumranet");
35MODULE_LICENSE("GPL");
36
37static int bypass_guest_pf = 1;
38module_param(bypass_guest_pf, bool, 0);
39
40struct vmcs {
41 u32 revision_id;
42 u32 abort;
43 char data[0];
44};
45
46struct vcpu_vmx {
47 struct kvm_vcpu vcpu;
48 int launched;
49 u8 fail;
50 u32 idt_vectoring_info;
51 struct kvm_msr_entry *guest_msrs;
52 struct kvm_msr_entry *host_msrs;
53 int nmsrs;
54 int save_nmsrs;
55 int msr_offset_efer;
56#ifdef CONFIG_X86_64
57 int msr_offset_kernel_gs_base;
58#endif
59 struct vmcs *vmcs;
60 struct {
61 int loaded;
62 u16 fs_sel, gs_sel, ldt_sel;
63 int gs_ldt_reload_needed;
64 int fs_reload_needed;
65 int guest_efer_loaded;
66 } host_state;
67 struct {
68 struct {
69 bool pending;
70 u8 vector;
71 unsigned rip;
72 } irq;
73 } rmode;
74};
75
76static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
77{
78 return container_of(vcpu, struct vcpu_vmx, vcpu);
79}
80
81static int init_rmode_tss(struct kvm *kvm);
82
83static DEFINE_PER_CPU(struct vmcs *, vmxarea);
84static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
85
86static struct page *vmx_io_bitmap_a;
87static struct page *vmx_io_bitmap_b;
88
89static struct vmcs_config {
90 int size;
91 int order;
92 u32 revision_id;
93 u32 pin_based_exec_ctrl;
94 u32 cpu_based_exec_ctrl;
95 u32 cpu_based_2nd_exec_ctrl;
96 u32 vmexit_ctrl;
97 u32 vmentry_ctrl;
98} vmcs_config;
99
100#define VMX_SEGMENT_FIELD(seg) \
101 [VCPU_SREG_##seg] = { \
102 .selector = GUEST_##seg##_SELECTOR, \
103 .base = GUEST_##seg##_BASE, \
104 .limit = GUEST_##seg##_LIMIT, \
105 .ar_bytes = GUEST_##seg##_AR_BYTES, \
106 }
107
108static struct kvm_vmx_segment_field {
109 unsigned selector;
110 unsigned base;
111 unsigned limit;
112 unsigned ar_bytes;
113} kvm_vmx_segment_fields[] = {
114 VMX_SEGMENT_FIELD(CS),
115 VMX_SEGMENT_FIELD(DS),
116 VMX_SEGMENT_FIELD(ES),
117 VMX_SEGMENT_FIELD(FS),
118 VMX_SEGMENT_FIELD(GS),
119 VMX_SEGMENT_FIELD(SS),
120 VMX_SEGMENT_FIELD(TR),
121 VMX_SEGMENT_FIELD(LDTR),
122};
123
124/*
125 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
126 * away by decrementing the array size.
127 */
128static const u32 vmx_msr_index[] = {
129#ifdef CONFIG_X86_64
130 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
131#endif
132 MSR_EFER, MSR_K6_STAR,
133};
134#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
135
136static void load_msrs(struct kvm_msr_entry *e, int n)
137{
138 int i;
139
140 for (i = 0; i < n; ++i)
141 wrmsrl(e[i].index, e[i].data);
142}
143
144static void save_msrs(struct kvm_msr_entry *e, int n)
145{
146 int i;
147
148 for (i = 0; i < n; ++i)
149 rdmsrl(e[i].index, e[i].data);
150}
151
152static inline int is_page_fault(u32 intr_info)
153{
154 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
155 INTR_INFO_VALID_MASK)) ==
156 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
157}
158
159static inline int is_no_device(u32 intr_info)
160{
161 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
162 INTR_INFO_VALID_MASK)) ==
163 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
164}
165
166static inline int is_invalid_opcode(u32 intr_info)
167{
168 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
169 INTR_INFO_VALID_MASK)) ==
170 (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
171}
172
173static inline int is_external_interrupt(u32 intr_info)
174{
175 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
176 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
177}
178
179static inline int cpu_has_vmx_tpr_shadow(void)
180{
181 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
182}
183
184static inline int vm_need_tpr_shadow(struct kvm *kvm)
185{
186 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
187}
188
189static inline int cpu_has_secondary_exec_ctrls(void)
190{
191 return (vmcs_config.cpu_based_exec_ctrl &
192 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
193}
194
195static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
196{
197 return (vmcs_config.cpu_based_2nd_exec_ctrl &
198 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
199}
200
201static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
202{
203 return ((cpu_has_vmx_virtualize_apic_accesses()) &&
204 (irqchip_in_kernel(kvm)));
205}
206
207static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
208{
209 int i;
210
211 for (i = 0; i < vmx->nmsrs; ++i)
212 if (vmx->guest_msrs[i].index == msr)
213 return i;
214 return -1;
215}
216
217static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
218{
219 int i;
220
221 i = __find_msr_index(vmx, msr);
222 if (i >= 0)
223 return &vmx->guest_msrs[i];
224 return NULL;
225}
226
227static void vmcs_clear(struct vmcs *vmcs)
228{
229 u64 phys_addr = __pa(vmcs);
230 u8 error;
231
232 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
233 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
234 : "cc", "memory");
235 if (error)
236 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
237 vmcs, phys_addr);
238}
239
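/*
 * Runs on the cpu that last used this VMCS: VMCLEAR it and drop the
 * per-cpu current_vmcs pointer so the VMCS can be loaded elsewhere.
 */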
240static void __vcpu_clear(void *arg)
241{
242 struct vcpu_vmx *vmx = arg;
243 int cpu = raw_smp_processor_id();
244
245 if (vmx->vcpu.cpu == cpu)
246 vmcs_clear(vmx->vmcs);
247 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
248 per_cpu(current_vmcs, cpu) = NULL;
249 rdtscll(vmx->vcpu.arch.host_tsc);
250}
251
252static void vcpu_clear(struct vcpu_vmx *vmx)
253{
254 if (vmx->vcpu.cpu == -1)
255 return;
256 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
257 vmx->launched = 0;
258}
259
260static unsigned long vmcs_readl(unsigned long field)
261{
262 unsigned long value;
263
264 asm volatile (ASM_VMX_VMREAD_RDX_RAX
265 : "=a"(value) : "d"(field) : "cc");
266 return value;
267}
268
269static u16 vmcs_read16(unsigned long field)
270{
271 return vmcs_readl(field);
272}
273
274static u32 vmcs_read32(unsigned long field)
275{
276 return vmcs_readl(field);
277}
278
279static u64 vmcs_read64(unsigned long field)
280{
281#ifdef CONFIG_X86_64
282 return vmcs_readl(field);
283#else
284 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
285#endif
286}
287
288static noinline void vmwrite_error(unsigned long field, unsigned long value)
289{
290 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
291 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
292 dump_stack();
293}
294
295static void vmcs_writel(unsigned long field, unsigned long value)
296{
297 u8 error;
298
299 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
300 : "=q"(error) : "a"(value), "d"(field) : "cc");
301 if (unlikely(error))
302 vmwrite_error(field, value);
303}
304
305static void vmcs_write16(unsigned long field, u16 value)
306{
307 vmcs_writel(field, value);
308}
309
310static void vmcs_write32(unsigned long field, u32 value)
311{
312 vmcs_writel(field, value);
313}
314
315static void vmcs_write64(unsigned long field, u64 value)
316{
317#ifdef CONFIG_X86_64
318 vmcs_writel(field, value);
319#else
320 vmcs_writel(field, value);
321 asm volatile ("");
322 vmcs_writel(field+1, value >> 32);
323#endif
324}
325
326static void vmcs_clear_bits(unsigned long field, u32 mask)
327{
328 vmcs_writel(field, vmcs_readl(field) & ~mask);
329}
330
331static void vmcs_set_bits(unsigned long field, u32 mask)
332{
333 vmcs_writel(field, vmcs_readl(field) | mask);
334}
335
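/*
 * Recompute which exceptions must trap to the host: always #PF and #UD,
 * plus #NM while the FPU is lazily disabled, #DB when guest debugging is
 * enabled, and everything while real-mode emulation is active.
 */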
336static void update_exception_bitmap(struct kvm_vcpu *vcpu)
337{
338 u32 eb;
339
340 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
341 if (!vcpu->fpu_active)
342 eb |= 1u << NM_VECTOR;
343 if (vcpu->guest_debug.enabled)
344 eb |= 1u << 1;
345 if (vcpu->arch.rmode.active)
346 eb = ~0;
347 vmcs_write32(EXCEPTION_BITMAP, eb);
348}
349
350static void reload_tss(void)
351{
352#ifndef CONFIG_X86_64
353
354 /*
355 * VT restores TR but not its size. Useless.
356 */
357 struct descriptor_table gdt;
358 struct segment_descriptor *descs;
359
360 get_gdt(&gdt);
361 descs = (void *)gdt.base;
362 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
363 load_TR_desc();
364#endif
365}
366
367static void load_transition_efer(struct vcpu_vmx *vmx)
368{
369 int efer_offset = vmx->msr_offset_efer;
370 u64 host_efer = vmx->host_msrs[efer_offset].data;
371 u64 guest_efer = vmx->guest_msrs[efer_offset].data;
372 u64 ignore_bits;
373
374 if (efer_offset < 0)
375 return;
376 /*
377	 * NX is emulated; LMA and LME handled by hardware; SCE is meaningless
378 * outside long mode
379 */
380 ignore_bits = EFER_NX | EFER_SCE;
381#ifdef CONFIG_X86_64
382 ignore_bits |= EFER_LMA | EFER_LME;
383 /* SCE is meaningful only in long mode on Intel */
384 if (guest_efer & EFER_LMA)
385 ignore_bits &= ~(u64)EFER_SCE;
386#endif
387 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
388 return;
389
390 vmx->host_state.guest_efer_loaded = 1;
391 guest_efer &= ~ignore_bits;
392 guest_efer |= host_efer & ignore_bits;
393 wrmsrl(MSR_EFER, guest_efer);
394 vmx->vcpu.stat.efer_reload++;
395}
396
397static void reload_host_efer(struct vcpu_vmx *vmx)
398{
399 if (vmx->host_state.guest_efer_loaded) {
400 vmx->host_state.guest_efer_loaded = 0;
401 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
402 }
403}
404
405static void vmx_save_host_state(struct kvm_vcpu *vcpu)
406{
407 struct vcpu_vmx *vmx = to_vmx(vcpu);
408
409 if (vmx->host_state.loaded)
410 return;
411
412 vmx->host_state.loaded = 1;
413 /*
414 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
415 * allow segment selectors with cpl > 0 or ti == 1.
416 */
417 vmx->host_state.ldt_sel = read_ldt();
418 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
419 vmx->host_state.fs_sel = read_fs();
420 if (!(vmx->host_state.fs_sel & 7)) {
421 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
422 vmx->host_state.fs_reload_needed = 0;
423 } else {
424 vmcs_write16(HOST_FS_SELECTOR, 0);
425 vmx->host_state.fs_reload_needed = 1;
426 }
427 vmx->host_state.gs_sel = read_gs();
428 if (!(vmx->host_state.gs_sel & 7))
429 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
430 else {
431 vmcs_write16(HOST_GS_SELECTOR, 0);
432 vmx->host_state.gs_ldt_reload_needed = 1;
433 }
434
435#ifdef CONFIG_X86_64
436 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
437 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
438#else
439 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
440 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
441#endif
442
443#ifdef CONFIG_X86_64
444 if (is_long_mode(&vmx->vcpu))
445 save_msrs(vmx->host_msrs +
446 vmx->msr_offset_kernel_gs_base, 1);
447
448#endif
449 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
450 load_transition_efer(vmx);
451}
452
453static void vmx_load_host_state(struct vcpu_vmx *vmx)
454{
455 unsigned long flags;
456
457 if (!vmx->host_state.loaded)
458 return;
459
460 ++vmx->vcpu.stat.host_state_reload;
461 vmx->host_state.loaded = 0;
462 if (vmx->host_state.fs_reload_needed)
463 load_fs(vmx->host_state.fs_sel);
464 if (vmx->host_state.gs_ldt_reload_needed) {
465 load_ldt(vmx->host_state.ldt_sel);
466 /*
467 * If we have to reload gs, we must take care to
468 * preserve our gs base.
469 */
470 local_irq_save(flags);
471 load_gs(vmx->host_state.gs_sel);
472#ifdef CONFIG_X86_64
473 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
474#endif
475 local_irq_restore(flags);
476 }
477 reload_tss();
478 save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
479 load_msrs(vmx->host_msrs, vmx->save_nmsrs);
480 reload_host_efer(vmx);
481}
482
483/*
484 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
485 * vcpu mutex is already taken.
486 */
487static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
488{
489 struct vcpu_vmx *vmx = to_vmx(vcpu);
490 u64 phys_addr = __pa(vmx->vmcs);
491 u64 tsc_this, delta;
492
493 if (vcpu->cpu != cpu) {
494 vcpu_clear(vmx);
495 kvm_migrate_apic_timer(vcpu);
496 }
497
498 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
499 u8 error;
500
501 per_cpu(current_vmcs, cpu) = vmx->vmcs;
502 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
503 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
504 : "cc");
505 if (error)
506 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
507 vmx->vmcs, phys_addr);
508 }
509
510 if (vcpu->cpu != cpu) {
511 struct descriptor_table dt;
512 unsigned long sysenter_esp;
513
514 vcpu->cpu = cpu;
515 /*
516 * Linux uses per-cpu TSS and GDT, so set these when switching
517 * processors.
518 */
519 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
520 get_gdt(&dt);
521 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
522
523 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
524 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
525
526 /*
527		 * Make sure the time stamp counter is monotonic.
528 */
529 rdtscll(tsc_this);
530 delta = vcpu->arch.host_tsc - tsc_this;
531 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
532 }
533}
534
535static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
536{
537 vmx_load_host_state(to_vmx(vcpu));
538}
539
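/*
 * Lazy FPU switching, VMX side: give the FPU to the guest by clearing TS
 * in GUEST_CR0 (unless the guest itself set CR0.TS) and letting
 * update_exception_bitmap() drop the #NM intercept; vmx_fpu_deactivate()
 * does the reverse.
 */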
540static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
541{
542 if (vcpu->fpu_active)
543 return;
544 vcpu->fpu_active = 1;
545 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
546 if (vcpu->arch.cr0 & X86_CR0_TS)
547 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
548 update_exception_bitmap(vcpu);
549}
550
551static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
552{
553 if (!vcpu->fpu_active)
554 return;
555 vcpu->fpu_active = 0;
556 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
557 update_exception_bitmap(vcpu);
558}
559
560static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
561{
562 vcpu_clear(to_vmx(vcpu));
563}
564
565static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
566{
567 return vmcs_readl(GUEST_RFLAGS);
568}
569
570static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
571{
572 if (vcpu->arch.rmode.active)
573 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
574 vmcs_writel(GUEST_RFLAGS, rflags);
575}
576
577static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
578{
579 unsigned long rip;
580 u32 interruptibility;
581
582 rip = vmcs_readl(GUEST_RIP);
583 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
584 vmcs_writel(GUEST_RIP, rip);
585
586 /*
587 * We emulated an instruction, so temporary interrupt blocking
588 * should be removed, if set.
589 */
590 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
591 if (interruptibility & 3)
592 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
593 interruptibility & ~3);
594 vcpu->arch.interrupt_window_open = 1;
595}
596
597static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
598 bool has_error_code, u32 error_code)
599{
600 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
601 nr | INTR_TYPE_EXCEPTION
602 | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
603 | INTR_INFO_VALID_MASK);
604 if (has_error_code)
605 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
606}
607
608static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
609{
610 struct vcpu_vmx *vmx = to_vmx(vcpu);
611
612 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
613}
614
615/*
616 * Swap MSR entry in host/guest MSR entry array.
617 */
618#ifdef CONFIG_X86_64
619static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
620{
621 struct kvm_msr_entry tmp;
622
623 tmp = vmx->guest_msrs[to];
624 vmx->guest_msrs[to] = vmx->guest_msrs[from];
625 vmx->guest_msrs[from] = tmp;
626 tmp = vmx->host_msrs[to];
627 vmx->host_msrs[to] = vmx->host_msrs[from];
628 vmx->host_msrs[from] = tmp;
629}
630#endif
631
632/*
633 * Set up the vmcs to automatically save and restore system
634 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
635 * mode, as fiddling with msrs is very expensive.
636 */
637static void setup_msrs(struct vcpu_vmx *vmx)
638{
639 int save_nmsrs;
640
641 save_nmsrs = 0;
642#ifdef CONFIG_X86_64
643 if (is_long_mode(&vmx->vcpu)) {
644 int index;
645
646 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
647 if (index >= 0)
648 move_msr_up(vmx, index, save_nmsrs++);
649 index = __find_msr_index(vmx, MSR_LSTAR);
650 if (index >= 0)
651 move_msr_up(vmx, index, save_nmsrs++);
652 index = __find_msr_index(vmx, MSR_CSTAR);
653 if (index >= 0)
654 move_msr_up(vmx, index, save_nmsrs++);
655 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
656 if (index >= 0)
657 move_msr_up(vmx, index, save_nmsrs++);
658 /*
659 * MSR_K6_STAR is only needed on long mode guests, and only
660 * if efer.sce is enabled.
661 */
662 index = __find_msr_index(vmx, MSR_K6_STAR);
663 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
664 move_msr_up(vmx, index, save_nmsrs++);
665 }
666#endif
667 vmx->save_nmsrs = save_nmsrs;
668
669#ifdef CONFIG_X86_64
670 vmx->msr_offset_kernel_gs_base =
671 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
672#endif
673 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
674}
675
676/*
677 * reads and returns guest's timestamp counter "register"
678 * guest_tsc = host_tsc + tsc_offset -- 21.3
679 */
680static u64 guest_read_tsc(void)
681{
682 u64 host_tsc, tsc_offset;
683
684 rdtscll(host_tsc);
685 tsc_offset = vmcs_read64(TSC_OFFSET);
686 return host_tsc + tsc_offset;
687}
688
689/*
690 * writes 'guest_tsc' into guest's timestamp counter "register"
691 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
692 */
693static void guest_write_tsc(u64 guest_tsc)
694{
695 u64 host_tsc;
696
697 rdtscll(host_tsc);
698 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
699}
700
701/*
702 * Reads an msr value (of 'msr_index') into 'pdata'.
703 * Returns 0 on success, non-0 otherwise.
704 * Assumes vcpu_load() was already called.
705 */
706static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
707{
708 u64 data;
709 struct kvm_msr_entry *msr;
710
711 if (!pdata) {
712 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
713 return -EINVAL;
714 }
715
716 switch (msr_index) {
717#ifdef CONFIG_X86_64
718 case MSR_FS_BASE:
719 data = vmcs_readl(GUEST_FS_BASE);
720 break;
721 case MSR_GS_BASE:
722 data = vmcs_readl(GUEST_GS_BASE);
723 break;
724 case MSR_EFER:
725 return kvm_get_msr_common(vcpu, msr_index, pdata);
726#endif
727 case MSR_IA32_TIME_STAMP_COUNTER:
728 data = guest_read_tsc();
729 break;
730 case MSR_IA32_SYSENTER_CS:
731 data = vmcs_read32(GUEST_SYSENTER_CS);
732 break;
733 case MSR_IA32_SYSENTER_EIP:
734 data = vmcs_readl(GUEST_SYSENTER_EIP);
735 break;
736 case MSR_IA32_SYSENTER_ESP:
737 data = vmcs_readl(GUEST_SYSENTER_ESP);
738 break;
739 default:
740 msr = find_msr_entry(to_vmx(vcpu), msr_index);
741 if (msr) {
742 data = msr->data;
743 break;
744 }
745 return kvm_get_msr_common(vcpu, msr_index, pdata);
746 }
747
748 *pdata = data;
749 return 0;
750}
751
752/*
753 * Writes msr value into the appropriate "register".
754 * Returns 0 on success, non-0 otherwise.
755 * Assumes vcpu_load() was already called.
756 */
757static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
758{
759 struct vcpu_vmx *vmx = to_vmx(vcpu);
760 struct kvm_msr_entry *msr;
761 int ret = 0;
762
763 switch (msr_index) {
764#ifdef CONFIG_X86_64
765 case MSR_EFER:
766 ret = kvm_set_msr_common(vcpu, msr_index, data);
767 if (vmx->host_state.loaded) {
768 reload_host_efer(vmx);
769 load_transition_efer(vmx);
770 }
771 break;
772 case MSR_FS_BASE:
773 vmcs_writel(GUEST_FS_BASE, data);
774 break;
775 case MSR_GS_BASE:
776 vmcs_writel(GUEST_GS_BASE, data);
777 break;
778#endif
779 case MSR_IA32_SYSENTER_CS:
780 vmcs_write32(GUEST_SYSENTER_CS, data);
781 break;
782 case MSR_IA32_SYSENTER_EIP:
783 vmcs_writel(GUEST_SYSENTER_EIP, data);
784 break;
785 case MSR_IA32_SYSENTER_ESP:
786 vmcs_writel(GUEST_SYSENTER_ESP, data);
787 break;
788 case MSR_IA32_TIME_STAMP_COUNTER:
789 guest_write_tsc(data);
790 break;
791 default:
792 msr = find_msr_entry(vmx, msr_index);
793 if (msr) {
794 msr->data = data;
795 if (vmx->host_state.loaded)
796 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
797 break;
798 }
799 ret = kvm_set_msr_common(vcpu, msr_index, data);
800 }
801
802 return ret;
803}
804
805/*
806 * Sync the rsp and rip registers into the vcpu structure. This allows
807 * registers to be accessed by indexing vcpu->arch.regs.
808 */
809static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
810{
811 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
812 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
813}
814
815/*
816 * Syncs rsp and rip back into the vmcs. Should be called after possible
817 * modification.
818 */
819static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
820{
821 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
822 vmcs_writel(GUEST_RIP, vcpu->arch.rip);
823}
824
825static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
826{
827 unsigned long dr7 = 0x400;
828 int old_singlestep;
829
830 old_singlestep = vcpu->guest_debug.singlestep;
831
832 vcpu->guest_debug.enabled = dbg->enabled;
833 if (vcpu->guest_debug.enabled) {
834 int i;
835
836 dr7 |= 0x200; /* exact */
837 for (i = 0; i < 4; ++i) {
838 if (!dbg->breakpoints[i].enabled)
839 continue;
840 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
841 dr7 |= 2 << (i*2); /* global enable */
842 dr7 |= 0 << (i*4+16); /* execution breakpoint */
843 }
844
845 vcpu->guest_debug.singlestep = dbg->singlestep;
846 } else
847 vcpu->guest_debug.singlestep = 0;
848
849 if (old_singlestep && !vcpu->guest_debug.singlestep) {
850 unsigned long flags;
851
852 flags = vmcs_readl(GUEST_RFLAGS);
853 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
854 vmcs_writel(GUEST_RFLAGS, flags);
855 }
856
857 update_exception_bitmap(vcpu);
858 vmcs_writel(GUEST_DR7, dr7);
859
860 return 0;
861}
862
863static int vmx_get_irq(struct kvm_vcpu *vcpu)
864{
865 struct vcpu_vmx *vmx = to_vmx(vcpu);
866 u32 idtv_info_field;
867
868 idtv_info_field = vmx->idt_vectoring_info;
869 if (idtv_info_field & INTR_INFO_VALID_MASK) {
870 if (is_external_interrupt(idtv_info_field))
871 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
872 else
873 printk(KERN_DEBUG "pending exception: not handled yet\n");
874 }
875 return -1;
876}
877
878static __init int cpu_has_kvm_support(void)
879{
880 unsigned long ecx = cpuid_ecx(1);
881 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
882}
883
884static __init int vmx_disabled_by_bios(void)
885{
886 u64 msr;
887
888 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
889 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
890 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
891 == MSR_IA32_FEATURE_CONTROL_LOCKED;
892 /* locked but not enabled */
893}
894
895static void hardware_enable(void *garbage)
896{
897 int cpu = raw_smp_processor_id();
898 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
899 u64 old;
900
901 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
902 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
903 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
904 != (MSR_IA32_FEATURE_CONTROL_LOCKED |
905 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
906 /* enable and lock */
907 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
908 MSR_IA32_FEATURE_CONTROL_LOCKED |
909 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
910 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
911 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
912 : "memory", "cc");
913}
914
915static void hardware_disable(void *garbage)
916{
917 asm volatile (ASM_VMX_VMXOFF : : : "cc");
918}
919
920static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
921 u32 msr, u32 *result)
922{
923 u32 vmx_msr_low, vmx_msr_high;
924 u32 ctl = ctl_min | ctl_opt;
925
926 rdmsr(msr, vmx_msr_low, vmx_msr_high);
927
928 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
929 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
930
931 /* Ensure minimum (required) set of control bits are supported. */
932 if (ctl_min & ~ctl)
933 return -EIO;
934
935 *result = ctl;
936 return 0;
937}
938
939static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
940{
941 u32 vmx_msr_low, vmx_msr_high;
942 u32 min, opt;
943 u32 _pin_based_exec_control = 0;
944 u32 _cpu_based_exec_control = 0;
945 u32 _cpu_based_2nd_exec_control = 0;
946 u32 _vmexit_control = 0;
947 u32 _vmentry_control = 0;
948
949 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
950 opt = 0;
951 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
952 &_pin_based_exec_control) < 0)
953 return -EIO;
954
955 min = CPU_BASED_HLT_EXITING |
956#ifdef CONFIG_X86_64
957 CPU_BASED_CR8_LOAD_EXITING |
958 CPU_BASED_CR8_STORE_EXITING |
959#endif
960 CPU_BASED_USE_IO_BITMAPS |
961 CPU_BASED_MOV_DR_EXITING |
962 CPU_BASED_USE_TSC_OFFSETING;
963 opt = CPU_BASED_TPR_SHADOW |
964 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
965 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
966 &_cpu_based_exec_control) < 0)
967 return -EIO;
968#ifdef CONFIG_X86_64
969 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
970 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
971 ~CPU_BASED_CR8_STORE_EXITING;
972#endif
973 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
974 min = 0;
975 opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
976 SECONDARY_EXEC_WBINVD_EXITING;
977 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
978 &_cpu_based_2nd_exec_control) < 0)
979 return -EIO;
980 }
981#ifndef CONFIG_X86_64
982 if (!(_cpu_based_2nd_exec_control &
983 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
984 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
985#endif
986
987 min = 0;
988#ifdef CONFIG_X86_64
989 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
990#endif
991 opt = 0;
992 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
993 &_vmexit_control) < 0)
994 return -EIO;
995
996 min = opt = 0;
997 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
998 &_vmentry_control) < 0)
999 return -EIO;
1000
1001 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1002
1003 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
1004 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
1005 return -EIO;
1006
1007#ifdef CONFIG_X86_64
1008 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
1009 if (vmx_msr_high & (1u<<16))
1010 return -EIO;
1011#endif
1012
1013 /* Require Write-Back (WB) memory type for VMCS accesses. */
1014 if (((vmx_msr_high >> 18) & 15) != 6)
1015 return -EIO;
1016
1017 vmcs_conf->size = vmx_msr_high & 0x1fff;
1018 vmcs_conf->order = get_order(vmcs_conf->size);
1019 vmcs_conf->revision_id = vmx_msr_low;
1020
1021 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
1022 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1023 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
1024 vmcs_conf->vmexit_ctrl = _vmexit_control;
1025 vmcs_conf->vmentry_ctrl = _vmentry_control;
1026
1027 return 0;
1028}
1029
1030static struct vmcs *alloc_vmcs_cpu(int cpu)
1031{
1032 int node = cpu_to_node(cpu);
1033 struct page *pages;
1034 struct vmcs *vmcs;
1035
1036 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
1037 if (!pages)
1038 return NULL;
1039 vmcs = page_address(pages);
1040 memset(vmcs, 0, vmcs_config.size);
1041 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
1042 return vmcs;
1043}
1044
1045static struct vmcs *alloc_vmcs(void)
1046{
1047 return alloc_vmcs_cpu(raw_smp_processor_id());
1048}
1049
1050static void free_vmcs(struct vmcs *vmcs)
1051{
1052 free_pages((unsigned long)vmcs, vmcs_config.order);
1053}
1054
1055static void free_kvm_area(void)
1056{
1057 int cpu;
1058
1059 for_each_online_cpu(cpu)
1060 free_vmcs(per_cpu(vmxarea, cpu));
1061}
1062
1063static __init int alloc_kvm_area(void)
1064{
1065 int cpu;
1066
1067 for_each_online_cpu(cpu) {
1068 struct vmcs *vmcs;
1069
1070 vmcs = alloc_vmcs_cpu(cpu);
1071 if (!vmcs) {
1072 free_kvm_area();
1073 return -ENOMEM;
1074 }
1075
1076 per_cpu(vmxarea, cpu) = vmcs;
1077 }
1078 return 0;
1079}
1080
1081static __init int hardware_setup(void)
1082{
1083 if (setup_vmcs_config(&vmcs_config) < 0)
1084 return -EIO;
1085 return alloc_kvm_area();
1086}
1087
1088static __exit void hardware_unsetup(void)
1089{
1090 free_kvm_area();
1091}
1092
1093static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1094{
1095 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1096
1097 if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
1098 vmcs_write16(sf->selector, save->selector);
1099 vmcs_writel(sf->base, save->base);
1100 vmcs_write32(sf->limit, save->limit);
1101 vmcs_write32(sf->ar_bytes, save->ar);
1102 } else {
1103 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
1104 << AR_DPL_SHIFT;
1105 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
1106 }
1107}
1108
1109static void enter_pmode(struct kvm_vcpu *vcpu)
1110{
1111 unsigned long flags;
1112
1113 vcpu->arch.rmode.active = 0;
1114
1115 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1116 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
1117 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
1118
1119 flags = vmcs_readl(GUEST_RFLAGS);
1120 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1121 flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
1122 vmcs_writel(GUEST_RFLAGS, flags);
1123
1124 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
1125 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
1126
1127 update_exception_bitmap(vcpu);
1128
1129 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1130 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1131 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1132 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1133
1134 vmcs_write16(GUEST_SS_SELECTOR, 0);
1135 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1136
1137 vmcs_write16(GUEST_CS_SELECTOR,
1138 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
1139 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1140}
1141
1142static gva_t rmode_tss_base(struct kvm *kvm)
1143{
1144 if (!kvm->arch.tss_addr) {
1145 gfn_t base_gfn = kvm->memslots[0].base_gfn +
1146 kvm->memslots[0].npages - 3;
1147 return base_gfn << PAGE_SHIFT;
1148 }
1149 return kvm->arch.tss_addr;
1150}
1151
1152static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1153{
1154 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1155
1156 save->selector = vmcs_read16(sf->selector);
1157 save->base = vmcs_readl(sf->base);
1158 save->limit = vmcs_read32(sf->limit);
1159 save->ar = vmcs_read32(sf->ar_bytes);
1160 vmcs_write16(sf->selector, save->base >> 4);
1161 vmcs_write32(sf->base, save->base & 0xfffff);
1162 vmcs_write32(sf->limit, 0xffff);
1163 vmcs_write32(sf->ar_bytes, 0xf3);
1164}
1165
1166static void enter_rmode(struct kvm_vcpu *vcpu)
1167{
1168 unsigned long flags;
1169
1170 vcpu->arch.rmode.active = 1;
1171
1172 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1173 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1174
1175 vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1176 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1177
1178 vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1179 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1180
1181 flags = vmcs_readl(GUEST_RFLAGS);
1182 vcpu->arch.rmode.save_iopl
1183 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1184
1185 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1186
1187 vmcs_writel(GUEST_RFLAGS, flags);
1188 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1189 update_exception_bitmap(vcpu);
1190
1191 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1192 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1193 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
1194
1195 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
1196 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1197 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
1198 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1199 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1200
1201 fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1202 fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1203 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1204 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1205
1206 kvm_mmu_reset_context(vcpu);
1207 init_rmode_tss(vcpu->kvm);
1208}
1209
1210#ifdef CONFIG_X86_64
1211
1212static void enter_lmode(struct kvm_vcpu *vcpu)
1213{
1214 u32 guest_tr_ar;
1215
1216 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1217 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1218 printk(KERN_DEBUG "%s: tss fixup for long mode.\n",
1219 __FUNCTION__);
1220 vmcs_write32(GUEST_TR_AR_BYTES,
1221 (guest_tr_ar & ~AR_TYPE_MASK)
1222 | AR_TYPE_BUSY_64_TSS);
1223 }
1224
1225 vcpu->arch.shadow_efer |= EFER_LMA;
1226
1227 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1228 vmcs_write32(VM_ENTRY_CONTROLS,
1229 vmcs_read32(VM_ENTRY_CONTROLS)
1230 | VM_ENTRY_IA32E_MODE);
1231}
1232
1233static void exit_lmode(struct kvm_vcpu *vcpu)
1234{
1235 vcpu->arch.shadow_efer &= ~EFER_LMA;
1236
1237 vmcs_write32(VM_ENTRY_CONTROLS,
1238 vmcs_read32(VM_ENTRY_CONTROLS)
1239 & ~VM_ENTRY_IA32E_MODE);
1240}
1241
1242#endif
1243
1244static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1245{
1246 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
1247 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1248}
1249
1250static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1251{
1252 vmx_fpu_deactivate(vcpu);
1253
1254 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
1255 enter_pmode(vcpu);
1256
1257 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
1258 enter_rmode(vcpu);
1259
1260#ifdef CONFIG_X86_64
1261 if (vcpu->arch.shadow_efer & EFER_LME) {
1262 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1263 enter_lmode(vcpu);
1264 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1265 exit_lmode(vcpu);
1266 }
1267#endif
1268
1269 vmcs_writel(CR0_READ_SHADOW, cr0);
1270 vmcs_writel(GUEST_CR0,
1271 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
1272 vcpu->arch.cr0 = cr0;
1273
1274 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1275 vmx_fpu_activate(vcpu);
1276}
1277
1278static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1279{
1280 vmcs_writel(GUEST_CR3, cr3);
1281 if (vcpu->arch.cr0 & X86_CR0_PE)
1282 vmx_fpu_deactivate(vcpu);
1283}
1284
1285static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1286{
1287 vmcs_writel(CR4_READ_SHADOW, cr4);
1288 vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
1289 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
1290 vcpu->arch.cr4 = cr4;
1291}
1292
1293#ifdef CONFIG_X86_64
1294
1295static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1296{
1297 struct vcpu_vmx *vmx = to_vmx(vcpu);
1298 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1299
1300 vcpu->arch.shadow_efer = efer;
1301 if (efer & EFER_LMA) {
1302 vmcs_write32(VM_ENTRY_CONTROLS,
1303 vmcs_read32(VM_ENTRY_CONTROLS) |
1304 VM_ENTRY_IA32E_MODE);
1305 msr->data = efer;
1306
1307 } else {
1308 vmcs_write32(VM_ENTRY_CONTROLS,
1309 vmcs_read32(VM_ENTRY_CONTROLS) &
1310 ~VM_ENTRY_IA32E_MODE);
1311
1312 msr->data = efer & ~EFER_LME;
1313 }
1314 setup_msrs(vmx);
1315}
1316
1317#endif
1318
1319static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1320{
1321 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1322
1323 return vmcs_readl(sf->base);
1324}
1325
1326static void vmx_get_segment(struct kvm_vcpu *vcpu,
1327 struct kvm_segment *var, int seg)
1328{
1329 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1330 u32 ar;
1331
1332 var->base = vmcs_readl(sf->base);
1333 var->limit = vmcs_read32(sf->limit);
1334 var->selector = vmcs_read16(sf->selector);
1335 ar = vmcs_read32(sf->ar_bytes);
1336 if (ar & AR_UNUSABLE_MASK)
1337 ar = 0;
1338 var->type = ar & 15;
1339 var->s = (ar >> 4) & 1;
1340 var->dpl = (ar >> 5) & 3;
1341 var->present = (ar >> 7) & 1;
1342 var->avl = (ar >> 12) & 1;
1343 var->l = (ar >> 13) & 1;
1344 var->db = (ar >> 14) & 1;
1345 var->g = (ar >> 15) & 1;
1346 var->unusable = (ar >> 16) & 1;
1347}
1348
1349static u32 vmx_segment_access_rights(struct kvm_segment *var)
1350{
1351 u32 ar;
1352
1353 if (var->unusable)
1354 ar = 1 << 16;
1355 else {
1356 ar = var->type & 15;
1357 ar |= (var->s & 1) << 4;
1358 ar |= (var->dpl & 3) << 5;
1359 ar |= (var->present & 1) << 7;
1360 ar |= (var->avl & 1) << 12;
1361 ar |= (var->l & 1) << 13;
1362 ar |= (var->db & 1) << 14;
1363 ar |= (var->g & 1) << 15;
1364 }
1365 if (ar == 0) /* a 0 value means unusable */
1366 ar = AR_UNUSABLE_MASK;
1367
1368 return ar;
1369}
1370
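/*
 * The packing above mirrors the VMX access-rights (AR) byte layout:
 * type in bits 3:0, S in bit 4, DPL in bits 6:5, P in bit 7, AVL in
 * bit 12, L in bit 13, D/B in bit 14, G in bit 15, and "unusable" in
 * bit 16.  For example, a present ring-0 code segment (type 0xb, S=1,
 * DPL=0, P=1) packs its low byte to 0x9b, the matching data segment to
 * 0x93, and a ring-3 read/write data segment to 0xf3 -- the same AR
 * values written as constants elsewhere in this file (enter_pmode(),
 * fix_rmode_seg(), seg_setup()).
 */
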
1371static void vmx_set_segment(struct kvm_vcpu *vcpu,
1372 struct kvm_segment *var, int seg)
1373{
1374 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1375 u32 ar;
1376
1377 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
1378 vcpu->arch.rmode.tr.selector = var->selector;
1379 vcpu->arch.rmode.tr.base = var->base;
1380 vcpu->arch.rmode.tr.limit = var->limit;
1381 vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
1382 return;
1383 }
1384 vmcs_writel(sf->base, var->base);
1385 vmcs_write32(sf->limit, var->limit);
1386 vmcs_write16(sf->selector, var->selector);
1387 if (vcpu->arch.rmode.active && var->s) {
1388 /*
1389 * Hack real-mode segments into vm86 compatibility.
1390 */
1391 if (var->base == 0xffff0000 && var->selector == 0xf000)
1392 vmcs_writel(sf->base, 0xf0000);
1393 ar = 0xf3;
1394 } else
1395 ar = vmx_segment_access_rights(var);
1396 vmcs_write32(sf->ar_bytes, ar);
1397}
1398
1399static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1400{
1401 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
1402
1403 *db = (ar >> 14) & 1;
1404 *l = (ar >> 13) & 1;
1405}
1406
1407static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1408{
1409 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
1410 dt->base = vmcs_readl(GUEST_IDTR_BASE);
1411}
1412
1413static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1414{
1415 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
1416 vmcs_writel(GUEST_IDTR_BASE, dt->base);
1417}
1418
1419static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1420{
1421 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
1422 dt->base = vmcs_readl(GUEST_GDTR_BASE);
1423}
1424
1425static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1426{
1427 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
1428 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1429}
1430
1431static int init_rmode_tss(struct kvm *kvm)
1432{
1433 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1434 u16 data = 0;
1435 int ret = 0;
1436 int r;
1437
1438 down_read(&current->mm->mmap_sem);
1439 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1440 if (r < 0)
1441 goto out;
1442 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1443 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
1444 if (r < 0)
1445 goto out;
1446 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
1447 if (r < 0)
1448 goto out;
1449 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1450 if (r < 0)
1451 goto out;
1452 data = ~0;
1453 r = kvm_write_guest_page(kvm, fn, &data,
1454 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
1455 sizeof(u8));
1456 if (r < 0)
1457 goto out;
1458
1459 ret = 1;
1460out:
1461 up_read(&current->mm->mmap_sem);
1462 return ret;
1463}
1464
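/*
 * The guest-page writes above lay out a minimal 32-bit TSS spanning the
 * three pages returned by rmode_tss_base(): the first page is cleared
 * and its I/O map base field (offset 0x66 in a 32-bit TSS) is pointed
 * just past the base TSS and interrupt-redirection bitmap (as the
 * TSS_BASE_SIZE/TSS_REDIRECTION_SIZE names suggest), the next two pages
 * are cleared for the I/O permission bitmap, and the final byte before
 * RMODE_TSS_SIZE is set to all ones, the terminator the CPU expects
 * after an I/O bitmap.
 */
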
1465static void seg_setup(int seg)
1466{
1467 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1468
1469 vmcs_write16(sf->selector, 0);
1470 vmcs_writel(sf->base, 0);
1471 vmcs_write32(sf->limit, 0xffff);
1472 vmcs_write32(sf->ar_bytes, 0x93);
1473}
1474
1475static int alloc_apic_access_page(struct kvm *kvm)
1476{
1477 struct kvm_userspace_memory_region kvm_userspace_mem;
1478 int r = 0;
1479
1480 down_write(&current->mm->mmap_sem);
1481 if (kvm->arch.apic_access_page)
1482 goto out;
1483 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
1484 kvm_userspace_mem.flags = 0;
1485 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
1486 kvm_userspace_mem.memory_size = PAGE_SIZE;
1487 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
1488 if (r)
1489 goto out;
1490 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1491out:
1492 up_write(&current->mm->mmap_sem);
1493 return r;
1494}
1495
1496/*
1497 * Sets up the vmcs for emulated real mode.
1498 */
1499static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1500{
1501 u32 host_sysenter_cs;
1502 u32 junk;
1503 unsigned long a;
1504 struct descriptor_table dt;
1505 int i;
1506 unsigned long kvm_vmx_return;
1507 u32 exec_control;
1508
1509 /* I/O */
1510 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1511 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1512
1513 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1514
1515 /* Control */
1516 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1517 vmcs_config.pin_based_exec_ctrl);
1518
1519 exec_control = vmcs_config.cpu_based_exec_ctrl;
1520 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
1521 exec_control &= ~CPU_BASED_TPR_SHADOW;
1522#ifdef CONFIG_X86_64
1523 exec_control |= CPU_BASED_CR8_STORE_EXITING |
1524 CPU_BASED_CR8_LOAD_EXITING;
1525#endif
1526 }
1527 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1528
1529 if (cpu_has_secondary_exec_ctrls()) {
1530 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
1531 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1532 exec_control &=
1533 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1534 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
1535 }
1536
1537 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
1538 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
1539 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1540
1541 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
1542 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
1543 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
1544
1545 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1546 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1547 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1548 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
1549 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
1550 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1551#ifdef CONFIG_X86_64
1552 rdmsrl(MSR_FS_BASE, a);
1553 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1554 rdmsrl(MSR_GS_BASE, a);
1555 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1556#else
1557 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1558 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1559#endif
1560
1561 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1562
1563 get_idt(&dt);
1564 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1565
1566 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
1567 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
1568 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1569 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
1570 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
1571
1572 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1573 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1574 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1575 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1576 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1577 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1578
1579 for (i = 0; i < NR_VMX_MSR; ++i) {
1580 u32 index = vmx_msr_index[i];
1581 u32 data_low, data_high;
1582 u64 data;
1583 int j = vmx->nmsrs;
1584
1585 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1586 continue;
1587 if (wrmsr_safe(index, data_low, data_high) < 0)
1588 continue;
1589 data = data_low | ((u64)data_high << 32);
1590 vmx->host_msrs[j].index = index;
1591 vmx->host_msrs[j].reserved = 0;
1592 vmx->host_msrs[j].data = data;
1593 vmx->guest_msrs[j] = vmx->host_msrs[j];
1594 ++vmx->nmsrs;
1595 }
1596
1597 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1598
1599 /* 22.2.1, 20.8.1 */
1600 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1601
1602 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1603 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1604
1605 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1606 if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
1607 return -ENOMEM;
1608
1609 return 0;
1610}
1611
1612static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1613{
1614 struct vcpu_vmx *vmx = to_vmx(vcpu);
1615 u64 msr;
1616 int ret;
1617
1618 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1619 ret = -ENOMEM;
1620 goto out;
1621 }
1622
1623 vmx->vcpu.arch.rmode.active = 0;
1624
1625 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1626 set_cr8(&vmx->vcpu, 0);
1627 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1628 if (vmx->vcpu.vcpu_id == 0)
1629 msr |= MSR_IA32_APICBASE_BSP;
1630 kvm_set_apic_base(&vmx->vcpu, msr);
1631
1632 fx_init(&vmx->vcpu);
1633
1634 /*
1635 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1636 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1637 */
1638 if (vmx->vcpu.vcpu_id == 0) {
1639 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1640 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1641 } else {
1642 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
1643 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
1644 }
1645 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1646 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1647
1648 seg_setup(VCPU_SREG_DS);
1649 seg_setup(VCPU_SREG_ES);
1650 seg_setup(VCPU_SREG_FS);
1651 seg_setup(VCPU_SREG_GS);
1652 seg_setup(VCPU_SREG_SS);
1653
1654 vmcs_write16(GUEST_TR_SELECTOR, 0);
1655 vmcs_writel(GUEST_TR_BASE, 0);
1656 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1657 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1658
1659 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1660 vmcs_writel(GUEST_LDTR_BASE, 0);
1661 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1662 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1663
1664 vmcs_write32(GUEST_SYSENTER_CS, 0);
1665 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1666 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1667
1668 vmcs_writel(GUEST_RFLAGS, 0x02);
1669 if (vmx->vcpu.vcpu_id == 0)
1670 vmcs_writel(GUEST_RIP, 0xfff0);
1671 else
1672 vmcs_writel(GUEST_RIP, 0);
1673 vmcs_writel(GUEST_RSP, 0);
1674
1675 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1676 vmcs_writel(GUEST_DR7, 0x400);
1677
1678 vmcs_writel(GUEST_GDTR_BASE, 0);
1679 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1680
1681 vmcs_writel(GUEST_IDTR_BASE, 0);
1682 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1683
1684 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1685 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1686 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1687
1688 guest_write_tsc(0);
1689
1690 /* Special registers */
1691 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1692
1693 setup_msrs(vmx);
1694
1695 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1696
1697 if (cpu_has_vmx_tpr_shadow()) {
1698 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1699 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1700 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1701 page_to_phys(vmx->vcpu.arch.apic->regs_page));
1702 vmcs_write32(TPR_THRESHOLD, 0);
1703 }
1704
1705 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1706 vmcs_write64(APIC_ACCESS_ADDR,
1707 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
1708
1709 vmx->vcpu.arch.cr0 = 0x60000010;
1710 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
1711 vmx_set_cr4(&vmx->vcpu, 0);
1712#ifdef CONFIG_X86_64
1713 vmx_set_efer(&vmx->vcpu, 0);
1714#endif
1715 vmx_fpu_activate(&vmx->vcpu);
1716 update_exception_bitmap(&vmx->vcpu);
1717
1718 return 0;
1719
1720out:
1721 return ret;
1722}
1723
1724static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1725{
1726 struct vcpu_vmx *vmx = to_vmx(vcpu);
1727
1728 if (vcpu->arch.rmode.active) {
1729 vmx->rmode.irq.pending = true;
1730 vmx->rmode.irq.vector = irq;
1731 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
1732 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1733 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
1734 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1735 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
1736 return;
1737 }
1738 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1739 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1740}
1741
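/*
 * In real (vm86) mode the interrupt above is injected as a software
 * interrupt with VM_ENTRY_INSTRUCTION_LEN forced to 1 while GUEST_RIP
 * is wound back by one byte, so the event appears to come from a
 * one-byte "int" just before the current instruction and the guest
 * resumes at the original rip afterwards.  If the entry fails (for
 * example while fetching the interrupt-redirection bitmap from the
 * real-mode tss), fixup_rmode_irq() below restores rip and synthesizes
 * the idt_vectoring_info the hardware would normally provide.
 */
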
1742static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1743{
1744 int word_index = __ffs(vcpu->arch.irq_summary);
1745 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1746 int irq = word_index * BITS_PER_LONG + bit_index;
1747
1748 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1749 if (!vcpu->arch.irq_pending[word_index])
1750 clear_bit(word_index, &vcpu->arch.irq_summary);
1751 vmx_inject_irq(vcpu, irq);
1752}
1753
1754
1755static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1756 struct kvm_run *kvm_run)
1757{
1758 u32 cpu_based_vm_exec_control;
1759
1760 vcpu->arch.interrupt_window_open =
1761 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1762 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1763
1764 if (vcpu->arch.interrupt_window_open &&
1765 vcpu->arch.irq_summary &&
1766 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1767 /*
1768 * Interrupts are enabled and not blocked by sti or mov ss, so inject now.
1769 */
1770 kvm_do_inject_irq(vcpu);
1771
1772 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1773 if (!vcpu->arch.interrupt_window_open &&
1774 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
1775 /*
1776 * Interrupts blocked. Wait for unblock.
1777 */
1778 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
1779 else
1780 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
1781 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1782}
1783
1784static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
1785{
1786 int ret;
1787 struct kvm_userspace_memory_region tss_mem = {
1788 .slot = 8,
1789 .guest_phys_addr = addr,
1790 .memory_size = PAGE_SIZE * 3,
1791 .flags = 0,
1792 };
1793
1794 ret = kvm_set_memory_region(kvm, &tss_mem, 0);
1795 if (ret)
1796 return ret;
1797 kvm->arch.tss_addr = addr;
1798 return 0;
1799}
1800
1801static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1802{
1803 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
1804
1805 set_debugreg(dbg->bp[0], 0);
1806 set_debugreg(dbg->bp[1], 1);
1807 set_debugreg(dbg->bp[2], 2);
1808 set_debugreg(dbg->bp[3], 3);
1809
1810 if (dbg->singlestep) {
1811 unsigned long flags;
1812
1813 flags = vmcs_readl(GUEST_RFLAGS);
1814 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1815 vmcs_writel(GUEST_RFLAGS, flags);
1816 }
1817}
1818
1819static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1820 int vec, u32 err_code)
1821{
1822 if (!vcpu->arch.rmode.active)
1823 return 0;
1824
1825 /*
1826 * Instructions with the address-size override prefix (opcode 0x67)
1827 * cause a #SS fault with error code 0 in VM86 mode.
1828 */
1829 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
1830 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
1831 return 1;
1832 return 0;
1833}
1834
1835static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1836{
1837 struct vcpu_vmx *vmx = to_vmx(vcpu);
1838 u32 intr_info, error_code;
1839 unsigned long cr2, rip;
1840 u32 vect_info;
1841 enum emulation_result er;
1842
1843 vect_info = vmx->idt_vectoring_info;
1844 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1845
1846 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1847 !is_page_fault(intr_info))
1848 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1849 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1850
1851 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1852 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1853 set_bit(irq, vcpu->arch.irq_pending);
1854 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1855 }
1856
1857 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
1858 return 1; /* already handled by vmx_vcpu_run() */
1859
1860 if (is_no_device(intr_info)) {
1861 vmx_fpu_activate(vcpu);
1862 return 1;
1863 }
1864
1865 if (is_invalid_opcode(intr_info)) {
1866 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
1867 if (er != EMULATE_DONE)
1868 kvm_queue_exception(vcpu, UD_VECTOR);
1869 return 1;
1870 }
1871
1872 error_code = 0;
1873 rip = vmcs_readl(GUEST_RIP);
1874 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1875 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1876 if (is_page_fault(intr_info)) {
1877 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1878 return kvm_mmu_page_fault(vcpu, cr2, error_code);
1879 }
1880
1881 if (vcpu->arch.rmode.active &&
1882 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1883 error_code)) {
1884 if (vcpu->arch.halt_request) {
1885 vcpu->arch.halt_request = 0;
1886 return kvm_emulate_halt(vcpu);
1887 }
1888 return 1;
1889 }
1890
1891 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
1892 (INTR_TYPE_EXCEPTION | 1)) {
1893 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1894 return 0;
1895 }
1896 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
1897 kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1898 kvm_run->ex.error_code = error_code;
1899 return 0;
1900}
1901
1902static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1903 struct kvm_run *kvm_run)
1904{
1905 ++vcpu->stat.irq_exits;
1906 return 1;
1907}
1908
1909static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1910{
1911 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1912 return 0;
1913}
1914
1915static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1916{
1917 unsigned long exit_qualification;
1918 int size, down, in, string, rep;
1919 unsigned port;
1920
1921 ++vcpu->stat.io_exits;
1922 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1923 string = (exit_qualification & 16) != 0;
1924
1925 if (string) {
1926 if (emulate_instruction(vcpu,
1927 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1928 return 0;
1929 return 1;
1930 }
1931
1932 size = (exit_qualification & 7) + 1;
1933 in = (exit_qualification & 8) != 0;
1934 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1935 rep = (exit_qualification & 32) != 0;
1936 port = exit_qualification >> 16;
1937
1938 return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
1939}
1940
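/*
 * Exit-qualification layout decoded above, with a hypothetical example:
 * bits 2:0 hold size-1, bit 3 the direction (1 = in), bit 4 the string
 * flag, bit 5 the rep prefix, and bits 31:16 the port number.  A plain
 * "outb %al, $0x71" would therefore exit with an exit qualification of
 * 0x00710000: size 1, out, not a string op, port 0x71.
 */
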
1941static void
1942vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1943{
1944 /*
1945 * Patch in the VMCALL instruction:
1946 */
1947 hypercall[0] = 0x0f;
1948 hypercall[1] = 0x01;
1949 hypercall[2] = 0xc1;
1950}
1951
1952static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1953{
1954 unsigned long exit_qualification;
1955 int cr;
1956 int reg;
1957
1958 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1959 cr = exit_qualification & 15;
1960 reg = (exit_qualification >> 8) & 15;
1961 switch ((exit_qualification >> 4) & 3) {
1962 case 0: /* mov to cr */
1963 switch (cr) {
1964 case 0:
1965 vcpu_load_rsp_rip(vcpu);
1966 set_cr0(vcpu, vcpu->arch.regs[reg]);
1967 skip_emulated_instruction(vcpu);
1968 return 1;
1969 case 3:
1970 vcpu_load_rsp_rip(vcpu);
1971 set_cr3(vcpu, vcpu->arch.regs[reg]);
1972 skip_emulated_instruction(vcpu);
1973 return 1;
1974 case 4:
1975 vcpu_load_rsp_rip(vcpu);
1976 set_cr4(vcpu, vcpu->arch.regs[reg]);
1977 skip_emulated_instruction(vcpu);
1978 return 1;
1979 case 8:
1980 vcpu_load_rsp_rip(vcpu);
1981 set_cr8(vcpu, vcpu->arch.regs[reg]);
1982 skip_emulated_instruction(vcpu);
1983 if (irqchip_in_kernel(vcpu->kvm))
1984 return 1;
1985 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1986 return 0;
1987 };
1988 break;
1989 case 2: /* clts */
1990 vcpu_load_rsp_rip(vcpu);
1991 vmx_fpu_deactivate(vcpu);
1992 vcpu->arch.cr0 &= ~X86_CR0_TS;
1993 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1994 vmx_fpu_activate(vcpu);
1995 skip_emulated_instruction(vcpu);
1996 return 1;
1997 case 1: /*mov from cr*/
1998 switch (cr) {
1999 case 3:
2000 vcpu_load_rsp_rip(vcpu);
2001 vcpu->arch.regs[reg] = vcpu->arch.cr3;
2002 vcpu_put_rsp_rip(vcpu);
2003 skip_emulated_instruction(vcpu);
2004 return 1;
2005 case 8:
2006 vcpu_load_rsp_rip(vcpu);
2007 vcpu->arch.regs[reg] = get_cr8(vcpu);
2008 vcpu_put_rsp_rip(vcpu);
2009 skip_emulated_instruction(vcpu);
2010 return 1;
2011 }
2012 break;
2013 case 3: /* lmsw */
2014 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2015
2016 skip_emulated_instruction(vcpu);
2017 return 1;
2018 default:
2019 break;
2020 }
2021 kvm_run->exit_reason = 0;
2022 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2023 (int)(exit_qualification >> 4) & 3, cr);
2024 return 0;
2025}
2026
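/*
 * The decode above follows the MOV-CR exit-qualification layout also
 * spelled out in vmx.h: bits 3:0 name the control register, bits 5:4
 * the access type (0 = mov to CR, 1 = mov from CR, 2 = clts, 3 = lmsw),
 * bits 11:8 the general-purpose register, and bits 31:16 the lmsw
 * source operand.  As a hypothetical example, "mov %cr8, %rsi" exits
 * with an exit qualification of 0x618: cr = 8, type 1 (mov from cr),
 * reg = 6 (REG_ESI), landing in the "case 8" branch of "mov from cr".
 */
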
2027static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2028{
2029 unsigned long exit_qualification;
2030 unsigned long val;
2031 int dr, reg;
2032
2033 /*
2034 * FIXME: this code assumes the host is debugging the guest.
2035 * We also need to handle the guest debugging itself.
2036 */
2037 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2038 dr = exit_qualification & 7;
2039 reg = (exit_qualification >> 8) & 15;
2040 vcpu_load_rsp_rip(vcpu);
2041 if (exit_qualification & 16) {
2042 /* mov from dr */
2043 switch (dr) {
2044 case 6:
2045 val = 0xffff0ff0;
2046 break;
2047 case 7:
2048 val = 0x400;
2049 break;
2050 default:
2051 val = 0;
2052 }
2053 vcpu->arch.regs[reg] = val;
2054 } else {
2055 /* mov to dr */
2056 }
2057 vcpu_put_rsp_rip(vcpu);
2058 skip_emulated_instruction(vcpu);
2059 return 1;
2060}
2061
2062static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2063{
2064 kvm_emulate_cpuid(vcpu);
2065 return 1;
2066}
2067
2068static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2069{
2070 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2071 u64 data;
2072
2073 if (vmx_get_msr(vcpu, ecx, &data)) {
2074 kvm_inject_gp(vcpu, 0);
2075 return 1;
2076 }
2077
2078 /* FIXME: handling of bits 32:63 of rax, rdx */
2079 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2080 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2081 skip_emulated_instruction(vcpu);
2082 return 1;
2083}
2084
2085static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2086{
2087 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2088 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2089 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2090
2091 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2092 kvm_inject_gp(vcpu, 0);
2093 return 1;
2094 }
2095
2096 skip_emulated_instruction(vcpu);
2097 return 1;
2098}
2099
2100static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
2101 struct kvm_run *kvm_run)
2102{
2103 return 1;
2104}
2105
2106static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2107 struct kvm_run *kvm_run)
2108{
2109 u32 cpu_based_vm_exec_control;
2110
2111 /* clear pending irq */
2112 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2113 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2114 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2115 /*
2116 * If userspace is waiting to inject interrupts, exit as soon as
2117 * possible.
2118 */
2119 if (kvm_run->request_interrupt_window &&
2120 !vcpu->arch.irq_summary) {
2121 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2122 ++vcpu->stat.irq_window_exits;
2123 return 0;
2124 }
2125 return 1;
2126}
2127
2128static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2129{
2130 skip_emulated_instruction(vcpu);
2131 return kvm_emulate_halt(vcpu);
2132}
2133
2134static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2135{
2136 skip_emulated_instruction(vcpu);
2137 kvm_emulate_hypercall(vcpu);
2138 return 1;
2139}
2140
2141static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2142{
2143 skip_emulated_instruction(vcpu);
2144 /* TODO: Add support for VT-d/pass-through device */
2145 return 1;
2146}
2147
2148static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2149{
2150 u64 exit_qualification;
2151 enum emulation_result er;
2152 unsigned long offset;
2153
2154 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2155 offset = exit_qualification & 0xffful;
2156
2157 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2158
2159 if (er != EMULATE_DONE) {
2160 printk(KERN_ERR
2161 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
2162 offset);
2163 return -ENOTSUPP;
2164 }
2165 return 1;
2166}
2167
2168/*
2169 * The exit handlers return 1 if the exit was handled fully and guest execution
2170 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
2171 * to be done to userspace and return 0.
2172 */
2173static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2174 struct kvm_run *kvm_run) = {
2175 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2176 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2177 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2178 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2179 [EXIT_REASON_CR_ACCESS] = handle_cr,
2180 [EXIT_REASON_DR_ACCESS] = handle_dr,
2181 [EXIT_REASON_CPUID] = handle_cpuid,
2182 [EXIT_REASON_MSR_READ] = handle_rdmsr,
2183 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2184 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2185 [EXIT_REASON_HLT] = handle_halt,
2186 [EXIT_REASON_VMCALL] = handle_vmcall,
2187 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2188 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
2189 [EXIT_REASON_WBINVD] = handle_wbinvd,
2190};
2191
2192static const int kvm_vmx_max_exit_handlers =
2193 ARRAY_SIZE(kvm_vmx_exit_handlers);
2194
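/*
 * A minimal sketch of a handler following this convention (a
 * hypothetical handle_noop(), not part of the table above):
 *
 *	static int handle_noop(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 *	{
 *		skip_emulated_instruction(vcpu);
 *		return 1;
 *	}
 *
 * Returning 1 resumes the guest immediately; returning 0 instead hands
 * the exit to userspace, which then looks at kvm_run->exit_reason.
 */
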
2195/*
2196 * The guest has exited. See if we can fix it or if we need userspace
2197 * assistance.
2198 */
2199static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2200{
2201 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2202 struct vcpu_vmx *vmx = to_vmx(vcpu);
2203 u32 vectoring_info = vmx->idt_vectoring_info;
2204
2205 if (unlikely(vmx->fail)) {
2206 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2207 kvm_run->fail_entry.hardware_entry_failure_reason
2208 = vmcs_read32(VM_INSTRUCTION_ERROR);
2209 return 0;
2210 }
2211
2212 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2213 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2214 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2215 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2216 if (exit_reason < kvm_vmx_max_exit_handlers
2217 && kvm_vmx_exit_handlers[exit_reason])
2218 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
2219 else {
2220 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
2221 kvm_run->hw.hardware_exit_reason = exit_reason;
2222 }
2223 return 0;
2224}
2225
2226static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
2227{
2228}
2229
2230static void update_tpr_threshold(struct kvm_vcpu *vcpu)
2231{
2232 int max_irr, tpr;
2233
2234 if (!vm_need_tpr_shadow(vcpu->kvm))
2235 return;
2236
2237 if (!kvm_lapic_enabled(vcpu) ||
2238 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
2239 vmcs_write32(TPR_THRESHOLD, 0);
2240 return;
2241 }
2242
2243 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
2244 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
2245}
2246
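/*
 * A worked example of the threshold computation above, with hypothetical
 * values: if the guest's cr8 is 5 and the highest pending IRR vector is
 * 0x61, then tpr = 0x50 and max_irr (0x61) > tpr, so TPR_THRESHOLD is
 * set to tpr >> 4 = 5; otherwise it is set to max_irr >> 4, the pending
 * vector's priority class.  The CPU raises a TPR-below-threshold exit
 * once the guest's TPR shadow drops below this value, giving KVM a
 * chance to inject the pending interrupt.
 */
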
2247static void enable_irq_window(struct kvm_vcpu *vcpu)
2248{
2249 u32 cpu_based_vm_exec_control;
2250
2251 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2252 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2253 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2254}
2255
2256static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2257{
2258 struct vcpu_vmx *vmx = to_vmx(vcpu);
2259 u32 idtv_info_field, intr_info_field;
2260 int has_ext_irq, interrupt_window_open;
2261 int vector;
2262
2263 update_tpr_threshold(vcpu);
2264
2265 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2266 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2267 idtv_info_field = vmx->idt_vectoring_info;
2268 if (intr_info_field & INTR_INFO_VALID_MASK) {
2269 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2270 /* TODO: fault when IDT_Vectoring */
2271 if (printk_ratelimit())
2272 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2273 }
2274 if (has_ext_irq)
2275 enable_irq_window(vcpu);
2276 return;
2277 }
2278 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2279 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2280 == INTR_TYPE_EXT_INTR
2281 && vcpu->arch.rmode.active) {
2282 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2283
2284 vmx_inject_irq(vcpu, vect);
2285 if (unlikely(has_ext_irq))
2286 enable_irq_window(vcpu);
2287 return;
2288 }
2289
2290 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2291 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2292 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2293
2294 if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
2295 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2296 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2297 if (unlikely(has_ext_irq))
2298 enable_irq_window(vcpu);
2299 return;
2300 }
2301 if (!has_ext_irq)
2302 return;
2303 interrupt_window_open =
2304 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2305 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2306 if (interrupt_window_open) {
2307 vector = kvm_cpu_get_interrupt(vcpu);
2308 vmx_inject_irq(vcpu, vector);
2309 kvm_timer_intr_post(vcpu, vector);
2310 } else
2311 enable_irq_window(vcpu);
2312}
2313
2314/*
2315 * Failure to inject an interrupt should give us the information
2316 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
2317 * when fetching the interrupt redirection bitmap in the real-mode
2318 * tss, this doesn't happen. So we do it ourselves.
2319 */
2320static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2321{
2322 vmx->rmode.irq.pending = 0;
2323 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
2324 return;
2325 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
2326 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2327 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2328 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
2329 return;
2330 }
2331 vmx->idt_vectoring_info =
2332 VECTORING_INFO_VALID_MASK
2333 | INTR_TYPE_EXT_INTR
2334 | vmx->rmode.irq.vector;
2335}
2336
2337static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2338{
2339 struct vcpu_vmx *vmx = to_vmx(vcpu);
2340 u32 intr_info;
2341
2342 /*
2343 * Loading guest fpu may have cleared host cr0.ts
2344 */
2345 vmcs_writel(HOST_CR0, read_cr0());
2346
2347 asm(
2348 /* Store host registers */
2349#ifdef CONFIG_X86_64
2350 "push %%rdx; push %%rbp;"
2351 "push %%rcx \n\t"
2352#else
2353 "push %%edx; push %%ebp;"
2354 "push %%ecx \n\t"
2355#endif
2356 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2357 /* Check if vmlaunch or vmresume is needed */
2358 "cmpl $0, %c[launched](%0) \n\t"
2359 /* Load guest registers. Don't clobber flags. */
2360#ifdef CONFIG_X86_64
2361 "mov %c[cr2](%0), %%rax \n\t"
2362 "mov %%rax, %%cr2 \n\t"
2363 "mov %c[rax](%0), %%rax \n\t"
2364 "mov %c[rbx](%0), %%rbx \n\t"
2365 "mov %c[rdx](%0), %%rdx \n\t"
2366 "mov %c[rsi](%0), %%rsi \n\t"
2367 "mov %c[rdi](%0), %%rdi \n\t"
2368 "mov %c[rbp](%0), %%rbp \n\t"
2369 "mov %c[r8](%0), %%r8 \n\t"
2370 "mov %c[r9](%0), %%r9 \n\t"
2371 "mov %c[r10](%0), %%r10 \n\t"
2372 "mov %c[r11](%0), %%r11 \n\t"
2373 "mov %c[r12](%0), %%r12 \n\t"
2374 "mov %c[r13](%0), %%r13 \n\t"
2375 "mov %c[r14](%0), %%r14 \n\t"
2376 "mov %c[r15](%0), %%r15 \n\t"
2377 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2378#else
2379 "mov %c[cr2](%0), %%eax \n\t"
2380 "mov %%eax, %%cr2 \n\t"
2381 "mov %c[rax](%0), %%eax \n\t"
2382 "mov %c[rbx](%0), %%ebx \n\t"
2383 "mov %c[rdx](%0), %%edx \n\t"
2384 "mov %c[rsi](%0), %%esi \n\t"
2385 "mov %c[rdi](%0), %%edi \n\t"
2386 "mov %c[rbp](%0), %%ebp \n\t"
2387 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2388#endif
2389 /* Enter guest mode */
2390 "jne .Llaunched \n\t"
2391 ASM_VMX_VMLAUNCH "\n\t"
2392 "jmp .Lkvm_vmx_return \n\t"
2393 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
2394 ".Lkvm_vmx_return: "
2395 /* Save guest registers, load host registers, keep flags */
2396#ifdef CONFIG_X86_64
2397 "xchg %0, (%%rsp) \n\t"
2398 "mov %%rax, %c[rax](%0) \n\t"
2399 "mov %%rbx, %c[rbx](%0) \n\t"
2400 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
2401 "mov %%rdx, %c[rdx](%0) \n\t"
2402 "mov %%rsi, %c[rsi](%0) \n\t"
2403 "mov %%rdi, %c[rdi](%0) \n\t"
2404 "mov %%rbp, %c[rbp](%0) \n\t"
2405 "mov %%r8, %c[r8](%0) \n\t"
2406 "mov %%r9, %c[r9](%0) \n\t"
2407 "mov %%r10, %c[r10](%0) \n\t"
2408 "mov %%r11, %c[r11](%0) \n\t"
2409 "mov %%r12, %c[r12](%0) \n\t"
2410 "mov %%r13, %c[r13](%0) \n\t"
2411 "mov %%r14, %c[r14](%0) \n\t"
2412 "mov %%r15, %c[r15](%0) \n\t"
2413 "mov %%cr2, %%rax \n\t"
2414 "mov %%rax, %c[cr2](%0) \n\t"
2415
2416 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
2417#else
2418 "xchg %0, (%%esp) \n\t"
2419 "mov %%eax, %c[rax](%0) \n\t"
2420 "mov %%ebx, %c[rbx](%0) \n\t"
2421 "pushl (%%esp); popl %c[rcx](%0) \n\t"
2422 "mov %%edx, %c[rdx](%0) \n\t"
2423 "mov %%esi, %c[rsi](%0) \n\t"
2424 "mov %%edi, %c[rdi](%0) \n\t"
2425 "mov %%ebp, %c[rbp](%0) \n\t"
2426 "mov %%cr2, %%eax \n\t"
2427 "mov %%eax, %c[cr2](%0) \n\t"
2428
2429 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
2430#endif
2431 "setbe %c[fail](%0) \n\t"
2432 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
2433 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
2434 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
2435 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
2436 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
2437 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
2438 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
2439 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
2440 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
2441 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
2442#ifdef CONFIG_X86_64
2443 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
2444 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
2445 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
2446 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
2447 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
2448 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
2449 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
2450 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
2451#endif
2452 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
2453 : "cc", "memory"
2454#ifdef CONFIG_X86_64
2455 , "rbx", "rdi", "rsi"
2456 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
2457#else
2458 , "ebx", "edi", "rsi"
2459#endif
2460 );
2461
2462 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2463 if (vmx->rmode.irq.pending)
2464 fixup_rmode_irq(vmx);
2465
2466 vcpu->arch.interrupt_window_open =
2467 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2468
2469 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2470 vmx->launched = 1;
2471
2472 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2473
2474 /* We need to handle NMIs before interrupts are enabled */
2475 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
2476 asm("int $2");
2477}
2478
2479static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2480{
2481 struct vcpu_vmx *vmx = to_vmx(vcpu);
2482
2483 if (vmx->vmcs) {
2484 on_each_cpu(__vcpu_clear, vmx, 0, 1);
2485 free_vmcs(vmx->vmcs);
2486 vmx->vmcs = NULL;
2487 }
2488}
2489
2490static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
2491{
2492 struct vcpu_vmx *vmx = to_vmx(vcpu);
2493
2494 vmx_free_vmcs(vcpu);
2495 kfree(vmx->host_msrs);
2496 kfree(vmx->guest_msrs);
2497 kvm_vcpu_uninit(vcpu);
2498 kmem_cache_free(kvm_vcpu_cache, vmx);
2499}
2500
2501static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2502{
2503 int err;
2504 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2505 int cpu;
2506
2507 if (!vmx)
2508 return ERR_PTR(-ENOMEM);
2509
2510 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
2511 if (err)
2512 goto free_vcpu;
2513
2514 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2515 if (!vmx->guest_msrs) {
2516 err = -ENOMEM;
2517 goto uninit_vcpu;
2518 }
2519
2520 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2521 if (!vmx->host_msrs)
2522 goto free_guest_msrs;
2523
2524 vmx->vmcs = alloc_vmcs();
2525 if (!vmx->vmcs)
2526 goto free_msrs;
2527
2528 vmcs_clear(vmx->vmcs);
2529
2530 cpu = get_cpu();
2531 vmx_vcpu_load(&vmx->vcpu, cpu);
2532 err = vmx_vcpu_setup(vmx);
2533 vmx_vcpu_put(&vmx->vcpu);
2534 put_cpu();
2535 if (err)
2536 goto free_vmcs;
2537
2538 return &vmx->vcpu;
2539
2540free_vmcs:
2541 free_vmcs(vmx->vmcs);
2542free_msrs:
2543 kfree(vmx->host_msrs);
2544free_guest_msrs:
2545 kfree(vmx->guest_msrs);
2546uninit_vcpu:
2547 kvm_vcpu_uninit(&vmx->vcpu);
2548free_vcpu:
2549 kmem_cache_free(kvm_vcpu_cache, vmx);
2550 return ERR_PTR(err);
2551}
2552
2553static void __init vmx_check_processor_compat(void *rtn)
2554{
2555 struct vmcs_config vmcs_conf;
2556
2557 *(int *)rtn = 0;
2558 if (setup_vmcs_config(&vmcs_conf) < 0)
2559 *(int *)rtn = -EIO;
2560 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
2561 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
2562 smp_processor_id());
2563 *(int *)rtn = -EIO;
2564 }
2565}
2566
2567static struct kvm_x86_ops vmx_x86_ops = {
2568 .cpu_has_kvm_support = cpu_has_kvm_support,
2569 .disabled_by_bios = vmx_disabled_by_bios,
2570 .hardware_setup = hardware_setup,
2571 .hardware_unsetup = hardware_unsetup,
2572 .check_processor_compatibility = vmx_check_processor_compat,
2573 .hardware_enable = hardware_enable,
2574 .hardware_disable = hardware_disable,
2575 .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
2576
2577 .vcpu_create = vmx_create_vcpu,
2578 .vcpu_free = vmx_free_vcpu,
2579 .vcpu_reset = vmx_vcpu_reset,
2580
2581 .prepare_guest_switch = vmx_save_host_state,
2582 .vcpu_load = vmx_vcpu_load,
2583 .vcpu_put = vmx_vcpu_put,
2584 .vcpu_decache = vmx_vcpu_decache,
2585
2586 .set_guest_debug = set_guest_debug,
2587 .guest_debug_pre = kvm_guest_debug_pre,
2588 .get_msr = vmx_get_msr,
2589 .set_msr = vmx_set_msr,
2590 .get_segment_base = vmx_get_segment_base,
2591 .get_segment = vmx_get_segment,
2592 .set_segment = vmx_set_segment,
2593 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2594 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
2595 .set_cr0 = vmx_set_cr0,
2596 .set_cr3 = vmx_set_cr3,
2597 .set_cr4 = vmx_set_cr4,
2598#ifdef CONFIG_X86_64
2599 .set_efer = vmx_set_efer,
2600#endif
2601 .get_idt = vmx_get_idt,
2602 .set_idt = vmx_set_idt,
2603 .get_gdt = vmx_get_gdt,
2604 .set_gdt = vmx_set_gdt,
2605 .cache_regs = vcpu_load_rsp_rip,
2606 .decache_regs = vcpu_put_rsp_rip,
2607 .get_rflags = vmx_get_rflags,
2608 .set_rflags = vmx_set_rflags,
2609
2610 .tlb_flush = vmx_flush_tlb,
2611
2612 .run = vmx_vcpu_run,
2613 .handle_exit = kvm_handle_exit,
2614 .skip_emulated_instruction = skip_emulated_instruction,
2615 .patch_hypercall = vmx_patch_hypercall,
2616 .get_irq = vmx_get_irq,
2617 .set_irq = vmx_inject_irq,
2618 .queue_exception = vmx_queue_exception,
2619 .exception_injected = vmx_exception_injected,
2620 .inject_pending_irq = vmx_intr_assist,
2621 .inject_pending_vectors = do_interrupt_requests,
2622
2623 .set_tss_addr = vmx_set_tss_addr,
2624};
2625
2626static int __init vmx_init(void)
2627{
2628 void *iova;
2629 int r;
2630
2631 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2632 if (!vmx_io_bitmap_a)
2633 return -ENOMEM;
2634
2635 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2636 if (!vmx_io_bitmap_b) {
2637 r = -ENOMEM;
2638 goto out;
2639 }
2640
2641 /*
2642 * Allow direct access to the PC debug port (it is often used for I/O
2643 * delays, but the vmexits simply slow things down).
2644 */
2645 iova = kmap(vmx_io_bitmap_a);
2646 memset(iova, 0xff, PAGE_SIZE);
2647 clear_bit(0x80, iova);
2648 kunmap(vmx_io_bitmap_a);
2649
2650 iova = kmap(vmx_io_bitmap_b);
2651 memset(iova, 0xff, PAGE_SIZE);
2652 kunmap(vmx_io_bitmap_b);
2653
2654 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2655 if (r)
2656 goto out1;
2657
2658 if (bypass_guest_pf)
2659 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
2660
2661 return 0;
2662
2663out1:
2664 __free_page(vmx_io_bitmap_b);
2665out:
2666 __free_page(vmx_io_bitmap_a);
2667 return r;
2668}
2669
2670static void __exit vmx_exit(void)
2671{
2672 __free_page(vmx_io_bitmap_b);
2673 __free_page(vmx_io_bitmap_a);
2674
2675 kvm_exit();
2676}
2677
2678module_init(vmx_init)
2679module_exit(vmx_exit)
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
new file mode 100644
index 00000000000..d52ae8d7303
--- /dev/null
+++ b/arch/x86/kvm/vmx.h
@@ -0,0 +1,324 @@
1#ifndef VMX_H
2#define VMX_H
3
4/*
5 * vmx.h: VMX Architecture related definitions
6 * Copyright (c) 2004, Intel Corporation.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 *
21 * A few random additions are:
22 * Copyright (C) 2006 Qumranet
23 * Avi Kivity <avi@qumranet.com>
24 * Yaniv Kamay <yaniv@qumranet.com>
25 *
26 */
27
28/*
29 * Definitions of Primary Processor-Based VM-Execution Controls.
30 */
31#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
32#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
33#define CPU_BASED_HLT_EXITING 0x00000080
34#define CPU_BASED_INVLPG_EXITING 0x00000200
35#define CPU_BASED_MWAIT_EXITING 0x00000400
36#define CPU_BASED_RDPMC_EXITING 0x00000800
37#define CPU_BASED_RDTSC_EXITING 0x00001000
38#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
39#define CPU_BASED_CR8_STORE_EXITING 0x00100000
40#define CPU_BASED_TPR_SHADOW 0x00200000
41#define CPU_BASED_MOV_DR_EXITING 0x00800000
42#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
43#define CPU_BASED_USE_IO_BITMAPS 0x02000000
44#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
45#define CPU_BASED_MONITOR_EXITING 0x20000000
46#define CPU_BASED_PAUSE_EXITING 0x40000000
47#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
48/*
49 * Definitions of Secondary Processor-Based VM-Execution Controls.
50 */
51#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
52#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
53
54
55#define PIN_BASED_EXT_INTR_MASK 0x00000001
56#define PIN_BASED_NMI_EXITING 0x00000008
57#define PIN_BASED_VIRTUAL_NMIS 0x00000020
58
59#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
60#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
61
62#define VM_ENTRY_IA32E_MODE 0x00000200
63#define VM_ENTRY_SMM 0x00000400
64#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
65
66/* VMCS Encodings */
67enum vmcs_field {
68 GUEST_ES_SELECTOR = 0x00000800,
69 GUEST_CS_SELECTOR = 0x00000802,
70 GUEST_SS_SELECTOR = 0x00000804,
71 GUEST_DS_SELECTOR = 0x00000806,
72 GUEST_FS_SELECTOR = 0x00000808,
73 GUEST_GS_SELECTOR = 0x0000080a,
74 GUEST_LDTR_SELECTOR = 0x0000080c,
75 GUEST_TR_SELECTOR = 0x0000080e,
76 HOST_ES_SELECTOR = 0x00000c00,
77 HOST_CS_SELECTOR = 0x00000c02,
78 HOST_SS_SELECTOR = 0x00000c04,
79 HOST_DS_SELECTOR = 0x00000c06,
80 HOST_FS_SELECTOR = 0x00000c08,
81 HOST_GS_SELECTOR = 0x00000c0a,
82 HOST_TR_SELECTOR = 0x00000c0c,
83 IO_BITMAP_A = 0x00002000,
84 IO_BITMAP_A_HIGH = 0x00002001,
85 IO_BITMAP_B = 0x00002002,
86 IO_BITMAP_B_HIGH = 0x00002003,
87 MSR_BITMAP = 0x00002004,
88 MSR_BITMAP_HIGH = 0x00002005,
89 VM_EXIT_MSR_STORE_ADDR = 0x00002006,
90 VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
91 VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
92 VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
93 VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
94 VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
95 TSC_OFFSET = 0x00002010,
96 TSC_OFFSET_HIGH = 0x00002011,
97 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
98 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
99 APIC_ACCESS_ADDR = 0x00002014,
100 APIC_ACCESS_ADDR_HIGH = 0x00002015,
101 VMCS_LINK_POINTER = 0x00002800,
102 VMCS_LINK_POINTER_HIGH = 0x00002801,
103 GUEST_IA32_DEBUGCTL = 0x00002802,
104 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
105 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
106 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
107 EXCEPTION_BITMAP = 0x00004004,
108 PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
109 PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
110 CR3_TARGET_COUNT = 0x0000400a,
111 VM_EXIT_CONTROLS = 0x0000400c,
112 VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
113 VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
114 VM_ENTRY_CONTROLS = 0x00004012,
115 VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
116 VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
117 VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
118 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
119 TPR_THRESHOLD = 0x0000401c,
120 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
121 VM_INSTRUCTION_ERROR = 0x00004400,
122 VM_EXIT_REASON = 0x00004402,
123 VM_EXIT_INTR_INFO = 0x00004404,
124 VM_EXIT_INTR_ERROR_CODE = 0x00004406,
125 IDT_VECTORING_INFO_FIELD = 0x00004408,
126 IDT_VECTORING_ERROR_CODE = 0x0000440a,
127 VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
128 VMX_INSTRUCTION_INFO = 0x0000440e,
129 GUEST_ES_LIMIT = 0x00004800,
130 GUEST_CS_LIMIT = 0x00004802,
131 GUEST_SS_LIMIT = 0x00004804,
132 GUEST_DS_LIMIT = 0x00004806,
133 GUEST_FS_LIMIT = 0x00004808,
134 GUEST_GS_LIMIT = 0x0000480a,
135 GUEST_LDTR_LIMIT = 0x0000480c,
136 GUEST_TR_LIMIT = 0x0000480e,
137 GUEST_GDTR_LIMIT = 0x00004810,
138 GUEST_IDTR_LIMIT = 0x00004812,
139 GUEST_ES_AR_BYTES = 0x00004814,
140 GUEST_CS_AR_BYTES = 0x00004816,
141 GUEST_SS_AR_BYTES = 0x00004818,
142 GUEST_DS_AR_BYTES = 0x0000481a,
143 GUEST_FS_AR_BYTES = 0x0000481c,
144 GUEST_GS_AR_BYTES = 0x0000481e,
145 GUEST_LDTR_AR_BYTES = 0x00004820,
146 GUEST_TR_AR_BYTES = 0x00004822,
147 GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
148 GUEST_ACTIVITY_STATE = 0x00004826,
149 GUEST_SYSENTER_CS = 0x0000482A,
150 HOST_IA32_SYSENTER_CS = 0x00004c00,
151 CR0_GUEST_HOST_MASK = 0x00006000,
152 CR4_GUEST_HOST_MASK = 0x00006002,
153 CR0_READ_SHADOW = 0x00006004,
154 CR4_READ_SHADOW = 0x00006006,
155 CR3_TARGET_VALUE0 = 0x00006008,
156 CR3_TARGET_VALUE1 = 0x0000600a,
157 CR3_TARGET_VALUE2 = 0x0000600c,
158 CR3_TARGET_VALUE3 = 0x0000600e,
159 EXIT_QUALIFICATION = 0x00006400,
160 GUEST_LINEAR_ADDRESS = 0x0000640a,
161 GUEST_CR0 = 0x00006800,
162 GUEST_CR3 = 0x00006802,
163 GUEST_CR4 = 0x00006804,
164 GUEST_ES_BASE = 0x00006806,
165 GUEST_CS_BASE = 0x00006808,
166 GUEST_SS_BASE = 0x0000680a,
167 GUEST_DS_BASE = 0x0000680c,
168 GUEST_FS_BASE = 0x0000680e,
169 GUEST_GS_BASE = 0x00006810,
170 GUEST_LDTR_BASE = 0x00006812,
171 GUEST_TR_BASE = 0x00006814,
172 GUEST_GDTR_BASE = 0x00006816,
173 GUEST_IDTR_BASE = 0x00006818,
174 GUEST_DR7 = 0x0000681a,
175 GUEST_RSP = 0x0000681c,
176 GUEST_RIP = 0x0000681e,
177 GUEST_RFLAGS = 0x00006820,
178 GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
179 GUEST_SYSENTER_ESP = 0x00006824,
180 GUEST_SYSENTER_EIP = 0x00006826,
181 HOST_CR0 = 0x00006c00,
182 HOST_CR3 = 0x00006c02,
183 HOST_CR4 = 0x00006c04,
184 HOST_FS_BASE = 0x00006c06,
185 HOST_GS_BASE = 0x00006c08,
186 HOST_TR_BASE = 0x00006c0a,
187 HOST_GDTR_BASE = 0x00006c0c,
188 HOST_IDTR_BASE = 0x00006c0e,
189 HOST_IA32_SYSENTER_ESP = 0x00006c10,
190 HOST_IA32_SYSENTER_EIP = 0x00006c12,
191 HOST_RSP = 0x00006c14,
192 HOST_RIP = 0x00006c16,
193};
194
195#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
196
197#define EXIT_REASON_EXCEPTION_NMI 0
198#define EXIT_REASON_EXTERNAL_INTERRUPT 1
199#define EXIT_REASON_TRIPLE_FAULT 2
200
201#define EXIT_REASON_PENDING_INTERRUPT 7
202
203#define EXIT_REASON_TASK_SWITCH 9
204#define EXIT_REASON_CPUID 10
205#define EXIT_REASON_HLT 12
206#define EXIT_REASON_INVLPG 14
207#define EXIT_REASON_RDPMC 15
208#define EXIT_REASON_RDTSC 16
209#define EXIT_REASON_VMCALL 18
210#define EXIT_REASON_VMCLEAR 19
211#define EXIT_REASON_VMLAUNCH 20
212#define EXIT_REASON_VMPTRLD 21
213#define EXIT_REASON_VMPTRST 22
214#define EXIT_REASON_VMREAD 23
215#define EXIT_REASON_VMRESUME 24
216#define EXIT_REASON_VMWRITE 25
217#define EXIT_REASON_VMOFF 26
218#define EXIT_REASON_VMON 27
219#define EXIT_REASON_CR_ACCESS 28
220#define EXIT_REASON_DR_ACCESS 29
221#define EXIT_REASON_IO_INSTRUCTION 30
222#define EXIT_REASON_MSR_READ 31
223#define EXIT_REASON_MSR_WRITE 32
224#define EXIT_REASON_MWAIT_INSTRUCTION 36
225#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
226#define EXIT_REASON_APIC_ACCESS 44
227#define EXIT_REASON_WBINVD 54
228
229/*
230 * Interruption-information format
231 */
232#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
233#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
234#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
235#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
236
237#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
238#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
239#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK
240#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
241
242#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
243#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
244#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
245
246/*
247 * Exit Qualifications for MOV for Control Register Access
248 */
249#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg. */
250#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
251#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
252#define LMSW_SOURCE_DATA_SHIFT 16
253#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 31:16, lmsw source */
254#define REG_EAX (0 << 8)
255#define REG_ECX (1 << 8)
256#define REG_EDX (2 << 8)
257#define REG_EBX (3 << 8)
258#define REG_ESP (4 << 8)
259#define REG_EBP (5 << 8)
260#define REG_ESI (6 << 8)
261#define REG_EDI (7 << 8)
262#define REG_R8 (8 << 8)
263#define REG_R9 (9 << 8)
264#define REG_R10 (10 << 8)
265#define REG_R11 (11 << 8)
266#define REG_R12 (12 << 8)
267#define REG_R13 (13 << 8)
268#define REG_R14 (14 << 8)
269#define REG_R15 (15 << 8)
270
271/*
272 * Exit Qualifications for MOV for Debug Register Access
273 */
274#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
275#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
276#define TYPE_MOV_TO_DR (0 << 4)
277#define TYPE_MOV_FROM_DR (1 << 4)
278#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
279
280
281/* segment AR */
282#define SEGMENT_AR_L_MASK (1 << 13)
283
284#define AR_TYPE_ACCESSES_MASK 1
285#define AR_TYPE_READABLE_MASK (1 << 1)
286#define AR_TYPE_WRITEABLE_MASK (1 << 2)
287#define AR_TYPE_CODE_MASK (1 << 3)
288#define AR_TYPE_MASK 0x0f
289#define AR_TYPE_BUSY_64_TSS 11
290#define AR_TYPE_BUSY_32_TSS 11
291#define AR_TYPE_BUSY_16_TSS 3
292#define AR_TYPE_LDT 2
293
294#define AR_UNUSABLE_MASK (1 << 16)
295#define AR_S_MASK (1 << 4)
296#define AR_P_MASK (1 << 7)
297#define AR_L_MASK (1 << 13)
298#define AR_DB_MASK (1 << 14)
299#define AR_G_MASK (1 << 15)
300#define AR_DPL_SHIFT 5
301#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
302
303#define AR_RESERVD_MASK 0xfffe0f00
304
305#define MSR_IA32_VMX_BASIC 0x480
306#define MSR_IA32_VMX_PINBASED_CTLS 0x481
307#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
308#define MSR_IA32_VMX_EXIT_CTLS 0x483
309#define MSR_IA32_VMX_ENTRY_CTLS 0x484
310#define MSR_IA32_VMX_MISC 0x485
311#define MSR_IA32_VMX_CR0_FIXED0 0x486
312#define MSR_IA32_VMX_CR0_FIXED1 0x487
313#define MSR_IA32_VMX_CR4_FIXED0 0x488
314#define MSR_IA32_VMX_CR4_FIXED1 0x489
315#define MSR_IA32_VMX_VMCS_ENUM 0x48a
316#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
317
318#define MSR_IA32_FEATURE_CONTROL 0x3a
319#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
320#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
321
322#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
323
324#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
new file mode 100644
index 00000000000..cf530814868
--- /dev/null
+++ b/arch/x86/kvm/x86.c
@@ -0,0 +1,3287 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * derived from drivers/kvm/kvm_main.c
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16
17#include <linux/kvm_host.h>
18#include "segment_descriptor.h"
19#include "irq.h"
20#include "mmu.h"
21
22#include <linux/kvm.h>
23#include <linux/fs.h>
24#include <linux/vmalloc.h>
25#include <linux/module.h>
26#include <linux/mman.h>
27#include <linux/highmem.h>
28
29#include <asm/uaccess.h>
30#include <asm/msr.h>
31
32#define MAX_IO_MSRS 256
33#define CR0_RESERVED_BITS \
34 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
35 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
36 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
37#define CR4_RESERVED_BITS \
38 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
39 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
40 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
41 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
42
43#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
44#define EFER_RESERVED_BITS 0xfffffffffffff2fe
45
46#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
47#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
48
49struct kvm_x86_ops *kvm_x86_ops;
50
51struct kvm_stats_debugfs_item debugfs_entries[] = {
52 { "pf_fixed", VCPU_STAT(pf_fixed) },
53 { "pf_guest", VCPU_STAT(pf_guest) },
54 { "tlb_flush", VCPU_STAT(tlb_flush) },
55 { "invlpg", VCPU_STAT(invlpg) },
56 { "exits", VCPU_STAT(exits) },
57 { "io_exits", VCPU_STAT(io_exits) },
58 { "mmio_exits", VCPU_STAT(mmio_exits) },
59 { "signal_exits", VCPU_STAT(signal_exits) },
60 { "irq_window", VCPU_STAT(irq_window_exits) },
61 { "halt_exits", VCPU_STAT(halt_exits) },
62 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
63 { "request_irq", VCPU_STAT(request_irq_exits) },
64 { "irq_exits", VCPU_STAT(irq_exits) },
65 { "host_state_reload", VCPU_STAT(host_state_reload) },
66 { "efer_reload", VCPU_STAT(efer_reload) },
67 { "fpu_reload", VCPU_STAT(fpu_reload) },
68 { "insn_emulation", VCPU_STAT(insn_emulation) },
69 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
70 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
71 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
72 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
73 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
74 { "mmu_flooded", VM_STAT(mmu_flooded) },
75 { "mmu_recycled", VM_STAT(mmu_recycled) },
76 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
77 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
78 { NULL }
79};
80
81
82unsigned long segment_base(u16 selector)
83{
84 struct descriptor_table gdt;
85 struct segment_descriptor *d;
86 unsigned long table_base;
87 unsigned long v;
88
89 if (selector == 0)
90 return 0;
91
92 asm("sgdt %0" : "=m"(gdt));
93 table_base = gdt.base;
94
95 if (selector & 4) { /* from ldt */
96 u16 ldt_selector;
97
98 asm("sldt %0" : "=g"(ldt_selector));
99 table_base = segment_base(ldt_selector);
100 }
101 d = (struct segment_descriptor *)(table_base + (selector & ~7));
102 v = d->base_low | ((unsigned long)d->base_mid << 16) |
103 ((unsigned long)d->base_high << 24);
104#ifdef CONFIG_X86_64
105 if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
106 v |= ((unsigned long) \
107 ((struct segment_descriptor_64 *)d)->base_higher) << 32;
108#endif
109 return v;
110}
111EXPORT_SYMBOL_GPL(segment_base);
112
113u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
114{
115 if (irqchip_in_kernel(vcpu->kvm))
116 return vcpu->arch.apic_base;
117 else
118 return vcpu->arch.apic_base;
119}
120EXPORT_SYMBOL_GPL(kvm_get_apic_base);
121
122void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
123{
124 /* TODO: reserved bits check */
125 if (irqchip_in_kernel(vcpu->kvm))
126 kvm_lapic_set_base(vcpu, data);
127 else
128 vcpu->arch.apic_base = data;
129}
130EXPORT_SYMBOL_GPL(kvm_set_apic_base);
131
132void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
133{
134 WARN_ON(vcpu->arch.exception.pending);
135 vcpu->arch.exception.pending = true;
136 vcpu->arch.exception.has_error_code = false;
137 vcpu->arch.exception.nr = nr;
138}
139EXPORT_SYMBOL_GPL(kvm_queue_exception);
140
141void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
142 u32 error_code)
143{
144 ++vcpu->stat.pf_guest;
145 if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
146 printk(KERN_DEBUG "kvm: inject_page_fault:"
147 " double fault 0x%lx\n", addr);
148 vcpu->arch.exception.nr = DF_VECTOR;
149 vcpu->arch.exception.error_code = 0;
150 return;
151 }
152 vcpu->arch.cr2 = addr;
153 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
154}
155
156void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
157{
158 WARN_ON(vcpu->arch.exception.pending);
159 vcpu->arch.exception.pending = true;
160 vcpu->arch.exception.has_error_code = true;
161 vcpu->arch.exception.nr = nr;
162 vcpu->arch.exception.error_code = error_code;
163}
164EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
165
166static void __queue_exception(struct kvm_vcpu *vcpu)
167{
168 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
169 vcpu->arch.exception.has_error_code,
170 vcpu->arch.exception.error_code);
171}
172
173/*
174 * Load the pae pdptrs. Return true if they are all valid.
175 */
176int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
177{
178 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
179 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
180 int i;
181 int ret;
182 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
183
184 down_read(&current->mm->mmap_sem);
185 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
186 offset * sizeof(u64), sizeof(pdpte));
187 if (ret < 0) {
188 ret = 0;
189 goto out;
190 }
191 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
192 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
193 ret = 0;
194 goto out;
195 }
196 }
197 ret = 1;
198
199 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
200out:
201 up_read(&current->mm->mmap_sem);
202
203 return ret;
204}
205
206static bool pdptrs_changed(struct kvm_vcpu *vcpu)
207{
208 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
209 bool changed = true;
210 int r;
211
212 if (is_long_mode(vcpu) || !is_pae(vcpu))
213 return false;
214
215 down_read(&current->mm->mmap_sem);
216 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
217 if (r < 0)
218 goto out;
219 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
220out:
221 up_read(&current->mm->mmap_sem);
222
223 return changed;
224}
225
226void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
227{
228 if (cr0 & CR0_RESERVED_BITS) {
229 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
230 cr0, vcpu->arch.cr0);
231 kvm_inject_gp(vcpu, 0);
232 return;
233 }
234
235 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
236 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
237 kvm_inject_gp(vcpu, 0);
238 return;
239 }
240
241 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
242 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
243 "and a clear PE flag\n");
244 kvm_inject_gp(vcpu, 0);
245 return;
246 }
247
248 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
249#ifdef CONFIG_X86_64
250 if ((vcpu->arch.shadow_efer & EFER_LME)) {
251 int cs_db, cs_l;
252
253 if (!is_pae(vcpu)) {
254 printk(KERN_DEBUG "set_cr0: #GP, start paging "
255 "in long mode while PAE is disabled\n");
256 kvm_inject_gp(vcpu, 0);
257 return;
258 }
259 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
260 if (cs_l) {
261 printk(KERN_DEBUG "set_cr0: #GP, start paging "
262 "in long mode while CS.L == 1\n");
263 kvm_inject_gp(vcpu, 0);
264 return;
265
266 }
267 } else
268#endif
269 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
270 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
271 "reserved bits\n");
272 kvm_inject_gp(vcpu, 0);
273 return;
274 }
275
276 }
277
278 kvm_x86_ops->set_cr0(vcpu, cr0);
279 vcpu->arch.cr0 = cr0;
280
281 kvm_mmu_reset_context(vcpu);
282 return;
283}
284EXPORT_SYMBOL_GPL(set_cr0);
285
286void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
287{
288 set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
289}
290EXPORT_SYMBOL_GPL(lmsw);
291
292void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
293{
294 if (cr4 & CR4_RESERVED_BITS) {
295 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
296 kvm_inject_gp(vcpu, 0);
297 return;
298 }
299
300 if (is_long_mode(vcpu)) {
301 if (!(cr4 & X86_CR4_PAE)) {
302 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
303 "in long mode\n");
304 kvm_inject_gp(vcpu, 0);
305 return;
306 }
307 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
308 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
309 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
310 kvm_inject_gp(vcpu, 0);
311 return;
312 }
313
314 if (cr4 & X86_CR4_VMXE) {
315 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
316 kvm_inject_gp(vcpu, 0);
317 return;
318 }
319 kvm_x86_ops->set_cr4(vcpu, cr4);
320 vcpu->arch.cr4 = cr4;
321 kvm_mmu_reset_context(vcpu);
322}
323EXPORT_SYMBOL_GPL(set_cr4);
324
325void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
326{
327 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
328 kvm_mmu_flush_tlb(vcpu);
329 return;
330 }
331
332 if (is_long_mode(vcpu)) {
333 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
334 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
335 kvm_inject_gp(vcpu, 0);
336 return;
337 }
338 } else {
339 if (is_pae(vcpu)) {
340 if (cr3 & CR3_PAE_RESERVED_BITS) {
341 printk(KERN_DEBUG
342 "set_cr3: #GP, reserved bits\n");
343 kvm_inject_gp(vcpu, 0);
344 return;
345 }
346 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
347 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
348 "reserved bits\n");
349 kvm_inject_gp(vcpu, 0);
350 return;
351 }
352 }
353 /*
354 * We don't check reserved bits in nonpae mode, because
355 * this isn't enforced, and VMware depends on this.
356 */
357 }
358
359 down_read(&current->mm->mmap_sem);
360 /*
361 * Does the new cr3 value map to physical memory? (Note, we
362 * catch an invalid cr3 even in real-mode, because it would
363 * cause trouble later on when we turn on paging anyway.)
364 *
365 * A real CPU would silently accept an invalid cr3 and would
366 * attempt to use it - with largely undefined (and often hard
367 * to debug) behavior on the guest side.
368 */
369 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
370 kvm_inject_gp(vcpu, 0);
371 else {
372 vcpu->arch.cr3 = cr3;
373 vcpu->arch.mmu.new_cr3(vcpu);
374 }
375 up_read(&current->mm->mmap_sem);
376}
377EXPORT_SYMBOL_GPL(set_cr3);
378
379void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
380{
381 if (cr8 & CR8_RESERVED_BITS) {
382 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
383 kvm_inject_gp(vcpu, 0);
384 return;
385 }
386 if (irqchip_in_kernel(vcpu->kvm))
387 kvm_lapic_set_tpr(vcpu, cr8);
388 else
389 vcpu->arch.cr8 = cr8;
390}
391EXPORT_SYMBOL_GPL(set_cr8);
392
393unsigned long get_cr8(struct kvm_vcpu *vcpu)
394{
395 if (irqchip_in_kernel(vcpu->kvm))
396 return kvm_lapic_get_cr8(vcpu);
397 else
398 return vcpu->arch.cr8;
399}
400EXPORT_SYMBOL_GPL(get_cr8);
401
402/*
403 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
404 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
405 *
406 * This list is modified at module load time to reflect the
407 * capabilities of the host cpu.
408 */
409static u32 msrs_to_save[] = {
410 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
411 MSR_K6_STAR,
412#ifdef CONFIG_X86_64
413 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
414#endif
415 MSR_IA32_TIME_STAMP_COUNTER,
416};
417
418static unsigned num_msrs_to_save;
419
420static u32 emulated_msrs[] = {
421 MSR_IA32_MISC_ENABLE,
422};
423
424#ifdef CONFIG_X86_64
425
426static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
427{
428 if (efer & EFER_RESERVED_BITS) {
429 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
430 efer);
431 kvm_inject_gp(vcpu, 0);
432 return;
433 }
434
435 if (is_paging(vcpu)
436 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
437 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
438 kvm_inject_gp(vcpu, 0);
439 return;
440 }
441
442 kvm_x86_ops->set_efer(vcpu, efer);
443
444 efer &= ~EFER_LMA;
445 efer |= vcpu->arch.shadow_efer & EFER_LMA;
446
447 vcpu->arch.shadow_efer = efer;
448}
449
450#endif
451
452/*
453 * Writes msr value into the appropriate "register".
454 * Returns 0 on success, non-0 otherwise.
455 * Assumes vcpu_load() was already called.
456 */
457int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
458{
459 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
460}
461
462/*
463 * Adapt set_msr() to msr_io()'s calling convention
464 */
465static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
466{
467 return kvm_set_msr(vcpu, index, *data);
468}
469
470
471int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
472{
473 switch (msr) {
474#ifdef CONFIG_X86_64
475 case MSR_EFER:
476 set_efer(vcpu, data);
477 break;
478#endif
479 case MSR_IA32_MC0_STATUS:
480 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
481 __FUNCTION__, data);
482 break;
483 case MSR_IA32_MCG_STATUS:
484 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
485 __FUNCTION__, data);
486 break;
487 case MSR_IA32_UCODE_REV:
488 case MSR_IA32_UCODE_WRITE:
489 case 0x200 ... 0x2ff: /* MTRRs */
490 break;
491 case MSR_IA32_APICBASE:
492 kvm_set_apic_base(vcpu, data);
493 break;
494 case MSR_IA32_MISC_ENABLE:
495 vcpu->arch.ia32_misc_enable_msr = data;
496 break;
497 default:
498 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
499 return 1;
500 }
501 return 0;
502}
503EXPORT_SYMBOL_GPL(kvm_set_msr_common);
504
505
506/*
507 * Reads an msr value (of 'msr_index') into 'pdata'.
508 * Returns 0 on success, non-0 otherwise.
509 * Assumes vcpu_load() was already called.
510 */
511int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
512{
513 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
514}
515
516int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
517{
518 u64 data;
519
520 switch (msr) {
521 case 0xc0010010: /* SYSCFG */
522 case 0xc0010015: /* HWCR */
523 case MSR_IA32_PLATFORM_ID:
524 case MSR_IA32_P5_MC_ADDR:
525 case MSR_IA32_P5_MC_TYPE:
526 case MSR_IA32_MC0_CTL:
527 case MSR_IA32_MCG_STATUS:
528 case MSR_IA32_MCG_CAP:
529 case MSR_IA32_MC0_MISC:
530 case MSR_IA32_MC0_MISC+4:
531 case MSR_IA32_MC0_MISC+8:
532 case MSR_IA32_MC0_MISC+12:
533 case MSR_IA32_MC0_MISC+16:
534 case MSR_IA32_UCODE_REV:
535 case MSR_IA32_PERF_STATUS:
536 case MSR_IA32_EBL_CR_POWERON:
537 /* MTRR registers */
538 case 0xfe:
539 case 0x200 ... 0x2ff:
540 data = 0;
541 break;
542 case 0xcd: /* fsb frequency */
543 data = 3;
544 break;
545 case MSR_IA32_APICBASE:
546 data = kvm_get_apic_base(vcpu);
547 break;
548 case MSR_IA32_MISC_ENABLE:
549 data = vcpu->arch.ia32_misc_enable_msr;
550 break;
551#ifdef CONFIG_X86_64
552 case MSR_EFER:
553 data = vcpu->arch.shadow_efer;
554 break;
555#endif
556 default:
557 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
558 return 1;
559 }
560 *pdata = data;
561 return 0;
562}
563EXPORT_SYMBOL_GPL(kvm_get_msr_common);
564
565/*
566 * Read or write a bunch of msrs. All parameters are kernel addresses.
567 *
568 * @return number of msrs set successfully.
569 */
570static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
571 struct kvm_msr_entry *entries,
572 int (*do_msr)(struct kvm_vcpu *vcpu,
573 unsigned index, u64 *data))
574{
575 int i;
576
577 vcpu_load(vcpu);
578
579 for (i = 0; i < msrs->nmsrs; ++i)
580 if (do_msr(vcpu, entries[i].index, &entries[i].data))
581 break;
582
583 vcpu_put(vcpu);
584
585 return i;
586}
587
588/*
589 * Read or write a bunch of msrs. Parameters are user addresses.
590 *
591 * @return number of msrs set successfully.
592 */
593static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
594 int (*do_msr)(struct kvm_vcpu *vcpu,
595 unsigned index, u64 *data),
596 int writeback)
597{
598 struct kvm_msrs msrs;
599 struct kvm_msr_entry *entries;
600 int r, n;
601 unsigned size;
602
603 r = -EFAULT;
604 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
605 goto out;
606
607 r = -E2BIG;
608 if (msrs.nmsrs >= MAX_IO_MSRS)
609 goto out;
610
611 r = -ENOMEM;
612 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
613 entries = vmalloc(size);
614 if (!entries)
615 goto out;
616
617 r = -EFAULT;
618 if (copy_from_user(entries, user_msrs->entries, size))
619 goto out_free;
620
621 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
622 if (r < 0)
623 goto out_free;
624
625 r = -EFAULT;
626 if (writeback && copy_to_user(user_msrs->entries, entries, size))
627 goto out_free;
628
629 r = n;
630
631out_free:
632 vfree(entries);
633out:
634 return r;
635}
636
637/*
638 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
639 * cached on it.
640 */
641void decache_vcpus_on_cpu(int cpu)
642{
643 struct kvm *vm;
644 struct kvm_vcpu *vcpu;
645 int i;
646
647 spin_lock(&kvm_lock);
648 list_for_each_entry(vm, &vm_list, vm_list)
649 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
650 vcpu = vm->vcpus[i];
651 if (!vcpu)
652 continue;
653 /*
654 * If the vcpu is locked, then it is running on some
655 * other cpu and therefore it is not cached on the
656 * cpu in question.
657 *
658 * If it's not locked, check the last cpu it executed
659 * on.
660 */
661 if (mutex_trylock(&vcpu->mutex)) {
662 if (vcpu->cpu == cpu) {
663 kvm_x86_ops->vcpu_decache(vcpu);
664 vcpu->cpu = -1;
665 }
666 mutex_unlock(&vcpu->mutex);
667 }
668 }
669 spin_unlock(&kvm_lock);
670}
671
672int kvm_dev_ioctl_check_extension(long ext)
673{
674 int r;
675
676 switch (ext) {
677 case KVM_CAP_IRQCHIP:
678 case KVM_CAP_HLT:
679 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
680 case KVM_CAP_USER_MEMORY:
681 case KVM_CAP_SET_TSS_ADDR:
682 case KVM_CAP_EXT_CPUID:
683 r = 1;
684 break;
685 case KVM_CAP_VAPIC:
686 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
687 break;
688 default:
689 r = 0;
690 break;
691 }
692 return r;
693
694}
695
696long kvm_arch_dev_ioctl(struct file *filp,
697 unsigned int ioctl, unsigned long arg)
698{
699 void __user *argp = (void __user *)arg;
700 long r;
701
702 switch (ioctl) {
703 case KVM_GET_MSR_INDEX_LIST: {
704 struct kvm_msr_list __user *user_msr_list = argp;
705 struct kvm_msr_list msr_list;
706 unsigned n;
707
708 r = -EFAULT;
709 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
710 goto out;
711 n = msr_list.nmsrs;
712 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
713 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
714 goto out;
715 r = -E2BIG;
716 if (n < num_msrs_to_save)
717 goto out;
718 r = -EFAULT;
719 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
720 num_msrs_to_save * sizeof(u32)))
721 goto out;
722 if (copy_to_user(user_msr_list->indices
723 + num_msrs_to_save * sizeof(u32),
724 &emulated_msrs,
725 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
726 goto out;
727 r = 0;
728 break;
729 }
730 default:
731 r = -EINVAL;
732 }
733out:
734 return r;
735}
736
737void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
738{
739 kvm_x86_ops->vcpu_load(vcpu, cpu);
740}
741
742void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
743{
744 kvm_x86_ops->vcpu_put(vcpu);
745 kvm_put_guest_fpu(vcpu);
746}
747
748static int is_efer_nx(void)
749{
750 u64 efer;
751
752 rdmsrl(MSR_EFER, efer);
753 return efer & EFER_NX;
754}
755
756static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
757{
758 int i;
759 struct kvm_cpuid_entry2 *e, *entry;
760
761 entry = NULL;
762 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
763 e = &vcpu->arch.cpuid_entries[i];
764 if (e->function == 0x80000001) {
765 entry = e;
766 break;
767 }
768 }
769 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
770 entry->edx &= ~(1 << 20);
771 printk(KERN_INFO "kvm: guest NX capability removed\n");
772 }
773}
774
775/* when an old userspace process fills in cpuid entries for a new kernel module */
776static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
777 struct kvm_cpuid *cpuid,
778 struct kvm_cpuid_entry __user *entries)
779{
780 int r, i;
781 struct kvm_cpuid_entry *cpuid_entries;
782
783 r = -E2BIG;
784 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
785 goto out;
786 r = -ENOMEM;
787 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
788 if (!cpuid_entries)
789 goto out;
790 r = -EFAULT;
791 if (copy_from_user(cpuid_entries, entries,
792 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
793 goto out_free;
794 for (i = 0; i < cpuid->nent; i++) {
795 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
796 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
797 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
798 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
799 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
800 vcpu->arch.cpuid_entries[i].index = 0;
801 vcpu->arch.cpuid_entries[i].flags = 0;
802 vcpu->arch.cpuid_entries[i].padding[0] = 0;
803 vcpu->arch.cpuid_entries[i].padding[1] = 0;
804 vcpu->arch.cpuid_entries[i].padding[2] = 0;
805 }
806 vcpu->arch.cpuid_nent = cpuid->nent;
807 cpuid_fix_nx_cap(vcpu);
808 r = 0;
809
810out_free:
811 vfree(cpuid_entries);
812out:
813 return r;
814}
815
816static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
817 struct kvm_cpuid2 *cpuid,
818 struct kvm_cpuid_entry2 __user *entries)
819{
820 int r;
821
822 r = -E2BIG;
823 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
824 goto out;
825 r = -EFAULT;
826 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
827 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
828 goto out;
829 vcpu->arch.cpuid_nent = cpuid->nent;
830 return 0;
831
832out:
833 return r;
834}
835
836static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
837 struct kvm_cpuid2 *cpuid,
838 struct kvm_cpuid_entry2 __user *entries)
839{
840 int r;
841
842 r = -E2BIG;
843 if (cpuid->nent < vcpu->arch.cpuid_nent)
844 goto out;
845 r = -EFAULT;
846 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
847 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
848 goto out;
849 return 0;
850
851out:
852 cpuid->nent = vcpu->arch.cpuid_nent;
853 return r;
854}
855
856static inline u32 bit(int bitno)
857{
858 return 1 << (bitno & 31);
859}
860
861static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
862 u32 index)
863{
864 entry->function = function;
865 entry->index = index;
866 cpuid_count(entry->function, entry->index,
867 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
868 entry->flags = 0;
869}
870
871static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
872 u32 index, int *nent, int maxnent)
873{
874 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
875 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
876 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
877 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
878 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
879 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
880 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
881 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
882 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
883 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
884 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
885 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
886 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
887 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
888 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
889 bit(X86_FEATURE_PGE) |
890 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
891 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
892 bit(X86_FEATURE_SYSCALL) |
893 (bit(X86_FEATURE_NX) && is_efer_nx()) |
894#ifdef CONFIG_X86_64
895 bit(X86_FEATURE_LM) |
896#endif
897 bit(X86_FEATURE_MMXEXT) |
898 bit(X86_FEATURE_3DNOWEXT) |
899 bit(X86_FEATURE_3DNOW);
900 const u32 kvm_supported_word3_x86_features =
901 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
902 const u32 kvm_supported_word6_x86_features =
903 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
904
905 /* all function 2 cpuid_count() calls should be made on the same cpu */
906 get_cpu();
907 do_cpuid_1_ent(entry, function, index);
908 ++*nent;
909
910 switch (function) {
911 case 0:
912 entry->eax = min(entry->eax, (u32)0xb);
913 break;
914 case 1:
915 entry->edx &= kvm_supported_word0_x86_features;
916 entry->ecx &= kvm_supported_word3_x86_features;
917 break;
918 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
919 * may return different values. This forces us to get_cpu() before
920 * issuing the first command, and also to emulate this annoying behavior
921 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
922 case 2: {
923 int t, times = entry->eax & 0xff;
924
925 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
926 for (t = 1; t < times && *nent < maxnent; ++t) {
927 do_cpuid_1_ent(&entry[t], function, 0);
928 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
929 ++*nent;
930 }
931 break;
932 }
933 /* function 4 and 0xb have additional index. */
934 case 4: {
935 int index, cache_type;
936
937 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
938 /* read more entries until cache_type is zero */
939 for (index = 1; *nent < maxnent; ++index) {
940 cache_type = entry[index - 1].eax & 0x1f;
941 if (!cache_type)
942 break;
943 do_cpuid_1_ent(&entry[index], function, index);
944 entry[index].flags |=
945 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
946 ++*nent;
947 }
948 break;
949 }
950 case 0xb: {
951 int index, level_type;
952
953 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
954 /* read more entries until level_type is zero */
955 for (index = 1; *nent < maxnent; ++index) {
956 level_type = entry[index - 1].ecx & 0xff;
957 if (!level_type)
958 break;
959 do_cpuid_1_ent(&entry[index], function, index);
960 entry[index].flags |=
961 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
962 ++*nent;
963 }
964 break;
965 }
966 case 0x80000000:
967 entry->eax = min(entry->eax, 0x8000001a);
968 break;
969 case 0x80000001:
970 entry->edx &= kvm_supported_word1_x86_features;
971 entry->ecx &= kvm_supported_word6_x86_features;
972 break;
973 }
974 put_cpu();
975}
976
977static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
978 struct kvm_cpuid2 *cpuid,
979 struct kvm_cpuid_entry2 __user *entries)
980{
981 struct kvm_cpuid_entry2 *cpuid_entries;
982 int limit, nent = 0, r = -E2BIG;
983 u32 func;
984
985 if (cpuid->nent < 1)
986 goto out;
987 r = -ENOMEM;
988 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
989 if (!cpuid_entries)
990 goto out;
991
992 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
993 limit = cpuid_entries[0].eax;
994 for (func = 1; func <= limit && nent < cpuid->nent; ++func)
995 do_cpuid_ent(&cpuid_entries[nent], func, 0,
996 &nent, cpuid->nent);
997 r = -E2BIG;
998 if (nent >= cpuid->nent)
999 goto out_free;
1000
1001 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1002 limit = cpuid_entries[nent - 1].eax;
1003 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1004 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1005 &nent, cpuid->nent);
1006 r = -EFAULT;
1007 if (copy_to_user(entries, cpuid_entries,
1008 nent * sizeof(struct kvm_cpuid_entry2)))
1009 goto out_free;
1010 cpuid->nent = nent;
1011 r = 0;
1012
1013out_free:
1014 vfree(cpuid_entries);
1015out:
1016 return r;
1017}
1018
1019static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1020 struct kvm_lapic_state *s)
1021{
1022 vcpu_load(vcpu);
1023 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1024 vcpu_put(vcpu);
1025
1026 return 0;
1027}
1028
1029static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1030 struct kvm_lapic_state *s)
1031{
1032 vcpu_load(vcpu);
1033 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1034 kvm_apic_post_state_restore(vcpu);
1035 vcpu_put(vcpu);
1036
1037 return 0;
1038}
1039
1040static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1041 struct kvm_interrupt *irq)
1042{
1043 if (irq->irq < 0 || irq->irq >= 256)
1044 return -EINVAL;
1045 if (irqchip_in_kernel(vcpu->kvm))
1046 return -ENXIO;
1047 vcpu_load(vcpu);
1048
1049 set_bit(irq->irq, vcpu->arch.irq_pending);
1050 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1051
1052 vcpu_put(vcpu);
1053
1054 return 0;
1055}
1056
1057static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1058 struct kvm_tpr_access_ctl *tac)
1059{
1060 if (tac->flags)
1061 return -EINVAL;
1062 vcpu->arch.tpr_access_reporting = !!tac->enabled;
1063 return 0;
1064}
1065
1066long kvm_arch_vcpu_ioctl(struct file *filp,
1067 unsigned int ioctl, unsigned long arg)
1068{
1069 struct kvm_vcpu *vcpu = filp->private_data;
1070 void __user *argp = (void __user *)arg;
1071 int r;
1072
1073 switch (ioctl) {
1074 case KVM_GET_LAPIC: {
1075 struct kvm_lapic_state lapic;
1076
1077 memset(&lapic, 0, sizeof lapic);
1078 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1079 if (r)
1080 goto out;
1081 r = -EFAULT;
1082 if (copy_to_user(argp, &lapic, sizeof lapic))
1083 goto out;
1084 r = 0;
1085 break;
1086 }
1087 case KVM_SET_LAPIC: {
1088 struct kvm_lapic_state lapic;
1089
1090 r = -EFAULT;
1091 if (copy_from_user(&lapic, argp, sizeof lapic))
1092 goto out;
1093 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
1094 if (r)
1095 goto out;
1096 r = 0;
1097 break;
1098 }
1099 case KVM_INTERRUPT: {
1100 struct kvm_interrupt irq;
1101
1102 r = -EFAULT;
1103 if (copy_from_user(&irq, argp, sizeof irq))
1104 goto out;
1105 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1106 if (r)
1107 goto out;
1108 r = 0;
1109 break;
1110 }
1111 case KVM_SET_CPUID: {
1112 struct kvm_cpuid __user *cpuid_arg = argp;
1113 struct kvm_cpuid cpuid;
1114
1115 r = -EFAULT;
1116 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1117 goto out;
1118 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1119 if (r)
1120 goto out;
1121 break;
1122 }
1123 case KVM_SET_CPUID2: {
1124 struct kvm_cpuid2 __user *cpuid_arg = argp;
1125 struct kvm_cpuid2 cpuid;
1126
1127 r = -EFAULT;
1128 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1129 goto out;
1130 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1131 cpuid_arg->entries);
1132 if (r)
1133 goto out;
1134 break;
1135 }
1136 case KVM_GET_CPUID2: {
1137 struct kvm_cpuid2 __user *cpuid_arg = argp;
1138 struct kvm_cpuid2 cpuid;
1139
1140 r = -EFAULT;
1141 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1142 goto out;
1143 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1144 cpuid_arg->entries);
1145 if (r)
1146 goto out;
1147 r = -EFAULT;
1148 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1149 goto out;
1150 r = 0;
1151 break;
1152 }
1153 case KVM_GET_MSRS:
1154 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1155 break;
1156 case KVM_SET_MSRS:
1157 r = msr_io(vcpu, argp, do_set_msr, 0);
1158 break;
1159 case KVM_TPR_ACCESS_REPORTING: {
1160 struct kvm_tpr_access_ctl tac;
1161
1162 r = -EFAULT;
1163 if (copy_from_user(&tac, argp, sizeof tac))
1164 goto out;
1165 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1166 if (r)
1167 goto out;
1168 r = -EFAULT;
1169 if (copy_to_user(argp, &tac, sizeof tac))
1170 goto out;
1171 r = 0;
1172 break;
1173 }
1174 case KVM_SET_VAPIC_ADDR: {
1175 struct kvm_vapic_addr va;
1176
1177 r = -EINVAL;
1178 if (!irqchip_in_kernel(vcpu->kvm))
1179 goto out;
1180 r = -EFAULT;
1181 if (copy_from_user(&va, argp, sizeof va))
1182 goto out;
1183 r = 0;
1184 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1185 break;
1186 }
1187 default:
1188 r = -EINVAL;
1189 }
1190out:
1191 return r;
1192}
1193
1194static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1195{
1196 int ret;
1197
1198 if (addr > (unsigned int)(-3 * PAGE_SIZE))
1199 return -1;
1200 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1201 return ret;
1202}
1203
1204static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1205 u32 kvm_nr_mmu_pages)
1206{
1207 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1208 return -EINVAL;
1209
1210 down_write(&current->mm->mmap_sem);
1211
1212 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1213 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1214
1215 up_write(&current->mm->mmap_sem);
1216 return 0;
1217}
1218
1219static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1220{
1221 return kvm->arch.n_alloc_mmu_pages;
1222}
1223
1224gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1225{
1226 int i;
1227 struct kvm_mem_alias *alias;
1228
1229 for (i = 0; i < kvm->arch.naliases; ++i) {
1230 alias = &kvm->arch.aliases[i];
1231 if (gfn >= alias->base_gfn
1232 && gfn < alias->base_gfn + alias->npages)
1233 return alias->target_gfn + gfn - alias->base_gfn;
1234 }
1235 return gfn;
1236}
1237
1238/*
1239 * Set a new alias region. Aliases map a portion of physical memory into
1240 * another portion. This is useful for memory windows, for example the PC
1241 * VGA region.
1242 */
1243static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1244 struct kvm_memory_alias *alias)
1245{
1246 int r, n;
1247 struct kvm_mem_alias *p;
1248
1249 r = -EINVAL;
1250 /* General sanity checks */
1251 if (alias->memory_size & (PAGE_SIZE - 1))
1252 goto out;
1253 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1254 goto out;
1255 if (alias->slot >= KVM_ALIAS_SLOTS)
1256 goto out;
1257 if (alias->guest_phys_addr + alias->memory_size
1258 < alias->guest_phys_addr)
1259 goto out;
1260 if (alias->target_phys_addr + alias->memory_size
1261 < alias->target_phys_addr)
1262 goto out;
1263
1264 down_write(&current->mm->mmap_sem);
1265
1266 p = &kvm->arch.aliases[alias->slot];
1267 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1268 p->npages = alias->memory_size >> PAGE_SHIFT;
1269 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1270
1271 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1272 if (kvm->arch.aliases[n - 1].npages)
1273 break;
1274 kvm->arch.naliases = n;
1275
1276 kvm_mmu_zap_all(kvm);
1277
1278 up_write(&current->mm->mmap_sem);
1279
1280 return 0;
1281
1282out:
1283 return r;
1284}
1285
1286static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1287{
1288 int r;
1289
1290 r = 0;
1291 switch (chip->chip_id) {
1292 case KVM_IRQCHIP_PIC_MASTER:
1293 memcpy(&chip->chip.pic,
1294 &pic_irqchip(kvm)->pics[0],
1295 sizeof(struct kvm_pic_state));
1296 break;
1297 case KVM_IRQCHIP_PIC_SLAVE:
1298 memcpy(&chip->chip.pic,
1299 &pic_irqchip(kvm)->pics[1],
1300 sizeof(struct kvm_pic_state));
1301 break;
1302 case KVM_IRQCHIP_IOAPIC:
1303 memcpy(&chip->chip.ioapic,
1304 ioapic_irqchip(kvm),
1305 sizeof(struct kvm_ioapic_state));
1306 break;
1307 default:
1308 r = -EINVAL;
1309 break;
1310 }
1311 return r;
1312}
1313
1314static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1315{
1316 int r;
1317
1318 r = 0;
1319 switch (chip->chip_id) {
1320 case KVM_IRQCHIP_PIC_MASTER:
1321 memcpy(&pic_irqchip(kvm)->pics[0],
1322 &chip->chip.pic,
1323 sizeof(struct kvm_pic_state));
1324 break;
1325 case KVM_IRQCHIP_PIC_SLAVE:
1326 memcpy(&pic_irqchip(kvm)->pics[1],
1327 &chip->chip.pic,
1328 sizeof(struct kvm_pic_state));
1329 break;
1330 case KVM_IRQCHIP_IOAPIC:
1331 memcpy(ioapic_irqchip(kvm),
1332 &chip->chip.ioapic,
1333 sizeof(struct kvm_ioapic_state));
1334 break;
1335 default:
1336 r = -EINVAL;
1337 break;
1338 }
1339 kvm_pic_update_irq(pic_irqchip(kvm));
1340 return r;
1341}
1342
1343/*
1344 * Get (and clear) the dirty memory log for a memory slot.
1345 */
1346int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1347 struct kvm_dirty_log *log)
1348{
1349 int r;
1350 int n;
1351 struct kvm_memory_slot *memslot;
1352 int is_dirty = 0;
1353
1354 down_write(&current->mm->mmap_sem);
1355
1356 r = kvm_get_dirty_log(kvm, log, &is_dirty);
1357 if (r)
1358 goto out;
1359
1360 /* If nothing is dirty, don't bother messing with page tables. */
1361 if (is_dirty) {
1362 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1363 kvm_flush_remote_tlbs(kvm);
1364 memslot = &kvm->memslots[log->slot];
1365 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1366 memset(memslot->dirty_bitmap, 0, n);
1367 }
1368 r = 0;
1369out:
1370 up_write(&current->mm->mmap_sem);
1371 return r;
1372}
1373
1374long kvm_arch_vm_ioctl(struct file *filp,
1375 unsigned int ioctl, unsigned long arg)
1376{
1377 struct kvm *kvm = filp->private_data;
1378 void __user *argp = (void __user *)arg;
1379 int r = -EINVAL;
1380
1381 switch (ioctl) {
1382 case KVM_SET_TSS_ADDR:
1383 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1384 if (r < 0)
1385 goto out;
1386 break;
1387 case KVM_SET_MEMORY_REGION: {
1388 struct kvm_memory_region kvm_mem;
1389 struct kvm_userspace_memory_region kvm_userspace_mem;
1390
1391 r = -EFAULT;
1392 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1393 goto out;
1394 kvm_userspace_mem.slot = kvm_mem.slot;
1395 kvm_userspace_mem.flags = kvm_mem.flags;
1396 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1397 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1398 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1399 if (r)
1400 goto out;
1401 break;
1402 }
1403 case KVM_SET_NR_MMU_PAGES:
1404 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1405 if (r)
1406 goto out;
1407 break;
1408 case KVM_GET_NR_MMU_PAGES:
1409 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1410 break;
1411 case KVM_SET_MEMORY_ALIAS: {
1412 struct kvm_memory_alias alias;
1413
1414 r = -EFAULT;
1415 if (copy_from_user(&alias, argp, sizeof alias))
1416 goto out;
1417 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
1418 if (r)
1419 goto out;
1420 break;
1421 }
1422 case KVM_CREATE_IRQCHIP:
1423 r = -ENOMEM;
1424 kvm->arch.vpic = kvm_create_pic(kvm);
1425 if (kvm->arch.vpic) {
1426 r = kvm_ioapic_init(kvm);
1427 if (r) {
1428 kfree(kvm->arch.vpic);
1429 kvm->arch.vpic = NULL;
1430 goto out;
1431 }
1432 } else
1433 goto out;
1434 break;
1435 case KVM_IRQ_LINE: {
1436 struct kvm_irq_level irq_event;
1437
1438 r = -EFAULT;
1439 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1440 goto out;
1441 if (irqchip_in_kernel(kvm)) {
1442 mutex_lock(&kvm->lock);
1443 if (irq_event.irq < 16)
1444 kvm_pic_set_irq(pic_irqchip(kvm),
1445 irq_event.irq,
1446 irq_event.level);
1447 kvm_ioapic_set_irq(kvm->arch.vioapic,
1448 irq_event.irq,
1449 irq_event.level);
1450 mutex_unlock(&kvm->lock);
1451 r = 0;
1452 }
1453 break;
1454 }
1455 case KVM_GET_IRQCHIP: {
1456 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1457 struct kvm_irqchip chip;
1458
1459 r = -EFAULT;
1460 if (copy_from_user(&chip, argp, sizeof chip))
1461 goto out;
1462 r = -ENXIO;
1463 if (!irqchip_in_kernel(kvm))
1464 goto out;
1465 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1466 if (r)
1467 goto out;
1468 r = -EFAULT;
1469 if (copy_to_user(argp, &chip, sizeof chip))
1470 goto out;
1471 r = 0;
1472 break;
1473 }
1474 case KVM_SET_IRQCHIP: {
1475 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1476 struct kvm_irqchip chip;
1477
1478 r = -EFAULT;
1479 if (copy_from_user(&chip, argp, sizeof chip))
1480 goto out;
1481 r = -ENXIO;
1482 if (!irqchip_in_kernel(kvm))
1483 goto out;
1484 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1485 if (r)
1486 goto out;
1487 r = 0;
1488 break;
1489 }
1490 case KVM_GET_SUPPORTED_CPUID: {
1491 struct kvm_cpuid2 __user *cpuid_arg = argp;
1492 struct kvm_cpuid2 cpuid;
1493
1494 r = -EFAULT;
1495 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1496 goto out;
1497 r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
1498 cpuid_arg->entries);
1499 if (r)
1500 goto out;
1501
1502 r = -EFAULT;
1503 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1504 goto out;
1505 r = 0;
1506 break;
1507 }
1508 default:
1509 ;
1510 }
1511out:
1512 return r;
1513}
1514
1515static void kvm_init_msr_list(void)
1516{
1517 u32 dummy[2];
1518 unsigned i, j;
1519
1520 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1521 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1522 continue;
1523 if (j < i)
1524 msrs_to_save[j] = msrs_to_save[i];
1525 j++;
1526 }
1527 num_msrs_to_save = j;
1528}
1529
1530/*
1531 * Only the apic needs an MMIO device hook, so take a shortcut for now.
1532 */
1533static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1534 gpa_t addr)
1535{
1536 struct kvm_io_device *dev;
1537
1538 if (vcpu->arch.apic) {
1539 dev = &vcpu->arch.apic->dev;
1540 if (dev->in_range(dev, addr))
1541 return dev;
1542 }
1543 return NULL;
1544}
1545
1546
1547static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1548 gpa_t addr)
1549{
1550 struct kvm_io_device *dev;
1551
1552 dev = vcpu_find_pervcpu_dev(vcpu, addr);
1553 if (dev == NULL)
1554 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1555 return dev;
1556}
1557
1558int emulator_read_std(unsigned long addr,
1559 void *val,
1560 unsigned int bytes,
1561 struct kvm_vcpu *vcpu)
1562{
1563 void *data = val;
1564 int r = X86EMUL_CONTINUE;
1565
1566 down_read(&current->mm->mmap_sem);
1567 while (bytes) {
1568 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1569 unsigned offset = addr & (PAGE_SIZE-1);
1570 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1571 int ret;
1572
1573 if (gpa == UNMAPPED_GVA) {
1574 r = X86EMUL_PROPAGATE_FAULT;
1575 goto out;
1576 }
1577 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1578 if (ret < 0) {
1579 r = X86EMUL_UNHANDLEABLE;
1580 goto out;
1581 }
1582
1583 bytes -= tocopy;
1584 data += tocopy;
1585 addr += tocopy;
1586 }
1587out:
1588 up_read(&current->mm->mmap_sem);
1589 return r;
1590}
1591EXPORT_SYMBOL_GPL(emulator_read_std);
1592
1593static int emulator_read_emulated(unsigned long addr,
1594 void *val,
1595 unsigned int bytes,
1596 struct kvm_vcpu *vcpu)
1597{
1598 struct kvm_io_device *mmio_dev;
1599 gpa_t gpa;
1600
1601 if (vcpu->mmio_read_completed) {
1602 memcpy(val, vcpu->mmio_data, bytes);
1603 vcpu->mmio_read_completed = 0;
1604 return X86EMUL_CONTINUE;
1605 }
1606
1607 down_read(&current->mm->mmap_sem);
1608 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1609 up_read(&current->mm->mmap_sem);
1610
1611 /* For APIC access vmexit */
1612 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1613 goto mmio;
1614
1615 if (emulator_read_std(addr, val, bytes, vcpu)
1616 == X86EMUL_CONTINUE)
1617 return X86EMUL_CONTINUE;
1618 if (gpa == UNMAPPED_GVA)
1619 return X86EMUL_PROPAGATE_FAULT;
1620
1621mmio:
1622 /*
1623 * Is this MMIO handled locally?
1624 */
1625 mutex_lock(&vcpu->kvm->lock);
1626 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1627 if (mmio_dev) {
1628 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1629 mutex_unlock(&vcpu->kvm->lock);
1630 return X86EMUL_CONTINUE;
1631 }
1632 mutex_unlock(&vcpu->kvm->lock);
1633
1634 vcpu->mmio_needed = 1;
1635 vcpu->mmio_phys_addr = gpa;
1636 vcpu->mmio_size = bytes;
1637 vcpu->mmio_is_write = 0;
1638
1639 return X86EMUL_UNHANDLEABLE;
1640}
1641
1642static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1643 const void *val, int bytes)
1644{
1645 int ret;
1646
1647 down_read(&current->mm->mmap_sem);
1648 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1649 if (ret < 0) {
1650 up_read(&current->mm->mmap_sem);
1651 return 0;
1652 }
1653 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1654 up_read(&current->mm->mmap_sem);
1655 return 1;
1656}
1657
1658static int emulator_write_emulated_onepage(unsigned long addr,
1659 const void *val,
1660 unsigned int bytes,
1661 struct kvm_vcpu *vcpu)
1662{
1663 struct kvm_io_device *mmio_dev;
1664 gpa_t gpa;
1665
1666 down_read(&current->mm->mmap_sem);
1667 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1668 up_read(&current->mm->mmap_sem);
1669
1670 if (gpa == UNMAPPED_GVA) {
1671 kvm_inject_page_fault(vcpu, addr, 2);
1672 return X86EMUL_PROPAGATE_FAULT;
1673 }
1674
1675 /* For APIC access vmexit */
1676 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1677 goto mmio;
1678
1679 if (emulator_write_phys(vcpu, gpa, val, bytes))
1680 return X86EMUL_CONTINUE;
1681
1682mmio:
1683 /*
1684 * Is this MMIO handled locally?
1685 */
1686 mutex_lock(&vcpu->kvm->lock);
1687 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1688 if (mmio_dev) {
1689 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1690 mutex_unlock(&vcpu->kvm->lock);
1691 return X86EMUL_CONTINUE;
1692 }
1693 mutex_unlock(&vcpu->kvm->lock);
1694
1695 vcpu->mmio_needed = 1;
1696 vcpu->mmio_phys_addr = gpa;
1697 vcpu->mmio_size = bytes;
1698 vcpu->mmio_is_write = 1;
1699 memcpy(vcpu->mmio_data, val, bytes);
1700
1701 return X86EMUL_CONTINUE;
1702}
1703
1704int emulator_write_emulated(unsigned long addr,
1705 const void *val,
1706 unsigned int bytes,
1707 struct kvm_vcpu *vcpu)
1708{
1709 /* Crossing a page boundary? */
1710 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1711 int rc, now;
1712
1713 now = -addr & ~PAGE_MASK;
1714 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1715 if (rc != X86EMUL_CONTINUE)
1716 return rc;
1717 addr += now;
1718 val += now;
1719 bytes -= now;
1720 }
1721 return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1722}
1723EXPORT_SYMBOL_GPL(emulator_write_emulated);
1724
1725static int emulator_cmpxchg_emulated(unsigned long addr,
1726 const void *old,
1727 const void *new,
1728 unsigned int bytes,
1729 struct kvm_vcpu *vcpu)
1730{
1731 static int reported;
1732
1733 if (!reported) {
1734 reported = 1;
1735 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1736 }
1737#ifndef CONFIG_X86_64
1738 /* guest cmpxchg8b has to be emulated atomically */
1739 if (bytes == 8) {
1740 gpa_t gpa;
1741 struct page *page;
1742 char *kaddr;
1743 u64 val;
1744
1745 down_read(&current->mm->mmap_sem);
1746 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1747
1748 if (gpa == UNMAPPED_GVA ||
1749 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1750 goto emul_write;
1751
1752 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1753 goto emul_write;
1754
1755 val = *(u64 *)new;
1756 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1757 kaddr = kmap_atomic(page, KM_USER0);
1758 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
1759 kunmap_atomic(kaddr, KM_USER0);
1760 kvm_release_page_dirty(page);
1761 emul_write:
1762 up_read(&current->mm->mmap_sem);
1763 }
1764#endif
1765
1766 return emulator_write_emulated(addr, new, bytes, vcpu);
1767}
1768
1769static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1770{
1771 return kvm_x86_ops->get_segment_base(vcpu, seg);
1772}
1773
1774int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1775{
1776 return X86EMUL_CONTINUE;
1777}
1778
1779int emulate_clts(struct kvm_vcpu *vcpu)
1780{
1781 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1782 return X86EMUL_CONTINUE;
1783}
1784
1785int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1786{
1787 struct kvm_vcpu *vcpu = ctxt->vcpu;
1788
1789 switch (dr) {
1790 case 0 ... 3:
1791 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1792 return X86EMUL_CONTINUE;
1793 default:
1794 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1795 return X86EMUL_UNHANDLEABLE;
1796 }
1797}
1798
1799int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1800{
1801 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1802 int exception;
1803
1804 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1805 if (exception) {
1806 /* FIXME: better handling */
1807 return X86EMUL_UNHANDLEABLE;
1808 }
1809 return X86EMUL_CONTINUE;
1810}
1811
1812void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1813{
1814 static int reported;
1815 u8 opcodes[4];
1816 unsigned long rip = vcpu->arch.rip;
1817 unsigned long rip_linear;
1818
1819 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1820
1821 if (reported)
1822 return;
1823
1824 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1825
1826 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1827 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1828 reported = 1;
1829}
1830EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1831
1832struct x86_emulate_ops emulate_ops = {
1833 .read_std = emulator_read_std,
1834 .read_emulated = emulator_read_emulated,
1835 .write_emulated = emulator_write_emulated,
1836 .cmpxchg_emulated = emulator_cmpxchg_emulated,
1837};
1838
1839int emulate_instruction(struct kvm_vcpu *vcpu,
1840 struct kvm_run *run,
1841 unsigned long cr2,
1842 u16 error_code,
1843 int emulation_type)
1844{
1845 int r;
1846 struct decode_cache *c;
1847
1848 vcpu->arch.mmio_fault_cr2 = cr2;
1849 kvm_x86_ops->cache_regs(vcpu);
1850
1851 vcpu->mmio_is_write = 0;
1852 vcpu->arch.pio.string = 0;
1853
1854 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
1855 int cs_db, cs_l;
1856 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1857
1858 vcpu->arch.emulate_ctxt.vcpu = vcpu;
1859 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1860 vcpu->arch.emulate_ctxt.mode =
1861 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
1862 ? X86EMUL_MODE_REAL : cs_l
1863 ? X86EMUL_MODE_PROT64 : cs_db
1864 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1865
1866 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1867 vcpu->arch.emulate_ctxt.cs_base = 0;
1868 vcpu->arch.emulate_ctxt.ds_base = 0;
1869 vcpu->arch.emulate_ctxt.es_base = 0;
1870 vcpu->arch.emulate_ctxt.ss_base = 0;
1871 } else {
1872 vcpu->arch.emulate_ctxt.cs_base =
1873 get_segment_base(vcpu, VCPU_SREG_CS);
1874 vcpu->arch.emulate_ctxt.ds_base =
1875 get_segment_base(vcpu, VCPU_SREG_DS);
1876 vcpu->arch.emulate_ctxt.es_base =
1877 get_segment_base(vcpu, VCPU_SREG_ES);
1878 vcpu->arch.emulate_ctxt.ss_base =
1879 get_segment_base(vcpu, VCPU_SREG_SS);
1880 }
1881
1882 vcpu->arch.emulate_ctxt.gs_base =
1883 get_segment_base(vcpu, VCPU_SREG_GS);
1884 vcpu->arch.emulate_ctxt.fs_base =
1885 get_segment_base(vcpu, VCPU_SREG_FS);
1886
1887 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1888
1889 /* Reject instructions other than VMCALL/VMMCALL when
1890 * trying to emulate an invalid opcode */
1891 c = &vcpu->arch.emulate_ctxt.decode;
1892 if ((emulation_type & EMULTYPE_TRAP_UD) &&
1893 (!(c->twobyte && c->b == 0x01 &&
1894 (c->modrm_reg == 0 || c->modrm_reg == 3) &&
1895 c->modrm_mod == 3 && c->modrm_rm == 1)))
1896 return EMULATE_FAIL;
1897
1898 ++vcpu->stat.insn_emulation;
1899 if (r) {
1900 ++vcpu->stat.insn_emulation_fail;
1901 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1902 return EMULATE_DONE;
1903 return EMULATE_FAIL;
1904 }
1905 }
1906
1907 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1908
1909 if (vcpu->arch.pio.string)
1910 return EMULATE_DO_MMIO;
1911
1912 if ((r || vcpu->mmio_is_write) && run) {
1913 run->exit_reason = KVM_EXIT_MMIO;
1914 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1915 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1916 run->mmio.len = vcpu->mmio_size;
1917 run->mmio.is_write = vcpu->mmio_is_write;
1918 }
1919
1920 if (r) {
1921 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1922 return EMULATE_DONE;
1923 if (!vcpu->mmio_needed) {
1924 kvm_report_emulation_failure(vcpu, "mmio");
1925 return EMULATE_FAIL;
1926 }
1927 return EMULATE_DO_MMIO;
1928 }
1929
1930 kvm_x86_ops->decache_regs(vcpu);
1931 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
1932
1933 if (vcpu->mmio_is_write) {
1934 vcpu->mmio_needed = 0;
1935 return EMULATE_DO_MMIO;
1936 }
1937
1938 return EMULATE_DONE;
1939}
1940EXPORT_SYMBOL_GPL(emulate_instruction);
1941
1942static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
1943{
1944 int i;
1945
1946 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
1947 if (vcpu->arch.pio.guest_pages[i]) {
1948 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
1949 vcpu->arch.pio.guest_pages[i] = NULL;
1950 }
1951}
1952
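/*
 * Copy string PIO data between the per-vcpu pio_data page and the pinned
 * guest pages: pio_data -> guest memory for INS, guest memory -> pio_data
 * for OUTS.
 */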
1953static int pio_copy_data(struct kvm_vcpu *vcpu)
1954{
1955 void *p = vcpu->arch.pio_data;
1956 void *q;
1957 unsigned bytes;
1958 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
1959
1960 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1961 PAGE_KERNEL);
1962 if (!q) {
1963 free_pio_guest_pages(vcpu);
1964 return -ENOMEM;
1965 }
1966 q += vcpu->arch.pio.guest_page_offset;
1967 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
1968 if (vcpu->arch.pio.in)
1969 memcpy(q, p, bytes);
1970 else
1971 memcpy(p, q, bytes);
1972 q -= vcpu->arch.pio.guest_page_offset;
1973 vunmap(q);
1974 free_pio_guest_pages(vcpu);
1975 return 0;
1976}
1977
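/*
 * Finish a PIO operation once its data is available: store RAX for a
 * simple IN, or copy the string data and advance RDI (INS) / RSI (OUTS)
 * and the REP count in RCX.
 */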
1978int complete_pio(struct kvm_vcpu *vcpu)
1979{
1980 struct kvm_pio_request *io = &vcpu->arch.pio;
1981 long delta;
1982 int r;
1983
1984 kvm_x86_ops->cache_regs(vcpu);
1985
1986 if (!io->string) {
1987 if (io->in)
1988 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
1989 io->size);
1990 } else {
1991 if (io->in) {
1992 r = pio_copy_data(vcpu);
1993 if (r) {
1994 kvm_x86_ops->cache_regs(vcpu);
1995 return r;
1996 }
1997 }
1998
1999 delta = 1;
2000 if (io->rep) {
2001 delta *= io->cur_count;
2002 /*
2003 * The size of the register should really depend on
2004 * current address size.
2005 */
2006 vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
2007 }
2008 if (io->down)
2009 delta = -delta;
2010 delta *= io->size;
2011 if (io->in)
2012 vcpu->arch.regs[VCPU_REGS_RDI] += delta;
2013 else
2014 vcpu->arch.regs[VCPU_REGS_RSI] += delta;
2015 }
2016
2017 kvm_x86_ops->decache_regs(vcpu);
2018
2019 io->count -= io->cur_count;
2020 io->cur_count = 0;
2021
2022 return 0;
2023}
2024
2025static void kernel_pio(struct kvm_io_device *pio_dev,
2026 struct kvm_vcpu *vcpu,
2027 void *pd)
2028{
2029 /* TODO: String I/O for in kernel device */
2030
2031 mutex_lock(&vcpu->kvm->lock);
2032 if (vcpu->arch.pio.in)
2033 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2034 vcpu->arch.pio.size,
2035 pd);
2036 else
2037 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2038 vcpu->arch.pio.size,
2039 pd);
2040 mutex_unlock(&vcpu->kvm->lock);
2041}
2042
2043static void pio_string_write(struct kvm_io_device *pio_dev,
2044 struct kvm_vcpu *vcpu)
2045{
2046 struct kvm_pio_request *io = &vcpu->arch.pio;
2047 void *pd = vcpu->arch.pio_data;
2048 int i;
2049
2050 mutex_lock(&vcpu->kvm->lock);
2051 for (i = 0; i < io->cur_count; i++) {
2052 kvm_iodevice_write(pio_dev, io->port,
2053 io->size,
2054 pd);
2055 pd += io->size;
2056 }
2057 mutex_unlock(&vcpu->kvm->lock);
2058}
2059
2060static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2061 gpa_t addr)
2062{
2063 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
2064}
2065
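/*
 * Emulate a single IN/OUT: try an in-kernel device first (returning 1 if
 * it handled the access), otherwise return 0 so the caller exits to
 * userspace with the KVM_EXIT_IO information already filled in.
 */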
2066int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2067 int size, unsigned port)
2068{
2069 struct kvm_io_device *pio_dev;
2070
2071 vcpu->run->exit_reason = KVM_EXIT_IO;
2072 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2073 vcpu->run->io.size = vcpu->arch.pio.size = size;
2074 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2075 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2076 vcpu->run->io.port = vcpu->arch.pio.port = port;
2077 vcpu->arch.pio.in = in;
2078 vcpu->arch.pio.string = 0;
2079 vcpu->arch.pio.down = 0;
2080 vcpu->arch.pio.guest_page_offset = 0;
2081 vcpu->arch.pio.rep = 0;
2082
2083 kvm_x86_ops->cache_regs(vcpu);
2084 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2085 kvm_x86_ops->decache_regs(vcpu);
2086
2087 kvm_x86_ops->skip_emulated_instruction(vcpu);
2088
2089 pio_dev = vcpu_find_pio_dev(vcpu, port);
2090 if (pio_dev) {
2091 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2092 complete_pio(vcpu);
2093 return 1;
2094 }
2095 return 0;
2096}
2097EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2098
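/*
 * Emulate INS/OUTS: pin the guest page(s) backing the string (two pages
 * when one element straddles a page boundary), handle at most one page
 * worth of the REP count per exit, and either complete an OUT against an
 * in-kernel device or return 0 to hand the request to userspace.
 */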
2099int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2100 int size, unsigned long count, int down,
2101 gva_t address, int rep, unsigned port)
2102{
2103 unsigned now, in_page;
2104 int i, ret = 0;
2105 int nr_pages = 1;
2106 struct page *page;
2107 struct kvm_io_device *pio_dev;
2108
2109 vcpu->run->exit_reason = KVM_EXIT_IO;
2110 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2111 vcpu->run->io.size = vcpu->arch.pio.size = size;
2112 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2113 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2114 vcpu->run->io.port = vcpu->arch.pio.port = port;
2115 vcpu->arch.pio.in = in;
2116 vcpu->arch.pio.string = 1;
2117 vcpu->arch.pio.down = down;
2118 vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2119 vcpu->arch.pio.rep = rep;
2120
2121 if (!count) {
2122 kvm_x86_ops->skip_emulated_instruction(vcpu);
2123 return 1;
2124 }
2125
2126 if (!down)
2127 in_page = PAGE_SIZE - offset_in_page(address);
2128 else
2129 in_page = offset_in_page(address) + size;
2130 now = min(count, (unsigned long)in_page / size);
2131 if (!now) {
2132 /*
2133 * String I/O straddles page boundary. Pin two guest pages
2134 * so that we satisfy atomicity constraints. Do just one
2135 * transaction to avoid complexity.
2136 */
2137 nr_pages = 2;
2138 now = 1;
2139 }
2140 if (down) {
2141 /*
2142 * String I/O in reverse. Yuck. Kill the guest, fix later.
2143 */
2144 pr_unimpl(vcpu, "guest string pio down\n");
2145 kvm_inject_gp(vcpu, 0);
2146 return 1;
2147 }
2148 vcpu->run->io.count = now;
2149 vcpu->arch.pio.cur_count = now;
2150
2151 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2152 kvm_x86_ops->skip_emulated_instruction(vcpu);
2153
2154 for (i = 0; i < nr_pages; ++i) {
2155 down_read(&current->mm->mmap_sem);
2156 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2157 vcpu->arch.pio.guest_pages[i] = page;
2158 up_read(&current->mm->mmap_sem);
2159 if (!page) {
2160 kvm_inject_gp(vcpu, 0);
2161 free_pio_guest_pages(vcpu);
2162 return 1;
2163 }
2164 }
2165
2166 pio_dev = vcpu_find_pio_dev(vcpu, port);
2167 if (!vcpu->arch.pio.in) {
2168 /* string PIO write */
2169 ret = pio_copy_data(vcpu);
2170 if (ret >= 0 && pio_dev) {
2171 pio_string_write(pio_dev, vcpu);
2172 complete_pio(vcpu);
2173 if (vcpu->arch.pio.count == 0)
2174 ret = 1;
2175 }
2176 } else if (pio_dev)
2177 pr_unimpl(vcpu, "no string pio read support yet, "
2178 "port %x size %d count %ld\n",
2179 port, size, count);
2180
2181 return ret;
2182}
2183EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2184
2185int kvm_arch_init(void *opaque)
2186{
2187 int r;
2188 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2189
2190 if (kvm_x86_ops) {
2191 printk(KERN_ERR "kvm: already loaded the other module\n");
2192 r = -EEXIST;
2193 goto out;
2194 }
2195
2196 if (!ops->cpu_has_kvm_support()) {
2197 printk(KERN_ERR "kvm: no hardware support\n");
2198 r = -EOPNOTSUPP;
2199 goto out;
2200 }
2201 if (ops->disabled_by_bios()) {
2202 printk(KERN_ERR "kvm: disabled by bios\n");
2203 r = -EOPNOTSUPP;
2204 goto out;
2205 }
2206
2207 r = kvm_mmu_module_init();
2208 if (r)
2209 goto out;
2210
2211 kvm_init_msr_list();
2212
2213 kvm_x86_ops = ops;
2214 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2215 return 0;
2216
2217out:
2218 return r;
2219}
2220
2221void kvm_arch_exit(void)
2222{
2223 kvm_x86_ops = NULL;
2224 kvm_mmu_module_exit();
2225}
2226
2227int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2228{
2229 ++vcpu->stat.halt_exits;
2230 if (irqchip_in_kernel(vcpu->kvm)) {
2231 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
2232 kvm_vcpu_block(vcpu);
2233 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
2234 return -EINTR;
2235 return 1;
2236 } else {
2237 vcpu->run->exit_reason = KVM_EXIT_HLT;
2238 return 0;
2239 }
2240}
2241EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2242
2243int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2244{
2245 unsigned long nr, a0, a1, a2, a3, ret;
2246
2247 kvm_x86_ops->cache_regs(vcpu);
2248
2249 nr = vcpu->arch.regs[VCPU_REGS_RAX];
2250 a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2251 a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2252 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2253 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2254
2255 if (!is_long_mode(vcpu)) {
2256 nr &= 0xFFFFFFFF;
2257 a0 &= 0xFFFFFFFF;
2258 a1 &= 0xFFFFFFFF;
2259 a2 &= 0xFFFFFFFF;
2260 a3 &= 0xFFFFFFFF;
2261 }
2262
2263 switch (nr) {
2264 case KVM_HC_VAPIC_POLL_IRQ:
2265 ret = 0;
2266 break;
2267 default:
2268 ret = -KVM_ENOSYS;
2269 break;
2270 }
2271 vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2272 kvm_x86_ops->decache_regs(vcpu);
2273 return 0;
2274}
2275EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2276
2277int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2278{
2279 char instruction[3];
2280 int ret = 0;
2281
2282
2283 /*
2284 * Blow out the MMU so that no other VCPU keeps an active mapping,
2285 * ensuring that the updated hypercall appears atomically across all
2286 * VCPUs.
2287 */
2288 kvm_mmu_zap_all(vcpu->kvm);
2289
2290 kvm_x86_ops->cache_regs(vcpu);
2291 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2292 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2293 != X86EMUL_CONTINUE)
2294 ret = -EFAULT;
2295
2296 return ret;
2297}
2298
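/* Replace the low 32 bits of a 64-bit control register value, preserving the upper half. */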
2299static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2300{
2301 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2302}
2303
2304void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2305{
2306 struct descriptor_table dt = { limit, base };
2307
2308 kvm_x86_ops->set_gdt(vcpu, &dt);
2309}
2310
2311void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2312{
2313 struct descriptor_table dt = { limit, base };
2314
2315 kvm_x86_ops->set_idt(vcpu, &dt);
2316}
2317
2318void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2319 unsigned long *rflags)
2320{
2321 lmsw(vcpu, msw);
2322 *rflags = kvm_x86_ops->get_rflags(vcpu);
2323}
2324
2325unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2326{
2327 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2328 switch (cr) {
2329 case 0:
2330 return vcpu->arch.cr0;
2331 case 2:
2332 return vcpu->arch.cr2;
2333 case 3:
2334 return vcpu->arch.cr3;
2335 case 4:
2336 return vcpu->arch.cr4;
2337 case 8:
2338 return get_cr8(vcpu);
2339 default:
2340 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2341 return 0;
2342 }
2343}
2344
2345void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2346 unsigned long *rflags)
2347{
2348 switch (cr) {
2349 case 0:
2350 set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2351 *rflags = kvm_x86_ops->get_rflags(vcpu);
2352 break;
2353 case 2:
2354 vcpu->arch.cr2 = val;
2355 break;
2356 case 3:
2357 set_cr3(vcpu, val);
2358 break;
2359 case 4:
2360 set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2361 break;
2362 case 8:
2363 set_cr8(vcpu, val & 0xfUL);
2364 break;
2365 default:
2366 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2367 }
2368}
2369
2370static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2371{
2372 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2373 int j, nent = vcpu->arch.cpuid_nent;
2374
2375 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2376 /* when no next entry is found, the current entry[i] is reselected */
2377 for (j = i + 1; ; j = (j + 1) % nent) {
2378 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2379 if (ej->function == e->function) {
2380 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2381 return j;
2382 }
2383 }
2384 return 0; /* silence gcc, even though control never reaches here */
2385}
2386
2387/* find an entry with matching function, matching index (if needed), and that
2388 * should be read next (if it's stateful) */
2389static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2390 u32 function, u32 index)
2391{
2392 if (e->function != function)
2393 return 0;
2394 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2395 return 0;
2396 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2397 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2398 return 0;
2399 return 1;
2400}
2401
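/*
 * Emulate CPUID from the guest's cpuid_entries: use an exact match for the
 * leaf in RAX (and index in RCX) when one exists, otherwise fall back to
 * the highest-numbered entry in the same basic/extended range.
 */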
2402void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2403{
2404 int i;
2405 u32 function, index;
2406 struct kvm_cpuid_entry2 *e, *best;
2407
2408 kvm_x86_ops->cache_regs(vcpu);
2409 function = vcpu->arch.regs[VCPU_REGS_RAX];
2410 index = vcpu->arch.regs[VCPU_REGS_RCX];
2411 vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2412 vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2413 vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2414 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2415 best = NULL;
2416 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2417 e = &vcpu->arch.cpuid_entries[i];
2418 if (is_matching_cpuid_entry(e, function, index)) {
2419 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2420 move_to_next_stateful_cpuid_entry(vcpu, i);
2421 best = e;
2422 break;
2423 }
2424 /*
2425 * Same range (basic or extended)? If so, keep the highest such entry as a fallback.
2426 */
2427 if (((e->function ^ function) & 0x80000000) == 0)
2428 if (!best || e->function > best->function)
2429 best = e;
2430 }
2431 if (best) {
2432 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2433 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2434 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2435 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2436 }
2437 kvm_x86_ops->decache_regs(vcpu);
2438 kvm_x86_ops->skip_emulated_instruction(vcpu);
2439}
2440EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2441
2442/*
2443 * Check if userspace requested an interrupt window, and that the
2444 * interrupt window is open.
2445 *
2446 * No need to exit to userspace if we already have an interrupt queued.
2447 */
2448static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2449 struct kvm_run *kvm_run)
2450{
2451 return (!vcpu->arch.irq_summary &&
2452 kvm_run->request_interrupt_window &&
2453 vcpu->arch.interrupt_window_open &&
2454 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2455}
2456
2457static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2458 struct kvm_run *kvm_run)
2459{
2460 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2461 kvm_run->cr8 = get_cr8(vcpu);
2462 kvm_run->apic_base = kvm_get_apic_base(vcpu);
2463 if (irqchip_in_kernel(vcpu->kvm))
2464 kvm_run->ready_for_interrupt_injection = 1;
2465 else
2466 kvm_run->ready_for_interrupt_injection =
2467 (vcpu->arch.interrupt_window_open &&
2468 vcpu->arch.irq_summary == 0);
2469}
2470
2471static void vapic_enter(struct kvm_vcpu *vcpu)
2472{
2473 struct kvm_lapic *apic = vcpu->arch.apic;
2474 struct page *page;
2475
2476 if (!apic || !apic->vapic_addr)
2477 return;
2478
2479 down_read(&current->mm->mmap_sem);
2480 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2481 vcpu->arch.apic->vapic_page = page;
2482 up_read(&current->mm->mmap_sem);
2483}
2484
2485static void vapic_exit(struct kvm_vcpu *vcpu)
2486{
2487 struct kvm_lapic *apic = vcpu->arch.apic;
2488
2489 if (!apic || !apic->vapic_addr)
2490 return;
2491
2492 kvm_release_page_dirty(apic->vapic_page);
2493 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2494}
2495
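/*
 * The vcpu's inner run loop: inject pending exceptions/interrupts, enter
 * the guest via kvm_x86_ops->run(), and handle the resulting exit, looping
 * until a signal, an exit that needs userspace, or an error forces a return.
 */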
2496static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2497{
2498 int r;
2499
2500 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
2501 pr_debug("vcpu %d received sipi with vector # %x\n",
2502 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2503 kvm_lapic_reset(vcpu);
2504 r = kvm_x86_ops->vcpu_reset(vcpu);
2505 if (r)
2506 return r;
2507 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
2508 }
2509
2510 vapic_enter(vcpu);
2511
2512preempted:
2513 if (vcpu->guest_debug.enabled)
2514 kvm_x86_ops->guest_debug_pre(vcpu);
2515
2516again:
2517 r = kvm_mmu_reload(vcpu);
2518 if (unlikely(r))
2519 goto out;
2520
2521 if (vcpu->requests) {
2522 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2523 __kvm_migrate_apic_timer(vcpu);
2524 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2525 &vcpu->requests)) {
2526 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2527 r = 0;
2528 goto out;
2529 }
2530 }
2531
2532 kvm_inject_pending_timer_irqs(vcpu);
2533
2534 preempt_disable();
2535
2536 kvm_x86_ops->prepare_guest_switch(vcpu);
2537 kvm_load_guest_fpu(vcpu);
2538
2539 local_irq_disable();
2540
2541 if (need_resched()) {
2542 local_irq_enable();
2543 preempt_enable();
2544 r = 1;
2545 goto out;
2546 }
2547
2548 if (signal_pending(current)) {
2549 local_irq_enable();
2550 preempt_enable();
2551 r = -EINTR;
2552 kvm_run->exit_reason = KVM_EXIT_INTR;
2553 ++vcpu->stat.signal_exits;
2554 goto out;
2555 }
2556
2557 if (vcpu->arch.exception.pending)
2558 __queue_exception(vcpu);
2559 else if (irqchip_in_kernel(vcpu->kvm))
2560 kvm_x86_ops->inject_pending_irq(vcpu);
2561 else
2562 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2563
2564 kvm_lapic_sync_to_vapic(vcpu);
2565
2566 vcpu->guest_mode = 1;
2567 kvm_guest_enter();
2568
2569 if (vcpu->requests)
2570 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2571 kvm_x86_ops->tlb_flush(vcpu);
2572
2573 kvm_x86_ops->run(vcpu, kvm_run);
2574
2575 vcpu->guest_mode = 0;
2576 local_irq_enable();
2577
2578 ++vcpu->stat.exits;
2579
2580 /*
2581 * We must have an instruction between local_irq_enable() and
2582 * kvm_guest_exit(), so the timer interrupt isn't delayed by
2583 * the interrupt shadow. The stat.exits increment will do nicely.
2584 * But we need to prevent reordering, hence this barrier():
2585 */
2586 barrier();
2587
2588 kvm_guest_exit();
2589
2590 preempt_enable();
2591
2592 /*
2593 * Profile KVM exit RIPs:
2594 */
2595 if (unlikely(prof_on == KVM_PROFILING)) {
2596 kvm_x86_ops->cache_regs(vcpu);
2597 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2598 }
2599
2600 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2601 vcpu->arch.exception.pending = false;
2602
2603 kvm_lapic_sync_from_vapic(vcpu);
2604
2605 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2606
2607 if (r > 0) {
2608 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2609 r = -EINTR;
2610 kvm_run->exit_reason = KVM_EXIT_INTR;
2611 ++vcpu->stat.request_irq_exits;
2612 goto out;
2613 }
2614 if (!need_resched())
2615 goto again;
2616 }
2617
2618out:
2619 if (r > 0) {
2620 kvm_resched(vcpu);
2621 goto preempted;
2622 }
2623
2624 post_kvm_run_save(vcpu, kvm_run);
2625
2626 vapic_exit(vcpu);
2627
2628 return r;
2629}
2630
2631int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2632{
2633 int r;
2634 sigset_t sigsaved;
2635
2636 vcpu_load(vcpu);
2637
2638 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2639 kvm_vcpu_block(vcpu);
2640 vcpu_put(vcpu);
2641 return -EAGAIN;
2642 }
2643
2644 if (vcpu->sigset_active)
2645 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2646
2647 /* re-sync apic's tpr */
2648 if (!irqchip_in_kernel(vcpu->kvm))
2649 set_cr8(vcpu, kvm_run->cr8);
2650
2651 if (vcpu->arch.pio.cur_count) {
2652 r = complete_pio(vcpu);
2653 if (r)
2654 goto out;
2655 }
2656#ifdef CONFIG_HAS_IOMEM
2657 if (vcpu->mmio_needed) {
2658 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2659 vcpu->mmio_read_completed = 1;
2660 vcpu->mmio_needed = 0;
2661 r = emulate_instruction(vcpu, kvm_run,
2662 vcpu->arch.mmio_fault_cr2, 0,
2663 EMULTYPE_NO_DECODE);
2664 if (r == EMULATE_DO_MMIO) {
2665 /*
2666 * Read-modify-write. Back to userspace.
2667 */
2668 r = 0;
2669 goto out;
2670 }
2671 }
2672#endif
2673 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2674 kvm_x86_ops->cache_regs(vcpu);
2675 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2676 kvm_x86_ops->decache_regs(vcpu);
2677 }
2678
2679 r = __vcpu_run(vcpu, kvm_run);
2680
2681out:
2682 if (vcpu->sigset_active)
2683 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2684
2685 vcpu_put(vcpu);
2686 return r;
2687}
2688
2689int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2690{
2691 vcpu_load(vcpu);
2692
2693 kvm_x86_ops->cache_regs(vcpu);
2694
2695 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2696 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2697 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2698 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2699 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2700 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2701 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2702 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2703#ifdef CONFIG_X86_64
2704 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2705 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2706 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2707 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2708 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2709 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2710 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2711 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2712#endif
2713
2714 regs->rip = vcpu->arch.rip;
2715 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2716
2717 /*
2718 * Don't leak debug flags in case they were set for guest debugging
2719 */
2720 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2721 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2722
2723 vcpu_put(vcpu);
2724
2725 return 0;
2726}
2727
2728int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2729{
2730 vcpu_load(vcpu);
2731
2732 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2733 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2734 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2735 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2736 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2737 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2738 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2739 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2740#ifdef CONFIG_X86_64
2741 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
2742 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
2743 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
2744 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
2745 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
2746 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
2747 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
2748 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
2749#endif
2750
2751 vcpu->arch.rip = regs->rip;
2752 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2753
2754 kvm_x86_ops->decache_regs(vcpu);
2755
2756 vcpu_put(vcpu);
2757
2758 return 0;
2759}
2760
2761static void get_segment(struct kvm_vcpu *vcpu,
2762 struct kvm_segment *var, int seg)
2763{
2764 return kvm_x86_ops->get_segment(vcpu, var, seg);
2765}
2766
2767void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2768{
2769 struct kvm_segment cs;
2770
2771 get_segment(vcpu, &cs, VCPU_SREG_CS);
2772 *db = cs.db;
2773 *l = cs.l;
2774}
2775EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2776
2777int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2778 struct kvm_sregs *sregs)
2779{
2780 struct descriptor_table dt;
2781 int pending_vec;
2782
2783 vcpu_load(vcpu);
2784
2785 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2786 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2787 get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2788 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2789 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2790 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2791
2792 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2793 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2794
2795 kvm_x86_ops->get_idt(vcpu, &dt);
2796 sregs->idt.limit = dt.limit;
2797 sregs->idt.base = dt.base;
2798 kvm_x86_ops->get_gdt(vcpu, &dt);
2799 sregs->gdt.limit = dt.limit;
2800 sregs->gdt.base = dt.base;
2801
2802 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2803 sregs->cr0 = vcpu->arch.cr0;
2804 sregs->cr2 = vcpu->arch.cr2;
2805 sregs->cr3 = vcpu->arch.cr3;
2806 sregs->cr4 = vcpu->arch.cr4;
2807 sregs->cr8 = get_cr8(vcpu);
2808 sregs->efer = vcpu->arch.shadow_efer;
2809 sregs->apic_base = kvm_get_apic_base(vcpu);
2810
2811 if (irqchip_in_kernel(vcpu->kvm)) {
2812 memset(sregs->interrupt_bitmap, 0,
2813 sizeof sregs->interrupt_bitmap);
2814 pending_vec = kvm_x86_ops->get_irq(vcpu);
2815 if (pending_vec >= 0)
2816 set_bit(pending_vec,
2817 (unsigned long *)sregs->interrupt_bitmap);
2818 } else
2819 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
2820 sizeof sregs->interrupt_bitmap);
2821
2822 vcpu_put(vcpu);
2823
2824 return 0;
2825}
2826
2827static void set_segment(struct kvm_vcpu *vcpu,
2828 struct kvm_segment *var, int seg)
2829{
2830 return kvm_x86_ops->set_segment(vcpu, var, seg);
2831}
2832
2833int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2834 struct kvm_sregs *sregs)
2835{
2836 int mmu_reset_needed = 0;
2837 int i, pending_vec, max_bits;
2838 struct descriptor_table dt;
2839
2840 vcpu_load(vcpu);
2841
2842 dt.limit = sregs->idt.limit;
2843 dt.base = sregs->idt.base;
2844 kvm_x86_ops->set_idt(vcpu, &dt);
2845 dt.limit = sregs->gdt.limit;
2846 dt.base = sregs->gdt.base;
2847 kvm_x86_ops->set_gdt(vcpu, &dt);
2848
2849 vcpu->arch.cr2 = sregs->cr2;
2850 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
2851 vcpu->arch.cr3 = sregs->cr3;
2852
2853 set_cr8(vcpu, sregs->cr8);
2854
2855 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
2856#ifdef CONFIG_X86_64
2857 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2858#endif
2859 kvm_set_apic_base(vcpu, sregs->apic_base);
2860
2861 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2862
2863 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
2864 vcpu->arch.cr0 = sregs->cr0;
2865 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2866
2867 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
2868 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2869 if (!is_long_mode(vcpu) && is_pae(vcpu))
2870 load_pdptrs(vcpu, vcpu->arch.cr3);
2871
2872 if (mmu_reset_needed)
2873 kvm_mmu_reset_context(vcpu);
2874
2875 if (!irqchip_in_kernel(vcpu->kvm)) {
2876 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
2877 sizeof vcpu->arch.irq_pending);
2878 vcpu->arch.irq_summary = 0;
2879 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
2880 if (vcpu->arch.irq_pending[i])
2881 __set_bit(i, &vcpu->arch.irq_summary);
2882 } else {
2883 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2884 pending_vec = find_first_bit(
2885 (const unsigned long *)sregs->interrupt_bitmap,
2886 max_bits);
2887 /* Only a pending external irq is handled here */
2888 if (pending_vec < max_bits) {
2889 kvm_x86_ops->set_irq(vcpu, pending_vec);
2890 pr_debug("Set back pending irq %d\n",
2891 pending_vec);
2892 }
2893 }
2894
2895 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2896 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2897 set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2898 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2899 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2900 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2901
2902 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2903 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2904
2905 vcpu_put(vcpu);
2906
2907 return 0;
2908}
2909
2910int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2911 struct kvm_debug_guest *dbg)
2912{
2913 int r;
2914
2915 vcpu_load(vcpu);
2916
2917 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2918
2919 vcpu_put(vcpu);
2920
2921 return r;
2922}
2923
2924/*
2925 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2926 * we have asm/x86/processor.h
2927 */
2928struct fxsave {
2929 u16 cwd;
2930 u16 swd;
2931 u16 twd;
2932 u16 fop;
2933 u64 rip;
2934 u64 rdp;
2935 u32 mxcsr;
2936 u32 mxcsr_mask;
2937 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
2938#ifdef CONFIG_X86_64
2939 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
2940#else
2941 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
2942#endif
2943};
2944
2945/*
2946 * Translate a guest virtual address to a guest physical address.
2947 */
2948int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2949 struct kvm_translation *tr)
2950{
2951 unsigned long vaddr = tr->linear_address;
2952 gpa_t gpa;
2953
2954 vcpu_load(vcpu);
2955 down_read(&current->mm->mmap_sem);
2956 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
2957 up_read(&current->mm->mmap_sem);
2958 tr->physical_address = gpa;
2959 tr->valid = gpa != UNMAPPED_GVA;
2960 tr->writeable = 1;
2961 tr->usermode = 0;
2962 vcpu_put(vcpu);
2963
2964 return 0;
2965}
2966
2967int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2968{
2969 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2970
2971 vcpu_load(vcpu);
2972
2973 memcpy(fpu->fpr, fxsave->st_space, 128);
2974 fpu->fcw = fxsave->cwd;
2975 fpu->fsw = fxsave->swd;
2976 fpu->ftwx = fxsave->twd;
2977 fpu->last_opcode = fxsave->fop;
2978 fpu->last_ip = fxsave->rip;
2979 fpu->last_dp = fxsave->rdp;
2980 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2981
2982 vcpu_put(vcpu);
2983
2984 return 0;
2985}
2986
2987int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2988{
2989 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2990
2991 vcpu_load(vcpu);
2992
2993 memcpy(fxsave->st_space, fpu->fpr, 128);
2994 fxsave->cwd = fpu->fcw;
2995 fxsave->swd = fpu->fsw;
2996 fxsave->twd = fpu->ftwx;
2997 fxsave->fop = fpu->last_opcode;
2998 fxsave->rip = fpu->last_ip;
2999 fxsave->rdp = fpu->last_dp;
3000 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
3001
3002 vcpu_put(vcpu);
3003
3004 return 0;
3005}
3006
3007void fx_init(struct kvm_vcpu *vcpu)
3008{
3009 unsigned after_mxcsr_mask;
3010
3011 /* Initialize guest FPU by resetting ours and saving into guest's */
3012 preempt_disable();
3013 fx_save(&vcpu->arch.host_fx_image);
3014 fpu_init();
3015 fx_save(&vcpu->arch.guest_fx_image);
3016 fx_restore(&vcpu->arch.host_fx_image);
3017 preempt_enable();
3018
3019 vcpu->arch.cr0 |= X86_CR0_ET;
3020 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
3021 vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3022 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
3023 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
3024}
3025EXPORT_SYMBOL_GPL(fx_init);
3026
3027void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3028{
3029 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
3030 return;
3031
3032 vcpu->guest_fpu_loaded = 1;
3033 fx_save(&vcpu->arch.host_fx_image);
3034 fx_restore(&vcpu->arch.guest_fx_image);
3035}
3036EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3037
3038void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3039{
3040 if (!vcpu->guest_fpu_loaded)
3041 return;
3042
3043 vcpu->guest_fpu_loaded = 0;
3044 fx_save(&vcpu->arch.guest_fx_image);
3045 fx_restore(&vcpu->arch.host_fx_image);
3046 ++vcpu->stat.fpu_reload;
3047}
3048EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
3049
3050void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
3051{
3052 kvm_x86_ops->vcpu_free(vcpu);
3053}
3054
3055struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
3056 unsigned int id)
3057{
3058 return kvm_x86_ops->vcpu_create(kvm, id);
3059}
3060
3061int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3062{
3063 int r;
3064
3065 /* We do fxsave: this must be aligned. */
3066 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3067
3068 vcpu_load(vcpu);
3069 r = kvm_arch_vcpu_reset(vcpu);
3070 if (r == 0)
3071 r = kvm_mmu_setup(vcpu);
3072 vcpu_put(vcpu);
3073 if (r < 0)
3074 goto free_vcpu;
3075
3076 return 0;
3077free_vcpu:
3078 kvm_x86_ops->vcpu_free(vcpu);
3079 return r;
3080}
3081
3082void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3083{
3084 vcpu_load(vcpu);
3085 kvm_mmu_unload(vcpu);
3086 vcpu_put(vcpu);
3087
3088 kvm_x86_ops->vcpu_free(vcpu);
3089}
3090
3091int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3092{
3093 return kvm_x86_ops->vcpu_reset(vcpu);
3094}
3095
3096void kvm_arch_hardware_enable(void *garbage)
3097{
3098 kvm_x86_ops->hardware_enable(garbage);
3099}
3100
3101void kvm_arch_hardware_disable(void *garbage)
3102{
3103 kvm_x86_ops->hardware_disable(garbage);
3104}
3105
3106int kvm_arch_hardware_setup(void)
3107{
3108 return kvm_x86_ops->hardware_setup();
3109}
3110
3111void kvm_arch_hardware_unsetup(void)
3112{
3113 kvm_x86_ops->hardware_unsetup();
3114}
3115
3116void kvm_arch_check_processor_compat(void *rtn)
3117{
3118 kvm_x86_ops->check_processor_compatibility(rtn);
3119}
3120
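/*
 * Architecture-specific per-vcpu setup: allocate the PIO data page, create
 * the MMU context and, when the irqchip is in the kernel, the local APIC.
 */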
3121int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
3122{
3123 struct page *page;
3124 struct kvm *kvm;
3125 int r;
3126
3127 BUG_ON(vcpu->kvm == NULL);
3128 kvm = vcpu->kvm;
3129
3130 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3131 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3132 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
3133 else
3134 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
3135
3136 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3137 if (!page) {
3138 r = -ENOMEM;
3139 goto fail;
3140 }
3141 vcpu->arch.pio_data = page_address(page);
3142
3143 r = kvm_mmu_create(vcpu);
3144 if (r < 0)
3145 goto fail_free_pio_data;
3146
3147 if (irqchip_in_kernel(kvm)) {
3148 r = kvm_create_lapic(vcpu);
3149 if (r < 0)
3150 goto fail_mmu_destroy;
3151 }
3152
3153 return 0;
3154
3155fail_mmu_destroy:
3156 kvm_mmu_destroy(vcpu);
3157fail_free_pio_data:
3158 free_page((unsigned long)vcpu->arch.pio_data);
3159fail:
3160 return r;
3161}
3162
3163void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3164{
3165 kvm_free_lapic(vcpu);
3166 kvm_mmu_destroy(vcpu);
3167 free_page((unsigned long)vcpu->arch.pio_data);
3168}
3169
3170struct kvm *kvm_arch_create_vm(void)
3171{
3172 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3173
3174 if (!kvm)
3175 return ERR_PTR(-ENOMEM);
3176
3177 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
3178
3179 return kvm;
3180}
3181
3182static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
3183{
3184 vcpu_load(vcpu);
3185 kvm_mmu_unload(vcpu);
3186 vcpu_put(vcpu);
3187}
3188
3189static void kvm_free_vcpus(struct kvm *kvm)
3190{
3191 unsigned int i;
3192
3193 /*
3194 * Unpin any mmu pages first.
3195 */
3196 for (i = 0; i < KVM_MAX_VCPUS; ++i)
3197 if (kvm->vcpus[i])
3198 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3199 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3200 if (kvm->vcpus[i]) {
3201 kvm_arch_vcpu_free(kvm->vcpus[i]);
3202 kvm->vcpus[i] = NULL;
3203 }
3204 }
3205
3206}
3207
3208void kvm_arch_destroy_vm(struct kvm *kvm)
3209{
3210 kfree(kvm->arch.vpic);
3211 kfree(kvm->arch.vioapic);
3212 kvm_free_vcpus(kvm);
3213 kvm_free_physmem(kvm);
3214 kfree(kvm);
3215}
3216
3217int kvm_arch_set_memory_region(struct kvm *kvm,
3218 struct kvm_userspace_memory_region *mem,
3219 struct kvm_memory_slot old,
3220 int user_alloc)
3221{
3222 int npages = mem->memory_size >> PAGE_SHIFT;
3223 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3224
3225 /* To keep backward compatibility with older userspace,
3226 * x86 needs to handle the !user_alloc case.
3227 */
3228 if (!user_alloc) {
3229 if (npages && !old.rmap) {
3230 memslot->userspace_addr = do_mmap(NULL, 0,
3231 npages * PAGE_SIZE,
3232 PROT_READ | PROT_WRITE,
3233 MAP_SHARED | MAP_ANONYMOUS,
3234 0);
3235
3236 if (IS_ERR((void *)memslot->userspace_addr))
3237 return PTR_ERR((void *)memslot->userspace_addr);
3238 } else {
3239 if (!old.user_alloc && old.rmap) {
3240 int ret;
3241
3242 ret = do_munmap(current->mm, old.userspace_addr,
3243 old.npages * PAGE_SIZE);
3244 if (ret < 0)
3245 printk(KERN_WARNING
3246 "kvm_vm_ioctl_set_memory_region: "
3247 "failed to munmap memory\n");
3248 }
3249 }
3250 }
3251
3252 if (!kvm->arch.n_requested_mmu_pages) {
3253 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3254 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3255 }
3256
3257 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3258 kvm_flush_remote_tlbs(kvm);
3259
3260 return 0;
3261}
3262
3263int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3264{
3265 return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
3266 || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
3267}
3268
3269static void vcpu_kick_intr(void *info)
3270{
3271#ifdef DEBUG
3272 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
3273 printk(KERN_DEBUG "vcpu_kick_intr %p\n", vcpu);
3274#endif
3275}
3276
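/* Wake the vcpu if it is blocked in HLT, or IPI its physical cpu to force a guest exit. */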
3277void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3278{
3279 int ipi_pcpu = vcpu->cpu;
3280
3281 if (waitqueue_active(&vcpu->wq)) {
3282 wake_up_interruptible(&vcpu->wq);
3283 ++vcpu->stat.halt_wakeup;
3284 }
3285 if (vcpu->guest_mode)
3286 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
3287}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
new file mode 100644
index 00000000000..79586003397
--- /dev/null
+++ b/arch/x86/kvm/x86_emulate.c
@@ -0,0 +1,1912 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else
28#include <linux/kvm_host.h>
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include <linux/module.h>
32#include <asm/kvm_x86_emulate.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */
68
69static u16 opcode_table[256] = {
70 /* 0x00 - 0x07 */
71 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
72 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
73 0, 0, 0, 0,
74 /* 0x08 - 0x0F */
75 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
76 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
77 0, 0, 0, 0,
78 /* 0x10 - 0x17 */
79 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
80 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
81 0, 0, 0, 0,
82 /* 0x18 - 0x1F */
83 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
84 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
85 0, 0, 0, 0,
86 /* 0x20 - 0x27 */
87 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
88 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
89 SrcImmByte, SrcImm, 0, 0,
90 /* 0x28 - 0x2F */
91 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
92 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
93 0, 0, 0, 0,
94 /* 0x30 - 0x37 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 0, 0, 0, 0,
98 /* 0x38 - 0x3F */
99 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
100 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
101 0, 0, 0, 0,
102 /* 0x40 - 0x47 */
103 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
104 /* 0x48 - 0x4F */
105 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
106 /* 0x50 - 0x57 */
107 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
108 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
109 /* 0x58 - 0x5F */
110 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
111 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
112 /* 0x60 - 0x67 */
113 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
114 0, 0, 0, 0,
115 /* 0x68 - 0x6F */
116 0, 0, ImplicitOps | Mov | Stack, 0,
117 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
118 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
119 /* 0x70 - 0x77 */
120 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
121 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
122 /* 0x78 - 0x7F */
123 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
124 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
125 /* 0x80 - 0x87 */
126 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
127 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
128 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
129 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
130 /* 0x88 - 0x8F */
131 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
132 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
133 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
134 /* 0x90 - 0x9F */
135 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
137 /* 0xA0 - 0xA7 */
138 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
139 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
140 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
141 ByteOp | ImplicitOps | String, ImplicitOps | String,
142 /* 0xA8 - 0xAF */
143 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
144 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
145 ByteOp | ImplicitOps | String, ImplicitOps | String,
146 /* 0xB0 - 0xBF */
147 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
148 /* 0xC0 - 0xC7 */
149 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
150 0, ImplicitOps | Stack, 0, 0,
151 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
152 /* 0xC8 - 0xCF */
153 0, 0, 0, 0, 0, 0, 0, 0,
154 /* 0xD0 - 0xD7 */
155 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
156 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
157 0, 0, 0, 0,
158 /* 0xD8 - 0xDF */
159 0, 0, 0, 0, 0, 0, 0, 0,
160 /* 0xE0 - 0xE7 */
161 0, 0, 0, 0, 0, 0, 0, 0,
162 /* 0xE8 - 0xEF */
163 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
164 0, 0, 0, 0,
165 /* 0xF0 - 0xF7 */
166 0, 0, 0, 0,
167 ImplicitOps, ImplicitOps,
168 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
169 /* 0xF8 - 0xFF */
170 ImplicitOps, 0, ImplicitOps, ImplicitOps,
171 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
172};
173
174static u16 twobyte_table[256] = {
175 /* 0x00 - 0x0F */
176 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
177 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
178 /* 0x10 - 0x1F */
179 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
180 /* 0x20 - 0x2F */
181 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
182 0, 0, 0, 0, 0, 0, 0, 0,
183 /* 0x30 - 0x3F */
184 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 /* 0x40 - 0x47 */
186 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
189 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
190 /* 0x48 - 0x4F */
191 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
192 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
193 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
194 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
195 /* 0x50 - 0x5F */
196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 /* 0x60 - 0x6F */
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 /* 0x70 - 0x7F */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0x80 - 0x8F */
202 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
203 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
204 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
205 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
206 /* 0x90 - 0x9F */
207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
208 /* 0xA0 - 0xA7 */
209 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
210 /* 0xA8 - 0xAF */
211 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
212 /* 0xB0 - 0xB7 */
213 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
214 DstMem | SrcReg | ModRM | BitOp,
215 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
216 DstReg | SrcMem16 | ModRM | Mov,
217 /* 0xB8 - 0xBF */
218 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
219 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
220 DstReg | SrcMem16 | ModRM | Mov,
221 /* 0xC0 - 0xCF */
222 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 /* 0xD0 - 0xDF */
225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226 /* 0xE0 - 0xEF */
227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228 /* 0xF0 - 0xFF */
229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
230};
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
257 * These EFLAGS bits are restored from saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
262/* Before executing instruction: restore necessary bits in EFLAGS. */
263#define _PRE_EFLAGS(_sav, _msk, _tmp) \
264 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
265 "movl %"_sav",%"_LO32 _tmp"; " \
266 "push %"_tmp"; " \
267 "push %"_tmp"; " \
268 "movl %"_msk",%"_LO32 _tmp"; " \
269 "andl %"_LO32 _tmp",("_STK"); " \
270 "pushf; " \
271 "notl %"_LO32 _tmp"; " \
272 "andl %"_LO32 _tmp",("_STK"); " \
273 "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
274 "pop %"_tmp"; " \
275 "orl %"_LO32 _tmp",("_STK"); " \
276 "popf; " \
277 "pop %"_sav"; "
278
279/* After executing instruction: write-back necessary bits in EFLAGS. */
280#define _POST_EFLAGS(_sav, _msk, _tmp) \
281 /* _sav |= EFLAGS & _msk; */ \
282 "pushf; " \
283 "pop %"_tmp"; " \
284 "andl %"_msk",%"_LO32 _tmp"; " \
285 "orl %"_LO32 _tmp",%"_sav"; "
286
287/* Raw emulation: instruction has two explicit operands. */
288#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
289 do { \
290 unsigned long _tmp; \
291 \
292 switch ((_dst).bytes) { \
293 case 2: \
294 __asm__ __volatile__ ( \
295 _PRE_EFLAGS("0", "4", "2") \
296 _op"w %"_wx"3,%1; " \
297 _POST_EFLAGS("0", "4", "2") \
298 : "=m" (_eflags), "=m" ((_dst).val), \
299 "=&r" (_tmp) \
300 : _wy ((_src).val), "i" (EFLAGS_MASK)); \
301 break; \
302 case 4: \
303 __asm__ __volatile__ ( \
304 _PRE_EFLAGS("0", "4", "2") \
305 _op"l %"_lx"3,%1; " \
306 _POST_EFLAGS("0", "4", "2") \
307 : "=m" (_eflags), "=m" ((_dst).val), \
308 "=&r" (_tmp) \
309 : _ly ((_src).val), "i" (EFLAGS_MASK)); \
310 break; \
311 case 8: \
312 __emulate_2op_8byte(_op, _src, _dst, \
313 _eflags, _qx, _qy); \
314 break; \
315 } \
316 } while (0)
317
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \
320 unsigned long _tmp; \
321 switch ((_dst).bytes) { \
322 case 1: \
323 __asm__ __volatile__ ( \
324 _PRE_EFLAGS("0", "4", "2") \
325 _op"b %"_bx"3,%1; " \
326 _POST_EFLAGS("0", "4", "2") \
327 : "=m" (_eflags), "=m" ((_dst).val), \
328 "=&r" (_tmp) \
329 : _by ((_src).val), "i" (EFLAGS_MASK)); \
330 break; \
331 default: \
332 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
333 _wx, _wy, _lx, _ly, _qx, _qy); \
334 break; \
335 } \
336 } while (0)
337
338/* Source operand is byte-sized and may be restricted to just %cl. */
339#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
340 __emulate_2op(_op, _src, _dst, _eflags, \
341 "b", "c", "b", "c", "b", "c", "b", "c")
342
343/* Source operand is byte, word, long or quad sized. */
344#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
345 __emulate_2op(_op, _src, _dst, _eflags, \
346 "b", "q", "w", "r", _LO32, "r", "", "r")
347
348/* Source operand is word, long or quad sized. */
349#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
350 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
351 "w", "r", _LO32, "r", "", "r")
352
353/* Instruction has only one explicit operand (no source operand). */
354#define emulate_1op(_op, _dst, _eflags) \
355 do { \
356 unsigned long _tmp; \
357 \
358 switch ((_dst).bytes) { \
359 case 1: \
360 __asm__ __volatile__ ( \
361 _PRE_EFLAGS("0", "3", "2") \
362 _op"b %1; " \
363 _POST_EFLAGS("0", "3", "2") \
364 : "=m" (_eflags), "=m" ((_dst).val), \
365 "=&r" (_tmp) \
366 : "i" (EFLAGS_MASK)); \
367 break; \
368 case 2: \
369 __asm__ __volatile__ ( \
370 _PRE_EFLAGS("0", "3", "2") \
371 _op"w %1; " \
372 _POST_EFLAGS("0", "3", "2") \
373 : "=m" (_eflags), "=m" ((_dst).val), \
374 "=&r" (_tmp) \
375 : "i" (EFLAGS_MASK)); \
376 break; \
377 case 4: \
378 __asm__ __volatile__ ( \
379 _PRE_EFLAGS("0", "3", "2") \
380 _op"l %1; " \
381 _POST_EFLAGS("0", "3", "2") \
382 : "=m" (_eflags), "=m" ((_dst).val), \
383 "=&r" (_tmp) \
384 : "i" (EFLAGS_MASK)); \
385 break; \
386 case 8: \
387 __emulate_1op_8byte(_op, _dst, _eflags); \
388 break; \
389 } \
390 } while (0)
391
392/* Emulate an instruction with quadword operands (x86/64 only). */
393#if defined(CONFIG_X86_64)
394#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
395 do { \
396 __asm__ __volatile__ ( \
397 _PRE_EFLAGS("0", "4", "2") \
398 _op"q %"_qx"3,%1; " \
399 _POST_EFLAGS("0", "4", "2") \
400 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
401 : _qy ((_src).val), "i" (EFLAGS_MASK)); \
402 } while (0)
403
404#define __emulate_1op_8byte(_op, _dst, _eflags) \
405 do { \
406 __asm__ __volatile__ ( \
407 _PRE_EFLAGS("0", "3", "2") \
408 _op"q %1; " \
409 _POST_EFLAGS("0", "3", "2") \
410 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
411 : "i" (EFLAGS_MASK)); \
412 } while (0)
413
414#elif defined(__i386__)
415#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
416#define __emulate_1op_8byte(_op, _dst, _eflags)
417#endif /* __i386__ */
418
419/* Fetch next part of the instruction being emulated. */
420#define insn_fetch(_type, _size, _eip) \
421({ unsigned long _x; \
422 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
423 if (rc != 0) \
424 goto done; \
425 (_eip) += (_size); \
426 (_type)_x; \
427})
428
429/* Access/update address held in a register, based on addressing mode. */
430#define address_mask(reg) \
431 ((c->ad_bytes == sizeof(unsigned long)) ? \
432 (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
433#define register_address(base, reg) \
434 ((base) + address_mask(reg))
435#define register_address_increment(reg, inc) \
436 do { \
437 /* signed type ensures sign extension to long */ \
438 int _inc = (inc); \
439 if (c->ad_bytes == sizeof(unsigned long)) \
440 (reg) += _inc; \
441 else \
442 (reg) = ((reg) & \
443 ~((1UL << (c->ad_bytes << 3)) - 1)) | \
444 (((reg) + _inc) & \
445 ((1UL << (c->ad_bytes << 3)) - 1)); \
446 } while (0)
447
448#define JMP_REL(rel) \
449 do { \
450 register_address_increment(c->eip, rel); \
451 } while (0)
452
453static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
454 struct x86_emulate_ops *ops,
455 unsigned long linear, u8 *dest)
456{
457 struct fetch_cache *fc = &ctxt->decode.fetch;
458 int rc;
459 int size;
460
461 if (linear < fc->start || linear >= fc->end) {
462 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
463 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
464 if (rc)
465 return rc;
466 fc->start = linear;
467 fc->end = linear + size;
468 }
469 *dest = fc->data[linear - fc->start];
470 return 0;
471}
472
473static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
474 struct x86_emulate_ops *ops,
475 unsigned long eip, void *dest, unsigned size)
476{
477 int rc = 0;
478
479 eip += ctxt->cs_base;
480 while (size--) {
481 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
482 if (rc)
483 return rc;
484 }
485 return 0;
486}
487
488/*
489 * Given the 'reg' portion of a ModRM byte, and a register block, return a
490 * pointer into the block that addresses the relevant register.
491 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
492 */
493static void *decode_register(u8 modrm_reg, unsigned long *regs,
494 int highbyte_regs)
495{
496 void *p;
497
498 p = &regs[modrm_reg];
499 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
500 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
501 return p;
502}
503
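/*
 * Fetch a descriptor-table operand from guest memory: a 16-bit limit
 * followed by the base address (with a 16-bit operand size the base is
 * still 24 bits wide, hence the op_bytes 2 -> 3 adjustment).
 */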
504static int read_descriptor(struct x86_emulate_ctxt *ctxt,
505 struct x86_emulate_ops *ops,
506 void *ptr,
507 u16 *size, unsigned long *address, int op_bytes)
508{
509 int rc;
510
511 if (op_bytes == 2)
512 op_bytes = 3;
513 *address = 0;
514 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
515 ctxt->vcpu);
516 if (rc)
517 return rc;
518 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
519 ctxt->vcpu);
520 return rc;
521}
522
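/* Evaluate a condition code (as encoded in Jcc/SETcc) against the given EFLAGS value. */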
523static int test_cc(unsigned int condition, unsigned int flags)
524{
525 int rc = 0;
526
527 switch ((condition & 15) >> 1) {
528 case 0: /* o */
529 rc |= (flags & EFLG_OF);
530 break;
531 case 1: /* b/c/nae */
532 rc |= (flags & EFLG_CF);
533 break;
534 case 2: /* z/e */
535 rc |= (flags & EFLG_ZF);
536 break;
537 case 3: /* be/na */
538 rc |= (flags & (EFLG_CF|EFLG_ZF));
539 break;
540 case 4: /* s */
541 rc |= (flags & EFLG_SF);
542 break;
543 case 5: /* p/pe */
544 rc |= (flags & EFLG_PF);
545 break;
546 case 7: /* le/ng */
547 rc |= (flags & EFLG_ZF);
548 /* fall through */
549 case 6: /* l/nge */
550 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
551 break;
552 }
553
554 /* Odd condition identifiers (lsb == 1) have inverted sense. */
555 return (!!rc ^ (condition & 1));
556}
557
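/*
 * Decode a register operand: taken from the ModRM reg field, or from the
 * low opcode bits (plus REX.B) for instructions without ModRM.
 */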
558static void decode_register_operand(struct operand *op,
559 struct decode_cache *c,
560 int inhibit_bytereg)
561{
562 unsigned reg = c->modrm_reg;
563 int highbyte_regs = c->rex_prefix == 0;
564
565 if (!(c->d & ModRM))
566 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
567 op->type = OP_REG;
568 if ((c->d & ByteOp) && !inhibit_bytereg) {
569 op->ptr = decode_register(reg, c->regs, highbyte_regs);
570 op->val = *(u8 *)op->ptr;
571 op->bytes = 1;
572 } else {
573 op->ptr = decode_register(reg, c->regs, 0);
574 op->bytes = c->op_bytes;
575 switch (op->bytes) {
576 case 2:
577 op->val = *(u16 *)op->ptr;
578 break;
579 case 4:
580 op->val = *(u32 *)op->ptr;
581 break;
582 case 8:
583 op->val = *(u64 *) op->ptr;
584 break;
585 }
586 }
587 op->orig_val = op->val;
588}
589
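/*
 * Decode the ModRM byte plus any SIB byte and displacement, accumulating
 * the effective address (or, for mod == 3, the register value) into the
 * decode cache.
 */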
590static int decode_modrm(struct x86_emulate_ctxt *ctxt,
591 struct x86_emulate_ops *ops)
592{
593 struct decode_cache *c = &ctxt->decode;
594 u8 sib;
595 int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
596 int rc = 0;
597
598 if (c->rex_prefix) {
599 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
600 index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
601 c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
602 }
603
604 c->modrm = insn_fetch(u8, 1, c->eip);
605 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
606 c->modrm_reg |= (c->modrm & 0x38) >> 3;
607 c->modrm_rm |= (c->modrm & 0x07);
608 c->modrm_ea = 0;
609 c->use_modrm_ea = 1;
610
611 if (c->modrm_mod == 3) {
612 c->modrm_val = *(unsigned long *)
613 decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
614 return rc;
615 }
616
617 if (c->ad_bytes == 2) {
618 unsigned bx = c->regs[VCPU_REGS_RBX];
619 unsigned bp = c->regs[VCPU_REGS_RBP];
620 unsigned si = c->regs[VCPU_REGS_RSI];
621 unsigned di = c->regs[VCPU_REGS_RDI];
622
623 /* 16-bit ModR/M decode. */
624 switch (c->modrm_mod) {
625 case 0:
626 if (c->modrm_rm == 6)
627 c->modrm_ea += insn_fetch(u16, 2, c->eip);
628 break;
629 case 1:
630 c->modrm_ea += insn_fetch(s8, 1, c->eip);
631 break;
632 case 2:
633 c->modrm_ea += insn_fetch(u16, 2, c->eip);
634 break;
635 }
636 switch (c->modrm_rm) {
637 case 0:
638 c->modrm_ea += bx + si;
639 break;
640 case 1:
641 c->modrm_ea += bx + di;
642 break;
643 case 2:
644 c->modrm_ea += bp + si;
645 break;
646 case 3:
647 c->modrm_ea += bp + di;
648 break;
649 case 4:
650 c->modrm_ea += si;
651 break;
652 case 5:
653 c->modrm_ea += di;
654 break;
655 case 6:
656 if (c->modrm_mod != 0)
657 c->modrm_ea += bp;
658 break;
659 case 7:
660 c->modrm_ea += bx;
661 break;
662 }
663 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
664 (c->modrm_rm == 6 && c->modrm_mod != 0))
665 if (!c->override_base)
666 c->override_base = &ctxt->ss_base;
667 c->modrm_ea = (u16)c->modrm_ea;
668 } else {
669 /* 32/64-bit ModR/M decode. */
670 switch (c->modrm_rm) {
671 case 4:
672 case 12:
673 sib = insn_fetch(u8, 1, c->eip);
674 index_reg |= (sib >> 3) & 7;
675 base_reg |= sib & 7;
676 scale = sib >> 6;
677
678 switch (base_reg) {
679 case 5:
680 if (c->modrm_mod != 0)
681 c->modrm_ea += c->regs[base_reg];
682 else
683 c->modrm_ea +=
684 insn_fetch(s32, 4, c->eip);
685 break;
686 default:
687 c->modrm_ea += c->regs[base_reg];
688 }
689 switch (index_reg) {
690 case 4:
691 break;
692 default:
693 c->modrm_ea += c->regs[index_reg] << scale;
694 }
695 break;
696 case 5:
697 if (c->modrm_mod != 0)
698 c->modrm_ea += c->regs[c->modrm_rm];
699 else if (ctxt->mode == X86EMUL_MODE_PROT64)
700 rip_relative = 1;
701 break;
702 default:
703 c->modrm_ea += c->regs[c->modrm_rm];
704 break;
705 }
706 switch (c->modrm_mod) {
707 case 0:
708 if (c->modrm_rm == 5)
709 c->modrm_ea += insn_fetch(s32, 4, c->eip);
710 break;
711 case 1:
712 c->modrm_ea += insn_fetch(s8, 1, c->eip);
713 break;
714 case 2:
715 c->modrm_ea += insn_fetch(s32, 4, c->eip);
716 break;
717 }
718 }
719 if (rip_relative) {
720 c->modrm_ea += c->eip;
721 switch (c->d & SrcMask) {
722 case SrcImmByte:
723 c->modrm_ea += 1;
724 break;
725 case SrcImm:
726 if (c->d & ByteOp)
727 c->modrm_ea += 1;
728 else
729 if (c->op_bytes == 8)
730 c->modrm_ea += 4;
731 else
732 c->modrm_ea += c->op_bytes;
733 }
734 }
735done:
736 return rc;
737}
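/*
 * Summary of decode_modrm(): mod == 3 names a register directly, otherwise
 * an effective address is accumulated in modrm_ea.  16-bit addressing uses
 * the fixed BX/BP/SI/DI combinations; 32/64-bit addressing takes a SIB byte
 * when (modrm_rm & 7) == 4 and, in long mode with mod == 0 and rm == 5,
 * a RIP-relative displacement.  RIP-relative addresses are based on the
 * next instruction, which is why the size of any trailing immediate
 * (SrcImm/SrcImmByte) is added to modrm_ea before returning.
 */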
738
739static int decode_abs(struct x86_emulate_ctxt *ctxt,
740 struct x86_emulate_ops *ops)
741{
742 struct decode_cache *c = &ctxt->decode;
743 int rc = 0;
744
745 switch (c->ad_bytes) {
746 case 2:
747 c->modrm_ea = insn_fetch(u16, 2, c->eip);
748 break;
749 case 4:
750 c->modrm_ea = insn_fetch(u32, 4, c->eip);
751 break;
752 case 8:
753 c->modrm_ea = insn_fetch(u64, 8, c->eip);
754 break;
755 }
756done:
757 return rc;
758}
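/*
 * decode_abs() handles the MemAbs operand form: a bare absolute address of
 * ad_bytes length with no ModRM byte, as used by the moffset forms of MOV
 * (opcodes 0xa0-0xa3).
 */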
759
760int
761x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
762{
763 struct decode_cache *c = &ctxt->decode;
764 int rc = 0;
765 int mode = ctxt->mode;
766 int def_op_bytes, def_ad_bytes;
767
768 /* Shadow copy of register state. Committed on successful emulation. */
769
770 memset(c, 0, sizeof(struct decode_cache));
771 c->eip = ctxt->vcpu->arch.rip;
772 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
773
774 switch (mode) {
775 case X86EMUL_MODE_REAL:
776 case X86EMUL_MODE_PROT16:
777 def_op_bytes = def_ad_bytes = 2;
778 break;
779 case X86EMUL_MODE_PROT32:
780 def_op_bytes = def_ad_bytes = 4;
781 break;
782#ifdef CONFIG_X86_64
783 case X86EMUL_MODE_PROT64:
784 def_op_bytes = 4;
785 def_ad_bytes = 8;
786 break;
787#endif
788 default:
789 return -1;
790 }
791
792 c->op_bytes = def_op_bytes;
793 c->ad_bytes = def_ad_bytes;
794
795 /* Legacy prefixes. */
796 for (;;) {
797 switch (c->b = insn_fetch(u8, 1, c->eip)) {
798 case 0x66: /* operand-size override */
799 /* switch between 2/4 bytes */
800 c->op_bytes = def_op_bytes ^ 6;
801 break;
802 case 0x67: /* address-size override */
803 if (mode == X86EMUL_MODE_PROT64)
804 /* switch between 4/8 bytes */
805 c->ad_bytes = def_ad_bytes ^ 12;
806 else
807 /* switch between 2/4 bytes */
808 c->ad_bytes = def_ad_bytes ^ 6;
809 break;
810 case 0x2e: /* CS override */
811 c->override_base = &ctxt->cs_base;
812 break;
813 case 0x3e: /* DS override */
814 c->override_base = &ctxt->ds_base;
815 break;
816 case 0x26: /* ES override */
817 c->override_base = &ctxt->es_base;
818 break;
819 case 0x64: /* FS override */
820 c->override_base = &ctxt->fs_base;
821 break;
822 case 0x65: /* GS override */
823 c->override_base = &ctxt->gs_base;
824 break;
825 case 0x36: /* SS override */
826 c->override_base = &ctxt->ss_base;
827 break;
828 case 0x40 ... 0x4f: /* REX */
829 if (mode != X86EMUL_MODE_PROT64)
830 goto done_prefixes;
831 c->rex_prefix = c->b;
832 continue;
833 case 0xf0: /* LOCK */
834 c->lock_prefix = 1;
835 break;
836 case 0xf2: /* REPNE/REPNZ */
837 c->rep_prefix = REPNE_PREFIX;
838 break;
839 case 0xf3: /* REP/REPE/REPZ */
840 c->rep_prefix = REPE_PREFIX;
841 break;
842 default:
843 goto done_prefixes;
844 }
845
846 /* Any legacy prefix after a REX prefix nullifies its effect. */
847
848 c->rex_prefix = 0;
849 }
850
851done_prefixes:
852
853 /* REX prefix. */
854 if (c->rex_prefix)
855 if (c->rex_prefix & 8)
856 c->op_bytes = 8; /* REX.W */
857
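/*
 * At this point op_bytes/ad_bytes reflect the mode defaults combined with
 * any 0x66/0x67 overrides and REX.W: in 64-bit mode the defaults are 4/8,
 * a 0x66 prefix selects 2-byte operands, and REX.W forces 8-byte operands
 * regardless of 0x66.
 */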
858 /* Opcode byte(s). */
859 c->d = opcode_table[c->b];
860 if (c->d == 0) {
861 /* Two-byte opcode? */
862 if (c->b == 0x0f) {
863 c->twobyte = 1;
864 c->b = insn_fetch(u8, 1, c->eip);
865 c->d = twobyte_table[c->b];
866 }
867
868 /* Unrecognised? */
869 if (c->d == 0) {
870 DPRINTF("Cannot emulate %02x\n", c->b);
871 return -1;
872 }
873 }
874
875 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
876 c->op_bytes = 8;
877
878 /* ModRM and SIB bytes. */
879 if (c->d & ModRM)
880 rc = decode_modrm(ctxt, ops);
881 else if (c->d & MemAbs)
882 rc = decode_abs(ctxt, ops);
883 if (rc)
884 goto done;
885
886 if (!c->override_base)
887 c->override_base = &ctxt->ds_base;
888 if (mode == X86EMUL_MODE_PROT64 &&
889 c->override_base != &ctxt->fs_base &&
890 c->override_base != &ctxt->gs_base)
891 c->override_base = NULL;
892
893 if (c->override_base)
894 c->modrm_ea += *c->override_base;
895
896 if (c->ad_bytes != 8)
897 c->modrm_ea = (u32)c->modrm_ea;
898 /*
899 * Decode and fetch the source operand: register, memory
900 * or immediate.
901 */
902 switch (c->d & SrcMask) {
903 case SrcNone:
904 break;
905 case SrcReg:
906 decode_register_operand(&c->src, c, 0);
907 break;
908 case SrcMem16:
909 c->src.bytes = 2;
910 goto srcmem_common;
911 case SrcMem32:
912 c->src.bytes = 4;
913 goto srcmem_common;
914 case SrcMem:
915 c->src.bytes = (c->d & ByteOp) ? 1 :
916 c->op_bytes;
917 /* Don't fetch the address for invlpg: it could be unmapped. */
918 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
919 break;
920 srcmem_common:
921 /*
922 * For instructions with a ModR/M byte, switch to register
923 * access if Mod = 3.
924 */
925 if ((c->d & ModRM) && c->modrm_mod == 3) {
926 c->src.type = OP_REG;
927 break;
928 }
929 c->src.type = OP_MEM;
930 break;
931 case SrcImm:
932 c->src.type = OP_IMM;
933 c->src.ptr = (unsigned long *)c->eip;
934 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
935 if (c->src.bytes == 8)
936 c->src.bytes = 4;
937 /* NB. Immediates are sign-extended as necessary. */
938 switch (c->src.bytes) {
939 case 1:
940 c->src.val = insn_fetch(s8, 1, c->eip);
941 break;
942 case 2:
943 c->src.val = insn_fetch(s16, 2, c->eip);
944 break;
945 case 4:
946 c->src.val = insn_fetch(s32, 4, c->eip);
947 break;
948 }
949 break;
950 case SrcImmByte:
951 c->src.type = OP_IMM;
952 c->src.ptr = (unsigned long *)c->eip;
953 c->src.bytes = 1;
954 c->src.val = insn_fetch(s8, 1, c->eip);
955 break;
956 }
957
958 /* Decode and fetch the destination operand: register or memory. */
959 switch (c->d & DstMask) {
960 case ImplicitOps:
961 /* Special instructions do their own operand decoding. */
962 return 0;
963 case DstReg:
964 decode_register_operand(&c->dst, c,
965 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
966 break;
967 case DstMem:
968 if ((c->d & ModRM) && c->modrm_mod == 3) {
969 c->dst.type = OP_REG;
970 break;
971 }
972 c->dst.type = OP_MEM;
973 break;
974 }
975
976done:
977 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
978}
979
980static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
981{
982 struct decode_cache *c = &ctxt->decode;
983
984 c->dst.type = OP_MEM;
985 c->dst.bytes = c->op_bytes;
986 c->dst.val = c->src.val;
987 register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
988 c->dst.ptr = (void *) register_address(ctxt->ss_base,
989 c->regs[VCPU_REGS_RSP]);
990}
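/*
 * emulate_push() does not touch memory itself: it only pre-decrements RSP
 * and points dst at the new SS:RSP location, leaving the actual store to
 * the common writeback() path at the end of x86_emulate_insn().
 */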
991
992static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
993 struct x86_emulate_ops *ops)
994{
995 struct decode_cache *c = &ctxt->decode;
996 int rc;
997
998 rc = ops->read_std(register_address(ctxt->ss_base,
999 c->regs[VCPU_REGS_RSP]),
1000 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1001 if (rc != 0)
1002 return rc;
1003
1004 register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
1005
1006 return 0;
1007}
1008
1009static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1010{
1011 struct decode_cache *c = &ctxt->decode;
1012 switch (c->modrm_reg) {
1013 case 0: /* rol */
1014 emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
1015 break;
1016 case 1: /* ror */
1017 emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
1018 break;
1019 case 2: /* rcl */
1020 emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
1021 break;
1022 case 3: /* rcr */
1023 emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
1024 break;
1025 case 4: /* sal/shl */
1026 case 6: /* sal/shl */
1027 emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
1028 break;
1029 case 5: /* shr */
1030 emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
1031 break;
1032 case 7: /* sar */
1033 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1034 break;
1035 }
1036}
1037
1038static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1039 struct x86_emulate_ops *ops)
1040{
1041 struct decode_cache *c = &ctxt->decode;
1042 int rc = 0;
1043
1044 switch (c->modrm_reg) {
1045 case 0 ... 1: /* test */
1046 /*
1047 * Special case in Grp3: test has an immediate
1048 * source operand.
1049 */
1050 c->src.type = OP_IMM;
1051 c->src.ptr = (unsigned long *)c->eip;
1052 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1053 if (c->src.bytes == 8)
1054 c->src.bytes = 4;
1055 switch (c->src.bytes) {
1056 case 1:
1057 c->src.val = insn_fetch(s8, 1, c->eip);
1058 break;
1059 case 2:
1060 c->src.val = insn_fetch(s16, 2, c->eip);
1061 break;
1062 case 4:
1063 c->src.val = insn_fetch(s32, 4, c->eip);
1064 break;
1065 }
1066 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1067 break;
1068 case 2: /* not */
1069 c->dst.val = ~c->dst.val;
1070 break;
1071 case 3: /* neg */
1072 emulate_1op("neg", c->dst, ctxt->eflags);
1073 break;
1074 default:
1075 DPRINTF("Cannot emulate %02x\n", c->b);
1076 rc = X86EMUL_UNHANDLEABLE;
1077 break;
1078 }
1079done:
1080 return rc;
1081}
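/*
 * Grp3 (opcodes 0xf6/0xf7): only TEST, NOT and NEG are handled above;
 * MUL/IMUL/DIV/IDIV (modrm_reg 4-7) fall into the default case and are
 * reported as unhandleable.
 */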
1082
1083static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1084 struct x86_emulate_ops *ops)
1085{
1086 struct decode_cache *c = &ctxt->decode;
1087 int rc;
1088
1089 switch (c->modrm_reg) {
1090 case 0: /* inc */
1091 emulate_1op("inc", c->dst, ctxt->eflags);
1092 break;
1093 case 1: /* dec */
1094 emulate_1op("dec", c->dst, ctxt->eflags);
1095 break;
1096 case 4: /* jmp abs */
1097 if (c->b == 0xff)
1098 c->eip = c->dst.val;
1099 else {
1100 DPRINTF("Cannot emulate %02x\n", c->b);
1101 return X86EMUL_UNHANDLEABLE;
1102 }
1103 break;
1104 case 6: /* push */
1105
1106 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1107
1108 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1109 c->dst.bytes = 8;
1110 rc = ops->read_std((unsigned long)c->dst.ptr,
1111 &c->dst.val, 8, ctxt->vcpu);
1112 if (rc != 0)
1113 return rc;
1114 }
1115 register_address_increment(c->regs[VCPU_REGS_RSP],
1116 -c->dst.bytes);
1117 rc = ops->write_emulated(register_address(ctxt->ss_base,
1118 c->regs[VCPU_REGS_RSP]), &c->dst.val,
1119 c->dst.bytes, ctxt->vcpu);
1120 if (rc != 0)
1121 return rc;
1122 c->dst.type = OP_NONE;
1123 break;
1124 default:
1125 DPRINTF("Cannot emulate %02x\n", c->b);
1126 return X86EMUL_UNHANDLEABLE;
1127 }
1128 return 0;
1129}
1130
1131static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1132 struct x86_emulate_ops *ops,
1133 unsigned long memop)
1134{
1135 struct decode_cache *c = &ctxt->decode;
1136 u64 old, new;
1137 int rc;
1138
1139 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1140 if (rc != 0)
1141 return rc;
1142
1143 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1144 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
1145
1146 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1147 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1148 ctxt->eflags &= ~EFLG_ZF;
1149
1150 } else {
1151 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1152 (u32) c->regs[VCPU_REGS_RBX];
1153
1154 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1155 if (rc != 0)
1156 return rc;
1157 ctxt->eflags |= EFLG_ZF;
1158 }
1159 return 0;
1160}
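/*
 * emulate_grp9() is CMPXCHG8B: compare EDX:EAX with the 64-bit memory
 * operand; on a match store ECX:EBX there and set ZF, otherwise load the
 * memory value into EDX:EAX and clear ZF.  The store goes through
 * ops->cmpxchg_emulated() rather than a plain write.
 */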
1161
1162static inline int writeback(struct x86_emulate_ctxt *ctxt,
1163 struct x86_emulate_ops *ops)
1164{
1165 int rc;
1166 struct decode_cache *c = &ctxt->decode;
1167
1168 switch (c->dst.type) {
1169 case OP_REG:
1170 /* The 4-byte case *is* correct:
1171 * in 64-bit mode we zero-extend.
1172 */
1173 switch (c->dst.bytes) {
1174 case 1:
1175 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1176 break;
1177 case 2:
1178 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1179 break;
1180 case 4:
1181 *c->dst.ptr = (u32)c->dst.val;
1182 break; /* 64b: zero-ext */
1183 case 8:
1184 *c->dst.ptr = c->dst.val;
1185 break;
1186 }
1187 break;
1188 case OP_MEM:
1189 if (c->lock_prefix)
1190 rc = ops->cmpxchg_emulated(
1191 (unsigned long)c->dst.ptr,
1192 &c->dst.orig_val,
1193 &c->dst.val,
1194 c->dst.bytes,
1195 ctxt->vcpu);
1196 else
1197 rc = ops->write_emulated(
1198 (unsigned long)c->dst.ptr,
1199 &c->dst.val,
1200 c->dst.bytes,
1201 ctxt->vcpu);
1202 if (rc != 0)
1203 return rc;
1204 break;
1205 case OP_NONE:
1206 /* no writeback */
1207 break;
1208 default:
1209 break;
1210 }
1211 return 0;
1212}
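/*
 * writeback() commits the computed destination: register destinations are
 * stored directly through dst.ptr (with the 32-bit zero-extension noted
 * above), while memory destinations go through write_emulated(), or
 * through cmpxchg_emulated() against dst.orig_val when a LOCK prefix is in
 * effect so a concurrent guest write to the same location can be detected
 * rather than silently overwritten.
 */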
1213
1214int
1215x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1216{
1217 unsigned long memop = 0;
1218 u64 msr_data;
1219 unsigned long saved_eip = 0;
1220 struct decode_cache *c = &ctxt->decode;
1221 int rc = 0;
1222
1223 /* Shadow copy of register state. Committed on successful emulation.
1224 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
1225 * modify them.
1226 */
1227
1228 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1229 saved_eip = c->eip;
1230
1231 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1232 memop = c->modrm_ea;
1233
1234 if (c->rep_prefix && (c->d & String)) {
1235 /* All REP prefixes have the same first termination condition */
1236 if (c->regs[VCPU_REGS_RCX] == 0) {
1237 ctxt->vcpu->arch.rip = c->eip;
1238 goto done;
1239 }
1240 /* The second termination condition only applies to REPE
1241 * and REPNE: when the repeat prefix is REPE/REPZ or
1242 * REPNE/REPNZ, also test the corresponding flag
1243 * condition:
1244 * - if REPE/REPZ and ZF = 0 then done
1245 * - if REPNE/REPNZ and ZF = 1 then done
1246 */
1247 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1248 (c->b == 0xae) || (c->b == 0xaf)) {
1249 if ((c->rep_prefix == REPE_PREFIX) &&
1250 ((ctxt->eflags & EFLG_ZF) == 0)) {
1251 ctxt->vcpu->arch.rip = c->eip;
1252 goto done;
1253 }
1254 if ((c->rep_prefix == REPNE_PREFIX) &&
1255 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1256 ctxt->vcpu->arch.rip = c->eip;
1257 goto done;
1258 }
1259 }
1260 c->regs[VCPU_REGS_RCX]--;
1261 c->eip = ctxt->vcpu->arch.rip;
1262 }
1263
1264 if (c->src.type == OP_MEM) {
1265 c->src.ptr = (unsigned long *)memop;
1266 c->src.val = 0;
1267 rc = ops->read_emulated((unsigned long)c->src.ptr,
1268 &c->src.val,
1269 c->src.bytes,
1270 ctxt->vcpu);
1271 if (rc != 0)
1272 goto done;
1273 c->src.orig_val = c->src.val;
1274 }
1275
1276 if ((c->d & DstMask) == ImplicitOps)
1277 goto special_insn;
1278
1279
1280 if (c->dst.type == OP_MEM) {
1281 c->dst.ptr = (unsigned long *)memop;
1282 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1283 c->dst.val = 0;
1284 if (c->d & BitOp) {
1285 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1286
1287 c->dst.ptr = (void *)c->dst.ptr +
1288 (c->src.val & mask) / 8;
1289 }
1290 if (!(c->d & Mov) &&
1291 /* optimisation - avoid slow emulated read */
1292 ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1293 &c->dst.val,
1294 c->dst.bytes, ctxt->vcpu)) != 0))
1295 goto done;
1296 }
1297 c->dst.orig_val = c->dst.val;
1298
1299special_insn:
1300
1301 if (c->twobyte)
1302 goto twobyte_insn;
1303
1304 switch (c->b) {
1305 case 0x00 ... 0x05:
1306 add: /* add */
1307 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1308 break;
1309 case 0x08 ... 0x0d:
1310 or: /* or */
1311 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1312 break;
1313 case 0x10 ... 0x15:
1314 adc: /* adc */
1315 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1316 break;
1317 case 0x18 ... 0x1d:
1318 sbb: /* sbb */
1319 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1320 break;
1321 case 0x20 ... 0x23:
1322 and: /* and */
1323 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1324 break;
1325 case 0x24: /* and al imm8 */
1326 c->dst.type = OP_REG;
1327 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1328 c->dst.val = *(u8 *)c->dst.ptr;
1329 c->dst.bytes = 1;
1330 c->dst.orig_val = c->dst.val;
1331 goto and;
1332 case 0x25: /* and ax,imm16 / and eax,imm32 */
1333 c->dst.type = OP_REG;
1334 c->dst.bytes = c->op_bytes;
1335 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1336 if (c->op_bytes == 2)
1337 c->dst.val = *(u16 *)c->dst.ptr;
1338 else
1339 c->dst.val = *(u32 *)c->dst.ptr;
1340 c->dst.orig_val = c->dst.val;
1341 goto and;
1342 case 0x28 ... 0x2d:
1343 sub: /* sub */
1344 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
1345 break;
1346 case 0x30 ... 0x35:
1347 xor: /* xor */
1348 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
1349 break;
1350 case 0x38 ... 0x3d:
1351 cmp: /* cmp */
1352 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1353 break;
1354 case 0x40 ... 0x47: /* inc r16/r32 */
1355 emulate_1op("inc", c->dst, ctxt->eflags);
1356 break;
1357 case 0x48 ... 0x4f: /* dec r16/r32 */
1358 emulate_1op("dec", c->dst, ctxt->eflags);
1359 break;
1360 case 0x50 ... 0x57: /* push reg */
1361 c->dst.type = OP_MEM;
1362 c->dst.bytes = c->op_bytes;
1363 c->dst.val = c->src.val;
1364 register_address_increment(c->regs[VCPU_REGS_RSP],
1365 -c->op_bytes);
1366 c->dst.ptr = (void *) register_address(
1367 ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
1368 break;
1369 case 0x58 ... 0x5f: /* pop reg */
1370 pop_instruction:
1371 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1372 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1373 c->op_bytes, ctxt->vcpu)) != 0)
1374 goto done;
1375
1376 register_address_increment(c->regs[VCPU_REGS_RSP],
1377 c->op_bytes);
1378 c->dst.type = OP_NONE; /* Disable writeback. */
1379 break;
1380 case 0x63: /* movsxd */
1381 if (ctxt->mode != X86EMUL_MODE_PROT64)
1382 goto cannot_emulate;
1383 c->dst.val = (s32) c->src.val;
1384 break;
1385 case 0x6a: /* push imm8 */
1386 c->src.val = 0L;
1387 c->src.val = insn_fetch(s8, 1, c->eip);
1388 emulate_push(ctxt);
1389 break;
1390 case 0x6c: /* insb */
1391 case 0x6d: /* insw/insd */
1392 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1393 1,
1394 (c->d & ByteOp) ? 1 : c->op_bytes,
1395 c->rep_prefix ?
1396 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1397 (ctxt->eflags & EFLG_DF),
1398 register_address(ctxt->es_base,
1399 c->regs[VCPU_REGS_RDI]),
1400 c->rep_prefix,
1401 c->regs[VCPU_REGS_RDX]) == 0) {
1402 c->eip = saved_eip;
1403 return -1;
1404 }
1405 return 0;
1406 case 0x6e: /* outsb */
1407 case 0x6f: /* outsw/outsd */
1408 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1409 0,
1410 (c->d & ByteOp) ? 1 : c->op_bytes,
1411 c->rep_prefix ?
1412 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1413 (ctxt->eflags & EFLG_DF),
1414 register_address(c->override_base ?
1415 *c->override_base :
1416 ctxt->ds_base,
1417 c->regs[VCPU_REGS_RSI]),
1418 c->rep_prefix,
1419 c->regs[VCPU_REGS_RDX]) == 0) {
1420 c->eip = saved_eip;
1421 return -1;
1422 }
1423 return 0;
1424 case 0x70 ... 0x7f: /* jcc (short) */ {
1425 int rel = insn_fetch(s8, 1, c->eip);
1426
1427 if (test_cc(c->b, ctxt->eflags))
1428 JMP_REL(rel);
1429 break;
1430 }
1431 case 0x80 ... 0x83: /* Grp1 */
1432 switch (c->modrm_reg) {
1433 case 0:
1434 goto add;
1435 case 1:
1436 goto or;
1437 case 2:
1438 goto adc;
1439 case 3:
1440 goto sbb;
1441 case 4:
1442 goto and;
1443 case 5:
1444 goto sub;
1445 case 6:
1446 goto xor;
1447 case 7:
1448 goto cmp;
1449 }
1450 break;
1451 case 0x84 ... 0x85:
1452 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1453 break;
1454 case 0x86 ... 0x87: /* xchg */
1455 /* Write back the register source. */
1456 switch (c->dst.bytes) {
1457 case 1:
1458 *(u8 *) c->src.ptr = (u8) c->dst.val;
1459 break;
1460 case 2:
1461 *(u16 *) c->src.ptr = (u16) c->dst.val;
1462 break;
1463 case 4:
1464 *c->src.ptr = (u32) c->dst.val;
1465 break; /* 64b reg: zero-extend */
1466 case 8:
1467 *c->src.ptr = c->dst.val;
1468 break;
1469 }
1470 /*
1471 * Write back the memory destination with implicit LOCK
1472 * prefix.
1473 */
1474 c->dst.val = c->src.val;
1475 c->lock_prefix = 1;
1476 break;
1477 case 0x88 ... 0x8b: /* mov */
1478 goto mov;
1479 case 0x8d: /* lea r16/r32, m */
1480 c->dst.val = c->modrm_val;
1481 break;
1482 case 0x8f: /* pop (sole member of Grp1a) */
1483 rc = emulate_grp1a(ctxt, ops);
1484 if (rc != 0)
1485 goto done;
1486 break;
1487 case 0x9c: /* pushf */
1488 c->src.val = (unsigned long) ctxt->eflags;
1489 emulate_push(ctxt);
1490 break;
1491 case 0x9d: /* popf */
1492 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1493 goto pop_instruction;
1494 case 0xa0 ... 0xa1: /* mov */
1495 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1496 c->dst.val = c->src.val;
1497 break;
1498 case 0xa2 ... 0xa3: /* mov */
1499 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
1500 break;
1501 case 0xa4 ... 0xa5: /* movs */
1502 c->dst.type = OP_MEM;
1503 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1504 c->dst.ptr = (unsigned long *)register_address(
1505 ctxt->es_base,
1506 c->regs[VCPU_REGS_RDI]);
1507 if ((rc = ops->read_emulated(register_address(
1508 c->override_base ? *c->override_base :
1509 ctxt->ds_base,
1510 c->regs[VCPU_REGS_RSI]),
1511 &c->dst.val,
1512 c->dst.bytes, ctxt->vcpu)) != 0)
1513 goto done;
1514 register_address_increment(c->regs[VCPU_REGS_RSI],
1515 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1516 : c->dst.bytes);
1517 register_address_increment(c->regs[VCPU_REGS_RDI],
1518 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1519 : c->dst.bytes);
1520 break;
1521 case 0xa6 ... 0xa7: /* cmps */
1522 c->src.type = OP_NONE; /* Disable writeback. */
1523 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1524 c->src.ptr = (unsigned long *)register_address(
1525 c->override_base ? *c->override_base :
1526 ctxt->ds_base,
1527 c->regs[VCPU_REGS_RSI]);
1528 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1529 &c->src.val,
1530 c->src.bytes,
1531 ctxt->vcpu)) != 0)
1532 goto done;
1533
1534 c->dst.type = OP_NONE; /* Disable writeback. */
1535 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1536 c->dst.ptr = (unsigned long *)register_address(
1537 ctxt->es_base,
1538 c->regs[VCPU_REGS_RDI]);
1539 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1540 &c->dst.val,
1541 c->dst.bytes,
1542 ctxt->vcpu)) != 0)
1543 goto done;
1544
1545 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
1546
1547 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1548
1549 register_address_increment(c->regs[VCPU_REGS_RSI],
1550 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
1551 : c->src.bytes);
1552 register_address_increment(c->regs[VCPU_REGS_RDI],
1553 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1554 : c->dst.bytes);
1555
1556 break;
1557 case 0xaa ... 0xab: /* stos */
1558 c->dst.type = OP_MEM;
1559 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1560 c->dst.ptr = (unsigned long *)register_address(
1561 ctxt->es_base,
1562 c->regs[VCPU_REGS_RDI]);
1563 c->dst.val = c->regs[VCPU_REGS_RAX];
1564 register_address_increment(c->regs[VCPU_REGS_RDI],
1565 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1566 : c->dst.bytes);
1567 break;
1568 case 0xac ... 0xad: /* lods */
1569 c->dst.type = OP_REG;
1570 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1571 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1572 if ((rc = ops->read_emulated(register_address(
1573 c->override_base ? *c->override_base :
1574 ctxt->ds_base,
1575 c->regs[VCPU_REGS_RSI]),
1576 &c->dst.val,
1577 c->dst.bytes,
1578 ctxt->vcpu)) != 0)
1579 goto done;
1580 register_address_increment(c->regs[VCPU_REGS_RSI],
1581 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1582 : c->dst.bytes);
1583 break;
1584 case 0xae ... 0xaf: /* scas */
1585 DPRINTF("Urk! I don't handle SCAS.\n");
1586 goto cannot_emulate;
1587 case 0xc0 ... 0xc1:
1588 emulate_grp2(ctxt);
1589 break;
1590 case 0xc3: /* ret */
1591 c->dst.ptr = &c->eip;
1592 goto pop_instruction;
1593 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1594 mov:
1595 c->dst.val = c->src.val;
1596 break;
1597 case 0xd0 ... 0xd1: /* Grp2 */
1598 c->src.val = 1;
1599 emulate_grp2(ctxt);
1600 break;
1601 case 0xd2 ... 0xd3: /* Grp2 */
1602 c->src.val = c->regs[VCPU_REGS_RCX];
1603 emulate_grp2(ctxt);
1604 break;
1605 case 0xe8: /* call (near) */ {
1606 long int rel;
1607 switch (c->op_bytes) {
1608 case 2:
1609 rel = insn_fetch(s16, 2, c->eip);
1610 break;
1611 case 4:
1612 rel = insn_fetch(s32, 4, c->eip);
1613 break;
1614 default:
1615 DPRINTF("Call: Invalid op_bytes\n");
1616 goto cannot_emulate;
1617 }
1618 c->src.val = (unsigned long) c->eip;
1619 JMP_REL(rel);
1620 c->op_bytes = c->ad_bytes;
1621 emulate_push(ctxt);
1622 break;
1623 }
1624 case 0xe9: /* jmp rel */
1625 case 0xeb: /* jmp rel short */
1626 JMP_REL(c->src.val);
1627 c->dst.type = OP_NONE; /* Disable writeback. */
1628 break;
1629 case 0xf4: /* hlt */
1630 ctxt->vcpu->arch.halt_request = 1;
1631 goto done;
1632 case 0xf5: /* cmc */
1633 /* complement carry flag from eflags reg */
1634 ctxt->eflags ^= EFLG_CF;
1635 c->dst.type = OP_NONE; /* Disable writeback. */
1636 break;
1637 case 0xf6 ... 0xf7: /* Grp3 */
1638 rc = emulate_grp3(ctxt, ops);
1639 if (rc != 0)
1640 goto done;
1641 break;
1642 case 0xf8: /* clc */
1643 ctxt->eflags &= ~EFLG_CF;
1644 c->dst.type = OP_NONE; /* Disable writeback. */
1645 break;
1646 case 0xfa: /* cli */
1647 ctxt->eflags &= ~X86_EFLAGS_IF;
1648 c->dst.type = OP_NONE; /* Disable writeback. */
1649 break;
1650 case 0xfb: /* sti */
1651 ctxt->eflags |= X86_EFLAGS_IF;
1652 c->dst.type = OP_NONE; /* Disable writeback. */
1653 break;
1654 case 0xfe ... 0xff: /* Grp4/Grp5 */
1655 rc = emulate_grp45(ctxt, ops);
1656 if (rc != 0)
1657 goto done;
1658 break;
1659 }
1660
1661writeback:
1662 rc = writeback(ctxt, ops);
1663 if (rc != 0)
1664 goto done;
1665
1666 /* Commit shadow register state. */
1667 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1668 ctxt->vcpu->arch.rip = c->eip;
1669
1670done:
1671 if (rc == X86EMUL_UNHANDLEABLE) {
1672 c->eip = saved_eip;
1673 return -1;
1674 }
1675 return 0;
1676
1677twobyte_insn:
1678 switch (c->b) {
1679 case 0x01: /* lgdt, lidt, lmsw */
1680 switch (c->modrm_reg) {
1681 u16 size;
1682 unsigned long address;
1683
1684 case 0: /* vmcall */
1685 if (c->modrm_mod != 3 || c->modrm_rm != 1)
1686 goto cannot_emulate;
1687
1688 rc = kvm_fix_hypercall(ctxt->vcpu);
1689 if (rc)
1690 goto done;
1691
1692 kvm_emulate_hypercall(ctxt->vcpu);
1693 break;
1694 case 2: /* lgdt */
1695 rc = read_descriptor(ctxt, ops, c->src.ptr,
1696 &size, &address, c->op_bytes);
1697 if (rc)
1698 goto done;
1699 realmode_lgdt(ctxt->vcpu, size, address);
1700 break;
1701 case 3: /* lidt/vmmcall */
1702 if (c->modrm_mod == 3 && c->modrm_rm == 1) {
1703 rc = kvm_fix_hypercall(ctxt->vcpu);
1704 if (rc)
1705 goto done;
1706 kvm_emulate_hypercall(ctxt->vcpu);
1707 } else {
1708 rc = read_descriptor(ctxt, ops, c->src.ptr,
1709 &size, &address,
1710 c->op_bytes);
1711 if (rc)
1712 goto done;
1713 realmode_lidt(ctxt->vcpu, size, address);
1714 }
1715 break;
1716 case 4: /* smsw */
1717 if (c->modrm_mod != 3)
1718 goto cannot_emulate;
1719 *(u16 *)&c->regs[c->modrm_rm]
1720 = realmode_get_cr(ctxt->vcpu, 0);
1721 break;
1722 case 6: /* lmsw */
1723 if (c->modrm_mod != 3)
1724 goto cannot_emulate;
1725 realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
1726 &ctxt->eflags);
1727 break;
1728 case 7: /* invlpg*/
1729 emulate_invlpg(ctxt->vcpu, memop);
1730 break;
1731 default:
1732 goto cannot_emulate;
1733 }
1734 /* Disable writeback. */
1735 c->dst.type = OP_NONE;
1736 break;
1737 case 0x06:
1738 emulate_clts(ctxt->vcpu);
1739 c->dst.type = OP_NONE;
1740 break;
1741 case 0x08: /* invd */
1742 case 0x09: /* wbinvd */
1743 case 0x0d: /* GrpP (prefetch) */
1744 case 0x18: /* Grp16 (prefetch/nop) */
1745 c->dst.type = OP_NONE;
1746 break;
1747 case 0x20: /* mov cr, reg */
1748 if (c->modrm_mod != 3)
1749 goto cannot_emulate;
1750 c->regs[c->modrm_rm] =
1751 realmode_get_cr(ctxt->vcpu, c->modrm_reg);
1752 c->dst.type = OP_NONE; /* no writeback */
1753 break;
1754 case 0x21: /* mov from dr to reg */
1755 if (c->modrm_mod != 3)
1756 goto cannot_emulate;
1757 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
1758 if (rc)
1759 goto cannot_emulate;
1760 c->dst.type = OP_NONE; /* no writeback */
1761 break;
1762 case 0x22: /* mov reg, cr */
1763 if (c->modrm_mod != 3)
1764 goto cannot_emulate;
1765 realmode_set_cr(ctxt->vcpu,
1766 c->modrm_reg, c->modrm_val, &ctxt->eflags);
1767 c->dst.type = OP_NONE;
1768 break;
1769 case 0x23: /* mov from reg to dr */
1770 if (c->modrm_mod != 3)
1771 goto cannot_emulate;
1772 rc = emulator_set_dr(ctxt, c->modrm_reg,
1773 c->regs[c->modrm_rm]);
1774 if (rc)
1775 goto cannot_emulate;
1776 c->dst.type = OP_NONE; /* no writeback */
1777 break;
1778 case 0x30:
1779 /* wrmsr */
1780 msr_data = (u32)c->regs[VCPU_REGS_RAX]
1781 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
1782 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
1783 if (rc) {
1784 kvm_inject_gp(ctxt->vcpu, 0);
1785 c->eip = ctxt->vcpu->arch.rip;
1786 }
1787 rc = X86EMUL_CONTINUE;
1788 c->dst.type = OP_NONE;
1789 break;
1790 case 0x32:
1791 /* rdmsr */
1792 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1793 if (rc) {
1794 kvm_inject_gp(ctxt->vcpu, 0);
1795 c->eip = ctxt->vcpu->arch.rip;
1796 } else {
1797 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1798 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
1799 }
1800 rc = X86EMUL_CONTINUE;
1801 c->dst.type = OP_NONE;
1802 break;
1803 case 0x40 ... 0x4f: /* cmov */
1804 c->dst.val = c->dst.orig_val = c->src.val;
1805 if (!test_cc(c->b, ctxt->eflags))
1806 c->dst.type = OP_NONE; /* no writeback */
1807 break;
1808 case 0x80 ... 0x8f: /* jnz rel, etc*/ {
1809 long int rel;
1810
1811 switch (c->op_bytes) {
1812 case 2:
1813 rel = insn_fetch(s16, 2, c->eip);
1814 break;
1815 case 4:
1816 rel = insn_fetch(s32, 4, c->eip);
1817 break;
1818 case 8:
1819 rel = insn_fetch(s64, 8, c->eip);
1820 break;
1821 default:
1822 DPRINTF("jnz: Invalid op_bytes\n");
1823 goto cannot_emulate;
1824 }
1825 if (test_cc(c->b, ctxt->eflags))
1826 JMP_REL(rel);
1827 c->dst.type = OP_NONE;
1828 break;
1829 }
1830 case 0xa3:
1831 bt: /* bt */
1832 c->dst.type = OP_NONE;
1833 /* only subword offset */
1834 c->src.val &= (c->dst.bytes << 3) - 1;
1835 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
1836 break;
1837 case 0xab:
1838 bts: /* bts */
1839 /* only subword offset */
1840 c->src.val &= (c->dst.bytes << 3) - 1;
1841 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1842 break;
1843 case 0xb0 ... 0xb1: /* cmpxchg */
1844 /*
1845 * Save real source value, then compare EAX against
1846 * destination.
1847 */
1848 c->src.orig_val = c->src.val;
1849 c->src.val = c->regs[VCPU_REGS_RAX];
1850 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1851 if (ctxt->eflags & EFLG_ZF) {
1852 /* Success: write back to memory. */
1853 c->dst.val = c->src.orig_val;
1854 } else {
1855 /* Failure: write the value we saw to EAX. */
1856 c->dst.type = OP_REG;
1857 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1858 }
1859 break;
1860 case 0xb3:
1861 btr: /* btr */
1862 /* only subword offset */
1863 c->src.val &= (c->dst.bytes << 3) - 1;
1864 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
1865 break;
1866 case 0xb6 ... 0xb7: /* movzx */
1867 c->dst.bytes = c->op_bytes;
1868 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
1869 : (u16) c->src.val;
1870 break;
1871 case 0xba: /* Grp8 */
1872 switch (c->modrm_reg & 3) {
1873 case 0:
1874 goto bt;
1875 case 1:
1876 goto bts;
1877 case 2:
1878 goto btr;
1879 case 3:
1880 goto btc;
1881 }
1882 break;
1883 case 0xbb:
1884 btc: /* btc */
1885 /* only subword offset */
1886 c->src.val &= (c->dst.bytes << 3) - 1;
1887 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
1888 break;
1889 case 0xbe ... 0xbf: /* movsx */
1890 c->dst.bytes = c->op_bytes;
1891 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
1892 (s16) c->src.val;
1893 break;
1894 case 0xc3: /* movnti */
1895 c->dst.bytes = c->op_bytes;
1896 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
1897 (u64) c->src.val;
1898 break;
1899 case 0xc7: /* Grp9 (cmpxchg8b) */
1900 rc = emulate_grp9(ctxt, ops, memop);
1901 if (rc != 0)
1902 goto done;
1903 c->dst.type = OP_NONE;
1904 break;
1905 }
1906 goto writeback;
1907
1908cannot_emulate:
1909 DPRINTF("Cannot emulate %02x\n", c->b);
1910 c->eip = saved_eip;
1911 return -1;
1912}
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 19626ace0f5..964dfa36d36 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -1,6 +1,7 @@
1config LGUEST_GUEST 1config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32
4 depends on !X86_PAE 5 depends on !X86_PAE
5 depends on !(X86_VISWS || X86_VOYAGER) 6 depends on !(X86_VISWS || X86_VOYAGER)
6 select VIRTIO 7 select VIRTIO
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 92c56117eae..5afdde4895d 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -67,6 +67,7 @@
67#include <asm/mce.h> 67#include <asm/mce.h>
68#include <asm/io.h> 68#include <asm/io.h>
69#include <asm/i387.h> 69#include <asm/i387.h>
70#include <asm/reboot.h> /* for struct machine_ops */
70 71
71/*G:010 Welcome to the Guest! 72/*G:010 Welcome to the Guest!
72 * 73 *
@@ -175,8 +176,8 @@ static void lguest_leave_lazy_mode(void)
175 * check there when it wants to deliver an interrupt. 176 * check there when it wants to deliver an interrupt.
176 */ 177 */
177 178
178/* save_flags() is expected to return the processor state (ie. "eflags"). The 179/* save_flags() is expected to return the processor state (ie. "flags"). The
179 * eflags word contains all kind of stuff, but in practice Linux only cares 180 * flags word contains all kind of stuff, but in practice Linux only cares
180 * about the interrupt flag. Our "save_flags()" just returns that. */ 181 * about the interrupt flag. Our "save_flags()" just returns that. */
181static unsigned long save_fl(void) 182static unsigned long save_fl(void)
182{ 183{
@@ -217,19 +218,20 @@ static void irq_enable(void)
217 * address of the handler, and... well, who cares? The Guest just asks the 218 * address of the handler, and... well, who cares? The Guest just asks the
218 * Host to make the change anyway, because the Host controls the real IDT. 219 * Host to make the change anyway, because the Host controls the real IDT.
219 */ 220 */
220static void lguest_write_idt_entry(struct desc_struct *dt, 221static void lguest_write_idt_entry(gate_desc *dt,
221 int entrynum, u32 low, u32 high) 222 int entrynum, const gate_desc *g)
222{ 223{
224 u32 *desc = (u32 *)g;
223 /* Keep the local copy up to date. */ 225 /* Keep the local copy up to date. */
224 write_dt_entry(dt, entrynum, low, high); 226 native_write_idt_entry(dt, entrynum, g);
225 /* Tell Host about this new entry. */ 227 /* Tell Host about this new entry. */
226 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); 228 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
227} 229}
228 230
229/* Changing to a different IDT is very rare: we keep the IDT up-to-date every 231/* Changing to a different IDT is very rare: we keep the IDT up-to-date every
230 * time it is written, so we can simply loop through all entries and tell the 232 * time it is written, so we can simply loop through all entries and tell the
231 * Host about them. */ 233 * Host about them. */
232static void lguest_load_idt(const struct Xgt_desc_struct *desc) 234static void lguest_load_idt(const struct desc_ptr *desc)
233{ 235{
234 unsigned int i; 236 unsigned int i;
235 struct desc_struct *idt = (void *)desc->address; 237 struct desc_struct *idt = (void *)desc->address;
@@ -252,7 +254,7 @@ static void lguest_load_idt(const struct Xgt_desc_struct *desc)
252 * hypercall and use that repeatedly to load a new IDT. I don't think it 254 * hypercall and use that repeatedly to load a new IDT. I don't think it
253 * really matters, but wouldn't it be nice if they were the same? 255 * really matters, but wouldn't it be nice if they were the same?
254 */ 256 */
255static void lguest_load_gdt(const struct Xgt_desc_struct *desc) 257static void lguest_load_gdt(const struct desc_ptr *desc)
256{ 258{
257 BUG_ON((desc->size+1)/8 != GDT_ENTRIES); 259 BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
258 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); 260 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
@@ -261,10 +263,10 @@ static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
261/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, 263/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
262 * then tell the Host to reload the entire thing. This operation is so rare 264 * then tell the Host to reload the entire thing. This operation is so rare
263 * that this naive implementation is reasonable. */ 265 * that this naive implementation is reasonable. */
264static void lguest_write_gdt_entry(struct desc_struct *dt, 266static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
265 int entrynum, u32 low, u32 high) 267 const void *desc, int type)
266{ 268{
267 write_dt_entry(dt, entrynum, low, high); 269 native_write_gdt_entry(dt, entrynum, desc, type);
268 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); 270 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
269} 271}
270 272
@@ -323,30 +325,30 @@ static void lguest_load_tr_desc(void)
323 * anyone (including userspace) can just use the raw "cpuid" instruction and 325 * anyone (including userspace) can just use the raw "cpuid" instruction and
324 * the Host won't even notice since it isn't privileged. So we try not to get 326 * the Host won't even notice since it isn't privileged. So we try not to get
325 * too worked up about it. */ 327 * too worked up about it. */
326static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, 328static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
327 unsigned int *ecx, unsigned int *edx) 329 unsigned int *cx, unsigned int *dx)
328{ 330{
329 int function = *eax; 331 int function = *ax;
330 332
331 native_cpuid(eax, ebx, ecx, edx); 333 native_cpuid(ax, bx, cx, dx);
332 switch (function) { 334 switch (function) {
333 case 1: /* Basic feature request. */ 335 case 1: /* Basic feature request. */
334 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 336 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
335 *ecx &= 0x00002201; 337 *cx &= 0x00002201;
336 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ 338 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
337 *edx &= 0x07808101; 339 *dx &= 0x07808101;
338 /* The Host can do a nice optimization if it knows that the 340 /* The Host can do a nice optimization if it knows that the
339 * kernel mappings (addresses above 0xC0000000 or whatever 341 * kernel mappings (addresses above 0xC0000000 or whatever
340 * PAGE_OFFSET is set to) haven't changed. But Linux calls 342 * PAGE_OFFSET is set to) haven't changed. But Linux calls
341 * flush_tlb_user() for both user and kernel mappings unless 343 * flush_tlb_user() for both user and kernel mappings unless
342 * the Page Global Enable (PGE) feature bit is set. */ 344 * the Page Global Enable (PGE) feature bit is set. */
343 *edx |= 0x00002000; 345 *dx |= 0x00002000;
344 break; 346 break;
345 case 0x80000000: 347 case 0x80000000:
346 /* Futureproof this a little: if they ask how much extended 348 /* Futureproof this a little: if they ask how much extended
347 * processor information there is, limit it to known fields. */ 349 * processor information there is, limit it to known fields. */
348 if (*eax > 0x80000008) 350 if (*ax > 0x80000008)
349 *eax = 0x80000008; 351 *ax = 0x80000008;
350 break; 352 break;
351 } 353 }
352} 354}
@@ -755,10 +757,10 @@ static void lguest_time_init(void)
755 * segment), the privilege level (we're privilege level 1, the Host is 0 and 757 * segment), the privilege level (we're privilege level 1, the Host is 0 and
756 * will not tolerate us trying to use that), the stack pointer, and the number 758 * will not tolerate us trying to use that), the stack pointer, and the number
757 * of pages in the stack. */ 759 * of pages in the stack. */
758static void lguest_load_esp0(struct tss_struct *tss, 760static void lguest_load_sp0(struct tss_struct *tss,
759 struct thread_struct *thread) 761 struct thread_struct *thread)
760{ 762{
761 lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0, 763 lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0,
762 THREAD_SIZE/PAGE_SIZE); 764 THREAD_SIZE/PAGE_SIZE);
763} 765}
764 766
@@ -788,11 +790,11 @@ static void lguest_wbinvd(void)
788 * code qualifies for Advanced. It will also never interrupt anything. It 790 * code qualifies for Advanced. It will also never interrupt anything. It
789 * does, however, allow us to get through the Linux boot code. */ 791 * does, however, allow us to get through the Linux boot code. */
790#ifdef CONFIG_X86_LOCAL_APIC 792#ifdef CONFIG_X86_LOCAL_APIC
791static void lguest_apic_write(unsigned long reg, unsigned long v) 793static void lguest_apic_write(unsigned long reg, u32 v)
792{ 794{
793} 795}
794 796
795static unsigned long lguest_apic_read(unsigned long reg) 797static u32 lguest_apic_read(unsigned long reg)
796{ 798{
797 return 0; 799 return 0;
798} 800}
@@ -812,7 +814,7 @@ static void lguest_safe_halt(void)
812 * rather than virtual addresses, so we use __pa() here. */ 814 * rather than virtual addresses, so we use __pa() here. */
813static void lguest_power_off(void) 815static void lguest_power_off(void)
814{ 816{
815 hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); 817 hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0);
816} 818}
817 819
818/* 820/*
@@ -822,7 +824,7 @@ static void lguest_power_off(void)
822 */ 824 */
823static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) 825static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
824{ 826{
825 hcall(LHCALL_CRASH, __pa(p), 0, 0); 827 hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0);
826 /* The hcall won't return, but to keep gcc happy, we're "done". */ 828 /* The hcall won't return, but to keep gcc happy, we're "done". */
827 return NOTIFY_DONE; 829 return NOTIFY_DONE;
828} 830}
@@ -926,6 +928,11 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
926 return insn_len; 928 return insn_len;
927} 929}
928 930
931static void lguest_restart(char *reason)
932{
933 hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
934}
935
929/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops 936/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
930 * structures in the kernel provide points for (almost) every routine we have 937 * structures in the kernel provide points for (almost) every routine we have
931 * to override to avoid privileged instructions. */ 938 * to override to avoid privileged instructions. */
@@ -957,7 +964,7 @@ __init void lguest_init(void)
957 pv_cpu_ops.cpuid = lguest_cpuid; 964 pv_cpu_ops.cpuid = lguest_cpuid;
958 pv_cpu_ops.load_idt = lguest_load_idt; 965 pv_cpu_ops.load_idt = lguest_load_idt;
959 pv_cpu_ops.iret = lguest_iret; 966 pv_cpu_ops.iret = lguest_iret;
960 pv_cpu_ops.load_esp0 = lguest_load_esp0; 967 pv_cpu_ops.load_sp0 = lguest_load_sp0;
961 pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; 968 pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
962 pv_cpu_ops.set_ldt = lguest_set_ldt; 969 pv_cpu_ops.set_ldt = lguest_set_ldt;
963 pv_cpu_ops.load_tls = lguest_load_tls; 970 pv_cpu_ops.load_tls = lguest_load_tls;
@@ -1059,6 +1066,7 @@ __init void lguest_init(void)
1059 * the Guest routine to power off. */ 1066 * the Guest routine to power off. */
1060 pm_power_off = lguest_power_off; 1067 pm_power_off = lguest_power_off;
1061 1068
1069 machine_ops.restart = lguest_restart;
1062 /* Now we're set up, call start_kernel() in init/main.c and we proceed 1070 /* Now we're set up, call start_kernel() in init/main.c and we proceed
1063 * to boot as normal. It never returns. */ 1071 * to boot as normal. It never returns. */
1064 start_kernel(); 1072 start_kernel();
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 329da276c6f..25df1c1989f 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -1,5 +1,27 @@
1#
2# Makefile for x86 specific library files.
3#
4
5obj-$(CONFIG_SMP) := msr-on-cpu.o
6
7lib-y := delay_$(BITS).o
8lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o
9lib-y += memcpy_$(BITS).o
10
1ifeq ($(CONFIG_X86_32),y) 11ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/lib/Makefile_32 12 lib-y += checksum_32.o
13 lib-y += strstr_32.o
14 lib-y += bitops_32.o semaphore_32.o string_32.o
15
16 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
3else 17else
4include ${srctree}/arch/x86/lib/Makefile_64 18 obj-y += io_64.o iomap_copy_64.o
19
20 CFLAGS_csum-partial_64.o := -funroll-loops
21
22 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
23 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
24 lib-y += bitops_64.o
25 lib-y += memmove_64.o memset_64.o
26 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
5endif 27endif
diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32
deleted file mode 100644
index 98d1f1e2e2e..00000000000
--- a/arch/x86/lib/Makefile_32
+++ /dev/null
@@ -1,11 +0,0 @@
1#
2# Makefile for i386-specific library files..
3#
4
5
6lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \
7 bitops_32.o semaphore_32.o string_32.o
8
9lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
10
11obj-$(CONFIG_SMP) += msr-on-cpu.o
diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64
deleted file mode 100644
index bbabad3c933..00000000000
--- a/arch/x86/lib/Makefile_64
+++ /dev/null
@@ -1,13 +0,0 @@
1#
2# Makefile for x86_64-specific library files.
3#
4
5CFLAGS_csum-partial_64.o := -funroll-loops
6
7obj-y := io_64.o iomap_copy_64.o
8obj-$(CONFIG_SMP) += msr-on-cpu.o
9
10lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
11 usercopy_64.o getuser_64.o putuser_64.o \
12 thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
13lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c
index afd0045595d..b6544045985 100644
--- a/arch/x86/lib/bitops_32.c
+++ b/arch/x86/lib/bitops_32.c
@@ -2,7 +2,7 @@
2#include <linux/module.h> 2#include <linux/module.h>
3 3
4/** 4/**
5 * find_next_bit - find the first set bit in a memory region 5 * find_next_bit - find the next set bit in a memory region
6 * @addr: The address to base the search on 6 * @addr: The address to base the search on
7 * @offset: The bitnumber to start searching at 7 * @offset: The bitnumber to start searching at
8 * @size: The maximum size to search 8 * @size: The maximum size to search
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
index 95b6d9639fb..0e8f491e6cc 100644
--- a/arch/x86/lib/bitops_64.c
+++ b/arch/x86/lib/bitops_64.c
@@ -58,7 +58,7 @@ long find_first_zero_bit(const unsigned long * addr, unsigned long size)
58} 58}
59 59
60/** 60/**
61 * find_next_zero_bit - find the first zero bit in a memory region 61 * find_next_zero_bit - find the next zero bit in a memory region
62 * @addr: The address to base the search on 62 * @addr: The address to base the search on
63 * @offset: The bitnumber to start searching at 63 * @offset: The bitnumber to start searching at
64 * @size: The maximum size to search 64 * @size: The maximum size to search
diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c
deleted file mode 100644
index 7445caf1b5d..00000000000
--- a/arch/x86/lib/bitstr_64.c
+++ /dev/null
@@ -1,28 +0,0 @@
1#include <linux/module.h>
2#include <linux/bitops.h>
3
4/* Find string of zero bits in a bitmap */
5unsigned long
6find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
7{
8 unsigned long n, end, i;
9
10 again:
11 n = find_next_zero_bit(bitmap, nbits, start);
12 if (n == -1)
13 return -1;
14
15 /* could test bitsliced, but it's hardly worth it */
16 end = n+len;
17 if (end > nbits)
18 return -1;
19 for (i = n+1; i < end; i++) {
20 if (test_bit(i, bitmap)) {
21 start = i+1;
22 goto again;
23 }
24 }
25 return n;
26}
27
28EXPORT_SYMBOL(find_next_zero_string);
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c
index aad9d95469d..4535e6d147a 100644
--- a/arch/x86/lib/delay_32.c
+++ b/arch/x86/lib/delay_32.c
@@ -12,8 +12,10 @@
12 12
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/timex.h>
15#include <linux/preempt.h> 16#include <linux/preempt.h>
16#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/init.h>
17 19
18#include <asm/processor.h> 20#include <asm/processor.h>
19#include <asm/delay.h> 21#include <asm/delay.h>
@@ -63,7 +65,7 @@ void use_tsc_delay(void)
63 delay_fn = delay_tsc; 65 delay_fn = delay_tsc;
64} 66}
65 67
66int read_current_timer(unsigned long *timer_val) 68int __devinit read_current_timer(unsigned long *timer_val)
67{ 69{
68 if (delay_fn == delay_tsc) { 70 if (delay_fn == delay_tsc) {
69 rdtscl(*timer_val); 71 rdtscl(*timer_val);
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
index 45cdd3fbd91..bbc61051851 100644
--- a/arch/x86/lib/delay_64.c
+++ b/arch/x86/lib/delay_64.c
@@ -10,8 +10,10 @@
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/timex.h>
13#include <linux/preempt.h> 14#include <linux/preempt.h>
14#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/init.h>
15 17
16#include <asm/delay.h> 18#include <asm/delay.h>
17#include <asm/msr.h> 19#include <asm/msr.h>
@@ -20,7 +22,7 @@
20#include <asm/smp.h> 22#include <asm/smp.h>
21#endif 23#endif
22 24
23int read_current_timer(unsigned long *timer_value) 25int __devinit read_current_timer(unsigned long *timer_value)
24{ 26{
25 rdtscll(*timer_value); 27 rdtscll(*timer_value);
26 return 0; 28 return 0;
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 8ac51b82a63..37756b6fb32 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -34,8 +34,8 @@ void *memmove(void *dest, const void *src, size_t n)
34 "cld" 34 "cld"
35 : "=&c" (d0), "=&S" (d1), "=&D" (d2) 35 : "=&c" (d0), "=&S" (d1), "=&D" (d2)
36 :"0" (n), 36 :"0" (n),
37 "1" (n-1+(const char *)src), 37 "1" (n-1+src),
38 "2" (n-1+(char *)dest) 38 "2" (n-1+dest)
39 :"memory"); 39 :"memory");
40 } 40 }
41 return dest; 41 return dest;
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 751ebae8ec4..80175e47b19 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -11,8 +11,8 @@ void *memmove(void * dest,const void *src,size_t count)
11 if (dest < src) { 11 if (dest < src) {
12 return memcpy(dest,src,count); 12 return memcpy(dest,src,count);
13 } else { 13 } else {
14 char *p = (char *) dest + count; 14 char *p = dest + count;
15 char *s = (char *) src + count; 15 const char *s = src + count;
16 while (count--) 16 while (count--)
17 *--p = *--s; 17 *--p = *--s;
18 } 18 }
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index 28084d2e8dd..cc9b4a4450f 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -4,6 +4,7 @@
4#include <linux/hardirq.h> 4#include <linux/hardirq.h>
5#include <linux/module.h> 5#include <linux/module.h>
6 6
7#include <asm/asm.h>
7#include <asm/i387.h> 8#include <asm/i387.h>
8 9
9 10
@@ -50,10 +51,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
50 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ 51 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
51 " jmp 2b\n" 52 " jmp 2b\n"
52 ".previous\n" 53 ".previous\n"
53 ".section __ex_table,\"a\"\n" 54 _ASM_EXTABLE(1b,3b)
54 " .align 4\n"
55 " .long 1b, 3b\n"
56 ".previous"
57 : : "r" (from) ); 55 : : "r" (from) );
58 56
59 57
@@ -81,10 +79,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
81 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ 79 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
82 " jmp 2b\n" 80 " jmp 2b\n"
83 ".previous\n" 81 ".previous\n"
84 ".section __ex_table,\"a\"\n" 82 _ASM_EXTABLE(1b,3b)
85 " .align 4\n"
86 " .long 1b, 3b\n"
87 ".previous"
88 : : "r" (from), "r" (to) : "memory"); 83 : : "r" (from), "r" (to) : "memory");
89 from+=64; 84 from+=64;
90 to+=64; 85 to+=64;
@@ -181,10 +176,7 @@ static void fast_copy_page(void *to, void *from)
181 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ 176 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
182 " jmp 2b\n" 177 " jmp 2b\n"
183 ".previous\n" 178 ".previous\n"
184 ".section __ex_table,\"a\"\n" 179 _ASM_EXTABLE(1b,3b)
185 " .align 4\n"
186 " .long 1b, 3b\n"
187 ".previous"
188 : : "r" (from) ); 180 : : "r" (from) );
189 181
190 for(i=0; i<(4096-320)/64; i++) 182 for(i=0; i<(4096-320)/64; i++)
@@ -211,10 +203,7 @@ static void fast_copy_page(void *to, void *from)
211 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ 203 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
212 " jmp 2b\n" 204 " jmp 2b\n"
213 ".previous\n" 205 ".previous\n"
214 ".section __ex_table,\"a\"\n" 206 _ASM_EXTABLE(1b,3b)
215 " .align 4\n"
216 " .long 1b, 3b\n"
217 ".previous"
218 : : "r" (from), "r" (to) : "memory"); 207 : : "r" (from), "r" (to) : "memory");
219 from+=64; 208 from+=64;
220 to+=64; 209 to+=64;
@@ -311,10 +300,7 @@ static void fast_copy_page(void *to, void *from)
311 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ 300 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
312 " jmp 2b\n" 301 " jmp 2b\n"
313 ".previous\n" 302 ".previous\n"
314 ".section __ex_table,\"a\"\n" 303 _ASM_EXTABLE(1b,3b)
315 " .align 4\n"
316 " .long 1b, 3b\n"
317 ".previous"
318 : : "r" (from) ); 304 : : "r" (from) );
319 305
320 for(i=0; i<4096/64; i++) 306 for(i=0; i<4096/64; i++)
@@ -341,10 +327,7 @@ static void fast_copy_page(void *to, void *from)
341 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ 327 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
342 " jmp 2b\n" 328 " jmp 2b\n"
343 ".previous\n" 329 ".previous\n"
344 ".section __ex_table,\"a\"\n" 330 _ASM_EXTABLE(1b,3b)
345 " .align 4\n"
346 " .long 1b, 3b\n"
347 ".previous"
348 : : "r" (from), "r" (to) : "memory"); 331 : : "r" (from), "r" (to) : "memory");
349 from+=64; 332 from+=64;
350 to+=64; 333 to+=64;
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 444fba40098..3899bd37fdf 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -29,7 +29,7 @@
29 * registers (%eax, %edx and %ecx) except %eax which is either a return 29 * registers (%eax, %edx and %ecx) except %eax which is either a return
30 * value or just clobbered.. 30 * value or just clobbered..
31 */ 31 */
32 .section .sched.text 32 .section .sched.text, "ax"
33ENTRY(__down_failed) 33ENTRY(__down_failed)
34 CFI_STARTPROC 34 CFI_STARTPROC
35 FRAME 35 FRAME
@@ -49,7 +49,7 @@ ENTRY(__down_failed)
49 ENDFRAME 49 ENDFRAME
50 ret 50 ret
51 CFI_ENDPROC 51 CFI_ENDPROC
52 END(__down_failed) 52 ENDPROC(__down_failed)
53 53
54ENTRY(__down_failed_interruptible) 54ENTRY(__down_failed_interruptible)
55 CFI_STARTPROC 55 CFI_STARTPROC
@@ -70,7 +70,7 @@ ENTRY(__down_failed_interruptible)
70 ENDFRAME 70 ENDFRAME
71 ret 71 ret
72 CFI_ENDPROC 72 CFI_ENDPROC
73 END(__down_failed_interruptible) 73 ENDPROC(__down_failed_interruptible)
74 74
75ENTRY(__down_failed_trylock) 75ENTRY(__down_failed_trylock)
76 CFI_STARTPROC 76 CFI_STARTPROC
@@ -91,7 +91,7 @@ ENTRY(__down_failed_trylock)
91 ENDFRAME 91 ENDFRAME
92 ret 92 ret
93 CFI_ENDPROC 93 CFI_ENDPROC
94 END(__down_failed_trylock) 94 ENDPROC(__down_failed_trylock)
95 95
96ENTRY(__up_wakeup) 96ENTRY(__up_wakeup)
97 CFI_STARTPROC 97 CFI_STARTPROC
@@ -112,7 +112,7 @@ ENTRY(__up_wakeup)
112 ENDFRAME 112 ENDFRAME
113 ret 113 ret
114 CFI_ENDPROC 114 CFI_ENDPROC
115 END(__up_wakeup) 115 ENDPROC(__up_wakeup)
116 116
117/* 117/*
118 * rw spinlock fallbacks 118 * rw spinlock fallbacks
@@ -132,7 +132,7 @@ ENTRY(__write_lock_failed)
132 ENDFRAME 132 ENDFRAME
133 ret 133 ret
134 CFI_ENDPROC 134 CFI_ENDPROC
135 END(__write_lock_failed) 135 ENDPROC(__write_lock_failed)
136 136
137ENTRY(__read_lock_failed) 137ENTRY(__read_lock_failed)
138 CFI_STARTPROC 138 CFI_STARTPROC
@@ -148,7 +148,7 @@ ENTRY(__read_lock_failed)
148 ENDFRAME 148 ENDFRAME
149 ret 149 ret
150 CFI_ENDPROC 150 CFI_ENDPROC
151 END(__read_lock_failed) 151 ENDPROC(__read_lock_failed)
152 152
153#endif 153#endif
154 154
@@ -170,7 +170,7 @@ ENTRY(call_rwsem_down_read_failed)
170 CFI_ADJUST_CFA_OFFSET -4 170 CFI_ADJUST_CFA_OFFSET -4
171 ret 171 ret
172 CFI_ENDPROC 172 CFI_ENDPROC
173 END(call_rwsem_down_read_failed) 173 ENDPROC(call_rwsem_down_read_failed)
174 174
175ENTRY(call_rwsem_down_write_failed) 175ENTRY(call_rwsem_down_write_failed)
176 CFI_STARTPROC 176 CFI_STARTPROC
@@ -182,7 +182,7 @@ ENTRY(call_rwsem_down_write_failed)
182 CFI_ADJUST_CFA_OFFSET -4 182 CFI_ADJUST_CFA_OFFSET -4
183 ret 183 ret
184 CFI_ENDPROC 184 CFI_ENDPROC
185 END(call_rwsem_down_write_failed) 185 ENDPROC(call_rwsem_down_write_failed)
186 186
187ENTRY(call_rwsem_wake) 187ENTRY(call_rwsem_wake)
188 CFI_STARTPROC 188 CFI_STARTPROC
@@ -196,7 +196,7 @@ ENTRY(call_rwsem_wake)
196 CFI_ADJUST_CFA_OFFSET -4 196 CFI_ADJUST_CFA_OFFSET -4
1971: ret 1971: ret
198 CFI_ENDPROC 198 CFI_ENDPROC
199 END(call_rwsem_wake) 199 ENDPROC(call_rwsem_wake)
200 200
201/* Fix up special calling conventions */ 201/* Fix up special calling conventions */
202ENTRY(call_rwsem_downgrade_wake) 202ENTRY(call_rwsem_downgrade_wake)
@@ -214,6 +214,6 @@ ENTRY(call_rwsem_downgrade_wake)
214 CFI_ADJUST_CFA_OFFSET -4 214 CFI_ADJUST_CFA_OFFSET -4
215 ret 215 ret
216 CFI_ENDPROC 216 CFI_ENDPROC
217 END(call_rwsem_downgrade_wake) 217 ENDPROC(call_rwsem_downgrade_wake)
218 218
219#endif 219#endif
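
Two mechanical changes repeat through this file and thunk_64.S below: the .sched.text section now carries explicit "ax" (allocatable, executable) flags so the assembler no longer has to guess the section attributes, and END() becomes ENDPROC(), which additionally types the symbol as a function for better symbol tables and backtraces. Roughly, per include/linux/linkage.h (a sketch, not the verbatim definitions):

#define END(name)                       \
        .size name, .-name

#define ENDPROC(name)                   \
        .type name, @function;          \
        END(name)
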
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 6ea73f3de56..8b92d428ab0 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -33,7 +33,7 @@
33 .endm 33 .endm
34 34
35 35
36 .section .sched.text 36 .section .sched.text, "ax"
37#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM 37#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
38 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed 38 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
39 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed 39 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 8bab2b2efaf..e849b9998b0 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -48,10 +48,7 @@ do { \
48 "3: movl %5,%0\n" \ 48 "3: movl %5,%0\n" \
49 " jmp 2b\n" \ 49 " jmp 2b\n" \
50 ".previous\n" \ 50 ".previous\n" \
51 ".section __ex_table,\"a\"\n" \ 51 _ASM_EXTABLE(0b,3b) \
52 " .align 4\n" \
53 " .long 0b,3b\n" \
54 ".previous" \
55 : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ 52 : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
56 "=&D" (__d2) \ 53 "=&D" (__d2) \
57 : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ 54 : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
@@ -132,11 +129,8 @@ do { \
132 "3: lea 0(%2,%0,4),%0\n" \ 129 "3: lea 0(%2,%0,4),%0\n" \
133 " jmp 2b\n" \ 130 " jmp 2b\n" \
134 ".previous\n" \ 131 ".previous\n" \
135 ".section __ex_table,\"a\"\n" \ 132 _ASM_EXTABLE(0b,3b) \
136 " .align 4\n" \ 133 _ASM_EXTABLE(1b,2b) \
137 " .long 0b,3b\n" \
138 " .long 1b,2b\n" \
139 ".previous" \
140 : "=&c"(size), "=&D" (__d0) \ 134 : "=&c"(size), "=&D" (__d0) \
141 : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ 135 : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \
142} while (0) 136} while (0)
@@ -817,6 +811,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
817#endif 811#endif
818 return n; 812 return n;
819} 813}
814EXPORT_SYMBOL(__copy_from_user_ll_nocache);
820 815
821unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, 816unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
822 unsigned long n) 817 unsigned long n)
@@ -831,6 +826,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
831#endif 826#endif
832 return n; 827 return n;
833} 828}
829EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
834 830
835/** 831/**
836 * copy_to_user: - Copy a block of data into user space. 832 * copy_to_user: - Copy a block of data into user space.
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 893d43f838c..0c89d1bb028 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -31,10 +31,7 @@ do { \
31 "3: movq %5,%0\n" \ 31 "3: movq %5,%0\n" \
32 " jmp 2b\n" \ 32 " jmp 2b\n" \
33 ".previous\n" \ 33 ".previous\n" \
34 ".section __ex_table,\"a\"\n" \ 34 _ASM_EXTABLE(0b,3b) \
35 " .align 8\n" \
36 " .quad 0b,3b\n" \
37 ".previous" \
38 : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ 35 : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
39 "=&D" (__d2) \ 36 "=&D" (__d2) \
40 : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ 37 : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
@@ -87,11 +84,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
87 "3: lea 0(%[size1],%[size8],8),%[size8]\n" 84 "3: lea 0(%[size1],%[size8],8),%[size8]\n"
88 " jmp 2b\n" 85 " jmp 2b\n"
89 ".previous\n" 86 ".previous\n"
90 ".section __ex_table,\"a\"\n" 87 _ASM_EXTABLE(0b,3b)
91 " .align 8\n" 88 _ASM_EXTABLE(1b,2b)
92 " .quad 0b,3b\n"
93 " .quad 1b,2b\n"
94 ".previous"
95 : [size8] "=c"(size), [dst] "=&D" (__d0) 89 : [size8] "=c"(size), [dst] "=&D" (__d0)
96 : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), 90 : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
97 [zero] "r" (0UL), [eight] "r" (8UL)); 91 [zero] "r" (0UL), [eight] "r" (8UL));
diff --git a/arch/x86/mach-rdc321x/Makefile b/arch/x86/mach-rdc321x/Makefile
new file mode 100644
index 00000000000..1faac8125e3
--- /dev/null
+++ b/arch/x86/mach-rdc321x/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the RDC321x specific parts of the kernel
3#
4obj-$(CONFIG_X86_RDC321X) := gpio.o platform.o wdt.o
5
diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c
new file mode 100644
index 00000000000..031269163bd
--- /dev/null
+++ b/arch/x86/mach-rdc321x/gpio.c
@@ -0,0 +1,91 @@
1/*
2 * Copyright (C) 2007, OpenWrt.org, Florian Fainelli <florian@openwrt.org>
3 * RDC321x architecture specific GPIO support
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License as published by the
7 * Free Software Foundation; either version 2 of the License, or (at your
8 * option) any later version.
9 */
10
11#include <linux/autoconf.h>
12#include <linux/init.h>
13#include <linux/io.h>
14#include <linux/types.h>
15#include <linux/module.h>
16#include <linux/delay.h>
17
18#include <asm/mach-rdc321x/rdc321x_defs.h>
19
20static inline int rdc_gpio_is_valid(unsigned gpio)
21{
22 return (gpio <= RDC_MAX_GPIO);
23}
24
25static unsigned int rdc_gpio_read(unsigned gpio)
26{
27 unsigned int val;
28
29 val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x84:0x48));
30 outl(val, RDC3210_CFGREG_ADDR);
31 udelay(10);
32 val = inl(RDC3210_CFGREG_DATA);
33 val |= (0x1 << (gpio & 0x1F));
34 outl(val, RDC3210_CFGREG_DATA);
35 udelay(10);
36 val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x88:0x4C));
37 outl(val, RDC3210_CFGREG_ADDR);
38 udelay(10);
39 val = inl(RDC3210_CFGREG_DATA);
40
41 return val;
42}
43
44static void rdc_gpio_write(unsigned int val)
45{
46 if (val) {
47 outl(val, RDC3210_CFGREG_DATA);
48 udelay(10);
49 }
50}
51
52int rdc_gpio_get_value(unsigned gpio)
53{
54 if (rdc_gpio_is_valid(gpio))
55 return (int)rdc_gpio_read(gpio);
56 else
57 return -EINVAL;
58}
59EXPORT_SYMBOL(rdc_gpio_get_value);
60
61void rdc_gpio_set_value(unsigned gpio, int value)
62{
63 unsigned int val;
64
65 if (!rdc_gpio_is_valid(gpio))
66 return;
67
68 val = rdc_gpio_read(gpio);
69
70 if (value)
71 val &= ~(0x1 << (gpio & 0x1F));
72 else
73 val |= (0x1 << (gpio & 0x1F));
74
75 rdc_gpio_write(val);
76}
77EXPORT_SYMBOL(rdc_gpio_set_value);
78
79int rdc_gpio_direction_input(unsigned gpio)
80{
81 return 0;
82}
83EXPORT_SYMBOL(rdc_gpio_direction_input);
84
85int rdc_gpio_direction_output(unsigned gpio, int value)
86{
87 return 0;
88}
89EXPORT_SYMBOL(rdc_gpio_direction_output);
90
91
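
A hedged usage sketch for the helpers exported above; the prototypes and GPIO number are illustrative only (the real declarations come from the RDC321x mach headers). Note that rdc_gpio_get_value() hands back the raw configuration-register word, and a non-zero 'value' passed to rdc_gpio_set_value() clears the pin bit:

/* Hypothetical consumer, e.g. a board-specific module. */
extern int rdc_gpio_get_value(unsigned gpio);
extern void rdc_gpio_set_value(unsigned gpio, int value);

static void example_toggle_gpio1(void)
{
        unsigned int reg = rdc_gpio_get_value(1);

        /* Invert pin 1: pass non-zero to clear the bit, zero to set it. */
        rdc_gpio_set_value(1, !(reg & (1 << 1)));
}
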
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
new file mode 100644
index 00000000000..dda6024a586
--- /dev/null
+++ b/arch/x86/mach-rdc321x/platform.c
@@ -0,0 +1,68 @@
1/*
2 * Generic RDC321x platform devices
3 *
4 * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the
18 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 *
21 */
22
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/list.h>
26#include <linux/device.h>
27#include <linux/platform_device.h>
28#include <linux/version.h>
29#include <linux/leds.h>
30
31#include <asm/gpio.h>
32
33/* LEDS */
34static struct gpio_led default_leds[] = {
35 { .name = "rdc:dmz", .gpio = 1, },
36};
37
38static struct gpio_led_platform_data rdc321x_led_data = {
39 .num_leds = ARRAY_SIZE(default_leds),
40 .leds = default_leds,
41};
42
43static struct platform_device rdc321x_leds = {
44 .name = "leds-gpio",
45 .id = -1,
46 .dev = {
47 .platform_data = &rdc321x_led_data,
48 }
49};
50
51/* Watchdog */
52static struct platform_device rdc321x_wdt = {
53 .name = "rdc321x-wdt",
54 .id = -1,
55 .num_resources = 0,
56};
57
58static struct platform_device *rdc321x_devs[] = {
59 &rdc321x_leds,
60 &rdc321x_wdt
61};
62
63static int __init rdc_board_setup(void)
64{
65 return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs));
66}
67
68arch_initcall(rdc_board_setup);
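
The board file only registers devices; binding happens by name, so "rdc321x-wdt" here must match the platform_driver name in wdt.c below, and "leds-gpio" is claimed by the generic drivers/leds/leds-gpio driver. Extending the LED table is just a matter of adding entries (a hypothetical sketch; the extra GPIO number and trigger are illustrative, not from a board spec):

static struct gpio_led default_leds[] = {
        { .name = "rdc:dmz", .gpio = 1, },
        /* Hypothetical second LED on an assumed GPIO. */
        { .name = "rdc:power", .gpio = 2, .default_trigger = "heartbeat", },
};
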
diff --git a/arch/x86/mach-rdc321x/wdt.c b/arch/x86/mach-rdc321x/wdt.c
new file mode 100644
index 00000000000..ec5625ae706
--- /dev/null
+++ b/arch/x86/mach-rdc321x/wdt.c
@@ -0,0 +1,275 @@
1/*
2 * RDC321x watchdog driver
3 *
4 * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org>
5 *
6 * This driver is highly inspired from the cpu5_wdt driver
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/moduleparam.h>
26#include <linux/types.h>
27#include <linux/errno.h>
28#include <linux/miscdevice.h>
29#include <linux/fs.h>
30#include <linux/init.h>
31#include <linux/ioport.h>
32#include <linux/timer.h>
33#include <linux/completion.h>
34#include <linux/jiffies.h>
35#include <linux/platform_device.h>
36#include <linux/watchdog.h>
37#include <linux/io.h>
38#include <linux/uaccess.h>
39
40#include <asm/mach-rdc321x/rdc321x_defs.h>
41
42#define RDC_WDT_MASK 0x80000000 /* Mask */
43#define RDC_WDT_EN 0x00800000 /* Enable bit */
44#define RDC_WDT_WTI 0x00200000 /* Generate CPU reset/NMI/WDT on timeout */
45#define RDC_WDT_RST 0x00100000 /* Reset bit */
46#define RDC_WDT_WIF 0x00040000 /* WDT IRQ Flag */
47#define RDC_WDT_IRT 0x00000100 /* IRQ Routing table */
48#define RDC_WDT_CNT 0x00000001 /* WDT count */
49
50#define RDC_CLS_TMR 0x80003844 /* Clear timer */
51
52#define RDC_WDT_INTERVAL (HZ/10+1)
53
54int nowayout = WATCHDOG_NOWAYOUT;
55module_param(nowayout, int, 0);
56MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
57
58static int ticks = 1000;
59
60/* some device data */
61
62static struct {
63 struct completion stop;
64 volatile int running;
65 struct timer_list timer;
66 volatile int queue;
67 int default_ticks;
68 unsigned long inuse;
69} rdc321x_wdt_device;
70
71/* generic helper functions */
72
73static void rdc321x_wdt_trigger(unsigned long unused)
74{
75 if (rdc321x_wdt_device.running)
76 ticks--;
77
78 /* keep watchdog alive */
79 outl(RDC_WDT_EN|inl(RDC3210_CFGREG_DATA), RDC3210_CFGREG_DATA);
80
81 /* requeue?? */
82 if (rdc321x_wdt_device.queue && ticks)
83 mod_timer(&rdc321x_wdt_device.timer,
84 jiffies + RDC_WDT_INTERVAL);
85 else {
86 /* ticks doesn't matter anyway */
87 complete(&rdc321x_wdt_device.stop);
88 }
89
90}
91
92static void rdc321x_wdt_reset(void)
93{
94 ticks = rdc321x_wdt_device.default_ticks;
95}
96
97static void rdc321x_wdt_start(void)
98{
99 if (!rdc321x_wdt_device.queue) {
100 rdc321x_wdt_device.queue = 1;
101
102 /* Clear the timer */
103 outl(RDC_CLS_TMR, RDC3210_CFGREG_ADDR);
104
105 /* Enable watchdog and set the timeout to 81.92 us */
106 outl(RDC_WDT_EN|RDC_WDT_CNT, RDC3210_CFGREG_DATA);
107
108 mod_timer(&rdc321x_wdt_device.timer,
109 jiffies + RDC_WDT_INTERVAL);
110 }
111
112 /* if process dies, counter is not decremented */
113 rdc321x_wdt_device.running++;
114}
115
116static int rdc321x_wdt_stop(void)
117{
118 if (rdc321x_wdt_device.running)
119 rdc321x_wdt_device.running = 0;
120
121 ticks = rdc321x_wdt_device.default_ticks;
122
123 return -EIO;
124}
125
126/* filesystem operations */
127
128static int rdc321x_wdt_open(struct inode *inode, struct file *file)
129{
130 if (test_and_set_bit(0, &rdc321x_wdt_device.inuse))
131 return -EBUSY;
132
133 return nonseekable_open(inode, file);
134}
135
136static int rdc321x_wdt_release(struct inode *inode, struct file *file)
137{
138 clear_bit(0, &rdc321x_wdt_device.inuse);
139 return 0;
140}
141
142static int rdc321x_wdt_ioctl(struct inode *inode, struct file *file,
143 unsigned int cmd, unsigned long arg)
144{
145 void __user *argp = (void __user *)arg;
146 unsigned int value;
147 static struct watchdog_info ident = {
148 .options = WDIOF_CARDRESET,
149 .identity = "RDC321x WDT",
150 };
151
152 switch (cmd) {
153 case WDIOC_KEEPALIVE:
154 rdc321x_wdt_reset();
155 break;
156 case WDIOC_GETSTATUS:
157 /* Read the value from the DATA register */
158 value = inl(RDC3210_CFGREG_DATA);
159 if (copy_to_user(argp, &value, sizeof(int)))
160 return -EFAULT;
161 break;
162 case WDIOC_GETSUPPORT:
163 if (copy_to_user(argp, &ident, sizeof(ident)))
164 return -EFAULT;
165 break;
166 case WDIOC_SETOPTIONS:
167 if (copy_from_user(&value, argp, sizeof(int)))
168 return -EFAULT;
169 switch (value) {
170 case WDIOS_ENABLECARD:
171 rdc321x_wdt_start();
172 break;
173 case WDIOS_DISABLECARD:
174 return rdc321x_wdt_stop();
175 default:
176 return -EINVAL;
177 }
178 break;
179 default:
180 return -ENOTTY;
181 }
182 return 0;
183}
184
185static ssize_t rdc321x_wdt_write(struct file *file, const char __user *buf,
186 size_t count, loff_t *ppos)
187{
188 if (!count)
189 return -EIO;
190
191 rdc321x_wdt_reset();
192
193 return count;
194}
195
196static const struct file_operations rdc321x_wdt_fops = {
197 .owner = THIS_MODULE,
198 .llseek = no_llseek,
199 .ioctl = rdc321x_wdt_ioctl,
200 .open = rdc321x_wdt_open,
201 .write = rdc321x_wdt_write,
202 .release = rdc321x_wdt_release,
203};
204
205static struct miscdevice rdc321x_wdt_misc = {
206 .minor = WATCHDOG_MINOR,
207 .name = "watchdog",
208 .fops = &rdc321x_wdt_fops,
209};
210
211static int __devinit rdc321x_wdt_probe(struct platform_device *pdev)
212{
213 int err;
214
215 err = misc_register(&rdc321x_wdt_misc);
216 if (err < 0) {
217 printk(KERN_ERR PFX "watchdog misc_register failed\n");
218 return err;
219 }
220
221 /* Reset the watchdog */
222 outl(RDC_WDT_RST, RDC3210_CFGREG_DATA);
223
224 init_completion(&rdc321x_wdt_device.stop);
225 rdc321x_wdt_device.queue = 0;
226
227 clear_bit(0, &rdc321x_wdt_device.inuse);
228
229 setup_timer(&rdc321x_wdt_device.timer, rdc321x_wdt_trigger, 0);
230
231 rdc321x_wdt_device.default_ticks = ticks;
232
233 printk(KERN_INFO PFX "watchdog init success\n");
234
235 return 0;
236}
237
238static int rdc321x_wdt_remove(struct platform_device *pdev)
239{
240 if (rdc321x_wdt_device.queue) {
241 rdc321x_wdt_device.queue = 0;
242 wait_for_completion(&rdc321x_wdt_device.stop);
243 }
244
245 misc_deregister(&rdc321x_wdt_misc);
246
247 return 0;
248}
249
250static struct platform_driver rdc321x_wdt_driver = {
251 .probe = rdc321x_wdt_probe,
252 .remove = rdc321x_wdt_remove,
253 .driver = {
254 .owner = THIS_MODULE,
255 .name = "rdc321x-wdt",
256 },
257};
258
259static int __init rdc321x_wdt_init(void)
260{
261 return platform_driver_register(&rdc321x_wdt_driver);
262}
263
264static void __exit rdc321x_wdt_exit(void)
265{
266 platform_driver_unregister(&rdc321x_wdt_driver);
267}
268
269module_init(rdc321x_wdt_init);
270module_exit(rdc321x_wdt_exit);
271
272MODULE_AUTHOR("Florian Fainelli <florian@openwrt.org>");
273MODULE_DESCRIPTION("RDC321x watchdog driver");
274MODULE_LICENSE("GPL");
275MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
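
From userspace the device behaves like any watchdog character device: a write() or WDIOC_KEEPALIVE resets the tick count, WDIOS_ENABLECARD arms the hardware, and WDIOS_DISABLECARD stops the software side (as written, rdc321x_wdt_stop() always returns -EIO). A minimal, hypothetical ping loop:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/watchdog.h>

int main(void)
{
        int fd = open("/dev/watchdog", O_WRONLY);
        int enable = WDIOS_ENABLECARD;

        if (fd < 0)
                return 1;

        ioctl(fd, WDIOC_SETOPTIONS, &enable);   /* arm the watchdog */

        for (;;) {
                ioctl(fd, WDIOC_KEEPALIVE, 0);  /* or: write(fd, "p", 1); */
                sleep(10);
        }
}
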
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
index f3c74fab8b9..2a8456a1f44 100644
--- a/arch/x86/mach-visws/mpparse.c
+++ b/arch/x86/mach-visws/mpparse.c
@@ -36,19 +36,19 @@ unsigned int __initdata maxcpus = NR_CPUS;
36 36
37static void __init MP_processor_info (struct mpc_config_processor *m) 37static void __init MP_processor_info (struct mpc_config_processor *m)
38{ 38{
39 int ver, logical_apicid; 39 int ver, logical_apicid;
40 physid_mask_t apic_cpus; 40 physid_mask_t apic_cpus;
41 41
42 if (!(m->mpc_cpuflag & CPU_ENABLED)) 42 if (!(m->mpc_cpuflag & CPU_ENABLED))
43 return; 43 return;
44 44
45 logical_apicid = m->mpc_apicid; 45 logical_apicid = m->mpc_apicid;
46 printk(KERN_INFO "%sCPU #%d %ld:%ld APIC version %d\n", 46 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
47 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", 47 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
48 m->mpc_apicid, 48 m->mpc_apicid,
49 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, 49 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
50 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, 50 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
51 m->mpc_apicver); 51 m->mpc_apicver);
52 52
53 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) 53 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
54 boot_cpu_physical_apicid = m->mpc_apicid; 54 boot_cpu_physical_apicid = m->mpc_apicid;
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index 3bef977cb29..5ae5466b9eb 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -37,14 +37,14 @@ void __init pre_setup_arch_hook(void)
37{ 37{
38 /* Voyagers run their CPUs from independent clocks, so disable 38 /* Voyagers run their CPUs from independent clocks, so disable
39 * the TSC code because we can't sync them */ 39 * the TSC code because we can't sync them */
40 tsc_disable = 1; 40 setup_clear_cpu_cap(X86_FEATURE_TSC);
41} 41}
42 42
43void __init trap_init_hook(void) 43void __init trap_init_hook(void)
44{ 44{
45} 45}
46 46
47static struct irqaction irq0 = { 47static struct irqaction irq0 = {
48 .handler = timer_interrupt, 48 .handler = timer_interrupt,
49 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, 49 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
50 .mask = CPU_MASK_NONE, 50 .mask = CPU_MASK_NONE,
@@ -59,44 +59,47 @@ void __init time_init_hook(void)
59 59
60/* Hook for machine specific memory setup. */ 60/* Hook for machine specific memory setup. */
61 61
62char * __init machine_specific_memory_setup(void) 62char *__init machine_specific_memory_setup(void)
63{ 63{
64 char *who; 64 char *who;
65 65
66 who = "NOT VOYAGER"; 66 who = "NOT VOYAGER";
67 67
68 if(voyager_level == 5) { 68 if (voyager_level == 5) {
69 __u32 addr, length; 69 __u32 addr, length;
70 int i; 70 int i;
71 71
72 who = "Voyager-SUS"; 72 who = "Voyager-SUS";
73 73
74 e820.nr_map = 0; 74 e820.nr_map = 0;
75 for(i=0; voyager_memory_detect(i, &addr, &length); i++) { 75 for (i = 0; voyager_memory_detect(i, &addr, &length); i++) {
76 add_memory_region(addr, length, E820_RAM); 76 add_memory_region(addr, length, E820_RAM);
77 } 77 }
78 return who; 78 return who;
79 } else if(voyager_level == 4) { 79 } else if (voyager_level == 4) {
80 __u32 tom; 80 __u32 tom;
81 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8; 81 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8;
82 /* select the DINO config space */ 82 /* select the DINO config space */
83 outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT); 83 outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT);
84 /* Read DINO top of memory register */ 84 /* Read DINO top of memory register */
85 tom = ((inb(catbase + 0x4) & 0xf0) << 16) 85 tom = ((inb(catbase + 0x4) & 0xf0) << 16)
86 + ((inb(catbase + 0x5) & 0x7f) << 24); 86 + ((inb(catbase + 0x5) & 0x7f) << 24);
87 87
88 if(inb(catbase) != VOYAGER_DINO) { 88 if (inb(catbase) != VOYAGER_DINO) {
89 printk(KERN_ERR "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n"); 89 printk(KERN_ERR
90 tom = (boot_params.screen_info.ext_mem_k)<<10; 90 "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n");
91 tom = (boot_params.screen_info.ext_mem_k) << 10;
91 } 92 }
92 who = "Voyager-TOM"; 93 who = "Voyager-TOM";
93 add_memory_region(0, 0x9f000, E820_RAM); 94 add_memory_region(0, 0x9f000, E820_RAM);
94 /* map from 1M to top of memory */ 95 /* map from 1M to top of memory */
95 add_memory_region(1*1024*1024, tom - 1*1024*1024, E820_RAM); 96 add_memory_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024,
97 E820_RAM);
96 /* FIXME: Should check the ASICs to see if I need to 98 /* FIXME: Should check the ASICs to see if I need to
97 * take out the 8M window. Just do it at the moment 99 * take out the 8M window. Just do it at the moment
98 * */ 100 * */
99 add_memory_region(8*1024*1024, 8*1024*1024, E820_RESERVED); 101 add_memory_region(8 * 1024 * 1024, 8 * 1024 * 1024,
102 E820_RESERVED);
100 return who; 103 return who;
101 } 104 }
102 105
@@ -114,8 +117,7 @@ char * __init machine_specific_memory_setup(void)
114 unsigned long mem_size; 117 unsigned long mem_size;
115 118
116 /* compare results from other methods and take the greater */ 119 /* compare results from other methods and take the greater */
117 if (boot_params.alt_mem_k 120 if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
118 < boot_params.screen_info.ext_mem_k) {
119 mem_size = boot_params.screen_info.ext_mem_k; 121 mem_size = boot_params.screen_info.ext_mem_k;
120 who = "BIOS-88"; 122 who = "BIOS-88";
121 } else { 123 } else {
@@ -126,6 +128,6 @@ char * __init machine_specific_memory_setup(void)
126 e820.nr_map = 0; 128 e820.nr_map = 0;
127 add_memory_region(0, LOWMEMSIZE(), E820_RAM); 129 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
128 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); 130 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
129 } 131 }
130 return who; 132 return who;
131} 133}
diff --git a/arch/x86/mach-voyager/voyager_basic.c b/arch/x86/mach-voyager/voyager_basic.c
index 9b77b39b71a..6a949e4edde 100644
--- a/arch/x86/mach-voyager/voyager_basic.c
+++ b/arch/x86/mach-voyager/voyager_basic.c
@@ -35,7 +35,7 @@
35/* 35/*
36 * Power off function, if any 36 * Power off function, if any
37 */ 37 */
38void (*pm_power_off)(void); 38void (*pm_power_off) (void);
39EXPORT_SYMBOL(pm_power_off); 39EXPORT_SYMBOL(pm_power_off);
40 40
41int voyager_level = 0; 41int voyager_level = 0;
@@ -43,39 +43,38 @@ int voyager_level = 0;
43struct voyager_SUS *voyager_SUS = NULL; 43struct voyager_SUS *voyager_SUS = NULL;
44 44
45#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
46static void 46static void voyager_dump(int dummy1, struct tty_struct *dummy3)
47voyager_dump(int dummy1, struct tty_struct *dummy3)
48{ 47{
49 /* get here via a sysrq */ 48 /* get here via a sysrq */
50 voyager_smp_dump(); 49 voyager_smp_dump();
51} 50}
52 51
53static struct sysrq_key_op sysrq_voyager_dump_op = { 52static struct sysrq_key_op sysrq_voyager_dump_op = {
54 .handler = voyager_dump, 53 .handler = voyager_dump,
55 .help_msg = "Voyager", 54 .help_msg = "Voyager",
56 .action_msg = "Dump Voyager Status", 55 .action_msg = "Dump Voyager Status",
57}; 56};
58#endif 57#endif
59 58
60void 59void voyager_detect(struct voyager_bios_info *bios)
61voyager_detect(struct voyager_bios_info *bios)
62{ 60{
63 if(bios->len != 0xff) { 61 if (bios->len != 0xff) {
64 int class = (bios->class_1 << 8) 62 int class = (bios->class_1 << 8)
65 | (bios->class_2 & 0xff); 63 | (bios->class_2 & 0xff);
66 64
67 printk("Voyager System detected.\n" 65 printk("Voyager System detected.\n"
68 " Class %x, Revision %d.%d\n", 66 " Class %x, Revision %d.%d\n",
69 class, bios->major, bios->minor); 67 class, bios->major, bios->minor);
70 if(class == VOYAGER_LEVEL4) 68 if (class == VOYAGER_LEVEL4)
71 voyager_level = 4; 69 voyager_level = 4;
72 else if(class < VOYAGER_LEVEL5_AND_ABOVE) 70 else if (class < VOYAGER_LEVEL5_AND_ABOVE)
73 voyager_level = 3; 71 voyager_level = 3;
74 else 72 else
75 voyager_level = 5; 73 voyager_level = 5;
76 printk(" Architecture Level %d\n", voyager_level); 74 printk(" Architecture Level %d\n", voyager_level);
77 if(voyager_level < 4) 75 if (voyager_level < 4)
78 printk("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n"); 76 printk
77 ("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n");
79 /* install the power off handler */ 78 /* install the power off handler */
80 pm_power_off = voyager_power_off; 79 pm_power_off = voyager_power_off;
81#ifdef CONFIG_SMP 80#ifdef CONFIG_SMP
@@ -86,15 +85,13 @@ voyager_detect(struct voyager_bios_info *bios)
86 } 85 }
87} 86}
88 87
89void 88void voyager_system_interrupt(int cpl, void *dev_id)
90voyager_system_interrupt(int cpl, void *dev_id)
91{ 89{
92 printk("Voyager: detected system interrupt\n"); 90 printk("Voyager: detected system interrupt\n");
93} 91}
94 92
95/* Routine to read information from the extended CMOS area */ 93/* Routine to read information from the extended CMOS area */
96__u8 94__u8 voyager_extended_cmos_read(__u16 addr)
97voyager_extended_cmos_read(__u16 addr)
98{ 95{
99 outb(addr & 0xff, 0x74); 96 outb(addr & 0xff, 0x74);
100 outb((addr >> 8) & 0xff, 0x75); 97 outb((addr >> 8) & 0xff, 0x75);
@@ -108,12 +105,11 @@ voyager_extended_cmos_read(__u16 addr)
108 105
109typedef struct ClickMap { 106typedef struct ClickMap {
110 struct Entry { 107 struct Entry {
111 __u32 Address; 108 __u32 Address;
112 __u32 Length; 109 __u32 Length;
113 } Entry[CLICK_ENTRIES]; 110 } Entry[CLICK_ENTRIES];
114} ClickMap_t; 111} ClickMap_t;
115 112
116
117/* This routine is pretty much an awful hack to read the bios clickmap by 113/* This routine is pretty much an awful hack to read the bios clickmap by
118 * mapping it into page 0. There are usually three regions in the map: 114 * mapping it into page 0. There are usually three regions in the map:
119 * Base Memory 115 * Base Memory
@@ -122,8 +118,7 @@ typedef struct ClickMap {
122 * 118 *
123 * Returns are 0 for failure and 1 for success on extracting region. 119 * Returns are 0 for failure and 1 for success on extracting region.
124 */ 120 */
125int __init 121int __init voyager_memory_detect(int region, __u32 * start, __u32 * length)
126voyager_memory_detect(int region, __u32 *start, __u32 *length)
127{ 122{
128 int i; 123 int i;
129 int retval = 0; 124 int retval = 0;
@@ -132,13 +127,14 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length)
132 unsigned long map_addr; 127 unsigned long map_addr;
133 unsigned long old; 128 unsigned long old;
134 129
135 if(region >= CLICK_ENTRIES) { 130 if (region >= CLICK_ENTRIES) {
136 printk("Voyager: Illegal ClickMap region %d\n", region); 131 printk("Voyager: Illegal ClickMap region %d\n", region);
137 return 0; 132 return 0;
138 } 133 }
139 134
140 for(i = 0; i < sizeof(cmos); i++) 135 for (i = 0; i < sizeof(cmos); i++)
141 cmos[i] = voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i); 136 cmos[i] =
137 voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i);
142 138
143 map_addr = *(unsigned long *)cmos; 139 map_addr = *(unsigned long *)cmos;
144 140
@@ -147,10 +143,10 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length)
147 pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); 143 pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
148 local_flush_tlb(); 144 local_flush_tlb();
149 /* now clear everything out but page 0 */ 145 /* now clear everything out but page 0 */
150 map = (ClickMap_t *)(map_addr & (~PAGE_MASK)); 146 map = (ClickMap_t *) (map_addr & (~PAGE_MASK));
151 147
152 /* zero length is the end of the clickmap */ 148 /* zero length is the end of the clickmap */
153 if(map->Entry[region].Length != 0) { 149 if (map->Entry[region].Length != 0) {
154 *length = map->Entry[region].Length * CLICK_SIZE; 150 *length = map->Entry[region].Length * CLICK_SIZE;
155 *start = map->Entry[region].Address; 151 *start = map->Entry[region].Address;
156 retval = 1; 152 retval = 1;
@@ -165,10 +161,9 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length)
165/* voyager specific handling code for timer interrupts. Used to hand 161/* voyager specific handling code for timer interrupts. Used to hand
166 * off the timer tick to the SMP code, since the VIC doesn't have an 162 * off the timer tick to the SMP code, since the VIC doesn't have an
167 * internal timer (The QIC does, but that's another story). */ 163 * internal timer (The QIC does, but that's another story). */
168void 164void voyager_timer_interrupt(void)
169voyager_timer_interrupt(void)
170{ 165{
171 if((jiffies & 0x3ff) == 0) { 166 if ((jiffies & 0x3ff) == 0) {
172 167
173 /* There seems to be something flaky in either 168 /* There seems to be something flaky in either
174 * hardware or software that is resetting the timer 0 169 * hardware or software that is resetting the timer 0
@@ -186,18 +181,20 @@ voyager_timer_interrupt(void)
186 __u16 val; 181 __u16 val;
187 182
188 spin_lock(&i8253_lock); 183 spin_lock(&i8253_lock);
189 184
190 outb_p(0x00, 0x43); 185 outb_p(0x00, 0x43);
191 val = inb_p(0x40); 186 val = inb_p(0x40);
192 val |= inb(0x40) << 8; 187 val |= inb(0x40) << 8;
193 spin_unlock(&i8253_lock); 188 spin_unlock(&i8253_lock);
194 189
195 if(val > LATCH) { 190 if (val > LATCH) {
196 printk("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", val); 191 printk
192 ("\nVOYAGER: countdown timer value too high (%d), resetting\n\n",
193 val);
197 spin_lock(&i8253_lock); 194 spin_lock(&i8253_lock);
198 outb(0x34,0x43); 195 outb(0x34, 0x43);
199 outb_p(LATCH & 0xff , 0x40); /* LSB */ 196 outb_p(LATCH & 0xff, 0x40); /* LSB */
200 outb(LATCH >> 8 , 0x40); /* MSB */ 197 outb(LATCH >> 8, 0x40); /* MSB */
201 spin_unlock(&i8253_lock); 198 spin_unlock(&i8253_lock);
202 } 199 }
203 } 200 }
@@ -206,14 +203,13 @@ voyager_timer_interrupt(void)
206#endif 203#endif
207} 204}
208 205
209void 206void voyager_power_off(void)
210voyager_power_off(void)
211{ 207{
212 printk("VOYAGER Power Off\n"); 208 printk("VOYAGER Power Off\n");
213 209
214 if(voyager_level == 5) { 210 if (voyager_level == 5) {
215 voyager_cat_power_off(); 211 voyager_cat_power_off();
216 } else if(voyager_level == 4) { 212 } else if (voyager_level == 4) {
217 /* This doesn't apparently work on most L4 machines, 213 /* This doesn't apparently work on most L4 machines,
218 * but the specs say to do this to get automatic power 214 * but the specs say to do this to get automatic power
219 * off. Unfortunately, if it doesn't power off the 215 * off. Unfortunately, if it doesn't power off the
@@ -222,10 +218,8 @@ voyager_power_off(void)
222#if 0 218#if 0
223 int port; 219 int port;
224 220
225
226 /* enable the voyager Configuration Space */ 221 /* enable the voyager Configuration Space */
227 outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, 222 outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, VOYAGER_MC_SETUP);
228 VOYAGER_MC_SETUP);
229 /* the port for the power off flag is an offset from the 223 /* the port for the power off flag is an offset from the
230 floating base */ 224 floating base */
231 port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21; 225 port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21;
@@ -235,62 +229,57 @@ voyager_power_off(void)
235 } 229 }
236 /* and wait for it to happen */ 230 /* and wait for it to happen */
237 local_irq_disable(); 231 local_irq_disable();
238 for(;;) 232 for (;;)
239 halt(); 233 halt();
240} 234}
241 235
242/* copied from process.c */ 236/* copied from process.c */
243static inline void 237static inline void kb_wait(void)
244kb_wait(void)
245{ 238{
246 int i; 239 int i;
247 240
248 for (i=0; i<0x10000; i++) 241 for (i = 0; i < 0x10000; i++)
249 if ((inb_p(0x64) & 0x02) == 0) 242 if ((inb_p(0x64) & 0x02) == 0)
250 break; 243 break;
251} 244}
252 245
253void 246void machine_shutdown(void)
254machine_shutdown(void)
255{ 247{
256 /* Architecture specific shutdown needed before a kexec */ 248 /* Architecture specific shutdown needed before a kexec */
257} 249}
258 250
259void 251void machine_restart(char *cmd)
260machine_restart(char *cmd)
261{ 252{
262 printk("Voyager Warm Restart\n"); 253 printk("Voyager Warm Restart\n");
263 kb_wait(); 254 kb_wait();
264 255
265 if(voyager_level == 5) { 256 if (voyager_level == 5) {
266 /* write magic values to the RTC to inform system that 257 /* write magic values to the RTC to inform system that
267 * shutdown is beginning */ 258 * shutdown is beginning */
268 outb(0x8f, 0x70); 259 outb(0x8f, 0x70);
269 outb(0x5 , 0x71); 260 outb(0x5, 0x71);
270 261
271 udelay(50); 262 udelay(50);
272 outb(0xfe,0x64); /* pull reset low */ 263 outb(0xfe, 0x64); /* pull reset low */
273 } else if(voyager_level == 4) { 264 } else if (voyager_level == 4) {
274 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8; 265 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8;
275 __u8 basebd = inb(VOYAGER_MC_SETUP); 266 __u8 basebd = inb(VOYAGER_MC_SETUP);
276 267
277 outb(basebd | 0x08, VOYAGER_MC_SETUP); 268 outb(basebd | 0x08, VOYAGER_MC_SETUP);
278 outb(0x02, catbase + 0x21); 269 outb(0x02, catbase + 0x21);
279 } 270 }
280 local_irq_disable(); 271 local_irq_disable();
281 for(;;) 272 for (;;)
282 halt(); 273 halt();
283} 274}
284 275
285void 276void machine_emergency_restart(void)
286machine_emergency_restart(void)
287{ 277{
288 /*for now, just hook this to a warm restart */ 278 /*for now, just hook this to a warm restart */
289 machine_restart(NULL); 279 machine_restart(NULL);
290} 280}
291 281
292void 282void mca_nmi_hook(void)
293mca_nmi_hook(void)
294{ 283{
295 __u8 dumpval __maybe_unused = inb(0xf823); 284 __u8 dumpval __maybe_unused = inb(0xf823);
296 __u8 swnmi __maybe_unused = inb(0xf813); 285 __u8 swnmi __maybe_unused = inb(0xf813);
@@ -301,8 +290,8 @@ mca_nmi_hook(void)
301 /* clear swnmi */ 290 /* clear swnmi */
302 outb(0xff, 0xf813); 291 outb(0xff, 0xf813);
303 /* tell SUS to ignore dump */ 292 /* tell SUS to ignore dump */
304 if(voyager_level == 5 && voyager_SUS != NULL) { 293 if (voyager_level == 5 && voyager_SUS != NULL) {
305 if(voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) { 294 if (voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) {
306 voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND; 295 voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND;
307 voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS; 296 voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS;
308 udelay(1000); 297 udelay(1000);
@@ -310,15 +299,14 @@ mca_nmi_hook(void)
310 voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS; 299 voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS;
311 } 300 }
312 } 301 }
313 printk(KERN_ERR "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", smp_processor_id()); 302 printk(KERN_ERR
303 "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n",
304 smp_processor_id());
314 show_stack(NULL, NULL); 305 show_stack(NULL, NULL);
315 show_state(); 306 show_state();
316} 307}
317 308
318 309void machine_halt(void)
319
320void
321machine_halt(void)
322{ 310{
323 /* treat a halt like a power off */ 311 /* treat a halt like a power off */
324 machine_power_off(); 312 machine_power_off();
diff --git a/arch/x86/mach-voyager/voyager_cat.c b/arch/x86/mach-voyager/voyager_cat.c
index 2132ca652df..17a7904f75b 100644
--- a/arch/x86/mach-voyager/voyager_cat.c
+++ b/arch/x86/mach-voyager/voyager_cat.c
@@ -39,34 +39,32 @@
39#define CAT_DATA (sspb + 0xd) 39#define CAT_DATA (sspb + 0xd)
40 40
41/* the internal cat functions */ 41/* the internal cat functions */
42static void cat_pack(__u8 *msg, __u16 start_bit, __u8 *data, 42static void cat_pack(__u8 * msg, __u16 start_bit, __u8 * data, __u16 num_bits);
43 __u16 num_bits); 43static void cat_unpack(__u8 * msg, __u16 start_bit, __u8 * data,
44static void cat_unpack(__u8 *msg, __u16 start_bit, __u8 *data,
45 __u16 num_bits); 44 __u16 num_bits);
46static void cat_build_header(__u8 *header, const __u16 len, 45static void cat_build_header(__u8 * header, const __u16 len,
47 const __u16 smallest_reg_bits, 46 const __u16 smallest_reg_bits,
48 const __u16 longest_reg_bits); 47 const __u16 longest_reg_bits);
49static int cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, 48static int cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp,
50 __u8 reg, __u8 op); 49 __u8 reg, __u8 op);
51static int cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, 50static int cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp,
52 __u8 reg, __u8 *value); 51 __u8 reg, __u8 * value);
53static int cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, 52static int cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes,
54 __u8 pad_bits); 53 __u8 pad_bits);
55static int cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 54static int cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
56 __u8 value); 55 __u8 value);
57static int cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 56static int cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
58 __u8 *value); 57 __u8 * value);
59static int cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, 58static int cat_subread(voyager_module_t * modp, voyager_asic_t * asicp,
60 __u16 offset, __u16 len, void *buf); 59 __u16 offset, __u16 len, void *buf);
61static int cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, 60static int cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp,
62 __u8 reg, __u8 value); 61 __u8 reg, __u8 value);
63static int cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp); 62static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp);
64static int cat_connect(voyager_module_t *modp, voyager_asic_t *asicp); 63static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp);
65 64
66static inline const char * 65static inline const char *cat_module_name(int module_id)
67cat_module_name(int module_id)
68{ 66{
69 switch(module_id) { 67 switch (module_id) {
70 case 0x10: 68 case 0x10:
71 return "Processor Slot 0"; 69 return "Processor Slot 0";
72 case 0x11: 70 case 0x11:
@@ -105,14 +103,14 @@ voyager_module_t *voyager_cat_list;
105 103
106/* the I/O port assignments for the VIC and QIC */ 104/* the I/O port assignments for the VIC and QIC */
107static struct resource vic_res = { 105static struct resource vic_res = {
108 .name = "Voyager Interrupt Controller", 106 .name = "Voyager Interrupt Controller",
109 .start = 0xFC00, 107 .start = 0xFC00,
110 .end = 0xFC6F 108 .end = 0xFC6F
111}; 109};
112static struct resource qic_res = { 110static struct resource qic_res = {
113 .name = "Quad Interrupt Controller", 111 .name = "Quad Interrupt Controller",
114 .start = 0xFC70, 112 .start = 0xFC70,
115 .end = 0xFCFF 113 .end = 0xFCFF
116}; 114};
117 115
118/* This function is used to pack a data bit stream inside a message. 116/* This function is used to pack a data bit stream inside a message.
@@ -120,7 +118,7 @@ static struct resource qic_res = {
120 * Note: This function assumes that any unused bit in the data stream 118 * Note: This function assumes that any unused bit in the data stream
121 * is set to zero so that the ors will work correctly */ 119 * is set to zero so that the ors will work correctly */
122static void 120static void
123cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) 121cat_pack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits)
124{ 122{
125 /* compute initial shift needed */ 123 /* compute initial shift needed */
126 const __u16 offset = start_bit % BITS_PER_BYTE; 124 const __u16 offset = start_bit % BITS_PER_BYTE;
@@ -130,7 +128,7 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
130 int i; 128 int i;
131 129
132 /* adjust if we have more than a byte of residue */ 130 /* adjust if we have more than a byte of residue */
133 if(residue >= BITS_PER_BYTE) { 131 if (residue >= BITS_PER_BYTE) {
134 residue -= BITS_PER_BYTE; 132 residue -= BITS_PER_BYTE;
135 len++; 133 len++;
136 } 134 }
@@ -138,24 +136,25 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
138 /* clear out the bits. We assume here that if len==0 then 136 /* clear out the bits. We assume here that if len==0 then
139 * residue >= offset. This is always true for the catbus 137 * residue >= offset. This is always true for the catbus
140 * operations */ 138 * operations */
141 msg[byte] &= 0xff << (BITS_PER_BYTE - offset); 139 msg[byte] &= 0xff << (BITS_PER_BYTE - offset);
142 msg[byte++] |= data[0] >> offset; 140 msg[byte++] |= data[0] >> offset;
143 if(len == 0) 141 if (len == 0)
144 return; 142 return;
145 for(i = 1; i < len; i++) 143 for (i = 1; i < len; i++)
146 msg[byte++] = (data[i-1] << (BITS_PER_BYTE - offset)) 144 msg[byte++] = (data[i - 1] << (BITS_PER_BYTE - offset))
147 | (data[i] >> offset); 145 | (data[i] >> offset);
148 if(residue != 0) { 146 if (residue != 0) {
149 __u8 mask = 0xff >> residue; 147 __u8 mask = 0xff >> residue;
150 __u8 last_byte = data[i-1] << (BITS_PER_BYTE - offset) 148 __u8 last_byte = data[i - 1] << (BITS_PER_BYTE - offset)
151 | (data[i] >> offset); 149 | (data[i] >> offset);
152 150
153 last_byte &= ~mask; 151 last_byte &= ~mask;
154 msg[byte] &= mask; 152 msg[byte] &= mask;
155 msg[byte] |= last_byte; 153 msg[byte] |= last_byte;
156 } 154 }
157 return; 155 return;
158} 156}
157
159/* unpack the data again (same arguments as cat_pack()). data buffer 158/* unpack the data again (same arguments as cat_pack()). data buffer
160 * must be zero populated. 159 * must be zero populated.
161 * 160 *
@@ -163,7 +162,7 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
163 * data (starting at bit 0 in data). 162 * data (starting at bit 0 in data).
164 */ 163 */
165static void 164static void
166cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) 165cat_unpack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits)
167{ 166{
168 /* compute initial shift needed */ 167 /* compute initial shift needed */
169 const __u16 offset = start_bit % BITS_PER_BYTE; 168 const __u16 offset = start_bit % BITS_PER_BYTE;
@@ -172,97 +171,97 @@ cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
172 __u16 byte = start_bit / BITS_PER_BYTE; 171 __u16 byte = start_bit / BITS_PER_BYTE;
173 int i; 172 int i;
174 173
175 if(last_bits != 0) 174 if (last_bits != 0)
176 len++; 175 len++;
177 176
178 /* special case: want < 8 bits from msg and we can get it from 177 /* special case: want < 8 bits from msg and we can get it from
179 * a single byte of the msg */ 178 * a single byte of the msg */
180 if(len == 0 && BITS_PER_BYTE - offset >= num_bits) { 179 if (len == 0 && BITS_PER_BYTE - offset >= num_bits) {
181 data[0] = msg[byte] << offset; 180 data[0] = msg[byte] << offset;
182 data[0] &= 0xff >> (BITS_PER_BYTE - num_bits); 181 data[0] &= 0xff >> (BITS_PER_BYTE - num_bits);
183 return; 182 return;
184 } 183 }
185 for(i = 0; i < len; i++) { 184 for (i = 0; i < len; i++) {
186 /* this annoying if has to be done just in case a read of 185 /* this annoying if has to be done just in case a read of
187 * msg one beyond the array causes a panic */ 186 * msg one beyond the array causes a panic */
188 if(offset != 0) { 187 if (offset != 0) {
189 data[i] = msg[byte++] << offset; 188 data[i] = msg[byte++] << offset;
190 data[i] |= msg[byte] >> (BITS_PER_BYTE - offset); 189 data[i] |= msg[byte] >> (BITS_PER_BYTE - offset);
191 } 190 } else {
192 else {
193 data[i] = msg[byte++]; 191 data[i] = msg[byte++];
194 } 192 }
195 } 193 }
196 /* do we need to truncate the final byte */ 194 /* do we need to truncate the final byte */
197 if(last_bits != 0) { 195 if (last_bits != 0) {
198 data[i-1] &= 0xff << (BITS_PER_BYTE - last_bits); 196 data[i - 1] &= 0xff << (BITS_PER_BYTE - last_bits);
199 } 197 }
200 return; 198 return;
201} 199}
202 200
203static void 201static void
204cat_build_header(__u8 *header, const __u16 len, const __u16 smallest_reg_bits, 202cat_build_header(__u8 * header, const __u16 len, const __u16 smallest_reg_bits,
205 const __u16 longest_reg_bits) 203 const __u16 longest_reg_bits)
206{ 204{
207 int i; 205 int i;
208 __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE; 206 __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE;
209 __u8 *last_byte = &header[len - 1]; 207 __u8 *last_byte = &header[len - 1];
210 208
211 if(start_bit == 0) 209 if (start_bit == 0)
212 start_bit = 1; /* must have at least one bit in the hdr */ 210 start_bit = 1; /* must have at least one bit in the hdr */
213 211
214 for(i=0; i < len; i++) 212 for (i = 0; i < len; i++)
215 header[i] = 0; 213 header[i] = 0;
216 214
217 for(i = start_bit; i > 0; i--) 215 for (i = start_bit; i > 0; i--)
218 *last_byte = ((*last_byte) << 1) + 1; 216 *last_byte = ((*last_byte) << 1) + 1;
219 217
220} 218}
221 219
222static int 220static int
223cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op) 221cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 op)
224{ 222{
225 __u8 parity, inst, inst_buf[4] = { 0 }; 223 __u8 parity, inst, inst_buf[4] = { 0 };
226 __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE]; 224 __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE];
227 __u16 ibytes, hbytes, padbits; 225 __u16 ibytes, hbytes, padbits;
228 int i; 226 int i;
229 227
230 /* 228 /*
231 * Parity is the parity of the register number + 1 (READ_REGISTER 229 * Parity is the parity of the register number + 1 (READ_REGISTER
232 * and WRITE_REGISTER always add '1' to the number of bits == 1) 230 * and WRITE_REGISTER always add '1' to the number of bits == 1)
233 */ 231 */
234 parity = (__u8)(1 + (reg & 0x01) + 232 parity = (__u8) (1 + (reg & 0x01) +
235 ((__u8)(reg & 0x02) >> 1) + 233 ((__u8) (reg & 0x02) >> 1) +
236 ((__u8)(reg & 0x04) >> 2) + 234 ((__u8) (reg & 0x04) >> 2) +
237 ((__u8)(reg & 0x08) >> 3)) % 2; 235 ((__u8) (reg & 0x08) >> 3)) % 2;
238 236
239 inst = ((parity << 7) | (reg << 2) | op); 237 inst = ((parity << 7) | (reg << 2) | op);
240 238
241 outb(VOYAGER_CAT_IRCYC, CAT_CMD); 239 outb(VOYAGER_CAT_IRCYC, CAT_CMD);
242 if(!modp->scan_path_connected) { 240 if (!modp->scan_path_connected) {
243 if(asicp->asic_id != VOYAGER_CAT_ID) { 241 if (asicp->asic_id != VOYAGER_CAT_ID) {
244 printk("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n"); 242 printk
243 ("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n");
245 return 1; 244 return 1;
246 } 245 }
247 outb(VOYAGER_CAT_HEADER, CAT_DATA); 246 outb(VOYAGER_CAT_HEADER, CAT_DATA);
248 outb(inst, CAT_DATA); 247 outb(inst, CAT_DATA);
249 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { 248 if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
250 CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n")); 249 CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n"));
251 return 1; 250 return 1;
252 } 251 }
253 return 0; 252 return 0;
254 } 253 }
255 ibytes = modp->inst_bits / BITS_PER_BYTE; 254 ibytes = modp->inst_bits / BITS_PER_BYTE;
256 if((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) { 255 if ((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) {
257 padbits = BITS_PER_BYTE - padbits; 256 padbits = BITS_PER_BYTE - padbits;
258 ibytes++; 257 ibytes++;
259 } 258 }
260 hbytes = modp->largest_reg / BITS_PER_BYTE; 259 hbytes = modp->largest_reg / BITS_PER_BYTE;
261 if(modp->largest_reg % BITS_PER_BYTE) 260 if (modp->largest_reg % BITS_PER_BYTE)
262 hbytes++; 261 hbytes++;
263 CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes)); 262 CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes));
264 /* initialise the instruction sequence to 0xff */ 263 /* initialise the instruction sequence to 0xff */
265 for(i=0; i < ibytes + hbytes; i++) 264 for (i = 0; i < ibytes + hbytes; i++)
266 iseq[i] = 0xff; 265 iseq[i] = 0xff;
267 cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg); 266 cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg);
268 cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE); 267 cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE);
@@ -271,11 +270,11 @@ cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op)
271 cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length); 270 cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length);
272#ifdef VOYAGER_CAT_DEBUG 271#ifdef VOYAGER_CAT_DEBUG
273 printk("ins = 0x%x, iseq: ", inst); 272 printk("ins = 0x%x, iseq: ", inst);
274 for(i=0; i< ibytes + hbytes; i++) 273 for (i = 0; i < ibytes + hbytes; i++)
275 printk("0x%x ", iseq[i]); 274 printk("0x%x ", iseq[i]);
276 printk("\n"); 275 printk("\n");
277#endif 276#endif
278 if(cat_shiftout(iseq, ibytes, hbytes, padbits)) { 277 if (cat_shiftout(iseq, ibytes, hbytes, padbits)) {
279 CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n")); 278 CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n"));
280 return 1; 279 return 1;
281 } 280 }
@@ -284,72 +283,74 @@ cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op)
284} 283}
285 284
286static int 285static int
287cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 286cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
288 __u8 *value) 287 __u8 * value)
289{ 288{
290 if(!modp->scan_path_connected) { 289 if (!modp->scan_path_connected) {
291 if(asicp->asic_id != VOYAGER_CAT_ID) { 290 if (asicp->asic_id != VOYAGER_CAT_ID) {
292 CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n")); 291 CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n"));
293 return 1; 292 return 1;
294 } 293 }
295 if(reg > VOYAGER_SUBADDRHI) 294 if (reg > VOYAGER_SUBADDRHI)
296 outb(VOYAGER_CAT_RUN, CAT_CMD); 295 outb(VOYAGER_CAT_RUN, CAT_CMD);
297 outb(VOYAGER_CAT_DRCYC, CAT_CMD); 296 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
298 outb(VOYAGER_CAT_HEADER, CAT_DATA); 297 outb(VOYAGER_CAT_HEADER, CAT_DATA);
299 *value = inb(CAT_DATA); 298 *value = inb(CAT_DATA);
300 outb(0xAA, CAT_DATA); 299 outb(0xAA, CAT_DATA);
301 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { 300 if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
302 CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n")); 301 CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n"));
303 return 1; 302 return 1;
304 } 303 }
305 return 0; 304 return 0;
306 } 305 } else {
307 else { 306 __u16 sbits = modp->num_asics - 1 + asicp->ireg_length;
308 __u16 sbits = modp->num_asics -1 + asicp->ireg_length;
309 __u16 sbytes = sbits / BITS_PER_BYTE; 307 __u16 sbytes = sbits / BITS_PER_BYTE;
310 __u16 tbytes; 308 __u16 tbytes;
311 __u8 string[VOYAGER_MAX_SCAN_PATH], trailer[VOYAGER_MAX_REG_SIZE]; 309 __u8 string[VOYAGER_MAX_SCAN_PATH],
310 trailer[VOYAGER_MAX_REG_SIZE];
312 __u8 padbits; 311 __u8 padbits;
313 int i; 312 int i;
314 313
315 outb(VOYAGER_CAT_DRCYC, CAT_CMD); 314 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
316 315
317 if((padbits = sbits % BITS_PER_BYTE) != 0) { 316 if ((padbits = sbits % BITS_PER_BYTE) != 0) {
318 padbits = BITS_PER_BYTE - padbits; 317 padbits = BITS_PER_BYTE - padbits;
319 sbytes++; 318 sbytes++;
320 } 319 }
321 tbytes = asicp->ireg_length / BITS_PER_BYTE; 320 tbytes = asicp->ireg_length / BITS_PER_BYTE;
322 if(asicp->ireg_length % BITS_PER_BYTE) 321 if (asicp->ireg_length % BITS_PER_BYTE)
323 tbytes++; 322 tbytes++;
324 CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n", 323 CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n",
325 tbytes, sbytes, padbits)); 324 tbytes, sbytes, padbits));
326 cat_build_header(trailer, tbytes, 1, asicp->ireg_length); 325 cat_build_header(trailer, tbytes, 1, asicp->ireg_length);
327 326
328 327 for (i = tbytes - 1; i >= 0; i--) {
329 for(i = tbytes - 1; i >= 0; i--) {
330 outb(trailer[i], CAT_DATA); 328 outb(trailer[i], CAT_DATA);
331 string[sbytes + i] = inb(CAT_DATA); 329 string[sbytes + i] = inb(CAT_DATA);
332 } 330 }
333 331
334 for(i = sbytes - 1; i >= 0; i--) { 332 for (i = sbytes - 1; i >= 0; i--) {
335 outb(0xaa, CAT_DATA); 333 outb(0xaa, CAT_DATA);
336 string[i] = inb(CAT_DATA); 334 string[i] = inb(CAT_DATA);
337 } 335 }
338 *value = 0; 336 *value = 0;
339 cat_unpack(string, padbits + (tbytes * BITS_PER_BYTE) + asicp->asic_location, value, asicp->ireg_length); 337 cat_unpack(string,
338 padbits + (tbytes * BITS_PER_BYTE) +
339 asicp->asic_location, value, asicp->ireg_length);
340#ifdef VOYAGER_CAT_DEBUG 340#ifdef VOYAGER_CAT_DEBUG
341 printk("value=0x%x, string: ", *value); 341 printk("value=0x%x, string: ", *value);
342 for(i=0; i< tbytes+sbytes; i++) 342 for (i = 0; i < tbytes + sbytes; i++)
343 printk("0x%x ", string[i]); 343 printk("0x%x ", string[i]);
344 printk("\n"); 344 printk("\n");
345#endif 345#endif
346 346
347 /* sanity check the rest of the return */ 347 /* sanity check the rest of the return */
348 for(i=0; i < tbytes; i++) { 348 for (i = 0; i < tbytes; i++) {
349 __u8 input = 0; 349 __u8 input = 0;
350 350
351 cat_unpack(string, padbits + (i * BITS_PER_BYTE), &input, BITS_PER_BYTE); 351 cat_unpack(string, padbits + (i * BITS_PER_BYTE),
352 if(trailer[i] != input) { 352 &input, BITS_PER_BYTE);
353 if (trailer[i] != input) {
353 CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i])); 354 CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i]));
354 return 1; 355 return 1;
355 } 356 }
@@ -360,14 +361,14 @@ cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
360} 361}
361 362
362static int 363static int
363cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) 364cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
364{ 365{
365 int i; 366 int i;
366 367
367 for(i = data_bytes + header_bytes - 1; i >= header_bytes; i--) 368 for (i = data_bytes + header_bytes - 1; i >= header_bytes; i--)
368 outb(data[i], CAT_DATA); 369 outb(data[i], CAT_DATA);
369 370
370 for(i = header_bytes - 1; i >= 0; i--) { 371 for (i = header_bytes - 1; i >= 0; i--) {
371 __u8 header = 0; 372 __u8 header = 0;
372 __u8 input; 373 __u8 input;
373 374
@@ -376,7 +377,7 @@ cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
376 CDEBUG(("cat_shiftout: returned 0x%x\n", input)); 377 CDEBUG(("cat_shiftout: returned 0x%x\n", input));
377 cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits, 378 cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits,
378 &header, BITS_PER_BYTE); 379 &header, BITS_PER_BYTE);
379 if(input != header) { 380 if (input != header) {
380 CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header)); 381 CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header));
381 return 1; 382 return 1;
382 } 383 }
@@ -385,57 +386,57 @@ cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
385} 386}
386 387
387static int 388static int
388cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, 389cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp,
389 __u8 reg, __u8 value) 390 __u8 reg, __u8 value)
390{ 391{
391 outb(VOYAGER_CAT_DRCYC, CAT_CMD); 392 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
392 if(!modp->scan_path_connected) { 393 if (!modp->scan_path_connected) {
393 if(asicp->asic_id != VOYAGER_CAT_ID) { 394 if (asicp->asic_id != VOYAGER_CAT_ID) {
394 CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n")); 395 CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n"));
395 return 1; 396 return 1;
396 } 397 }
397 outb(VOYAGER_CAT_HEADER, CAT_DATA); 398 outb(VOYAGER_CAT_HEADER, CAT_DATA);
398 outb(value, CAT_DATA); 399 outb(value, CAT_DATA);
399 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { 400 if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
400 CDEBUG(("cat_senddata: failed to get correct header response to sent data\n")); 401 CDEBUG(("cat_senddata: failed to get correct header response to sent data\n"));
401 return 1; 402 return 1;
402 } 403 }
403 if(reg > VOYAGER_SUBADDRHI) { 404 if (reg > VOYAGER_SUBADDRHI) {
404 outb(VOYAGER_CAT_RUN, CAT_CMD); 405 outb(VOYAGER_CAT_RUN, CAT_CMD);
405 outb(VOYAGER_CAT_END, CAT_CMD); 406 outb(VOYAGER_CAT_END, CAT_CMD);
406 outb(VOYAGER_CAT_RUN, CAT_CMD); 407 outb(VOYAGER_CAT_RUN, CAT_CMD);
407 } 408 }
408 409
409 return 0; 410 return 0;
410 } 411 } else {
411 else {
412 __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE; 412 __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE;
413 __u16 dbytes = (modp->num_asics - 1 + asicp->ireg_length)/BITS_PER_BYTE; 413 __u16 dbytes =
414 __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH], 414 (modp->num_asics - 1 + asicp->ireg_length) / BITS_PER_BYTE;
415 hseq[VOYAGER_MAX_REG_SIZE]; 415 __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH],
416 hseq[VOYAGER_MAX_REG_SIZE];
416 int i; 417 int i;
417 418
418 if((padbits = (modp->num_asics - 1 419 if ((padbits = (modp->num_asics - 1
419 + asicp->ireg_length) % BITS_PER_BYTE) != 0) { 420 + asicp->ireg_length) % BITS_PER_BYTE) != 0) {
420 padbits = BITS_PER_BYTE - padbits; 421 padbits = BITS_PER_BYTE - padbits;
421 dbytes++; 422 dbytes++;
422 } 423 }
423 if(asicp->ireg_length % BITS_PER_BYTE) 424 if (asicp->ireg_length % BITS_PER_BYTE)
424 hbytes++; 425 hbytes++;
425 426
426 cat_build_header(hseq, hbytes, 1, asicp->ireg_length); 427 cat_build_header(hseq, hbytes, 1, asicp->ireg_length);
427 428
428 for(i = 0; i < dbytes + hbytes; i++) 429 for (i = 0; i < dbytes + hbytes; i++)
429 dseq[i] = 0xff; 430 dseq[i] = 0xff;
430 CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n", 431 CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n",
431 dbytes, hbytes, padbits)); 432 dbytes, hbytes, padbits));
432 cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length, 433 cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length,
433 hseq, hbytes * BITS_PER_BYTE); 434 hseq, hbytes * BITS_PER_BYTE);
434 cat_pack(dseq, asicp->asic_location, &value, 435 cat_pack(dseq, asicp->asic_location, &value,
435 asicp->ireg_length); 436 asicp->ireg_length);
436#ifdef VOYAGER_CAT_DEBUG 437#ifdef VOYAGER_CAT_DEBUG
437 printk("dseq "); 438 printk("dseq ");
438 for(i=0; i<hbytes+dbytes; i++) { 439 for (i = 0; i < hbytes + dbytes; i++) {
439 printk("0x%x ", dseq[i]); 440 printk("0x%x ", dseq[i]);
440 } 441 }
441 printk("\n"); 442 printk("\n");
@@ -445,121 +446,125 @@ cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp,
445} 446}
446 447
447static int 448static int
448cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 449cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value)
449 __u8 value)
450{ 450{
451 if(cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG)) 451 if (cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG))
452 return 1; 452 return 1;
453 return cat_senddata(modp, asicp, reg, value); 453 return cat_senddata(modp, asicp, reg, value);
454} 454}
455 455
456static int 456static int
457cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 457cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
458 __u8 *value) 458 __u8 * value)
459{ 459{
460 if(cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG)) 460 if (cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG))
461 return 1; 461 return 1;
462 return cat_getdata(modp, asicp, reg, value); 462 return cat_getdata(modp, asicp, reg, value);
463} 463}
464 464
465static int 465static int
466cat_subaddrsetup(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, 466cat_subaddrsetup(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
467 __u16 len) 467 __u16 len)
468{ 468{
469 __u8 val; 469 __u8 val;
470 470
471 if(len > 1) { 471 if (len > 1) {
472 /* set auto increment */ 472 /* set auto increment */
473 __u8 newval; 473 __u8 newval;
474 474
475 if(cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) { 475 if (cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) {
476 CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n")); 476 CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n"));
477 return 1; 477 return 1;
478 } 478 }
479 CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", val)); 479 CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n",
480 val));
480 newval = val | VOYAGER_AUTO_INC; 481 newval = val | VOYAGER_AUTO_INC;
481 if(newval != val) { 482 if (newval != val) {
482 if(cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) { 483 if (cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) {
483 CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n")); 484 CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n"));
484 return 1; 485 return 1;
485 } 486 }
486 } 487 }
487 } 488 }
488 if(cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8)(offset &0xff))) { 489 if (cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8) (offset & 0xff))) {
489 CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n")); 490 CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n"));
490 return 1; 491 return 1;
491 } 492 }
492 if(asicp->subaddr > VOYAGER_SUBADDR_LO) { 493 if (asicp->subaddr > VOYAGER_SUBADDR_LO) {
493 if(cat_write(modp, asicp, VOYAGER_SUBADDRHI, (__u8)(offset >> 8))) { 494 if (cat_write
495 (modp, asicp, VOYAGER_SUBADDRHI, (__u8) (offset >> 8))) {
494 CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n")); 496 CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n"));
495 return 1; 497 return 1;
496 } 498 }
497 cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val); 499 cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val);
498 CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, val)); 500 CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset,
501 val));
499 } 502 }
500 cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val); 503 cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val);
501 CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val)); 504 CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val));
502 return 0; 505 return 0;
503} 506}
504 507
505static int 508static int
506cat_subwrite(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, 509cat_subwrite(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
507 __u16 len, void *buf) 510 __u16 len, void *buf)
508{ 511{
509 int i, retval; 512 int i, retval;
510 513
511 /* FIXME: need special actions for VOYAGER_CAT_ID here */ 514 /* FIXME: need special actions for VOYAGER_CAT_ID here */
512 if(asicp->asic_id == VOYAGER_CAT_ID) { 515 if (asicp->asic_id == VOYAGER_CAT_ID) {
513 CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n")); 516 CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n"));
514 /* FIXME -- This is supposed to be handled better 517 /* FIXME -- This is supposed to be handled better
515 * There is a problem writing to the cat asic in the 518 * There is a problem writing to the cat asic in the
516 * PSI. The 30us delay seems to work, though */ 519 * PSI. The 30us delay seems to work, though */
517 udelay(30); 520 udelay(30);
518 } 521 }
519 522
520 if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { 523 if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
521 printk("cat_subwrite: cat_subaddrsetup FAILED\n"); 524 printk("cat_subwrite: cat_subaddrsetup FAILED\n");
522 return retval; 525 return retval;
523 } 526 }
524 527
525 if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) { 528 if (cat_sendinst
529 (modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) {
526 printk("cat_subwrite: cat_sendinst FAILED\n"); 530 printk("cat_subwrite: cat_sendinst FAILED\n");
527 return 1; 531 return 1;
528 } 532 }
529 for(i = 0; i < len; i++) { 533 for (i = 0; i < len; i++) {
530 if(cat_senddata(modp, asicp, 0xFF, ((__u8 *)buf)[i])) { 534 if (cat_senddata(modp, asicp, 0xFF, ((__u8 *) buf)[i])) {
531 printk("cat_subwrite: cat_sendata element at %d FAILED\n", i); 535 printk
536 ("cat_subwrite: cat_sendata element at %d FAILED\n",
537 i);
532 return 1; 538 return 1;
533 } 539 }
534 } 540 }
535 return 0; 541 return 0;
536} 542}
537static int 543static int
538cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, 544cat_subread(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
539 __u16 len, void *buf) 545 __u16 len, void *buf)
540{ 546{
541 int i, retval; 547 int i, retval;
542 548
543 if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { 549 if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
544 CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n")); 550 CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n"));
545 return retval; 551 return retval;
546 } 552 }
547 553
548 if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) { 554 if (cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) {
549 CDEBUG(("cat_subread: cat_sendinst failed\n")); 555 CDEBUG(("cat_subread: cat_sendinst failed\n"));
550 return 1; 556 return 1;
551 } 557 }
552 for(i = 0; i < len; i++) { 558 for (i = 0; i < len; i++) {
553 if(cat_getdata(modp, asicp, 0xFF, 559 if (cat_getdata(modp, asicp, 0xFF, &((__u8 *) buf)[i])) {
554 &((__u8 *)buf)[i])) { 560 CDEBUG(("cat_subread: cat_getdata element %d failed\n",
555 CDEBUG(("cat_subread: cat_getdata element %d failed\n", i)); 561 i));
556 return 1; 562 return 1;
557 } 563 }
558 } 564 }
559 return 0; 565 return 0;
560} 566}
561 567
562
563/* buffer for storing EPROM data read in during initialisation */ 568/* buffer for storing EPROM data read in during initialisation */
564static __initdata __u8 eprom_buf[0xFFFF]; 569static __initdata __u8 eprom_buf[0xFFFF];
565static voyager_module_t *voyager_initial_module; 570static voyager_module_t *voyager_initial_module;
@@ -568,8 +573,7 @@ static voyager_module_t *voyager_initial_module;
568 * boot cpu *after* all memory initialisation has been done (so we can 573 * boot cpu *after* all memory initialisation has been done (so we can
569 * use kmalloc) but before smp initialisation, so we can probe the SMP 574 * use kmalloc) but before smp initialisation, so we can probe the SMP
570 * configuration and pick up necessary information. */ 575 * configuration and pick up necessary information. */
571void __init 576void __init voyager_cat_init(void)
572voyager_cat_init(void)
573{ 577{
574 voyager_module_t **modpp = &voyager_initial_module; 578 voyager_module_t **modpp = &voyager_initial_module;
575 voyager_asic_t **asicpp; 579 voyager_asic_t **asicpp;
@@ -578,27 +582,29 @@ voyager_cat_init(void)
578 unsigned long qic_addr = 0; 582 unsigned long qic_addr = 0;
579 __u8 qabc_data[0x20]; 583 __u8 qabc_data[0x20];
580 __u8 num_submodules, val; 584 __u8 num_submodules, val;
581 voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *)&eprom_buf[0]; 585 voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *) & eprom_buf[0];
582 586
583 __u8 cmos[4]; 587 __u8 cmos[4];
584 unsigned long addr; 588 unsigned long addr;
585 589
586 /* initiallise the SUS mailbox */ 590 /* initiallise the SUS mailbox */
587 for(i=0; i<sizeof(cmos); i++) 591 for (i = 0; i < sizeof(cmos); i++)
588 cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i); 592 cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i);
589 addr = *(unsigned long *)cmos; 593 addr = *(unsigned long *)cmos;
590 if((addr & 0xff000000) != 0xff000000) { 594 if ((addr & 0xff000000) != 0xff000000) {
591 printk(KERN_ERR "Voyager failed to get SUS mailbox (addr = 0x%lx\n", addr); 595 printk(KERN_ERR
596 "Voyager failed to get SUS mailbox (addr = 0x%lx\n",
597 addr);
592 } else { 598 } else {
593 static struct resource res; 599 static struct resource res;
594 600
595 res.name = "voyager SUS"; 601 res.name = "voyager SUS";
596 res.start = addr; 602 res.start = addr;
597 res.end = addr+0x3ff; 603 res.end = addr + 0x3ff;
598 604
599 request_resource(&iomem_resource, &res); 605 request_resource(&iomem_resource, &res);
600 voyager_SUS = (struct voyager_SUS *) 606 voyager_SUS = (struct voyager_SUS *)
601 ioremap(addr, 0x400); 607 ioremap(addr, 0x400);
602 printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n", 608 printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n",
603 voyager_SUS->SUS_version); 609 voyager_SUS->SUS_version);
604 voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION; 610 voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION;
@@ -609,8 +615,6 @@ voyager_cat_init(void)
609 voyager_extended_vic_processors = 0; 615 voyager_extended_vic_processors = 0;
610 voyager_quad_processors = 0; 616 voyager_quad_processors = 0;
611 617
612
613
614 printk("VOYAGER: beginning CAT bus probe\n"); 618 printk("VOYAGER: beginning CAT bus probe\n");
615 /* set up the SuperSet Port Block which tells us where the 619 /* set up the SuperSet Port Block which tells us where the
616 * CAT communication port is */ 620 * CAT communication port is */
@@ -618,14 +622,14 @@ voyager_cat_init(void)
618 VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb)); 622 VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb));
619 623
620 /* now find out if were 8 slot or normal */ 624 /* now find out if were 8 slot or normal */
621 if((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER) 625 if ((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER)
622 == EIGHT_SLOT_IDENTIFIER) { 626 == EIGHT_SLOT_IDENTIFIER) {
623 voyager_8slot = 1; 627 voyager_8slot = 1;
624 printk(KERN_NOTICE "Voyager: Eight slot 51xx configuration detected\n"); 628 printk(KERN_NOTICE
629 "Voyager: Eight slot 51xx configuration detected\n");
625 } 630 }
626 631
627 for(i = VOYAGER_MIN_MODULE; 632 for (i = VOYAGER_MIN_MODULE; i <= VOYAGER_MAX_MODULE; i++) {
628 i <= VOYAGER_MAX_MODULE; i++) {
629 __u8 input; 633 __u8 input;
630 int asic; 634 int asic;
631 __u16 eprom_size; 635 __u16 eprom_size;
@@ -643,21 +647,21 @@ voyager_cat_init(void)
643 outb(0xAA, CAT_DATA); 647 outb(0xAA, CAT_DATA);
644 input = inb(CAT_DATA); 648 input = inb(CAT_DATA);
645 outb(VOYAGER_CAT_END, CAT_CMD); 649 outb(VOYAGER_CAT_END, CAT_CMD);
646 if(input != VOYAGER_CAT_HEADER) { 650 if (input != VOYAGER_CAT_HEADER) {
647 continue; 651 continue;
648 } 652 }
649 CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i, 653 CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i,
650 cat_module_name(i))); 654 cat_module_name(i)));
651 *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++];*/ 655 *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++]; */
652 if(*modpp == NULL) { 656 if (*modpp == NULL) {
653 printk("**WARNING** kmalloc failure in cat_init\n"); 657 printk("**WARNING** kmalloc failure in cat_init\n");
654 continue; 658 continue;
655 } 659 }
656 memset(*modpp, 0, sizeof(voyager_module_t)); 660 memset(*modpp, 0, sizeof(voyager_module_t));
657 /* need temporary asic for cat_subread. It will be 661 /* need temporary asic for cat_subread. It will be
658 * filled in correctly later */ 662 * filled in correctly later */
659 (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count];*/ 663 (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count]; */
660 if((*modpp)->asic == NULL) { 664 if ((*modpp)->asic == NULL) {
661 printk("**WARNING** kmalloc failure in cat_init\n"); 665 printk("**WARNING** kmalloc failure in cat_init\n");
662 continue; 666 continue;
663 } 667 }
@@ -666,47 +670,52 @@ voyager_cat_init(void)
666 (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI; 670 (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI;
667 (*modpp)->module_addr = i; 671 (*modpp)->module_addr = i;
668 (*modpp)->scan_path_connected = 0; 672 (*modpp)->scan_path_connected = 0;
669 if(i == VOYAGER_PSI) { 673 if (i == VOYAGER_PSI) {
670 /* Exception leg for modules with no EEPROM */ 674 /* Exception leg for modules with no EEPROM */
671 printk("Module \"%s\"\n", cat_module_name(i)); 675 printk("Module \"%s\"\n", cat_module_name(i));
672 continue; 676 continue;
673 } 677 }
674 678
675 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); 679 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
676 outb(VOYAGER_CAT_RUN, CAT_CMD); 680 outb(VOYAGER_CAT_RUN, CAT_CMD);
677 cat_disconnect(*modpp, (*modpp)->asic); 681 cat_disconnect(*modpp, (*modpp)->asic);
678 if(cat_subread(*modpp, (*modpp)->asic, 682 if (cat_subread(*modpp, (*modpp)->asic,
679 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), 683 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
680 &eprom_size)) { 684 &eprom_size)) {
681 printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i); 685 printk
686 ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n",
687 i);
682 outb(VOYAGER_CAT_END, CAT_CMD); 688 outb(VOYAGER_CAT_END, CAT_CMD);
683 continue; 689 continue;
684 } 690 }
685 if(eprom_size > sizeof(eprom_buf)) { 691 if (eprom_size > sizeof(eprom_buf)) {
686 printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size); 692 printk
693 ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n",
694 i, eprom_size);
687 outb(VOYAGER_CAT_END, CAT_CMD); 695 outb(VOYAGER_CAT_END, CAT_CMD);
688 continue; 696 continue;
689 } 697 }
690 outb(VOYAGER_CAT_END, CAT_CMD); 698 outb(VOYAGER_CAT_END, CAT_CMD);
691 outb(VOYAGER_CAT_RUN, CAT_CMD); 699 outb(VOYAGER_CAT_RUN, CAT_CMD);
692 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size)); 700 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i,
693 if(cat_subread(*modpp, (*modpp)->asic, 0, 701 eprom_size));
694 eprom_size, eprom_buf)) { 702 if (cat_subread
703 (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) {
695 outb(VOYAGER_CAT_END, CAT_CMD); 704 outb(VOYAGER_CAT_END, CAT_CMD);
696 continue; 705 continue;
697 } 706 }
698 outb(VOYAGER_CAT_END, CAT_CMD); 707 outb(VOYAGER_CAT_END, CAT_CMD);
699 printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n", 708 printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n",
700 cat_module_name(i), eprom_hdr->version_id, 709 cat_module_name(i), eprom_hdr->version_id,
701 *((__u32 *)eprom_hdr->tracer), eprom_hdr->num_asics); 710 *((__u32 *) eprom_hdr->tracer), eprom_hdr->num_asics);
702 (*modpp)->ee_size = eprom_hdr->ee_size; 711 (*modpp)->ee_size = eprom_hdr->ee_size;
703 (*modpp)->num_asics = eprom_hdr->num_asics; 712 (*modpp)->num_asics = eprom_hdr->num_asics;
704 asicpp = &((*modpp)->asic); 713 asicpp = &((*modpp)->asic);
705 sp_offset = eprom_hdr->scan_path_offset; 714 sp_offset = eprom_hdr->scan_path_offset;
706 /* All we really care about are the Quad cards. We 715 /* All we really care about are the Quad cards. We
707 * identify them because they are in a processor slot 716 * identify them because they are in a processor slot
708 * and have only four asics */ 717 * and have only four asics */
709 if((i < 0x10 || (i>=0x14 && i < 0x1c) || i>0x1f)) { 718 if ((i < 0x10 || (i >= 0x14 && i < 0x1c) || i > 0x1f)) {
710 modpp = &((*modpp)->next); 719 modpp = &((*modpp)->next);
711 continue; 720 continue;
712 } 721 }
@@ -717,16 +726,17 @@ voyager_cat_init(void)
717 &num_submodules); 726 &num_submodules);
718 /* lowest two bits, active low */ 727 /* lowest two bits, active low */
719 num_submodules = ~(0xfc | num_submodules); 728 num_submodules = ~(0xfc | num_submodules);
720 CDEBUG(("VOYAGER CAT: %d submodules present\n", num_submodules)); 729 CDEBUG(("VOYAGER CAT: %d submodules present\n",
721 if(num_submodules == 0) { 730 num_submodules));
731 if (num_submodules == 0) {
722 /* fill in the dyadic extended processors */ 732 /* fill in the dyadic extended processors */
723 __u8 cpu = i & 0x07; 733 __u8 cpu = i & 0x07;
724 734
725 printk("Module \"%s\": Dyadic Processor Card\n", 735 printk("Module \"%s\": Dyadic Processor Card\n",
726 cat_module_name(i)); 736 cat_module_name(i));
727 voyager_extended_vic_processors |= (1<<cpu); 737 voyager_extended_vic_processors |= (1 << cpu);
728 cpu += 4; 738 cpu += 4;
729 voyager_extended_vic_processors |= (1<<cpu); 739 voyager_extended_vic_processors |= (1 << cpu);
730 outb(VOYAGER_CAT_END, CAT_CMD); 740 outb(VOYAGER_CAT_END, CAT_CMD);
731 continue; 741 continue;
732 } 742 }
@@ -740,28 +750,32 @@ voyager_cat_init(void)
740 cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val); 750 cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val);
741 751
742 outb(VOYAGER_CAT_END, CAT_CMD); 752 outb(VOYAGER_CAT_END, CAT_CMD);
743
744 753
745 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); 754 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
746 outb(VOYAGER_CAT_RUN, CAT_CMD); 755 outb(VOYAGER_CAT_RUN, CAT_CMD);
747 cat_disconnect(*modpp, (*modpp)->asic); 756 cat_disconnect(*modpp, (*modpp)->asic);
748 if(cat_subread(*modpp, (*modpp)->asic, 757 if (cat_subread(*modpp, (*modpp)->asic,
749 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), 758 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
750 &eprom_size)) { 759 &eprom_size)) {
751 printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i); 760 printk
761 ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n",
762 i);
752 outb(VOYAGER_CAT_END, CAT_CMD); 763 outb(VOYAGER_CAT_END, CAT_CMD);
753 continue; 764 continue;
754 } 765 }
755 if(eprom_size > sizeof(eprom_buf)) { 766 if (eprom_size > sizeof(eprom_buf)) {
756 printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size); 767 printk
768 ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n",
769 i, eprom_size);
757 outb(VOYAGER_CAT_END, CAT_CMD); 770 outb(VOYAGER_CAT_END, CAT_CMD);
758 continue; 771 continue;
759 } 772 }
760 outb(VOYAGER_CAT_END, CAT_CMD); 773 outb(VOYAGER_CAT_END, CAT_CMD);
761 outb(VOYAGER_CAT_RUN, CAT_CMD); 774 outb(VOYAGER_CAT_RUN, CAT_CMD);
762 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size)); 775 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i,
763 if(cat_subread(*modpp, (*modpp)->asic, 0, 776 eprom_size));
764 eprom_size, eprom_buf)) { 777 if (cat_subread
778 (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) {
765 outb(VOYAGER_CAT_END, CAT_CMD); 779 outb(VOYAGER_CAT_END, CAT_CMD);
766 continue; 780 continue;
767 } 781 }
@@ -773,30 +787,35 @@ voyager_cat_init(void)
773 sp_offset = eprom_hdr->scan_path_offset; 787 sp_offset = eprom_hdr->scan_path_offset;
774 /* get rid of the dummy CAT asic and read the real one */ 788 /* get rid of the dummy CAT asic and read the real one */
775 kfree((*modpp)->asic); 789 kfree((*modpp)->asic);
776 for(asic=0; asic < (*modpp)->num_asics; asic++) { 790 for (asic = 0; asic < (*modpp)->num_asics; asic++) {
777 int j; 791 int j;
778 voyager_asic_t *asicp = *asicpp 792 voyager_asic_t *asicp = *asicpp = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++]; */
779 = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/
780 voyager_sp_table_t *sp_table; 793 voyager_sp_table_t *sp_table;
781 voyager_at_t *asic_table; 794 voyager_at_t *asic_table;
782 voyager_jtt_t *jtag_table; 795 voyager_jtt_t *jtag_table;
783 796
784 if(asicp == NULL) { 797 if (asicp == NULL) {
785 printk("**WARNING** kmalloc failure in cat_init\n"); 798 printk
799 ("**WARNING** kmalloc failure in cat_init\n");
786 continue; 800 continue;
787 } 801 }
788 asicpp = &(asicp->next); 802 asicpp = &(asicp->next);
789 asicp->asic_location = asic; 803 asicp->asic_location = asic;
790 sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); 804 sp_table =
805 (voyager_sp_table_t *) (eprom_buf + sp_offset);
791 asicp->asic_id = sp_table->asic_id; 806 asicp->asic_id = sp_table->asic_id;
792 asic_table = (voyager_at_t *)(eprom_buf + sp_table->asic_data_offset); 807 asic_table =
793 for(j=0; j<4; j++) 808 (voyager_at_t *) (eprom_buf +
809 sp_table->asic_data_offset);
810 for (j = 0; j < 4; j++)
794 asicp->jtag_id[j] = asic_table->jtag_id[j]; 811 asicp->jtag_id[j] = asic_table->jtag_id[j];
795 jtag_table = (voyager_jtt_t *)(eprom_buf + asic_table->jtag_offset); 812 jtag_table =
813 (voyager_jtt_t *) (eprom_buf +
814 asic_table->jtag_offset);
796 asicp->ireg_length = jtag_table->ireg_len; 815 asicp->ireg_length = jtag_table->ireg_len;
797 asicp->bit_location = (*modpp)->inst_bits; 816 asicp->bit_location = (*modpp)->inst_bits;
798 (*modpp)->inst_bits += asicp->ireg_length; 817 (*modpp)->inst_bits += asicp->ireg_length;
799 if(asicp->ireg_length > (*modpp)->largest_reg) 818 if (asicp->ireg_length > (*modpp)->largest_reg)
800 (*modpp)->largest_reg = asicp->ireg_length; 819 (*modpp)->largest_reg = asicp->ireg_length;
801 if (asicp->ireg_length < (*modpp)->smallest_reg || 820 if (asicp->ireg_length < (*modpp)->smallest_reg ||
802 (*modpp)->smallest_reg == 0) 821 (*modpp)->smallest_reg == 0)
@@ -804,15 +823,13 @@ voyager_cat_init(void)
804 CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n", 823 CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n",
805 asicp->asic_id, asicp->ireg_length, 824 asicp->asic_id, asicp->ireg_length,
806 asicp->bit_location)); 825 asicp->bit_location));
807 if(asicp->asic_id == VOYAGER_QUAD_QABC) { 826 if (asicp->asic_id == VOYAGER_QUAD_QABC) {
808 CDEBUG(("VOYAGER CAT: QABC ASIC found\n")); 827 CDEBUG(("VOYAGER CAT: QABC ASIC found\n"));
809 qabc_asic = asicp; 828 qabc_asic = asicp;
810 } 829 }
811 sp_offset += sizeof(voyager_sp_table_t); 830 sp_offset += sizeof(voyager_sp_table_t);
812 } 831 }
813 CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", 832 CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", (*modpp)->inst_bits, (*modpp)->largest_reg, (*modpp)->smallest_reg));
814 (*modpp)->inst_bits, (*modpp)->largest_reg,
815 (*modpp)->smallest_reg));
816 /* OK, now we have the QUAD ASICs set up, use them. 833 /* OK, now we have the QUAD ASICs set up, use them.
817 * we need to: 834 * we need to:
818 * 835 *
@@ -828,10 +845,11 @@ voyager_cat_init(void)
828 qic_addr = qabc_data[5] << 8; 845 qic_addr = qabc_data[5] << 8;
829 qic_addr = (qic_addr | qabc_data[6]) << 8; 846 qic_addr = (qic_addr | qabc_data[6]) << 8;
830 qic_addr = (qic_addr | qabc_data[7]) << 8; 847 qic_addr = (qic_addr | qabc_data[7]) << 8;
831 printk("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n", 848 printk
832 cat_module_name(i), qic_addr, qabc_data[8]); 849 ("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n",
850 cat_module_name(i), qic_addr, qabc_data[8]);
833#if 0 /* plumbing fails---FIXME */ 851#if 0 /* plumbing fails---FIXME */
834 if((qabc_data[8] & 0xf0) == 0) { 852 if ((qabc_data[8] & 0xf0) == 0) {
835 /* FIXME: 32 way 8 CPU slot monster cannot be 853 /* FIXME: 32 way 8 CPU slot monster cannot be
836 * plumbed this way---need to check for it */ 854 * plumbed this way---need to check for it */
837 855
@@ -842,94 +860,97 @@ voyager_cat_init(void)
842#ifdef VOYAGER_CAT_DEBUG 860#ifdef VOYAGER_CAT_DEBUG
843 /* verify plumbing */ 861 /* verify plumbing */
844 cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]); 862 cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]);
845 if((qabc_data[8] & 0xf0) == 0) { 863 if ((qabc_data[8] & 0xf0) == 0) {
846 CDEBUG(("PLUMBING FAILED: 0x%x\n", qabc_data[8])); 864 CDEBUG(("PLUMBING FAILED: 0x%x\n",
865 qabc_data[8]));
847 } 866 }
848#endif 867#endif
849 } 868 }
850#endif 869#endif
851 870
852 { 871 {
853 struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL); 872 struct resource *res =
873 kzalloc(sizeof(struct resource), GFP_KERNEL);
854 res->name = kmalloc(128, GFP_KERNEL); 874 res->name = kmalloc(128, GFP_KERNEL);
855 sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); 875 sprintf((char *)res->name, "Voyager %s Quad CPI",
876 cat_module_name(i));
856 res->start = qic_addr; 877 res->start = qic_addr;
857 res->end = qic_addr + 0x3ff; 878 res->end = qic_addr + 0x3ff;
858 request_resource(&iomem_resource, res); 879 request_resource(&iomem_resource, res);
859 } 880 }
860 881
861 qic_addr = (unsigned long)ioremap(qic_addr, 0x400); 882 qic_addr = (unsigned long)ioremap(qic_addr, 0x400);
862 883
863 for(j = 0; j < 4; j++) { 884 for (j = 0; j < 4; j++) {
864 __u8 cpu; 885 __u8 cpu;
865 886
866 if(voyager_8slot) { 887 if (voyager_8slot) {
867 /* 8 slot has a different mapping, 888 /* 8 slot has a different mapping,
868 * each slot has only one vic line, so 889 * each slot has only one vic line, so
869 * 1 cpu in each slot must be < 8 */ 890 * 1 cpu in each slot must be < 8 */
870 cpu = (i & 0x07) + j*8; 891 cpu = (i & 0x07) + j * 8;
871 } else { 892 } else {
872 cpu = (i & 0x03) + j*4; 893 cpu = (i & 0x03) + j * 4;
873 } 894 }
874 if( (qabc_data[8] & (1<<j))) { 895 if ((qabc_data[8] & (1 << j))) {
875 voyager_extended_vic_processors |= (1<<cpu); 896 voyager_extended_vic_processors |= (1 << cpu);
876 } 897 }
877 if(qabc_data[8] & (1<<(j+4)) ) { 898 if (qabc_data[8] & (1 << (j + 4))) {
878 /* Second SET register plumbed: Quad 899 /* Second SET register plumbed: Quad
879 * card has two VIC connected CPUs. 900 * card has two VIC connected CPUs.
880 * Secondary cannot be booted as a VIC 901 * Secondary cannot be booted as a VIC
881 * CPU */ 902 * CPU */
882 voyager_extended_vic_processors |= (1<<cpu); 903 voyager_extended_vic_processors |= (1 << cpu);
883 voyager_allowed_boot_processors &= (~(1<<cpu)); 904 voyager_allowed_boot_processors &=
905 (~(1 << cpu));
884 } 906 }
885 907
886 voyager_quad_processors |= (1<<cpu); 908 voyager_quad_processors |= (1 << cpu);
887 voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *) 909 voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *)
888 (qic_addr+(j<<8)); 910 (qic_addr + (j << 8));
889 CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu, 911 CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu,
890 (unsigned long)voyager_quad_cpi_addr[cpu])); 912 (unsigned long)voyager_quad_cpi_addr[cpu]));
891 } 913 }
892 outb(VOYAGER_CAT_END, CAT_CMD); 914 outb(VOYAGER_CAT_END, CAT_CMD);
893 915
894
895
896 *asicpp = NULL; 916 *asicpp = NULL;
897 modpp = &((*modpp)->next); 917 modpp = &((*modpp)->next);
898 } 918 }
899 *modpp = NULL; 919 *modpp = NULL;
900 printk("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", voyager_extended_vic_processors, voyager_quad_processors, voyager_allowed_boot_processors); 920 printk
921 ("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n",
922 voyager_extended_vic_processors, voyager_quad_processors,
923 voyager_allowed_boot_processors);
901 request_resource(&ioport_resource, &vic_res); 924 request_resource(&ioport_resource, &vic_res);
902 if(voyager_quad_processors) 925 if (voyager_quad_processors)
903 request_resource(&ioport_resource, &qic_res); 926 request_resource(&ioport_resource, &qic_res);
904 /* set up the front power switch */ 927 /* set up the front power switch */
905} 928}
906 929
907int 930int voyager_cat_readb(__u8 module, __u8 asic, int reg)
908voyager_cat_readb(__u8 module, __u8 asic, int reg)
909{ 931{
910 return 0; 932 return 0;
911} 933}
912 934
913static int 935static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp)
914cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp)
915{ 936{
916 __u8 val; 937 __u8 val;
917 int err = 0; 938 int err = 0;
918 939
919 if(!modp->scan_path_connected) 940 if (!modp->scan_path_connected)
920 return 0; 941 return 0;
921 if(asicp->asic_id != VOYAGER_CAT_ID) { 942 if (asicp->asic_id != VOYAGER_CAT_ID) {
922 CDEBUG(("cat_disconnect: ASIC is not CAT\n")); 943 CDEBUG(("cat_disconnect: ASIC is not CAT\n"));
923 return 1; 944 return 1;
924 } 945 }
925 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); 946 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
926 if(err) { 947 if (err) {
927 CDEBUG(("cat_disconnect: failed to read SCANPATH\n")); 948 CDEBUG(("cat_disconnect: failed to read SCANPATH\n"));
928 return err; 949 return err;
929 } 950 }
930 val &= VOYAGER_DISCONNECT_ASIC; 951 val &= VOYAGER_DISCONNECT_ASIC;
931 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); 952 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
932 if(err) { 953 if (err) {
933 CDEBUG(("cat_disconnect: failed to write SCANPATH\n")); 954 CDEBUG(("cat_disconnect: failed to write SCANPATH\n"));
934 return err; 955 return err;
935 } 956 }
@@ -940,27 +961,26 @@ cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp)
940 return 0; 961 return 0;
941} 962}
942 963
943static int 964static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp)
944cat_connect(voyager_module_t *modp, voyager_asic_t *asicp)
945{ 965{
946 __u8 val; 966 __u8 val;
947 int err = 0; 967 int err = 0;
948 968
949 if(modp->scan_path_connected) 969 if (modp->scan_path_connected)
950 return 0; 970 return 0;
951 if(asicp->asic_id != VOYAGER_CAT_ID) { 971 if (asicp->asic_id != VOYAGER_CAT_ID) {
952 CDEBUG(("cat_connect: ASIC is not CAT\n")); 972 CDEBUG(("cat_connect: ASIC is not CAT\n"));
953 return 1; 973 return 1;
954 } 974 }
955 975
956 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); 976 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
957 if(err) { 977 if (err) {
958 CDEBUG(("cat_connect: failed to read SCANPATH\n")); 978 CDEBUG(("cat_connect: failed to read SCANPATH\n"));
959 return err; 979 return err;
960 } 980 }
961 val |= VOYAGER_CONNECT_ASIC; 981 val |= VOYAGER_CONNECT_ASIC;
962 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); 982 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
963 if(err) { 983 if (err) {
964 CDEBUG(("cat_connect: failed to write SCANPATH\n")); 984 CDEBUG(("cat_connect: failed to write SCANPATH\n"));
965 return err; 985 return err;
966 } 986 }
@@ -971,11 +991,10 @@ cat_connect(voyager_module_t *modp, voyager_asic_t *asicp)
971 return 0; 991 return 0;
972} 992}
973 993
974void 994void voyager_cat_power_off(void)
975voyager_cat_power_off(void)
976{ 995{
977 /* Power the machine off by writing to the PSI over the CAT 996 /* Power the machine off by writing to the PSI over the CAT
978 * bus */ 997 * bus */
979 __u8 data; 998 __u8 data;
980 voyager_module_t psi = { 0 }; 999 voyager_module_t psi = { 0 };
981 voyager_asic_t psi_asic = { 0 }; 1000 voyager_asic_t psi_asic = { 0 };
@@ -1009,8 +1028,7 @@ voyager_cat_power_off(void)
1009 1028
1010struct voyager_status voyager_status = { 0 }; 1029struct voyager_status voyager_status = { 0 };
1011 1030
1012void 1031void voyager_cat_psi(__u8 cmd, __u16 reg, __u8 * data)
1013voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
1014{ 1032{
1015 voyager_module_t psi = { 0 }; 1033 voyager_module_t psi = { 0 };
1016 voyager_asic_t psi_asic = { 0 }; 1034 voyager_asic_t psi_asic = { 0 };
@@ -1027,7 +1045,7 @@ voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
1027 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT); 1045 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
1028 outb(VOYAGER_CAT_RUN, CAT_CMD); 1046 outb(VOYAGER_CAT_RUN, CAT_CMD);
1029 cat_disconnect(&psi, &psi_asic); 1047 cat_disconnect(&psi, &psi_asic);
1030 switch(cmd) { 1048 switch (cmd) {
1031 case VOYAGER_PSI_READ: 1049 case VOYAGER_PSI_READ:
1032 cat_read(&psi, &psi_asic, reg, data); 1050 cat_read(&psi, &psi_asic, reg, data);
1033 break; 1051 break;
@@ -1047,8 +1065,7 @@ voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
1047 outb(VOYAGER_CAT_END, CAT_CMD); 1065 outb(VOYAGER_CAT_END, CAT_CMD);
1048} 1066}
1049 1067
1050void 1068void voyager_cat_do_common_interrupt(void)
1051voyager_cat_do_common_interrupt(void)
1052{ 1069{
1053 /* This is caused either by a memory parity error or something 1070 /* This is caused either by a memory parity error or something
1054 * in the PSI */ 1071 * in the PSI */
@@ -1057,7 +1074,7 @@ voyager_cat_do_common_interrupt(void)
1057 voyager_asic_t psi_asic = { 0 }; 1074 voyager_asic_t psi_asic = { 0 };
1058 struct voyager_psi psi_reg; 1075 struct voyager_psi psi_reg;
1059 int i; 1076 int i;
1060 re_read: 1077 re_read:
1061 psi.asic = &psi_asic; 1078 psi.asic = &psi_asic;
1062 psi.asic->asic_id = VOYAGER_CAT_ID; 1079 psi.asic->asic_id = VOYAGER_CAT_ID;
1063 psi.asic->subaddr = VOYAGER_SUBADDR_HI; 1080 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
@@ -1072,43 +1089,45 @@ voyager_cat_do_common_interrupt(void)
1072 cat_disconnect(&psi, &psi_asic); 1089 cat_disconnect(&psi, &psi_asic);
1073 /* Read the status. NOTE: Need to read *all* the PSI regs here 1090 /* Read the status. NOTE: Need to read *all* the PSI regs here
1074 * otherwise the cmn int will be reasserted */ 1091 * otherwise the cmn int will be reasserted */
1075 for(i = 0; i < sizeof(psi_reg.regs); i++) { 1092 for (i = 0; i < sizeof(psi_reg.regs); i++) {
1076 cat_read(&psi, &psi_asic, i, &((__u8 *)&psi_reg.regs)[i]); 1093 cat_read(&psi, &psi_asic, i, &((__u8 *) & psi_reg.regs)[i]);
1077 } 1094 }
1078 outb(VOYAGER_CAT_END, CAT_CMD); 1095 outb(VOYAGER_CAT_END, CAT_CMD);
1079 if((psi_reg.regs.checkbit & 0x02) == 0) { 1096 if ((psi_reg.regs.checkbit & 0x02) == 0) {
1080 psi_reg.regs.checkbit |= 0x02; 1097 psi_reg.regs.checkbit |= 0x02;
1081 cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit); 1098 cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit);
1082 printk("VOYAGER RE-READ PSI\n"); 1099 printk("VOYAGER RE-READ PSI\n");
1083 goto re_read; 1100 goto re_read;
1084 } 1101 }
1085 outb(VOYAGER_CAT_RUN, CAT_CMD); 1102 outb(VOYAGER_CAT_RUN, CAT_CMD);
1086 for(i = 0; i < sizeof(psi_reg.subregs); i++) { 1103 for (i = 0; i < sizeof(psi_reg.subregs); i++) {
1087 /* This looks strange, but the PSI doesn't do auto increment 1104 /* This looks strange, but the PSI doesn't do auto increment
1088 * correctly */ 1105 * correctly */
1089 cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i, 1106 cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i,
1090 1, &((__u8 *)&psi_reg.subregs)[i]); 1107 1, &((__u8 *) & psi_reg.subregs)[i]);
1091 } 1108 }
1092 outb(VOYAGER_CAT_END, CAT_CMD); 1109 outb(VOYAGER_CAT_END, CAT_CMD);
1093#ifdef VOYAGER_CAT_DEBUG 1110#ifdef VOYAGER_CAT_DEBUG
1094 printk("VOYAGER PSI: "); 1111 printk("VOYAGER PSI: ");
1095 for(i=0; i<sizeof(psi_reg.regs); i++) 1112 for (i = 0; i < sizeof(psi_reg.regs); i++)
1096 printk("%02x ", ((__u8 *)&psi_reg.regs)[i]); 1113 printk("%02x ", ((__u8 *) & psi_reg.regs)[i]);
1097 printk("\n "); 1114 printk("\n ");
1098 for(i=0; i<sizeof(psi_reg.subregs); i++) 1115 for (i = 0; i < sizeof(psi_reg.subregs); i++)
1099 printk("%02x ", ((__u8 *)&psi_reg.subregs)[i]); 1116 printk("%02x ", ((__u8 *) & psi_reg.subregs)[i]);
1100 printk("\n"); 1117 printk("\n");
1101#endif 1118#endif
1102 if(psi_reg.regs.intstatus & PSI_MON) { 1119 if (psi_reg.regs.intstatus & PSI_MON) {
1103 /* switch off or power fail */ 1120 /* switch off or power fail */
1104 1121
1105 if(psi_reg.subregs.supply & PSI_SWITCH_OFF) { 1122 if (psi_reg.subregs.supply & PSI_SWITCH_OFF) {
1106 if(voyager_status.switch_off) { 1123 if (voyager_status.switch_off) {
1107 printk(KERN_ERR "Voyager front panel switch turned off again---Immediate power off!\n"); 1124 printk(KERN_ERR
1125 "Voyager front panel switch turned off again---Immediate power off!\n");
1108 voyager_cat_power_off(); 1126 voyager_cat_power_off();
1109 /* not reached */ 1127 /* not reached */
1110 } else { 1128 } else {
1111 printk(KERN_ERR "Voyager front panel switch turned off\n"); 1129 printk(KERN_ERR
1130 "Voyager front panel switch turned off\n");
1112 voyager_status.switch_off = 1; 1131 voyager_status.switch_off = 1;
1113 voyager_status.request_from_kernel = 1; 1132 voyager_status.request_from_kernel = 1;
1114 wake_up_process(voyager_thread); 1133 wake_up_process(voyager_thread);
@@ -1127,7 +1146,7 @@ voyager_cat_do_common_interrupt(void)
1127 1146
1128 VDEBUG(("Voyager ac fail reg 0x%x\n", 1147 VDEBUG(("Voyager ac fail reg 0x%x\n",
1129 psi_reg.subregs.ACfail)); 1148 psi_reg.subregs.ACfail));
1130 if((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) { 1149 if ((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) {
1131 /* No further update */ 1150 /* No further update */
1132 return; 1151 return;
1133 } 1152 }
@@ -1135,20 +1154,20 @@ voyager_cat_do_common_interrupt(void)
1135 /* Don't bother trying to find out who failed. 1154 /* Don't bother trying to find out who failed.
1136 * FIXME: This probably makes the code incorrect on 1155 * FIXME: This probably makes the code incorrect on
1137 * anything other than a 345x */ 1156 * anything other than a 345x */
1138 for(i=0; i< 5; i++) { 1157 for (i = 0; i < 5; i++) {
1139 if( psi_reg.subregs.ACfail &(1<<i)) { 1158 if (psi_reg.subregs.ACfail & (1 << i)) {
1140 break; 1159 break;
1141 } 1160 }
1142 } 1161 }
1143 printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i); 1162 printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i);
1144#endif 1163#endif
1145 /* DON'T do this: it shuts down the AC PSI 1164 /* DON'T do this: it shuts down the AC PSI
1146 outb(VOYAGER_CAT_RUN, CAT_CMD); 1165 outb(VOYAGER_CAT_RUN, CAT_CMD);
1147 data = PSI_MASK_MASK | i; 1166 data = PSI_MASK_MASK | i;
1148 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK, 1167 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK,
1149 1, &data); 1168 1, &data);
1150 outb(VOYAGER_CAT_END, CAT_CMD); 1169 outb(VOYAGER_CAT_END, CAT_CMD);
1151 */ 1170 */
1152 printk(KERN_ERR "Voyager AC power failure\n"); 1171 printk(KERN_ERR "Voyager AC power failure\n");
1153 outb(VOYAGER_CAT_RUN, CAT_CMD); 1172 outb(VOYAGER_CAT_RUN, CAT_CMD);
1154 data = PSI_COLD_START; 1173 data = PSI_COLD_START;
@@ -1159,16 +1178,16 @@ voyager_cat_do_common_interrupt(void)
1159 voyager_status.request_from_kernel = 1; 1178 voyager_status.request_from_kernel = 1;
1160 wake_up_process(voyager_thread); 1179 wake_up_process(voyager_thread);
1161 } 1180 }
1162 1181
1163 1182 } else if (psi_reg.regs.intstatus & PSI_FAULT) {
1164 } else if(psi_reg.regs.intstatus & PSI_FAULT) {
1165 /* Major fault! */ 1183 /* Major fault! */
1166 printk(KERN_ERR "Voyager PSI Detected major fault, immediate power off!\n"); 1184 printk(KERN_ERR
1185 "Voyager PSI Detected major fault, immediate power off!\n");
1167 voyager_cat_power_off(); 1186 voyager_cat_power_off();
1168 /* not reached */ 1187 /* not reached */
1169 } else if(psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM 1188 } else if (psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM
1170 | PSI_CURRENT | PSI_DVM 1189 | PSI_CURRENT | PSI_DVM
1171 | PSI_PSCFAULT | PSI_STAT_CHG)) { 1190 | PSI_PSCFAULT | PSI_STAT_CHG)) {
1172 /* other psi fault */ 1191 /* other psi fault */
1173 1192
1174 printk(KERN_WARNING "Voyager PSI status 0x%x\n", data); 1193 printk(KERN_WARNING "Voyager PSI status 0x%x\n", data);
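[Editor's note] The cat_getdata()/cat_senddata() paths above splice register values into and out of the serial scan-path byte stream with cat_pack()/cat_unpack(), whose definitions are not part of these hunks. The sketch below only illustrates the general idea of packing a value into a byte buffer at an arbitrary bit offset; it is not the kernel's implementation. The helper names (pack_bits/unpack_bits), the MSB-first bit order, and the userspace types are assumptions made purely for the example.

/*
 * Illustrative sketch only -- NOT voyager_cat.c's cat_pack()/cat_unpack().
 * Bit ordering (MSB-first within each byte) is an assumption; the real
 * scan-path ordering is defined elsewhere in the driver.
 */
#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BYTE 8

/* Read one bit from a byte stream at the given bit offset. */
static int get_bit(const uint8_t *buf, unsigned int bitoff)
{
	return (buf[bitoff / BITS_PER_BYTE] >>
		(BITS_PER_BYTE - 1 - bitoff % BITS_PER_BYTE)) & 1;
}

/* Set or clear one bit in a byte stream at the given bit offset. */
static void put_bit(uint8_t *buf, unsigned int bitoff, int val)
{
	uint8_t mask = 1 << (BITS_PER_BYTE - 1 - bitoff % BITS_PER_BYTE);

	if (val)
		buf[bitoff / BITS_PER_BYTE] |= mask;
	else
		buf[bitoff / BITS_PER_BYTE] &= ~mask;
}

/* Splice 'len' bits of 'value' into 'stream' starting at bit 'offset'. */
static void pack_bits(uint8_t *stream, unsigned int offset,
		      const uint8_t *value, unsigned int len)
{
	unsigned int i;

	for (i = 0; i < len; i++)
		put_bit(stream, offset + i, get_bit(value, i));
}

/* Extract 'len' bits from 'stream' at bit 'offset' into 'value'. */
static void unpack_bits(const uint8_t *stream, unsigned int offset,
			uint8_t *value, unsigned int len)
{
	unsigned int i;

	for (i = 0; i < len; i++)
		put_bit(value, i, get_bit(stream, offset + i));
}

int main(void)
{
	uint8_t stream[4] = { 0 };
	uint8_t in = 0xb4, out = 0;

	/* Round-trip an 8-bit register value at an odd bit offset (5),
	 * the way a register is dropped into the middle of a scan path. */
	pack_bits(stream, 5, &in, 8);
	unpack_bits(stream, 5, &out, 8);
	printf("in=0x%02x out=0x%02x\n", in, out);
	return 0;
}

Under these assumptions, the padbits/sbytes arithmetic in cat_getdata() corresponds to rounding a bit count up to whole bytes before shifting the stream through CAT_DATA one byte at a time.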
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 88124dd3540..3cc8eb2f36a 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -32,7 +32,8 @@
32DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0 }; 32DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0 };
33 33
34/* CPU IRQ affinity -- set to all ones initially */ 34/* CPU IRQ affinity -- set to all ones initially */
35static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = ~0UL }; 35static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned =
36 {[0 ... NR_CPUS-1] = ~0UL };
36 37
37/* per CPU data structure (for /proc/cpuinfo et al), visible externally 38/* per CPU data structure (for /proc/cpuinfo et al), visible externally
38 * indexed physically */ 39 * indexed physically */
@@ -76,7 +77,6 @@ EXPORT_SYMBOL(cpu_online_map);
76 * by scheduler but indexed physically */ 77 * by scheduler but indexed physically */
77cpumask_t phys_cpu_present_map = CPU_MASK_NONE; 78cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
78 79
79
80/* The internal functions */ 80/* The internal functions */
81static void send_CPI(__u32 cpuset, __u8 cpi); 81static void send_CPI(__u32 cpuset, __u8 cpi);
82static void ack_CPI(__u8 cpi); 82static void ack_CPI(__u8 cpi);
@@ -101,94 +101,86 @@ int hard_smp_processor_id(void);
101int safe_smp_processor_id(void); 101int safe_smp_processor_id(void);
102 102
103/* Inline functions */ 103/* Inline functions */
104static inline void 104static inline void send_one_QIC_CPI(__u8 cpu, __u8 cpi)
105send_one_QIC_CPI(__u8 cpu, __u8 cpi)
106{ 105{
107 voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi = 106 voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi =
108 (smp_processor_id() << 16) + cpi; 107 (smp_processor_id() << 16) + cpi;
109} 108}
110 109
111static inline void 110static inline void send_QIC_CPI(__u32 cpuset, __u8 cpi)
112send_QIC_CPI(__u32 cpuset, __u8 cpi)
113{ 111{
114 int cpu; 112 int cpu;
115 113
116 for_each_online_cpu(cpu) { 114 for_each_online_cpu(cpu) {
117 if(cpuset & (1<<cpu)) { 115 if (cpuset & (1 << cpu)) {
118#ifdef VOYAGER_DEBUG 116#ifdef VOYAGER_DEBUG
119 if(!cpu_isset(cpu, cpu_online_map)) 117 if (!cpu_isset(cpu, cpu_online_map))
120 VDEBUG(("CPU%d sending cpi %d to CPU%d not in cpu_online_map\n", hard_smp_processor_id(), cpi, cpu)); 118 VDEBUG(("CPU%d sending cpi %d to CPU%d not in "
119 "cpu_online_map\n",
120 hard_smp_processor_id(), cpi, cpu));
121#endif 121#endif
122 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); 122 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
123 } 123 }
124 } 124 }
125} 125}
126 126
127static inline void 127static inline void wrapper_smp_local_timer_interrupt(void)
128wrapper_smp_local_timer_interrupt(void)
129{ 128{
130 irq_enter(); 129 irq_enter();
131 smp_local_timer_interrupt(); 130 smp_local_timer_interrupt();
132 irq_exit(); 131 irq_exit();
133} 132}
134 133
135static inline void 134static inline void send_one_CPI(__u8 cpu, __u8 cpi)
136send_one_CPI(__u8 cpu, __u8 cpi)
137{ 135{
138 if(voyager_quad_processors & (1<<cpu)) 136 if (voyager_quad_processors & (1 << cpu))
139 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); 137 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
140 else 138 else
141 send_CPI(1<<cpu, cpi); 139 send_CPI(1 << cpu, cpi);
142} 140}
143 141
144static inline void 142static inline void send_CPI_allbutself(__u8 cpi)
145send_CPI_allbutself(__u8 cpi)
146{ 143{
147 __u8 cpu = smp_processor_id(); 144 __u8 cpu = smp_processor_id();
148 __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu); 145 __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu);
149 send_CPI(mask, cpi); 146 send_CPI(mask, cpi);
150} 147}
151 148
152static inline int 149static inline int is_cpu_quad(void)
153is_cpu_quad(void)
154{ 150{
155 __u8 cpumask = inb(VIC_PROC_WHO_AM_I); 151 __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
156 return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER); 152 return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER);
157} 153}
158 154
159static inline int 155static inline int is_cpu_extended(void)
160is_cpu_extended(void)
161{ 156{
162 __u8 cpu = hard_smp_processor_id(); 157 __u8 cpu = hard_smp_processor_id();
163 158
164 return(voyager_extended_vic_processors & (1<<cpu)); 159 return (voyager_extended_vic_processors & (1 << cpu));
165} 160}
166 161
167static inline int 162static inline int is_cpu_vic_boot(void)
168is_cpu_vic_boot(void)
169{ 163{
170 __u8 cpu = hard_smp_processor_id(); 164 __u8 cpu = hard_smp_processor_id();
171 165
172 return(voyager_extended_vic_processors 166 return (voyager_extended_vic_processors
173 & voyager_allowed_boot_processors & (1<<cpu)); 167 & voyager_allowed_boot_processors & (1 << cpu));
174} 168}
175 169
176 170static inline void ack_CPI(__u8 cpi)
177static inline void
178ack_CPI(__u8 cpi)
179{ 171{
180 switch(cpi) { 172 switch (cpi) {
181 case VIC_CPU_BOOT_CPI: 173 case VIC_CPU_BOOT_CPI:
182 if(is_cpu_quad() && !is_cpu_vic_boot()) 174 if (is_cpu_quad() && !is_cpu_vic_boot())
183 ack_QIC_CPI(cpi); 175 ack_QIC_CPI(cpi);
184 else 176 else
185 ack_VIC_CPI(cpi); 177 ack_VIC_CPI(cpi);
186 break; 178 break;
187 case VIC_SYS_INT: 179 case VIC_SYS_INT:
188 case VIC_CMN_INT: 180 case VIC_CMN_INT:
189 /* These are slightly strange. Even on the Quad card, 181 /* These are slightly strange. Even on the Quad card,
190 * They are vectored as VIC CPIs */ 182 * They are vectored as VIC CPIs */
191 if(is_cpu_quad()) 183 if (is_cpu_quad())
192 ack_special_QIC_CPI(cpi); 184 ack_special_QIC_CPI(cpi);
193 else 185 else
194 ack_VIC_CPI(cpi); 186 ack_VIC_CPI(cpi);
@@ -205,11 +197,11 @@ ack_CPI(__u8 cpi)
205 * 8259 IRQs except that masks and things must be kept per processor 197 * 8259 IRQs except that masks and things must be kept per processor
206 */ 198 */
207static struct irq_chip vic_chip = { 199static struct irq_chip vic_chip = {
208 .name = "VIC", 200 .name = "VIC",
209 .startup = startup_vic_irq, 201 .startup = startup_vic_irq,
210 .mask = mask_vic_irq, 202 .mask = mask_vic_irq,
211 .unmask = unmask_vic_irq, 203 .unmask = unmask_vic_irq,
212 .set_affinity = set_vic_irq_affinity, 204 .set_affinity = set_vic_irq_affinity,
213}; 205};
214 206
215/* used to count up as CPUs are brought on line (starts at 0) */ 207/* used to count up as CPUs are brought on line (starts at 0) */
@@ -223,7 +215,7 @@ static __u32 trampoline_base;
223/* The per cpu profile stuff - used in smp_local_timer_interrupt */ 215/* The per cpu profile stuff - used in smp_local_timer_interrupt */
224static DEFINE_PER_CPU(int, prof_multiplier) = 1; 216static DEFINE_PER_CPU(int, prof_multiplier) = 1;
225static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; 217static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
226static DEFINE_PER_CPU(int, prof_counter) = 1; 218static DEFINE_PER_CPU(int, prof_counter) = 1;
227 219
228/* the map used to check if a CPU has booted */ 220/* the map used to check if a CPU has booted */
229static __u32 cpu_booted_map; 221static __u32 cpu_booted_map;
@@ -235,7 +227,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
235/* This is for the new dynamic CPU boot code */ 227/* This is for the new dynamic CPU boot code */
236cpumask_t cpu_callin_map = CPU_MASK_NONE; 228cpumask_t cpu_callin_map = CPU_MASK_NONE;
237cpumask_t cpu_callout_map = CPU_MASK_NONE; 229cpumask_t cpu_callout_map = CPU_MASK_NONE;
238EXPORT_SYMBOL(cpu_callout_map);
239cpumask_t cpu_possible_map = CPU_MASK_NONE; 230cpumask_t cpu_possible_map = CPU_MASK_NONE;
240EXPORT_SYMBOL(cpu_possible_map); 231EXPORT_SYMBOL(cpu_possible_map);
241 232
@@ -246,9 +237,9 @@ static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
246static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 }; 237static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 };
247 238
248/* Lock for enable/disable of VIC interrupts */ 239/* Lock for enable/disable of VIC interrupts */
249static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock); 240static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock);
250 241
251/* The boot processor is correctly set up in PC mode when it 242/* The boot processor is correctly set up in PC mode when it
252 * comes up, but the secondaries need their master/slave 8259 243 * comes up, but the secondaries need their master/slave 8259
253 * pairs initializing correctly */ 244 * pairs initializing correctly */
254 245
@@ -262,8 +253,7 @@ static unsigned long vic_tick[NR_CPUS] __cacheline_aligned = { 0 };
262static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned; 253static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned;
263 254
264/* debugging routine to read the isr of the cpu's pic */ 255/* debugging routine to read the isr of the cpu's pic */
265static inline __u16 256static inline __u16 vic_read_isr(void)
266vic_read_isr(void)
267{ 257{
268 __u16 isr; 258 __u16 isr;
269 259
@@ -275,17 +265,16 @@ vic_read_isr(void)
275 return isr; 265 return isr;
276} 266}
277 267
278static __init void 268static __init void qic_setup(void)
279qic_setup(void)
280{ 269{
281 if(!is_cpu_quad()) { 270 if (!is_cpu_quad()) {
282 /* not a quad, no setup */ 271 /* not a quad, no setup */
283 return; 272 return;
284 } 273 }
285 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); 274 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
286 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); 275 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
287 276
288 if(is_cpu_extended()) { 277 if (is_cpu_extended()) {
289 /* the QIC duplicate of the VIC base register */ 278 /* the QIC duplicate of the VIC base register */
290 outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER); 279 outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER);
291 outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER); 280 outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER);
@@ -295,8 +284,7 @@ qic_setup(void)
295 } 284 }
296} 285}
297 286
298static __init void 287static __init void vic_setup_pic(void)
299vic_setup_pic(void)
300{ 288{
301 outb(1, VIC_REDIRECT_REGISTER_1); 289 outb(1, VIC_REDIRECT_REGISTER_1);
302 /* clear the claim registers for dynamic routing */ 290 /* clear the claim registers for dynamic routing */
@@ -333,7 +321,7 @@ vic_setup_pic(void)
333 321
334 /* ICW2: slave vector base */ 322 /* ICW2: slave vector base */
335 outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1); 323 outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1);
336 324
337 /* ICW3: slave ID */ 325 /* ICW3: slave ID */
338 outb(0x02, 0xA1); 326 outb(0x02, 0xA1);
339 327
@@ -341,19 +329,18 @@ vic_setup_pic(void)
341 outb(0x01, 0xA1); 329 outb(0x01, 0xA1);
342} 330}
343 331
344static void 332static void do_quad_bootstrap(void)
345do_quad_bootstrap(void)
346{ 333{
347 if(is_cpu_quad() && is_cpu_vic_boot()) { 334 if (is_cpu_quad() && is_cpu_vic_boot()) {
348 int i; 335 int i;
349 unsigned long flags; 336 unsigned long flags;
350 __u8 cpuid = hard_smp_processor_id(); 337 __u8 cpuid = hard_smp_processor_id();
351 338
352 local_irq_save(flags); 339 local_irq_save(flags);
353 340
354 for(i = 0; i<4; i++) { 341 for (i = 0; i < 4; i++) {
355 /* FIXME: this would be >>3 &0x7 on the 32 way */ 342 /* FIXME: this would be >>3 &0x7 on the 32 way */
356 if(((cpuid >> 2) & 0x03) == i) 343 if (((cpuid >> 2) & 0x03) == i)
357 /* don't lower our own mask! */ 344 /* don't lower our own mask! */
358 continue; 345 continue;
359 346
@@ -368,12 +355,10 @@ do_quad_bootstrap(void)
368 } 355 }
369} 356}
370 357
371
372/* Set up all the basic stuff: read the SMP config and make all the 358/* Set up all the basic stuff: read the SMP config and make all the
373 * SMP information reflect only the boot cpu. All others will be 359 * SMP information reflect only the boot cpu. All others will be
374 * brought on-line later. */ 360 * brought on-line later. */
375void __init 361void __init find_smp_config(void)
376find_smp_config(void)
377{ 362{
378 int i; 363 int i;
379 364
@@ -382,24 +367,31 @@ find_smp_config(void)
382 printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id); 367 printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id);
383 368
384 /* initialize the CPU structures (moved from smp_boot_cpus) */ 369 /* initialize the CPU structures (moved from smp_boot_cpus) */
385 for(i=0; i<NR_CPUS; i++) { 370 for (i = 0; i < NR_CPUS; i++) {
386 cpu_irq_affinity[i] = ~0; 371 cpu_irq_affinity[i] = ~0;
387 } 372 }
388 cpu_online_map = cpumask_of_cpu(boot_cpu_id); 373 cpu_online_map = cpumask_of_cpu(boot_cpu_id);
389 374
390 /* The boot CPU must be extended */ 375 /* The boot CPU must be extended */
391 voyager_extended_vic_processors = 1<<boot_cpu_id; 376 voyager_extended_vic_processors = 1 << boot_cpu_id;
392 /* initially, all of the first 8 CPUs can boot */ 377 /* initially, all of the first 8 CPUs can boot */
393 voyager_allowed_boot_processors = 0xff; 378 voyager_allowed_boot_processors = 0xff;
394 /* set up everything for just this CPU, we can alter 379 /* set up everything for just this CPU, we can alter
395 * this as we start the other CPUs later */ 380 * this as we start the other CPUs later */
396 /* now get the CPU disposition from the extended CMOS */ 381 /* now get the CPU disposition from the extended CMOS */
397 cpus_addr(phys_cpu_present_map)[0] = voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK); 382 cpus_addr(phys_cpu_present_map)[0] =
398 cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8; 383 voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK);
399 cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 2) << 16; 384 cpus_addr(phys_cpu_present_map)[0] |=
400 cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 3) << 24; 385 voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8;
386 cpus_addr(phys_cpu_present_map)[0] |=
387 voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK +
388 2) << 16;
389 cpus_addr(phys_cpu_present_map)[0] |=
390 voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK +
391 3) << 24;
401 cpu_possible_map = phys_cpu_present_map; 392 cpu_possible_map = phys_cpu_present_map;
402 printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n", cpus_addr(phys_cpu_present_map)[0]); 393 printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n",
394 cpus_addr(phys_cpu_present_map)[0]);
403 /* Here we set up the VIC to enable SMP */ 395 /* Here we set up the VIC to enable SMP */
404 /* enable the CPIs by writing the base vector to their register */ 396 /* enable the CPIs by writing the base vector to their register */
405 outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER); 397 outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER);
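The four consecutive CMOS reads build one 32-bit physical CPU present mask, least significant byte first. A minimal standalone sketch of that assembly, with a made-up cmos_read_byte() standing in for voyager_extended_cmos_read():

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for voyager_extended_cmos_read(): one CMOS byte per call. */
static uint8_t cmos_read_byte(unsigned int offset)
{
        static const uint8_t fake_cmos[4] = { 0x0f, 0x00, 0x00, 0x00 }; /* CPUs 0-3 */
        return fake_cmos[offset & 3];
}

int main(void)
{
        uint32_t present_map = 0;
        unsigned int i;

        /* OR the four shifted bytes together, as find_smp_config() does. */
        for (i = 0; i < 4; i++)
                present_map |= (uint32_t)cmos_read_byte(i) << (8 * i);

        printf("phys_cpu_present_map = 0x%08x\n", (unsigned int)present_map);
        return 0;
}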
@@ -427,8 +419,7 @@ find_smp_config(void)
427/* 419/*
428 * The bootstrap kernel entry code has set these up. Save them 420 * The bootstrap kernel entry code has set these up. Save them
429 * for a given CPU, id is physical */ 421 * for a given CPU, id is physical */
430void __init 422void __init smp_store_cpu_info(int id)
431smp_store_cpu_info(int id)
432{ 423{
433 struct cpuinfo_x86 *c = &cpu_data(id); 424 struct cpuinfo_x86 *c = &cpu_data(id);
434 425
@@ -438,25 +429,21 @@ smp_store_cpu_info(int id)
438} 429}
439 430
440/* set up the trampoline and return the physical address of the code */ 431/* set up the trampoline and return the physical address of the code */
441static __u32 __init 432static __u32 __init setup_trampoline(void)
442setup_trampoline(void)
443{ 433{
444 /* these two are global symbols in trampoline.S */ 434 /* these two are global symbols in trampoline.S */
445 extern const __u8 trampoline_end[]; 435 extern const __u8 trampoline_end[];
446 extern const __u8 trampoline_data[]; 436 extern const __u8 trampoline_data[];
447 437
448 memcpy((__u8 *)trampoline_base, trampoline_data, 438 memcpy((__u8 *) trampoline_base, trampoline_data,
449 trampoline_end - trampoline_data); 439 trampoline_end - trampoline_data);
450 return virt_to_phys((__u8 *)trampoline_base); 440 return virt_to_phys((__u8 *) trampoline_base);
451} 441}
452 442
453/* Routine initially called when a non-boot CPU is brought online */ 443/* Routine initially called when a non-boot CPU is brought online */
454static void __init 444static void __init start_secondary(void *unused)
455start_secondary(void *unused)
456{ 445{
457 __u8 cpuid = hard_smp_processor_id(); 446 __u8 cpuid = hard_smp_processor_id();
458 /* external functions not defined in the headers */
459 extern void calibrate_delay(void);
460 447
461 cpu_init(); 448 cpu_init();
462 449
@@ -464,17 +451,18 @@ start_secondary(void *unused)
464 ack_CPI(VIC_CPU_BOOT_CPI); 451 ack_CPI(VIC_CPU_BOOT_CPI);
465 452
466 /* setup the 8259 master slave pair belonging to this CPU --- 453 /* setup the 8259 master slave pair belonging to this CPU ---
467 * we won't actually receive any until the boot CPU 454 * we won't actually receive any until the boot CPU
468 	 * relinquishes its static routing mask */ 455 	 * relinquishes its static routing mask */
469 vic_setup_pic(); 456 vic_setup_pic();
470 457
471 qic_setup(); 458 qic_setup();
472 459
473 if(is_cpu_quad() && !is_cpu_vic_boot()) { 460 if (is_cpu_quad() && !is_cpu_vic_boot()) {
474 /* clear the boot CPI */ 461 /* clear the boot CPI */
475 __u8 dummy; 462 __u8 dummy;
476 463
477 dummy = voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi; 464 dummy =
465 voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi;
478 printk("read dummy %d\n", dummy); 466 printk("read dummy %d\n", dummy);
479 } 467 }
480 468
@@ -516,7 +504,6 @@ start_secondary(void *unused)
516 cpu_idle(); 504 cpu_idle();
517} 505}
518 506
519
520/* Routine to kick start the given CPU and wait for it to report ready 507/* Routine to kick start the given CPU and wait for it to report ready
521 * (or timeout in startup). When this routine returns, the requested 508 * (or timeout in startup). When this routine returns, the requested
522 * CPU is either fully running and configured or known to be dead. 509 * CPU is either fully running and configured or known to be dead.
@@ -524,29 +511,28 @@ start_secondary(void *unused)
524 * We call this routine sequentially 1 CPU at a time, so no need for 511 * We call this routine sequentially 1 CPU at a time, so no need for
525 * locking */ 512 * locking */
526 513
527static void __init 514static void __init do_boot_cpu(__u8 cpu)
528do_boot_cpu(__u8 cpu)
529{ 515{
530 struct task_struct *idle; 516 struct task_struct *idle;
531 int timeout; 517 int timeout;
532 unsigned long flags; 518 unsigned long flags;
533 int quad_boot = (1<<cpu) & voyager_quad_processors 519 int quad_boot = (1 << cpu) & voyager_quad_processors
534 & ~( voyager_extended_vic_processors 520 & ~(voyager_extended_vic_processors
535 & voyager_allowed_boot_processors); 521 & voyager_allowed_boot_processors);
536 522
537 /* This is an area in head.S which was used to set up the 523 /* This is an area in head.S which was used to set up the
538 * initial kernel stack. We need to alter this to give the 524 * initial kernel stack. We need to alter this to give the
539 * booting CPU a new stack (taken from its idle process) */ 525 * booting CPU a new stack (taken from its idle process) */
540 extern struct { 526 extern struct {
541 __u8 *esp; 527 __u8 *sp;
542 unsigned short ss; 528 unsigned short ss;
543 } stack_start; 529 } stack_start;
544 /* This is the format of the CPI IDT gate (in real mode) which 530 /* This is the format of the CPI IDT gate (in real mode) which
545 * we're hijacking to boot the CPU */ 531 * we're hijacking to boot the CPU */
546 union IDTFormat { 532 union IDTFormat {
547 struct seg { 533 struct seg {
548 __u16 Offset; 534 __u16 Offset;
549 __u16 Segment; 535 __u16 Segment;
550 } idt; 536 } idt;
551 __u32 val; 537 __u32 val;
552 } hijack_source; 538 } hijack_source;
@@ -565,37 +551,44 @@ do_boot_cpu(__u8 cpu)
565 alternatives_smp_switch(1); 551 alternatives_smp_switch(1);
566 552
567 idle = fork_idle(cpu); 553 idle = fork_idle(cpu);
568 if(IS_ERR(idle)) 554 if (IS_ERR(idle))
569 panic("failed fork for CPU%d", cpu); 555 panic("failed fork for CPU%d", cpu);
570 idle->thread.eip = (unsigned long) start_secondary; 556 idle->thread.ip = (unsigned long)start_secondary;
571 /* init_tasks (in sched.c) is indexed logically */ 557 /* init_tasks (in sched.c) is indexed logically */
572 stack_start.esp = (void *) idle->thread.esp; 558 stack_start.sp = (void *)idle->thread.sp;
573 559
574 init_gdt(cpu); 560 init_gdt(cpu);
575 per_cpu(current_task, cpu) = idle; 561 per_cpu(current_task, cpu) = idle;
576 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 562 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
577 irq_ctx_init(cpu); 563 irq_ctx_init(cpu);
578 564
579 /* Note: Don't modify initial ss override */ 565 /* Note: Don't modify initial ss override */
580 VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu, 566 VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu,
581 (unsigned long)hijack_source.val, hijack_source.idt.Segment, 567 (unsigned long)hijack_source.val, hijack_source.idt.Segment,
582 hijack_source.idt.Offset, stack_start.esp)); 568 hijack_source.idt.Offset, stack_start.sp));
583 569
584 /* init lowmem identity mapping */ 570 /* init lowmem identity mapping */
585 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 571 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
586 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); 572 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
587 flush_tlb_all(); 573 flush_tlb_all();
588 574
589 if(quad_boot) { 575 if (quad_boot) {
590 printk("CPU %d: non extended Quad boot\n", cpu); 576 printk("CPU %d: non extended Quad boot\n", cpu);
591 hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE)*4); 577 hijack_vector =
578 (__u32 *)
579 phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE) * 4);
592 *hijack_vector = hijack_source.val; 580 *hijack_vector = hijack_source.val;
593 } else { 581 } else {
594 printk("CPU%d: extended VIC boot\n", cpu); 582 printk("CPU%d: extended VIC boot\n", cpu);
595 hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE)*4); 583 hijack_vector =
584 (__u32 *)
585 phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE) * 4);
596 *hijack_vector = hijack_source.val; 586 *hijack_vector = hijack_source.val;
597 /* VIC errata, may also receive interrupt at this address */ 587 /* VIC errata, may also receive interrupt at this address */
598 hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + VIC_DEFAULT_CPI_BASE)*4); 588 hijack_vector =
589 (__u32 *)
590 phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI +
591 VIC_DEFAULT_CPI_BASE) * 4);
599 *hijack_vector = hijack_source.val; 592 *hijack_vector = hijack_source.val;
600 } 593 }
601 /* All non-boot CPUs start with interrupts fully masked. Need 594 /* All non-boot CPUs start with interrupts fully masked. Need
@@ -603,73 +596,76 @@ do_boot_cpu(__u8 cpu)
603 * this in the VIC by masquerading as the processor we're 596 * this in the VIC by masquerading as the processor we're
604 * about to boot and lowering its interrupt mask */ 597 * about to boot and lowering its interrupt mask */
605 local_irq_save(flags); 598 local_irq_save(flags);
606 if(quad_boot) { 599 if (quad_boot) {
607 send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI); 600 send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI);
608 } else { 601 } else {
609 outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID); 602 outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID);
610 /* here we're altering registers belonging to `cpu' */ 603 /* here we're altering registers belonging to `cpu' */
611 604
612 outb(VIC_BOOT_INTERRUPT_MASK, 0x21); 605 outb(VIC_BOOT_INTERRUPT_MASK, 0x21);
613 /* now go back to our original identity */ 606 /* now go back to our original identity */
614 outb(boot_cpu_id, VIC_PROCESSOR_ID); 607 outb(boot_cpu_id, VIC_PROCESSOR_ID);
615 608
616 /* and boot the CPU */ 609 /* and boot the CPU */
617 610
618 send_CPI((1<<cpu), VIC_CPU_BOOT_CPI); 611 send_CPI((1 << cpu), VIC_CPU_BOOT_CPI);
619 } 612 }
620 cpu_booted_map = 0; 613 cpu_booted_map = 0;
621 local_irq_restore(flags); 614 local_irq_restore(flags);
622 615
623 /* now wait for it to become ready (or timeout) */ 616 /* now wait for it to become ready (or timeout) */
624 for(timeout = 0; timeout < 50000; timeout++) { 617 for (timeout = 0; timeout < 50000; timeout++) {
625 if(cpu_booted_map) 618 if (cpu_booted_map)
626 break; 619 break;
627 udelay(100); 620 udelay(100);
628 } 621 }
629 /* reset the page table */ 622 /* reset the page table */
630 zap_low_mappings(); 623 zap_low_mappings();
631 624
632 if (cpu_booted_map) { 625 if (cpu_booted_map) {
633 VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n", 626 VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n",
634 cpu, smp_processor_id())); 627 cpu, smp_processor_id()));
635 628
636 printk("CPU%d: ", cpu); 629 printk("CPU%d: ", cpu);
637 print_cpu_info(&cpu_data(cpu)); 630 print_cpu_info(&cpu_data(cpu));
638 wmb(); 631 wmb();
639 cpu_set(cpu, cpu_callout_map); 632 cpu_set(cpu, cpu_callout_map);
640 cpu_set(cpu, cpu_present_map); 633 cpu_set(cpu, cpu_present_map);
641 } 634 } else {
642 else {
643 printk("CPU%d FAILED TO BOOT: ", cpu); 635 printk("CPU%d FAILED TO BOOT: ", cpu);
644 if (*((volatile unsigned char *)phys_to_virt(start_phys_address))==0xA5) 636 if (*
637 ((volatile unsigned char *)phys_to_virt(start_phys_address))
638 == 0xA5)
645 printk("Stuck.\n"); 639 printk("Stuck.\n");
646 else 640 else
647 printk("Not responding.\n"); 641 printk("Not responding.\n");
648 642
649 cpucount--; 643 cpucount--;
650 } 644 }
651} 645}
652 646
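do_boot_cpu() starts a secondary processor by writing a real-mode far pointer (a 16-bit offset followed by a 16-bit segment) over the four-byte interrupt-vector slot of the boot CPI, so the CPI drops the target CPU straight into the trampoline. A rough sketch of that vector patching, using a plain buffer as a stand-in for low physical memory; the vector number and target address below are invented for the example:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t low_mem[1024]; /* stand-in for the real-mode interrupt vector table */

/* Patch IVT entry 'vector' with segment:offset, as do_boot_cpu() hijacks the boot CPI. */
static void patch_ivt_entry(unsigned int vector, uint16_t segment, uint16_t offset)
{
        uint32_t gate = ((uint32_t)segment << 16) | offset;

        /* On little-endian x86 this lands as offset-low, offset-high,
         * segment-low, segment-high: exactly the real-mode IVT layout. */
        memcpy(&low_mem[vector * 4], &gate, sizeof(gate));
}

int main(void)
{
        patch_ivt_entry(0x30, 0x9300, 0x0000); /* hypothetical boot CPI vector */
        printf("IVT[0x30] = %02x %02x %02x %02x\n",
               low_mem[0xc0], low_mem[0xc1], low_mem[0xc2], low_mem[0xc3]);
        return 0;
}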
653void __init 647void __init smp_boot_cpus(void)
654smp_boot_cpus(void)
655{ 648{
656 int i; 649 int i;
657 650
658 /* CAT BUS initialisation must be done after the memory */ 651 /* CAT BUS initialisation must be done after the memory */
659 /* FIXME: The L4 has a catbus too, it just needs to be 652 /* FIXME: The L4 has a catbus too, it just needs to be
660 * accessed in a totally different way */ 653 * accessed in a totally different way */
661 if(voyager_level == 5) { 654 if (voyager_level == 5) {
662 voyager_cat_init(); 655 voyager_cat_init();
663 656
664 /* now that the cat has probed the Voyager System Bus, sanity 657 /* now that the cat has probed the Voyager System Bus, sanity
665 * check the cpu map */ 658 * check the cpu map */
666 if( ((voyager_quad_processors | voyager_extended_vic_processors) 659 if (((voyager_quad_processors | voyager_extended_vic_processors)
667 & cpus_addr(phys_cpu_present_map)[0]) != cpus_addr(phys_cpu_present_map)[0]) { 660 & cpus_addr(phys_cpu_present_map)[0]) !=
661 cpus_addr(phys_cpu_present_map)[0]) {
668 /* should panic */ 662 /* should panic */
669 printk("\n\n***WARNING*** Sanity check of CPU present map FAILED\n"); 663 printk("\n\n***WARNING*** "
664 "Sanity check of CPU present map FAILED\n");
670 } 665 }
671 } else if(voyager_level == 4) 666 } else if (voyager_level == 4)
672 voyager_extended_vic_processors = cpus_addr(phys_cpu_present_map)[0]; 667 voyager_extended_vic_processors =
668 cpus_addr(phys_cpu_present_map)[0];
673 669
674 /* this sets up the idle task to run on the current cpu */ 670 /* this sets up the idle task to run on the current cpu */
675 voyager_extended_cpus = 1; 671 voyager_extended_cpus = 1;
@@ -678,14 +674,14 @@ smp_boot_cpus(void)
678 //global_irq_holder = boot_cpu_id; 674 //global_irq_holder = boot_cpu_id;
679 675
680 /* FIXME: Need to do something about this but currently only works 676 /* FIXME: Need to do something about this but currently only works
681 * on CPUs with a tsc which none of mine have. 677 * on CPUs with a tsc which none of mine have.
682 smp_tune_scheduling(); 678 smp_tune_scheduling();
683 */ 679 */
684 smp_store_cpu_info(boot_cpu_id); 680 smp_store_cpu_info(boot_cpu_id);
685 printk("CPU%d: ", boot_cpu_id); 681 printk("CPU%d: ", boot_cpu_id);
686 print_cpu_info(&cpu_data(boot_cpu_id)); 682 print_cpu_info(&cpu_data(boot_cpu_id));
687 683
688 if(is_cpu_quad()) { 684 if (is_cpu_quad()) {
689 /* booting on a Quad CPU */ 685 /* booting on a Quad CPU */
690 printk("VOYAGER SMP: Boot CPU is Quad\n"); 686 printk("VOYAGER SMP: Boot CPU is Quad\n");
691 qic_setup(); 687 qic_setup();
@@ -697,11 +693,11 @@ smp_boot_cpus(void)
697 693
698 cpu_set(boot_cpu_id, cpu_online_map); 694 cpu_set(boot_cpu_id, cpu_online_map);
699 cpu_set(boot_cpu_id, cpu_callout_map); 695 cpu_set(boot_cpu_id, cpu_callout_map);
700 696
701 /* loop over all the extended VIC CPUs and boot them. The 697 /* loop over all the extended VIC CPUs and boot them. The
702 * Quad CPUs must be bootstrapped by their extended VIC cpu */ 698 * Quad CPUs must be bootstrapped by their extended VIC cpu */
703 for(i = 0; i < NR_CPUS; i++) { 699 for (i = 0; i < NR_CPUS; i++) {
704 if(i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) 700 if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
705 continue; 701 continue;
706 do_boot_cpu(i); 702 do_boot_cpu(i);
707 /* This udelay seems to be needed for the Quad boots 703 /* This udelay seems to be needed for the Quad boots
@@ -715,25 +711,26 @@ smp_boot_cpus(void)
715 for (i = 0; i < NR_CPUS; i++) 711 for (i = 0; i < NR_CPUS; i++)
716 if (cpu_isset(i, cpu_online_map)) 712 if (cpu_isset(i, cpu_online_map))
717 bogosum += cpu_data(i).loops_per_jiffy; 713 bogosum += cpu_data(i).loops_per_jiffy;
718 printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", 714 printk(KERN_INFO "Total of %d processors activated "
719 cpucount+1, 715 "(%lu.%02lu BogoMIPS).\n",
720 bogosum/(500000/HZ), 716 cpucount + 1, bogosum / (500000 / HZ),
721 (bogosum/(5000/HZ))%100); 717 (bogosum / (5000 / HZ)) % 100);
722 } 718 }
723 voyager_extended_cpus = hweight32(voyager_extended_vic_processors); 719 voyager_extended_cpus = hweight32(voyager_extended_vic_processors);
724 printk("VOYAGER: Extended (interrupt handling CPUs): %d, non-extended: %d\n", voyager_extended_cpus, num_booting_cpus() - voyager_extended_cpus); 720 printk("VOYAGER: Extended (interrupt handling CPUs): "
721 "%d, non-extended: %d\n", voyager_extended_cpus,
722 num_booting_cpus() - voyager_extended_cpus);
725 /* that's it, switch to symmetric mode */ 723 /* that's it, switch to symmetric mode */
726 outb(0, VIC_PRIORITY_REGISTER); 724 outb(0, VIC_PRIORITY_REGISTER);
727 outb(0, VIC_CLAIM_REGISTER_0); 725 outb(0, VIC_CLAIM_REGISTER_0);
728 outb(0, VIC_CLAIM_REGISTER_1); 726 outb(0, VIC_CLAIM_REGISTER_1);
729 727
730 VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus())); 728 VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus()));
731} 729}
732 730
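The summary printk turns the summed loops_per_jiffy into BogoMIPS: loops_per_jiffy * HZ is delay-loop iterations per second and one BogoMIPS corresponds to 500,000 of them, hence bogosum / (500000 / HZ) for the integer part and (bogosum / (5000 / HZ)) % 100 for the two-digit fraction. A quick standalone check of that arithmetic, assuming HZ is 250 and two CPUs each calibrated to one million loops per jiffy:

#include <stdio.h>

#define HZ 250 /* assumed tick rate, for the example only */

int main(void)
{
        unsigned long bogosum = 2UL * 1000000UL; /* two invented loops_per_jiffy values */

        /* Same expression as the smp_boot_cpus() summary line: prints 1000.00 here. */
        printf("Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
               2, bogosum / (500000 / HZ), (bogosum / (5000 / HZ)) % 100);
        return 0;
}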
733/* Reload the secondary CPUs task structure (this function does not 731/* Reload the secondary CPUs task structure (this function does not
734 * return ) */ 732 * return ) */
735void __init 733void __init initialize_secondary(void)
736initialize_secondary(void)
737{ 734{
738#if 0 735#if 0
739 // AC kernels only 736 // AC kernels only
@@ -745,11 +742,9 @@ initialize_secondary(void)
745 * basically just the stack pointer and the eip. 742 * basically just the stack pointer and the eip.
746 */ 743 */
747 744
748 asm volatile( 745 asm volatile ("movl %0,%%esp\n\t"
749 "movl %0,%%esp\n\t" 746 "jmp *%1"::"r" (current->thread.sp),
750 "jmp *%1" 747 "r"(current->thread.ip));
751 :
752 :"r" (current->thread.esp),"r" (current->thread.eip));
753} 748}
754 749
755/* handle a Voyager SYS_INT -- If we don't, the base board will 750/* handle a Voyager SYS_INT -- If we don't, the base board will
@@ -758,25 +753,23 @@ initialize_secondary(void)
758 * System interrupts occur because some problem was detected on the 753 * System interrupts occur because some problem was detected on the
759 * various busses. To find out what you have to probe all the 754 * various busses. To find out what you have to probe all the
760 * hardware via the CAT bus. FIXME: At the moment we do nothing. */ 755 * hardware via the CAT bus. FIXME: At the moment we do nothing. */
761fastcall void 756void smp_vic_sys_interrupt(struct pt_regs *regs)
762smp_vic_sys_interrupt(struct pt_regs *regs)
763{ 757{
764 ack_CPI(VIC_SYS_INT); 758 ack_CPI(VIC_SYS_INT);
765 printk("Voyager SYSTEM INTERRUPT\n"); 759 printk("Voyager SYSTEM INTERRUPT\n");
766} 760}
767 761
768/* Handle a voyager CMN_INT; These interrupts occur either because of 762/* Handle a voyager CMN_INT; These interrupts occur either because of
769 * a system status change or because a single bit memory error 763 * a system status change or because a single bit memory error
770 * occurred. FIXME: At the moment, ignore all this. */ 764 * occurred. FIXME: At the moment, ignore all this. */
771fastcall void 765void smp_vic_cmn_interrupt(struct pt_regs *regs)
772smp_vic_cmn_interrupt(struct pt_regs *regs)
773{ 766{
774 static __u8 in_cmn_int = 0; 767 static __u8 in_cmn_int = 0;
775 static DEFINE_SPINLOCK(cmn_int_lock); 768 static DEFINE_SPINLOCK(cmn_int_lock);
776 769
777 /* common ints are broadcast, so make sure we only do this once */ 770 /* common ints are broadcast, so make sure we only do this once */
778 _raw_spin_lock(&cmn_int_lock); 771 _raw_spin_lock(&cmn_int_lock);
779 if(in_cmn_int) 772 if (in_cmn_int)
780 goto unlock_end; 773 goto unlock_end;
781 774
782 in_cmn_int++; 775 in_cmn_int++;
@@ -784,12 +777,12 @@ smp_vic_cmn_interrupt(struct pt_regs *regs)
784 777
785 VDEBUG(("Voyager COMMON INTERRUPT\n")); 778 VDEBUG(("Voyager COMMON INTERRUPT\n"));
786 779
787 if(voyager_level == 5) 780 if (voyager_level == 5)
788 voyager_cat_do_common_interrupt(); 781 voyager_cat_do_common_interrupt();
789 782
790 _raw_spin_lock(&cmn_int_lock); 783 _raw_spin_lock(&cmn_int_lock);
791 in_cmn_int = 0; 784 in_cmn_int = 0;
792 unlock_end: 785 unlock_end:
793 _raw_spin_unlock(&cmn_int_lock); 786 _raw_spin_unlock(&cmn_int_lock);
794 ack_CPI(VIC_CMN_INT); 787 ack_CPI(VIC_CMN_INT);
795} 788}
@@ -797,26 +790,23 @@ smp_vic_cmn_interrupt(struct pt_regs *regs)
797/* 790/*
798 * Reschedule call back. Nothing to do, all the work is done 791 * Reschedule call back. Nothing to do, all the work is done
799 * automatically when we return from the interrupt. */ 792 * automatically when we return from the interrupt. */
800static void 793static void smp_reschedule_interrupt(void)
801smp_reschedule_interrupt(void)
802{ 794{
803 /* do nothing */ 795 /* do nothing */
804} 796}
805 797
806static struct mm_struct * flush_mm; 798static struct mm_struct *flush_mm;
807static unsigned long flush_va; 799static unsigned long flush_va;
808static DEFINE_SPINLOCK(tlbstate_lock); 800static DEFINE_SPINLOCK(tlbstate_lock);
809#define FLUSH_ALL 0xffffffff
810 801
811/* 802/*
812 * We cannot call mmdrop() because we are in interrupt context, 803 * We cannot call mmdrop() because we are in interrupt context,
813 * instead update mm->cpu_vm_mask. 804 * instead update mm->cpu_vm_mask.
814 * 805 *
815 * We need to reload %cr3 since the page tables may be going 806 * We need to reload %cr3 since the page tables may be going
816 * away from under us.. 807 * away from under us..
817 */ 808 */
818static inline void 809static inline void voyager_leave_mm(unsigned long cpu)
819leave_mm (unsigned long cpu)
820{ 810{
821 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) 811 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
822 BUG(); 812 BUG();
@@ -824,12 +814,10 @@ leave_mm (unsigned long cpu)
824 load_cr3(swapper_pg_dir); 814 load_cr3(swapper_pg_dir);
825} 815}
826 816
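Leaving lazy TLB mode means dropping this CPU out of the old mm's cpu_vm_mask and reloading %cr3 with the kernel's swapper page tables, so later flush CPIs for that mm no longer target this CPU. A schematic sketch of that bookkeeping; the mm layout and the load_cr3 stub are simplified placeholders:

#include <stdint.h>
#include <stdio.h>

struct fake_mm {
        uint32_t cpu_vm_mask; /* which CPUs may still cache TLB entries for this mm */
};

static void load_cr3_stub(const char *pgdir)
{
        printf("cr3 <- %s\n", pgdir);
}

/* Rough analogue of voyager_leave_mm(): stop tracking this CPU for the mm. */
static void leave_mm_sketch(struct fake_mm *active_mm, unsigned int cpu)
{
        active_mm->cpu_vm_mask &= ~(1u << cpu);
        load_cr3_stub("swapper_pg_dir");
}

int main(void)
{
        struct fake_mm mm = { .cpu_vm_mask = 0x3 }; /* CPUs 0 and 1 */

        leave_mm_sketch(&mm, 1);
        printf("cpu_vm_mask = 0x%x\n", (unsigned int)mm.cpu_vm_mask);
        return 0;
}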
827
828/* 817/*
829 * Invalidate call-back 818 * Invalidate call-back
830 */ 819 */
831static void 820static void smp_invalidate_interrupt(void)
832smp_invalidate_interrupt(void)
833{ 821{
834 __u8 cpu = smp_processor_id(); 822 __u8 cpu = smp_processor_id();
835 823
@@ -837,18 +825,18 @@ smp_invalidate_interrupt(void)
837 return; 825 return;
838 /* This will flood messages. Don't uncomment unless you see 826 /* This will flood messages. Don't uncomment unless you see
839 * Problems with cross cpu invalidation 827 * Problems with cross cpu invalidation
840 VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n", 828 VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n",
841 smp_processor_id())); 829 smp_processor_id()));
842 */ 830 */
843 831
844 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { 832 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
845 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { 833 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
846 if (flush_va == FLUSH_ALL) 834 if (flush_va == TLB_FLUSH_ALL)
847 local_flush_tlb(); 835 local_flush_tlb();
848 else 836 else
849 __flush_tlb_one(flush_va); 837 __flush_tlb_one(flush_va);
850 } else 838 } else
851 leave_mm(cpu); 839 voyager_leave_mm(cpu);
852 } 840 }
853 smp_mb__before_clear_bit(); 841 smp_mb__before_clear_bit();
854 clear_bit(cpu, &smp_invalidate_needed); 842 clear_bit(cpu, &smp_invalidate_needed);
@@ -857,11 +845,10 @@ smp_invalidate_interrupt(void)
857 845
858/* All the new flush operations for 2.4 */ 846/* All the new flush operations for 2.4 */
859 847
860
861/* This routine is called with a physical cpu mask */ 848/* This routine is called with a physical cpu mask */
862static void 849static void
863voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, 850voyager_flush_tlb_others(unsigned long cpumask, struct mm_struct *mm,
864 unsigned long va) 851 unsigned long va)
865{ 852{
866 int stuck = 50000; 853 int stuck = 50000;
867 854
@@ -875,7 +862,7 @@ voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
875 BUG(); 862 BUG();
876 863
877 spin_lock(&tlbstate_lock); 864 spin_lock(&tlbstate_lock);
878 865
879 flush_mm = mm; 866 flush_mm = mm;
880 flush_va = va; 867 flush_va = va;
881 atomic_set_mask(cpumask, &smp_invalidate_needed); 868 atomic_set_mask(cpumask, &smp_invalidate_needed);
@@ -887,23 +874,23 @@ voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
887 874
888 while (smp_invalidate_needed) { 875 while (smp_invalidate_needed) {
889 mb(); 876 mb();
890 if(--stuck == 0) { 877 if (--stuck == 0) {
891 printk("***WARNING*** Stuck doing invalidate CPI (CPU%d)\n", smp_processor_id()); 878 printk("***WARNING*** Stuck doing invalidate CPI "
879 "(CPU%d)\n", smp_processor_id());
892 break; 880 break;
893 } 881 }
894 } 882 }
895 883
896 /* Uncomment only to debug invalidation problems 884 /* Uncomment only to debug invalidation problems
897 VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu)); 885 VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu));
898 */ 886 */
899 887
900 flush_mm = NULL; 888 flush_mm = NULL;
901 flush_va = 0; 889 flush_va = 0;
902 spin_unlock(&tlbstate_lock); 890 spin_unlock(&tlbstate_lock);
903} 891}
904 892
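voyager_flush_tlb_others() is a plain handshake: publish flush_mm/flush_va under tlbstate_lock, set one bit per target CPU in smp_invalidate_needed, fire the invalidate CPI, then spin until every handler has cleared its own bit, giving up with a warning after a bounded number of iterations. A compressed single-threaded sketch of the same bookkeeping (no CPI is actually sent; the targets are serviced inline):

#include <stdint.h>
#include <stdio.h>

static volatile uint32_t invalidate_needed; /* one bit per CPU that still owes a flush */

/* What a target CPU's invalidate handler does: flush locally, then clear its bit. */
static void handle_invalidate(unsigned int cpu)
{
        /* local_flush_tlb() or __flush_tlb_one(flush_va) would run here */
        invalidate_needed &= ~(1u << cpu);
}

int main(void)
{
        uint32_t cpumask = 0x06; /* ask CPUs 1 and 2 to flush */
        unsigned int cpu;
        int stuck = 50000;

        invalidate_needed |= cpumask; /* publish the request; the CPI would go out here */

        for (cpu = 0; cpu < 32; cpu++) /* simulate the targets taking the CPI */
                if (cpumask & (1u << cpu))
                        handle_invalidate(cpu);

        while (invalidate_needed && --stuck) /* initiator's bounded wait */
                ;
        printf(stuck ? "flush acknowledged by all targets\n"
                     : "***WARNING*** stuck waiting for invalidate\n");
        return 0;
}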
905void 893void flush_tlb_current_task(void)
906flush_tlb_current_task(void)
907{ 894{
908 struct mm_struct *mm = current->mm; 895 struct mm_struct *mm = current->mm;
909 unsigned long cpu_mask; 896 unsigned long cpu_mask;
@@ -913,14 +900,12 @@ flush_tlb_current_task(void)
913 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); 900 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
914 local_flush_tlb(); 901 local_flush_tlb();
915 if (cpu_mask) 902 if (cpu_mask)
916 voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 903 voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
917 904
918 preempt_enable(); 905 preempt_enable();
919} 906}
920 907
921 908void flush_tlb_mm(struct mm_struct *mm)
922void
923flush_tlb_mm (struct mm_struct * mm)
924{ 909{
925 unsigned long cpu_mask; 910 unsigned long cpu_mask;
926 911
@@ -932,15 +917,15 @@ flush_tlb_mm (struct mm_struct * mm)
932 if (current->mm) 917 if (current->mm)
933 local_flush_tlb(); 918 local_flush_tlb();
934 else 919 else
935 leave_mm(smp_processor_id()); 920 voyager_leave_mm(smp_processor_id());
936 } 921 }
937 if (cpu_mask) 922 if (cpu_mask)
938 voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 923 voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
939 924
940 preempt_enable(); 925 preempt_enable();
941} 926}
942 927
943void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) 928void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
944{ 929{
945 struct mm_struct *mm = vma->vm_mm; 930 struct mm_struct *mm = vma->vm_mm;
946 unsigned long cpu_mask; 931 unsigned long cpu_mask;
@@ -949,10 +934,10 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
949 934
950 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); 935 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
951 if (current->active_mm == mm) { 936 if (current->active_mm == mm) {
952 if(current->mm) 937 if (current->mm)
953 __flush_tlb_one(va); 938 __flush_tlb_one(va);
954 else 939 else
955 leave_mm(smp_processor_id()); 940 voyager_leave_mm(smp_processor_id());
956 } 941 }
957 942
958 if (cpu_mask) 943 if (cpu_mask)
@@ -960,21 +945,21 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
960 945
961 preempt_enable(); 946 preempt_enable();
962} 947}
948
963EXPORT_SYMBOL(flush_tlb_page); 949EXPORT_SYMBOL(flush_tlb_page);
964 950
965/* enable the requested IRQs */ 951/* enable the requested IRQs */
966static void 952static void smp_enable_irq_interrupt(void)
967smp_enable_irq_interrupt(void)
968{ 953{
969 __u8 irq; 954 __u8 irq;
970 __u8 cpu = get_cpu(); 955 __u8 cpu = get_cpu();
971 956
972 VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu, 957 VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu,
973 vic_irq_enable_mask[cpu])); 958 vic_irq_enable_mask[cpu]));
974 959
975 spin_lock(&vic_irq_lock); 960 spin_lock(&vic_irq_lock);
976 for(irq = 0; irq < 16; irq++) { 961 for (irq = 0; irq < 16; irq++) {
977 if(vic_irq_enable_mask[cpu] & (1<<irq)) 962 if (vic_irq_enable_mask[cpu] & (1 << irq))
978 enable_local_vic_irq(irq); 963 enable_local_vic_irq(irq);
979 } 964 }
980 vic_irq_enable_mask[cpu] = 0; 965 vic_irq_enable_mask[cpu] = 0;
@@ -982,17 +967,16 @@ smp_enable_irq_interrupt(void)
982 967
983 put_cpu_no_resched(); 968 put_cpu_no_resched();
984} 969}
985 970
986/* 971/*
987 * CPU halt call-back 972 * CPU halt call-back
988 */ 973 */
989static void 974static void smp_stop_cpu_function(void *dummy)
990smp_stop_cpu_function(void *dummy)
991{ 975{
992 VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id())); 976 VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id()));
993 cpu_clear(smp_processor_id(), cpu_online_map); 977 cpu_clear(smp_processor_id(), cpu_online_map);
994 local_irq_disable(); 978 local_irq_disable();
995 for(;;) 979 for (;;)
996 halt(); 980 halt();
997} 981}
998 982
@@ -1006,14 +990,13 @@ struct call_data_struct {
1006 int wait; 990 int wait;
1007}; 991};
1008 992
1009static struct call_data_struct * call_data; 993static struct call_data_struct *call_data;
1010 994
1011/* execute a thread on a new CPU. The function to be called must be 995/* execute a thread on a new CPU. The function to be called must be
1012 * previously set up. This is used to schedule a function for 996 * previously set up. This is used to schedule a function for
1013 * execution on all CPUs - set up the function then broadcast a 997 * execution on all CPUs - set up the function then broadcast a
1014 * function_interrupt CPI to come here on each CPU */ 998 * function_interrupt CPI to come here on each CPU */
1015static void 999static void smp_call_function_interrupt(void)
1016smp_call_function_interrupt(void)
1017{ 1000{
1018 void (*func) (void *info) = call_data->func; 1001 void (*func) (void *info) = call_data->func;
1019 void *info = call_data->info; 1002 void *info = call_data->info;
@@ -1027,16 +1010,17 @@ smp_call_function_interrupt(void)
1027 * about to execute the function 1010 * about to execute the function
1028 */ 1011 */
1029 mb(); 1012 mb();
1030 if(!test_and_clear_bit(cpu, &call_data->started)) { 1013 if (!test_and_clear_bit(cpu, &call_data->started)) {
1031 /* If the bit wasn't set, this could be a replay */ 1014 /* If the bit wasn't set, this could be a replay */
1032 		printk(KERN_WARNING "VOYAGER SMP: CPU %d received call function with no call pending\n", cpu); 1015 		printk(KERN_WARNING "VOYAGER SMP: CPU %d received call function"
1016 " with no call pending\n", cpu);
1033 return; 1017 return;
1034 } 1018 }
1035 /* 1019 /*
1036 * At this point the info structure may be out of scope unless wait==1 1020 * At this point the info structure may be out of scope unless wait==1
1037 */ 1021 */
1038 irq_enter(); 1022 irq_enter();
1039 (*func)(info); 1023 (*func) (info);
1040 __get_cpu_var(irq_stat).irq_call_count++; 1024 __get_cpu_var(irq_stat).irq_call_count++;
1041 irq_exit(); 1025 irq_exit();
1042 if (wait) { 1026 if (wait) {
@@ -1046,14 +1030,13 @@ smp_call_function_interrupt(void)
1046} 1030}
1047 1031
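The call-function path hands work to the other CPUs through call_data: every target clears its bit in the started mask just before running the function and, when the caller asked to wait, acknowledges completion through a second mask afterwards, so the initiator can spin until both masks drain. A single-threaded sketch of that bookkeeping; the structure follows the diff, while the driver loop and field values are illustrative:

#include <stdio.h>

struct call_data_struct {
        void (*func)(void *info);
        void *info;
        unsigned long started;  /* one bit per CPU that has not yet entered func */
        unsigned long finished; /* one bit per CPU that has not yet left func, if wait */
        int wait;
};

static void say_hello(void *info)
{
        printf("%s\n", (const char *)info);
}

/* What a target CPU does when the call-function CPI lands. */
static void call_function_on(struct call_data_struct *cd, int cpu)
{
        if (!(cd->started & (1UL << cpu)))
                return;                       /* replayed CPI, nothing pending for us */
        cd->started &= ~(1UL << cpu);
        cd->func(cd->info);
        if (cd->wait)
                cd->finished &= ~(1UL << cpu);
}

int main(void)
{
        struct call_data_struct data = {
                .func = say_hello, .info = "remote call ran",
                .started = 0x6, .finished = 0x6, .wait = 1, /* target CPUs 1 and 2 */
        };
        int cpu;

        for (cpu = 1; cpu <= 2; cpu++)
                call_function_on(&data, cpu);
        printf("started=0x%lx finished=0x%lx\n", data.started, data.finished);
        return 0;
}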
1048static int 1032static int
1049voyager_smp_call_function_mask (cpumask_t cpumask, 1033voyager_smp_call_function_mask(cpumask_t cpumask,
1050 void (*func) (void *info), void *info, 1034 void (*func) (void *info), void *info, int wait)
1051 int wait)
1052{ 1035{
1053 struct call_data_struct data; 1036 struct call_data_struct data;
1054 u32 mask = cpus_addr(cpumask)[0]; 1037 u32 mask = cpus_addr(cpumask)[0];
1055 1038
1056 mask &= ~(1<<smp_processor_id()); 1039 mask &= ~(1 << smp_processor_id());
1057 1040
1058 if (!mask) 1041 if (!mask)
1059 return 0; 1042 return 0;
@@ -1093,7 +1076,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask,
1093 * so we use the system clock to interrupt one processor, which in 1076 * so we use the system clock to interrupt one processor, which in
1094 * turn, broadcasts a timer CPI to all the others --- we receive that 1077 * turn, broadcasts a timer CPI to all the others --- we receive that
1095 * CPI here. We don't use this actually for counting so losing 1078 * CPI here. We don't use this actually for counting so losing
1096 * ticks doesn't matter 1079 * ticks doesn't matter
1097 * 1080 *
1098 * FIXME: For those CPUs which actually have a local APIC, we could 1081 * FIXME: For those CPUs which actually have a local APIC, we could
1099 * try to use it to trigger this interrupt instead of having to 1082 * try to use it to trigger this interrupt instead of having to
@@ -1101,8 +1084,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask,
1101 * no local APIC, so I can't do this 1084 * no local APIC, so I can't do this
1102 * 1085 *
1103 * This function is currently a placeholder and is unused in the code */ 1086 * This function is currently a placeholder and is unused in the code */
1104fastcall void 1087void smp_apic_timer_interrupt(struct pt_regs *regs)
1105smp_apic_timer_interrupt(struct pt_regs *regs)
1106{ 1088{
1107 struct pt_regs *old_regs = set_irq_regs(regs); 1089 struct pt_regs *old_regs = set_irq_regs(regs);
1108 wrapper_smp_local_timer_interrupt(); 1090 wrapper_smp_local_timer_interrupt();
@@ -1110,8 +1092,7 @@ smp_apic_timer_interrupt(struct pt_regs *regs)
1110} 1092}
1111 1093
1112/* All of the QUAD interrupt GATES */ 1094/* All of the QUAD interrupt GATES */
1113fastcall void 1095void smp_qic_timer_interrupt(struct pt_regs *regs)
1114smp_qic_timer_interrupt(struct pt_regs *regs)
1115{ 1096{
1116 struct pt_regs *old_regs = set_irq_regs(regs); 1097 struct pt_regs *old_regs = set_irq_regs(regs);
1117 ack_QIC_CPI(QIC_TIMER_CPI); 1098 ack_QIC_CPI(QIC_TIMER_CPI);
@@ -1119,127 +1100,112 @@ smp_qic_timer_interrupt(struct pt_regs *regs)
1119 set_irq_regs(old_regs); 1100 set_irq_regs(old_regs);
1120} 1101}
1121 1102
1122fastcall void 1103void smp_qic_invalidate_interrupt(struct pt_regs *regs)
1123smp_qic_invalidate_interrupt(struct pt_regs *regs)
1124{ 1104{
1125 ack_QIC_CPI(QIC_INVALIDATE_CPI); 1105 ack_QIC_CPI(QIC_INVALIDATE_CPI);
1126 smp_invalidate_interrupt(); 1106 smp_invalidate_interrupt();
1127} 1107}
1128 1108
1129fastcall void 1109void smp_qic_reschedule_interrupt(struct pt_regs *regs)
1130smp_qic_reschedule_interrupt(struct pt_regs *regs)
1131{ 1110{
1132 ack_QIC_CPI(QIC_RESCHEDULE_CPI); 1111 ack_QIC_CPI(QIC_RESCHEDULE_CPI);
1133 smp_reschedule_interrupt(); 1112 smp_reschedule_interrupt();
1134} 1113}
1135 1114
1136fastcall void 1115void smp_qic_enable_irq_interrupt(struct pt_regs *regs)
1137smp_qic_enable_irq_interrupt(struct pt_regs *regs)
1138{ 1116{
1139 ack_QIC_CPI(QIC_ENABLE_IRQ_CPI); 1117 ack_QIC_CPI(QIC_ENABLE_IRQ_CPI);
1140 smp_enable_irq_interrupt(); 1118 smp_enable_irq_interrupt();
1141} 1119}
1142 1120
1143fastcall void 1121void smp_qic_call_function_interrupt(struct pt_regs *regs)
1144smp_qic_call_function_interrupt(struct pt_regs *regs)
1145{ 1122{
1146 ack_QIC_CPI(QIC_CALL_FUNCTION_CPI); 1123 ack_QIC_CPI(QIC_CALL_FUNCTION_CPI);
1147 smp_call_function_interrupt(); 1124 smp_call_function_interrupt();
1148} 1125}
1149 1126
1150fastcall void 1127void smp_vic_cpi_interrupt(struct pt_regs *regs)
1151smp_vic_cpi_interrupt(struct pt_regs *regs)
1152{ 1128{
1153 struct pt_regs *old_regs = set_irq_regs(regs); 1129 struct pt_regs *old_regs = set_irq_regs(regs);
1154 __u8 cpu = smp_processor_id(); 1130 __u8 cpu = smp_processor_id();
1155 1131
1156 if(is_cpu_quad()) 1132 if (is_cpu_quad())
1157 ack_QIC_CPI(VIC_CPI_LEVEL0); 1133 ack_QIC_CPI(VIC_CPI_LEVEL0);
1158 else 1134 else
1159 ack_VIC_CPI(VIC_CPI_LEVEL0); 1135 ack_VIC_CPI(VIC_CPI_LEVEL0);
1160 1136
1161 if(test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu])) 1137 if (test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu]))
1162 wrapper_smp_local_timer_interrupt(); 1138 wrapper_smp_local_timer_interrupt();
1163 if(test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu])) 1139 if (test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu]))
1164 smp_invalidate_interrupt(); 1140 smp_invalidate_interrupt();
1165 if(test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu])) 1141 if (test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu]))
1166 smp_reschedule_interrupt(); 1142 smp_reschedule_interrupt();
1167 if(test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu])) 1143 if (test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu]))
1168 smp_enable_irq_interrupt(); 1144 smp_enable_irq_interrupt();
1169 if(test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) 1145 if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu]))
1170 smp_call_function_interrupt(); 1146 smp_call_function_interrupt();
1171 set_irq_regs(old_regs); 1147 set_irq_regs(old_regs);
1172} 1148}
1173 1149
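All VIC CPIs arrive on a single vector (CPI level 0); the actual request travels in a per-CPU mailbox word, the sender setting one bit per pending CPI and the handler testing-and-clearing each bit in turn. A tiny sketch of that dispatch loop; the bit numbers below are placeholders, not the real VIC_*_CPI constants:

#include <stdio.h>

enum { TIMER_CPI, INVALIDATE_CPI, RESCHEDULE_CPI, ENABLE_IRQ_CPI, CALL_FUNCTION_CPI };

static unsigned long cpi_mailbox; /* a per-CPU array in the real code */

static int test_and_clear(int bit, unsigned long *word)
{
        int was_set = (*word >> bit) & 1;

        *word &= ~(1UL << bit);
        return was_set;
}

int main(void)
{
        cpi_mailbox = (1UL << TIMER_CPI) | (1UL << CALL_FUNCTION_CPI);

        /* Mirrors smp_vic_cpi_interrupt(): one vector, several possible requests. */
        if (test_and_clear(TIMER_CPI, &cpi_mailbox))
                printf("local timer tick\n");
        if (test_and_clear(INVALIDATE_CPI, &cpi_mailbox))
                printf("tlb invalidate\n");
        if (test_and_clear(RESCHEDULE_CPI, &cpi_mailbox))
                printf("reschedule\n");
        if (test_and_clear(ENABLE_IRQ_CPI, &cpi_mailbox))
                printf("enable queued irqs\n");
        if (test_and_clear(CALL_FUNCTION_CPI, &cpi_mailbox))
                printf("call function\n");
        return 0;
}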
1174static void 1150static void do_flush_tlb_all(void *info)
1175do_flush_tlb_all(void* info)
1176{ 1151{
1177 unsigned long cpu = smp_processor_id(); 1152 unsigned long cpu = smp_processor_id();
1178 1153
1179 __flush_tlb_all(); 1154 __flush_tlb_all();
1180 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) 1155 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
1181 leave_mm(cpu); 1156 voyager_leave_mm(cpu);
1182} 1157}
1183 1158
1184
1185/* flush the TLB of every active CPU in the system */ 1159/* flush the TLB of every active CPU in the system */
1186void 1160void flush_tlb_all(void)
1187flush_tlb_all(void)
1188{ 1161{
1189 on_each_cpu(do_flush_tlb_all, 0, 1, 1); 1162 on_each_cpu(do_flush_tlb_all, 0, 1, 1);
1190} 1163}
1191 1164
1192/* used to set up the trampoline for other CPUs when the memory manager 1165/* used to set up the trampoline for other CPUs when the memory manager
1193 * is sorted out */ 1166 * is sorted out */
1194void __init 1167void __init smp_alloc_memory(void)
1195smp_alloc_memory(void)
1196{ 1168{
1197 trampoline_base = (__u32)alloc_bootmem_low_pages(PAGE_SIZE); 1169 trampoline_base = (__u32) alloc_bootmem_low_pages(PAGE_SIZE);
1198 if(__pa(trampoline_base) >= 0x93000) 1170 if (__pa(trampoline_base) >= 0x93000)
1199 BUG(); 1171 BUG();
1200} 1172}
1201 1173
1202/* send a reschedule CPI to one CPU by physical CPU number*/ 1174/* send a reschedule CPI to one CPU by physical CPU number*/
1203static void 1175static void voyager_smp_send_reschedule(int cpu)
1204voyager_smp_send_reschedule(int cpu)
1205{ 1176{
1206 send_one_CPI(cpu, VIC_RESCHEDULE_CPI); 1177 send_one_CPI(cpu, VIC_RESCHEDULE_CPI);
1207} 1178}
1208 1179
1209 1180int hard_smp_processor_id(void)
1210int
1211hard_smp_processor_id(void)
1212{ 1181{
1213 __u8 i; 1182 __u8 i;
1214 __u8 cpumask = inb(VIC_PROC_WHO_AM_I); 1183 __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
1215 if((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER) 1184 if ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER)
1216 return cpumask & 0x1F; 1185 return cpumask & 0x1F;
1217 1186
1218 for(i = 0; i < 8; i++) { 1187 for (i = 0; i < 8; i++) {
1219 if(cpumask & (1<<i)) 1188 if (cpumask & (1 << i))
1220 return i; 1189 return i;
1221 } 1190 }
1222 printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask); 1191 printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask);
1223 return 0; 1192 return 0;
1224} 1193}
1225 1194
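hard_smp_processor_id() decodes the WHO_AM_I register two ways: Quad-VIC parts flag themselves with QUAD_IDENTIFIER and carry the id in the low five bits, while plain VIC parts return a one-hot byte whose set bit position is the id. A standalone decoder sketch; the QUAD_IDENTIFIER value below is a placeholder for the example, not the header's constant:

#include <stdint.h>
#include <stdio.h>

#define QUAD_IDENTIFIER 0xC0 /* placeholder value, for illustration only */

/* Decode a VIC_PROC_WHO_AM_I style byte the way hard_smp_processor_id() does. */
static int decode_who_am_i(uint8_t who)
{
        int i;

        if ((who & QUAD_IDENTIFIER) == QUAD_IDENTIFIER)
                return who & 0x1F;          /* Quad: id in the low 5 bits */

        for (i = 0; i < 8; i++)             /* VIC: one-hot bit position is the id */
                if (who & (1 << i))
                        return i;
        return 0;                           /* illegal reading, fall back to CPU 0 */
}

int main(void)
{
        printf("%d\n", decode_who_am_i(0x04));                   /* one-hot -> CPU 2 */
        printf("%d\n", decode_who_am_i(QUAD_IDENTIFIER | 0x0B)); /* quad   -> CPU 11 */
        return 0;
}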
1226int 1195int safe_smp_processor_id(void)
1227safe_smp_processor_id(void)
1228{ 1196{
1229 return hard_smp_processor_id(); 1197 return hard_smp_processor_id();
1230} 1198}
1231 1199
1232/* broadcast a halt to all other CPUs */ 1200/* broadcast a halt to all other CPUs */
1233static void 1201static void voyager_smp_send_stop(void)
1234voyager_smp_send_stop(void)
1235{ 1202{
1236 smp_call_function(smp_stop_cpu_function, NULL, 1, 1); 1203 smp_call_function(smp_stop_cpu_function, NULL, 1, 1);
1237} 1204}
1238 1205
1239/* this function is triggered in time.c when a clock tick fires 1206/* this function is triggered in time.c when a clock tick fires
1240 * we need to re-broadcast the tick to all CPUs */ 1207 * we need to re-broadcast the tick to all CPUs */
1241void 1208void smp_vic_timer_interrupt(void)
1242smp_vic_timer_interrupt(void)
1243{ 1209{
1244 send_CPI_allbutself(VIC_TIMER_CPI); 1210 send_CPI_allbutself(VIC_TIMER_CPI);
1245 smp_local_timer_interrupt(); 1211 smp_local_timer_interrupt();
@@ -1253,8 +1219,7 @@ smp_vic_timer_interrupt(void)
1253 * multiplier is 1 and it can be changed by writing the new multiplier 1219 * multiplier is 1 and it can be changed by writing the new multiplier
1254 * value into /proc/profile. 1220 * value into /proc/profile.
1255 */ 1221 */
1256void 1222void smp_local_timer_interrupt(void)
1257smp_local_timer_interrupt(void)
1258{ 1223{
1259 int cpu = smp_processor_id(); 1224 int cpu = smp_processor_id();
1260 long weight; 1225 long weight;
@@ -1269,18 +1234,18 @@ smp_local_timer_interrupt(void)
1269 * 1234 *
1270 * Interrupts are already masked off at this point. 1235 * Interrupts are already masked off at this point.
1271 */ 1236 */
1272 per_cpu(prof_counter,cpu) = per_cpu(prof_multiplier, cpu); 1237 per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
1273 if (per_cpu(prof_counter, cpu) != 1238 if (per_cpu(prof_counter, cpu) !=
1274 per_cpu(prof_old_multiplier, cpu)) { 1239 per_cpu(prof_old_multiplier, cpu)) {
1275 /* FIXME: need to update the vic timer tick here */ 1240 /* FIXME: need to update the vic timer tick here */
1276 per_cpu(prof_old_multiplier, cpu) = 1241 per_cpu(prof_old_multiplier, cpu) =
1277 per_cpu(prof_counter, cpu); 1242 per_cpu(prof_counter, cpu);
1278 } 1243 }
1279 1244
1280 update_process_times(user_mode_vm(get_irq_regs())); 1245 update_process_times(user_mode_vm(get_irq_regs()));
1281 } 1246 }
1282 1247
1283 if( ((1<<cpu) & voyager_extended_vic_processors) == 0) 1248 if (((1 << cpu) & voyager_extended_vic_processors) == 0)
1284 /* only extended VIC processors participate in 1249 /* only extended VIC processors participate in
1285 * interrupt distribution */ 1250 * interrupt distribution */
1286 return; 1251 return;
@@ -1296,12 +1261,12 @@ smp_local_timer_interrupt(void)
1296 * we can take more than 100K local irqs per second on a 100 MHz P5. 1261 * we can take more than 100K local irqs per second on a 100 MHz P5.
1297 */ 1262 */
1298 1263
1299 if((++vic_tick[cpu] & 0x7) != 0) 1264 if ((++vic_tick[cpu] & 0x7) != 0)
1300 return; 1265 return;
1301 /* get here every 16 ticks (about every 1/6 of a second) */ 1266 /* get here every 16 ticks (about every 1/6 of a second) */
1302 1267
1303 /* Change our priority to give someone else a chance at getting 1268 /* Change our priority to give someone else a chance at getting
1304 * the IRQ. The algorithm goes like this: 1269 * the IRQ. The algorithm goes like this:
1305 * 1270 *
1306 * In the VIC, the dynamically routed interrupt is always 1271 * In the VIC, the dynamically routed interrupt is always
1307 * handled by the lowest priority eligible (i.e. receiving 1272 * handled by the lowest priority eligible (i.e. receiving
@@ -1325,18 +1290,18 @@ smp_local_timer_interrupt(void)
1325 * affinity code since we now try to even up the interrupt 1290 * affinity code since we now try to even up the interrupt
1326 * counts when an affinity binding is keeping them on a 1291 * counts when an affinity binding is keeping them on a
1327 * particular CPU*/ 1292 * particular CPU*/
1328 weight = (vic_intr_count[cpu]*voyager_extended_cpus 1293 weight = (vic_intr_count[cpu] * voyager_extended_cpus
1329 - vic_intr_total) >> 4; 1294 - vic_intr_total) >> 4;
1330 weight += 4; 1295 weight += 4;
1331 if(weight > 7) 1296 if (weight > 7)
1332 weight = 7; 1297 weight = 7;
1333 if(weight < 0) 1298 if (weight < 0)
1334 weight = 0; 1299 weight = 0;
1335 1300
1336 outb((__u8)weight, VIC_PRIORITY_REGISTER); 1301 outb((__u8) weight, VIC_PRIORITY_REGISTER);
1337 1302
1338#ifdef VOYAGER_DEBUG 1303#ifdef VOYAGER_DEBUG
1339 if((vic_tick[cpu] & 0xFFF) == 0) { 1304 if ((vic_tick[cpu] & 0xFFF) == 0) {
1340 /* print this message roughly every 25 secs */ 1305 /* print this message roughly every 25 secs */
1341 printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n", 1306 printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n",
1342 cpu, vic_tick[cpu], weight); 1307 cpu, vic_tick[cpu], weight);
@@ -1345,15 +1310,14 @@ smp_local_timer_interrupt(void)
1345} 1310}
1346 1311
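The interrupt-balancing weight is the difference between this CPU's scaled interrupt count and the running total, shifted right by four, recentred on 4 and clamped to the VIC priority register's 0..7 range; since the VIC routes the dynamic interrupt to the lowest-priority eligible CPU, a busy CPU writing a larger value makes itself less attractive for the next one. A quick standalone check of the clamping with invented counts:

#include <stdio.h>

/* Reproduce the weight computation from smp_local_timer_interrupt(). */
static long vic_weight(unsigned long my_count, unsigned long total, int extended_cpus)
{
        long weight = ((long)(my_count * extended_cpus) - (long)total) >> 4;

        weight += 4;
        if (weight > 7)
                weight = 7;
        if (weight < 0)
                weight = 0;
        return weight;
}

int main(void)
{
        /* Two extended CPUs sharing 400 interrupts so far. */
        printf("busy CPU (300 of 400) -> weight %ld\n", vic_weight(300, 400, 2));
        printf("fair CPU (200 of 400) -> weight %ld\n", vic_weight(200, 400, 2));
        return 0;
}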
1347/* setup the profiling timer */ 1312/* setup the profiling timer */
1348int 1313int setup_profiling_timer(unsigned int multiplier)
1349setup_profiling_timer(unsigned int multiplier)
1350{ 1314{
1351 int i; 1315 int i;
1352 1316
1353 if ( (!multiplier)) 1317 if ((!multiplier))
1354 return -EINVAL; 1318 return -EINVAL;
1355 1319
1356 /* 1320 /*
1357 * Set the new multiplier for each CPU. CPUs don't start using the 1321 * Set the new multiplier for each CPU. CPUs don't start using the
1358 * new values until the next timer interrupt in which they do process 1322 * new values until the next timer interrupt in which they do process
1359 * accounting. 1323 * accounting.
@@ -1367,15 +1331,13 @@ setup_profiling_timer(unsigned int multiplier)
1367/* This is a bit of a mess, but forced on us by the genirq changes 1331/* This is a bit of a mess, but forced on us by the genirq changes
1368 * there's no genirq handler that really does what voyager wants 1332 * there's no genirq handler that really does what voyager wants
1369 * so hack it up with the simple IRQ handler */ 1333 * so hack it up with the simple IRQ handler */
1370static void fastcall 1334static void handle_vic_irq(unsigned int irq, struct irq_desc *desc)
1371handle_vic_irq(unsigned int irq, struct irq_desc *desc)
1372{ 1335{
1373 before_handle_vic_irq(irq); 1336 before_handle_vic_irq(irq);
1374 handle_simple_irq(irq, desc); 1337 handle_simple_irq(irq, desc);
1375 after_handle_vic_irq(irq); 1338 after_handle_vic_irq(irq);
1376} 1339}
1377 1340
1378
1379/* The CPIs are handled in the per cpu 8259s, so they must be 1341/* The CPIs are handled in the per cpu 8259s, so they must be
1380 * enabled to be received: FIX: enabling the CPIs in the early 1342 * enabled to be received: FIX: enabling the CPIs in the early
1381 * boot sequence interferes with bug checking; enable them later 1343 * boot sequence interferes with bug checking; enable them later
@@ -1385,13 +1347,12 @@ handle_vic_irq(unsigned int irq, struct irq_desc *desc)
1385#define QIC_SET_GATE(cpi, vector) \ 1347#define QIC_SET_GATE(cpi, vector) \
1386 set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector)) 1348 set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector))
1387 1349
1388void __init 1350void __init smp_intr_init(void)
1389smp_intr_init(void)
1390{ 1351{
1391 int i; 1352 int i;
1392 1353
1393 /* initialize the per cpu irq mask to all disabled */ 1354 /* initialize the per cpu irq mask to all disabled */
1394 for(i = 0; i < NR_CPUS; i++) 1355 for (i = 0; i < NR_CPUS; i++)
1395 vic_irq_mask[i] = 0xFFFF; 1356 vic_irq_mask[i] = 0xFFFF;
1396 1357
1397 VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt); 1358 VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt);
@@ -1404,42 +1365,40 @@ smp_intr_init(void)
1404 QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt); 1365 QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt);
1405 QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt); 1366 QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt);
1406 QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt); 1367 QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt);
1407
1408 1368
1409 /* now put the VIC descriptor into the first 48 IRQs 1369 /* now put the VIC descriptor into the first 48 IRQs
1410 * 1370 *
1411 * This is for later: first 16 correspond to PC IRQs; next 16 1371 * This is for later: first 16 correspond to PC IRQs; next 16
1412 * are Primary MC IRQs and final 16 are Secondary MC IRQs */ 1372 * are Primary MC IRQs and final 16 are Secondary MC IRQs */
1413 for(i = 0; i < 48; i++) 1373 for (i = 0; i < 48; i++)
1414 set_irq_chip_and_handler(i, &vic_chip, handle_vic_irq); 1374 set_irq_chip_and_handler(i, &vic_chip, handle_vic_irq);
1415} 1375}
1416 1376
1417/* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per 1377/* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per
1418 * processor to receive CPI */ 1378 * processor to receive CPI */
1419static void 1379static void send_CPI(__u32 cpuset, __u8 cpi)
1420send_CPI(__u32 cpuset, __u8 cpi)
1421{ 1380{
1422 int cpu; 1381 int cpu;
1423 __u32 quad_cpuset = (cpuset & voyager_quad_processors); 1382 __u32 quad_cpuset = (cpuset & voyager_quad_processors);
1424 1383
1425 if(cpi < VIC_START_FAKE_CPI) { 1384 if (cpi < VIC_START_FAKE_CPI) {
1426 /* fake CPI are only used for booting, so send to the 1385 /* fake CPI are only used for booting, so send to the
1427 * extended quads as well---Quads must be VIC booted */ 1386 * extended quads as well---Quads must be VIC booted */
1428 outb((__u8)(cpuset), VIC_CPI_Registers[cpi]); 1387 outb((__u8) (cpuset), VIC_CPI_Registers[cpi]);
1429 return; 1388 return;
1430 } 1389 }
1431 if(quad_cpuset) 1390 if (quad_cpuset)
1432 send_QIC_CPI(quad_cpuset, cpi); 1391 send_QIC_CPI(quad_cpuset, cpi);
1433 cpuset &= ~quad_cpuset; 1392 cpuset &= ~quad_cpuset;
1434 	cpuset &= 0xff;		/* only first 8 CPUs valid for VIC CPI */ 1393 	cpuset &= 0xff;		/* only first 8 CPUs valid for VIC CPI */
1435 if(cpuset == 0) 1394 if (cpuset == 0)
1436 return; 1395 return;
1437 for_each_online_cpu(cpu) { 1396 for_each_online_cpu(cpu) {
1438 if(cpuset & (1<<cpu)) 1397 if (cpuset & (1 << cpu))
1439 set_bit(cpi, &vic_cpi_mailbox[cpu]); 1398 set_bit(cpi, &vic_cpi_mailbox[cpu]);
1440 } 1399 }
1441 if(cpuset) 1400 if (cpuset)
1442 outb((__u8)cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]); 1401 outb((__u8) cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]);
1443} 1402}
1444 1403
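send_CPI() splits the target set in two: quad CPUs get a genuine QIC CPI, and whatever remains can only be reached through the VIC, which covers just the first eight CPUs, hence the & 0xff before the mailbox bits are set and the level-0 CPI goes out. A small sketch of that partitioning; the masks are example values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t voyager_quad_processors = 0x00000f00; /* example: CPUs 8-11 are quads */
        uint32_t cpuset = 0x00000f05;                  /* want CPUs 0, 2 and 8-11 */
        uint32_t quad_set, vic_set;

        quad_set = cpuset & voyager_quad_processors;   /* delivered via send_QIC_CPI() */
        vic_set  = (cpuset & ~quad_set) & 0xff;        /* only CPUs 0-7 are VIC-reachable */

        printf("QIC targets: 0x%08x\n", (unsigned int)quad_set);
        printf("VIC targets: 0x%02x\n", (unsigned int)vic_set);
        return 0;
}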
1445/* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and 1404/* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and
@@ -1448,20 +1407,19 @@ send_CPI(__u32 cpuset, __u8 cpi)
1448 * DON'T make this inline otherwise the cache line read will be 1407 * DON'T make this inline otherwise the cache line read will be
1449 * optimised away 1408 * optimised away
1450 * */ 1409 * */
1451static int 1410static int ack_QIC_CPI(__u8 cpi)
1452ack_QIC_CPI(__u8 cpi) { 1411{
1453 __u8 cpu = hard_smp_processor_id(); 1412 __u8 cpu = hard_smp_processor_id();
1454 1413
1455 cpi &= 7; 1414 cpi &= 7;
1456 1415
1457 outb(1<<cpi, QIC_INTERRUPT_CLEAR1); 1416 outb(1 << cpi, QIC_INTERRUPT_CLEAR1);
1458 return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi; 1417 return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi;
1459} 1418}
1460 1419
1461static void 1420static void ack_special_QIC_CPI(__u8 cpi)
1462ack_special_QIC_CPI(__u8 cpi)
1463{ 1421{
1464 switch(cpi) { 1422 switch (cpi) {
1465 case VIC_CMN_INT: 1423 case VIC_CMN_INT:
1466 outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0); 1424 outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0);
1467 break; 1425 break;
@@ -1474,8 +1432,7 @@ ack_special_QIC_CPI(__u8 cpi)
1474} 1432}
1475 1433
1476/* Acknowledge receipt of CPI in the VIC (essentially an EOI) */ 1434/* Acknowledge receipt of CPI in the VIC (essentially an EOI) */
1477static void 1435static void ack_VIC_CPI(__u8 cpi)
1478ack_VIC_CPI(__u8 cpi)
1479{ 1436{
1480#ifdef VOYAGER_DEBUG 1437#ifdef VOYAGER_DEBUG
1481 unsigned long flags; 1438 unsigned long flags;
@@ -1484,17 +1441,17 @@ ack_VIC_CPI(__u8 cpi)
1484 1441
1485 local_irq_save(flags); 1442 local_irq_save(flags);
1486 isr = vic_read_isr(); 1443 isr = vic_read_isr();
1487 if((isr & (1<<(cpi &7))) == 0) { 1444 if ((isr & (1 << (cpi & 7))) == 0) {
1488 printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi); 1445 printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi);
1489 } 1446 }
1490#endif 1447#endif
1491 /* send specific EOI; the two system interrupts have 1448 /* send specific EOI; the two system interrupts have
1492 * bit 4 set for a separate vector but behave as the 1449 * bit 4 set for a separate vector but behave as the
1493 * corresponding 3 bit intr */ 1450 * corresponding 3 bit intr */
1494 outb_p(0x60|(cpi & 7),0x20); 1451 outb_p(0x60 | (cpi & 7), 0x20);
1495 1452
1496#ifdef VOYAGER_DEBUG 1453#ifdef VOYAGER_DEBUG
1497 if((vic_read_isr() & (1<<(cpi &7))) != 0) { 1454 if ((vic_read_isr() & (1 << (cpi & 7))) != 0) {
1498 printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi); 1455 printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi);
1499 } 1456 }
1500 local_irq_restore(flags); 1457 local_irq_restore(flags);
@@ -1502,12 +1459,11 @@ ack_VIC_CPI(__u8 cpi)
1502} 1459}
1503 1460
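The byte ack_VIC_CPI() writes is the 8259's specific-EOI command: 0x60 ORed with the low three bits of the interrupt level, sent to the master command port 0x20. A one-loop check of that encoding:

#include <stdio.h>

int main(void)
{
        unsigned int cpi;

        /* Specific EOI byte for each CPI level, as ack_VIC_CPI() builds it. */
        for (cpi = 0; cpi < 8; cpi++)
                printf("CPI %u -> outb(0x%02x, 0x20)\n", cpi, 0x60 | (cpi & 7));
        return 0;
}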
1504/* cribbed with thanks from irq.c */ 1461/* cribbed with thanks from irq.c */
1505#define __byte(x,y) (((unsigned char *)&(y))[x]) 1462#define __byte(x,y) (((unsigned char *)&(y))[x])
1506#define cached_21(cpu) (__byte(0,vic_irq_mask[cpu])) 1463#define cached_21(cpu) (__byte(0,vic_irq_mask[cpu]))
1507#define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu])) 1464#define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu]))
1508 1465
1509static unsigned int 1466static unsigned int startup_vic_irq(unsigned int irq)
1510startup_vic_irq(unsigned int irq)
1511{ 1467{
1512 unmask_vic_irq(irq); 1468 unmask_vic_irq(irq);
1513 1469
@@ -1535,13 +1491,12 @@ startup_vic_irq(unsigned int irq)
1535 * broadcast an Interrupt enable CPI which causes all other CPUs to 1491 * broadcast an Interrupt enable CPI which causes all other CPUs to
1536 * adjust their masks accordingly. */ 1492 * adjust their masks accordingly. */
1537 1493
1538static void 1494static void unmask_vic_irq(unsigned int irq)
1539unmask_vic_irq(unsigned int irq)
1540{ 1495{
1541 	/* linux doesn't do processor-irq affinity, so enable on 1496 	/* linux doesn't do processor-irq affinity, so enable on
1542 * all CPUs we know about */ 1497 * all CPUs we know about */
1543 int cpu = smp_processor_id(), real_cpu; 1498 int cpu = smp_processor_id(), real_cpu;
1544 __u16 mask = (1<<irq); 1499 __u16 mask = (1 << irq);
1545 __u32 processorList = 0; 1500 __u32 processorList = 0;
1546 unsigned long flags; 1501 unsigned long flags;
1547 1502
@@ -1549,78 +1504,72 @@ unmask_vic_irq(unsigned int irq)
1549 irq, cpu, cpu_irq_affinity[cpu])); 1504 irq, cpu, cpu_irq_affinity[cpu]));
1550 spin_lock_irqsave(&vic_irq_lock, flags); 1505 spin_lock_irqsave(&vic_irq_lock, flags);
1551 for_each_online_cpu(real_cpu) { 1506 for_each_online_cpu(real_cpu) {
1552 if(!(voyager_extended_vic_processors & (1<<real_cpu))) 1507 if (!(voyager_extended_vic_processors & (1 << real_cpu)))
1553 continue; 1508 continue;
1554 if(!(cpu_irq_affinity[real_cpu] & mask)) { 1509 if (!(cpu_irq_affinity[real_cpu] & mask)) {
1555 /* irq has no affinity for this CPU, ignore */ 1510 /* irq has no affinity for this CPU, ignore */
1556 continue; 1511 continue;
1557 } 1512 }
1558 if(real_cpu == cpu) { 1513 if (real_cpu == cpu) {
1559 enable_local_vic_irq(irq); 1514 enable_local_vic_irq(irq);
1560 } 1515 } else if (vic_irq_mask[real_cpu] & mask) {
1561 else if(vic_irq_mask[real_cpu] & mask) {
1562 vic_irq_enable_mask[real_cpu] |= mask; 1516 vic_irq_enable_mask[real_cpu] |= mask;
1563 processorList |= (1<<real_cpu); 1517 processorList |= (1 << real_cpu);
1564 } 1518 }
1565 } 1519 }
1566 spin_unlock_irqrestore(&vic_irq_lock, flags); 1520 spin_unlock_irqrestore(&vic_irq_lock, flags);
1567 if(processorList) 1521 if (processorList)
1568 send_CPI(processorList, VIC_ENABLE_IRQ_CPI); 1522 send_CPI(processorList, VIC_ENABLE_IRQ_CPI);
1569} 1523}
1570 1524
1571static void 1525static void mask_vic_irq(unsigned int irq)
1572mask_vic_irq(unsigned int irq)
1573{ 1526{
1574 /* lazy disable, do nothing */ 1527 /* lazy disable, do nothing */
1575} 1528}
1576 1529
1577static void 1530static void enable_local_vic_irq(unsigned int irq)
1578enable_local_vic_irq(unsigned int irq)
1579{ 1531{
1580 __u8 cpu = smp_processor_id(); 1532 __u8 cpu = smp_processor_id();
1581 __u16 mask = ~(1 << irq); 1533 __u16 mask = ~(1 << irq);
1582 __u16 old_mask = vic_irq_mask[cpu]; 1534 __u16 old_mask = vic_irq_mask[cpu];
1583 1535
1584 vic_irq_mask[cpu] &= mask; 1536 vic_irq_mask[cpu] &= mask;
1585 if(vic_irq_mask[cpu] == old_mask) 1537 if (vic_irq_mask[cpu] == old_mask)
1586 return; 1538 return;
1587 1539
1588 VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n", 1540 VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n",
1589 irq, cpu)); 1541 irq, cpu));
1590 1542
1591 if (irq & 8) { 1543 if (irq & 8) {
1592 outb_p(cached_A1(cpu),0xA1); 1544 outb_p(cached_A1(cpu), 0xA1);
1593 (void)inb_p(0xA1); 1545 (void)inb_p(0xA1);
1594 } 1546 } else {
1595 else { 1547 outb_p(cached_21(cpu), 0x21);
1596 outb_p(cached_21(cpu),0x21);
1597 (void)inb_p(0x21); 1548 (void)inb_p(0x21);
1598 } 1549 }
1599} 1550}
1600 1551
1601static void 1552static void disable_local_vic_irq(unsigned int irq)
1602disable_local_vic_irq(unsigned int irq)
1603{ 1553{
1604 __u8 cpu = smp_processor_id(); 1554 __u8 cpu = smp_processor_id();
1605 __u16 mask = (1 << irq); 1555 __u16 mask = (1 << irq);
1606 __u16 old_mask = vic_irq_mask[cpu]; 1556 __u16 old_mask = vic_irq_mask[cpu];
1607 1557
1608 if(irq == 7) 1558 if (irq == 7)
1609 return; 1559 return;
1610 1560
1611 vic_irq_mask[cpu] |= mask; 1561 vic_irq_mask[cpu] |= mask;
1612 if(old_mask == vic_irq_mask[cpu]) 1562 if (old_mask == vic_irq_mask[cpu])
1613 return; 1563 return;
1614 1564
1615 VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n", 1565 VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n",
1616 irq, cpu)); 1566 irq, cpu));
1617 1567
1618 if (irq & 8) { 1568 if (irq & 8) {
1619 outb_p(cached_A1(cpu),0xA1); 1569 outb_p(cached_A1(cpu), 0xA1);
1620 (void)inb_p(0xA1); 1570 (void)inb_p(0xA1);
1621 } 1571 } else {
1622 else { 1572 outb_p(cached_21(cpu), 0x21);
1623 outb_p(cached_21(cpu),0x21);
1624 (void)inb_p(0x21); 1573 (void)inb_p(0x21);
1625 } 1574 }
1626} 1575}
@@ -1631,8 +1580,7 @@ disable_local_vic_irq(unsigned int irq)
1631 * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If 1580 * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If
1632 * this interrupt actually comes in, then we mask and ack here to push 1581 * this interrupt actually comes in, then we mask and ack here to push
1633 * the interrupt off to another CPU */ 1582 * the interrupt off to another CPU */
1634static void 1583static void before_handle_vic_irq(unsigned int irq)
1635before_handle_vic_irq(unsigned int irq)
1636{ 1584{
1637 irq_desc_t *desc = irq_desc + irq; 1585 irq_desc_t *desc = irq_desc + irq;
1638 __u8 cpu = smp_processor_id(); 1586 __u8 cpu = smp_processor_id();
@@ -1641,16 +1589,16 @@ before_handle_vic_irq(unsigned int irq)
1641 vic_intr_total++; 1589 vic_intr_total++;
1642 vic_intr_count[cpu]++; 1590 vic_intr_count[cpu]++;
1643 1591
1644 if(!(cpu_irq_affinity[cpu] & (1<<irq))) { 1592 if (!(cpu_irq_affinity[cpu] & (1 << irq))) {
1645 /* The irq is not in our affinity mask, push it off 1593 /* The irq is not in our affinity mask, push it off
1646 * onto another CPU */ 1594 * onto another CPU */
1647 VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d on cpu %d\n", 1595 VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d "
1648 irq, cpu)); 1596 "on cpu %d\n", irq, cpu));
1649 disable_local_vic_irq(irq); 1597 disable_local_vic_irq(irq);
1650 /* set IRQ_INPROGRESS to prevent the handler in irq.c from 1598 /* set IRQ_INPROGRESS to prevent the handler in irq.c from
1651 * actually calling the interrupt routine */ 1599 * actually calling the interrupt routine */
1652 desc->status |= IRQ_REPLAY | IRQ_INPROGRESS; 1600 desc->status |= IRQ_REPLAY | IRQ_INPROGRESS;
1653 } else if(desc->status & IRQ_DISABLED) { 1601 } else if (desc->status & IRQ_DISABLED) {
1654 /* Damn, the interrupt actually arrived, do the lazy 1602 /* Damn, the interrupt actually arrived, do the lazy
1655 * disable thing. The interrupt routine in irq.c will 1603 * disable thing. The interrupt routine in irq.c will
1656 * not handle an IRQ_DISABLED interrupt, so nothing more 1604
@@ -1667,8 +1615,7 @@ before_handle_vic_irq(unsigned int irq)
1667} 1615}
1668 1616
1669/* Finish the VIC interrupt: basically mask */ 1617/* Finish the VIC interrupt: basically mask */
1670static void 1618static void after_handle_vic_irq(unsigned int irq)
1671after_handle_vic_irq(unsigned int irq)
1672{ 1619{
1673 irq_desc_t *desc = irq_desc + irq; 1620 irq_desc_t *desc = irq_desc + irq;
1674 1621
@@ -1685,11 +1632,11 @@ after_handle_vic_irq(unsigned int irq)
1685#ifdef VOYAGER_DEBUG 1632#ifdef VOYAGER_DEBUG
1686 /* DEBUG: before we ack, check what's in progress */ 1633 /* DEBUG: before we ack, check what's in progress */
1687 isr = vic_read_isr(); 1634 isr = vic_read_isr();
1688 if((isr & (1<<irq) && !(status & IRQ_REPLAY)) == 0) { 1635 if ((isr & (1 << irq) && !(status & IRQ_REPLAY)) == 0) {
1689 int i; 1636 int i;
1690 __u8 cpu = smp_processor_id(); 1637 __u8 cpu = smp_processor_id();
1691 __u8 real_cpu; 1638 __u8 real_cpu;
1692 int mask; /* Um... initialize me??? --RR */ 1639 int mask; /* Um... initialize me??? --RR */
1693 1640
1694 printk("VOYAGER SMP: CPU%d lost interrupt %d\n", 1641 printk("VOYAGER SMP: CPU%d lost interrupt %d\n",
1695 cpu, irq); 1642 cpu, irq);
@@ -1698,9 +1645,10 @@ after_handle_vic_irq(unsigned int irq)
1698 outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu, 1645 outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu,
1699 VIC_PROCESSOR_ID); 1646 VIC_PROCESSOR_ID);
1700 isr = vic_read_isr(); 1647 isr = vic_read_isr();
1701 if(isr & (1<<irq)) { 1648 if (isr & (1 << irq)) {
1702 printk("VOYAGER SMP: CPU%d ack irq %d\n", 1649 printk
1703 real_cpu, irq); 1650 ("VOYAGER SMP: CPU%d ack irq %d\n",
1651 real_cpu, irq);
1704 ack_vic_irq(irq); 1652 ack_vic_irq(irq);
1705 } 1653 }
1706 outb(cpu, VIC_PROCESSOR_ID); 1654 outb(cpu, VIC_PROCESSOR_ID);
@@ -1711,7 +1659,7 @@ after_handle_vic_irq(unsigned int irq)
1711 * receipt by another CPU so everything must be in 1659 * receipt by another CPU so everything must be in
1712 * order here */ 1660 * order here */
1713 ack_vic_irq(irq); 1661 ack_vic_irq(irq);
1714 if(status & IRQ_REPLAY) { 1662 if (status & IRQ_REPLAY) {
1715 /* replay is set if we disable the interrupt 1663 /* replay is set if we disable the interrupt
1716 * in the before_handle_vic_irq() routine, so 1664 * in the before_handle_vic_irq() routine, so
1717 * clear the in progress bit here to allow the 1665 * clear the in progress bit here to allow the
@@ -1720,9 +1668,9 @@ after_handle_vic_irq(unsigned int irq)
1720 } 1668 }
1721#ifdef VOYAGER_DEBUG 1669#ifdef VOYAGER_DEBUG
1722 isr = vic_read_isr(); 1670 isr = vic_read_isr();
1723 if((isr & (1<<irq)) != 0) 1671 if ((isr & (1 << irq)) != 0)
1724 printk("VOYAGER SMP: after_handle_vic_irq() after ack irq=%d, isr=0x%x\n", 1672 printk("VOYAGER SMP: after_handle_vic_irq() after "
1725 irq, isr); 1673 "ack irq=%d, isr=0x%x\n", irq, isr);
1726#endif /* VOYAGER_DEBUG */ 1674#endif /* VOYAGER_DEBUG */
1727 } 1675 }
1728 _raw_spin_unlock(&vic_irq_lock); 1676 _raw_spin_unlock(&vic_irq_lock);
@@ -1731,7 +1679,6 @@ after_handle_vic_irq(unsigned int irq)
1731 * may be intercepted by another CPU if reasserted */ 1679 * may be intercepted by another CPU if reasserted */
1732} 1680}
1733 1681
1734
1735/* Linux processor - interrupt affinity manipulations. 1682/* Linux processor - interrupt affinity manipulations.
1736 * 1683 *
1737 * For each processor, we maintain a 32 bit irq affinity mask. 1684 * For each processor, we maintain a 32 bit irq affinity mask.
@@ -1748,8 +1695,7 @@ after_handle_vic_irq(unsigned int irq)
1748 * change the mask and then do an interrupt enable CPI to re-enable on 1695 * change the mask and then do an interrupt enable CPI to re-enable on
1749 * the selected processors */ 1696 * the selected processors */
1750 1697
1751void 1698void set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1752set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1753{ 1699{
1754 /* Only extended processors handle interrupts */ 1700 /* Only extended processors handle interrupts */
1755 unsigned long real_mask; 1701 unsigned long real_mask;
@@ -1757,13 +1703,13 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1757 int cpu; 1703 int cpu;
1758 1704
1759 real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors; 1705 real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors;
1760 1706
1761 if(cpus_addr(mask)[0] == 0) 1707 if (cpus_addr(mask)[0] == 0)
1762 /* can't have no CPUs to accept the interrupt -- extremely 1708 /* can't have no CPUs to accept the interrupt -- extremely
1763 * bad things will happen */ 1709 * bad things will happen */
1764 return; 1710 return;
1765 1711
1766 if(irq == 0) 1712 if (irq == 0)
1767 /* can't change the affinity of the timer IRQ. This 1713 /* can't change the affinity of the timer IRQ. This
1768 * is due to the constraint in the voyager 1714 * is due to the constraint in the voyager
1769 * architecture that the CPI also comes in on an IRQ 1715
@@ -1772,7 +1718,7 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1772 * will no-longer be able to accept VIC CPIs */ 1718 * will no-longer be able to accept VIC CPIs */
1773 return; 1719 return;
1774 1720
1775 if(irq >= 32) 1721 if (irq >= 32)
1776 /* You can only have 32 interrupts in a voyager system 1722 /* You can only have 32 interrupts in a voyager system
1777 * (and 32 only if you have a secondary microchannel 1723 * (and 32 only if you have a secondary microchannel
1778 * bus) */ 1724 * bus) */
@@ -1780,8 +1726,8 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1780 1726
1781 for_each_online_cpu(cpu) { 1727 for_each_online_cpu(cpu) {
1782 unsigned long cpu_mask = 1 << cpu; 1728 unsigned long cpu_mask = 1 << cpu;
1783 1729
1784 if(cpu_mask & real_mask) { 1730 if (cpu_mask & real_mask) {
1785 /* enable the interrupt for this cpu */ 1731 /* enable the interrupt for this cpu */
1786 cpu_irq_affinity[cpu] |= irq_mask; 1732 cpu_irq_affinity[cpu] |= irq_mask;
1787 } else { 1733 } else {
@@ -1800,25 +1746,23 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1800 unmask_vic_irq(irq); 1746 unmask_vic_irq(irq);
1801} 1747}
1802 1748
1803static void 1749static void ack_vic_irq(unsigned int irq)
1804ack_vic_irq(unsigned int irq)
1805{ 1750{
1806 if (irq & 8) { 1751 if (irq & 8) {
1807 outb(0x62,0x20); /* Specific EOI to cascade */ 1752 outb(0x62, 0x20); /* Specific EOI to cascade */
1808 outb(0x60|(irq & 7),0xA0); 1753 outb(0x60 | (irq & 7), 0xA0);
1809 } else { 1754 } else {
1810 outb(0x60 | (irq & 7),0x20); 1755 outb(0x60 | (irq & 7), 0x20);
1811 } 1756 }
1812} 1757}
1813 1758
1814/* enable the CPIs. In the VIC, the CPIs are delivered by the 8259 1759/* enable the CPIs. In the VIC, the CPIs are delivered by the 8259
1815 * but are not vectored by it. This means that the 8259 mask must be 1760 * but are not vectored by it. This means that the 8259 mask must be
1816 * lowered to receive them */ 1761 * lowered to receive them */
1817static __init void 1762static __init void vic_enable_cpi(void)
1818vic_enable_cpi(void)
1819{ 1763{
1820 __u8 cpu = smp_processor_id(); 1764 __u8 cpu = smp_processor_id();
1821 1765
1822 /* just take a copy of the current mask (nop for boot cpu) */ 1766 /* just take a copy of the current mask (nop for boot cpu) */
1823 vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id]; 1767 vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id];
1824 1768
@@ -1827,7 +1771,7 @@ vic_enable_cpi(void)
1827 /* for sys int and cmn int */ 1771 /* for sys int and cmn int */
1828 enable_local_vic_irq(7); 1772 enable_local_vic_irq(7);
1829 1773
1830 if(is_cpu_quad()) { 1774 if (is_cpu_quad()) {
1831 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); 1775 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
1832 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); 1776 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
1833 VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n", 1777 VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n",
@@ -1838,8 +1782,7 @@ vic_enable_cpi(void)
1838 cpu, vic_irq_mask[cpu])); 1782 cpu, vic_irq_mask[cpu]));
1839} 1783}
1840 1784
1841void 1785void voyager_smp_dump()
1842voyager_smp_dump()
1843{ 1786{
1844 int old_cpu = smp_processor_id(), cpu; 1787 int old_cpu = smp_processor_id(), cpu;
1845 1788
@@ -1865,10 +1808,10 @@ voyager_smp_dump()
1865 cpu, vic_irq_mask[cpu], imr, irr, isr); 1808 cpu, vic_irq_mask[cpu], imr, irr, isr);
1866#if 0 1809#if 0
1867 /* These lines are put in to try to unstick an un ack'd irq */ 1810 /* These lines are put in to try to unstick an un ack'd irq */
1868 if(isr != 0) { 1811 if (isr != 0) {
1869 int irq; 1812 int irq;
1870 for(irq=0; irq<16; irq++) { 1813 for (irq = 0; irq < 16; irq++) {
1871 if(isr & (1<<irq)) { 1814 if (isr & (1 << irq)) {
1872 printk("\tCPU%d: ack irq %d\n", 1815 printk("\tCPU%d: ack irq %d\n",
1873 cpu, irq); 1816 cpu, irq);
1874 local_irq_save(flags); 1817 local_irq_save(flags);
@@ -1884,17 +1827,15 @@ voyager_smp_dump()
1884 } 1827 }
1885} 1828}
1886 1829
1887void 1830void smp_voyager_power_off(void *dummy)
1888smp_voyager_power_off(void *dummy)
1889{ 1831{
1890 if(smp_processor_id() == boot_cpu_id) 1832 if (smp_processor_id() == boot_cpu_id)
1891 voyager_power_off(); 1833 voyager_power_off();
1892 else 1834 else
1893 smp_stop_cpu_function(NULL); 1835 smp_stop_cpu_function(NULL);
1894} 1836}
1895 1837
1896static void __init 1838static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)
1897voyager_smp_prepare_cpus(unsigned int max_cpus)
1898{ 1839{
1899 /* FIXME: ignore max_cpus for now */ 1840 /* FIXME: ignore max_cpus for now */
1900 smp_boot_cpus(); 1841 smp_boot_cpus();
@@ -1911,8 +1852,7 @@ static void __cpuinit voyager_smp_prepare_boot_cpu(void)
1911 cpu_set(smp_processor_id(), cpu_present_map); 1852 cpu_set(smp_processor_id(), cpu_present_map);
1912} 1853}
1913 1854
1914static int __cpuinit 1855static int __cpuinit voyager_cpu_up(unsigned int cpu)
1915voyager_cpu_up(unsigned int cpu)
1916{ 1856{
1917 /* This only works at boot for x86. See "rewrite" above. */ 1857 /* This only works at boot for x86. See "rewrite" above. */
1918 if (cpu_isset(cpu, smp_commenced_mask)) 1858 if (cpu_isset(cpu, smp_commenced_mask))
@@ -1928,14 +1868,12 @@ voyager_cpu_up(unsigned int cpu)
1928 return 0; 1868 return 0;
1929} 1869}
1930 1870
1931static void __init 1871static void __init voyager_smp_cpus_done(unsigned int max_cpus)
1932voyager_smp_cpus_done(unsigned int max_cpus)
1933{ 1872{
1934 zap_low_mappings(); 1873 zap_low_mappings();
1935} 1874}
1936 1875
1937void __init 1876void __init smp_setup_processor_id(void)
1938smp_setup_processor_id(void)
1939{ 1877{
1940 current_thread_info()->cpu = hard_smp_processor_id(); 1878 current_thread_info()->cpu = hard_smp_processor_id();
1941 x86_write_percpu(cpu_number, hard_smp_processor_id()); 1879 x86_write_percpu(cpu_number, hard_smp_processor_id());
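
The enable_local_vic_irq()/disable_local_vic_irq() pair above keeps one 16-bit shadow mask per CPU and pushes its low byte out to the master 8259 mask register (port 0x21) and its high byte to the slave (port 0xA1), via the __byte()/cached_21()/cached_A1() macros. The following stand-alone sketch, not kernel code, shows the same bookkeeping; the 4-CPU array, the printf stand-ins for outb() and main() are made up for illustration, and it assumes a little-endian host as on x86 so that byte 0 is the low byte.

#include <stdio.h>

#define __byte(x, y)   (((unsigned char *)&(y))[x])
#define cached_21(cpu) (__byte(0, vic_irq_mask[cpu]))   /* master 8259 mask byte */
#define cached_A1(cpu) (__byte(1, vic_irq_mask[cpu]))   /* slave 8259 mask byte */

static unsigned short vic_irq_mask[4];                  /* hypothetical 4-CPU box */

static void enable_local_irq(int cpu, int irq)
{
        vic_irq_mask[cpu] &= ~(1 << irq);               /* clear bit = unmasked */
        if (irq & 8)                                    /* IRQs 8-15 live on the slave */
                printf("outb(0x%02x, 0xA1)\n", cached_A1(cpu));
        else                                            /* IRQs 0-7 on the master */
                printf("outb(0x%02x, 0x21)\n", cached_21(cpu));
}

int main(void)
{
        vic_irq_mask[0] = 0xffff;                       /* start fully masked */
        enable_local_irq(0, 3);                         /* prints outb(0xf7, 0x21) */
        enable_local_irq(0, 12);                        /* prints outb(0xef, 0xA1) */
        return 0;
}

Note the design choice visible above: mask_vic_irq() deliberately leaves the hardware alone (lazy disable), and only before_handle_vic_irq() does the real local masking if a disabled interrupt actually arrives.
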
diff --git a/arch/x86/mach-voyager/voyager_thread.c b/arch/x86/mach-voyager/voyager_thread.c
index 50f9366c411..c69c931818e 100644
--- a/arch/x86/mach-voyager/voyager_thread.c
+++ b/arch/x86/mach-voyager/voyager_thread.c
@@ -30,12 +30,10 @@
30#include <asm/mtrr.h> 30#include <asm/mtrr.h>
31#include <asm/msr.h> 31#include <asm/msr.h>
32 32
33
34struct task_struct *voyager_thread; 33struct task_struct *voyager_thread;
35static __u8 set_timeout; 34static __u8 set_timeout;
36 35
37static int 36static int execute(const char *string)
38execute(const char *string)
39{ 37{
40 int ret; 38 int ret;
41 39
@@ -52,48 +50,48 @@ execute(const char *string)
52 NULL, 50 NULL,
53 }; 51 };
54 52
55 if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { 53 if ((ret =
56 printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", 54 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
57 string, ret); 55 printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string,
56 ret);
58 } 57 }
59 return ret; 58 return ret;
60} 59}
61 60
62static void 61static void check_from_kernel(void)
63check_from_kernel(void)
64{ 62{
65 if(voyager_status.switch_off) { 63 if (voyager_status.switch_off) {
66 64
67 /* FIXME: This should be configurable via proc */ 65 /* FIXME: This should be configurable via proc */
68 execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1"); 66 execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1");
69 } else if(voyager_status.power_fail) { 67 } else if (voyager_status.power_fail) {
70 VDEBUG(("Voyager daemon detected AC power failure\n")); 68 VDEBUG(("Voyager daemon detected AC power failure\n"));
71 69
72 /* FIXME: This should be configurable via proc */ 70 /* FIXME: This should be configurable via proc */
73 execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1"); 71 execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1");
74 set_timeout = 1; 72 set_timeout = 1;
75 } 73 }
76} 74}
77 75
78static void 76static void check_continuing_condition(void)
79check_continuing_condition(void)
80{ 77{
81 if(voyager_status.power_fail) { 78 if (voyager_status.power_fail) {
82 __u8 data; 79 __u8 data;
83 voyager_cat_psi(VOYAGER_PSI_SUBREAD, 80 voyager_cat_psi(VOYAGER_PSI_SUBREAD,
84 VOYAGER_PSI_AC_FAIL_REG, &data); 81 VOYAGER_PSI_AC_FAIL_REG, &data);
85 if((data & 0x1f) == 0) { 82 if ((data & 0x1f) == 0) {
86 /* all power restored */ 83 /* all power restored */
87 printk(KERN_NOTICE "VOYAGER AC power restored, cancelling shutdown\n"); 84 printk(KERN_NOTICE
85 "VOYAGER AC power restored, cancelling shutdown\n");
88 /* FIXME: should be user configurable */ 86 /* FIXME: should be user configurable */
89 execute("umask 600; echo O > /etc/powerstatus; kill -PWR 1"); 87 execute
88 ("umask 600; echo O > /etc/powerstatus; kill -PWR 1");
90 set_timeout = 0; 89 set_timeout = 0;
91 } 90 }
92 } 91 }
93} 92}
94 93
95static int 94static int thread(void *unused)
96thread(void *unused)
97{ 95{
98 printk(KERN_NOTICE "Voyager starting monitor thread\n"); 96 printk(KERN_NOTICE "Voyager starting monitor thread\n");
99 97
@@ -102,7 +100,7 @@ thread(void *unused)
102 schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT); 100 schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT);
103 101
104 VDEBUG(("Voyager Daemon awoken\n")); 102 VDEBUG(("Voyager Daemon awoken\n"));
105 if(voyager_status.request_from_kernel == 0) { 103 if (voyager_status.request_from_kernel == 0) {
106 /* probably awoken from timeout */ 104 /* probably awoken from timeout */
107 check_continuing_condition(); 105 check_continuing_condition();
108 } else { 106 } else {
@@ -112,20 +110,18 @@ thread(void *unused)
112 } 110 }
113} 111}
114 112
115static int __init 113static int __init voyager_thread_start(void)
116voyager_thread_start(void)
117{ 114{
118 voyager_thread = kthread_run(thread, NULL, "kvoyagerd"); 115 voyager_thread = kthread_run(thread, NULL, "kvoyagerd");
119 if (IS_ERR(voyager_thread)) { 116 if (IS_ERR(voyager_thread)) {
120 printk(KERN_ERR "Voyager: Failed to create system monitor thread.\n"); 117 printk(KERN_ERR
118 "Voyager: Failed to create system monitor thread.\n");
121 return PTR_ERR(voyager_thread); 119 return PTR_ERR(voyager_thread);
122 } 120 }
123 return 0; 121 return 0;
124} 122}
125 123
126 124static void __exit voyager_thread_stop(void)
127static void __exit
128voyager_thread_stop(void)
129{ 125{
130 kthread_stop(voyager_thread); 126 kthread_stop(voyager_thread);
131} 127}
diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c
index a1b0d22f697..59d353d2c59 100644
--- a/arch/x86/math-emu/errors.c
+++ b/arch/x86/math-emu/errors.c
@@ -33,45 +33,41 @@
33#undef PRINT_MESSAGES 33#undef PRINT_MESSAGES
34/* */ 34/* */
35 35
36
37#if 0 36#if 0
38void Un_impl(void) 37void Un_impl(void)
39{ 38{
40 u_char byte1, FPU_modrm; 39 u_char byte1, FPU_modrm;
41 unsigned long address = FPU_ORIG_EIP; 40 unsigned long address = FPU_ORIG_EIP;
42 41
43 RE_ENTRANT_CHECK_OFF; 42 RE_ENTRANT_CHECK_OFF;
44 /* No need to check access_ok(), we have previously fetched these bytes. */ 43 /* No need to check access_ok(), we have previously fetched these bytes. */
45 printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *) address); 44 printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *)address);
46 if ( FPU_CS == __USER_CS ) 45 if (FPU_CS == __USER_CS) {
47 { 46 while (1) {
48 while ( 1 ) 47 FPU_get_user(byte1, (u_char __user *) address);
49 { 48 if ((byte1 & 0xf8) == 0xd8)
50 FPU_get_user(byte1, (u_char __user *) address); 49 break;
51 if ( (byte1 & 0xf8) == 0xd8 ) break; 50 printk("[%02x]", byte1);
52 printk("[%02x]", byte1); 51 address++;
53 address++; 52 }
53 printk("%02x ", byte1);
54 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
55
56 if (FPU_modrm >= 0300)
57 printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8,
58 FPU_modrm & 7);
59 else
60 printk("/%d\n", (FPU_modrm >> 3) & 7);
61 } else {
62 printk("cs selector = %04x\n", FPU_CS);
54 } 63 }
55 printk("%02x ", byte1);
56 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
57
58 if (FPU_modrm >= 0300)
59 printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7);
60 else
61 printk("/%d\n", (FPU_modrm >> 3) & 7);
62 }
63 else
64 {
65 printk("cs selector = %04x\n", FPU_CS);
66 }
67
68 RE_ENTRANT_CHECK_ON;
69
70 EXCEPTION(EX_Invalid);
71 64
72} 65 RE_ENTRANT_CHECK_ON;
73#endif /* 0 */
74 66
67 EXCEPTION(EX_Invalid);
68
69}
70#endif /* 0 */
75 71
76/* 72/*
77 Called for opcodes which are illegal and which are known to result in a 73 Called for opcodes which are illegal and which are known to result in a
@@ -79,139 +75,152 @@ void Un_impl(void)
79 */ 75 */
80void FPU_illegal(void) 76void FPU_illegal(void)
81{ 77{
82 math_abort(FPU_info,SIGILL); 78 math_abort(FPU_info, SIGILL);
83} 79}
84 80
85
86
87void FPU_printall(void) 81void FPU_printall(void)
88{ 82{
89 int i; 83 int i;
90 static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty", 84 static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty",
91 "DeNorm", "Inf", "NaN" }; 85 "DeNorm", "Inf", "NaN"
92 u_char byte1, FPU_modrm; 86 };
93 unsigned long address = FPU_ORIG_EIP; 87 u_char byte1, FPU_modrm;
94 88 unsigned long address = FPU_ORIG_EIP;
95 RE_ENTRANT_CHECK_OFF; 89
96 /* No need to check access_ok(), we have previously fetched these bytes. */ 90 RE_ENTRANT_CHECK_OFF;
97 printk("At %p:", (void *) address); 91 /* No need to check access_ok(), we have previously fetched these bytes. */
98 if ( FPU_CS == __USER_CS ) 92 printk("At %p:", (void *)address);
99 { 93 if (FPU_CS == __USER_CS) {
100#define MAX_PRINTED_BYTES 20 94#define MAX_PRINTED_BYTES 20
101 for ( i = 0; i < MAX_PRINTED_BYTES; i++ ) 95 for (i = 0; i < MAX_PRINTED_BYTES; i++) {
102 { 96 FPU_get_user(byte1, (u_char __user *) address);
103 FPU_get_user(byte1, (u_char __user *) address); 97 if ((byte1 & 0xf8) == 0xd8) {
104 if ( (byte1 & 0xf8) == 0xd8 ) 98 printk(" %02x", byte1);
105 { 99 break;
106 printk(" %02x", byte1); 100 }
107 break; 101 printk(" [%02x]", byte1);
108 } 102 address++;
109 printk(" [%02x]", byte1); 103 }
110 address++; 104 if (i == MAX_PRINTED_BYTES)
111 } 105 printk(" [more..]\n");
112 if ( i == MAX_PRINTED_BYTES ) 106 else {
113 printk(" [more..]\n"); 107 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
114 else 108
115 { 109 if (FPU_modrm >= 0300)
116 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); 110 printk(" %02x (%02x+%d)\n", FPU_modrm,
117 111 FPU_modrm & 0xf8, FPU_modrm & 7);
118 if (FPU_modrm >= 0300) 112 else
119 printk(" %02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7); 113 printk(" /%d, mod=%d rm=%d\n",
120 else 114 (FPU_modrm >> 3) & 7,
121 printk(" /%d, mod=%d rm=%d\n", 115 (FPU_modrm >> 6) & 3, FPU_modrm & 7);
122 (FPU_modrm >> 3) & 7, (FPU_modrm >> 6) & 3, FPU_modrm & 7); 116 }
117 } else {
118 printk("%04x\n", FPU_CS);
123 } 119 }
124 }
125 else
126 {
127 printk("%04x\n", FPU_CS);
128 }
129 120
130 partial_status = status_word(); 121 partial_status = status_word();
131 122
132#ifdef DEBUGGING 123#ifdef DEBUGGING
133if ( partial_status & SW_Backward ) printk("SW: backward compatibility\n"); 124 if (partial_status & SW_Backward)
134if ( partial_status & SW_C3 ) printk("SW: condition bit 3\n"); 125 printk("SW: backward compatibility\n");
135if ( partial_status & SW_C2 ) printk("SW: condition bit 2\n"); 126 if (partial_status & SW_C3)
136if ( partial_status & SW_C1 ) printk("SW: condition bit 1\n"); 127 printk("SW: condition bit 3\n");
137if ( partial_status & SW_C0 ) printk("SW: condition bit 0\n"); 128 if (partial_status & SW_C2)
138if ( partial_status & SW_Summary ) printk("SW: exception summary\n"); 129 printk("SW: condition bit 2\n");
139if ( partial_status & SW_Stack_Fault ) printk("SW: stack fault\n"); 130 if (partial_status & SW_C1)
140if ( partial_status & SW_Precision ) printk("SW: loss of precision\n"); 131 printk("SW: condition bit 1\n");
141if ( partial_status & SW_Underflow ) printk("SW: underflow\n"); 132 if (partial_status & SW_C0)
142if ( partial_status & SW_Overflow ) printk("SW: overflow\n"); 133 printk("SW: condition bit 0\n");
143if ( partial_status & SW_Zero_Div ) printk("SW: divide by zero\n"); 134 if (partial_status & SW_Summary)
144if ( partial_status & SW_Denorm_Op ) printk("SW: denormalized operand\n"); 135 printk("SW: exception summary\n");
145if ( partial_status & SW_Invalid ) printk("SW: invalid operation\n"); 136 if (partial_status & SW_Stack_Fault)
137 printk("SW: stack fault\n");
138 if (partial_status & SW_Precision)
139 printk("SW: loss of precision\n");
140 if (partial_status & SW_Underflow)
141 printk("SW: underflow\n");
142 if (partial_status & SW_Overflow)
143 printk("SW: overflow\n");
144 if (partial_status & SW_Zero_Div)
145 printk("SW: divide by zero\n");
146 if (partial_status & SW_Denorm_Op)
147 printk("SW: denormalized operand\n");
148 if (partial_status & SW_Invalid)
149 printk("SW: invalid operation\n");
146#endif /* DEBUGGING */ 150#endif /* DEBUGGING */
147 151
148 printk(" SW: b=%d st=%ld es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n", 152 printk(" SW: b=%d st=%d es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n", partial_status & 0x8000 ? 1 : 0, /* busy */
149 partial_status & 0x8000 ? 1 : 0, /* busy */ 153 (partial_status & 0x3800) >> 11, /* stack top pointer */
150 (partial_status & 0x3800) >> 11, /* stack top pointer */ 154 partial_status & 0x80 ? 1 : 0, /* Error summary status */
151 partial_status & 0x80 ? 1 : 0, /* Error summary status */ 155 partial_status & 0x40 ? 1 : 0, /* Stack flag */
152 partial_status & 0x40 ? 1 : 0, /* Stack flag */ 156 partial_status & SW_C3 ? 1 : 0, partial_status & SW_C2 ? 1 : 0, /* cc */
153 partial_status & SW_C3?1:0, partial_status & SW_C2?1:0, /* cc */ 157 partial_status & SW_C1 ? 1 : 0, partial_status & SW_C0 ? 1 : 0, /* cc */
154 partial_status & SW_C1?1:0, partial_status & SW_C0?1:0, /* cc */ 158 partial_status & SW_Precision ? 1 : 0,
155 partial_status & SW_Precision?1:0, partial_status & SW_Underflow?1:0, 159 partial_status & SW_Underflow ? 1 : 0,
156 partial_status & SW_Overflow?1:0, partial_status & SW_Zero_Div?1:0, 160 partial_status & SW_Overflow ? 1 : 0,
157 partial_status & SW_Denorm_Op?1:0, partial_status & SW_Invalid?1:0); 161 partial_status & SW_Zero_Div ? 1 : 0,
158 162 partial_status & SW_Denorm_Op ? 1 : 0,
159printk(" CW: ic=%d rc=%ld%ld pc=%ld%ld iem=%d ef=%d%d%d%d%d%d\n", 163 partial_status & SW_Invalid ? 1 : 0);
160 control_word & 0x1000 ? 1 : 0, 164
161 (control_word & 0x800) >> 11, (control_word & 0x400) >> 10, 165 printk(" CW: ic=%d rc=%d%d pc=%d%d iem=%d ef=%d%d%d%d%d%d\n",
162 (control_word & 0x200) >> 9, (control_word & 0x100) >> 8, 166 control_word & 0x1000 ? 1 : 0,
163 control_word & 0x80 ? 1 : 0, 167 (control_word & 0x800) >> 11, (control_word & 0x400) >> 10,
164 control_word & SW_Precision?1:0, control_word & SW_Underflow?1:0, 168 (control_word & 0x200) >> 9, (control_word & 0x100) >> 8,
165 control_word & SW_Overflow?1:0, control_word & SW_Zero_Div?1:0, 169 control_word & 0x80 ? 1 : 0,
166 control_word & SW_Denorm_Op?1:0, control_word & SW_Invalid?1:0); 170 control_word & SW_Precision ? 1 : 0,
167 171 control_word & SW_Underflow ? 1 : 0,
168 for ( i = 0; i < 8; i++ ) 172 control_word & SW_Overflow ? 1 : 0,
169 { 173 control_word & SW_Zero_Div ? 1 : 0,
170 FPU_REG *r = &st(i); 174 control_word & SW_Denorm_Op ? 1 : 0,
171 u_char tagi = FPU_gettagi(i); 175 control_word & SW_Invalid ? 1 : 0);
172 switch (tagi) 176
173 { 177 for (i = 0; i < 8; i++) {
174 case TAG_Empty: 178 FPU_REG *r = &st(i);
175 continue; 179 u_char tagi = FPU_gettagi(i);
176 break; 180 switch (tagi) {
177 case TAG_Zero: 181 case TAG_Empty:
178 case TAG_Special: 182 continue;
179 tagi = FPU_Special(r); 183 break;
180 case TAG_Valid: 184 case TAG_Zero:
181 printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i, 185 case TAG_Special:
182 getsign(r) ? '-' : '+', 186 tagi = FPU_Special(r);
183 (long)(r->sigh >> 16), 187 case TAG_Valid:
184 (long)(r->sigh & 0xFFFF), 188 printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i,
185 (long)(r->sigl >> 16), 189 getsign(r) ? '-' : '+',
186 (long)(r->sigl & 0xFFFF), 190 (long)(r->sigh >> 16),
187 exponent(r) - EXP_BIAS + 1); 191 (long)(r->sigh & 0xFFFF),
188 break; 192 (long)(r->sigl >> 16),
189 default: 193 (long)(r->sigl & 0xFFFF),
190 printk("Whoops! Error in errors.c: tag%d is %d ", i, tagi); 194 exponent(r) - EXP_BIAS + 1);
191 continue; 195 break;
192 break; 196 default:
197 printk("Whoops! Error in errors.c: tag%d is %d ", i,
198 tagi);
199 continue;
200 break;
201 }
202 printk("%s\n", tag_desc[(int)(unsigned)tagi]);
193 } 203 }
194 printk("%s\n", tag_desc[(int) (unsigned) tagi]);
195 }
196 204
197 RE_ENTRANT_CHECK_ON; 205 RE_ENTRANT_CHECK_ON;
198 206
199} 207}
200 208
201static struct { 209static struct {
202 int type; 210 int type;
203 const char *name; 211 const char *name;
204} exception_names[] = { 212} exception_names[] = {
205 { EX_StackOver, "stack overflow" }, 213 {
206 { EX_StackUnder, "stack underflow" }, 214 EX_StackOver, "stack overflow"}, {
207 { EX_Precision, "loss of precision" }, 215 EX_StackUnder, "stack underflow"}, {
208 { EX_Underflow, "underflow" }, 216 EX_Precision, "loss of precision"}, {
209 { EX_Overflow, "overflow" }, 217 EX_Underflow, "underflow"}, {
210 { EX_ZeroDiv, "divide by zero" }, 218 EX_Overflow, "overflow"}, {
211 { EX_Denormal, "denormalized operand" }, 219 EX_ZeroDiv, "divide by zero"}, {
212 { EX_Invalid, "invalid operation" }, 220 EX_Denormal, "denormalized operand"}, {
213 { EX_INTERNAL, "INTERNAL BUG in "FPU_VERSION }, 221 EX_Invalid, "invalid operation"}, {
214 { 0, NULL } 222 EX_INTERNAL, "INTERNAL BUG in " FPU_VERSION}, {
223 0, NULL}
215}; 224};
216 225
217/* 226/*
@@ -295,445 +304,386 @@ static struct {
295 304
296asmlinkage void FPU_exception(int n) 305asmlinkage void FPU_exception(int n)
297{ 306{
298 int i, int_type; 307 int i, int_type;
299 308
300 int_type = 0; /* Needed only to stop compiler warnings */ 309 int_type = 0; /* Needed only to stop compiler warnings */
301 if ( n & EX_INTERNAL ) 310 if (n & EX_INTERNAL) {
302 { 311 int_type = n - EX_INTERNAL;
303 int_type = n - EX_INTERNAL; 312 n = EX_INTERNAL;
304 n = EX_INTERNAL; 313 /* Set lots of exception bits! */
305 /* Set lots of exception bits! */ 314 partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward);
306 partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward); 315 } else {
307 } 316 /* Extract only the bits which we use to set the status word */
308 else 317 n &= (SW_Exc_Mask);
309 { 318 /* Set the corresponding exception bit */
310 /* Extract only the bits which we use to set the status word */ 319 partial_status |= n;
311 n &= (SW_Exc_Mask); 320 /* Set summary bits iff exception isn't masked */
312 /* Set the corresponding exception bit */ 321 if (partial_status & ~control_word & CW_Exceptions)
313 partial_status |= n; 322 partial_status |= (SW_Summary | SW_Backward);
314 /* Set summary bits iff exception isn't masked */ 323 if (n & (SW_Stack_Fault | EX_Precision)) {
315 if ( partial_status & ~control_word & CW_Exceptions ) 324 if (!(n & SW_C1))
316 partial_status |= (SW_Summary | SW_Backward); 325 /* This bit distinguishes over- from underflow for a stack fault,
317 if ( n & (SW_Stack_Fault | EX_Precision) ) 326 and roundup from round-down for precision loss. */
318 { 327 partial_status &= ~SW_C1;
319 if ( !(n & SW_C1) ) 328 }
320 /* This bit distinguishes over- from underflow for a stack fault,
321 and roundup from round-down for precision loss. */
322 partial_status &= ~SW_C1;
323 } 329 }
324 }
325 330
326 RE_ENTRANT_CHECK_OFF; 331 RE_ENTRANT_CHECK_OFF;
327 if ( (~control_word & n & CW_Exceptions) || (n == EX_INTERNAL) ) 332 if ((~control_word & n & CW_Exceptions) || (n == EX_INTERNAL)) {
328 {
329#ifdef PRINT_MESSAGES 333#ifdef PRINT_MESSAGES
330 /* My message from the sponsor */ 334 /* My message from the sponsor */
331 printk(FPU_VERSION" "__DATE__" (C) W. Metzenthen.\n"); 335 printk(FPU_VERSION " " __DATE__ " (C) W. Metzenthen.\n");
332#endif /* PRINT_MESSAGES */ 336#endif /* PRINT_MESSAGES */
333 337
334 /* Get a name string for error reporting */ 338 /* Get a name string for error reporting */
335 for (i=0; exception_names[i].type; i++) 339 for (i = 0; exception_names[i].type; i++)
336 if ( (exception_names[i].type & n) == exception_names[i].type ) 340 if ((exception_names[i].type & n) ==
337 break; 341 exception_names[i].type)
338 342 break;
339 if (exception_names[i].type) 343
340 { 344 if (exception_names[i].type) {
341#ifdef PRINT_MESSAGES 345#ifdef PRINT_MESSAGES
342 printk("FP Exception: %s!\n", exception_names[i].name); 346 printk("FP Exception: %s!\n", exception_names[i].name);
343#endif /* PRINT_MESSAGES */ 347#endif /* PRINT_MESSAGES */
344 } 348 } else
345 else 349 printk("FPU emulator: Unknown Exception: 0x%04x!\n", n);
346 printk("FPU emulator: Unknown Exception: 0x%04x!\n", n); 350
347 351 if (n == EX_INTERNAL) {
348 if ( n == EX_INTERNAL ) 352 printk("FPU emulator: Internal error type 0x%04x\n",
349 { 353 int_type);
350 printk("FPU emulator: Internal error type 0x%04x\n", int_type); 354 FPU_printall();
351 FPU_printall(); 355 }
352 }
353#ifdef PRINT_MESSAGES 356#ifdef PRINT_MESSAGES
354 else 357 else
355 FPU_printall(); 358 FPU_printall();
356#endif /* PRINT_MESSAGES */ 359#endif /* PRINT_MESSAGES */
357 360
358 /* 361 /*
359 * The 80486 generates an interrupt on the next non-control FPU 362 * The 80486 generates an interrupt on the next non-control FPU
360 * instruction. So we need some means of flagging it. 363 * instruction. So we need some means of flagging it.
361 * We use the ES (Error Summary) bit for this. 364 * We use the ES (Error Summary) bit for this.
362 */ 365 */
363 } 366 }
364 RE_ENTRANT_CHECK_ON; 367 RE_ENTRANT_CHECK_ON;
365 368
366#ifdef __DEBUG__ 369#ifdef __DEBUG__
367 math_abort(FPU_info,SIGFPE); 370 math_abort(FPU_info, SIGFPE);
368#endif /* __DEBUG__ */ 371#endif /* __DEBUG__ */
369 372
370} 373}
371 374
372
373/* Real operation attempted on a NaN. */ 375/* Real operation attempted on a NaN. */
374/* Returns < 0 if the exception is unmasked */ 376/* Returns < 0 if the exception is unmasked */
375int real_1op_NaN(FPU_REG *a) 377int real_1op_NaN(FPU_REG *a)
376{ 378{
377 int signalling, isNaN; 379 int signalling, isNaN;
378 380
379 isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000); 381 isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000);
380 382
381 /* The default result for the case of two "equal" NaNs (signs may 383 /* The default result for the case of two "equal" NaNs (signs may
382 differ) is chosen to reproduce 80486 behaviour */ 384 differ) is chosen to reproduce 80486 behaviour */
383 signalling = isNaN && !(a->sigh & 0x40000000); 385 signalling = isNaN && !(a->sigh & 0x40000000);
384 386
385 if ( !signalling ) 387 if (!signalling) {
386 { 388 if (!isNaN) { /* pseudo-NaN, or other unsupported? */
387 if ( !isNaN ) /* pseudo-NaN, or other unsupported? */ 389 if (control_word & CW_Invalid) {
388 { 390 /* Masked response */
389 if ( control_word & CW_Invalid ) 391 reg_copy(&CONST_QNaN, a);
390 { 392 }
391 /* Masked response */ 393 EXCEPTION(EX_Invalid);
392 reg_copy(&CONST_QNaN, a); 394 return (!(control_word & CW_Invalid) ? FPU_Exception :
393 } 395 0) | TAG_Special;
394 EXCEPTION(EX_Invalid); 396 }
395 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; 397 return TAG_Special;
396 } 398 }
397 return TAG_Special;
398 }
399 399
400 if ( control_word & CW_Invalid ) 400 if (control_word & CW_Invalid) {
401 { 401 /* The masked response */
402 /* The masked response */ 402 if (!(a->sigh & 0x80000000)) { /* pseudo-NaN ? */
403 if ( !(a->sigh & 0x80000000) ) /* pseudo-NaN ? */ 403 reg_copy(&CONST_QNaN, a);
404 { 404 }
405 reg_copy(&CONST_QNaN, a); 405 /* ensure a Quiet NaN */
406 a->sigh |= 0x40000000;
406 } 407 }
407 /* ensure a Quiet NaN */
408 a->sigh |= 0x40000000;
409 }
410 408
411 EXCEPTION(EX_Invalid); 409 EXCEPTION(EX_Invalid);
412 410
413 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; 411 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
414} 412}
415 413
416
417/* Real operation attempted on two operands, one a NaN. */ 414/* Real operation attempted on two operands, one a NaN. */
418/* Returns < 0 if the exception is unmasked */ 415/* Returns < 0 if the exception is unmasked */
419int real_2op_NaN(FPU_REG const *b, u_char tagb, 416int real_2op_NaN(FPU_REG const *b, u_char tagb,
420 int deststnr, 417 int deststnr, FPU_REG const *defaultNaN)
421 FPU_REG const *defaultNaN)
422{ 418{
423 FPU_REG *dest = &st(deststnr); 419 FPU_REG *dest = &st(deststnr);
424 FPU_REG const *a = dest; 420 FPU_REG const *a = dest;
425 u_char taga = FPU_gettagi(deststnr); 421 u_char taga = FPU_gettagi(deststnr);
426 FPU_REG const *x; 422 FPU_REG const *x;
427 int signalling, unsupported; 423 int signalling, unsupported;
428 424
429 if ( taga == TAG_Special ) 425 if (taga == TAG_Special)
430 taga = FPU_Special(a); 426 taga = FPU_Special(a);
431 if ( tagb == TAG_Special ) 427 if (tagb == TAG_Special)
432 tagb = FPU_Special(b); 428 tagb = FPU_Special(b);
433 429
434 /* TW_NaN is also used for unsupported data types. */ 430 /* TW_NaN is also used for unsupported data types. */
435 unsupported = ((taga == TW_NaN) 431 unsupported = ((taga == TW_NaN)
436 && !((exponent(a) == EXP_OVER) && (a->sigh & 0x80000000))) 432 && !((exponent(a) == EXP_OVER)
437 || ((tagb == TW_NaN) 433 && (a->sigh & 0x80000000)))
438 && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000))); 434 || ((tagb == TW_NaN)
439 if ( unsupported ) 435 && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000)));
440 { 436 if (unsupported) {
441 if ( control_word & CW_Invalid ) 437 if (control_word & CW_Invalid) {
442 { 438 /* Masked response */
443 /* Masked response */ 439 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
444 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr); 440 }
445 } 441 EXCEPTION(EX_Invalid);
446 EXCEPTION(EX_Invalid); 442 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) |
447 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; 443 TAG_Special;
448 }
449
450 if (taga == TW_NaN)
451 {
452 x = a;
453 if (tagb == TW_NaN)
454 {
455 signalling = !(a->sigh & b->sigh & 0x40000000);
456 if ( significand(b) > significand(a) )
457 x = b;
458 else if ( significand(b) == significand(a) )
459 {
460 /* The default result for the case of two "equal" NaNs (signs may
461 differ) is chosen to reproduce 80486 behaviour */
462 x = defaultNaN;
463 }
464 }
465 else
466 {
467 /* return the quiet version of the NaN in a */
468 signalling = !(a->sigh & 0x40000000);
469 } 444 }
470 } 445
471 else 446 if (taga == TW_NaN) {
447 x = a;
448 if (tagb == TW_NaN) {
449 signalling = !(a->sigh & b->sigh & 0x40000000);
450 if (significand(b) > significand(a))
451 x = b;
452 else if (significand(b) == significand(a)) {
453 /* The default result for the case of two "equal" NaNs (signs may
454 differ) is chosen to reproduce 80486 behaviour */
455 x = defaultNaN;
456 }
457 } else {
458 /* return the quiet version of the NaN in a */
459 signalling = !(a->sigh & 0x40000000);
460 }
461 } else
472#ifdef PARANOID 462#ifdef PARANOID
473 if (tagb == TW_NaN) 463 if (tagb == TW_NaN)
474#endif /* PARANOID */ 464#endif /* PARANOID */
475 { 465 {
476 signalling = !(b->sigh & 0x40000000); 466 signalling = !(b->sigh & 0x40000000);
477 x = b; 467 x = b;
478 } 468 }
479#ifdef PARANOID 469#ifdef PARANOID
480 else 470 else {
481 { 471 signalling = 0;
482 signalling = 0; 472 EXCEPTION(EX_INTERNAL | 0x113);
483 EXCEPTION(EX_INTERNAL|0x113); 473 x = &CONST_QNaN;
484 x = &CONST_QNaN; 474 }
485 }
486#endif /* PARANOID */ 475#endif /* PARANOID */
487 476
488 if ( (!signalling) || (control_word & CW_Invalid) ) 477 if ((!signalling) || (control_word & CW_Invalid)) {
489 { 478 if (!x)
490 if ( ! x ) 479 x = b;
491 x = b;
492 480
493 if ( !(x->sigh & 0x80000000) ) /* pseudo-NaN ? */ 481 if (!(x->sigh & 0x80000000)) /* pseudo-NaN ? */
494 x = &CONST_QNaN; 482 x = &CONST_QNaN;
495 483
496 FPU_copy_to_regi(x, TAG_Special, deststnr); 484 FPU_copy_to_regi(x, TAG_Special, deststnr);
497 485
498 if ( !signalling ) 486 if (!signalling)
499 return TAG_Special; 487 return TAG_Special;
500 488
501 /* ensure a Quiet NaN */ 489 /* ensure a Quiet NaN */
502 dest->sigh |= 0x40000000; 490 dest->sigh |= 0x40000000;
503 } 491 }
504 492
505 EXCEPTION(EX_Invalid); 493 EXCEPTION(EX_Invalid);
506 494
507 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; 495 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
508} 496}
509 497
510
511/* Invalid arith operation on Valid registers */ 498/* Invalid arith operation on Valid registers */
512/* Returns < 0 if the exception is unmasked */ 499/* Returns < 0 if the exception is unmasked */
513asmlinkage int arith_invalid(int deststnr) 500asmlinkage int arith_invalid(int deststnr)
514{ 501{
515 502
516 EXCEPTION(EX_Invalid); 503 EXCEPTION(EX_Invalid);
517
518 if ( control_word & CW_Invalid )
519 {
520 /* The masked response */
521 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
522 }
523
524 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid;
525 504
526} 505 if (control_word & CW_Invalid) {
506 /* The masked response */
507 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
508 }
527 509
510 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid;
511
512}
528 513
529/* Divide a finite number by zero */ 514/* Divide a finite number by zero */
530asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign) 515asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign)
531{ 516{
532 FPU_REG *dest = &st(deststnr); 517 FPU_REG *dest = &st(deststnr);
533 int tag = TAG_Valid; 518 int tag = TAG_Valid;
519
520 if (control_word & CW_ZeroDiv) {
521 /* The masked response */
522 FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr);
523 setsign(dest, sign);
524 tag = TAG_Special;
525 }
534 526
535 if ( control_word & CW_ZeroDiv ) 527 EXCEPTION(EX_ZeroDiv);
536 {
537 /* The masked response */
538 FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr);
539 setsign(dest, sign);
540 tag = TAG_Special;
541 }
542
543 EXCEPTION(EX_ZeroDiv);
544 528
545 return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag; 529 return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag;
546 530
547} 531}
548 532
549
550/* This may be called often, so keep it lean */ 533/* This may be called often, so keep it lean */
551int set_precision_flag(int flags) 534int set_precision_flag(int flags)
552{ 535{
553 if ( control_word & CW_Precision ) 536 if (control_word & CW_Precision) {
554 { 537 partial_status &= ~(SW_C1 & flags);
555 partial_status &= ~(SW_C1 & flags); 538 partial_status |= flags; /* The masked response */
556 partial_status |= flags; /* The masked response */ 539 return 0;
557 return 0; 540 } else {
558 } 541 EXCEPTION(flags);
559 else 542 return 1;
560 { 543 }
561 EXCEPTION(flags);
562 return 1;
563 }
564} 544}
565 545
566
567/* This may be called often, so keep it lean */ 546/* This may be called often, so keep it lean */
568asmlinkage void set_precision_flag_up(void) 547asmlinkage void set_precision_flag_up(void)
569{ 548{
570 if ( control_word & CW_Precision ) 549 if (control_word & CW_Precision)
571 partial_status |= (SW_Precision | SW_C1); /* The masked response */ 550 partial_status |= (SW_Precision | SW_C1); /* The masked response */
572 else 551 else
573 EXCEPTION(EX_Precision | SW_C1); 552 EXCEPTION(EX_Precision | SW_C1);
574} 553}
575 554
576
577/* This may be called often, so keep it lean */ 555/* This may be called often, so keep it lean */
578asmlinkage void set_precision_flag_down(void) 556asmlinkage void set_precision_flag_down(void)
579{ 557{
580 if ( control_word & CW_Precision ) 558 if (control_word & CW_Precision) { /* The masked response */
581 { /* The masked response */ 559 partial_status &= ~SW_C1;
582 partial_status &= ~SW_C1; 560 partial_status |= SW_Precision;
583 partial_status |= SW_Precision; 561 } else
584 } 562 EXCEPTION(EX_Precision);
585 else
586 EXCEPTION(EX_Precision);
587} 563}
588 564
589
590asmlinkage int denormal_operand(void) 565asmlinkage int denormal_operand(void)
591{ 566{
592 if ( control_word & CW_Denormal ) 567 if (control_word & CW_Denormal) { /* The masked response */
593 { /* The masked response */ 568 partial_status |= SW_Denorm_Op;
594 partial_status |= SW_Denorm_Op; 569 return TAG_Special;
595 return TAG_Special; 570 } else {
596 } 571 EXCEPTION(EX_Denormal);
597 else 572 return TAG_Special | FPU_Exception;
598 { 573 }
599 EXCEPTION(EX_Denormal);
600 return TAG_Special | FPU_Exception;
601 }
602} 574}
603 575
604
605asmlinkage int arith_overflow(FPU_REG *dest) 576asmlinkage int arith_overflow(FPU_REG *dest)
606{ 577{
607 int tag = TAG_Valid; 578 int tag = TAG_Valid;
608 579
609 if ( control_word & CW_Overflow ) 580 if (control_word & CW_Overflow) {
610 { 581 /* The masked response */
611 /* The masked response */
612/* ###### The response here depends upon the rounding mode */ 582/* ###### The response here depends upon the rounding mode */
613 reg_copy(&CONST_INF, dest); 583 reg_copy(&CONST_INF, dest);
614 tag = TAG_Special; 584 tag = TAG_Special;
615 } 585 } else {
616 else 586 /* Subtract the magic number from the exponent */
617 { 587 addexponent(dest, (-3 * (1 << 13)));
618 /* Subtract the magic number from the exponent */ 588 }
619 addexponent(dest, (-3 * (1 << 13)));
620 }
621
622 EXCEPTION(EX_Overflow);
623 if ( control_word & CW_Overflow )
624 {
625 /* The overflow exception is masked. */
626 /* By definition, precision is lost.
627 The roundup bit (C1) is also set because we have
628 "rounded" upwards to Infinity. */
629 EXCEPTION(EX_Precision | SW_C1);
630 return tag;
631 }
632
633 return tag;
634 589
635} 590 EXCEPTION(EX_Overflow);
591 if (control_word & CW_Overflow) {
592 /* The overflow exception is masked. */
593 /* By definition, precision is lost.
594 The roundup bit (C1) is also set because we have
595 "rounded" upwards to Infinity. */
596 EXCEPTION(EX_Precision | SW_C1);
597 return tag;
598 }
599
600 return tag;
636 601
602}
637 603
638asmlinkage int arith_underflow(FPU_REG *dest) 604asmlinkage int arith_underflow(FPU_REG *dest)
639{ 605{
640 int tag = TAG_Valid; 606 int tag = TAG_Valid;
641 607
642 if ( control_word & CW_Underflow ) 608 if (control_word & CW_Underflow) {
643 { 609 /* The masked response */
644 /* The masked response */ 610 if (exponent16(dest) <= EXP_UNDER - 63) {
645 if ( exponent16(dest) <= EXP_UNDER - 63 ) 611 reg_copy(&CONST_Z, dest);
646 { 612 partial_status &= ~SW_C1; /* Round down. */
647 reg_copy(&CONST_Z, dest); 613 tag = TAG_Zero;
648 partial_status &= ~SW_C1; /* Round down. */ 614 } else {
649 tag = TAG_Zero; 615 stdexp(dest);
616 }
617 } else {
618 /* Add the magic number to the exponent. */
619 addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias);
650 } 620 }
651 else 621
652 { 622 EXCEPTION(EX_Underflow);
653 stdexp(dest); 623 if (control_word & CW_Underflow) {
624 /* The underflow exception is masked. */
625 EXCEPTION(EX_Precision);
626 return tag;
654 } 627 }
655 }
656 else
657 {
658 /* Add the magic number to the exponent. */
659 addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias);
660 }
661
662 EXCEPTION(EX_Underflow);
663 if ( control_word & CW_Underflow )
664 {
665 /* The underflow exception is masked. */
666 EXCEPTION(EX_Precision);
667 return tag;
668 }
669
670 return tag;
671 628
672} 629 return tag;
673 630
631}
674 632
675void FPU_stack_overflow(void) 633void FPU_stack_overflow(void)
676{ 634{
677 635
678 if ( control_word & CW_Invalid ) 636 if (control_word & CW_Invalid) {
679 { 637 /* The masked response */
680 /* The masked response */ 638 top--;
681 top--; 639 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
682 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); 640 }
683 }
684 641
685 EXCEPTION(EX_StackOver); 642 EXCEPTION(EX_StackOver);
686 643
687 return; 644 return;
688 645
689} 646}
690 647
691
692void FPU_stack_underflow(void) 648void FPU_stack_underflow(void)
693{ 649{
694 650
695 if ( control_word & CW_Invalid ) 651 if (control_word & CW_Invalid) {
696 { 652 /* The masked response */
697 /* The masked response */ 653 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
698 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); 654 }
699 }
700 655
701 EXCEPTION(EX_StackUnder); 656 EXCEPTION(EX_StackUnder);
702 657
703 return; 658 return;
704 659
705} 660}
706 661
707
708void FPU_stack_underflow_i(int i) 662void FPU_stack_underflow_i(int i)
709{ 663{
710 664
711 if ( control_word & CW_Invalid ) 665 if (control_word & CW_Invalid) {
712 { 666 /* The masked response */
713 /* The masked response */ 667 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
714 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); 668 }
715 }
716 669
717 EXCEPTION(EX_StackUnder); 670 EXCEPTION(EX_StackUnder);
718 671
719 return; 672 return;
720 673
721} 674}
722 675
723
724void FPU_stack_underflow_pop(int i) 676void FPU_stack_underflow_pop(int i)
725{ 677{
726 678
727 if ( control_word & CW_Invalid ) 679 if (control_word & CW_Invalid) {
728 { 680 /* The masked response */
729 /* The masked response */ 681 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
730 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); 682 FPU_pop();
731 FPU_pop(); 683 }
732 }
733 684
734 EXCEPTION(EX_StackUnder); 685 EXCEPTION(EX_StackUnder);
735 686
736 return; 687 return;
737 688
738} 689}
739
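
Nearly every handler in errors.c follows the same shape: if the matching control-word mask bit is set, write the masked default result; either way record the exception, and fold an FPU_Exception flag into the return value only when the exception is unmasked so the caller knows to abort the instruction. A stand-alone illustration of that pattern follows; the constants and return encoding are simplified stand-ins rather than the emulator's real values, though 0x037f is the control word that finit() installs.

#include <stdio.h>

#define CW_ZeroDiv    0x0004   /* illustrative mask bit */
#define FPU_Exception 0x0080   /* illustrative "unmasked" flag */
#define TAG_Valid     0
#define TAG_Special   2

static int divide_by_zero(unsigned short control_word)
{
        int tag = TAG_Valid;

        if (control_word & CW_ZeroDiv) {
                /* masked response: substitute the default result (infinity) */
                tag = TAG_Special;
        }
        /* the emulator would set the status-word bits here in either case */
        return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag;
}

int main(void)
{
        printf("masked (CW=0x037f):   0x%02x\n", divide_by_zero(0x037f));
        printf("unmasked (CW=0x0000): 0x%02x\n", divide_by_zero(0x0000));
        return 0;
}

FPU_divide_by_zero() above is exactly this shape, with FPU_copy_to_regi(&CONST_INF, ...) as the masked response.
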
diff --git a/arch/x86/math-emu/exception.h b/arch/x86/math-emu/exception.h
index b463f21a811..67f43a4683d 100644
--- a/arch/x86/math-emu/exception.h
+++ b/arch/x86/math-emu/exception.h
@@ -9,7 +9,6 @@
9#ifndef _EXCEPTION_H_ 9#ifndef _EXCEPTION_H_
10#define _EXCEPTION_H_ 10#define _EXCEPTION_H_
11 11
12
13#ifdef __ASSEMBLY__ 12#ifdef __ASSEMBLY__
14#define Const_(x) $##x 13#define Const_(x) $##x
15#else 14#else
@@ -20,8 +19,8 @@
20#include "fpu_emu.h" 19#include "fpu_emu.h"
21#endif /* SW_C1 */ 20#endif /* SW_C1 */
22 21
23#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */ 22#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */
24#define EX_ErrorSummary Const_(0x0080) /* Error summary status */ 23#define EX_ErrorSummary Const_(0x0080) /* Error summary status */
25/* Special exceptions: */ 24/* Special exceptions: */
26#define EX_INTERNAL Const_(0x8000) /* Internal error in wm-FPU-emu */ 25#define EX_INTERNAL Const_(0x8000) /* Internal error in wm-FPU-emu */
27#define EX_StackOver Const_(0x0041|SW_C1) /* stack overflow */ 26#define EX_StackOver Const_(0x0041|SW_C1) /* stack overflow */
@@ -34,11 +33,9 @@
34#define EX_Denormal Const_(0x0002) /* denormalized operand */ 33#define EX_Denormal Const_(0x0002) /* denormalized operand */
35#define EX_Invalid Const_(0x0001) /* invalid operation */ 34#define EX_Invalid Const_(0x0001) /* invalid operation */
36 35
37
38#define PRECISION_LOST_UP Const_((EX_Precision | SW_C1)) 36#define PRECISION_LOST_UP Const_((EX_Precision | SW_C1))
39#define PRECISION_LOST_DOWN Const_(EX_Precision) 37#define PRECISION_LOST_DOWN Const_(EX_Precision)
40 38
41
42#ifndef __ASSEMBLY__ 39#ifndef __ASSEMBLY__
43 40
44#ifdef DEBUG 41#ifdef DEBUG
@@ -48,6 +45,6 @@
48#define EXCEPTION(x) FPU_exception(x) 45#define EXCEPTION(x) FPU_exception(x)
49#endif 46#endif
50 47
51#endif /* __ASSEMBLY__ */ 48#endif /* __ASSEMBLY__ */
52 49
53#endif /* _EXCEPTION_H_ */ 50#endif /* _EXCEPTION_H_ */
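
exception.h above is shared between C and assembly sources; Const_() exists so the same constant expands with a leading '$' (immediate-operand syntax) when included from .S files and as a bare value from C. A sketch of the idiom with a hypothetical flag, assuming the non-assembly branch of Const_() simply expands to its argument:

#ifdef __ASSEMBLY__
#define Const_(x) $##x          /* .S files: "$0x0040", an immediate operand */
#else
#define Const_(x) x             /* C files: plain "0x0040" */
#endif

#define EXAMPLE_FLAG Const_(0x0040)   /* hypothetical flag, not from the header */

#ifndef __ASSEMBLY__
static inline int example_flag_set(int status)
{
        return (status & EXAMPLE_FLAG) != 0;
}
#endif
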
diff --git a/arch/x86/math-emu/fpu_arith.c b/arch/x86/math-emu/fpu_arith.c
index 6972dec01af..aeab24e083c 100644
--- a/arch/x86/math-emu/fpu_arith.c
+++ b/arch/x86/math-emu/fpu_arith.c
@@ -15,160 +15,138 @@
15#include "control_w.h" 15#include "control_w.h"
16#include "status_w.h" 16#include "status_w.h"
17 17
18
19void fadd__(void) 18void fadd__(void)
20{ 19{
21 /* fadd st,st(i) */ 20 /* fadd st,st(i) */
22 int i = FPU_rm; 21 int i = FPU_rm;
23 clear_C1(); 22 clear_C1();
24 FPU_add(&st(i), FPU_gettagi(i), 0, control_word); 23 FPU_add(&st(i), FPU_gettagi(i), 0, control_word);
25} 24}
26 25
27
28void fmul__(void) 26void fmul__(void)
29{ 27{
30 /* fmul st,st(i) */ 28 /* fmul st,st(i) */
31 int i = FPU_rm; 29 int i = FPU_rm;
32 clear_C1(); 30 clear_C1();
33 FPU_mul(&st(i), FPU_gettagi(i), 0, control_word); 31 FPU_mul(&st(i), FPU_gettagi(i), 0, control_word);
34} 32}
35 33
36
37
38void fsub__(void) 34void fsub__(void)
39{ 35{
40 /* fsub st,st(i) */ 36 /* fsub st,st(i) */
41 clear_C1(); 37 clear_C1();
42 FPU_sub(0, FPU_rm, control_word); 38 FPU_sub(0, FPU_rm, control_word);
43} 39}
44 40
45
46void fsubr_(void) 41void fsubr_(void)
47{ 42{
48 /* fsubr st,st(i) */ 43 /* fsubr st,st(i) */
49 clear_C1(); 44 clear_C1();
50 FPU_sub(REV, FPU_rm, control_word); 45 FPU_sub(REV, FPU_rm, control_word);
51} 46}
52 47
53
54void fdiv__(void) 48void fdiv__(void)
55{ 49{
56 /* fdiv st,st(i) */ 50 /* fdiv st,st(i) */
57 clear_C1(); 51 clear_C1();
58 FPU_div(0, FPU_rm, control_word); 52 FPU_div(0, FPU_rm, control_word);
59} 53}
60 54
61
62void fdivr_(void) 55void fdivr_(void)
63{ 56{
64 /* fdivr st,st(i) */ 57 /* fdivr st,st(i) */
65 clear_C1(); 58 clear_C1();
66 FPU_div(REV, FPU_rm, control_word); 59 FPU_div(REV, FPU_rm, control_word);
67} 60}
68 61
69
70
71void fadd_i(void) 62void fadd_i(void)
72{ 63{
73 /* fadd st(i),st */ 64 /* fadd st(i),st */
74 int i = FPU_rm; 65 int i = FPU_rm;
75 clear_C1(); 66 clear_C1();
76 FPU_add(&st(i), FPU_gettagi(i), i, control_word); 67 FPU_add(&st(i), FPU_gettagi(i), i, control_word);
77} 68}
78 69
79
80void fmul_i(void) 70void fmul_i(void)
81{ 71{
82 /* fmul st(i),st */ 72 /* fmul st(i),st */
83 clear_C1(); 73 clear_C1();
84 FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word); 74 FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word);
85} 75}
86 76
87
88void fsubri(void) 77void fsubri(void)
89{ 78{
90 /* fsubr st(i),st */ 79 /* fsubr st(i),st */
91 clear_C1(); 80 clear_C1();
92 FPU_sub(DEST_RM, FPU_rm, control_word); 81 FPU_sub(DEST_RM, FPU_rm, control_word);
93} 82}
94 83
95
96void fsub_i(void) 84void fsub_i(void)
97{ 85{
98 /* fsub st(i),st */ 86 /* fsub st(i),st */
99 clear_C1(); 87 clear_C1();
100 FPU_sub(REV|DEST_RM, FPU_rm, control_word); 88 FPU_sub(REV | DEST_RM, FPU_rm, control_word);
101} 89}
102 90
103
104void fdivri(void) 91void fdivri(void)
105{ 92{
106 /* fdivr st(i),st */ 93 /* fdivr st(i),st */
107 clear_C1(); 94 clear_C1();
108 FPU_div(DEST_RM, FPU_rm, control_word); 95 FPU_div(DEST_RM, FPU_rm, control_word);
109} 96}
110 97
111
112void fdiv_i(void) 98void fdiv_i(void)
113{ 99{
114 /* fdiv st(i),st */ 100 /* fdiv st(i),st */
115 clear_C1(); 101 clear_C1();
116 FPU_div(REV|DEST_RM, FPU_rm, control_word); 102 FPU_div(REV | DEST_RM, FPU_rm, control_word);
117} 103}
118 104
119
120
121void faddp_(void) 105void faddp_(void)
122{ 106{
123 /* faddp st(i),st */ 107 /* faddp st(i),st */
124 int i = FPU_rm; 108 int i = FPU_rm;
125 clear_C1(); 109 clear_C1();
126 if ( FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0 ) 110 if (FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0)
127 FPU_pop(); 111 FPU_pop();
128} 112}
129 113
130
131void fmulp_(void) 114void fmulp_(void)
132{ 115{
133 /* fmulp st(i),st */ 116 /* fmulp st(i),st */
134 clear_C1(); 117 clear_C1();
135 if ( FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0 ) 118 if (FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0)
136 FPU_pop(); 119 FPU_pop();
137} 120}
138 121
139
140
141void fsubrp(void) 122void fsubrp(void)
142{ 123{
143 /* fsubrp st(i),st */ 124 /* fsubrp st(i),st */
144 clear_C1(); 125 clear_C1();
145 if ( FPU_sub(DEST_RM, FPU_rm, control_word) >= 0 ) 126 if (FPU_sub(DEST_RM, FPU_rm, control_word) >= 0)
146 FPU_pop(); 127 FPU_pop();
147} 128}
148 129
149
150void fsubp_(void) 130void fsubp_(void)
151{ 131{
152 /* fsubp st(i),st */ 132 /* fsubp st(i),st */
153 clear_C1(); 133 clear_C1();
154 if ( FPU_sub(REV|DEST_RM, FPU_rm, control_word) >= 0 ) 134 if (FPU_sub(REV | DEST_RM, FPU_rm, control_word) >= 0)
155 FPU_pop(); 135 FPU_pop();
156} 136}
157 137
158
159void fdivrp(void) 138void fdivrp(void)
160{ 139{
161 /* fdivrp st(i),st */ 140 /* fdivrp st(i),st */
162 clear_C1(); 141 clear_C1();
163 if ( FPU_div(DEST_RM, FPU_rm, control_word) >= 0 ) 142 if (FPU_div(DEST_RM, FPU_rm, control_word) >= 0)
164 FPU_pop(); 143 FPU_pop();
165} 144}
166 145
167
168void fdivp_(void) 146void fdivp_(void)
169{ 147{
170 /* fdivp st(i),st */ 148 /* fdivp st(i),st */
171 clear_C1(); 149 clear_C1();
172 if ( FPU_div(REV|DEST_RM, FPU_rm, control_word) >= 0 ) 150 if (FPU_div(REV | DEST_RM, FPU_rm, control_word) >= 0)
173 FPU_pop(); 151 FPU_pop();
174} 152}
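
The register forms above differ only in the flag word handed to FPU_sub()/FPU_div(). A minimal sketch of that convention, not part of the patch: DEST_RM and LOADED appear in fpu_emu.h further down, while the numeric value of REV and the double-valued stand-in stack below are assumptions made only for illustration.

#define SKETCH_REV	0x10	/* assumed value; reverses the operand order (fsubr/fdivr) */
#define SKETCH_DEST_RM	0x20	/* mirrors DEST_RM: result goes to st(rm), not st(0) */

static double sketch_st[8];	/* hypothetical stand-in for st(0)..st(7) */

static void sketch_sub(int flags, int rm)
{
	double a = sketch_st[0], b = sketch_st[rm];
	double r = (flags & SKETCH_REV) ? b - a : a - b;

	if (flags & SKETCH_DEST_RM)
		sketch_st[rm] = r;	/* fsub/fsubr st(i),st and the popping forms */
	else
		sketch_st[0] = r;	/* fsub/fsubr st,st(i) */
}
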
diff --git a/arch/x86/math-emu/fpu_asm.h b/arch/x86/math-emu/fpu_asm.h
index 9ba12416df1..955b932735a 100644
--- a/arch/x86/math-emu/fpu_asm.h
+++ b/arch/x86/math-emu/fpu_asm.h
@@ -14,7 +14,6 @@
14 14
15#define EXCEPTION FPU_exception 15#define EXCEPTION FPU_exception
16 16
17
18#define PARAM1 8(%ebp) 17#define PARAM1 8(%ebp)
19#define PARAM2 12(%ebp) 18#define PARAM2 12(%ebp)
20#define PARAM3 16(%ebp) 19#define PARAM3 16(%ebp)
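
For reference, a short sketch (not in the patch) of why the PARAM offsets are what they are; it assumes the usual 32-bit cdecl prologue used by the assembler helpers.

/*
 * With "pushl %ebp; movl %esp,%ebp" at function entry the frame looks like:
 *   0(%ebp)   saved caller %ebp
 *   4(%ebp)   return address
 *   8(%ebp)   first C argument   -> PARAM1
 *  12(%ebp)   second C argument  -> PARAM2
 *  16(%ebp)   third C argument   -> PARAM3
 * so a call such as FPU_normalize(x) from C lands x at PARAM1.
 */
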
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 20886cfb9f7..491e737ce54 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -16,34 +16,34 @@
16#include "status_w.h" 16#include "status_w.h"
17#include "control_w.h" 17#include "control_w.h"
18 18
19
20static void fnop(void) 19static void fnop(void)
21{ 20{
22} 21}
23 22
24static void fclex(void) 23static void fclex(void)
25{ 24{
26 partial_status &= ~(SW_Backward|SW_Summary|SW_Stack_Fault|SW_Precision| 25 partial_status &=
27 SW_Underflow|SW_Overflow|SW_Zero_Div|SW_Denorm_Op| 26 ~(SW_Backward | SW_Summary | SW_Stack_Fault | SW_Precision |
28 SW_Invalid); 27 SW_Underflow | SW_Overflow | SW_Zero_Div | SW_Denorm_Op |
29 no_ip_update = 1; 28 SW_Invalid);
29 no_ip_update = 1;
30} 30}
31 31
32/* Needs to be externally visible */ 32/* Needs to be externally visible */
33void finit(void) 33void finit(void)
34{ 34{
35 control_word = 0x037f; 35 control_word = 0x037f;
36 partial_status = 0; 36 partial_status = 0;
37 top = 0; /* We don't keep top in the status word internally. */ 37 top = 0; /* We don't keep top in the status word internally. */
38 fpu_tag_word = 0xffff; 38 fpu_tag_word = 0xffff;
39 /* The behaviour is different from that detailed in 39 /* The behaviour is different from that detailed in
40 Section 15.1.6 of the Intel manual */ 40 Section 15.1.6 of the Intel manual */
41 operand_address.offset = 0; 41 operand_address.offset = 0;
42 operand_address.selector = 0; 42 operand_address.selector = 0;
43 instruction_address.offset = 0; 43 instruction_address.offset = 0;
44 instruction_address.selector = 0; 44 instruction_address.selector = 0;
45 instruction_address.opcode = 0; 45 instruction_address.opcode = 0;
46 no_ip_update = 1; 46 no_ip_update = 1;
47} 47}
48 48
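
A short decode of the finit() defaults above, added here for reference only; it follows the standard 80x87 control/tag word layout.

/*
 * control_word = 0x037f: IM/DM/ZM/OM/UM/PM set (all exceptions masked),
 *                        PC = 11b (64-bit precision), RC = 00b (round to
 *                        nearest even).
 * fpu_tag_word = 0xffff: tag 11b (empty) for all eight registers.
 */
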
49/* 49/*
@@ -54,151 +54,134 @@ void finit(void)
54#define fsetpm fnop 54#define fsetpm fnop
55 55
56static FUNC const finit_table[] = { 56static FUNC const finit_table[] = {
57 feni, fdisi, fclex, finit, 57 feni, fdisi, fclex, finit,
58 fsetpm, FPU_illegal, FPU_illegal, FPU_illegal 58 fsetpm, FPU_illegal, FPU_illegal, FPU_illegal
59}; 59};
60 60
61void finit_(void) 61void finit_(void)
62{ 62{
63 (finit_table[FPU_rm])(); 63 (finit_table[FPU_rm]) ();
64} 64}
65 65
66
67static void fstsw_ax(void) 66static void fstsw_ax(void)
68{ 67{
69 *(short *) &FPU_EAX = status_word(); 68 *(short *)&FPU_EAX = status_word();
70 no_ip_update = 1; 69 no_ip_update = 1;
71} 70}
72 71
73static FUNC const fstsw_table[] = { 72static FUNC const fstsw_table[] = {
74 fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal, 73 fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal,
75 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal 74 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
76}; 75};
77 76
78void fstsw_(void) 77void fstsw_(void)
79{ 78{
80 (fstsw_table[FPU_rm])(); 79 (fstsw_table[FPU_rm]) ();
81} 80}
82 81
83
84static FUNC const fp_nop_table[] = { 82static FUNC const fp_nop_table[] = {
85 fnop, FPU_illegal, FPU_illegal, FPU_illegal, 83 fnop, FPU_illegal, FPU_illegal, FPU_illegal,
86 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal 84 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
87}; 85};
88 86
89void fp_nop(void) 87void fp_nop(void)
90{ 88{
91 (fp_nop_table[FPU_rm])(); 89 (fp_nop_table[FPU_rm]) ();
92} 90}
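
finit_(), fstsw_() and fp_nop() above all use the same pattern; a minimal sketch of it, with illustrative names that are not part of the patch:

typedef void (*sketch_func)(void);

static void sketch_dispatch(const sketch_func table[8], unsigned int rm)
{
	/* rm is the low three bits of the mod/rm byte, so eight slots are
	   enough and unimplemented encodings simply point at FPU_illegal. */
	table[rm & 7]();
}
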
93 91
94
95void fld_i_(void) 92void fld_i_(void)
96{ 93{
97 FPU_REG *st_new_ptr; 94 FPU_REG *st_new_ptr;
98 int i; 95 int i;
99 u_char tag; 96 u_char tag;
100 97
101 if ( STACK_OVERFLOW ) 98 if (STACK_OVERFLOW) {
102 { FPU_stack_overflow(); return; } 99 FPU_stack_overflow();
103 100 return;
104 /* fld st(i) */
105 i = FPU_rm;
106 if ( NOT_EMPTY(i) )
107 {
108 reg_copy(&st(i), st_new_ptr);
109 tag = FPU_gettagi(i);
110 push();
111 FPU_settag0(tag);
112 }
113 else
114 {
115 if ( control_word & CW_Invalid )
116 {
117 /* The masked response */
118 FPU_stack_underflow();
119 } 101 }
120 else
121 EXCEPTION(EX_StackUnder);
122 }
123 102
124} 103 /* fld st(i) */
104 i = FPU_rm;
105 if (NOT_EMPTY(i)) {
106 reg_copy(&st(i), st_new_ptr);
107 tag = FPU_gettagi(i);
108 push();
109 FPU_settag0(tag);
110 } else {
111 if (control_word & CW_Invalid) {
112 /* The masked response */
113 FPU_stack_underflow();
114 } else
115 EXCEPTION(EX_StackUnder);
116 }
125 117
118}
126 119
127void fxch_i(void) 120void fxch_i(void)
128{ 121{
129 /* fxch st(i) */ 122 /* fxch st(i) */
130 FPU_REG t; 123 FPU_REG t;
131 int i = FPU_rm; 124 int i = FPU_rm;
132 FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i); 125 FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i);
133 long tag_word = fpu_tag_word; 126 long tag_word = fpu_tag_word;
134 int regnr = top & 7, regnri = ((regnr + i) & 7); 127 int regnr = top & 7, regnri = ((regnr + i) & 7);
135 u_char st0_tag = (tag_word >> (regnr*2)) & 3; 128 u_char st0_tag = (tag_word >> (regnr * 2)) & 3;
136 u_char sti_tag = (tag_word >> (regnri*2)) & 3; 129 u_char sti_tag = (tag_word >> (regnri * 2)) & 3;
137 130
138 if ( st0_tag == TAG_Empty ) 131 if (st0_tag == TAG_Empty) {
139 { 132 if (sti_tag == TAG_Empty) {
140 if ( sti_tag == TAG_Empty ) 133 FPU_stack_underflow();
141 { 134 FPU_stack_underflow_i(i);
142 FPU_stack_underflow(); 135 return;
143 FPU_stack_underflow_i(i); 136 }
144 return; 137 if (control_word & CW_Invalid) {
138 /* Masked response */
139 FPU_copy_to_reg0(sti_ptr, sti_tag);
140 }
141 FPU_stack_underflow_i(i);
142 return;
145 } 143 }
146 if ( control_word & CW_Invalid ) 144 if (sti_tag == TAG_Empty) {
147 { 145 if (control_word & CW_Invalid) {
148 /* Masked response */ 146 /* Masked response */
149 FPU_copy_to_reg0(sti_ptr, sti_tag); 147 FPU_copy_to_regi(st0_ptr, st0_tag, i);
148 }
149 FPU_stack_underflow();
150 return;
150 } 151 }
151 FPU_stack_underflow_i(i); 152 clear_C1();
152 return;
153 }
154 if ( sti_tag == TAG_Empty )
155 {
156 if ( control_word & CW_Invalid )
157 {
158 /* Masked response */
159 FPU_copy_to_regi(st0_ptr, st0_tag, i);
160 }
161 FPU_stack_underflow();
162 return;
163 }
164 clear_C1();
165
166 reg_copy(st0_ptr, &t);
167 reg_copy(sti_ptr, st0_ptr);
168 reg_copy(&t, sti_ptr);
169
170 tag_word &= ~(3 << (regnr*2)) & ~(3 << (regnri*2));
171 tag_word |= (sti_tag << (regnr*2)) | (st0_tag << (regnri*2));
172 fpu_tag_word = tag_word;
173}
174 153
154 reg_copy(st0_ptr, &t);
155 reg_copy(sti_ptr, st0_ptr);
156 reg_copy(&t, sti_ptr);
157
158 tag_word &= ~(3 << (regnr * 2)) & ~(3 << (regnri * 2));
159 tag_word |= (sti_tag << (regnr * 2)) | (st0_tag << (regnri * 2));
160 fpu_tag_word = tag_word;
161}
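
The tag-word arithmetic that fxch_i() relies on, restated as a standalone sketch (not part of the patch): each physical register owns a 2-bit tag field, so exchanging st(0) and st(i) means exchanging two such fields.

static unsigned long sketch_swap_tags(unsigned long tw, int regnr, int regnri)
{
	unsigned long t0 = (tw >> (regnr * 2)) & 3;
	unsigned long ti = (tw >> (regnri * 2)) & 3;

	tw &= ~(3UL << (regnr * 2)) & ~(3UL << (regnri * 2));
	return tw | (ti << (regnr * 2)) | (t0 << (regnri * 2));
}
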
175 162
176void ffree_(void) 163void ffree_(void)
177{ 164{
178 /* ffree st(i) */ 165 /* ffree st(i) */
179 FPU_settagi(FPU_rm, TAG_Empty); 166 FPU_settagi(FPU_rm, TAG_Empty);
180} 167}
181 168
182
183void ffreep(void) 169void ffreep(void)
184{ 170{
185 /* ffree st(i) + pop - unofficial code */ 171 /* ffree st(i) + pop - unofficial code */
186 FPU_settagi(FPU_rm, TAG_Empty); 172 FPU_settagi(FPU_rm, TAG_Empty);
187 FPU_pop(); 173 FPU_pop();
188} 174}
189 175
190
191void fst_i_(void) 176void fst_i_(void)
192{ 177{
193 /* fst st(i) */ 178 /* fst st(i) */
194 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); 179 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
195} 180}
196 181
197
198void fstp_i(void) 182void fstp_i(void)
199{ 183{
200 /* fstp st(i) */ 184 /* fstp st(i) */
201 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); 185 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
202 FPU_pop(); 186 FPU_pop();
203} 187}
204
diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h
index 65120f52385..4dae511c85a 100644
--- a/arch/x86/math-emu/fpu_emu.h
+++ b/arch/x86/math-emu/fpu_emu.h
@@ -7,7 +7,6 @@
7 | | 7 | |
8 +---------------------------------------------------------------------------*/ 8 +---------------------------------------------------------------------------*/
9 9
10
11#ifndef _FPU_EMU_H_ 10#ifndef _FPU_EMU_H_
12#define _FPU_EMU_H_ 11#define _FPU_EMU_H_
13 12
@@ -28,15 +27,15 @@
28#endif 27#endif
29 28
30#define EXP_BIAS Const(0) 29#define EXP_BIAS Const(0)
31#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */ 30#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */
32#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */ 31#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */
33#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but 32#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but
34 still a 16 bit nr. */ 33 still a 16 bit nr. */
35#define EXP_Infinity EXP_OVER 34#define EXP_Infinity EXP_OVER
36#define EXP_NaN EXP_OVER 35#define EXP_NaN EXP_OVER
37 36
38#define EXTENDED_Ebias Const(0x3fff) 37#define EXTENDED_Ebias Const(0x3fff)
39#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */ 38#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */
40 39
41#define SIGN_POS Const(0) 40#define SIGN_POS Const(0)
42#define SIGN_NEG Const(0x80) 41#define SIGN_NEG Const(0x80)
@@ -44,10 +43,9 @@
44#define SIGN_Positive Const(0) 43#define SIGN_Positive Const(0)
45#define SIGN_Negative Const(0x8000) 44#define SIGN_Negative Const(0x8000)
46 45
47
48/* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */ 46/* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */
49/* The following fold to 2 (Special) in the Tag Word */ 47/* The following fold to 2 (Special) in the Tag Word */
50#define TW_Denormal Const(4) /* De-normal */ 48#define TW_Denormal Const(4) /* De-normal */
51#define TW_Infinity Const(5) /* + or - infinity */ 49#define TW_Infinity Const(5) /* + or - infinity */
52#define TW_NaN Const(6) /* Not a Number */ 50#define TW_NaN Const(6) /* Not a Number */
53#define TW_Unsupported Const(7) /* Not supported by an 80486 */ 51#define TW_Unsupported Const(7) /* Not supported by an 80486 */
@@ -67,14 +65,13 @@
67#define DEST_RM 0x20 65#define DEST_RM 0x20
68#define LOADED 0x40 66#define LOADED 0x40
69 67
70#define FPU_Exception Const(0x80000000) /* Added to tag returns. */ 68#define FPU_Exception Const(0x80000000) /* Added to tag returns. */
71
72 69
73#ifndef __ASSEMBLY__ 70#ifndef __ASSEMBLY__
74 71
75#include "fpu_system.h" 72#include "fpu_system.h"
76 73
77#include <asm/sigcontext.h> /* for struct _fpstate */ 74#include <asm/sigcontext.h> /* for struct _fpstate */
78#include <asm/math_emu.h> 75#include <asm/math_emu.h>
79#include <linux/linkage.h> 76#include <linux/linkage.h>
80 77
@@ -112,30 +109,33 @@ extern u_char emulating;
112#define PREFIX_DEFAULT 7 109#define PREFIX_DEFAULT 7
113 110
114struct address { 111struct address {
115 unsigned int offset; 112 unsigned int offset;
116 unsigned int selector:16; 113 unsigned int selector:16;
117 unsigned int opcode:11; 114 unsigned int opcode:11;
118 unsigned int empty:5; 115 unsigned int empty:5;
119}; 116};
120struct fpu__reg { 117struct fpu__reg {
121 unsigned sigl; 118 unsigned sigl;
122 unsigned sigh; 119 unsigned sigh;
123 short exp; 120 short exp;
124}; 121};
125 122
126typedef void (*FUNC)(void); 123typedef void (*FUNC) (void);
127typedef struct fpu__reg FPU_REG; 124typedef struct fpu__reg FPU_REG;
128typedef void (*FUNC_ST0)(FPU_REG *st0_ptr, u_char st0_tag); 125typedef void (*FUNC_ST0) (FPU_REG *st0_ptr, u_char st0_tag);
129typedef struct { u_char address_size, operand_size, segment; } 126typedef struct {
130 overrides; 127 u_char address_size, operand_size, segment;
128} overrides;
131/* This structure is 32 bits: */ 129/* This structure is 32 bits: */
132typedef struct { overrides override; 130typedef struct {
133 u_char default_mode; } fpu_addr_modes; 131 overrides override;
132 u_char default_mode;
133} fpu_addr_modes;
134/* PROTECTED has a restricted meaning in the emulator; it is used 134/* PROTECTED has a restricted meaning in the emulator; it is used
135 to signal that the emulator needs to do special things to ensure 135 to signal that the emulator needs to do special things to ensure
136 that protection is respected in a segmented model. */ 136 that protection is respected in a segmented model. */
137#define PROTECTED 4 137#define PROTECTED 4
138#define SIXTEEN 1 /* We rely upon this being 1 (true) */ 138#define SIXTEEN 1 /* We rely upon this being 1 (true) */
139#define VM86 SIXTEEN 139#define VM86 SIXTEEN
140#define PM16 (SIXTEEN | PROTECTED) 140#define PM16 (SIXTEEN | PROTECTED)
141#define SEG32 PROTECTED 141#define SEG32 PROTECTED
@@ -168,8 +168,8 @@ extern u_char const data_sizes_16[32];
168 168
169static inline void reg_copy(FPU_REG const *x, FPU_REG *y) 169static inline void reg_copy(FPU_REG const *x, FPU_REG *y)
170{ 170{
171 *(short *)&(y->exp) = *(const short *)&(x->exp); 171 *(short *)&(y->exp) = *(const short *)&(x->exp);
172 *(long long *)&(y->sigl) = *(const long long *)&(x->sigl); 172 *(long long *)&(y->sigl) = *(const long long *)&(x->sigl);
173} 173}
174 174
175#define exponent(x) (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias) 175#define exponent(x) (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias)
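
A note on reg_copy() above, for reference only:

/*
 * struct fpu__reg keeps the 64-bit significand in sigl/sigh and the
 * sign+exponent in exp, so the two assignments in reg_copy() move all ten
 * live bytes of an extended-precision value without a memcpy(); the casts
 * rely on sigl and sigh being adjacent in the structure.
 */
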
@@ -184,27 +184,26 @@ static inline void reg_copy(FPU_REG const *x, FPU_REG *y)
184 184
185#define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] ) 185#define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] )
186 186
187
188/*----- Prototypes for functions written in assembler -----*/ 187/*----- Prototypes for functions written in assembler -----*/
189/* extern void reg_move(FPU_REG *a, FPU_REG *b); */ 188/* extern void reg_move(FPU_REG *a, FPU_REG *b); */
190 189
191asmlinkage int FPU_normalize(FPU_REG *x); 190asmlinkage int FPU_normalize(FPU_REG *x);
192asmlinkage int FPU_normalize_nuo(FPU_REG *x); 191asmlinkage int FPU_normalize_nuo(FPU_REG *x);
193asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2, 192asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2,
194 FPU_REG *answ, unsigned int control_w, u_char sign, 193 FPU_REG * answ, unsigned int control_w, u_char sign,
195 int expa, int expb); 194 int expa, int expb);
196asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2, 195asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2,
197 FPU_REG *answ, unsigned int control_w, u_char sign, 196 FPU_REG * answ, unsigned int control_w, u_char sign,
198 int expon); 197 int expon);
199asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2, 198asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2,
200 FPU_REG *answ, unsigned int control_w, u_char sign); 199 FPU_REG * answ, unsigned int control_w, u_char sign);
201asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2, 200asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2,
202 FPU_REG *answ, unsigned int control_w, u_char sign, 201 FPU_REG * answ, unsigned int control_w, u_char sign,
203 int expa, int expb); 202 int expa, int expb);
204asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2, 203asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2,
205 unsigned int control_w, u_char sign); 204 unsigned int control_w, u_char sign);
206asmlinkage unsigned FPU_shrx(void *l, unsigned x); 205asmlinkage unsigned FPU_shrx(void *l, unsigned x);
207asmlinkage unsigned FPU_shrxs(void *v, unsigned x); 206asmlinkage unsigned FPU_shrxs(void *v, unsigned x);
208asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y); 207asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y);
209asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy, 208asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy,
210 unsigned int control_w, u_char sign); 209 unsigned int control_w, u_char sign);
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 1853524c8b5..760baeea5f0 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -25,10 +25,11 @@
25 +---------------------------------------------------------------------------*/ 25 +---------------------------------------------------------------------------*/
26 26
27#include <linux/signal.h> 27#include <linux/signal.h>
28#include <linux/ptrace.h> 28#include <linux/regset.h>
29 29
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32#include <asm/user.h>
32 33
33#include "fpu_system.h" 34#include "fpu_system.h"
34#include "fpu_emu.h" 35#include "fpu_emu.h"
@@ -36,726 +37,727 @@
36#include "control_w.h" 37#include "control_w.h"
37#include "status_w.h" 38#include "status_w.h"
38 39
39#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ 40#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */
40 41
41#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */ 42#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */
42 43
43/* WARNING: These codes are not documented by Intel in their 80486 manual 44/* WARNING: These codes are not documented by Intel in their 80486 manual
44 and may not work on FPU clones or later Intel FPUs. */ 45 and may not work on FPU clones or later Intel FPUs. */
45 46
46/* Changes to support the un-doc codes provided by Linus Torvalds. */ 47/* Changes to support the un-doc codes provided by Linus Torvalds. */
47 48
48#define _d9_d8_ fstp_i /* unofficial code (19) */ 49#define _d9_d8_ fstp_i /* unofficial code (19) */
49#define _dc_d0_ fcom_st /* unofficial code (14) */ 50#define _dc_d0_ fcom_st /* unofficial code (14) */
50#define _dc_d8_ fcompst /* unofficial code (1c) */ 51#define _dc_d8_ fcompst /* unofficial code (1c) */
51#define _dd_c8_ fxch_i /* unofficial code (0d) */ 52#define _dd_c8_ fxch_i /* unofficial code (0d) */
52#define _de_d0_ fcompst /* unofficial code (16) */ 53#define _de_d0_ fcompst /* unofficial code (16) */
53#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */ 54#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */
54#define _df_c8_ fxch_i /* unofficial code (0f) */ 55#define _df_c8_ fxch_i /* unofficial code (0f) */
55#define _df_d0_ fstp_i /* unofficial code (17) */ 56#define _df_d0_ fstp_i /* unofficial code (17) */
56#define _df_d8_ fstp_i /* unofficial code (1f) */ 57#define _df_d8_ fstp_i /* unofficial code (1f) */
57 58
58static FUNC const st_instr_table[64] = { 59static FUNC const st_instr_table[64] = {
59 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_, 60 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_,
60 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_, 61 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_,
61 fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_, 62 fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_,
62 fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_, 63 fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_,
63 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, 64 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
64 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, 65 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
65 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, 66 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
66 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, 67 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
67}; 68};
68 69
69#else /* Support only documented FPU op-codes */ 70#else /* Support only documented FPU op-codes */
70 71
71static FUNC const st_instr_table[64] = { 72static FUNC const st_instr_table[64] = {
72 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__, 73 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__,
73 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__, 74 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__,
74 fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__, 75 fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__,
75 fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__, 76 fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__,
76 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, 77 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
77 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, 78 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
78 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, 79 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
79 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, 80 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
80}; 81};
81 82
82#endif /* NO_UNDOC_CODE */ 83#endif /* NO_UNDOC_CODE */
83 84
84 85#define _NONE_ 0 /* Take no special action */
85#define _NONE_ 0 /* Take no special action */ 86#define _REG0_ 1 /* Need to check for not empty st(0) */
86#define _REG0_ 1 /* Need to check for not empty st(0) */ 87#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */
87#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */ 88#define _REGi_ 0 /* Uses st(rm) */
88#define _REGi_ 0 /* Uses st(rm) */ 89#define _PUSH_ 3 /* Need to check for space to push onto stack */
89#define _PUSH_ 3 /* Need to check for space to push onto stack */ 90#define _null_ 4 /* Function illegal or not implemented */
90#define _null_ 4 /* Function illegal or not implemented */ 91#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */
91#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */ 92#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */
92#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */ 93#define _REGIc 0 /* Compare st(0) and st(rm) */
93#define _REGIc 0 /* Compare st(0) and st(rm) */ 94#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */
94#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */
95 95
96#ifndef NO_UNDOC_CODE 96#ifndef NO_UNDOC_CODE
97 97
98/* Un-documented FPU op-codes supported by default. (see above) */ 98/* Un-documented FPU op-codes supported by default. (see above) */
99 99
100static u_char const type_table[64] = { 100static u_char const type_table[64] = {
101 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, 101 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_,
102 _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, 102 _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_,
103 _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, 103 _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
104 _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, 104 _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
105 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, 105 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
106 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, 106 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
107 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, 107 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
108 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ 108 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
109}; 109};
110 110
111#else /* Support only documented FPU op-codes */ 111#else /* Support only documented FPU op-codes */
112 112
113static u_char const type_table[64] = { 113static u_char const type_table[64] = {
114 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_, 114 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_,
115 _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_, 115 _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
116 _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_, 116 _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_,
117 _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_, 117 _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_,
118 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, 118 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
119 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, 119 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
120 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, 120 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
121 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ 121 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
122}; 122};
123 123
124#endif /* NO_UNDOC_CODE */ 124#endif /* NO_UNDOC_CODE */
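
How the two 64-entry tables above are indexed is only visible much further down in math_emulate(); a small sketch of that computation (the helper name is illustrative):

static unsigned int sketch_instr_index(unsigned char byte1, unsigned char modrm)
{
	/* byte1 is the 0xd8..0xdf escape opcode, modrm the following byte:
	   bits 3..5 of mod/rm pick the row, the low opcode bits the column. */
	return (modrm & 0x38) | (byte1 & 7);	/* 0..63 */
}
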
125 125
126
127#ifdef RE_ENTRANT_CHECKING 126#ifdef RE_ENTRANT_CHECKING
128u_char emulating=0; 127u_char emulating = 0;
129#endif /* RE_ENTRANT_CHECKING */ 128#endif /* RE_ENTRANT_CHECKING */
130 129
131static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, 130static int valid_prefix(u_char *Byte, u_char __user ** fpu_eip,
132 overrides *override); 131 overrides * override);
133 132
134asmlinkage void math_emulate(long arg) 133asmlinkage void math_emulate(long arg)
135{ 134{
136 u_char FPU_modrm, byte1; 135 u_char FPU_modrm, byte1;
137 unsigned short code; 136 unsigned short code;
138 fpu_addr_modes addr_modes; 137 fpu_addr_modes addr_modes;
139 int unmasked; 138 int unmasked;
140 FPU_REG loaded_data; 139 FPU_REG loaded_data;
141 FPU_REG *st0_ptr; 140 FPU_REG *st0_ptr;
142 u_char loaded_tag, st0_tag; 141 u_char loaded_tag, st0_tag;
143 void __user *data_address; 142 void __user *data_address;
144 struct address data_sel_off; 143 struct address data_sel_off;
145 struct address entry_sel_off; 144 struct address entry_sel_off;
146 unsigned long code_base = 0; 145 unsigned long code_base = 0;
147 unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ 146 unsigned long code_limit = 0; /* Initialized to stop compiler warnings */
148 struct desc_struct code_descriptor; 147 struct desc_struct code_descriptor;
149 148
150#ifdef RE_ENTRANT_CHECKING 149#ifdef RE_ENTRANT_CHECKING
151 if ( emulating ) 150 if (emulating) {
152 { 151 printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n");
153 printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n"); 152 }
154 } 153 RE_ENTRANT_CHECK_ON;
155 RE_ENTRANT_CHECK_ON;
156#endif /* RE_ENTRANT_CHECKING */ 154#endif /* RE_ENTRANT_CHECKING */
157 155
158 if (!used_math()) 156 if (!used_math()) {
159 { 157 finit();
160 finit(); 158 set_used_math();
161 set_used_math();
162 }
163
164 SETUP_DATA_AREA(arg);
165
166 FPU_ORIG_EIP = FPU_EIP;
167
168 if ( (FPU_EFLAGS & 0x00020000) != 0 )
169 {
170 /* Virtual 8086 mode */
171 addr_modes.default_mode = VM86;
172 FPU_EIP += code_base = FPU_CS << 4;
173 code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */
174 }
175 else if ( FPU_CS == __USER_CS && FPU_DS == __USER_DS )
176 {
177 addr_modes.default_mode = 0;
178 }
179 else if ( FPU_CS == __KERNEL_CS )
180 {
181 printk("math_emulate: %04x:%08lx\n",FPU_CS,FPU_EIP);
182 panic("Math emulation needed in kernel");
183 }
184 else
185 {
186
187 if ( (FPU_CS & 4) != 4 ) /* Must be in the LDT */
188 {
189 /* Can only handle segmented addressing via the LDT
190 for now, and it must be 16 bit */
191 printk("FPU emulator: Unsupported addressing mode\n");
192 math_abort(FPU_info, SIGILL);
193 } 159 }
194 160
195 code_descriptor = LDT_DESCRIPTOR(FPU_CS); 161 SETUP_DATA_AREA(arg);
196 if ( SEG_D_SIZE(code_descriptor) ) 162
197 { 163 FPU_ORIG_EIP = FPU_EIP;
198 /* The above test may be wrong, the book is not clear */ 164
199 /* Segmented 32 bit protected mode */ 165 if ((FPU_EFLAGS & 0x00020000) != 0) {
200 addr_modes.default_mode = SEG32; 166 /* Virtual 8086 mode */
167 addr_modes.default_mode = VM86;
168 FPU_EIP += code_base = FPU_CS << 4;
169 code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */
170 } else if (FPU_CS == __USER_CS && FPU_DS == __USER_DS) {
171 addr_modes.default_mode = 0;
172 } else if (FPU_CS == __KERNEL_CS) {
173 printk("math_emulate: %04x:%08lx\n", FPU_CS, FPU_EIP);
174 panic("Math emulation needed in kernel");
175 } else {
176
177 if ((FPU_CS & 4) != 4) { /* Must be in the LDT */
178 /* Can only handle segmented addressing via the LDT
179 for now, and it must be 16 bit */
180 printk("FPU emulator: Unsupported addressing mode\n");
181 math_abort(FPU_info, SIGILL);
182 }
183
184 code_descriptor = LDT_DESCRIPTOR(FPU_CS);
185 if (SEG_D_SIZE(code_descriptor)) {
186 /* The above test may be wrong, the book is not clear */
187 /* Segmented 32 bit protected mode */
188 addr_modes.default_mode = SEG32;
189 } else {
190 /* 16 bit protected mode */
191 addr_modes.default_mode = PM16;
192 }
193 FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor);
194 code_limit = code_base
195 + (SEG_LIMIT(code_descriptor) +
196 1) * SEG_GRANULARITY(code_descriptor)
197 - 1;
198 if (code_limit < code_base)
199 code_limit = 0xffffffff;
201 } 200 }
202 else 201
203 { 202 FPU_lookahead = !(FPU_EFLAGS & X86_EFLAGS_TF);
204 /* 16 bit protected mode */ 203
205 addr_modes.default_mode = PM16; 204 if (!valid_prefix(&byte1, (u_char __user **) & FPU_EIP,
205 &addr_modes.override)) {
206 RE_ENTRANT_CHECK_OFF;
207 printk
208 ("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n"
209 "FPU emulator: self-modifying code! (emulation impossible)\n",
210 byte1);
211 RE_ENTRANT_CHECK_ON;
212 EXCEPTION(EX_INTERNAL | 0x126);
213 math_abort(FPU_info, SIGILL);
206 } 214 }
207 FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); 215
208 code_limit = code_base 216 do_another_FPU_instruction:
209 + (SEG_LIMIT(code_descriptor)+1) * SEG_GRANULARITY(code_descriptor) 217
210 - 1; 218 no_ip_update = 0;
211 if ( code_limit < code_base ) code_limit = 0xffffffff; 219
212 } 220 FPU_EIP++; /* We have fetched the prefix and first code bytes. */
213 221
214 FPU_lookahead = 1; 222 if (addr_modes.default_mode) {
215 if (current->ptrace & PT_PTRACED) 223 /* This checks for the minimum instruction bytes.
216 FPU_lookahead = 0; 224 We also need to check any extra (address mode) code access. */
217 225 if (FPU_EIP > code_limit)
218 if ( !valid_prefix(&byte1, (u_char __user **)&FPU_EIP, 226 math_abort(FPU_info, SIGSEGV);
219 &addr_modes.override) )
220 {
221 RE_ENTRANT_CHECK_OFF;
222 printk("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n"
223 "FPU emulator: self-modifying code! (emulation impossible)\n",
224 byte1);
225 RE_ENTRANT_CHECK_ON;
226 EXCEPTION(EX_INTERNAL|0x126);
227 math_abort(FPU_info,SIGILL);
228 }
229
230do_another_FPU_instruction:
231
232 no_ip_update = 0;
233
234 FPU_EIP++; /* We have fetched the prefix and first code bytes. */
235
236 if ( addr_modes.default_mode )
237 {
238 /* This checks for the minimum instruction bytes.
239 We also need to check any extra (address mode) code access. */
240 if ( FPU_EIP > code_limit )
241 math_abort(FPU_info,SIGSEGV);
242 }
243
244 if ( (byte1 & 0xf8) != 0xd8 )
245 {
246 if ( byte1 == FWAIT_OPCODE )
247 {
248 if (partial_status & SW_Summary)
249 goto do_the_FPU_interrupt;
250 else
251 goto FPU_fwait_done;
252 } 227 }
228
229 if ((byte1 & 0xf8) != 0xd8) {
230 if (byte1 == FWAIT_OPCODE) {
231 if (partial_status & SW_Summary)
232 goto do_the_FPU_interrupt;
233 else
234 goto FPU_fwait_done;
235 }
253#ifdef PARANOID 236#ifdef PARANOID
254 EXCEPTION(EX_INTERNAL|0x128); 237 EXCEPTION(EX_INTERNAL | 0x128);
255 math_abort(FPU_info,SIGILL); 238 math_abort(FPU_info, SIGILL);
256#endif /* PARANOID */ 239#endif /* PARANOID */
257 }
258
259 RE_ENTRANT_CHECK_OFF;
260 FPU_code_access_ok(1);
261 FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP);
262 RE_ENTRANT_CHECK_ON;
263 FPU_EIP++;
264
265 if (partial_status & SW_Summary)
266 {
267 /* Ignore the error for now if the current instruction is a no-wait
268 control instruction */
269 /* The 80486 manual contradicts itself on this topic,
270 but a real 80486 uses the following instructions:
271 fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex.
272 */
273 code = (FPU_modrm << 8) | byte1;
274 if ( ! ( (((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */
275 (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv,
276 fnstsw */
277 ((code & 0xc000) != 0xc000))) ) )
278 {
279 /*
280 * We need to simulate the action of the kernel to FPU
281 * interrupts here.
282 */
283 do_the_FPU_interrupt:
284
285 FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */
286
287 RE_ENTRANT_CHECK_OFF;
288 current->thread.trap_no = 16;
289 current->thread.error_code = 0;
290 send_sig(SIGFPE, current, 1);
291 return;
292 }
293 }
294
295 entry_sel_off.offset = FPU_ORIG_EIP;
296 entry_sel_off.selector = FPU_CS;
297 entry_sel_off.opcode = (byte1 << 8) | FPU_modrm;
298
299 FPU_rm = FPU_modrm & 7;
300
301 if ( FPU_modrm < 0300 )
302 {
303 /* All of these instructions use the mod/rm byte to get a data address */
304
305 if ( (addr_modes.default_mode & SIXTEEN)
306 ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX) )
307 data_address = FPU_get_address_16(FPU_modrm, &FPU_EIP, &data_sel_off,
308 addr_modes);
309 else
310 data_address = FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off,
311 addr_modes);
312
313 if ( addr_modes.default_mode )
314 {
315 if ( FPU_EIP-1 > code_limit )
316 math_abort(FPU_info,SIGSEGV);
317 } 240 }
318 241
319 if ( !(byte1 & 1) ) 242 RE_ENTRANT_CHECK_OFF;
320 { 243 FPU_code_access_ok(1);
321 unsigned short status1 = partial_status; 244 FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP);
322 245 RE_ENTRANT_CHECK_ON;
323 st0_ptr = &st(0); 246 FPU_EIP++;
324 st0_tag = FPU_gettag0(); 247
325 248 if (partial_status & SW_Summary) {
326 /* Stack underflow has priority */ 249 /* Ignore the error for now if the current instruction is a no-wait
327 if ( NOT_EMPTY_ST0 ) 250 control instruction */
328 { 251 /* The 80486 manual contradicts itself on this topic,
329 if ( addr_modes.default_mode & PROTECTED ) 252 but a real 80486 uses the following instructions:
330 { 253 fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex.
331 /* This table works for 16 and 32 bit protected mode */ 254 */
332 if ( access_limit < data_sizes_16[(byte1 >> 1) & 3] ) 255 code = (FPU_modrm << 8) | byte1;
333 math_abort(FPU_info,SIGSEGV); 256 if (!((((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */
257 (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv,
258 fnstsw */
259 ((code & 0xc000) != 0xc000))))) {
260 /*
261 * We need to simulate the action of the kernel to FPU
262 * interrupts here.
263 */
264 do_the_FPU_interrupt:
265
266 FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */
267
268 RE_ENTRANT_CHECK_OFF;
269 current->thread.trap_no = 16;
270 current->thread.error_code = 0;
271 send_sig(SIGFPE, current, 1);
272 return;
334 } 273 }
274 }
335 275
336 unmasked = 0; /* Do this here to stop compiler warnings. */ 276 entry_sel_off.offset = FPU_ORIG_EIP;
337 switch ( (byte1 >> 1) & 3 ) 277 entry_sel_off.selector = FPU_CS;
338 { 278 entry_sel_off.opcode = (byte1 << 8) | FPU_modrm;
339 case 0:
340 unmasked = FPU_load_single((float __user *)data_address,
341 &loaded_data);
342 loaded_tag = unmasked & 0xff;
343 unmasked &= ~0xff;
344 break;
345 case 1:
346 loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data);
347 break;
348 case 2:
349 unmasked = FPU_load_double((double __user *)data_address,
350 &loaded_data);
351 loaded_tag = unmasked & 0xff;
352 unmasked &= ~0xff;
353 break;
354 case 3:
355 default: /* Used here to suppress gcc warnings. */
356 loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data);
357 break;
358 }
359 279
360 /* No more access to user memory, it is safe 280 FPU_rm = FPU_modrm & 7;
361 to use static data now */
362
363 /* NaN operands have the next priority. */
364 /* We have to delay looking at st(0) until after
365 loading the data, because that data might contain an SNaN */
366 if ( ((st0_tag == TAG_Special) && isNaN(st0_ptr)) ||
367 ((loaded_tag == TAG_Special) && isNaN(&loaded_data)) )
368 {
369 /* Restore the status word; we might have loaded a
370 denormal. */
371 partial_status = status1;
372 if ( (FPU_modrm & 0x30) == 0x10 )
373 {
374 /* fcom or fcomp */
375 EXCEPTION(EX_Invalid);
376 setcc(SW_C3 | SW_C2 | SW_C0);
377 if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) )
378 FPU_pop(); /* fcomp, masked, so we pop. */
379 }
380 else
381 {
382 if ( loaded_tag == TAG_Special )
383 loaded_tag = FPU_Special(&loaded_data);
384#ifdef PECULIAR_486
385 /* This is not really needed, but gives behaviour
386 identical to an 80486 */
387 if ( (FPU_modrm & 0x28) == 0x20 )
388 /* fdiv or fsub */
389 real_2op_NaN(&loaded_data, loaded_tag, 0, &loaded_data);
390 else
391#endif /* PECULIAR_486 */
392 /* fadd, fdivr, fmul, or fsubr */
393 real_2op_NaN(&loaded_data, loaded_tag, 0, st0_ptr);
394 }
395 goto reg_mem_instr_done;
396 }
397 281
398 if ( unmasked && !((FPU_modrm & 0x30) == 0x10) ) 282 if (FPU_modrm < 0300) {
399 { 283 /* All of these instructions use the mod/rm byte to get a data address */
400 /* Is not a comparison instruction. */
401 if ( (FPU_modrm & 0x38) == 0x38 )
402 {
403 /* fdivr */
404 if ( (st0_tag == TAG_Zero) &&
405 ((loaded_tag == TAG_Valid)
406 || (loaded_tag == TAG_Special
407 && isdenormal(&loaded_data))) )
408 {
409 if ( FPU_divide_by_zero(0, getsign(&loaded_data))
410 < 0 )
411 {
412 /* We use the fact here that the unmasked
413 exception in the loaded data was for a
414 denormal operand */
415 /* Restore the state of the denormal op bit */
416 partial_status &= ~SW_Denorm_Op;
417 partial_status |= status1 & SW_Denorm_Op;
418 }
419 else
420 setsign(st0_ptr, getsign(&loaded_data));
421 }
422 }
423 goto reg_mem_instr_done;
424 }
425 284
426 switch ( (FPU_modrm >> 3) & 7 ) 285 if ((addr_modes.default_mode & SIXTEEN)
427 { 286 ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX))
428 case 0: /* fadd */ 287 data_address =
429 clear_C1(); 288 FPU_get_address_16(FPU_modrm, &FPU_EIP,
430 FPU_add(&loaded_data, loaded_tag, 0, control_word); 289 &data_sel_off, addr_modes);
431 break; 290 else
432 case 1: /* fmul */ 291 data_address =
433 clear_C1(); 292 FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off,
434 FPU_mul(&loaded_data, loaded_tag, 0, control_word); 293 addr_modes);
435 break; 294
436 case 2: /* fcom */ 295 if (addr_modes.default_mode) {
437 FPU_compare_st_data(&loaded_data, loaded_tag); 296 if (FPU_EIP - 1 > code_limit)
438 break; 297 math_abort(FPU_info, SIGSEGV);
439 case 3: /* fcomp */
440 if ( !FPU_compare_st_data(&loaded_data, loaded_tag)
441 && !unmasked )
442 FPU_pop();
443 break;
444 case 4: /* fsub */
445 clear_C1();
446 FPU_sub(LOADED|loaded_tag, (int)&loaded_data, control_word);
447 break;
448 case 5: /* fsubr */
449 clear_C1();
450 FPU_sub(REV|LOADED|loaded_tag, (int)&loaded_data, control_word);
451 break;
452 case 6: /* fdiv */
453 clear_C1();
454 FPU_div(LOADED|loaded_tag, (int)&loaded_data, control_word);
455 break;
456 case 7: /* fdivr */
457 clear_C1();
458 if ( st0_tag == TAG_Zero )
459 partial_status = status1; /* Undo any denorm tag,
460 zero-divide has priority. */
461 FPU_div(REV|LOADED|loaded_tag, (int)&loaded_data, control_word);
462 break;
463 } 298 }
464 } 299
465 else 300 if (!(byte1 & 1)) {
466 { 301 unsigned short status1 = partial_status;
467 if ( (FPU_modrm & 0x30) == 0x10 ) 302
468 { 303 st0_ptr = &st(0);
469 /* The instruction is fcom or fcomp */ 304 st0_tag = FPU_gettag0();
470 EXCEPTION(EX_StackUnder); 305
471 setcc(SW_C3 | SW_C2 | SW_C0); 306 /* Stack underflow has priority */
472 if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) ) 307 if (NOT_EMPTY_ST0) {
473 FPU_pop(); /* fcomp */ 308 if (addr_modes.default_mode & PROTECTED) {
309 /* This table works for 16 and 32 bit protected mode */
310 if (access_limit <
311 data_sizes_16[(byte1 >> 1) & 3])
312 math_abort(FPU_info, SIGSEGV);
313 }
314
315 unmasked = 0; /* Do this here to stop compiler warnings. */
316 switch ((byte1 >> 1) & 3) {
317 case 0:
318 unmasked =
319 FPU_load_single((float __user *)
320 data_address,
321 &loaded_data);
322 loaded_tag = unmasked & 0xff;
323 unmasked &= ~0xff;
324 break;
325 case 1:
326 loaded_tag =
327 FPU_load_int32((long __user *)
328 data_address,
329 &loaded_data);
330 break;
331 case 2:
332 unmasked =
333 FPU_load_double((double __user *)
334 data_address,
335 &loaded_data);
336 loaded_tag = unmasked & 0xff;
337 unmasked &= ~0xff;
338 break;
339 case 3:
340 default: /* Used here to suppress gcc warnings. */
341 loaded_tag =
342 FPU_load_int16((short __user *)
343 data_address,
344 &loaded_data);
345 break;
346 }
347
348 /* No more access to user memory, it is safe
349 to use static data now */
350
351 /* NaN operands have the next priority. */
352 /* We have to delay looking at st(0) until after
353 loading the data, because that data might contain an SNaN */
354 if (((st0_tag == TAG_Special) && isNaN(st0_ptr))
355 || ((loaded_tag == TAG_Special)
356 && isNaN(&loaded_data))) {
357 /* Restore the status word; we might have loaded a
358 denormal. */
359 partial_status = status1;
360 if ((FPU_modrm & 0x30) == 0x10) {
361 /* fcom or fcomp */
362 EXCEPTION(EX_Invalid);
363 setcc(SW_C3 | SW_C2 | SW_C0);
364 if ((FPU_modrm & 0x08)
365 && (control_word &
366 CW_Invalid))
367 FPU_pop(); /* fcomp, masked, so we pop. */
368 } else {
369 if (loaded_tag == TAG_Special)
370 loaded_tag =
371 FPU_Special
372 (&loaded_data);
373#ifdef PECULIAR_486
374 /* This is not really needed, but gives behaviour
375 identical to an 80486 */
376 if ((FPU_modrm & 0x28) == 0x20)
377 /* fdiv or fsub */
378 real_2op_NaN
379 (&loaded_data,
380 loaded_tag, 0,
381 &loaded_data);
382 else
383#endif /* PECULIAR_486 */
384 /* fadd, fdivr, fmul, or fsubr */
385 real_2op_NaN
386 (&loaded_data,
387 loaded_tag, 0,
388 st0_ptr);
389 }
390 goto reg_mem_instr_done;
391 }
392
393 if (unmasked && !((FPU_modrm & 0x30) == 0x10)) {
394 /* Is not a comparison instruction. */
395 if ((FPU_modrm & 0x38) == 0x38) {
396 /* fdivr */
397 if ((st0_tag == TAG_Zero) &&
398 ((loaded_tag == TAG_Valid)
399 || (loaded_tag ==
400 TAG_Special
401 &&
402 isdenormal
403 (&loaded_data)))) {
404 if (FPU_divide_by_zero
405 (0,
406 getsign
407 (&loaded_data))
408 < 0) {
409 /* We use the fact here that the unmasked
410 exception in the loaded data was for a
411 denormal operand */
412 /* Restore the state of the denormal op bit */
413 partial_status
414 &=
415 ~SW_Denorm_Op;
416 partial_status
417 |=
418 status1 &
419 SW_Denorm_Op;
420 } else
421 setsign(st0_ptr,
422 getsign
423 (&loaded_data));
424 }
425 }
426 goto reg_mem_instr_done;
427 }
428
429 switch ((FPU_modrm >> 3) & 7) {
430 case 0: /* fadd */
431 clear_C1();
432 FPU_add(&loaded_data, loaded_tag, 0,
433 control_word);
434 break;
435 case 1: /* fmul */
436 clear_C1();
437 FPU_mul(&loaded_data, loaded_tag, 0,
438 control_word);
439 break;
440 case 2: /* fcom */
441 FPU_compare_st_data(&loaded_data,
442 loaded_tag);
443 break;
444 case 3: /* fcomp */
445 if (!FPU_compare_st_data
446 (&loaded_data, loaded_tag)
447 && !unmasked)
448 FPU_pop();
449 break;
450 case 4: /* fsub */
451 clear_C1();
452 FPU_sub(LOADED | loaded_tag,
453 (int)&loaded_data,
454 control_word);
455 break;
456 case 5: /* fsubr */
457 clear_C1();
458 FPU_sub(REV | LOADED | loaded_tag,
459 (int)&loaded_data,
460 control_word);
461 break;
462 case 6: /* fdiv */
463 clear_C1();
464 FPU_div(LOADED | loaded_tag,
465 (int)&loaded_data,
466 control_word);
467 break;
468 case 7: /* fdivr */
469 clear_C1();
470 if (st0_tag == TAG_Zero)
471 partial_status = status1; /* Undo any denorm tag,
472 zero-divide has priority. */
473 FPU_div(REV | LOADED | loaded_tag,
474 (int)&loaded_data,
475 control_word);
476 break;
477 }
478 } else {
479 if ((FPU_modrm & 0x30) == 0x10) {
480 /* The instruction is fcom or fcomp */
481 EXCEPTION(EX_StackUnder);
482 setcc(SW_C3 | SW_C2 | SW_C0);
483 if ((FPU_modrm & 0x08)
484 && (control_word & CW_Invalid))
485 FPU_pop(); /* fcomp */
486 } else
487 FPU_stack_underflow();
488 }
489 reg_mem_instr_done:
490 operand_address = data_sel_off;
491 } else {
492 if (!(no_ip_update =
493 FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6))
494 >> 1, addr_modes, data_address))) {
495 operand_address = data_sel_off;
496 }
474 } 497 }
475 else
476 FPU_stack_underflow();
477 }
478 reg_mem_instr_done:
479 operand_address = data_sel_off;
480 }
481 else
482 {
483 if ( !(no_ip_update =
484 FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6)) >> 1,
485 addr_modes, data_address)) )
486 {
487 operand_address = data_sel_off;
488 }
489 }
490 498
491 } 499 } else {
492 else 500 /* None of these instructions access user memory */
493 { 501 u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7);
494 /* None of these instructions access user memory */
495 u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7);
496 502
497#ifdef PECULIAR_486 503#ifdef PECULIAR_486
498 /* This is supposed to be undefined, but a real 80486 seems 504 /* This is supposed to be undefined, but a real 80486 seems
499 to do this: */ 505 to do this: */
500 operand_address.offset = 0; 506 operand_address.offset = 0;
501 operand_address.selector = FPU_DS; 507 operand_address.selector = FPU_DS;
502#endif /* PECULIAR_486 */ 508#endif /* PECULIAR_486 */
503 509
504 st0_ptr = &st(0); 510 st0_ptr = &st(0);
505 st0_tag = FPU_gettag0(); 511 st0_tag = FPU_gettag0();
506 switch ( type_table[(int) instr_index] ) 512 switch (type_table[(int)instr_index]) {
507 { 513 case _NONE_: /* also _REGIc: _REGIn */
508 case _NONE_: /* also _REGIc: _REGIn */ 514 break;
509 break; 515 case _REG0_:
510 case _REG0_: 516 if (!NOT_EMPTY_ST0) {
511 if ( !NOT_EMPTY_ST0 ) 517 FPU_stack_underflow();
512 { 518 goto FPU_instruction_done;
513 FPU_stack_underflow(); 519 }
514 goto FPU_instruction_done; 520 break;
515 } 521 case _REGIi:
516 break; 522 if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
517 case _REGIi: 523 FPU_stack_underflow_i(FPU_rm);
518 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) 524 goto FPU_instruction_done;
519 { 525 }
520 FPU_stack_underflow_i(FPU_rm); 526 break;
521 goto FPU_instruction_done; 527 case _REGIp:
522 } 528 if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
523 break; 529 FPU_stack_underflow_pop(FPU_rm);
524 case _REGIp: 530 goto FPU_instruction_done;
525 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) 531 }
526 { 532 break;
527 FPU_stack_underflow_pop(FPU_rm); 533 case _REGI_:
528 goto FPU_instruction_done; 534 if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
529 } 535 FPU_stack_underflow();
530 break; 536 goto FPU_instruction_done;
531 case _REGI_: 537 }
532 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) 538 break;
533 { 539 case _PUSH_: /* Only used by the fld st(i) instruction */
534 FPU_stack_underflow(); 540 break;
535 goto FPU_instruction_done; 541 case _null_:
536 } 542 FPU_illegal();
537 break; 543 goto FPU_instruction_done;
538 case _PUSH_: /* Only used by the fld st(i) instruction */ 544 default:
539 break; 545 EXCEPTION(EX_INTERNAL | 0x111);
540 case _null_: 546 goto FPU_instruction_done;
541 FPU_illegal(); 547 }
542 goto FPU_instruction_done; 548 (*st_instr_table[(int)instr_index]) ();
543 default:
544 EXCEPTION(EX_INTERNAL|0x111);
545 goto FPU_instruction_done;
546 }
547 (*st_instr_table[(int) instr_index])();
548 549
549FPU_instruction_done: 550 FPU_instruction_done:
550 ; 551 ;
551 } 552 }
552 553
553 if ( ! no_ip_update ) 554 if (!no_ip_update)
554 instruction_address = entry_sel_off; 555 instruction_address = entry_sel_off;
555 556
556FPU_fwait_done: 557 FPU_fwait_done:
557 558
558#ifdef DEBUG 559#ifdef DEBUG
559 RE_ENTRANT_CHECK_OFF; 560 RE_ENTRANT_CHECK_OFF;
560 FPU_printall(); 561 FPU_printall();
561 RE_ENTRANT_CHECK_ON; 562 RE_ENTRANT_CHECK_ON;
562#endif /* DEBUG */ 563#endif /* DEBUG */
563 564
564 if (FPU_lookahead && !need_resched()) 565 if (FPU_lookahead && !need_resched()) {
565 { 566 FPU_ORIG_EIP = FPU_EIP - code_base;
566 FPU_ORIG_EIP = FPU_EIP - code_base; 567 if (valid_prefix(&byte1, (u_char __user **) & FPU_EIP,
567 if ( valid_prefix(&byte1, (u_char __user **)&FPU_EIP, 568 &addr_modes.override))
568 &addr_modes.override) ) 569 goto do_another_FPU_instruction;
569 goto do_another_FPU_instruction; 570 }
570 }
571 571
572 if ( addr_modes.default_mode ) 572 if (addr_modes.default_mode)
573 FPU_EIP -= code_base; 573 FPU_EIP -= code_base;
574 574
575 RE_ENTRANT_CHECK_OFF; 575 RE_ENTRANT_CHECK_OFF;
576} 576}
577 577
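
A compressed view of the control flow in math_emulate() above, kept as a reference sketch:

/*
 *   valid_prefix()          consume size/segment/rep prefixes, return byte1
 *   byte1 == FWAIT          report a pending exception or finish
 *   byte1 in 0xd8..0xdf     fetch the mod/rm byte
 *     mod/rm <  0xc0        memory operand: FPU_get_address*() then either a
 *                           load + FPU_add/mul/sub/div/compare on st(0), or
 *                           FPU_load_store() for the remaining encodings
 *     mod/rm >= 0xc0        register form: check type_table[], then dispatch
 *                           through st_instr_table[]
 *   FPU_lookahead           if set (trap flag clear), loop back and emulate
 *                           the next FPU instruction without returning
 */
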
578
579/* Support for prefix bytes is not yet complete. To properly handle 578/* Support for prefix bytes is not yet complete. To properly handle
580 all prefix bytes, further changes are needed in the emulator code 579 all prefix bytes, further changes are needed in the emulator code
581 which accesses user address space. Access to separate segments is 580 which accesses user address space. Access to separate segments is
582 important for msdos emulation. */ 581 important for msdos emulation. */
583static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, 582static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
584 overrides *override) 583 overrides * override)
585{ 584{
586 u_char byte; 585 u_char byte;
587 u_char __user *ip = *fpu_eip; 586 u_char __user *ip = *fpu_eip;
588 587
589 *override = (overrides) { 0, 0, PREFIX_DEFAULT }; /* defaults */ 588 *override = (overrides) {
590 589 0, 0, PREFIX_DEFAULT}; /* defaults */
591 RE_ENTRANT_CHECK_OFF; 590
592 FPU_code_access_ok(1); 591 RE_ENTRANT_CHECK_OFF;
593 FPU_get_user(byte, ip); 592 FPU_code_access_ok(1);
594 RE_ENTRANT_CHECK_ON; 593 FPU_get_user(byte, ip);
595 594 RE_ENTRANT_CHECK_ON;
596 while ( 1 ) 595
597 { 596 while (1) {
598 switch ( byte ) 597 switch (byte) {
599 { 598 case ADDR_SIZE_PREFIX:
600 case ADDR_SIZE_PREFIX: 599 override->address_size = ADDR_SIZE_PREFIX;
601 override->address_size = ADDR_SIZE_PREFIX; 600 goto do_next_byte;
602 goto do_next_byte; 601
603 602 case OP_SIZE_PREFIX:
604 case OP_SIZE_PREFIX: 603 override->operand_size = OP_SIZE_PREFIX;
605 override->operand_size = OP_SIZE_PREFIX; 604 goto do_next_byte;
606 goto do_next_byte; 605
607 606 case PREFIX_CS:
608 case PREFIX_CS: 607 override->segment = PREFIX_CS_;
609 override->segment = PREFIX_CS_; 608 goto do_next_byte;
610 goto do_next_byte; 609 case PREFIX_ES:
611 case PREFIX_ES: 610 override->segment = PREFIX_ES_;
612 override->segment = PREFIX_ES_; 611 goto do_next_byte;
613 goto do_next_byte; 612 case PREFIX_SS:
614 case PREFIX_SS: 613 override->segment = PREFIX_SS_;
615 override->segment = PREFIX_SS_; 614 goto do_next_byte;
616 goto do_next_byte; 615 case PREFIX_FS:
617 case PREFIX_FS: 616 override->segment = PREFIX_FS_;
618 override->segment = PREFIX_FS_; 617 goto do_next_byte;
619 goto do_next_byte; 618 case PREFIX_GS:
620 case PREFIX_GS: 619 override->segment = PREFIX_GS_;
621 override->segment = PREFIX_GS_; 620 goto do_next_byte;
622 goto do_next_byte; 621 case PREFIX_DS:
623 case PREFIX_DS: 622 override->segment = PREFIX_DS_;
624 override->segment = PREFIX_DS_; 623 goto do_next_byte;
625 goto do_next_byte;
626 624
627/* lock is not a valid prefix for FPU instructions, 625/* lock is not a valid prefix for FPU instructions,
628 let the cpu handle it to generate a SIGILL. */ 626 let the cpu handle it to generate a SIGILL. */
629/* case PREFIX_LOCK: */ 627/* case PREFIX_LOCK: */
630 628
631 /* rep.. prefixes have no meaning for FPU instructions */ 629 /* rep.. prefixes have no meaning for FPU instructions */
632 case PREFIX_REPE: 630 case PREFIX_REPE:
633 case PREFIX_REPNE: 631 case PREFIX_REPNE:
634 632
635 do_next_byte: 633 do_next_byte:
636 ip++; 634 ip++;
637 RE_ENTRANT_CHECK_OFF; 635 RE_ENTRANT_CHECK_OFF;
638 FPU_code_access_ok(1); 636 FPU_code_access_ok(1);
639 FPU_get_user(byte, ip); 637 FPU_get_user(byte, ip);
640 RE_ENTRANT_CHECK_ON; 638 RE_ENTRANT_CHECK_ON;
641 break; 639 break;
642 case FWAIT_OPCODE: 640 case FWAIT_OPCODE:
643 *Byte = byte; 641 *Byte = byte;
644 return 1; 642 return 1;
645 default: 643 default:
646 if ( (byte & 0xf8) == 0xd8 ) 644 if ((byte & 0xf8) == 0xd8) {
647 { 645 *Byte = byte;
648 *Byte = byte; 646 *fpu_eip = ip;
649 *fpu_eip = ip; 647 return 1;
650 return 1; 648 } else {
651 } 649 /* Not a valid sequence of prefix bytes followed by
652 else 650 an FPU instruction. */
653 { 651 *Byte = byte; /* Needed for error message. */
654 /* Not a valid sequence of prefix bytes followed by 652 return 0;
655 an FPU instruction. */ 653 }
656 *Byte = byte; /* Needed for error message. */ 654 }
657 return 0;
658 }
659 } 655 }
660 }
661} 656}
662 657
663 658void math_abort(struct info *info, unsigned int signal)
664void math_abort(struct info * info, unsigned int signal)
665{ 659{
666 FPU_EIP = FPU_ORIG_EIP; 660 FPU_EIP = FPU_ORIG_EIP;
667 current->thread.trap_no = 16; 661 current->thread.trap_no = 16;
668 current->thread.error_code = 0; 662 current->thread.error_code = 0;
669 send_sig(signal,current,1); 663 send_sig(signal, current, 1);
670 RE_ENTRANT_CHECK_OFF; 664 RE_ENTRANT_CHECK_OFF;
671 __asm__("movl %0,%%esp ; ret": :"g" (((long) info)-4)); 665 __asm__("movl %0,%%esp ; ret": :"g"(((long)info) - 4));
672#ifdef PARANOID 666#ifdef PARANOID
673 printk("ERROR: wm-FPU-emu math_abort failed!\n"); 667 printk("ERROR: wm-FPU-emu math_abort failed!\n");
674#endif /* PARANOID */ 668#endif /* PARANOID */
675} 669}
676 670
677
678
679#define S387 ((struct i387_soft_struct *)s387) 671#define S387 ((struct i387_soft_struct *)s387)
680#define sstatus_word() \ 672#define sstatus_word() \
681 ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top)) 673 ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top))
682 674
683int restore_i387_soft(void *s387, struct _fpstate __user *buf) 675int fpregs_soft_set(struct task_struct *target,
676 const struct user_regset *regset,
677 unsigned int pos, unsigned int count,
678 const void *kbuf, const void __user *ubuf)
684{ 679{
685 u_char __user *d = (u_char __user *)buf; 680 struct i387_soft_struct *s387 = &target->thread.i387.soft;
686 int offset, other, i, tags, regnr, tag, newtop; 681 void *space = s387->st_space;
687 682 int ret;
688 RE_ENTRANT_CHECK_OFF; 683 int offset, other, i, tags, regnr, tag, newtop;
689 FPU_access_ok(VERIFY_READ, d, 7*4 + 8*10); 684
690 if (__copy_from_user(&S387->cwd, d, 7*4)) 685 RE_ENTRANT_CHECK_OFF;
691 return -1; 686 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0,
692 RE_ENTRANT_CHECK_ON; 687 offsetof(struct i387_soft_struct, st_space));
693 688 RE_ENTRANT_CHECK_ON;
694 d += 7*4; 689
695 690 if (ret)
696 S387->ftop = (S387->swd >> SW_Top_Shift) & 7; 691 return ret;
697 offset = (S387->ftop & 7) * 10; 692
698 other = 80 - offset; 693 S387->ftop = (S387->swd >> SW_Top_Shift) & 7;
699 694 offset = (S387->ftop & 7) * 10;
700 RE_ENTRANT_CHECK_OFF; 695 other = 80 - offset;
701 /* Copy all registers in stack order. */ 696
702 if (__copy_from_user(((u_char *)&S387->st_space)+offset, d, other)) 697 RE_ENTRANT_CHECK_OFF;
703 return -1; 698
704 if ( offset ) 699 /* Copy all registers in stack order. */
705 if (__copy_from_user((u_char *)&S387->st_space, d+other, offset)) 700 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
706 return -1; 701 space + offset, 0, other);
707 RE_ENTRANT_CHECK_ON; 702 if (!ret && offset)
708 703 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
709 /* The tags may need to be corrected now. */ 704 space, 0, offset);
710 tags = S387->twd; 705
711 newtop = S387->ftop; 706 RE_ENTRANT_CHECK_ON;
712 for ( i = 0; i < 8; i++ ) 707
713 { 708 /* The tags may need to be corrected now. */
714 regnr = (i+newtop) & 7; 709 tags = S387->twd;
715 if ( ((tags >> ((regnr & 7)*2)) & 3) != TAG_Empty ) 710 newtop = S387->ftop;
716 { 711 for (i = 0; i < 8; i++) {
717 /* The loaded data over-rides all other cases. */ 712 regnr = (i + newtop) & 7;
718 tag = FPU_tagof((FPU_REG *)((u_char *)S387->st_space + 10*regnr)); 713 if (((tags >> ((regnr & 7) * 2)) & 3) != TAG_Empty) {
719 tags &= ~(3 << (regnr*2)); 714 /* The loaded data over-rides all other cases. */
720 tags |= (tag & 3) << (regnr*2); 715 tag =
716 FPU_tagof((FPU_REG *) ((u_char *) S387->st_space +
717 10 * regnr));
718 tags &= ~(3 << (regnr * 2));
719 tags |= (tag & 3) << (regnr * 2);
720 }
721 } 721 }
722 } 722 S387->twd = tags;
723 S387->twd = tags;
724 723
725 return 0; 724 return ret;
726} 725}
727 726
728 727int fpregs_soft_get(struct task_struct *target,
729int save_i387_soft(void *s387, struct _fpstate __user * buf) 728 const struct user_regset *regset,
729 unsigned int pos, unsigned int count,
730 void *kbuf, void __user *ubuf)
730{ 731{
731 u_char __user *d = (u_char __user *)buf; 732 struct i387_soft_struct *s387 = &target->thread.i387.soft;
732 int offset = (S387->ftop & 7) * 10, other = 80 - offset; 733 const void *space = s387->st_space;
734 int ret;
735 int offset = (S387->ftop & 7) * 10, other = 80 - offset;
736
737 RE_ENTRANT_CHECK_OFF;
733 738
734 RE_ENTRANT_CHECK_OFF;
735 FPU_access_ok(VERIFY_WRITE, d, 7*4 + 8*10);
736#ifdef PECULIAR_486 739#ifdef PECULIAR_486
737 S387->cwd &= ~0xe080; 740 S387->cwd &= ~0xe080;
738 /* An 80486 sets nearly all of the reserved bits to 1. */ 741 /* An 80486 sets nearly all of the reserved bits to 1. */
739 S387->cwd |= 0xffff0040; 742 S387->cwd |= 0xffff0040;
740 S387->swd = sstatus_word() | 0xffff0000; 743 S387->swd = sstatus_word() | 0xffff0000;
741 S387->twd |= 0xffff0000; 744 S387->twd |= 0xffff0000;
742 S387->fcs &= ~0xf8000000; 745 S387->fcs &= ~0xf8000000;
743 S387->fos |= 0xffff0000; 746 S387->fos |= 0xffff0000;
744#endif /* PECULIAR_486 */ 747#endif /* PECULIAR_486 */
745 if (__copy_to_user(d, &S387->cwd, 7*4)) 748
746 return -1; 749 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0,
747 RE_ENTRANT_CHECK_ON; 750 offsetof(struct i387_soft_struct, st_space));
748 751
749 d += 7*4; 752 /* Copy all registers in stack order. */
750 753 if (!ret)
751 RE_ENTRANT_CHECK_OFF; 754 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
752 /* Copy all registers in stack order. */ 755 space + offset, 0, other);
753 if (__copy_to_user(d, ((u_char *)&S387->st_space)+offset, other)) 756 if (!ret)
754 return -1; 757 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
755 if ( offset ) 758 space, 0, offset);
756 if (__copy_to_user(d+other, (u_char *)&S387->st_space, offset)) 759
757 return -1; 760 RE_ENTRANT_CHECK_ON;
758 RE_ENTRANT_CHECK_ON; 761
759 762 return ret;
760 return 1;
761} 763}
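
A side note on the copy direction in fpregs_soft_get()/fpregs_soft_set() above: st_space holds the eight 10-byte registers in physical order, but the exported image is in stack order, so the copy starts at st(0) (offset (ftop & 7) * 10) and wraps around. Below is a minimal user-space sketch of that rotation, under the assumption of 10-byte registers and an 8-entry stack as in the code above; copy_in_stack_order() and the main() harness are illustrative names, not kernel interfaces.

/*
 * Minimal user-space sketch (not kernel code) of the stack-order rotation
 * used by fpregs_soft_get()/fpregs_soft_set() above.  st(0) lives at
 * st_space + (ftop & 7) * 10, so the image is built from two contiguous copies.
 */
#include <stdio.h>
#include <string.h>

#define REG_BYTES 10
#define NR_REGS    8

static void copy_in_stack_order(const unsigned char *st_space,
                                unsigned int ftop, unsigned char *out)
{
        unsigned int offset = (ftop & 7) * REG_BYTES;   /* start of st(0) */
        unsigned int other  = NR_REGS * REG_BYTES - offset;

        memcpy(out, st_space + offset, other);          /* st(0) .. end of array */
        if (offset)
                memcpy(out + other, st_space, offset);  /* wrap-around part */
}

int main(void)
{
        unsigned char space[NR_REGS * REG_BYTES], out[NR_REGS * REG_BYTES];
        unsigned int i;

        for (i = 0; i < NR_REGS; i++)
                memset(space + i * REG_BYTES, i, REG_BYTES);    /* mark reg i */

        copy_in_stack_order(space, 3, out);     /* pretend ftop == 3 */

        for (i = 0; i < NR_REGS; i++)
                printf("st(%u) came from physical register %d\n",
                       i, (int)out[i * REG_BYTES]);
        return 0;
}

Splitting the transfer into an "other" part and an "offset" part, as the regset helpers above do, avoids a temporary buffer and keeps each piece a single contiguous copy.
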
diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c
index e3b5d465587..233e5af566f 100644
--- a/arch/x86/math-emu/fpu_etc.c
+++ b/arch/x86/math-emu/fpu_etc.c
@@ -16,128 +16,115 @@
16#include "status_w.h" 16#include "status_w.h"
17#include "reg_constant.h" 17#include "reg_constant.h"
18 18
19
20static void fchs(FPU_REG *st0_ptr, u_char st0tag) 19static void fchs(FPU_REG *st0_ptr, u_char st0tag)
21{ 20{
22 if ( st0tag ^ TAG_Empty ) 21 if (st0tag ^ TAG_Empty) {
23 { 22 signbyte(st0_ptr) ^= SIGN_NEG;
24 signbyte(st0_ptr) ^= SIGN_NEG; 23 clear_C1();
25 clear_C1(); 24 } else
26 } 25 FPU_stack_underflow();
27 else
28 FPU_stack_underflow();
29} 26}
30 27
31
32static void fabs(FPU_REG *st0_ptr, u_char st0tag) 28static void fabs(FPU_REG *st0_ptr, u_char st0tag)
33{ 29{
34 if ( st0tag ^ TAG_Empty ) 30 if (st0tag ^ TAG_Empty) {
35 { 31 setpositive(st0_ptr);
36 setpositive(st0_ptr); 32 clear_C1();
37 clear_C1(); 33 } else
38 } 34 FPU_stack_underflow();
39 else
40 FPU_stack_underflow();
41} 35}
42 36
43
44static void ftst_(FPU_REG *st0_ptr, u_char st0tag) 37static void ftst_(FPU_REG *st0_ptr, u_char st0tag)
45{ 38{
46 switch (st0tag) 39 switch (st0tag) {
47 { 40 case TAG_Zero:
48 case TAG_Zero:
49 setcc(SW_C3);
50 break;
51 case TAG_Valid:
52 if (getsign(st0_ptr) == SIGN_POS)
53 setcc(0);
54 else
55 setcc(SW_C0);
56 break;
57 case TAG_Special:
58 switch ( FPU_Special(st0_ptr) )
59 {
60 case TW_Denormal:
61 if (getsign(st0_ptr) == SIGN_POS)
62 setcc(0);
63 else
64 setcc(SW_C0);
65 if ( denormal_operand() < 0 )
66 {
67#ifdef PECULIAR_486
68 /* This is weird! */
69 if (getsign(st0_ptr) == SIGN_POS)
70 setcc(SW_C3); 41 setcc(SW_C3);
42 break;
43 case TAG_Valid:
44 if (getsign(st0_ptr) == SIGN_POS)
45 setcc(0);
46 else
47 setcc(SW_C0);
48 break;
49 case TAG_Special:
50 switch (FPU_Special(st0_ptr)) {
51 case TW_Denormal:
52 if (getsign(st0_ptr) == SIGN_POS)
53 setcc(0);
54 else
55 setcc(SW_C0);
56 if (denormal_operand() < 0) {
57#ifdef PECULIAR_486
58 /* This is weird! */
59 if (getsign(st0_ptr) == SIGN_POS)
60 setcc(SW_C3);
71#endif /* PECULIAR_486 */ 61#endif /* PECULIAR_486 */
72 return; 62 return;
73 } 63 }
74 break; 64 break;
75 case TW_NaN: 65 case TW_NaN:
76 setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */ 66 setcc(SW_C0 | SW_C2 | SW_C3); /* Operand is not comparable */
77 EXCEPTION(EX_Invalid); 67 EXCEPTION(EX_Invalid);
78 break; 68 break;
79 case TW_Infinity: 69 case TW_Infinity:
80 if (getsign(st0_ptr) == SIGN_POS) 70 if (getsign(st0_ptr) == SIGN_POS)
81 setcc(0); 71 setcc(0);
82 else 72 else
83 setcc(SW_C0); 73 setcc(SW_C0);
84 break; 74 break;
85 default: 75 default:
86 setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */ 76 setcc(SW_C0 | SW_C2 | SW_C3); /* Operand is not comparable */
87 EXCEPTION(EX_INTERNAL|0x14); 77 EXCEPTION(EX_INTERNAL | 0x14);
88 break; 78 break;
79 }
80 break;
81 case TAG_Empty:
82 setcc(SW_C0 | SW_C2 | SW_C3);
83 EXCEPTION(EX_StackUnder);
84 break;
89 } 85 }
90 break;
91 case TAG_Empty:
92 setcc(SW_C0|SW_C2|SW_C3);
93 EXCEPTION(EX_StackUnder);
94 break;
95 }
96} 86}
97 87
98
99static void fxam(FPU_REG *st0_ptr, u_char st0tag) 88static void fxam(FPU_REG *st0_ptr, u_char st0tag)
100{ 89{
101 int c = 0; 90 int c = 0;
102 switch (st0tag) 91 switch (st0tag) {
103 { 92 case TAG_Empty:
104 case TAG_Empty: 93 c = SW_C3 | SW_C0;
105 c = SW_C3|SW_C0; 94 break;
106 break; 95 case TAG_Zero:
107 case TAG_Zero: 96 c = SW_C3;
108 c = SW_C3; 97 break;
109 break; 98 case TAG_Valid:
110 case TAG_Valid: 99 c = SW_C2;
111 c = SW_C2; 100 break;
112 break; 101 case TAG_Special:
113 case TAG_Special: 102 switch (FPU_Special(st0_ptr)) {
114 switch ( FPU_Special(st0_ptr) ) 103 case TW_Denormal:
115 { 104 c = SW_C2 | SW_C3; /* Denormal */
116 case TW_Denormal: 105 break;
117 c = SW_C2|SW_C3; /* Denormal */ 106 case TW_NaN:
118 break; 107 /* We also use NaN for unsupported types. */
119 case TW_NaN: 108 if ((st0_ptr->sigh & 0x80000000)
120 /* We also use NaN for unsupported types. */ 109 && (exponent(st0_ptr) == EXP_OVER))
121 if ( (st0_ptr->sigh & 0x80000000) && (exponent(st0_ptr) == EXP_OVER) ) 110 c = SW_C0;
122 c = SW_C0; 111 break;
123 break; 112 case TW_Infinity:
124 case TW_Infinity: 113 c = SW_C2 | SW_C0;
125 c = SW_C2|SW_C0; 114 break;
126 break; 115 }
127 } 116 }
128 } 117 if (getsign(st0_ptr) == SIGN_NEG)
129 if ( getsign(st0_ptr) == SIGN_NEG ) 118 c |= SW_C1;
130 c |= SW_C1; 119 setcc(c);
131 setcc(c);
132} 120}
133 121
134
135static FUNC_ST0 const fp_etc_table[] = { 122static FUNC_ST0 const fp_etc_table[] = {
136 fchs, fabs, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal, 123 fchs, fabs, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal,
137 ftst_, fxam, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal 124 ftst_, fxam, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal
138}; 125};
139 126
140void FPU_etc(void) 127void FPU_etc(void)
141{ 128{
142 (fp_etc_table[FPU_rm])(&st(0), FPU_gettag0()); 129 (fp_etc_table[FPU_rm]) (&st(0), FPU_gettag0());
143} 130}
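
The fxam() case analysis above packs the operand class into the C3/C2/C0 condition-code bits and the sign into C1. Below is a compact user-space sketch of that mapping, assuming the conventional x87 status-word bit positions (C0 = bit 8, C1 = bit 9, C2 = bit 10, C3 = bit 14); the enum and fxam_cc() names are illustrative only.

/*
 * Illustrative user-space sketch (not kernel code) of the condition-code
 * pattern built by fxam() above.  The NaN case shown is the genuine-NaN
 * branch; unsupported encodings leave C3/C2/C0 clear, as in the emulator.
 */
#include <stdio.h>

#define SW_C0 (1 << 8)
#define SW_C1 (1 << 9)
#define SW_C2 (1 << 10)
#define SW_C3 (1 << 14)

enum fp_class { CLASS_EMPTY, CLASS_ZERO, CLASS_VALID, CLASS_DENORMAL,
                CLASS_NAN, CLASS_INFINITY };

static unsigned int fxam_cc(enum fp_class cls, int negative)
{
        unsigned int c = 0;

        switch (cls) {
        case CLASS_EMPTY:    c = SW_C3 | SW_C0; break;
        case CLASS_ZERO:     c = SW_C3;         break;
        case CLASS_VALID:    c = SW_C2;         break;
        case CLASS_DENORMAL: c = SW_C2 | SW_C3; break;
        case CLASS_NAN:      c = SW_C0;         break;
        case CLASS_INFINITY: c = SW_C2 | SW_C0; break;
        }
        if (negative)
                c |= SW_C1;     /* C1 carries the sign */
        return c;
}

int main(void)
{
        printf("-zero -> %#x\n", fxam_cc(CLASS_ZERO, 1));
        printf("+inf  -> %#x\n", fxam_cc(CLASS_INFINITY, 0));
        return 0;
}
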
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
index 37a8a7fe7e2..aa49b6a0d85 100644
--- a/arch/x86/math-emu/fpu_proto.h
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -66,7 +66,7 @@ extern int FPU_Special(FPU_REG const *ptr);
66extern int isNaN(FPU_REG const *ptr); 66extern int isNaN(FPU_REG const *ptr);
67extern void FPU_pop(void); 67extern void FPU_pop(void);
68extern int FPU_empty_i(int stnr); 68extern int FPU_empty_i(int stnr);
69extern int FPU_stackoverflow(FPU_REG **st_new_ptr); 69extern int FPU_stackoverflow(FPU_REG ** st_new_ptr);
70extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr); 70extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr);
71extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag); 71extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag);
72extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag); 72extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag);
@@ -75,21 +75,23 @@ extern void FPU_triga(void);
75extern void FPU_trigb(void); 75extern void FPU_trigb(void);
76/* get_address.c */ 76/* get_address.c */
77extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip, 77extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
78 struct address *addr, fpu_addr_modes addr_modes); 78 struct address *addr,
79 fpu_addr_modes addr_modes);
79extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, 80extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
80 struct address *addr, fpu_addr_modes addr_modes); 81 struct address *addr,
82 fpu_addr_modes addr_modes);
81/* load_store.c */ 83/* load_store.c */
82extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes, 84extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
83 void __user *data_address); 85 void __user * data_address);
84/* poly_2xm1.c */ 86/* poly_2xm1.c */
85extern int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result); 87extern int poly_2xm1(u_char sign, FPU_REG * arg, FPU_REG *result);
86/* poly_atan.c */ 88/* poly_atan.c */
87extern void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, FPU_REG *st1_ptr, 89extern void poly_atan(FPU_REG * st0_ptr, u_char st0_tag, FPU_REG *st1_ptr,
88 u_char st1_tag); 90 u_char st1_tag);
89/* poly_l2.c */ 91/* poly_l2.c */
90extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign); 92extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign);
91extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1, 93extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1,
92 FPU_REG *d); 94 FPU_REG * d);
93/* poly_sin.c */ 95/* poly_sin.c */
94extern void poly_sine(FPU_REG *st0_ptr); 96extern void poly_sine(FPU_REG *st0_ptr);
95extern void poly_cos(FPU_REG *st0_ptr); 97extern void poly_cos(FPU_REG *st0_ptr);
@@ -117,10 +119,13 @@ extern int FPU_load_int32(long __user *_s, FPU_REG *loaded_data);
117extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data); 119extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data);
118extern int FPU_load_bcd(u_char __user *s); 120extern int FPU_load_bcd(u_char __user *s);
119extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, 121extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
120 long double __user *d); 122 long double __user * d);
121extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat); 123extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag,
122extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single); 124 double __user * dfloat);
123extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d); 125extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag,
126 float __user * single);
127extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag,
128 long long __user * d);
124extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d); 129extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d);
125extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d); 130extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d);
126extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d); 131extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d);
@@ -137,4 +142,3 @@ extern int FPU_div(int flags, int regrm, int control_w);
137/* reg_convert.c */ 142/* reg_convert.c */
138extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x); 143extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x);
139#endif /* _FPU_PROTO_H */ 144#endif /* _FPU_PROTO_H */
140
diff --git a/arch/x86/math-emu/fpu_tags.c b/arch/x86/math-emu/fpu_tags.c
index cb436fe20e4..d9c657cd774 100644
--- a/arch/x86/math-emu/fpu_tags.c
+++ b/arch/x86/math-emu/fpu_tags.c
@@ -14,114 +14,102 @@
14#include "fpu_system.h" 14#include "fpu_system.h"
15#include "exception.h" 15#include "exception.h"
16 16
17
18void FPU_pop(void) 17void FPU_pop(void)
19{ 18{
20 fpu_tag_word |= 3 << ((top & 7)*2); 19 fpu_tag_word |= 3 << ((top & 7) * 2);
21 top++; 20 top++;
22} 21}
23 22
24
25int FPU_gettag0(void) 23int FPU_gettag0(void)
26{ 24{
27 return (fpu_tag_word >> ((top & 7)*2)) & 3; 25 return (fpu_tag_word >> ((top & 7) * 2)) & 3;
28} 26}
29 27
30
31int FPU_gettagi(int stnr) 28int FPU_gettagi(int stnr)
32{ 29{
33 return (fpu_tag_word >> (((top+stnr) & 7)*2)) & 3; 30 return (fpu_tag_word >> (((top + stnr) & 7) * 2)) & 3;
34} 31}
35 32
36
37int FPU_gettag(int regnr) 33int FPU_gettag(int regnr)
38{ 34{
39 return (fpu_tag_word >> ((regnr & 7)*2)) & 3; 35 return (fpu_tag_word >> ((regnr & 7) * 2)) & 3;
40} 36}
41 37
42
43void FPU_settag0(int tag) 38void FPU_settag0(int tag)
44{ 39{
45 int regnr = top; 40 int regnr = top;
46 regnr &= 7; 41 regnr &= 7;
47 fpu_tag_word &= ~(3 << (regnr*2)); 42 fpu_tag_word &= ~(3 << (regnr * 2));
48 fpu_tag_word |= (tag & 3) << (regnr*2); 43 fpu_tag_word |= (tag & 3) << (regnr * 2);
49} 44}
50 45
51
52void FPU_settagi(int stnr, int tag) 46void FPU_settagi(int stnr, int tag)
53{ 47{
54 int regnr = stnr+top; 48 int regnr = stnr + top;
55 regnr &= 7; 49 regnr &= 7;
56 fpu_tag_word &= ~(3 << (regnr*2)); 50 fpu_tag_word &= ~(3 << (regnr * 2));
57 fpu_tag_word |= (tag & 3) << (regnr*2); 51 fpu_tag_word |= (tag & 3) << (regnr * 2);
58} 52}
59 53
60
61void FPU_settag(int regnr, int tag) 54void FPU_settag(int regnr, int tag)
62{ 55{
63 regnr &= 7; 56 regnr &= 7;
64 fpu_tag_word &= ~(3 << (regnr*2)); 57 fpu_tag_word &= ~(3 << (regnr * 2));
65 fpu_tag_word |= (tag & 3) << (regnr*2); 58 fpu_tag_word |= (tag & 3) << (regnr * 2);
66} 59}
67 60
68
69int FPU_Special(FPU_REG const *ptr) 61int FPU_Special(FPU_REG const *ptr)
70{ 62{
71 int exp = exponent(ptr); 63 int exp = exponent(ptr);
72 64
73 if ( exp == EXP_BIAS+EXP_UNDER ) 65 if (exp == EXP_BIAS + EXP_UNDER)
74 return TW_Denormal; 66 return TW_Denormal;
75 else if ( exp != EXP_BIAS+EXP_OVER ) 67 else if (exp != EXP_BIAS + EXP_OVER)
76 return TW_NaN; 68 return TW_NaN;
77 else if ( (ptr->sigh == 0x80000000) && (ptr->sigl == 0) ) 69 else if ((ptr->sigh == 0x80000000) && (ptr->sigl == 0))
78 return TW_Infinity; 70 return TW_Infinity;
79 return TW_NaN; 71 return TW_NaN;
80} 72}
81 73
82
83int isNaN(FPU_REG const *ptr) 74int isNaN(FPU_REG const *ptr)
84{ 75{
85 return ( (exponent(ptr) == EXP_BIAS+EXP_OVER) 76 return ((exponent(ptr) == EXP_BIAS + EXP_OVER)
86 && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)) ); 77 && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)));
87} 78}
88 79
89
90int FPU_empty_i(int stnr) 80int FPU_empty_i(int stnr)
91{ 81{
92 int regnr = (top+stnr) & 7; 82 int regnr = (top + stnr) & 7;
93 83
94 return ((fpu_tag_word >> (regnr*2)) & 3) == TAG_Empty; 84 return ((fpu_tag_word >> (regnr * 2)) & 3) == TAG_Empty;
95} 85}
96 86
97 87int FPU_stackoverflow(FPU_REG ** st_new_ptr)
98int FPU_stackoverflow(FPU_REG **st_new_ptr)
99{ 88{
100 *st_new_ptr = &st(-1); 89 *st_new_ptr = &st(-1);
101 90
102 return ((fpu_tag_word >> (((top - 1) & 7)*2)) & 3) != TAG_Empty; 91 return ((fpu_tag_word >> (((top - 1) & 7) * 2)) & 3) != TAG_Empty;
103} 92}
104 93
105
106void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr) 94void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr)
107{ 95{
108 reg_copy(r, &st(stnr)); 96 reg_copy(r, &st(stnr));
109 FPU_settagi(stnr, tag); 97 FPU_settagi(stnr, tag);
110} 98}
111 99
112void FPU_copy_to_reg1(FPU_REG const *r, u_char tag) 100void FPU_copy_to_reg1(FPU_REG const *r, u_char tag)
113{ 101{
114 reg_copy(r, &st(1)); 102 reg_copy(r, &st(1));
115 FPU_settagi(1, tag); 103 FPU_settagi(1, tag);
116} 104}
117 105
118void FPU_copy_to_reg0(FPU_REG const *r, u_char tag) 106void FPU_copy_to_reg0(FPU_REG const *r, u_char tag)
119{ 107{
120 int regnr = top; 108 int regnr = top;
121 regnr &= 7; 109 regnr &= 7;
122 110
123 reg_copy(r, &st(0)); 111 reg_copy(r, &st(0));
124 112
125 fpu_tag_word &= ~(3 << (regnr*2)); 113 fpu_tag_word &= ~(3 << (regnr * 2));
126 fpu_tag_word |= (tag & 3) << (regnr*2); 114 fpu_tag_word |= (tag & 3) << (regnr * 2);
127} 115}
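
All of the helpers above manipulate the same 16-bit tag word: two bits per physical register, with register r occupying bits 2r..2r+1 and TAG_Empty (3) meaning the slot is free. A self-contained sketch of that layout follows; settag()/gettag() here are illustrative stand-ins for FPU_settag()/FPU_gettag(), not kernel symbols.

/*
 * Self-contained sketch (plain C, illustrative names) of the tag-word
 * layout used by fpu_tags.c above.
 */
#include <stdio.h>

#define TAG_Valid   0
#define TAG_Zero    1
#define TAG_Special 2
#define TAG_Empty   3

static unsigned short settag(unsigned short tag_word, int regnr, int tag)
{
        regnr &= 7;
        tag_word &= ~(3 << (regnr * 2));
        tag_word |= (tag & 3) << (regnr * 2);
        return tag_word;
}

static int gettag(unsigned short tag_word, int regnr)
{
        return (tag_word >> ((regnr & 7) * 2)) & 3;
}

int main(void)
{
        unsigned short tw = 0xffff;             /* all eight registers empty */

        tw = settag(tw, 3, TAG_Zero);           /* "load" a zero into reg 3 */
        printf("tag word = %#06x, tag(3) = %d, tag(4) = %d\n",
               (unsigned)tw, gettag(tw, 3), gettag(tw, 4));
        return 0;
}
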
diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c
index 403cbde1d42..ecd06680581 100644
--- a/arch/x86/math-emu/fpu_trig.c
+++ b/arch/x86/math-emu/fpu_trig.c
@@ -15,11 +15,10 @@
15#include "fpu_emu.h" 15#include "fpu_emu.h"
16#include "status_w.h" 16#include "status_w.h"
17#include "control_w.h" 17#include "control_w.h"
18#include "reg_constant.h" 18#include "reg_constant.h"
19 19
20static void rem_kernel(unsigned long long st0, unsigned long long *y, 20static void rem_kernel(unsigned long long st0, unsigned long long *y,
21 unsigned long long st1, 21 unsigned long long st1, unsigned long long q, int n);
22 unsigned long long q, int n);
23 22
24#define BETTER_THAN_486 23#define BETTER_THAN_486
25 24
@@ -33,788 +32,706 @@ static void rem_kernel(unsigned long long st0, unsigned long long *y,
33 precision of the result sometimes degrades to about 63.9 bits */ 32 precision of the result sometimes degrades to about 63.9 bits */
34static int trig_arg(FPU_REG *st0_ptr, int even) 33static int trig_arg(FPU_REG *st0_ptr, int even)
35{ 34{
36 FPU_REG tmp; 35 FPU_REG tmp;
37 u_char tmptag; 36 u_char tmptag;
38 unsigned long long q; 37 unsigned long long q;
39 int old_cw = control_word, saved_status = partial_status; 38 int old_cw = control_word, saved_status = partial_status;
40 int tag, st0_tag = TAG_Valid; 39 int tag, st0_tag = TAG_Valid;
41 40
42 if ( exponent(st0_ptr) >= 63 ) 41 if (exponent(st0_ptr) >= 63) {
43 { 42 partial_status |= SW_C2; /* Reduction incomplete. */
44 partial_status |= SW_C2; /* Reduction incomplete. */ 43 return -1;
45 return -1; 44 }
46 }
47
48 control_word &= ~CW_RC;
49 control_word |= RC_CHOP;
50
51 setpositive(st0_ptr);
52 tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
53 SIGN_POS);
54
55 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow
56 to 2^64 */
57 q = significand(&tmp);
58 if ( q )
59 {
60 rem_kernel(significand(st0_ptr),
61 &significand(&tmp),
62 significand(&CONST_PI2),
63 q, exponent(st0_ptr) - exponent(&CONST_PI2));
64 setexponent16(&tmp, exponent(&CONST_PI2));
65 st0_tag = FPU_normalize(&tmp);
66 FPU_copy_to_reg0(&tmp, st0_tag);
67 }
68
69 if ( (even && !(q & 1)) || (!even && (q & 1)) )
70 {
71 st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, FULL_PRECISION);
72 45
73#ifdef BETTER_THAN_486 46 control_word &= ~CW_RC;
74 /* So far, the results are exact but based upon a 64 bit 47 control_word |= RC_CHOP;
75 precision approximation to pi/2. The technique used 48
76 now is equivalent to using an approximation to pi/2 which 49 setpositive(st0_ptr);
77 is accurate to about 128 bits. */ 50 tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
78 if ( (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64) || (q > 1) ) 51 SIGN_POS);
79 { 52
80 /* This code gives the effect of having pi/2 to better than 53 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow
81 128 bits precision. */ 54 to 2^64 */
82 55 q = significand(&tmp);
83 significand(&tmp) = q + 1; 56 if (q) {
84 setexponent16(&tmp, 63); 57 rem_kernel(significand(st0_ptr),
85 FPU_normalize(&tmp); 58 &significand(&tmp),
86 tmptag = 59 significand(&CONST_PI2),
87 FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, SIGN_POS, 60 q, exponent(st0_ptr) - exponent(&CONST_PI2));
88 exponent(&CONST_PI2extra) + exponent(&tmp)); 61 setexponent16(&tmp, exponent(&CONST_PI2));
89 setsign(&tmp, getsign(&CONST_PI2extra)); 62 st0_tag = FPU_normalize(&tmp);
90 st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION); 63 FPU_copy_to_reg0(&tmp, st0_tag);
91 if ( signnegative(st0_ptr) )
92 {
93 /* CONST_PI2extra is negative, so the result of the addition
94 can be negative. This means that the argument is actually
95 in a different quadrant. The correction is always < pi/2,
96 so it can't overflow into yet another quadrant. */
97 setpositive(st0_ptr);
98 q++;
99 }
100 } 64 }
65
66 if ((even && !(q & 1)) || (!even && (q & 1))) {
67 st0_tag =
68 FPU_sub(REV | LOADED | TAG_Valid, (int)&CONST_PI2,
69 FULL_PRECISION);
70
71#ifdef BETTER_THAN_486
72 /* So far, the results are exact but based upon a 64 bit
73 precision approximation to pi/2. The technique used
74 now is equivalent to using an approximation to pi/2 which
75 is accurate to about 128 bits. */
76 if ((exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)
77 || (q > 1)) {
78 /* This code gives the effect of having pi/2 to better than
79 128 bits precision. */
80
81 significand(&tmp) = q + 1;
82 setexponent16(&tmp, 63);
83 FPU_normalize(&tmp);
84 tmptag =
85 FPU_u_mul(&CONST_PI2extra, &tmp, &tmp,
86 FULL_PRECISION, SIGN_POS,
87 exponent(&CONST_PI2extra) +
88 exponent(&tmp));
89 setsign(&tmp, getsign(&CONST_PI2extra));
90 st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION);
91 if (signnegative(st0_ptr)) {
92 /* CONST_PI2extra is negative, so the result of the addition
93 can be negative. This means that the argument is actually
94 in a different quadrant. The correction is always < pi/2,
95 so it can't overflow into yet another quadrant. */
96 setpositive(st0_ptr);
97 q++;
98 }
99 }
101#endif /* BETTER_THAN_486 */ 100#endif /* BETTER_THAN_486 */
102 } 101 }
103#ifdef BETTER_THAN_486 102#ifdef BETTER_THAN_486
104 else 103 else {
105 { 104 /* So far, the results are exact but based upon a 64 bit
106 /* So far, the results are exact but based upon a 64 bit 105 precision approximation to pi/2. The technique used
107 precision approximation to pi/2. The technique used 106 now is equivalent to using an approximation to pi/2 which
108 now is equivalent to using an approximation to pi/2 which 107 is accurate to about 128 bits. */
109 is accurate to about 128 bits. */ 108 if (((q > 0)
110 if ( ((q > 0) && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)) 109 && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64))
111 || (q > 1) ) 110 || (q > 1)) {
112 { 111 /* This code gives the effect of having pi/2 to better than
113 /* This code gives the effect of having pi/2 to better than 112 128 bits precision. */
114 128 bits precision. */ 113
115 114 significand(&tmp) = q;
116 significand(&tmp) = q; 115 setexponent16(&tmp, 63);
117 setexponent16(&tmp, 63); 116 FPU_normalize(&tmp); /* This must return TAG_Valid */
118 FPU_normalize(&tmp); /* This must return TAG_Valid */ 117 tmptag =
119 tmptag = FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, 118 FPU_u_mul(&CONST_PI2extra, &tmp, &tmp,
120 SIGN_POS, 119 FULL_PRECISION, SIGN_POS,
121 exponent(&CONST_PI2extra) + exponent(&tmp)); 120 exponent(&CONST_PI2extra) +
122 setsign(&tmp, getsign(&CONST_PI2extra)); 121 exponent(&tmp));
123 st0_tag = FPU_sub(LOADED|(tmptag & 0x0f), (int)&tmp, 122 setsign(&tmp, getsign(&CONST_PI2extra));
124 FULL_PRECISION); 123 st0_tag = FPU_sub(LOADED | (tmptag & 0x0f), (int)&tmp,
125 if ( (exponent(st0_ptr) == exponent(&CONST_PI2)) && 124 FULL_PRECISION);
126 ((st0_ptr->sigh > CONST_PI2.sigh) 125 if ((exponent(st0_ptr) == exponent(&CONST_PI2)) &&
127 || ((st0_ptr->sigh == CONST_PI2.sigh) 126 ((st0_ptr->sigh > CONST_PI2.sigh)
128 && (st0_ptr->sigl > CONST_PI2.sigl))) ) 127 || ((st0_ptr->sigh == CONST_PI2.sigh)
129 { 128 && (st0_ptr->sigl > CONST_PI2.sigl)))) {
130 /* CONST_PI2extra is negative, so the result of the 129 /* CONST_PI2extra is negative, so the result of the
131 subtraction can be larger than pi/2. This means 130 subtraction can be larger than pi/2. This means
132 that the argument is actually in a different quadrant. 131 that the argument is actually in a different quadrant.
133 The correction is always < pi/2, so it can't overflow 132 The correction is always < pi/2, so it can't overflow
134 into yet another quadrant. */ 133 into yet another quadrant. */
135 st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, 134 st0_tag =
136 FULL_PRECISION); 135 FPU_sub(REV | LOADED | TAG_Valid,
137 q++; 136 (int)&CONST_PI2, FULL_PRECISION);
138 } 137 q++;
138 }
139 }
139 } 140 }
140 }
141#endif /* BETTER_THAN_486 */ 141#endif /* BETTER_THAN_486 */
142 142
143 FPU_settag0(st0_tag); 143 FPU_settag0(st0_tag);
144 control_word = old_cw; 144 control_word = old_cw;
145 partial_status = saved_status & ~SW_C2; /* Reduction complete. */ 145 partial_status = saved_status & ~SW_C2; /* Reduction complete. */
146 146
147 return (q & 3) | even; 147 return (q & 3) | even;
148} 148}
149 149
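
trig_arg() above reduces the argument modulo pi/2 and hands the low bits of the quotient back to fsin()/f_cos()/fptan() so they can pick the sign and co-function for the quadrant. The double-precision sketch below shows only that quotient/remainder step, on the assumption that ordinary doubles are accurate enough for illustration; it deliberately omits the FPU_sub(REV...) fold-over and the CONST_PI2extra correction the emulator applies for extra precision. Names are hypothetical; build with -lm.

/* Illustrative user-space sketch of the quadrant computation in trig_arg(). */
#include <stdio.h>
#include <math.h>

static const double PI_2 = 1.57079632679489661923;     /* pi/2 */

static int trig_arg_sketch(double x, int even, double *reduced)
{
        double q = floor(fabs(x) / PI_2);       /* whole quadrants, chopped */

        *reduced = fabs(x) - q * PI_2;          /* remainder in [0, pi/2) */
        return ((long)q & 3) | even;            /* quadrant bits for the caller */
}

int main(void)
{
        double r;
        int q = trig_arg_sketch(4.0, 0, &r);    /* 4.0 rad: third quadrant, q & 3 == 2 */

        printf("q & 3 = %d, reduced argument = %f\n", q & 3, r);
        return 0;
}
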
150
151/* Convert a long to register */ 150/* Convert a long to register */
152static void convert_l2reg(long const *arg, int deststnr) 151static void convert_l2reg(long const *arg, int deststnr)
153{ 152{
154 int tag; 153 int tag;
155 long num = *arg; 154 long num = *arg;
156 u_char sign; 155 u_char sign;
157 FPU_REG *dest = &st(deststnr); 156 FPU_REG *dest = &st(deststnr);
158
159 if (num == 0)
160 {
161 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
162 return;
163 }
164
165 if (num > 0)
166 { sign = SIGN_POS; }
167 else
168 { num = -num; sign = SIGN_NEG; }
169
170 dest->sigh = num;
171 dest->sigl = 0;
172 setexponent16(dest, 31);
173 tag = FPU_normalize(dest);
174 FPU_settagi(deststnr, tag);
175 setsign(dest, sign);
176 return;
177}
178 157
158 if (num == 0) {
159 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
160 return;
161 }
162
163 if (num > 0) {
164 sign = SIGN_POS;
165 } else {
166 num = -num;
167 sign = SIGN_NEG;
168 }
169
170 dest->sigh = num;
171 dest->sigl = 0;
172 setexponent16(dest, 31);
173 tag = FPU_normalize(dest);
174 FPU_settagi(deststnr, tag);
175 setsign(dest, sign);
176 return;
177}
179 178
180static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag) 179static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag)
181{ 180{
182 if ( st0_tag == TAG_Empty ) 181 if (st0_tag == TAG_Empty)
183 FPU_stack_underflow(); /* Puts a QNaN in st(0) */ 182 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
184 else if ( st0_tag == TW_NaN ) 183 else if (st0_tag == TW_NaN)
185 real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */ 184 real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */
186#ifdef PARANOID 185#ifdef PARANOID
187 else 186 else
188 EXCEPTION(EX_INTERNAL|0x0112); 187 EXCEPTION(EX_INTERNAL | 0x0112);
189#endif /* PARANOID */ 188#endif /* PARANOID */
190} 189}
191 190
192
193static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag) 191static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag)
194{ 192{
195 int isNaN; 193 int isNaN;
196 194
197 switch ( st0_tag ) 195 switch (st0_tag) {
198 { 196 case TW_NaN:
199 case TW_NaN: 197 isNaN = (exponent(st0_ptr) == EXP_OVER)
200 isNaN = (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000); 198 && (st0_ptr->sigh & 0x80000000);
201 if ( isNaN && !(st0_ptr->sigh & 0x40000000) ) /* Signaling ? */ 199 if (isNaN && !(st0_ptr->sigh & 0x40000000)) { /* Signaling ? */
202 { 200 EXCEPTION(EX_Invalid);
203 EXCEPTION(EX_Invalid); 201 if (control_word & CW_Invalid) {
204 if ( control_word & CW_Invalid ) 202 /* The masked response */
205 { 203 /* Convert to a QNaN */
206 /* The masked response */ 204 st0_ptr->sigh |= 0x40000000;
207 /* Convert to a QNaN */ 205 push();
208 st0_ptr->sigh |= 0x40000000; 206 FPU_copy_to_reg0(st0_ptr, TAG_Special);
209 push(); 207 }
210 FPU_copy_to_reg0(st0_ptr, TAG_Special); 208 } else if (isNaN) {
211 } 209 /* A QNaN */
212 } 210 push();
213 else if ( isNaN ) 211 FPU_copy_to_reg0(st0_ptr, TAG_Special);
214 { 212 } else {
215 /* A QNaN */ 213 /* pseudoNaN or other unsupported */
216 push(); 214 EXCEPTION(EX_Invalid);
217 FPU_copy_to_reg0(st0_ptr, TAG_Special); 215 if (control_word & CW_Invalid) {
218 } 216 /* The masked response */
219 else 217 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
220 { 218 push();
221 /* pseudoNaN or other unsupported */ 219 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
222 EXCEPTION(EX_Invalid); 220 }
223 if ( control_word & CW_Invalid ) 221 }
224 { 222 break; /* return with a NaN in st(0) */
225 /* The masked response */
226 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
227 push();
228 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
229 }
230 }
231 break; /* return with a NaN in st(0) */
232#ifdef PARANOID 223#ifdef PARANOID
233 default: 224 default:
234 EXCEPTION(EX_INTERNAL|0x0112); 225 EXCEPTION(EX_INTERNAL | 0x0112);
235#endif /* PARANOID */ 226#endif /* PARANOID */
236 } 227 }
237} 228}
238 229
239
240/*---------------------------------------------------------------------------*/ 230/*---------------------------------------------------------------------------*/
241 231
242static void f2xm1(FPU_REG *st0_ptr, u_char tag) 232static void f2xm1(FPU_REG *st0_ptr, u_char tag)
243{ 233{
244 FPU_REG a; 234 FPU_REG a;
245 235
246 clear_C1(); 236 clear_C1();
247 237
248 if ( tag == TAG_Valid ) 238 if (tag == TAG_Valid) {
249 { 239 /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */
250 /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */ 240 if (exponent(st0_ptr) < 0) {
251 if ( exponent(st0_ptr) < 0 ) 241 denormal_arg:
252 {
253 denormal_arg:
254 242
255 FPU_to_exp16(st0_ptr, &a); 243 FPU_to_exp16(st0_ptr, &a);
256 244
257 /* poly_2xm1(x) requires 0 < st(0) < 1. */ 245 /* poly_2xm1(x) requires 0 < st(0) < 1. */
258 poly_2xm1(getsign(st0_ptr), &a, st0_ptr); 246 poly_2xm1(getsign(st0_ptr), &a, st0_ptr);
247 }
248 set_precision_flag_up(); /* 80486 appears to always do this */
249 return;
259 } 250 }
260 set_precision_flag_up(); /* 80486 appears to always do this */
261 return;
262 }
263 251
264 if ( tag == TAG_Zero ) 252 if (tag == TAG_Zero)
265 return; 253 return;
266 254
267 if ( tag == TAG_Special ) 255 if (tag == TAG_Special)
268 tag = FPU_Special(st0_ptr); 256 tag = FPU_Special(st0_ptr);
269 257
270 switch ( tag ) 258 switch (tag) {
271 { 259 case TW_Denormal:
272 case TW_Denormal: 260 if (denormal_operand() < 0)
273 if ( denormal_operand() < 0 ) 261 return;
274 return; 262 goto denormal_arg;
275 goto denormal_arg; 263 case TW_Infinity:
276 case TW_Infinity: 264 if (signnegative(st0_ptr)) {
277 if ( signnegative(st0_ptr) ) 265 /* -infinity gives -1 (p16-10) */
278 { 266 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
279 /* -infinity gives -1 (p16-10) */ 267 setnegative(st0_ptr);
280 FPU_copy_to_reg0(&CONST_1, TAG_Valid); 268 }
281 setnegative(st0_ptr); 269 return;
270 default:
271 single_arg_error(st0_ptr, tag);
282 } 272 }
283 return;
284 default:
285 single_arg_error(st0_ptr, tag);
286 }
287} 273}
288 274
289
290static void fptan(FPU_REG *st0_ptr, u_char st0_tag) 275static void fptan(FPU_REG *st0_ptr, u_char st0_tag)
291{ 276{
292 FPU_REG *st_new_ptr; 277 FPU_REG *st_new_ptr;
293 int q; 278 int q;
294 u_char arg_sign = getsign(st0_ptr); 279 u_char arg_sign = getsign(st0_ptr);
295 280
296 /* Stack underflow has higher priority */ 281 /* Stack underflow has higher priority */
297 if ( st0_tag == TAG_Empty ) 282 if (st0_tag == TAG_Empty) {
298 { 283 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
299 FPU_stack_underflow(); /* Puts a QNaN in st(0) */ 284 if (control_word & CW_Invalid) {
300 if ( control_word & CW_Invalid ) 285 st_new_ptr = &st(-1);
301 { 286 push();
302 st_new_ptr = &st(-1); 287 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */
303 push(); 288 }
304 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ 289 return;
305 } 290 }
306 return; 291
307 } 292 if (STACK_OVERFLOW) {
308 293 FPU_stack_overflow();
309 if ( STACK_OVERFLOW ) 294 return;
310 { FPU_stack_overflow(); return; }
311
312 if ( st0_tag == TAG_Valid )
313 {
314 if ( exponent(st0_ptr) > -40 )
315 {
316 if ( (q = trig_arg(st0_ptr, 0)) == -1 )
317 {
318 /* Operand is out of range */
319 return;
320 }
321
322 poly_tan(st0_ptr);
323 setsign(st0_ptr, (q & 1) ^ (arg_sign != 0));
324 set_precision_flag_up(); /* We do not really know if up or down */
325 } 295 }
326 else
327 {
328 /* For a small arg, the result == the argument */
329 /* Underflow may happen */
330 296
331 denormal_arg: 297 if (st0_tag == TAG_Valid) {
298 if (exponent(st0_ptr) > -40) {
299 if ((q = trig_arg(st0_ptr, 0)) == -1) {
300 /* Operand is out of range */
301 return;
302 }
303
304 poly_tan(st0_ptr);
305 setsign(st0_ptr, (q & 1) ^ (arg_sign != 0));
306 set_precision_flag_up(); /* We do not really know if up or down */
307 } else {
308 /* For a small arg, the result == the argument */
309 /* Underflow may happen */
310
311 denormal_arg:
312
313 FPU_to_exp16(st0_ptr, st0_ptr);
332 314
333 FPU_to_exp16(st0_ptr, st0_ptr); 315 st0_tag =
334 316 FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
335 st0_tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); 317 FPU_settag0(st0_tag);
336 FPU_settag0(st0_tag); 318 }
319 push();
320 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
321 return;
337 } 322 }
338 push();
339 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
340 return;
341 }
342
343 if ( st0_tag == TAG_Zero )
344 {
345 push();
346 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
347 setcc(0);
348 return;
349 }
350
351 if ( st0_tag == TAG_Special )
352 st0_tag = FPU_Special(st0_ptr);
353
354 if ( st0_tag == TW_Denormal )
355 {
356 if ( denormal_operand() < 0 )
357 return;
358 323
359 goto denormal_arg; 324 if (st0_tag == TAG_Zero) {
360 } 325 push();
361 326 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
362 if ( st0_tag == TW_Infinity ) 327 setcc(0);
363 { 328 return;
364 /* The 80486 treats infinity as an invalid operand */ 329 }
365 if ( arith_invalid(0) >= 0 ) 330
366 { 331 if (st0_tag == TAG_Special)
367 st_new_ptr = &st(-1); 332 st0_tag = FPU_Special(st0_ptr);
368 push(); 333
369 arith_invalid(0); 334 if (st0_tag == TW_Denormal) {
335 if (denormal_operand() < 0)
336 return;
337
338 goto denormal_arg;
370 } 339 }
371 return;
372 }
373 340
374 single_arg_2_error(st0_ptr, st0_tag); 341 if (st0_tag == TW_Infinity) {
375} 342 /* The 80486 treats infinity as an invalid operand */
343 if (arith_invalid(0) >= 0) {
344 st_new_ptr = &st(-1);
345 push();
346 arith_invalid(0);
347 }
348 return;
349 }
376 350
351 single_arg_2_error(st0_ptr, st0_tag);
352}
377 353
378static void fxtract(FPU_REG *st0_ptr, u_char st0_tag) 354static void fxtract(FPU_REG *st0_ptr, u_char st0_tag)
379{ 355{
380 FPU_REG *st_new_ptr; 356 FPU_REG *st_new_ptr;
381 u_char sign; 357 u_char sign;
382 register FPU_REG *st1_ptr = st0_ptr; /* anticipate */ 358 register FPU_REG *st1_ptr = st0_ptr; /* anticipate */
383
384 if ( STACK_OVERFLOW )
385 { FPU_stack_overflow(); return; }
386
387 clear_C1();
388
389 if ( st0_tag == TAG_Valid )
390 {
391 long e;
392
393 push();
394 sign = getsign(st1_ptr);
395 reg_copy(st1_ptr, st_new_ptr);
396 setexponent16(st_new_ptr, exponent(st_new_ptr));
397
398 denormal_arg:
399
400 e = exponent16(st_new_ptr);
401 convert_l2reg(&e, 1);
402 setexponentpos(st_new_ptr, 0);
403 setsign(st_new_ptr, sign);
404 FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */
405 return;
406 }
407 else if ( st0_tag == TAG_Zero )
408 {
409 sign = getsign(st0_ptr);
410
411 if ( FPU_divide_by_zero(0, SIGN_NEG) < 0 )
412 return;
413 359
414 push(); 360 if (STACK_OVERFLOW) {
415 FPU_copy_to_reg0(&CONST_Z, TAG_Zero); 361 FPU_stack_overflow();
416 setsign(st_new_ptr, sign); 362 return;
417 return; 363 }
418 }
419 364
420 if ( st0_tag == TAG_Special ) 365 clear_C1();
421 st0_tag = FPU_Special(st0_ptr);
422 366
423 if ( st0_tag == TW_Denormal ) 367 if (st0_tag == TAG_Valid) {
424 { 368 long e;
425 if (denormal_operand() < 0 )
426 return;
427 369
428 push(); 370 push();
429 sign = getsign(st1_ptr); 371 sign = getsign(st1_ptr);
430 FPU_to_exp16(st1_ptr, st_new_ptr); 372 reg_copy(st1_ptr, st_new_ptr);
431 goto denormal_arg; 373 setexponent16(st_new_ptr, exponent(st_new_ptr));
432 } 374
433 else if ( st0_tag == TW_Infinity ) 375 denormal_arg:
434 { 376
435 sign = getsign(st0_ptr); 377 e = exponent16(st_new_ptr);
436 setpositive(st0_ptr); 378 convert_l2reg(&e, 1);
437 push(); 379 setexponentpos(st_new_ptr, 0);
438 FPU_copy_to_reg0(&CONST_INF, TAG_Special); 380 setsign(st_new_ptr, sign);
439 setsign(st_new_ptr, sign); 381 FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */
440 return; 382 return;
441 } 383 } else if (st0_tag == TAG_Zero) {
442 else if ( st0_tag == TW_NaN ) 384 sign = getsign(st0_ptr);
443 { 385
444 if ( real_1op_NaN(st0_ptr) < 0 ) 386 if (FPU_divide_by_zero(0, SIGN_NEG) < 0)
445 return; 387 return;
446 388
447 push(); 389 push();
448 FPU_copy_to_reg0(st0_ptr, TAG_Special); 390 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
449 return; 391 setsign(st_new_ptr, sign);
450 } 392 return;
451 else if ( st0_tag == TAG_Empty ) 393 }
452 { 394
453 /* Is this the correct behaviour? */ 395 if (st0_tag == TAG_Special)
454 if ( control_word & EX_Invalid ) 396 st0_tag = FPU_Special(st0_ptr);
455 { 397
456 FPU_stack_underflow(); 398 if (st0_tag == TW_Denormal) {
457 push(); 399 if (denormal_operand() < 0)
458 FPU_stack_underflow(); 400 return;
401
402 push();
403 sign = getsign(st1_ptr);
404 FPU_to_exp16(st1_ptr, st_new_ptr);
405 goto denormal_arg;
406 } else if (st0_tag == TW_Infinity) {
407 sign = getsign(st0_ptr);
408 setpositive(st0_ptr);
409 push();
410 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
411 setsign(st_new_ptr, sign);
412 return;
413 } else if (st0_tag == TW_NaN) {
414 if (real_1op_NaN(st0_ptr) < 0)
415 return;
416
417 push();
418 FPU_copy_to_reg0(st0_ptr, TAG_Special);
419 return;
420 } else if (st0_tag == TAG_Empty) {
421 /* Is this the correct behaviour? */
422 if (control_word & EX_Invalid) {
423 FPU_stack_underflow();
424 push();
425 FPU_stack_underflow();
426 } else
427 EXCEPTION(EX_StackUnder);
459 } 428 }
460 else
461 EXCEPTION(EX_StackUnder);
462 }
463#ifdef PARANOID 429#ifdef PARANOID
464 else 430 else
465 EXCEPTION(EX_INTERNAL | 0x119); 431 EXCEPTION(EX_INTERNAL | 0x119);
466#endif /* PARANOID */ 432#endif /* PARANOID */
467} 433}
468 434
469
470static void fdecstp(void) 435static void fdecstp(void)
471{ 436{
472 clear_C1(); 437 clear_C1();
473 top--; 438 top--;
474} 439}
475 440
476static void fincstp(void) 441static void fincstp(void)
477{ 442{
478 clear_C1(); 443 clear_C1();
479 top++; 444 top++;
480} 445}
481 446
482
483static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag) 447static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag)
484{ 448{
485 int expon; 449 int expon;
486 450
487 clear_C1(); 451 clear_C1();
488
489 if ( st0_tag == TAG_Valid )
490 {
491 u_char tag;
492
493 if (signnegative(st0_ptr))
494 {
495 arith_invalid(0); /* sqrt(negative) is invalid */
496 return;
497 }
498 452
499 /* make st(0) in [1.0 .. 4.0) */ 453 if (st0_tag == TAG_Valid) {
500 expon = exponent(st0_ptr); 454 u_char tag;
501 455
502 denormal_arg: 456 if (signnegative(st0_ptr)) {
503 457 arith_invalid(0); /* sqrt(negative) is invalid */
504 setexponent16(st0_ptr, (expon & 1)); 458 return;
505 459 }
506 /* Do the computation, the sign of the result will be positive. */ 460
507 tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS); 461 /* make st(0) in [1.0 .. 4.0) */
508 addexponent(st0_ptr, expon >> 1); 462 expon = exponent(st0_ptr);
509 FPU_settag0(tag); 463
510 return; 464 denormal_arg:
511 } 465
512 466 setexponent16(st0_ptr, (expon & 1));
513 if ( st0_tag == TAG_Zero ) 467
514 return; 468 /* Do the computation, the sign of the result will be positive. */
515 469 tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS);
516 if ( st0_tag == TAG_Special ) 470 addexponent(st0_ptr, expon >> 1);
517 st0_tag = FPU_Special(st0_ptr); 471 FPU_settag0(tag);
518 472 return;
519 if ( st0_tag == TW_Infinity )
520 {
521 if ( signnegative(st0_ptr) )
522 arith_invalid(0); /* sqrt(-Infinity) is invalid */
523 return;
524 }
525 else if ( st0_tag == TW_Denormal )
526 {
527 if (signnegative(st0_ptr))
528 {
529 arith_invalid(0); /* sqrt(negative) is invalid */
530 return;
531 } 473 }
532 474
533 if ( denormal_operand() < 0 ) 475 if (st0_tag == TAG_Zero)
534 return; 476 return;
535 477
536 FPU_to_exp16(st0_ptr, st0_ptr); 478 if (st0_tag == TAG_Special)
479 st0_tag = FPU_Special(st0_ptr);
537 480
538 expon = exponent16(st0_ptr); 481 if (st0_tag == TW_Infinity) {
482 if (signnegative(st0_ptr))
483 arith_invalid(0); /* sqrt(-Infinity) is invalid */
484 return;
485 } else if (st0_tag == TW_Denormal) {
486 if (signnegative(st0_ptr)) {
487 arith_invalid(0); /* sqrt(negative) is invalid */
488 return;
489 }
539 490
540 goto denormal_arg; 491 if (denormal_operand() < 0)
541 } 492 return;
542 493
543 single_arg_error(st0_ptr, st0_tag); 494 FPU_to_exp16(st0_ptr, st0_ptr);
544 495
545} 496 expon = exponent16(st0_ptr);
497
498 goto denormal_arg;
499 }
546 500
501 single_arg_error(st0_ptr, st0_tag);
502
503}
547 504
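
The exponent handling in fsqrt_() above relies on the identity sqrt(m * 2^e) = sqrt(m * 2^(e & 1)) * 2^(e >> 1): the low exponent bit stays with the significand so the value handed to wm_sqrt() lies in [1.0, 4.0), and half of the remaining exponent is added back afterwards. A small double-precision illustration with hypothetical names, assuming 1.0 <= m < 2.0 (build with -lm):

/* Double-precision illustration (not kernel code) of the fsqrt_() exponent trick. */
#include <stdio.h>
#include <math.h>

static double fsqrt_sketch(double m, int e)     /* x = m * 2^e */
{
        double root = sqrt(ldexp(m, e & 1));    /* argument is in [1.0, 4.0) */

        return ldexp(root, e >> 1);             /* put back half the exponent */
}

int main(void)
{
        double x = ldexp(1.5, 11);              /* 1.5 * 2^11 = 3072 */

        printf("fsqrt_sketch = %f, sqrt = %f\n", fsqrt_sketch(1.5, 11), sqrt(x));
        return 0;
}
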
548static void frndint_(FPU_REG *st0_ptr, u_char st0_tag) 505static void frndint_(FPU_REG *st0_ptr, u_char st0_tag)
549{ 506{
550 int flags, tag; 507 int flags, tag;
551 508
552 if ( st0_tag == TAG_Valid ) 509 if (st0_tag == TAG_Valid) {
553 { 510 u_char sign;
554 u_char sign;
555 511
556 denormal_arg: 512 denormal_arg:
557 513
558 sign = getsign(st0_ptr); 514 sign = getsign(st0_ptr);
559 515
560 if (exponent(st0_ptr) > 63) 516 if (exponent(st0_ptr) > 63)
561 return; 517 return;
518
519 if (st0_tag == TW_Denormal) {
520 if (denormal_operand() < 0)
521 return;
522 }
523
524 /* Fortunately, this can't overflow to 2^64 */
525 if ((flags = FPU_round_to_int(st0_ptr, st0_tag)))
526 set_precision_flag(flags);
562 527
563 if ( st0_tag == TW_Denormal ) 528 setexponent16(st0_ptr, 63);
564 { 529 tag = FPU_normalize(st0_ptr);
565 if (denormal_operand() < 0 ) 530 setsign(st0_ptr, sign);
566 return; 531 FPU_settag0(tag);
532 return;
567 } 533 }
568 534
569 /* Fortunately, this can't overflow to 2^64 */ 535 if (st0_tag == TAG_Zero)
570 if ( (flags = FPU_round_to_int(st0_ptr, st0_tag)) ) 536 return;
571 set_precision_flag(flags);
572
573 setexponent16(st0_ptr, 63);
574 tag = FPU_normalize(st0_ptr);
575 setsign(st0_ptr, sign);
576 FPU_settag0(tag);
577 return;
578 }
579
580 if ( st0_tag == TAG_Zero )
581 return;
582
583 if ( st0_tag == TAG_Special )
584 st0_tag = FPU_Special(st0_ptr);
585
586 if ( st0_tag == TW_Denormal )
587 goto denormal_arg;
588 else if ( st0_tag == TW_Infinity )
589 return;
590 else
591 single_arg_error(st0_ptr, st0_tag);
592}
593 537
538 if (st0_tag == TAG_Special)
539 st0_tag = FPU_Special(st0_ptr);
540
541 if (st0_tag == TW_Denormal)
542 goto denormal_arg;
543 else if (st0_tag == TW_Infinity)
544 return;
545 else
546 single_arg_error(st0_ptr, st0_tag);
547}
594 548
595static int fsin(FPU_REG *st0_ptr, u_char tag) 549static int fsin(FPU_REG *st0_ptr, u_char tag)
596{ 550{
597 u_char arg_sign = getsign(st0_ptr); 551 u_char arg_sign = getsign(st0_ptr);
598 552
599 if ( tag == TAG_Valid ) 553 if (tag == TAG_Valid) {
600 { 554 int q;
601 int q; 555
602 556 if (exponent(st0_ptr) > -40) {
603 if ( exponent(st0_ptr) > -40 ) 557 if ((q = trig_arg(st0_ptr, 0)) == -1) {
604 { 558 /* Operand is out of range */
605 if ( (q = trig_arg(st0_ptr, 0)) == -1 ) 559 return 1;
606 { 560 }
607 /* Operand is out of range */ 561
608 return 1; 562 poly_sine(st0_ptr);
609 } 563
610 564 if (q & 2)
611 poly_sine(st0_ptr); 565 changesign(st0_ptr);
612 566
613 if (q & 2) 567 setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign);
614 changesign(st0_ptr); 568
615 569 /* We do not really know if up or down */
616 setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign); 570 set_precision_flag_up();
617 571 return 0;
618 /* We do not really know if up or down */ 572 } else {
619 set_precision_flag_up(); 573 /* For a small arg, the result == the argument */
620 return 0; 574 set_precision_flag_up(); /* Must be up. */
575 return 0;
576 }
621 } 577 }
622 else 578
623 { 579 if (tag == TAG_Zero) {
624 /* For a small arg, the result == the argument */ 580 setcc(0);
625 set_precision_flag_up(); /* Must be up. */ 581 return 0;
626 return 0;
627 } 582 }
628 }
629
630 if ( tag == TAG_Zero )
631 {
632 setcc(0);
633 return 0;
634 }
635
636 if ( tag == TAG_Special )
637 tag = FPU_Special(st0_ptr);
638
639 if ( tag == TW_Denormal )
640 {
641 if ( denormal_operand() < 0 )
642 return 1;
643
644 /* For a small arg, the result == the argument */
645 /* Underflow may happen */
646 FPU_to_exp16(st0_ptr, st0_ptr);
647
648 tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
649
650 FPU_settag0(tag);
651
652 return 0;
653 }
654 else if ( tag == TW_Infinity )
655 {
656 /* The 80486 treats infinity as an invalid operand */
657 arith_invalid(0);
658 return 1;
659 }
660 else
661 {
662 single_arg_error(st0_ptr, tag);
663 return 1;
664 }
665}
666 583
584 if (tag == TAG_Special)
585 tag = FPU_Special(st0_ptr);
586
587 if (tag == TW_Denormal) {
588 if (denormal_operand() < 0)
589 return 1;
590
591 /* For a small arg, the result == the argument */
592 /* Underflow may happen */
593 FPU_to_exp16(st0_ptr, st0_ptr);
594
595 tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
596
597 FPU_settag0(tag);
598
599 return 0;
600 } else if (tag == TW_Infinity) {
601 /* The 80486 treats infinity as an invalid operand */
602 arith_invalid(0);
603 return 1;
604 } else {
605 single_arg_error(st0_ptr, tag);
606 return 1;
607 }
608}
667 609
668static int f_cos(FPU_REG *st0_ptr, u_char tag) 610static int f_cos(FPU_REG *st0_ptr, u_char tag)
669{ 611{
670 u_char st0_sign; 612 u_char st0_sign;
671 613
672 st0_sign = getsign(st0_ptr); 614 st0_sign = getsign(st0_ptr);
673
674 if ( tag == TAG_Valid )
675 {
676 int q;
677
678 if ( exponent(st0_ptr) > -40 )
679 {
680 if ( (exponent(st0_ptr) < 0)
681 || ((exponent(st0_ptr) == 0)
682 && (significand(st0_ptr) <= 0xc90fdaa22168c234LL)) )
683 {
684 poly_cos(st0_ptr);
685
686 /* We do not really know if up or down */
687 set_precision_flag_down();
688
689 return 0;
690 }
691 else if ( (q = trig_arg(st0_ptr, FCOS)) != -1 )
692 {
693 poly_sine(st0_ptr);
694
695 if ((q+1) & 2)
696 changesign(st0_ptr);
697
698 /* We do not really know if up or down */
699 set_precision_flag_down();
700
701 return 0;
702 }
703 else
704 {
705 /* Operand is out of range */
706 return 1;
707 }
708 }
709 else
710 {
711 denormal_arg:
712 615
713 setcc(0); 616 if (tag == TAG_Valid) {
714 FPU_copy_to_reg0(&CONST_1, TAG_Valid); 617 int q;
618
619 if (exponent(st0_ptr) > -40) {
620 if ((exponent(st0_ptr) < 0)
621 || ((exponent(st0_ptr) == 0)
622 && (significand(st0_ptr) <=
623 0xc90fdaa22168c234LL))) {
624 poly_cos(st0_ptr);
625
626 /* We do not really know if up or down */
627 set_precision_flag_down();
628
629 return 0;
630 } else if ((q = trig_arg(st0_ptr, FCOS)) != -1) {
631 poly_sine(st0_ptr);
632
633 if ((q + 1) & 2)
634 changesign(st0_ptr);
635
636 /* We do not really know if up or down */
637 set_precision_flag_down();
638
639 return 0;
640 } else {
641 /* Operand is out of range */
642 return 1;
643 }
644 } else {
645 denormal_arg:
646
647 setcc(0);
648 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
715#ifdef PECULIAR_486 649#ifdef PECULIAR_486
716 set_precision_flag_down(); /* 80486 appears to do this. */ 650 set_precision_flag_down(); /* 80486 appears to do this. */
717#else 651#else
718 set_precision_flag_up(); /* Must be up. */ 652 set_precision_flag_up(); /* Must be up. */
719#endif /* PECULIAR_486 */ 653#endif /* PECULIAR_486 */
720 return 0; 654 return 0;
655 }
656 } else if (tag == TAG_Zero) {
657 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
658 setcc(0);
659 return 0;
721 } 660 }
722 }
723 else if ( tag == TAG_Zero )
724 {
725 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
726 setcc(0);
727 return 0;
728 }
729
730 if ( tag == TAG_Special )
731 tag = FPU_Special(st0_ptr);
732
733 if ( tag == TW_Denormal )
734 {
735 if ( denormal_operand() < 0 )
736 return 1;
737
738 goto denormal_arg;
739 }
740 else if ( tag == TW_Infinity )
741 {
742 /* The 80486 treats infinity as an invalid operand */
743 arith_invalid(0);
744 return 1;
745 }
746 else
747 {
748 single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */
749 return 1;
750 }
751}
752 661
662 if (tag == TAG_Special)
663 tag = FPU_Special(st0_ptr);
664
665 if (tag == TW_Denormal) {
666 if (denormal_operand() < 0)
667 return 1;
668
669 goto denormal_arg;
670 } else if (tag == TW_Infinity) {
671 /* The 80486 treats infinity as an invalid operand */
672 arith_invalid(0);
673 return 1;
674 } else {
675 single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */
676 return 1;
677 }
678}
753 679
754static void fcos(FPU_REG *st0_ptr, u_char st0_tag) 680static void fcos(FPU_REG *st0_ptr, u_char st0_tag)
755{ 681{
756 f_cos(st0_ptr, st0_tag); 682 f_cos(st0_ptr, st0_tag);
757} 683}
758 684
759
760static void fsincos(FPU_REG *st0_ptr, u_char st0_tag) 685static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
761{ 686{
762 FPU_REG *st_new_ptr; 687 FPU_REG *st_new_ptr;
763 FPU_REG arg; 688 FPU_REG arg;
764 u_char tag; 689 u_char tag;
765 690
766 /* Stack underflow has higher priority */ 691 /* Stack underflow has higher priority */
767 if ( st0_tag == TAG_Empty ) 692 if (st0_tag == TAG_Empty) {
768 { 693 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
769 FPU_stack_underflow(); /* Puts a QNaN in st(0) */ 694 if (control_word & CW_Invalid) {
770 if ( control_word & CW_Invalid ) 695 st_new_ptr = &st(-1);
771 { 696 push();
772 st_new_ptr = &st(-1); 697 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */
773 push(); 698 }
774 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ 699 return;
775 } 700 }
776 return; 701
777 } 702 if (STACK_OVERFLOW) {
778 703 FPU_stack_overflow();
779 if ( STACK_OVERFLOW ) 704 return;
780 { FPU_stack_overflow(); return; }
781
782 if ( st0_tag == TAG_Special )
783 tag = FPU_Special(st0_ptr);
784 else
785 tag = st0_tag;
786
787 if ( tag == TW_NaN )
788 {
789 single_arg_2_error(st0_ptr, TW_NaN);
790 return;
791 }
792 else if ( tag == TW_Infinity )
793 {
794 /* The 80486 treats infinity as an invalid operand */
795 if ( arith_invalid(0) >= 0 )
796 {
797 /* Masked response */
798 push();
799 arith_invalid(0);
800 } 705 }
801 return;
802 }
803
804 reg_copy(st0_ptr, &arg);
805 if ( !fsin(st0_ptr, st0_tag) )
806 {
807 push();
808 FPU_copy_to_reg0(&arg, st0_tag);
809 f_cos(&st(0), st0_tag);
810 }
811 else
812 {
813 /* An error, so restore st(0) */
814 FPU_copy_to_reg0(&arg, st0_tag);
815 }
816}
817 706
707 if (st0_tag == TAG_Special)
708 tag = FPU_Special(st0_ptr);
709 else
710 tag = st0_tag;
711
712 if (tag == TW_NaN) {
713 single_arg_2_error(st0_ptr, TW_NaN);
714 return;
715 } else if (tag == TW_Infinity) {
716 /* The 80486 treats infinity as an invalid operand */
717 if (arith_invalid(0) >= 0) {
718 /* Masked response */
719 push();
720 arith_invalid(0);
721 }
722 return;
723 }
724
725 reg_copy(st0_ptr, &arg);
726 if (!fsin(st0_ptr, st0_tag)) {
727 push();
728 FPU_copy_to_reg0(&arg, st0_tag);
729 f_cos(&st(0), st0_tag);
730 } else {
731 /* An error, so restore st(0) */
732 FPU_copy_to_reg0(&arg, st0_tag);
733 }
734}
818 735
819/*---------------------------------------------------------------------------*/ 736/*---------------------------------------------------------------------------*/
820/* The following all require two arguments: st(0) and st(1) */ 737/* The following all require two arguments: st(0) and st(1) */
@@ -826,1020 +743,901 @@ static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
826 result must be zero. 743 result must be zero.
827 */ 744 */
828static void rem_kernel(unsigned long long st0, unsigned long long *y, 745static void rem_kernel(unsigned long long st0, unsigned long long *y,
829 unsigned long long st1, 746 unsigned long long st1, unsigned long long q, int n)
830 unsigned long long q, int n)
831{ 747{
832 int dummy; 748 int dummy;
833 unsigned long long x; 749 unsigned long long x;
834 750
835 x = st0 << n; 751 x = st0 << n;
836 752
837 /* Do the required multiplication and subtraction in the one operation */ 753 /* Do the required multiplication and subtraction in the one operation */
838 754
839 /* lsw x -= lsw st1 * lsw q */ 755 /* lsw x -= lsw st1 * lsw q */
840 asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1" 756 asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1":"=m"
841 :"=m" (((unsigned *)&x)[0]), "=m" (((unsigned *)&x)[1]), 757 (((unsigned *)&x)[0]), "=m"(((unsigned *)&x)[1]),
842 "=a" (dummy) 758 "=a"(dummy)
843 :"2" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[0]) 759 :"2"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[0])
844 :"%dx"); 760 :"%dx");
845 /* msw x -= msw st1 * lsw q */ 761 /* msw x -= msw st1 * lsw q */
846 asm volatile ("mull %3; subl %%eax,%0" 762 asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]),
847 :"=m" (((unsigned *)&x)[1]), "=a" (dummy) 763 "=a"(dummy)
848 :"1" (((unsigned *)&st1)[1]), "m" (((unsigned *)&q)[0]) 764 :"1"(((unsigned *)&st1)[1]), "m"(((unsigned *)&q)[0])
849 :"%dx"); 765 :"%dx");
850 /* msw x -= lsw st1 * msw q */ 766 /* msw x -= lsw st1 * msw q */
851 asm volatile ("mull %3; subl %%eax,%0" 767 asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]),
852 :"=m" (((unsigned *)&x)[1]), "=a" (dummy) 768 "=a"(dummy)
853 :"1" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[1]) 769 :"1"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[1])
854 :"%dx"); 770 :"%dx");
855 771
856 *y = x; 772 *y = x;
857} 773}
858 774
859
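
The three inline multiplies in rem_kernel() above compute the low 64 bits of (st0 << n) - st1 * q; the high bits are known to cancel because the exact remainder fits in 64 bits, which is why the msw * msw partial product never appears. A portable user-space equivalent, assuming n < 64 and using an illustrative name:

/*
 * Portable sketch (not the kernel's inline asm): wrapping uint64_t
 * arithmetic gives the same low-64-bit result that the hand-written
 * 32x32 partial products in rem_kernel() accumulate.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t rem_kernel_portable(uint64_t st0, uint64_t st1,
                                    uint64_t q, int n)
{
        return (st0 << n) - st1 * q;            /* mod 2^64, like the asm */
}

int main(void)
{
        /* 100 = 7 * 14 + 2: small integers stand in for the significands */
        printf("%llu\n",
               (unsigned long long)rem_kernel_portable(100, 7, 14, 0));
        return 0;
}
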
860/* Remainder of st(0) / st(1) */ 775/* Remainder of st(0) / st(1) */
861/* This routine produces exact results, i.e. there is never any 776/* This routine produces exact results, i.e. there is never any
862 rounding or truncation, etc of the result. */ 777 rounding or truncation, etc of the result. */
863static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round) 778static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round)
864{ 779{
865 FPU_REG *st1_ptr = &st(1); 780 FPU_REG *st1_ptr = &st(1);
866 u_char st1_tag = FPU_gettagi(1); 781 u_char st1_tag = FPU_gettagi(1);
867 782
868 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) 783 if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
869 { 784 FPU_REG tmp, st0, st1;
870 FPU_REG tmp, st0, st1; 785 u_char st0_sign, st1_sign;
871 u_char st0_sign, st1_sign; 786 u_char tmptag;
872 u_char tmptag; 787 int tag;
873 int tag; 788 int old_cw;
874 int old_cw; 789 int expdif;
875 int expdif; 790 long long q;
876 long long q; 791 unsigned short saved_status;
877 unsigned short saved_status; 792 int cc;
878 int cc; 793
879 794 fprem_valid:
880 fprem_valid: 795 /* Convert registers for internal use. */
881 /* Convert registers for internal use. */ 796 st0_sign = FPU_to_exp16(st0_ptr, &st0);
882 st0_sign = FPU_to_exp16(st0_ptr, &st0); 797 st1_sign = FPU_to_exp16(st1_ptr, &st1);
883 st1_sign = FPU_to_exp16(st1_ptr, &st1); 798 expdif = exponent16(&st0) - exponent16(&st1);
884 expdif = exponent16(&st0) - exponent16(&st1); 799
885 800 old_cw = control_word;
886 old_cw = control_word; 801 cc = 0;
887 cc = 0; 802
888 803 /* We want the status following the denorm tests, but don't want
889 /* We want the status following the denorm tests, but don't want 804 the status changed by the arithmetic operations. */
890 the status changed by the arithmetic operations. */ 805 saved_status = partial_status;
891 saved_status = partial_status; 806 control_word &= ~CW_RC;
892 control_word &= ~CW_RC; 807 control_word |= RC_CHOP;
893 control_word |= RC_CHOP; 808
894 809 if (expdif < 64) {
895 if ( expdif < 64 ) 810 /* This should be the most common case */
896 { 811
897 /* This should be the most common case */ 812 if (expdif > -2) {
898 813 u_char sign = st0_sign ^ st1_sign;
899 if ( expdif > -2 ) 814 tag = FPU_u_div(&st0, &st1, &tmp,
900 { 815 PR_64_BITS | RC_CHOP | 0x3f,
901 u_char sign = st0_sign ^ st1_sign; 816 sign);
902 tag = FPU_u_div(&st0, &st1, &tmp, 817 setsign(&tmp, sign);
903 PR_64_BITS | RC_CHOP | 0x3f, 818
904 sign); 819 if (exponent(&tmp) >= 0) {
905 setsign(&tmp, sign); 820 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't
906 821 overflow to 2^64 */
907 if ( exponent(&tmp) >= 0 ) 822 q = significand(&tmp);
908 { 823
909 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't 824 rem_kernel(significand(&st0),
910 overflow to 2^64 */ 825 &significand(&tmp),
911 q = significand(&tmp); 826 significand(&st1),
912 827 q, expdif);
913 rem_kernel(significand(&st0), 828
914 &significand(&tmp), 829 setexponent16(&tmp, exponent16(&st1));
915 significand(&st1), 830 } else {
916 q, expdif); 831 reg_copy(&st0, &tmp);
917 832 q = 0;
918 setexponent16(&tmp, exponent16(&st1)); 833 }
919 } 834
920 else 835 if ((round == RC_RND)
921 { 836 && (tmp.sigh & 0xc0000000)) {
922 reg_copy(&st0, &tmp); 837 /* We may need to subtract st(1) once more,
923 q = 0; 838 to get a result <= 1/2 of st(1). */
924 } 839 unsigned long long x;
925 840 expdif =
926 if ( (round == RC_RND) && (tmp.sigh & 0xc0000000) ) 841 exponent16(&st1) - exponent16(&tmp);
927 { 842 if (expdif <= 1) {
928 /* We may need to subtract st(1) once more, 843 if (expdif == 0)
929 to get a result <= 1/2 of st(1). */ 844 x = significand(&st1) -
930 unsigned long long x; 845 significand(&tmp);
931 expdif = exponent16(&st1) - exponent16(&tmp); 846 else /* expdif is 1 */
932 if ( expdif <= 1 ) 847 x = (significand(&st1)
933 { 848 << 1) -
934 if ( expdif == 0 ) 849 significand(&tmp);
935 x = significand(&st1) - significand(&tmp); 850 if ((x < significand(&tmp)) ||
936 else /* expdif is 1 */ 851 /* or equi-distant (from 0 & st(1)) and q is odd */
937 x = (significand(&st1) << 1) - significand(&tmp); 852 ((x == significand(&tmp))
938 if ( (x < significand(&tmp)) || 853 && (q & 1))) {
939 /* or equi-distant (from 0 & st(1)) and q is odd */ 854 st0_sign = !st0_sign;
940 ((x == significand(&tmp)) && (q & 1) ) ) 855 significand(&tmp) = x;
941 { 856 q++;
942 st0_sign = ! st0_sign; 857 }
943 significand(&tmp) = x; 858 }
944 q++; 859 }
860
861 if (q & 4)
862 cc |= SW_C0;
863 if (q & 2)
864 cc |= SW_C3;
865 if (q & 1)
866 cc |= SW_C1;
867 } else {
868 control_word = old_cw;
869 setcc(0);
870 return;
945 } 871 }
946 } 872 } else {
947 } 873 /* There is a large exponent difference ( >= 64 ) */
948 874 /* To make much sense, the code in this section should
949 if (q & 4) cc |= SW_C0; 875 be done at high precision. */
950 if (q & 2) cc |= SW_C3; 876 int exp_1, N;
951 if (q & 1) cc |= SW_C1; 877 u_char sign;
952 } 878
953 else 879 /* prevent overflow here */
954 { 880 /* N is 'a number between 32 and 63' (p26-113) */
955 control_word = old_cw; 881 reg_copy(&st0, &tmp);
956 setcc(0); 882 tmptag = st0_tag;
957 return; 883 N = (expdif & 0x0000001f) + 32; /* This choice gives results
958 } 884 identical to an AMD 486 */
959 } 885 setexponent16(&tmp, N);
960 else 886 exp_1 = exponent16(&st1);
961 { 887 setexponent16(&st1, 0);
962 /* There is a large exponent difference ( >= 64 ) */ 888 expdif -= N;
963 /* To make much sense, the code in this section should 889
964 be done at high precision. */ 890 sign = getsign(&tmp) ^ st1_sign;
965 int exp_1, N; 891 tag =
966 u_char sign; 892 FPU_u_div(&tmp, &st1, &tmp,
967 893 PR_64_BITS | RC_CHOP | 0x3f, sign);
968 /* prevent overflow here */ 894 setsign(&tmp, sign);
969 /* N is 'a number between 32 and 63' (p26-113) */ 895
970 reg_copy(&st0, &tmp); 896 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't
971 tmptag = st0_tag; 897 overflow to 2^64 */
972 N = (expdif & 0x0000001f) + 32; /* This choice gives results 898
973 identical to an AMD 486 */ 899 rem_kernel(significand(&st0),
974 setexponent16(&tmp, N); 900 &significand(&tmp),
975 exp_1 = exponent16(&st1); 901 significand(&st1),
976 setexponent16(&st1, 0); 902 significand(&tmp), exponent(&tmp)
977 expdif -= N; 903 );
978 904 setexponent16(&tmp, exp_1 + expdif);
979 sign = getsign(&tmp) ^ st1_sign; 905
980 tag = FPU_u_div(&tmp, &st1, &tmp, PR_64_BITS | RC_CHOP | 0x3f, 906 /* It is possible for the operation to be complete here.
981 sign); 907 What does the IEEE standard say? The Intel 80486 manual
982 setsign(&tmp, sign); 908 implies that the operation will never be completed at this
983 909 point, and the behaviour of a real 80486 confirms this.
984 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't 910 */
985 overflow to 2^64 */ 911 if (!(tmp.sigh | tmp.sigl)) {
986 912 /* The result is zero */
987 rem_kernel(significand(&st0), 913 control_word = old_cw;
988 &significand(&tmp), 914 partial_status = saved_status;
989 significand(&st1), 915 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
990 significand(&tmp), 916 setsign(&st0, st0_sign);
991 exponent(&tmp)
992 );
993 setexponent16(&tmp, exp_1 + expdif);
994
995 /* It is possible for the operation to be complete here.
996 What does the IEEE standard say? The Intel 80486 manual
997 implies that the operation will never be completed at this
998 point, and the behaviour of a real 80486 confirms this.
999 */
1000 if ( !(tmp.sigh | tmp.sigl) )
1001 {
1002 /* The result is zero */
1003 control_word = old_cw;
1004 partial_status = saved_status;
1005 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1006 setsign(&st0, st0_sign);
1007#ifdef PECULIAR_486 917#ifdef PECULIAR_486
1008 setcc(SW_C2); 918 setcc(SW_C2);
1009#else 919#else
1010 setcc(0); 920 setcc(0);
1011#endif /* PECULIAR_486 */ 921#endif /* PECULIAR_486 */
1012 return; 922 return;
1013 } 923 }
1014 cc = SW_C2; 924 cc = SW_C2;
1015 } 925 }
1016 926
1017 control_word = old_cw; 927 control_word = old_cw;
1018 partial_status = saved_status; 928 partial_status = saved_status;
1019 tag = FPU_normalize_nuo(&tmp); 929 tag = FPU_normalize_nuo(&tmp);
1020 reg_copy(&tmp, st0_ptr); 930 reg_copy(&tmp, st0_ptr);
1021 931
1022 /* The only condition to be looked for is underflow, 932 /* The only condition to be looked for is underflow,
1023 and it can occur here only if underflow is unmasked. */ 933 and it can occur here only if underflow is unmasked. */
1024 if ( (exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero) 934 if ((exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero)
1025 && !(control_word & CW_Underflow) ) 935 && !(control_word & CW_Underflow)) {
1026 { 936 setcc(cc);
1027 setcc(cc); 937 tag = arith_underflow(st0_ptr);
1028 tag = arith_underflow(st0_ptr); 938 setsign(st0_ptr, st0_sign);
1029 setsign(st0_ptr, st0_sign); 939 FPU_settag0(tag);
1030 FPU_settag0(tag); 940 return;
1031 return; 941 } else if ((exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero)) {
1032 } 942 stdexp(st0_ptr);
1033 else if ( (exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero) ) 943 setsign(st0_ptr, st0_sign);
1034 { 944 } else {
1035 stdexp(st0_ptr); 945 tag =
1036 setsign(st0_ptr, st0_sign); 946 FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign);
1037 } 947 }
1038 else 948 FPU_settag0(tag);
1039 { 949 setcc(cc);
1040 tag = FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign);
1041 }
1042 FPU_settag0(tag);
1043 setcc(cc);
1044 950
1045 return; 951 return;
1046 } 952 }
1047 953
1048 if ( st0_tag == TAG_Special ) 954 if (st0_tag == TAG_Special)
1049 st0_tag = FPU_Special(st0_ptr); 955 st0_tag = FPU_Special(st0_ptr);
1050 if ( st1_tag == TAG_Special ) 956 if (st1_tag == TAG_Special)
1051 st1_tag = FPU_Special(st1_ptr); 957 st1_tag = FPU_Special(st1_ptr);
1052 958
1053 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) 959 if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1054 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) 960 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1055 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) 961 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
1056 { 962 if (denormal_operand() < 0)
1057 if ( denormal_operand() < 0 ) 963 return;
1058 return; 964 goto fprem_valid;
1059 goto fprem_valid; 965 } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
1060 } 966 FPU_stack_underflow();
1061 else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) 967 return;
1062 { 968 } else if (st0_tag == TAG_Zero) {
1063 FPU_stack_underflow(); 969 if (st1_tag == TAG_Valid) {
1064 return; 970 setcc(0);
1065 } 971 return;
1066 else if ( st0_tag == TAG_Zero ) 972 } else if (st1_tag == TW_Denormal) {
1067 { 973 if (denormal_operand() < 0)
1068 if ( st1_tag == TAG_Valid ) 974 return;
1069 { 975 setcc(0);
1070 setcc(0); return; 976 return;
1071 } 977 } else if (st1_tag == TAG_Zero) {
1072 else if ( st1_tag == TW_Denormal ) 978 arith_invalid(0);
1073 { 979 return;
1074 if ( denormal_operand() < 0 ) 980 } /* fprem(?,0) always invalid */
1075 return; 981 else if (st1_tag == TW_Infinity) {
1076 setcc(0); return; 982 setcc(0);
1077 } 983 return;
1078 else if ( st1_tag == TAG_Zero ) 984 }
1079 { arith_invalid(0); return; } /* fprem(?,0) always invalid */ 985 } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
1080 else if ( st1_tag == TW_Infinity ) 986 if (st1_tag == TAG_Zero) {
1081 { setcc(0); return; } 987 arith_invalid(0); /* fprem(Valid,Zero) is invalid */
1082 } 988 return;
1083 else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) 989 } else if (st1_tag != TW_NaN) {
1084 { 990 if (((st0_tag == TW_Denormal)
1085 if ( st1_tag == TAG_Zero ) 991 || (st1_tag == TW_Denormal))
1086 { 992 && (denormal_operand() < 0))
1087 arith_invalid(0); /* fprem(Valid,Zero) is invalid */ 993 return;
1088 return; 994
1089 } 995 if (st1_tag == TW_Infinity) {
1090 else if ( st1_tag != TW_NaN ) 996 /* fprem(Valid,Infinity) is o.k. */
1091 { 997 setcc(0);
1092 if ( ((st0_tag == TW_Denormal) || (st1_tag == TW_Denormal)) 998 return;
1093 && (denormal_operand() < 0) ) 999 }
1094 return; 1000 }
1095 1001 } else if (st0_tag == TW_Infinity) {
1096 if ( st1_tag == TW_Infinity ) 1002 if (st1_tag != TW_NaN) {
1097 { 1003 arith_invalid(0); /* fprem(Infinity,?) is invalid */
1098 /* fprem(Valid,Infinity) is o.k. */ 1004 return;
1099 setcc(0); return; 1005 }
1100 }
1101 }
1102 }
1103 else if ( st0_tag == TW_Infinity )
1104 {
1105 if ( st1_tag != TW_NaN )
1106 {
1107 arith_invalid(0); /* fprem(Infinity,?) is invalid */
1108 return;
1109 } 1006 }
1110 }
1111 1007
1112 /* One of the registers must contain a NaN if we got here. */ 1008 /* One of the registers must contain a NaN if we got here. */
1113 1009
1114#ifdef PARANOID 1010#ifdef PARANOID
1115 if ( (st0_tag != TW_NaN) && (st1_tag != TW_NaN) ) 1011 if ((st0_tag != TW_NaN) && (st1_tag != TW_NaN))
1116 EXCEPTION(EX_INTERNAL | 0x118); 1012 EXCEPTION(EX_INTERNAL | 0x118);
1117#endif /* PARANOID */ 1013#endif /* PARANOID */
1118 1014
1119 real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr); 1015 real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr);
1120 1016
1121} 1017}
1122 1018
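The do_fprem() routine above reduces st(0) modulo st(1) exactly, reports the low three quotient bits in C0/C3/C1, and sets C2 when the reduction is only partial (exponent difference >= 64). A minimal user-space sketch of the same idea, with doubles standing in for the emulator's 80-bit registers (not part of this patch; every name below is invented for the illustration):

#include <math.h>
#include <stdio.h>

/* Sketch only: FPREM chops the quotient toward zero (RC_CHOP), FPREM1
 * rounds it to nearest (RC_RND).  The real do_fprem() also handles
 * partial reduction (C2 set) and 80-bit significands; this does not. */
static double fprem_sketch(double st0, double st1, int round_nearest,
			   unsigned *qbits)
{
	double q = st0 / st1;
	long long iq = round_nearest ? llround(q) : (long long)q;

	*qbits = (unsigned)(iq & 7);	/* the bits that land in C0/C3/C1 */
	return st0 - (double)iq * st1;
}

int main(void)
{
	unsigned qbits;

	printf("%f\n", fprem_sketch(17.5, 5.0, 0, &qbits)); /*  2.5, q = 3 */
	printf("%f\n", fprem_sketch(17.5, 5.0, 1, &qbits)); /* -2.5, q = 4 */
	return 0;
}
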
1123
1124/* ST(1) <- ST(1) * log ST; pop ST */ 1019/* ST(1) <- ST(1) * log ST; pop ST */
1125static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag) 1020static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag)
1126{ 1021{
1127 FPU_REG *st1_ptr = &st(1), exponent; 1022 FPU_REG *st1_ptr = &st(1), exponent;
1128 u_char st1_tag = FPU_gettagi(1); 1023 u_char st1_tag = FPU_gettagi(1);
1129 u_char sign; 1024 u_char sign;
1130 int e, tag; 1025 int e, tag;
1131 1026
1132 clear_C1(); 1027 clear_C1();
1133 1028
1134 if ( (st0_tag == TAG_Valid) && (st1_tag == TAG_Valid) ) 1029 if ((st0_tag == TAG_Valid) && (st1_tag == TAG_Valid)) {
1135 { 1030 both_valid:
1136 both_valid: 1031 /* Both regs are Valid or Denormal */
1137 /* Both regs are Valid or Denormal */ 1032 if (signpositive(st0_ptr)) {
1138 if ( signpositive(st0_ptr) ) 1033 if (st0_tag == TW_Denormal)
1139 { 1034 FPU_to_exp16(st0_ptr, st0_ptr);
1140 if ( st0_tag == TW_Denormal ) 1035 else
1141 FPU_to_exp16(st0_ptr, st0_ptr); 1036 /* Convert st(0) for internal use. */
1142 else 1037 setexponent16(st0_ptr, exponent(st0_ptr));
1143 /* Convert st(0) for internal use. */ 1038
1144 setexponent16(st0_ptr, exponent(st0_ptr)); 1039 if ((st0_ptr->sigh == 0x80000000)
1145 1040 && (st0_ptr->sigl == 0)) {
1146 if ( (st0_ptr->sigh == 0x80000000) && (st0_ptr->sigl == 0) ) 1041 /* Special case. The result can be precise. */
1147 { 1042 u_char esign;
1148 /* Special case. The result can be precise. */ 1043 e = exponent16(st0_ptr);
1149 u_char esign; 1044 if (e >= 0) {
1150 e = exponent16(st0_ptr); 1045 exponent.sigh = e;
1151 if ( e >= 0 ) 1046 esign = SIGN_POS;
1152 { 1047 } else {
1153 exponent.sigh = e; 1048 exponent.sigh = -e;
1154 esign = SIGN_POS; 1049 esign = SIGN_NEG;
1155 } 1050 }
1156 else 1051 exponent.sigl = 0;
1157 { 1052 setexponent16(&exponent, 31);
1158 exponent.sigh = -e; 1053 tag = FPU_normalize_nuo(&exponent);
1159 esign = SIGN_NEG; 1054 stdexp(&exponent);
1055 setsign(&exponent, esign);
1056 tag =
1057 FPU_mul(&exponent, tag, 1, FULL_PRECISION);
1058 if (tag >= 0)
1059 FPU_settagi(1, tag);
1060 } else {
1061 /* The usual case */
1062 sign = getsign(st1_ptr);
1063 if (st1_tag == TW_Denormal)
1064 FPU_to_exp16(st1_ptr, st1_ptr);
1065 else
1066 /* Convert st(1) for internal use. */
1067 setexponent16(st1_ptr,
1068 exponent(st1_ptr));
1069 poly_l2(st0_ptr, st1_ptr, sign);
1070 }
1071 } else {
1072 /* negative */
1073 if (arith_invalid(1) < 0)
1074 return;
1160 } 1075 }
1161 exponent.sigl = 0;
1162 setexponent16(&exponent, 31);
1163 tag = FPU_normalize_nuo(&exponent);
1164 stdexp(&exponent);
1165 setsign(&exponent, esign);
1166 tag = FPU_mul(&exponent, tag, 1, FULL_PRECISION);
1167 if ( tag >= 0 )
1168 FPU_settagi(1, tag);
1169 }
1170 else
1171 {
1172 /* The usual case */
1173 sign = getsign(st1_ptr);
1174 if ( st1_tag == TW_Denormal )
1175 FPU_to_exp16(st1_ptr, st1_ptr);
1176 else
1177 /* Convert st(1) for internal use. */
1178 setexponent16(st1_ptr, exponent(st1_ptr));
1179 poly_l2(st0_ptr, st1_ptr, sign);
1180 }
1181 }
1182 else
1183 {
1184 /* negative */
1185 if ( arith_invalid(1) < 0 )
1186 return;
1187 }
1188 1076
1189 FPU_pop(); 1077 FPU_pop();
1190
1191 return;
1192 }
1193
1194 if ( st0_tag == TAG_Special )
1195 st0_tag = FPU_Special(st0_ptr);
1196 if ( st1_tag == TAG_Special )
1197 st1_tag = FPU_Special(st1_ptr);
1198
1199 if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
1200 {
1201 FPU_stack_underflow_pop(1);
1202 return;
1203 }
1204 else if ( (st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal) )
1205 {
1206 if ( st0_tag == TAG_Zero )
1207 {
1208 if ( st1_tag == TAG_Zero )
1209 {
1210 /* Both args zero is invalid */
1211 if ( arith_invalid(1) < 0 )
1212 return;
1213 }
1214 else
1215 {
1216 u_char sign;
1217 sign = getsign(st1_ptr)^SIGN_NEG;
1218 if ( FPU_divide_by_zero(1, sign) < 0 )
1219 return;
1220 1078
1221 setsign(st1_ptr, sign);
1222 }
1223 }
1224 else if ( st1_tag == TAG_Zero )
1225 {
1226 /* st(1) contains zero, st(0) valid <> 0 */
1227 /* Zero is the valid answer */
1228 sign = getsign(st1_ptr);
1229
1230 if ( signnegative(st0_ptr) )
1231 {
1232 /* log(negative) */
1233 if ( arith_invalid(1) < 0 )
1234 return; 1079 return;
1235 }
1236 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1237 return;
1238 else
1239 {
1240 if ( exponent(st0_ptr) < 0 )
1241 sign ^= SIGN_NEG;
1242
1243 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1244 setsign(st1_ptr, sign);
1245 }
1246 } 1080 }
1247 else
1248 {
1249 /* One or both operands are denormals. */
1250 if ( denormal_operand() < 0 )
1251 return;
1252 goto both_valid;
1253 }
1254 }
1255 else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) )
1256 {
1257 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1258 return;
1259 }
1260 /* One or both arg must be an infinity */
1261 else if ( st0_tag == TW_Infinity )
1262 {
1263 if ( (signnegative(st0_ptr)) || (st1_tag == TAG_Zero) )
1264 {
1265 /* log(-infinity) or 0*log(infinity) */
1266 if ( arith_invalid(1) < 0 )
1267 return;
1268 }
1269 else
1270 {
1271 u_char sign = getsign(st1_ptr);
1272 1081
1273 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) 1082 if (st0_tag == TAG_Special)
1274 return; 1083 st0_tag = FPU_Special(st0_ptr);
1084 if (st1_tag == TAG_Special)
1085 st1_tag = FPU_Special(st1_ptr);
1275 1086
1276 FPU_copy_to_reg1(&CONST_INF, TAG_Special); 1087 if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
1277 setsign(st1_ptr, sign); 1088 FPU_stack_underflow_pop(1);
1278 }
1279 }
1280 /* st(1) must be infinity here */
1281 else if ( ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal))
1282 && ( signpositive(st0_ptr) ) )
1283 {
1284 if ( exponent(st0_ptr) >= 0 )
1285 {
1286 if ( (exponent(st0_ptr) == 0) &&
1287 (st0_ptr->sigh == 0x80000000) &&
1288 (st0_ptr->sigl == 0) )
1289 {
1290 /* st(0) holds 1.0 */
1291 /* infinity*log(1) */
1292 if ( arith_invalid(1) < 0 )
1293 return; 1089 return;
1294 } 1090 } else if ((st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal)) {
1295 /* else st(0) is positive and > 1.0 */ 1091 if (st0_tag == TAG_Zero) {
1092 if (st1_tag == TAG_Zero) {
1093 /* Both args zero is invalid */
1094 if (arith_invalid(1) < 0)
1095 return;
1096 } else {
1097 u_char sign;
1098 sign = getsign(st1_ptr) ^ SIGN_NEG;
1099 if (FPU_divide_by_zero(1, sign) < 0)
1100 return;
1101
1102 setsign(st1_ptr, sign);
1103 }
1104 } else if (st1_tag == TAG_Zero) {
1105 /* st(1) contains zero, st(0) valid <> 0 */
1106 /* Zero is the valid answer */
1107 sign = getsign(st1_ptr);
1108
1109 if (signnegative(st0_ptr)) {
1110 /* log(negative) */
1111 if (arith_invalid(1) < 0)
1112 return;
1113 } else if ((st0_tag == TW_Denormal)
1114 && (denormal_operand() < 0))
1115 return;
1116 else {
1117 if (exponent(st0_ptr) < 0)
1118 sign ^= SIGN_NEG;
1119
1120 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1121 setsign(st1_ptr, sign);
1122 }
1123 } else {
1124 /* One or both operands are denormals. */
1125 if (denormal_operand() < 0)
1126 return;
1127 goto both_valid;
1128 }
1129 } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) {
1130 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
1131 return;
1132 }
1133 /* One or both arg must be an infinity */
1134 else if (st0_tag == TW_Infinity) {
1135 if ((signnegative(st0_ptr)) || (st1_tag == TAG_Zero)) {
1136 /* log(-infinity) or 0*log(infinity) */
1137 if (arith_invalid(1) < 0)
1138 return;
1139 } else {
1140 u_char sign = getsign(st1_ptr);
1141
1142 if ((st1_tag == TW_Denormal)
1143 && (denormal_operand() < 0))
1144 return;
1145
1146 FPU_copy_to_reg1(&CONST_INF, TAG_Special);
1147 setsign(st1_ptr, sign);
1148 }
1296 } 1149 }
1297 else 1150 /* st(1) must be infinity here */
1298 { 1151 else if (((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal))
1299 /* st(0) is positive and < 1.0 */ 1152 && (signpositive(st0_ptr))) {
1153 if (exponent(st0_ptr) >= 0) {
1154 if ((exponent(st0_ptr) == 0) &&
1155 (st0_ptr->sigh == 0x80000000) &&
1156 (st0_ptr->sigl == 0)) {
1157 /* st(0) holds 1.0 */
1158 /* infinity*log(1) */
1159 if (arith_invalid(1) < 0)
1160 return;
1161 }
1162 /* else st(0) is positive and > 1.0 */
1163 } else {
1164 /* st(0) is positive and < 1.0 */
1300 1165
1301 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) 1166 if ((st0_tag == TW_Denormal)
1302 return; 1167 && (denormal_operand() < 0))
1168 return;
1303 1169
1304 changesign(st1_ptr); 1170 changesign(st1_ptr);
1305 } 1171 }
1306 } 1172 } else {
1307 else 1173 /* st(0) must be zero or negative */
1308 { 1174 if (st0_tag == TAG_Zero) {
1309 /* st(0) must be zero or negative */ 1175 /* This should be invalid, but a real 80486 is happy with it. */
1310 if ( st0_tag == TAG_Zero )
1311 {
1312 /* This should be invalid, but a real 80486 is happy with it. */
1313 1176
1314#ifndef PECULIAR_486 1177#ifndef PECULIAR_486
1315 sign = getsign(st1_ptr); 1178 sign = getsign(st1_ptr);
1316 if ( FPU_divide_by_zero(1, sign) < 0 ) 1179 if (FPU_divide_by_zero(1, sign) < 0)
1317 return; 1180 return;
1318#endif /* PECULIAR_486 */ 1181#endif /* PECULIAR_486 */
1319 1182
1320 changesign(st1_ptr); 1183 changesign(st1_ptr);
1184 } else if (arith_invalid(1) < 0) /* log(negative) */
1185 return;
1321 } 1186 }
1322 else if ( arith_invalid(1) < 0 ) /* log(negative) */
1323 return;
1324 }
1325 1187
1326 FPU_pop(); 1188 FPU_pop();
1327} 1189}
1328 1190
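fyl2x() above computes st(1) <- st(1) * log2(st(0)) and pops; the special case it singles out (sigh == 0x80000000, sigl == 0) is st(0) being an exact power of two, where the result reduces to st(1) times the exponent. A stand-alone sketch with doubles (not part of this patch):

#include <math.h>
#include <stdio.h>

static double fyl2x_sketch(double st0, double st1)
{
	return st1 * log2(st0);		/* what FYL2X leaves in st(1) */
}

int main(void)
{
	printf("%f\n", fyl2x_sketch(8.0, 3.0));		/* 3 * log2(8) = 9, exact */
	printf("%f\n", fyl2x_sketch(10.0, 1.0));	/* log2(10), inexact      */
	return 0;
}
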
1329
1330static void fpatan(FPU_REG *st0_ptr, u_char st0_tag) 1191static void fpatan(FPU_REG *st0_ptr, u_char st0_tag)
1331{ 1192{
1332 FPU_REG *st1_ptr = &st(1); 1193 FPU_REG *st1_ptr = &st(1);
1333 u_char st1_tag = FPU_gettagi(1); 1194 u_char st1_tag = FPU_gettagi(1);
1334 int tag; 1195 int tag;
1335 1196
1336 clear_C1(); 1197 clear_C1();
1337 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) 1198 if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
1338 { 1199 valid_atan:
1339 valid_atan:
1340 1200
1341 poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag); 1201 poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag);
1342 1202
1343 FPU_pop(); 1203 FPU_pop();
1344 1204
1345 return; 1205 return;
1346 } 1206 }
1347 1207
1348 if ( st0_tag == TAG_Special ) 1208 if (st0_tag == TAG_Special)
1349 st0_tag = FPU_Special(st0_ptr); 1209 st0_tag = FPU_Special(st0_ptr);
1350 if ( st1_tag == TAG_Special ) 1210 if (st1_tag == TAG_Special)
1351 st1_tag = FPU_Special(st1_ptr); 1211 st1_tag = FPU_Special(st1_ptr);
1352 1212
1353 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) 1213 if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1354 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) 1214 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1355 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) 1215 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
1356 { 1216 if (denormal_operand() < 0)
1357 if ( denormal_operand() < 0 ) 1217 return;
1358 return;
1359 1218
1360 goto valid_atan; 1219 goto valid_atan;
1361 } 1220 } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
1362 else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) 1221 FPU_stack_underflow_pop(1);
1363 { 1222 return;
1364 FPU_stack_underflow_pop(1); 1223 } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) {
1365 return; 1224 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0)
1366 } 1225 FPU_pop();
1367 else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) )
1368 {
1369 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0 )
1370 FPU_pop();
1371 return;
1372 }
1373 else if ( (st0_tag == TW_Infinity) || (st1_tag == TW_Infinity) )
1374 {
1375 u_char sign = getsign(st1_ptr);
1376 if ( st0_tag == TW_Infinity )
1377 {
1378 if ( st1_tag == TW_Infinity )
1379 {
1380 if ( signpositive(st0_ptr) )
1381 {
1382 FPU_copy_to_reg1(&CONST_PI4, TAG_Valid);
1383 }
1384 else
1385 {
1386 setpositive(st1_ptr);
1387 tag = FPU_u_add(&CONST_PI4, &CONST_PI2, st1_ptr,
1388 FULL_PRECISION, SIGN_POS,
1389 exponent(&CONST_PI4), exponent(&CONST_PI2));
1390 if ( tag >= 0 )
1391 FPU_settagi(1, tag);
1392 }
1393 }
1394 else
1395 {
1396 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1397 return; 1226 return;
1227 } else if ((st0_tag == TW_Infinity) || (st1_tag == TW_Infinity)) {
1228 u_char sign = getsign(st1_ptr);
1229 if (st0_tag == TW_Infinity) {
1230 if (st1_tag == TW_Infinity) {
1231 if (signpositive(st0_ptr)) {
1232 FPU_copy_to_reg1(&CONST_PI4, TAG_Valid);
1233 } else {
1234 setpositive(st1_ptr);
1235 tag =
1236 FPU_u_add(&CONST_PI4, &CONST_PI2,
1237 st1_ptr, FULL_PRECISION,
1238 SIGN_POS,
1239 exponent(&CONST_PI4),
1240 exponent(&CONST_PI2));
1241 if (tag >= 0)
1242 FPU_settagi(1, tag);
1243 }
1244 } else {
1245 if ((st1_tag == TW_Denormal)
1246 && (denormal_operand() < 0))
1247 return;
1248
1249 if (signpositive(st0_ptr)) {
1250 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1251 setsign(st1_ptr, sign); /* An 80486 preserves the sign */
1252 FPU_pop();
1253 return;
1254 } else {
1255 FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
1256 }
1257 }
1258 } else {
1259 /* st(1) is infinity, st(0) not infinity */
1260 if ((st0_tag == TW_Denormal)
1261 && (denormal_operand() < 0))
1262 return;
1398 1263
1399 if ( signpositive(st0_ptr) ) 1264 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
1400 {
1401 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1402 setsign(st1_ptr, sign); /* An 80486 preserves the sign */
1403 FPU_pop();
1404 return;
1405 } 1265 }
1406 else 1266 setsign(st1_ptr, sign);
1407 { 1267 } else if (st1_tag == TAG_Zero) {
1408 FPU_copy_to_reg1(&CONST_PI, TAG_Valid); 1268 /* st(0) must be valid or zero */
1269 u_char sign = getsign(st1_ptr);
1270
1271 if ((st0_tag == TW_Denormal) && (denormal_operand() < 0))
1272 return;
1273
1274 if (signpositive(st0_ptr)) {
1275 /* An 80486 preserves the sign */
1276 FPU_pop();
1277 return;
1409 } 1278 }
1410 }
1411 }
1412 else
1413 {
1414 /* st(1) is infinity, st(0) not infinity */
1415 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1416 return;
1417 1279
1418 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); 1280 FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
1419 } 1281 setsign(st1_ptr, sign);
1420 setsign(st1_ptr, sign); 1282 } else if (st0_tag == TAG_Zero) {
1421 } 1283 /* st(1) must be TAG_Valid here */
1422 else if ( st1_tag == TAG_Zero ) 1284 u_char sign = getsign(st1_ptr);
1423 {
1424 /* st(0) must be valid or zero */
1425 u_char sign = getsign(st1_ptr);
1426
1427 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1428 return;
1429 1285
1430 if ( signpositive(st0_ptr) ) 1286 if ((st1_tag == TW_Denormal) && (denormal_operand() < 0))
1431 { 1287 return;
1432 /* An 80486 preserves the sign */
1433 FPU_pop();
1434 return;
1435 }
1436 1288
1437 FPU_copy_to_reg1(&CONST_PI, TAG_Valid); 1289 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
1438 setsign(st1_ptr, sign); 1290 setsign(st1_ptr, sign);
1439 } 1291 }
1440 else if ( st0_tag == TAG_Zero )
1441 {
1442 /* st(1) must be TAG_Valid here */
1443 u_char sign = getsign(st1_ptr);
1444
1445 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1446 return;
1447
1448 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
1449 setsign(st1_ptr, sign);
1450 }
1451#ifdef PARANOID 1292#ifdef PARANOID
1452 else 1293 else
1453 EXCEPTION(EX_INTERNAL | 0x125); 1294 EXCEPTION(EX_INTERNAL | 0x125);
1454#endif /* PARANOID */ 1295#endif /* PARANOID */
1455 1296
1456 FPU_pop(); 1297 FPU_pop();
1457 set_precision_flag_up(); /* We do not really know if up or down */ 1298 set_precision_flag_up(); /* We do not really know if up or down */
1458} 1299}
1459 1300
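fpatan() above is, in effect, st(1) <- atan2(st(1), st(0)) followed by a pop, and its infinity cases reproduce the quadrant answers (pi/4, 3*pi/4, pi/2, pi) that C99's atan2() also defines. A small sketch (not part of this patch) using the library function:

#include <math.h>
#include <stdio.h>

int main(void)
{
	printf("%f\n", atan2(1.0, 1.0));		/* pi/4          */
	printf("%f\n", atan2(INFINITY, INFINITY));	/* pi/4 (C99)    */
	printf("%f\n", atan2(INFINITY, -INFINITY));	/* 3*pi/4        */
	printf("%f\n", atan2(1.0, 0.0));		/* pi/2          */
	printf("%f\n", atan2(0.0, -1.0));		/* pi            */
	return 0;
}
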
1460
1461static void fprem(FPU_REG *st0_ptr, u_char st0_tag) 1301static void fprem(FPU_REG *st0_ptr, u_char st0_tag)
1462{ 1302{
1463 do_fprem(st0_ptr, st0_tag, RC_CHOP); 1303 do_fprem(st0_ptr, st0_tag, RC_CHOP);
1464} 1304}
1465 1305
1466
1467static void fprem1(FPU_REG *st0_ptr, u_char st0_tag) 1306static void fprem1(FPU_REG *st0_ptr, u_char st0_tag)
1468{ 1307{
1469 do_fprem(st0_ptr, st0_tag, RC_RND); 1308 do_fprem(st0_ptr, st0_tag, RC_RND);
1470} 1309}
1471 1310
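fprem() and fprem1() differ only in the rounding passed to do_fprem(): RC_CHOP truncates the quotient, RC_RND rounds it to nearest, the same split the C library exposes as fmod() versus remainder(). A two-line illustration (not part of this patch):

#include <math.h>
#include <stdio.h>

int main(void)
{
	printf("%f\n", fmod(17.5, 5.0));	/*  2.5: quotient chopped to 3, like FPREM  */
	printf("%f\n", remainder(17.5, 5.0));	/* -2.5: quotient rounded to 4, like FPREM1 */
	return 0;
}
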
1472
1473static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag) 1311static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag)
1474{ 1312{
1475 u_char sign, sign1; 1313 u_char sign, sign1;
1476 FPU_REG *st1_ptr = &st(1), a, b; 1314 FPU_REG *st1_ptr = &st(1), a, b;
1477 u_char st1_tag = FPU_gettagi(1); 1315 u_char st1_tag = FPU_gettagi(1);
1478 1316
1479 clear_C1(); 1317 clear_C1();
1480 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) 1318 if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
1481 { 1319 valid_yl2xp1:
1482 valid_yl2xp1:
1483 1320
1484 sign = getsign(st0_ptr); 1321 sign = getsign(st0_ptr);
1485 sign1 = getsign(st1_ptr); 1322 sign1 = getsign(st1_ptr);
1486 1323
1487 FPU_to_exp16(st0_ptr, &a); 1324 FPU_to_exp16(st0_ptr, &a);
1488 FPU_to_exp16(st1_ptr, &b); 1325 FPU_to_exp16(st1_ptr, &b);
1489 1326
1490 if ( poly_l2p1(sign, sign1, &a, &b, st1_ptr) ) 1327 if (poly_l2p1(sign, sign1, &a, &b, st1_ptr))
1491 return; 1328 return;
1492 1329
1493 FPU_pop(); 1330 FPU_pop();
1494 return; 1331 return;
1495 } 1332 }
1496 1333
1497 if ( st0_tag == TAG_Special ) 1334 if (st0_tag == TAG_Special)
1498 st0_tag = FPU_Special(st0_ptr); 1335 st0_tag = FPU_Special(st0_ptr);
1499 if ( st1_tag == TAG_Special ) 1336 if (st1_tag == TAG_Special)
1500 st1_tag = FPU_Special(st1_ptr); 1337 st1_tag = FPU_Special(st1_ptr);
1501 1338
1502 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) 1339 if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1503 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) 1340 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1504 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) 1341 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
1505 { 1342 if (denormal_operand() < 0)
1506 if ( denormal_operand() < 0 ) 1343 return;
1507 return;
1508
1509 goto valid_yl2xp1;
1510 }
1511 else if ( (st0_tag == TAG_Empty) | (st1_tag == TAG_Empty) )
1512 {
1513 FPU_stack_underflow_pop(1);
1514 return;
1515 }
1516 else if ( st0_tag == TAG_Zero )
1517 {
1518 switch ( st1_tag )
1519 {
1520 case TW_Denormal:
1521 if ( denormal_operand() < 0 )
1522 return;
1523
1524 case TAG_Zero:
1525 case TAG_Valid:
1526 setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr));
1527 FPU_copy_to_reg1(st0_ptr, st0_tag);
1528 break;
1529
1530 case TW_Infinity:
1531 /* Infinity*log(1) */
1532 if ( arith_invalid(1) < 0 )
1533 return;
1534 break;
1535 1344
1536 case TW_NaN: 1345 goto valid_yl2xp1;
1537 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) 1346 } else if ((st0_tag == TAG_Empty) | (st1_tag == TAG_Empty)) {
1538 return; 1347 FPU_stack_underflow_pop(1);
1539 break; 1348 return;
1540 1349 } else if (st0_tag == TAG_Zero) {
1541 default: 1350 switch (st1_tag) {
1351 case TW_Denormal:
1352 if (denormal_operand() < 0)
1353 return;
1354
1355 case TAG_Zero:
1356 case TAG_Valid:
1357 setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr));
1358 FPU_copy_to_reg1(st0_ptr, st0_tag);
1359 break;
1360
1361 case TW_Infinity:
1362 /* Infinity*log(1) */
1363 if (arith_invalid(1) < 0)
1364 return;
1365 break;
1366
1367 case TW_NaN:
1368 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
1369 return;
1370 break;
1371
1372 default:
1542#ifdef PARANOID 1373#ifdef PARANOID
1543 EXCEPTION(EX_INTERNAL | 0x116); 1374 EXCEPTION(EX_INTERNAL | 0x116);
1544 return; 1375 return;
1545#endif /* PARANOID */ 1376#endif /* PARANOID */
1546 break; 1377 break;
1547 } 1378 }
1548 } 1379 } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
1549 else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) 1380 switch (st1_tag) {
1550 { 1381 case TAG_Zero:
1551 switch ( st1_tag ) 1382 if (signnegative(st0_ptr)) {
1552 { 1383 if (exponent(st0_ptr) >= 0) {
1553 case TAG_Zero: 1384 /* st(0) holds <= -1.0 */
1554 if ( signnegative(st0_ptr) ) 1385#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1555 { 1386 changesign(st1_ptr);
1556 if ( exponent(st0_ptr) >= 0 )
1557 {
1558 /* st(0) holds <= -1.0 */
1559#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1560 changesign(st1_ptr);
1561#else 1387#else
1562 if ( arith_invalid(1) < 0 ) 1388 if (arith_invalid(1) < 0)
1563 return; 1389 return;
1564#endif /* PECULIAR_486 */ 1390#endif /* PECULIAR_486 */
1565 } 1391 } else if ((st0_tag == TW_Denormal)
1566 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) 1392 && (denormal_operand() < 0))
1567 return; 1393 return;
1568 else 1394 else
1569 changesign(st1_ptr); 1395 changesign(st1_ptr);
1570 } 1396 } else if ((st0_tag == TW_Denormal)
1571 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) 1397 && (denormal_operand() < 0))
1572 return; 1398 return;
1573 break; 1399 break;
1574 1400
1575 case TW_Infinity: 1401 case TW_Infinity:
1576 if ( signnegative(st0_ptr) ) 1402 if (signnegative(st0_ptr)) {
1577 { 1403 if ((exponent(st0_ptr) >= 0) &&
1578 if ( (exponent(st0_ptr) >= 0) && 1404 !((st0_ptr->sigh == 0x80000000) &&
1579 !((st0_ptr->sigh == 0x80000000) && 1405 (st0_ptr->sigl == 0))) {
1580 (st0_ptr->sigl == 0)) ) 1406 /* st(0) holds < -1.0 */
1581 { 1407#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1582 /* st(0) holds < -1.0 */ 1408 changesign(st1_ptr);
1583#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1584 changesign(st1_ptr);
1585#else 1409#else
1586 if ( arith_invalid(1) < 0 ) return; 1410 if (arith_invalid(1) < 0)
1411 return;
1587#endif /* PECULIAR_486 */ 1412#endif /* PECULIAR_486 */
1413 } else if ((st0_tag == TW_Denormal)
1414 && (denormal_operand() < 0))
1415 return;
1416 else
1417 changesign(st1_ptr);
1418 } else if ((st0_tag == TW_Denormal)
1419 && (denormal_operand() < 0))
1420 return;
1421 break;
1422
1423 case TW_NaN:
1424 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
1425 return;
1588 } 1426 }
1589 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1590 return;
1591 else
1592 changesign(st1_ptr);
1593 }
1594 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1595 return;
1596 break;
1597
1598 case TW_NaN:
1599 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1600 return;
1601 }
1602 1427
1603 } 1428 } else if (st0_tag == TW_NaN) {
1604 else if ( st0_tag == TW_NaN ) 1429 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
1605 { 1430 return;
1606 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) 1431 } else if (st0_tag == TW_Infinity) {
1607 return; 1432 if (st1_tag == TW_NaN) {
1608 } 1433 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
1609 else if ( st0_tag == TW_Infinity ) 1434 return;
1610 { 1435 } else if (signnegative(st0_ptr)) {
1611 if ( st1_tag == TW_NaN )
1612 {
1613 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1614 return;
1615 }
1616 else if ( signnegative(st0_ptr) )
1617 {
1618#ifndef PECULIAR_486 1436#ifndef PECULIAR_486
1619 /* This should have higher priority than denormals, but... */ 1437 /* This should have higher priority than denormals, but... */
1620 if ( arith_invalid(1) < 0 ) /* log(-infinity) */ 1438 if (arith_invalid(1) < 0) /* log(-infinity) */
1621 return; 1439 return;
1622#endif /* PECULIAR_486 */ 1440#endif /* PECULIAR_486 */
1623 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) 1441 if ((st1_tag == TW_Denormal)
1624 return; 1442 && (denormal_operand() < 0))
1443 return;
1625#ifdef PECULIAR_486 1444#ifdef PECULIAR_486
1626 /* Denormal operands actually get higher priority */ 1445 /* Denormal operands actually get higher priority */
1627 if ( arith_invalid(1) < 0 ) /* log(-infinity) */ 1446 if (arith_invalid(1) < 0) /* log(-infinity) */
1628 return; 1447 return;
1629#endif /* PECULIAR_486 */ 1448#endif /* PECULIAR_486 */
1630 } 1449 } else if (st1_tag == TAG_Zero) {
1631 else if ( st1_tag == TAG_Zero ) 1450 /* log(infinity) */
1632 { 1451 if (arith_invalid(1) < 0)
1633 /* log(infinity) */ 1452 return;
1634 if ( arith_invalid(1) < 0 ) 1453 }
1635 return;
1636 }
1637
1638 /* st(1) must be valid here. */
1639 1454
1640 else if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) 1455 /* st(1) must be valid here. */
1641 return; 1456
1457 else if ((st1_tag == TW_Denormal) && (denormal_operand() < 0))
1458 return;
1642 1459
1643 /* The Manual says that log(Infinity) is invalid, but a real 1460 /* The Manual says that log(Infinity) is invalid, but a real
1644 80486 sensibly says that it is o.k. */ 1461 80486 sensibly says that it is o.k. */
1645 else 1462 else {
1646 { 1463 u_char sign = getsign(st1_ptr);
1647 u_char sign = getsign(st1_ptr); 1464 FPU_copy_to_reg1(&CONST_INF, TAG_Special);
1648 FPU_copy_to_reg1(&CONST_INF, TAG_Special); 1465 setsign(st1_ptr, sign);
1649 setsign(st1_ptr, sign); 1466 }
1650 } 1467 }
1651 }
1652#ifdef PARANOID 1468#ifdef PARANOID
1653 else 1469 else {
1654 { 1470 EXCEPTION(EX_INTERNAL | 0x117);
1655 EXCEPTION(EX_INTERNAL | 0x117); 1471 return;
1656 return; 1472 }
1657 }
1658#endif /* PARANOID */ 1473#endif /* PARANOID */
1659 1474
1660 FPU_pop(); 1475 FPU_pop();
1661 return; 1476 return;
1662 1477
1663} 1478}
1664 1479
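fyl2xp1() above computes st(1) <- st(1) * log2(st(0) + 1); the point of the separate instruction (and of poly_l2p1()) is that forming 1 + st(0) explicitly would throw away the low bits of a tiny st(0). A sketch of the same precision argument with doubles (not part of this patch):

#include <math.h>
#include <stdio.h>

static double fyl2xp1_sketch(double st0, double st1)
{
	return st1 * log1p(st0) / log(2.0);	/* log2(1 + x) without forming 1 + x */
}

int main(void)
{
	/* Compare the trailing digits: the naive form loses precision. */
	printf("%.17g\n", fyl2xp1_sketch(1e-10, 1.0));
	printf("%.17g\n", log2(1.0 + 1e-10));
	return 0;
}
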
1665
1666static void fscale(FPU_REG *st0_ptr, u_char st0_tag) 1480static void fscale(FPU_REG *st0_ptr, u_char st0_tag)
1667{ 1481{
1668 FPU_REG *st1_ptr = &st(1); 1482 FPU_REG *st1_ptr = &st(1);
1669 u_char st1_tag = FPU_gettagi(1); 1483 u_char st1_tag = FPU_gettagi(1);
1670 int old_cw = control_word; 1484 int old_cw = control_word;
1671 u_char sign = getsign(st0_ptr); 1485 u_char sign = getsign(st0_ptr);
1672 1486
1673 clear_C1(); 1487 clear_C1();
1674 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) 1488 if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
1675 { 1489 long scale;
1676 long scale; 1490 FPU_REG tmp;
1677 FPU_REG tmp; 1491
1678 1492 /* Convert register for internal use. */
1679 /* Convert register for internal use. */ 1493 setexponent16(st0_ptr, exponent(st0_ptr));
1680 setexponent16(st0_ptr, exponent(st0_ptr)); 1494
1681 1495 valid_scale:
1682 valid_scale: 1496
1683 1497 if (exponent(st1_ptr) > 30) {
1684 if ( exponent(st1_ptr) > 30 ) 1498 /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */
1685 { 1499
1686 /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */ 1500 if (signpositive(st1_ptr)) {
1687 1501 EXCEPTION(EX_Overflow);
1688 if ( signpositive(st1_ptr) ) 1502 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
1689 { 1503 } else {
1690 EXCEPTION(EX_Overflow); 1504 EXCEPTION(EX_Underflow);
1691 FPU_copy_to_reg0(&CONST_INF, TAG_Special); 1505 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1692 } 1506 }
1693 else 1507 setsign(st0_ptr, sign);
1694 { 1508 return;
1695 EXCEPTION(EX_Underflow); 1509 }
1696 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1697 }
1698 setsign(st0_ptr, sign);
1699 return;
1700 }
1701
1702 control_word &= ~CW_RC;
1703 control_word |= RC_CHOP;
1704 reg_copy(st1_ptr, &tmp);
1705 FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */
1706 control_word = old_cw;
1707 scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl;
1708 scale += exponent16(st0_ptr);
1709
1710 setexponent16(st0_ptr, scale);
1711
1712 /* Use FPU_round() to properly detect under/overflow etc */
1713 FPU_round(st0_ptr, 0, 0, control_word, sign);
1714
1715 return;
1716 }
1717
1718 if ( st0_tag == TAG_Special )
1719 st0_tag = FPU_Special(st0_ptr);
1720 if ( st1_tag == TAG_Special )
1721 st1_tag = FPU_Special(st1_ptr);
1722
1723 if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
1724 {
1725 switch ( st1_tag )
1726 {
1727 case TAG_Valid:
1728 /* st(0) must be a denormal */
1729 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1730 return;
1731
1732 FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */
1733 goto valid_scale;
1734
1735 case TAG_Zero:
1736 if ( st0_tag == TW_Denormal )
1737 denormal_operand();
1738 return;
1739
1740 case TW_Denormal:
1741 denormal_operand();
1742 return;
1743
1744 case TW_Infinity:
1745 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1746 return;
1747
1748 if ( signpositive(st1_ptr) )
1749 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
1750 else
1751 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1752 setsign(st0_ptr, sign);
1753 return;
1754 1510
1755 case TW_NaN: 1511 control_word &= ~CW_RC;
1756 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); 1512 control_word |= RC_CHOP;
1757 return; 1513 reg_copy(st1_ptr, &tmp);
1758 } 1514 FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */
1759 } 1515 control_word = old_cw;
1760 else if ( st0_tag == TAG_Zero ) 1516 scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl;
1761 { 1517 scale += exponent16(st0_ptr);
1762 switch ( st1_tag )
1763 {
1764 case TAG_Valid:
1765 case TAG_Zero:
1766 return;
1767 1518
1768 case TW_Denormal: 1519 setexponent16(st0_ptr, scale);
1769 denormal_operand();
1770 return;
1771 1520
1772 case TW_Infinity: 1521 /* Use FPU_round() to properly detect under/overflow etc */
1773 if ( signpositive(st1_ptr) ) 1522 FPU_round(st0_ptr, 0, 0, control_word, sign);
1774 arith_invalid(0); /* Zero scaled by +Infinity */
1775 return;
1776 1523
1777 case TW_NaN: 1524 return;
1778 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1779 return;
1780 } 1525 }
1781 }
1782 else if ( st0_tag == TW_Infinity )
1783 {
1784 switch ( st1_tag )
1785 {
1786 case TAG_Valid:
1787 case TAG_Zero:
1788 return;
1789
1790 case TW_Denormal:
1791 denormal_operand();
1792 return;
1793 1526
1794 case TW_Infinity: 1527 if (st0_tag == TAG_Special)
1795 if ( signnegative(st1_ptr) ) 1528 st0_tag = FPU_Special(st0_ptr);
1796 arith_invalid(0); /* Infinity scaled by -Infinity */ 1529 if (st1_tag == TAG_Special)
1797 return; 1530 st1_tag = FPU_Special(st1_ptr);
1798 1531
1799 case TW_NaN: 1532 if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
1800 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); 1533 switch (st1_tag) {
1801 return; 1534 case TAG_Valid:
1535 /* st(0) must be a denormal */
1536 if ((st0_tag == TW_Denormal)
1537 && (denormal_operand() < 0))
1538 return;
1539
1540 FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */
1541 goto valid_scale;
1542
1543 case TAG_Zero:
1544 if (st0_tag == TW_Denormal)
1545 denormal_operand();
1546 return;
1547
1548 case TW_Denormal:
1549 denormal_operand();
1550 return;
1551
1552 case TW_Infinity:
1553 if ((st0_tag == TW_Denormal)
1554 && (denormal_operand() < 0))
1555 return;
1556
1557 if (signpositive(st1_ptr))
1558 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
1559 else
1560 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1561 setsign(st0_ptr, sign);
1562 return;
1563
1564 case TW_NaN:
1565 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1566 return;
1567 }
1568 } else if (st0_tag == TAG_Zero) {
1569 switch (st1_tag) {
1570 case TAG_Valid:
1571 case TAG_Zero:
1572 return;
1573
1574 case TW_Denormal:
1575 denormal_operand();
1576 return;
1577
1578 case TW_Infinity:
1579 if (signpositive(st1_ptr))
1580 arith_invalid(0); /* Zero scaled by +Infinity */
1581 return;
1582
1583 case TW_NaN:
1584 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1585 return;
1586 }
1587 } else if (st0_tag == TW_Infinity) {
1588 switch (st1_tag) {
1589 case TAG_Valid:
1590 case TAG_Zero:
1591 return;
1592
1593 case TW_Denormal:
1594 denormal_operand();
1595 return;
1596
1597 case TW_Infinity:
1598 if (signnegative(st1_ptr))
1599 arith_invalid(0); /* Infinity scaled by -Infinity */
1600 return;
1601
1602 case TW_NaN:
1603 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1604 return;
1605 }
1606 } else if (st0_tag == TW_NaN) {
1607 if (st1_tag != TAG_Empty) {
1608 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1609 return;
1610 }
1802 } 1611 }
1803 }
1804 else if ( st0_tag == TW_NaN )
1805 {
1806 if ( st1_tag != TAG_Empty )
1807 { real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); return; }
1808 }
1809
1810#ifdef PARANOID 1612#ifdef PARANOID
1811 if ( !((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) ) 1613 if (!((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty))) {
1812 { 1614 EXCEPTION(EX_INTERNAL | 0x115);
1813 EXCEPTION(EX_INTERNAL | 0x115); 1615 return;
1814 return; 1616 }
1815 }
1816#endif 1617#endif
1817 1618
1818 /* At least one of st(0), st(1) must be empty */ 1619 /* At least one of st(0), st(1) must be empty */
1819 FPU_stack_underflow(); 1620 FPU_stack_underflow();
1820 1621
1821} 1622}
1822 1623
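fscale() above adds the chopped integer value of st(1) to st(0)'s exponent, i.e. st(0) <- st(0) * 2^trunc(st(1)), raising overflow or underflow when st(1) is huge. For doubles the same operation is ldexp(); a short illustration (not part of this patch):

#include <math.h>
#include <stdio.h>

int main(void)
{
	double st0 = 1.5, st1 = 10.7;

	printf("%f\n", ldexp(st0, (int)st1));	/* 1.5 * 2^10 = 1536.0, st1 chopped */
	printf("%f\n", ldexp(st0, -2));		/* 1.5 * 2^-2 = 0.375               */
	return 0;
}
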
1823
1824/*---------------------------------------------------------------------------*/ 1624/*---------------------------------------------------------------------------*/
1825 1625
1826static FUNC_ST0 const trig_table_a[] = { 1626static FUNC_ST0 const trig_table_a[] = {
1827 f2xm1, fyl2x, fptan, fpatan, 1627 f2xm1, fyl2x, fptan, fpatan,
1828 fxtract, fprem1, (FUNC_ST0)fdecstp, (FUNC_ST0)fincstp 1628 fxtract, fprem1, (FUNC_ST0) fdecstp, (FUNC_ST0) fincstp
1829}; 1629};
1830 1630
1831void FPU_triga(void) 1631void FPU_triga(void)
1832{ 1632{
1833 (trig_table_a[FPU_rm])(&st(0), FPU_gettag0()); 1633 (trig_table_a[FPU_rm]) (&st(0), FPU_gettag0());
1834} 1634}
1835 1635
1836 1636static FUNC_ST0 const trig_table_b[] = {
1837static FUNC_ST0 const trig_table_b[] = 1637 fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos
1838 { 1638};
1839 fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0)fsin, fcos
1840 };
1841 1639
1842void FPU_trigb(void) 1640void FPU_trigb(void)
1843{ 1641{
1844 (trig_table_b[FPU_rm])(&st(0), FPU_gettag0()); 1642 (trig_table_b[FPU_rm]) (&st(0), FPU_gettag0());
1845} 1643}
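FPU_triga() and FPU_trigb() above dispatch on FPU_rm, the 3-bit field taken from the instruction's ModR/M byte, by indexing the eight-entry function-pointer tables. A stand-alone sketch of the same pattern (not part of this patch; the handler names are invented):

#include <stdio.h>

typedef void (*handler_t)(int st_reg);

static void op_even(int st_reg) { printf("even handler, st(%d)\n", st_reg); }
static void op_odd(int st_reg)  { printf("odd handler, st(%d)\n", st_reg);  }

static handler_t const table[8] = {
	op_even, op_odd, op_even, op_odd, op_even, op_odd, op_even, op_odd
};

static void dispatch(unsigned rm)
{
	(table[rm & 7])(0);	/* rm plays the role of FPU_rm */
}

int main(void)
{
	dispatch(5);		/* index 5, as trig_table_b[5] would select fscale */
	return 0;
}
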
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index 2e2c51a8bd3..d701e2b39e4 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -17,7 +17,6 @@
17 | other processes using the emulator while swapping is in progress. | 17 | other processes using the emulator while swapping is in progress. |
18 +---------------------------------------------------------------------------*/ 18 +---------------------------------------------------------------------------*/
19 19
20
21#include <linux/stddef.h> 20#include <linux/stddef.h>
22 21
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
@@ -27,31 +26,30 @@
27#include "exception.h" 26#include "exception.h"
28#include "fpu_emu.h" 27#include "fpu_emu.h"
29 28
30
31#define FPU_WRITE_BIT 0x10 29#define FPU_WRITE_BIT 0x10
32 30
33static int reg_offset[] = { 31static int reg_offset[] = {
34 offsetof(struct info,___eax), 32 offsetof(struct info, ___eax),
35 offsetof(struct info,___ecx), 33 offsetof(struct info, ___ecx),
36 offsetof(struct info,___edx), 34 offsetof(struct info, ___edx),
37 offsetof(struct info,___ebx), 35 offsetof(struct info, ___ebx),
38 offsetof(struct info,___esp), 36 offsetof(struct info, ___esp),
39 offsetof(struct info,___ebp), 37 offsetof(struct info, ___ebp),
40 offsetof(struct info,___esi), 38 offsetof(struct info, ___esi),
41 offsetof(struct info,___edi) 39 offsetof(struct info, ___edi)
42}; 40};
43 41
44#define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info)) 42#define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info))
45 43
46static int reg_offset_vm86[] = { 44static int reg_offset_vm86[] = {
47 offsetof(struct info,___cs), 45 offsetof(struct info, ___cs),
48 offsetof(struct info,___vm86_ds), 46 offsetof(struct info, ___vm86_ds),
49 offsetof(struct info,___vm86_es), 47 offsetof(struct info, ___vm86_es),
50 offsetof(struct info,___vm86_fs), 48 offsetof(struct info, ___vm86_fs),
51 offsetof(struct info,___vm86_gs), 49 offsetof(struct info, ___vm86_gs),
52 offsetof(struct info,___ss), 50 offsetof(struct info, ___ss),
53 offsetof(struct info,___vm86_ds) 51 offsetof(struct info, ___vm86_ds)
54 }; 52};
55 53
56#define VM86_REG_(x) (*(unsigned short *) \ 54#define VM86_REG_(x) (*(unsigned short *) \
57 (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info)) 55 (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info))
@@ -60,158 +58,141 @@ static int reg_offset_vm86[] = {
60#define ___GS ___ds 58#define ___GS ___ds
61 59
62static int reg_offset_pm[] = { 60static int reg_offset_pm[] = {
63 offsetof(struct info,___cs), 61 offsetof(struct info, ___cs),
64 offsetof(struct info,___ds), 62 offsetof(struct info, ___ds),
65 offsetof(struct info,___es), 63 offsetof(struct info, ___es),
66 offsetof(struct info,___fs), 64 offsetof(struct info, ___fs),
67 offsetof(struct info,___GS), 65 offsetof(struct info, ___GS),
68 offsetof(struct info,___ss), 66 offsetof(struct info, ___ss),
69 offsetof(struct info,___ds) 67 offsetof(struct info, ___ds)
70 }; 68};
71 69
72#define PM_REG_(x) (*(unsigned short *) \ 70#define PM_REG_(x) (*(unsigned short *) \
73 (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info)) 71 (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info))
74 72
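The reg_offset*[] tables and the REG_()/VM86_REG_()/PM_REG_() macros above turn a 3-bit register number from the instruction encoding into a load from the right slot of the saved-register structure, using offsetof(). A self-contained sketch of the technique (not part of this patch; struct regs and its fields are invented):

#include <stddef.h>
#include <stdio.h>

struct regs {
	long eax, ecx, edx, ebx;
};

static const int reg_off[] = {
	offsetof(struct regs, eax),
	offsetof(struct regs, ecx),
	offsetof(struct regs, edx),
	offsetof(struct regs, ebx),
};

#define REG_OF(info, x) (*(long *)((char *)(info) + reg_off[(x)]))

int main(void)
{
	struct regs r = { 1, 2, 3, 4 };

	printf("%ld\n", REG_OF(&r, 2));	/* register number 2 -> saved edx -> 3 */
	return 0;
}
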
75
76/* Decode the SIB byte. This function assumes mod != 0 */ 73/* Decode the SIB byte. This function assumes mod != 0 */
77static int sib(int mod, unsigned long *fpu_eip) 74static int sib(int mod, unsigned long *fpu_eip)
78{ 75{
79 u_char ss,index,base; 76 u_char ss, index, base;
80 long offset; 77 long offset;
81 78
82 RE_ENTRANT_CHECK_OFF; 79 RE_ENTRANT_CHECK_OFF;
83 FPU_code_access_ok(1); 80 FPU_code_access_ok(1);
84 FPU_get_user(base, (u_char __user *) (*fpu_eip)); /* The SIB byte */ 81 FPU_get_user(base, (u_char __user *) (*fpu_eip)); /* The SIB byte */
85 RE_ENTRANT_CHECK_ON; 82 RE_ENTRANT_CHECK_ON;
86 (*fpu_eip)++; 83 (*fpu_eip)++;
87 ss = base >> 6; 84 ss = base >> 6;
88 index = (base >> 3) & 7; 85 index = (base >> 3) & 7;
89 base &= 7; 86 base &= 7;
90 87
91 if ((mod == 0) && (base == 5)) 88 if ((mod == 0) && (base == 5))
92 offset = 0; /* No base register */ 89 offset = 0; /* No base register */
93 else 90 else
94 offset = REG_(base); 91 offset = REG_(base);
95 92
96 if (index == 4) 93 if (index == 4) {
97 { 94 /* No index register */
98 /* No index register */ 95 /* A non-zero ss is illegal */
99 /* A non-zero ss is illegal */ 96 if (ss)
100 if ( ss ) 97 EXCEPTION(EX_Invalid);
101 EXCEPTION(EX_Invalid); 98 } else {
102 } 99 offset += (REG_(index)) << ss;
103 else 100 }
104 { 101
105 offset += (REG_(index)) << ss; 102 if (mod == 1) {
106 } 103 /* 8 bit signed displacement */
107 104 long displacement;
108 if (mod == 1) 105 RE_ENTRANT_CHECK_OFF;
109 { 106 FPU_code_access_ok(1);
110 /* 8 bit signed displacement */ 107 FPU_get_user(displacement, (signed char __user *)(*fpu_eip));
111 long displacement; 108 offset += displacement;
112 RE_ENTRANT_CHECK_OFF; 109 RE_ENTRANT_CHECK_ON;
113 FPU_code_access_ok(1); 110 (*fpu_eip)++;
114 FPU_get_user(displacement, (signed char __user *) (*fpu_eip)); 111 } else if (mod == 2 || base == 5) { /* The second condition also has mod==0 */
115 offset += displacement; 112 /* 32 bit displacement */
116 RE_ENTRANT_CHECK_ON; 113 long displacement;
117 (*fpu_eip)++; 114 RE_ENTRANT_CHECK_OFF;
118 } 115 FPU_code_access_ok(4);
119 else if (mod == 2 || base == 5) /* The second condition also has mod==0 */ 116 FPU_get_user(displacement, (long __user *)(*fpu_eip));
120 { 117 offset += displacement;
121 /* 32 bit displacement */ 118 RE_ENTRANT_CHECK_ON;
122 long displacement; 119 (*fpu_eip) += 4;
123 RE_ENTRANT_CHECK_OFF; 120 }
124 FPU_code_access_ok(4);
125 FPU_get_user(displacement, (long __user *) (*fpu_eip));
126 offset += displacement;
127 RE_ENTRANT_CHECK_ON;
128 (*fpu_eip) += 4;
129 }
130
131 return offset;
132}
133 121
122 return offset;
123}
134 124
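sib() above pulls scale, index and base out of the SIB byte and forms base + (index << scale) + displacement, with index 4 meaning "no index register" and mod 0 / base 5 meaning "no base, 32-bit displacement only". A simplified sketch of that arithmetic with the register file and displacement passed in directly (not part of this patch):

#include <stdio.h>

static long sib_offset(unsigned char sib, const long reg[8], long disp)
{
	unsigned ss    = sib >> 6;		/* shift count: scale 1/2/4/8  */
	unsigned index = (sib >> 3) & 7;
	unsigned base  = sib & 7;
	long offset = reg[base];		/* mod 0 / base 5 case omitted */

	if (index != 4)				/* index 4: no index register  */
		offset += reg[index] << ss;
	return offset + disp;
}

int main(void)
{
	long reg[8] = { 0, 0, 0, 0x1000, 0, 0, 0x20, 0 };

	/* 0x73: ss = 1, index = 6, base = 3 -> 0x1000 + (0x20 << 1) + 8 */
	printf("%#lx\n", sib_offset(0x73, reg, 8));	/* 0x1048 */
	return 0;
}
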
135static unsigned long vm86_segment(u_char segment, 125static unsigned long vm86_segment(u_char segment, struct address *addr)
136 struct address *addr)
137{ 126{
138 segment--; 127 segment--;
139#ifdef PARANOID 128#ifdef PARANOID
140 if ( segment > PREFIX_SS_ ) 129 if (segment > PREFIX_SS_) {
141 { 130 EXCEPTION(EX_INTERNAL | 0x130);
142 EXCEPTION(EX_INTERNAL|0x130); 131 math_abort(FPU_info, SIGSEGV);
143 math_abort(FPU_info,SIGSEGV); 132 }
144 }
145#endif /* PARANOID */ 133#endif /* PARANOID */
146 addr->selector = VM86_REG_(segment); 134 addr->selector = VM86_REG_(segment);
147 return (unsigned long)VM86_REG_(segment) << 4; 135 return (unsigned long)VM86_REG_(segment) << 4;
148} 136}
149 137
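vm86_segment() above returns the classic real-mode segment base, selector << 4; the caller adds the 16-bit offset to obtain the linear address. A one-line illustration (not part of this patch):

#include <stdio.h>

int main(void)
{
	unsigned long selector = 0xb800;	/* a typical real-mode segment */
	unsigned long offset   = 0x0010;

	printf("%#lx\n", (selector << 4) + offset);	/* 0xb8010 */
	return 0;
}
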
150
151/* This should work for 16 and 32 bit protected mode. */ 138/* This should work for 16 and 32 bit protected mode. */
152static long pm_address(u_char FPU_modrm, u_char segment, 139static long pm_address(u_char FPU_modrm, u_char segment,
153 struct address *addr, long offset) 140 struct address *addr, long offset)
154{ 141{
155 struct desc_struct descriptor; 142 struct desc_struct descriptor;
156 unsigned long base_address, limit, address, seg_top; 143 unsigned long base_address, limit, address, seg_top;
157 144
158 segment--; 145 segment--;
159 146
160#ifdef PARANOID 147#ifdef PARANOID
161 /* segment is unsigned, so this also detects if segment was 0: */ 148 /* segment is unsigned, so this also detects if segment was 0: */
162 if ( segment > PREFIX_SS_ ) 149 if (segment > PREFIX_SS_) {
163 { 150 EXCEPTION(EX_INTERNAL | 0x132);
164 EXCEPTION(EX_INTERNAL|0x132); 151 math_abort(FPU_info, SIGSEGV);
165 math_abort(FPU_info,SIGSEGV); 152 }
166 }
167#endif /* PARANOID */ 153#endif /* PARANOID */
168 154
169 switch ( segment ) 155 switch (segment) {
170 { 156 /* gs isn't used by the kernel, so it still has its
171 /* gs isn't used by the kernel, so it still has its 157 user-space value. */
172 user-space value. */ 158 case PREFIX_GS_ - 1:
173 case PREFIX_GS_-1: 159 /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
174 /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ 160 savesegment(gs, addr->selector);
175 savesegment(gs, addr->selector); 161 break;
176 break; 162 default:
177 default: 163 addr->selector = PM_REG_(segment);
178 addr->selector = PM_REG_(segment);
179 }
180
181 descriptor = LDT_DESCRIPTOR(PM_REG_(segment));
182 base_address = SEG_BASE_ADDR(descriptor);
183 address = base_address + offset;
184 limit = base_address
185 + (SEG_LIMIT(descriptor)+1) * SEG_GRANULARITY(descriptor) - 1;
186 if ( limit < base_address ) limit = 0xffffffff;
187
188 if ( SEG_EXPAND_DOWN(descriptor) )
189 {
190 if ( SEG_G_BIT(descriptor) )
191 seg_top = 0xffffffff;
192 else
193 {
194 seg_top = base_address + (1 << 20);
195 if ( seg_top < base_address ) seg_top = 0xffffffff;
196 } 164 }
197 access_limit =
198 (address <= limit) || (address >= seg_top) ? 0 :
199 ((seg_top-address) >= 255 ? 255 : seg_top-address);
200 }
201 else
202 {
203 access_limit =
204 (address > limit) || (address < base_address) ? 0 :
205 ((limit-address) >= 254 ? 255 : limit-address+1);
206 }
207 if ( SEG_EXECUTE_ONLY(descriptor) ||
208 (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT)) )
209 {
210 access_limit = 0;
211 }
212 return address;
213}
214 165
166 descriptor = LDT_DESCRIPTOR(PM_REG_(segment));
167 base_address = SEG_BASE_ADDR(descriptor);
168 address = base_address + offset;
169 limit = base_address
170 + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1;
171 if (limit < base_address)
172 limit = 0xffffffff;
173
174 if (SEG_EXPAND_DOWN(descriptor)) {
175 if (SEG_G_BIT(descriptor))
176 seg_top = 0xffffffff;
177 else {
178 seg_top = base_address + (1 << 20);
179 if (seg_top < base_address)
180 seg_top = 0xffffffff;
181 }
182 access_limit =
183 (address <= limit) || (address >= seg_top) ? 0 :
184 ((seg_top - address) >= 255 ? 255 : seg_top - address);
185 } else {
186 access_limit =
187 (address > limit) || (address < base_address) ? 0 :
188 ((limit - address) >= 254 ? 255 : limit - address + 1);
189 }
190 if (SEG_EXECUTE_ONLY(descriptor) ||
191 (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) {
192 access_limit = 0;
193 }
194 return address;
195}
215 196
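pm_address() above adds the descriptor's base to the offset and then clamps access_limit to how many bytes (at most 255) may be touched before running off the segment, with expand-down segments inverting the valid range. A sketch of the ordinary expand-up check with the descriptor reduced to two plain numbers (not part of this patch):

#include <stdio.h>

/* Returns 0 if the first byte is already outside the segment, otherwise
 * the number of accessible bytes starting at the address, capped at 255. */
static unsigned check_access(unsigned long base, unsigned long size,
			     unsigned long offset)
{
	unsigned long address = base + offset;
	unsigned long limit = base + size - 1;		/* last valid byte */

	if (address > limit || address < base)
		return 0;
	return (limit - address) >= 254 ? 255 : (unsigned)(limit - address + 1);
}

int main(void)
{
	printf("%u\n", check_access(0x10000, 0x1000, 0xff8));	/* 8   */
	printf("%u\n", check_access(0x10000, 0x1000, 0x1000));	/* 0   */
	printf("%u\n", check_access(0x10000, 0x1000, 0x100));	/* 255 */
	return 0;
}
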
216/* 197/*
217 MOD R/M byte: MOD == 3 has a special use for the FPU 198 MOD R/M byte: MOD == 3 has a special use for the FPU
@@ -221,7 +202,6 @@ static long pm_address(u_char FPU_modrm, u_char segment,
221 ..... ......... ......... 202 ..... ......... .........
222 MOD OPCODE(2) R/M 203 MOD OPCODE(2) R/M
223 204
224
225 SIB byte 205 SIB byte
226 206
227 7 6 5 4 3 2 1 0 207 7 6 5 4 3 2 1 0
@@ -231,208 +211,194 @@ static long pm_address(u_char FPU_modrm, u_char segment,
231*/ 211*/
232 212
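A quick worked decode of the fields laid out in the comment above, before FPU_get_address() puts them to use (not part of this patch):

#include <stdio.h>

int main(void)
{
	unsigned char modrm = 0x94;		/* 10 010 100 binary                    */
	unsigned mod    = (modrm >> 6) & 3;	/* 2: a 32-bit displacement will follow */
	unsigned opcode = (modrm >> 3) & 7;	/* 2: the middle (opcode) field         */
	unsigned rm     = modrm & 7;		/* 4: a SIB byte follows                */

	printf("mod=%u opcode=%u rm=%u\n", mod, opcode, rm);
	return 0;
}
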
233void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip, 213void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
234 struct address *addr, 214 struct address *addr, fpu_addr_modes addr_modes)
235 fpu_addr_modes addr_modes) 215{
216 u_char mod;
217 unsigned rm = FPU_modrm & 7;
218 long *cpu_reg_ptr;
219 int address = 0; /* Initialized just to stop compiler warnings. */
220
221 /* Memory accessed via the cs selector is write protected
222 in `non-segmented' 32 bit protected mode. */
223 if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
224 && (addr_modes.override.segment == PREFIX_CS_)) {
225 math_abort(FPU_info, SIGSEGV);
226 }
227
228 addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */
229
230 mod = (FPU_modrm >> 6) & 3;
231
232 if (rm == 4 && mod != 3) {
233 address = sib(mod, fpu_eip);
234 } else {
235 cpu_reg_ptr = &REG_(rm);
236 switch (mod) {
237 case 0:
238 if (rm == 5) {
239 /* Special case: disp32 */
240 RE_ENTRANT_CHECK_OFF;
241 FPU_code_access_ok(4);
242 FPU_get_user(address,
243 (unsigned long __user
244 *)(*fpu_eip));
245 (*fpu_eip) += 4;
246 RE_ENTRANT_CHECK_ON;
247 addr->offset = address;
248 return (void __user *)address;
249 } else {
250 address = *cpu_reg_ptr; /* Just return the contents
251 of the cpu register */
252 addr->offset = address;
253 return (void __user *)address;
254 }
255 case 1:
256 /* 8 bit signed displacement */
257 RE_ENTRANT_CHECK_OFF;
258 FPU_code_access_ok(1);
259 FPU_get_user(address, (signed char __user *)(*fpu_eip));
260 RE_ENTRANT_CHECK_ON;
261 (*fpu_eip)++;
262 break;
263 case 2:
264 /* 32 bit displacement */
265 RE_ENTRANT_CHECK_OFF;
266 FPU_code_access_ok(4);
267 FPU_get_user(address, (long __user *)(*fpu_eip));
268 (*fpu_eip) += 4;
269 RE_ENTRANT_CHECK_ON;
270 break;
271 case 3:
272 /* Not legal for the FPU */
273 EXCEPTION(EX_Invalid);
274 }
275 address += *cpu_reg_ptr;
276 }
277
278 addr->offset = address;
279
280 switch (addr_modes.default_mode) {
281 case 0:
282 break;
283 case VM86:
284 address += vm86_segment(addr_modes.override.segment, addr);
285 break;
286 case PM16:
287 case SEG32:
288 address = pm_address(FPU_modrm, addr_modes.override.segment,
289 addr, address);
290 break;
291 default:
292 EXCEPTION(EX_INTERNAL | 0x133);
293 }
294
295 return (void __user *)address;
296}
297
298void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
299 struct address *addr, fpu_addr_modes addr_modes)
236{ 300{
237 u_char mod; 301 u_char mod;
238 unsigned rm = FPU_modrm & 7; 302 unsigned rm = FPU_modrm & 7;
239 long *cpu_reg_ptr; 303 int address = 0; /* Default used for mod == 0 */
240 int address = 0; /* Initialized just to stop compiler warnings. */ 304
241 305 /* Memory accessed via the cs selector is write protected
242 /* Memory accessed via the cs selector is write protected 306 in `non-segmented' 32 bit protected mode. */
243 in `non-segmented' 32 bit protected mode. */ 307 if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
244 if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) 308 && (addr_modes.override.segment == PREFIX_CS_)) {
245 && (addr_modes.override.segment == PREFIX_CS_) ) 309 math_abort(FPU_info, SIGSEGV);
246 { 310 }
247 math_abort(FPU_info,SIGSEGV); 311
248 } 312 addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */
249 313
250 addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */ 314 mod = (FPU_modrm >> 6) & 3;
251 315
252 mod = (FPU_modrm >> 6) & 3; 316 switch (mod) {
253
254 if (rm == 4 && mod != 3)
255 {
256 address = sib(mod, fpu_eip);
257 }
258 else
259 {
260 cpu_reg_ptr = & REG_(rm);
261 switch (mod)
262 {
263 case 0: 317 case 0:
264 if (rm == 5) 318 if (rm == 6) {
265 { 319 /* Special case: disp16 */
266 /* Special case: disp32 */ 320 RE_ENTRANT_CHECK_OFF;
267 RE_ENTRANT_CHECK_OFF; 321 FPU_code_access_ok(2);
268 FPU_code_access_ok(4); 322 FPU_get_user(address,
269 FPU_get_user(address, (unsigned long __user *) (*fpu_eip)); 323 (unsigned short __user *)(*fpu_eip));
270 (*fpu_eip) += 4; 324 (*fpu_eip) += 2;
271 RE_ENTRANT_CHECK_ON; 325 RE_ENTRANT_CHECK_ON;
272 addr->offset = address; 326 goto add_segment;
273 return (void __user *) address; 327 }
274 } 328 break;
275 else
276 {
277 address = *cpu_reg_ptr; /* Just return the contents
278 of the cpu register */
279 addr->offset = address;
280 return (void __user *) address;
281 }
282 case 1: 329 case 1:
283 /* 8 bit signed displacement */ 330 /* 8 bit signed displacement */
284 RE_ENTRANT_CHECK_OFF; 331 RE_ENTRANT_CHECK_OFF;
285 FPU_code_access_ok(1); 332 FPU_code_access_ok(1);
286 FPU_get_user(address, (signed char __user *) (*fpu_eip)); 333 FPU_get_user(address, (signed char __user *)(*fpu_eip));
287 RE_ENTRANT_CHECK_ON; 334 RE_ENTRANT_CHECK_ON;
288 (*fpu_eip)++; 335 (*fpu_eip)++;
289 break; 336 break;
290 case 2: 337 case 2:
291 /* 32 bit displacement */ 338 /* 16 bit displacement */
292 RE_ENTRANT_CHECK_OFF; 339 RE_ENTRANT_CHECK_OFF;
293 FPU_code_access_ok(4); 340 FPU_code_access_ok(2);
294 FPU_get_user(address, (long __user *) (*fpu_eip)); 341 FPU_get_user(address, (unsigned short __user *)(*fpu_eip));
295 (*fpu_eip) += 4; 342 (*fpu_eip) += 2;
296 RE_ENTRANT_CHECK_ON; 343 RE_ENTRANT_CHECK_ON;
297 break; 344 break;
298 case 3: 345 case 3:
299 /* Not legal for the FPU */ 346 /* Not legal for the FPU */
300 EXCEPTION(EX_Invalid); 347 EXCEPTION(EX_Invalid);
348 break;
349 }
350 switch (rm) {
351 case 0:
352 address += FPU_info->___ebx + FPU_info->___esi;
353 break;
354 case 1:
355 address += FPU_info->___ebx + FPU_info->___edi;
356 break;
357 case 2:
358 address += FPU_info->___ebp + FPU_info->___esi;
359 if (addr_modes.override.segment == PREFIX_DEFAULT)
360 addr_modes.override.segment = PREFIX_SS_;
361 break;
362 case 3:
363 address += FPU_info->___ebp + FPU_info->___edi;
364 if (addr_modes.override.segment == PREFIX_DEFAULT)
365 addr_modes.override.segment = PREFIX_SS_;
366 break;
367 case 4:
368 address += FPU_info->___esi;
369 break;
370 case 5:
371 address += FPU_info->___edi;
372 break;
373 case 6:
374 address += FPU_info->___ebp;
375 if (addr_modes.override.segment == PREFIX_DEFAULT)
376 addr_modes.override.segment = PREFIX_SS_;
377 break;
378 case 7:
379 address += FPU_info->___ebx;
380 break;
301 } 381 }
302 address += *cpu_reg_ptr;
303 }
304
305 addr->offset = address;
306
307 switch ( addr_modes.default_mode )
308 {
309 case 0:
310 break;
311 case VM86:
312 address += vm86_segment(addr_modes.override.segment, addr);
313 break;
314 case PM16:
315 case SEG32:
316 address = pm_address(FPU_modrm, addr_modes.override.segment,
317 addr, address);
318 break;
319 default:
320 EXCEPTION(EX_INTERNAL|0x133);
321 }
322
323 return (void __user *)address;
324}
325 382
383 add_segment:
384 address &= 0xffff;
326 385
327void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, 386 addr->offset = address;
328 struct address *addr, 387
329 fpu_addr_modes addr_modes) 388 switch (addr_modes.default_mode) {
330{ 389 case 0:
331 u_char mod; 390 break;
332 unsigned rm = FPU_modrm & 7; 391 case VM86:
333 int address = 0; /* Default used for mod == 0 */ 392 address += vm86_segment(addr_modes.override.segment, addr);
334 393 break;
335 /* Memory accessed via the cs selector is write protected 394 case PM16:
336 in `non-segmented' 32 bit protected mode. */ 395 case SEG32:
337 if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) 396 address = pm_address(FPU_modrm, addr_modes.override.segment,
338 && (addr_modes.override.segment == PREFIX_CS_) ) 397 addr, address);
339 { 398 break;
340 math_abort(FPU_info,SIGSEGV); 399 default:
341 } 400 EXCEPTION(EX_INTERNAL | 0x131);
342
343 addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */
344
345 mod = (FPU_modrm >> 6) & 3;
346
347 switch (mod)
348 {
349 case 0:
350 if (rm == 6)
351 {
352 /* Special case: disp16 */
353 RE_ENTRANT_CHECK_OFF;
354 FPU_code_access_ok(2);
355 FPU_get_user(address, (unsigned short __user *) (*fpu_eip));
356 (*fpu_eip) += 2;
357 RE_ENTRANT_CHECK_ON;
358 goto add_segment;
359 } 401 }
360 break; 402
361 case 1: 403 return (void __user *)address;
362 /* 8 bit signed displacement */
363 RE_ENTRANT_CHECK_OFF;
364 FPU_code_access_ok(1);
365 FPU_get_user(address, (signed char __user *) (*fpu_eip));
366 RE_ENTRANT_CHECK_ON;
367 (*fpu_eip)++;
368 break;
369 case 2:
370 /* 16 bit displacement */
371 RE_ENTRANT_CHECK_OFF;
372 FPU_code_access_ok(2);
373 FPU_get_user(address, (unsigned short __user *) (*fpu_eip));
374 (*fpu_eip) += 2;
375 RE_ENTRANT_CHECK_ON;
376 break;
377 case 3:
378 /* Not legal for the FPU */
379 EXCEPTION(EX_Invalid);
380 break;
381 }
382 switch ( rm )
383 {
384 case 0:
385 address += FPU_info->___ebx + FPU_info->___esi;
386 break;
387 case 1:
388 address += FPU_info->___ebx + FPU_info->___edi;
389 break;
390 case 2:
391 address += FPU_info->___ebp + FPU_info->___esi;
392 if ( addr_modes.override.segment == PREFIX_DEFAULT )
393 addr_modes.override.segment = PREFIX_SS_;
394 break;
395 case 3:
396 address += FPU_info->___ebp + FPU_info->___edi;
397 if ( addr_modes.override.segment == PREFIX_DEFAULT )
398 addr_modes.override.segment = PREFIX_SS_;
399 break;
400 case 4:
401 address += FPU_info->___esi;
402 break;
403 case 5:
404 address += FPU_info->___edi;
405 break;
406 case 6:
407 address += FPU_info->___ebp;
408 if ( addr_modes.override.segment == PREFIX_DEFAULT )
409 addr_modes.override.segment = PREFIX_SS_;
410 break;
411 case 7:
412 address += FPU_info->___ebx;
413 break;
414 }
415
416 add_segment:
417 address &= 0xffff;
418
419 addr->offset = address;
420
421 switch ( addr_modes.default_mode )
422 {
423 case 0:
424 break;
425 case VM86:
426 address += vm86_segment(addr_modes.override.segment, addr);
427 break;
428 case PM16:
429 case SEG32:
430 address = pm_address(FPU_modrm, addr_modes.override.segment,
431 addr, address);
432 break;
433 default:
434 EXCEPTION(EX_INTERNAL|0x131);
435 }
436
437 return (void __user *)address ;
438} 404}
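
The two decoders above differ only in how mod/rm selects a base: the 32-bit form indexes a general register (plus an optional SIB byte), while the 16-bit form combines BX/BP with SI/DI and wraps the result at 64K. A minimal sketch of the 16-bit mapping, using a hypothetical helper ea16() that takes the register values as parameters (the emulator reads them from FPU_info and also switches the default segment to SS whenever BP is involved):

	/* Sketch only, not part of the patch: 16-bit modrm effective address. */
	static unsigned short ea16(unsigned char modrm, unsigned short disp,
				   unsigned short bx, unsigned short bp,
				   unsigned short si, unsigned short di)
	{
		unsigned short base;

		switch (modrm & 7) {
		case 0: base = bx + si; break;
		case 1: base = bx + di; break;
		case 2: base = bp + si; break;	/* SS is the default segment */
		case 3: base = bp + di; break;	/* SS is the default segment */
		case 4: base = si; break;
		case 5: base = di; break;
		case 6: base = bp; break;	/* plain disp16 instead when mod == 0 */
		default: base = bx; break;
		}
		return (unsigned short)(base + disp);	/* wraps at 64K, cf. "address &= 0xffff" */
	}
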
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
index eebd6fb1c8a..2931ff35521 100644
--- a/arch/x86/math-emu/load_store.c
+++ b/arch/x86/math-emu/load_store.c
@@ -26,247 +26,257 @@
26#include "status_w.h" 26#include "status_w.h"
27#include "control_w.h" 27#include "control_w.h"
28 28
29 29#define _NONE_ 0 /* st0_ptr etc not needed */
30#define _NONE_ 0 /* st0_ptr etc not needed */ 30#define _REG0_ 1 /* Will be storing st(0) */
31#define _REG0_ 1 /* Will be storing st(0) */ 31#define _PUSH_ 3 /* Need to check for space to push onto stack */
32#define _PUSH_ 3 /* Need to check for space to push onto stack */ 32#define _null_ 4 /* Function illegal or not implemented */
33#define _null_ 4 /* Function illegal or not implemented */
34 33
35#define pop_0() { FPU_settag0(TAG_Empty); top++; } 34#define pop_0() { FPU_settag0(TAG_Empty); top++; }
36 35
37
38static u_char const type_table[32] = { 36static u_char const type_table[32] = {
39 _PUSH_, _PUSH_, _PUSH_, _PUSH_, 37 _PUSH_, _PUSH_, _PUSH_, _PUSH_,
40 _null_, _null_, _null_, _null_, 38 _null_, _null_, _null_, _null_,
41 _REG0_, _REG0_, _REG0_, _REG0_, 39 _REG0_, _REG0_, _REG0_, _REG0_,
42 _REG0_, _REG0_, _REG0_, _REG0_, 40 _REG0_, _REG0_, _REG0_, _REG0_,
43 _NONE_, _null_, _NONE_, _PUSH_, 41 _NONE_, _null_, _NONE_, _PUSH_,
44 _NONE_, _PUSH_, _null_, _PUSH_, 42 _NONE_, _PUSH_, _null_, _PUSH_,
45 _NONE_, _null_, _NONE_, _REG0_, 43 _NONE_, _null_, _NONE_, _REG0_,
46 _NONE_, _REG0_, _NONE_, _REG0_ 44 _NONE_, _REG0_, _NONE_, _REG0_
47 }; 45};
48 46
49u_char const data_sizes_16[32] = { 47u_char const data_sizes_16[32] = {
50 4, 4, 8, 2, 0, 0, 0, 0, 48 4, 4, 8, 2, 0, 0, 0, 0,
51 4, 4, 8, 2, 4, 4, 8, 2, 49 4, 4, 8, 2, 4, 4, 8, 2,
52 14, 0, 94, 10, 2, 10, 0, 8, 50 14, 0, 94, 10, 2, 10, 0, 8,
53 14, 0, 94, 10, 2, 10, 2, 8 51 14, 0, 94, 10, 2, 10, 2, 8
54}; 52};
55 53
56static u_char const data_sizes_32[32] = { 54static u_char const data_sizes_32[32] = {
57 4, 4, 8, 2, 0, 0, 0, 0, 55 4, 4, 8, 2, 0, 0, 0, 0,
58 4, 4, 8, 2, 4, 4, 8, 2, 56 4, 4, 8, 2, 4, 4, 8, 2,
59 28, 0,108, 10, 2, 10, 0, 8, 57 28, 0, 108, 10, 2, 10, 0, 8,
60 28, 0,108, 10, 2, 10, 2, 8 58 28, 0, 108, 10, 2, 10, 2, 8
61}; 59};
62 60
63int FPU_load_store(u_char type, fpu_addr_modes addr_modes, 61int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
64 void __user *data_address) 62 void __user * data_address)
65{ 63{
66 FPU_REG loaded_data; 64 FPU_REG loaded_data;
67 FPU_REG *st0_ptr; 65 FPU_REG *st0_ptr;
68 u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */ 66 u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */
69 u_char loaded_tag; 67 u_char loaded_tag;
70 68
71 st0_ptr = NULL; /* Initialized just to stop compiler warnings. */ 69 st0_ptr = NULL; /* Initialized just to stop compiler warnings. */
72 70
73 if ( addr_modes.default_mode & PROTECTED ) 71 if (addr_modes.default_mode & PROTECTED) {
74 { 72 if (addr_modes.default_mode == SEG32) {
75 if ( addr_modes.default_mode == SEG32 ) 73 if (access_limit < data_sizes_32[type])
76 { 74 math_abort(FPU_info, SIGSEGV);
77 if ( access_limit < data_sizes_32[type] ) 75 } else if (addr_modes.default_mode == PM16) {
78 math_abort(FPU_info,SIGSEGV); 76 if (access_limit < data_sizes_16[type])
79 } 77 math_abort(FPU_info, SIGSEGV);
80 else if ( addr_modes.default_mode == PM16 ) 78 }
81 {
82 if ( access_limit < data_sizes_16[type] )
83 math_abort(FPU_info,SIGSEGV);
84 }
85#ifdef PARANOID 79#ifdef PARANOID
86 else 80 else
87 EXCEPTION(EX_INTERNAL|0x140); 81 EXCEPTION(EX_INTERNAL | 0x140);
88#endif /* PARANOID */ 82#endif /* PARANOID */
89 } 83 }
90 84
91 switch ( type_table[type] ) 85 switch (type_table[type]) {
92 { 86 case _NONE_:
93 case _NONE_: 87 break;
94 break; 88 case _REG0_:
95 case _REG0_: 89 st0_ptr = &st(0); /* Some of these instructions pop after
96 st0_ptr = &st(0); /* Some of these instructions pop after 90 storing */
97 storing */ 91 st0_tag = FPU_gettag0();
98 st0_tag = FPU_gettag0(); 92 break;
99 break; 93 case _PUSH_:
100 case _PUSH_: 94 {
101 { 95 if (FPU_gettagi(-1) != TAG_Empty) {
102 if ( FPU_gettagi(-1) != TAG_Empty ) 96 FPU_stack_overflow();
103 { FPU_stack_overflow(); return 0; } 97 return 0;
104 top--; 98 }
105 st0_ptr = &st(0); 99 top--;
106 } 100 st0_ptr = &st(0);
107 break; 101 }
108 case _null_: 102 break;
109 FPU_illegal(); 103 case _null_:
110 return 0; 104 FPU_illegal();
105 return 0;
111#ifdef PARANOID 106#ifdef PARANOID
112 default: 107 default:
113 EXCEPTION(EX_INTERNAL|0x141); 108 EXCEPTION(EX_INTERNAL | 0x141);
114 return 0; 109 return 0;
115#endif /* PARANOID */ 110#endif /* PARANOID */
116 }
117
118 switch ( type )
119 {
120 case 000: /* fld m32real */
121 clear_C1();
122 loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data);
123 if ( (loaded_tag == TAG_Special)
124 && isNaN(&loaded_data)
125 && (real_1op_NaN(&loaded_data) < 0) )
126 {
127 top++;
128 break;
129 }
130 FPU_copy_to_reg0(&loaded_data, loaded_tag);
131 break;
132 case 001: /* fild m32int */
133 clear_C1();
134 loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data);
135 FPU_copy_to_reg0(&loaded_data, loaded_tag);
136 break;
137 case 002: /* fld m64real */
138 clear_C1();
139 loaded_tag = FPU_load_double((double __user *)data_address, &loaded_data);
140 if ( (loaded_tag == TAG_Special)
141 && isNaN(&loaded_data)
142 && (real_1op_NaN(&loaded_data) < 0) )
143 {
144 top++;
145 break;
146 } 111 }
147 FPU_copy_to_reg0(&loaded_data, loaded_tag); 112
148 break; 113 switch (type) {
149 case 003: /* fild m16int */ 114 case 000: /* fld m32real */
150 clear_C1(); 115 clear_C1();
151 loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); 116 loaded_tag =
152 FPU_copy_to_reg0(&loaded_data, loaded_tag); 117 FPU_load_single((float __user *)data_address, &loaded_data);
153 break; 118 if ((loaded_tag == TAG_Special)
154 case 010: /* fst m32real */ 119 && isNaN(&loaded_data)
155 clear_C1(); 120 && (real_1op_NaN(&loaded_data) < 0)) {
156 FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address); 121 top++;
157 break; 122 break;
158 case 011: /* fist m32int */ 123 }
159 clear_C1(); 124 FPU_copy_to_reg0(&loaded_data, loaded_tag);
160 FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address); 125 break;
161 break; 126 case 001: /* fild m32int */
162 case 012: /* fst m64real */ 127 clear_C1();
163 clear_C1(); 128 loaded_tag =
164 FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address); 129 FPU_load_int32((long __user *)data_address, &loaded_data);
165 break; 130 FPU_copy_to_reg0(&loaded_data, loaded_tag);
166 case 013: /* fist m16int */ 131 break;
167 clear_C1(); 132 case 002: /* fld m64real */
168 FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address); 133 clear_C1();
169 break; 134 loaded_tag =
170 case 014: /* fstp m32real */ 135 FPU_load_double((double __user *)data_address,
171 clear_C1(); 136 &loaded_data);
172 if ( FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address) ) 137 if ((loaded_tag == TAG_Special)
173 pop_0(); /* pop only if the number was actually stored 138 && isNaN(&loaded_data)
174 (see the 80486 manual p16-28) */ 139 && (real_1op_NaN(&loaded_data) < 0)) {
175 break; 140 top++;
176 case 015: /* fistp m32int */ 141 break;
177 clear_C1(); 142 }
178 if ( FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address) ) 143 FPU_copy_to_reg0(&loaded_data, loaded_tag);
179 pop_0(); /* pop only if the number was actually stored 144 break;
180 (see the 80486 manual p16-28) */ 145 case 003: /* fild m16int */
181 break; 146 clear_C1();
182 case 016: /* fstp m64real */ 147 loaded_tag =
183 clear_C1(); 148 FPU_load_int16((short __user *)data_address, &loaded_data);
184 if ( FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address) ) 149 FPU_copy_to_reg0(&loaded_data, loaded_tag);
185 pop_0(); /* pop only if the number was actually stored 150 break;
186 (see the 80486 manual p16-28) */ 151 case 010: /* fst m32real */
187 break; 152 clear_C1();
188 case 017: /* fistp m16int */ 153 FPU_store_single(st0_ptr, st0_tag,
189 clear_C1(); 154 (float __user *)data_address);
190 if ( FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address) ) 155 break;
191 pop_0(); /* pop only if the number was actually stored 156 case 011: /* fist m32int */
192 (see the 80486 manual p16-28) */ 157 clear_C1();
193 break; 158 FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address);
194 case 020: /* fldenv m14/28byte */ 159 break;
195 fldenv(addr_modes, (u_char __user *)data_address); 160 case 012: /* fst m64real */
196 /* Ensure that the values just loaded are not changed by 161 clear_C1();
197 fix-up operations. */ 162 FPU_store_double(st0_ptr, st0_tag,
198 return 1; 163 (double __user *)data_address);
199 case 022: /* frstor m94/108byte */ 164 break;
200 frstor(addr_modes, (u_char __user *)data_address); 165 case 013: /* fist m16int */
201 /* Ensure that the values just loaded are not changed by 166 clear_C1();
202 fix-up operations. */ 167 FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address);
203 return 1; 168 break;
204 case 023: /* fbld m80dec */ 169 case 014: /* fstp m32real */
205 clear_C1(); 170 clear_C1();
206 loaded_tag = FPU_load_bcd((u_char __user *)data_address); 171 if (FPU_store_single
207 FPU_settag0(loaded_tag); 172 (st0_ptr, st0_tag, (float __user *)data_address))
208 break; 173 pop_0(); /* pop only if the number was actually stored
209 case 024: /* fldcw */ 174 (see the 80486 manual p16-28) */
210 RE_ENTRANT_CHECK_OFF; 175 break;
211 FPU_access_ok(VERIFY_READ, data_address, 2); 176 case 015: /* fistp m32int */
212 FPU_get_user(control_word, (unsigned short __user *) data_address); 177 clear_C1();
213 RE_ENTRANT_CHECK_ON; 178 if (FPU_store_int32
214 if ( partial_status & ~control_word & CW_Exceptions ) 179 (st0_ptr, st0_tag, (long __user *)data_address))
215 partial_status |= (SW_Summary | SW_Backward); 180 pop_0(); /* pop only if the number was actually stored
216 else 181 (see the 80486 manual p16-28) */
217 partial_status &= ~(SW_Summary | SW_Backward); 182 break;
183 case 016: /* fstp m64real */
184 clear_C1();
185 if (FPU_store_double
186 (st0_ptr, st0_tag, (double __user *)data_address))
187 pop_0(); /* pop only if the number was actually stored
188 (see the 80486 manual p16-28) */
189 break;
190 case 017: /* fistp m16int */
191 clear_C1();
192 if (FPU_store_int16
193 (st0_ptr, st0_tag, (short __user *)data_address))
194 pop_0(); /* pop only if the number was actually stored
195 (see the 80486 manual p16-28) */
196 break;
197 case 020: /* fldenv m14/28byte */
198 fldenv(addr_modes, (u_char __user *) data_address);
199 /* Ensure that the values just loaded are not changed by
200 fix-up operations. */
201 return 1;
202 case 022: /* frstor m94/108byte */
203 frstor(addr_modes, (u_char __user *) data_address);
204 /* Ensure that the values just loaded are not changed by
205 fix-up operations. */
206 return 1;
207 case 023: /* fbld m80dec */
208 clear_C1();
209 loaded_tag = FPU_load_bcd((u_char __user *) data_address);
210 FPU_settag0(loaded_tag);
211 break;
212 case 024: /* fldcw */
213 RE_ENTRANT_CHECK_OFF;
214 FPU_access_ok(VERIFY_READ, data_address, 2);
215 FPU_get_user(control_word,
216 (unsigned short __user *)data_address);
217 RE_ENTRANT_CHECK_ON;
218 if (partial_status & ~control_word & CW_Exceptions)
219 partial_status |= (SW_Summary | SW_Backward);
220 else
221 partial_status &= ~(SW_Summary | SW_Backward);
218#ifdef PECULIAR_486 222#ifdef PECULIAR_486
219 control_word |= 0x40; /* An 80486 appears to always set this bit */ 223 control_word |= 0x40; /* An 80486 appears to always set this bit */
220#endif /* PECULIAR_486 */ 224#endif /* PECULIAR_486 */
221 return 1; 225 return 1;
222 case 025: /* fld m80real */ 226 case 025: /* fld m80real */
223 clear_C1(); 227 clear_C1();
224 loaded_tag = FPU_load_extended((long double __user *)data_address, 0); 228 loaded_tag =
225 FPU_settag0(loaded_tag); 229 FPU_load_extended((long double __user *)data_address, 0);
226 break; 230 FPU_settag0(loaded_tag);
227 case 027: /* fild m64int */ 231 break;
228 clear_C1(); 232 case 027: /* fild m64int */
229 loaded_tag = FPU_load_int64((long long __user *)data_address); 233 clear_C1();
230 if (loaded_tag == TAG_Error) 234 loaded_tag = FPU_load_int64((long long __user *)data_address);
235 if (loaded_tag == TAG_Error)
236 return 0;
237 FPU_settag0(loaded_tag);
238 break;
239 case 030: /* fstenv m14/28byte */
240 fstenv(addr_modes, (u_char __user *) data_address);
241 return 1;
242 case 032: /* fsave */
243 fsave(addr_modes, (u_char __user *) data_address);
244 return 1;
245 case 033: /* fbstp m80dec */
246 clear_C1();
247 if (FPU_store_bcd
248 (st0_ptr, st0_tag, (u_char __user *) data_address))
249 pop_0(); /* pop only if the number was actually stored
250 (see the 80486 manual p16-28) */
251 break;
252 case 034: /* fstcw m16int */
253 RE_ENTRANT_CHECK_OFF;
254 FPU_access_ok(VERIFY_WRITE, data_address, 2);
255 FPU_put_user(control_word,
256 (unsigned short __user *)data_address);
257 RE_ENTRANT_CHECK_ON;
258 return 1;
259 case 035: /* fstp m80real */
260 clear_C1();
261 if (FPU_store_extended
262 (st0_ptr, st0_tag, (long double __user *)data_address))
263 pop_0(); /* pop only if the number was actually stored
264 (see the 80486 manual p16-28) */
265 break;
266 case 036: /* fstsw m2byte */
267 RE_ENTRANT_CHECK_OFF;
268 FPU_access_ok(VERIFY_WRITE, data_address, 2);
269 FPU_put_user(status_word(),
270 (unsigned short __user *)data_address);
271 RE_ENTRANT_CHECK_ON;
272 return 1;
273 case 037: /* fistp m64int */
274 clear_C1();
275 if (FPU_store_int64
276 (st0_ptr, st0_tag, (long long __user *)data_address))
277 pop_0(); /* pop only if the number was actually stored
278 (see the 80486 manual p16-28) */
279 break;
280 }
231 return 0; 281 return 0;
232 FPU_settag0(loaded_tag);
233 break;
234 case 030: /* fstenv m14/28byte */
235 fstenv(addr_modes, (u_char __user *)data_address);
236 return 1;
237 case 032: /* fsave */
238 fsave(addr_modes, (u_char __user *)data_address);
239 return 1;
240 case 033: /* fbstp m80dec */
241 clear_C1();
242 if ( FPU_store_bcd(st0_ptr, st0_tag, (u_char __user *)data_address) )
243 pop_0(); /* pop only if the number was actually stored
244 (see the 80486 manual p16-28) */
245 break;
246 case 034: /* fstcw m16int */
247 RE_ENTRANT_CHECK_OFF;
248 FPU_access_ok(VERIFY_WRITE,data_address,2);
249 FPU_put_user(control_word, (unsigned short __user *) data_address);
250 RE_ENTRANT_CHECK_ON;
251 return 1;
252 case 035: /* fstp m80real */
253 clear_C1();
254 if ( FPU_store_extended(st0_ptr, st0_tag, (long double __user *)data_address) )
255 pop_0(); /* pop only if the number was actually stored
256 (see the 80486 manual p16-28) */
257 break;
258 case 036: /* fstsw m2byte */
259 RE_ENTRANT_CHECK_OFF;
260 FPU_access_ok(VERIFY_WRITE,data_address,2);
261 FPU_put_user(status_word(),(unsigned short __user *) data_address);
262 RE_ENTRANT_CHECK_ON;
263 return 1;
264 case 037: /* fistp m64int */
265 clear_C1();
266 if ( FPU_store_int64(st0_ptr, st0_tag, (long long __user *)data_address) )
267 pop_0(); /* pop only if the number was actually stored
268 (see the 80486 manual p16-28) */
269 break;
270 }
271 return 0;
272} 282}
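
FPU_load_store() is driven by two small tables: type_table classifies the 5-bit type (no stack access, store from st(0), push, or illegal), and data_sizes_16/data_sizes_32 give the memory operand size used for the protected-mode limit check. A sketch of that check in isolation, with the sizes copied from the 16-bit table above and access_limit passed in as a parameter rather than taken from the emulator's global:

	/* Sketch only: the limit check performed before a 16-bit-mode access. */
	static const unsigned char sizes16[32] = {
		4, 4, 8, 2, 0, 0, 0, 0,
		4, 4, 8, 2, 4, 4, 8, 2,
		14, 0, 94, 10, 2, 10, 0, 8,
		14, 0, 94, 10, 2, 10, 2, 8
	};

	static int fits_in_segment(unsigned int type, unsigned int access_limit)
	{
		/* e.g. type 025 (fld m80real) needs 10 bytes, 022 (frstor) needs 94 */
		return access_limit >= sizes16[type & 31];
	}
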
diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h
index 4db79811492..168eb44c93c 100644
--- a/arch/x86/math-emu/poly.h
+++ b/arch/x86/math-emu/poly.h
@@ -21,9 +21,9 @@
21 allows. 9-byte would probably be sufficient. 21 allows. 9-byte would probably be sufficient.
22 */ 22 */
23typedef struct { 23typedef struct {
24 unsigned long lsw; 24 unsigned long lsw;
25 unsigned long midw; 25 unsigned long midw;
26 unsigned long msw; 26 unsigned long msw;
27} Xsig; 27} Xsig;
28 28
29asmlinkage void mul64(unsigned long long const *a, unsigned long long const *b, 29asmlinkage void mul64(unsigned long long const *a, unsigned long long const *b,
@@ -49,7 +49,6 @@ asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
49/* Macro to access the 8 ms bytes of an Xsig as a long long */ 49/* Macro to access the 8 ms bytes of an Xsig as a long long */
50#define XSIG_LL(x) (*(unsigned long long *)&x.midw) 50#define XSIG_LL(x) (*(unsigned long long *)&x.midw)
51 51
52
53/* 52/*
54 Need to run gcc with optimizations on to get these to 53 Need to run gcc with optimizations on to get these to
55 actually be in-line. 54 actually be in-line.
@@ -63,59 +62,53 @@ asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
63static inline unsigned long mul_32_32(const unsigned long arg1, 62static inline unsigned long mul_32_32(const unsigned long arg1,
64 const unsigned long arg2) 63 const unsigned long arg2)
65{ 64{
66 int retval; 65 int retval;
67 asm volatile ("mull %2; movl %%edx,%%eax" \ 66 asm volatile ("mull %2; movl %%edx,%%eax":"=a" (retval)
68 :"=a" (retval) \ 67 :"0"(arg1), "g"(arg2)
69 :"0" (arg1), "g" (arg2) \ 68 :"dx");
70 :"dx"); 69 return retval;
71 return retval;
72} 70}
73 71
74
75/* Add the 12 byte Xsig x2 to Xsig dest, with no checks for overflow. */ 72/* Add the 12 byte Xsig x2 to Xsig dest, with no checks for overflow. */
76static inline void add_Xsig_Xsig(Xsig *dest, const Xsig *x2) 73static inline void add_Xsig_Xsig(Xsig *dest, const Xsig *x2)
77{ 74{
78 asm volatile ("movl %1,%%edi; movl %2,%%esi;\n" 75 asm volatile ("movl %1,%%edi; movl %2,%%esi;\n"
79 "movl (%%esi),%%eax; addl %%eax,(%%edi);\n" 76 "movl (%%esi),%%eax; addl %%eax,(%%edi);\n"
80 "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n" 77 "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n"
81 "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n" 78 "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n":"=g"
82 :"=g" (*dest):"g" (dest), "g" (x2) 79 (*dest):"g"(dest), "g"(x2)
83 :"ax","si","di"); 80 :"ax", "si", "di");
84} 81}
85 82
86
87/* Add the 12 byte Xsig x2 to Xsig dest, adjust exp if overflow occurs. */ 83/* Add the 12 byte Xsig x2 to Xsig dest, adjust exp if overflow occurs. */
88/* Note: the constraints in the asm statement didn't always work properly 84/* Note: the constraints in the asm statement didn't always work properly
89 with gcc 2.5.8. Changing from using edi to using ecx got around the 85 with gcc 2.5.8. Changing from using edi to using ecx got around the
90 problem, but keep fingers crossed! */ 86 problem, but keep fingers crossed! */
91static inline void add_two_Xsig(Xsig *dest, const Xsig *x2, long int *exp) 87static inline void add_two_Xsig(Xsig *dest, const Xsig *x2, long int *exp)
92{ 88{
93 asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n" 89 asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n"
94 "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n" 90 "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n"
95 "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n" 91 "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n"
96 "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n" 92 "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n"
97 "jnc 0f;\n" 93 "jnc 0f;\n"
98 "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n" 94 "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n"
99 "movl %4,%%ecx; incl (%%ecx)\n" 95 "movl %4,%%ecx; incl (%%ecx)\n"
100 "movl $1,%%eax; jmp 1f;\n" 96 "movl $1,%%eax; jmp 1f;\n"
101 "0: xorl %%eax,%%eax;\n" 97 "0: xorl %%eax,%%eax;\n" "1:\n":"=g" (*exp), "=g"(*dest)
102 "1:\n" 98 :"g"(dest), "g"(x2), "g"(exp)
103 :"=g" (*exp), "=g" (*dest) 99 :"cx", "si", "ax");
104 :"g" (dest), "g" (x2), "g" (exp)
105 :"cx","si","ax");
106} 100}
107 101
108
109/* Negate (subtract from 1.0) the 12 byte Xsig */ 102/* Negate (subtract from 1.0) the 12 byte Xsig */
110/* This is faster in a loop on my 386 than using the "neg" instruction. */ 103/* This is faster in a loop on my 386 than using the "neg" instruction. */
111static inline void negate_Xsig(Xsig *x) 104static inline void negate_Xsig(Xsig *x)
112{ 105{
113 asm volatile("movl %1,%%esi;\n" 106 asm volatile ("movl %1,%%esi;\n"
114 "xorl %%ecx,%%ecx;\n" 107 "xorl %%ecx,%%ecx;\n"
115 "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n" 108 "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n"
116 "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n" 109 "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n"
117 "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n" 110 "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n":"=g"
118 :"=g" (*x):"g" (x):"si","ax","cx"); 111 (*x):"g"(x):"si", "ax", "cx");
119} 112}
120 113
121#endif /* _POLY_H */ 114#endif /* _POLY_H */
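
The inline asm above operates on Xsig, a 96-bit fixed-point significand held in three 32-bit words. What add_Xsig_Xsig() computes can be written in portable C as a three-word add with carry propagation (a sketch only; the real version stays in asm so gcc keeps it in-line on i386):

	struct xsig96 { unsigned int lsw, midw, msw; };

	/* Add x2 to dest, propagating carries; overflow out of msw is ignored,
	   matching the "no checks for overflow" comment above. */
	static void xsig96_add(struct xsig96 *dest, const struct xsig96 *x2)
	{
		unsigned long long t;

		t = (unsigned long long)dest->lsw + x2->lsw;
		dest->lsw = (unsigned int)t;
		t = (unsigned long long)dest->midw + x2->midw + (t >> 32);
		dest->midw = (unsigned int)t;
		dest->msw += x2->msw + (unsigned int)(t >> 32);
	}
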
diff --git a/arch/x86/math-emu/poly_2xm1.c b/arch/x86/math-emu/poly_2xm1.c
index 9766ad5e974..b00e9e10cdc 100644
--- a/arch/x86/math-emu/poly_2xm1.c
+++ b/arch/x86/math-emu/poly_2xm1.c
@@ -17,21 +17,19 @@
17#include "control_w.h" 17#include "control_w.h"
18#include "poly.h" 18#include "poly.h"
19 19
20
21#define HIPOWER 11 20#define HIPOWER 11
22static const unsigned long long lterms[HIPOWER] = 21static const unsigned long long lterms[HIPOWER] = {
23{ 22 0x0000000000000000LL, /* This term done separately as 12 bytes */
24 0x0000000000000000LL, /* This term done separately as 12 bytes */ 23 0xf5fdeffc162c7543LL,
25 0xf5fdeffc162c7543LL, 24 0x1c6b08d704a0bfa6LL,
26 0x1c6b08d704a0bfa6LL, 25 0x0276556df749cc21LL,
27 0x0276556df749cc21LL, 26 0x002bb0ffcf14f6b8LL,
28 0x002bb0ffcf14f6b8LL, 27 0x0002861225ef751cLL,
29 0x0002861225ef751cLL, 28 0x00001ffcbfcd5422LL,
30 0x00001ffcbfcd5422LL, 29 0x00000162c005d5f1LL,
31 0x00000162c005d5f1LL, 30 0x0000000da96ccb1bLL,
32 0x0000000da96ccb1bLL, 31 0x0000000078d1b897LL,
33 0x0000000078d1b897LL, 32 0x000000000422b029LL
34 0x000000000422b029LL
35}; 33};
36 34
37static const Xsig hiterm = MK_XSIG(0xb17217f7, 0xd1cf79ab, 0xc8a39194); 35static const Xsig hiterm = MK_XSIG(0xb17217f7, 0xd1cf79ab, 0xc8a39194);
@@ -45,112 +43,103 @@ static const Xsig shiftterm2 = MK_XSIG(0xb504f333, 0xf9de6484, 0x597d89b3);
45static const Xsig shiftterm3 = MK_XSIG(0xd744fcca, 0xd69d6af4, 0x39a68bb9); 43static const Xsig shiftterm3 = MK_XSIG(0xd744fcca, 0xd69d6af4, 0x39a68bb9);
46 44
47static const Xsig *shiftterm[] = { &shiftterm0, &shiftterm1, 45static const Xsig *shiftterm[] = { &shiftterm0, &shiftterm1,
48 &shiftterm2, &shiftterm3 }; 46 &shiftterm2, &shiftterm3
49 47};
50 48
51/*--- poly_2xm1() -----------------------------------------------------------+ 49/*--- poly_2xm1() -----------------------------------------------------------+
52 | Requires st(0) which is TAG_Valid and < 1. | 50 | Requires st(0) which is TAG_Valid and < 1. |
53 +---------------------------------------------------------------------------*/ 51 +---------------------------------------------------------------------------*/
54int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result) 52int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result)
55{ 53{
56 long int exponent, shift; 54 long int exponent, shift;
57 unsigned long long Xll; 55 unsigned long long Xll;
58 Xsig accumulator, Denom, argSignif; 56 Xsig accumulator, Denom, argSignif;
59 u_char tag; 57 u_char tag;
60 58
61 exponent = exponent16(arg); 59 exponent = exponent16(arg);
62 60
63#ifdef PARANOID 61#ifdef PARANOID
64 if ( exponent >= 0 ) /* Don't want a |number| >= 1.0 */ 62 if (exponent >= 0) { /* Don't want a |number| >= 1.0 */
65 { 63 /* Number negative, too large, or not Valid. */
66 /* Number negative, too large, or not Valid. */ 64 EXCEPTION(EX_INTERNAL | 0x127);
67 EXCEPTION(EX_INTERNAL|0x127); 65 return 1;
68 return 1; 66 }
69 }
70#endif /* PARANOID */ 67#endif /* PARANOID */
71 68
72 argSignif.lsw = 0; 69 argSignif.lsw = 0;
73 XSIG_LL(argSignif) = Xll = significand(arg); 70 XSIG_LL(argSignif) = Xll = significand(arg);
74 71
75 if ( exponent == -1 ) 72 if (exponent == -1) {
76 { 73 shift = (argSignif.msw & 0x40000000) ? 3 : 2;
77 shift = (argSignif.msw & 0x40000000) ? 3 : 2; 74 /* subtract 0.5 or 0.75 */
78 /* subtract 0.5 or 0.75 */ 75 exponent -= 2;
79 exponent -= 2; 76 XSIG_LL(argSignif) <<= 2;
80 XSIG_LL(argSignif) <<= 2; 77 Xll <<= 2;
81 Xll <<= 2; 78 } else if (exponent == -2) {
82 } 79 shift = 1;
83 else if ( exponent == -2 ) 80 /* subtract 0.25 */
84 { 81 exponent--;
85 shift = 1; 82 XSIG_LL(argSignif) <<= 1;
86 /* subtract 0.25 */ 83 Xll <<= 1;
87 exponent--; 84 } else
88 XSIG_LL(argSignif) <<= 1; 85 shift = 0;
89 Xll <<= 1; 86
90 } 87 if (exponent < -2) {
91 else 88 /* Shift the argument right by the required places. */
92 shift = 0; 89 if (FPU_shrx(&Xll, -2 - exponent) >= 0x80000000U)
93 90 Xll++; /* round up */
94 if ( exponent < -2 ) 91 }
95 { 92
96 /* Shift the argument right by the required places. */ 93 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
97 if ( FPU_shrx(&Xll, -2-exponent) >= 0x80000000U ) 94 polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER - 1);
98 Xll++; /* round up */ 95 mul_Xsig_Xsig(&accumulator, &argSignif);
99 } 96 shr_Xsig(&accumulator, 3);
100 97
101 accumulator.lsw = accumulator.midw = accumulator.msw = 0; 98 mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */
102 polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER-1); 99 add_two_Xsig(&accumulator, &argSignif, &exponent);
103 mul_Xsig_Xsig(&accumulator, &argSignif); 100
104 shr_Xsig(&accumulator, 3); 101 if (shift) {
105 102 /* The argument is large, use the identity:
106 mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */ 103 f(x+a) = f(a) * (f(x) + 1) - 1;
107 add_two_Xsig(&accumulator, &argSignif, &exponent); 104 */
108 105 shr_Xsig(&accumulator, -exponent);
109 if ( shift ) 106 accumulator.msw |= 0x80000000; /* add 1.0 */
110 { 107 mul_Xsig_Xsig(&accumulator, shiftterm[shift]);
111 /* The argument is large, use the identity: 108 accumulator.msw &= 0x3fffffff; /* subtract 1.0 */
112 f(x+a) = f(a) * (f(x) + 1) - 1; 109 exponent = 1;
113 */ 110 }
114 shr_Xsig(&accumulator, - exponent); 111
115 accumulator.msw |= 0x80000000; /* add 1.0 */ 112 if (sign != SIGN_POS) {
116 mul_Xsig_Xsig(&accumulator, shiftterm[shift]); 113 /* The argument is negative, use the identity:
117 accumulator.msw &= 0x3fffffff; /* subtract 1.0 */ 114 f(-x) = -f(x) / (1 + f(x))
118 exponent = 1; 115 */
119 } 116 Denom.lsw = accumulator.lsw;
120 117 XSIG_LL(Denom) = XSIG_LL(accumulator);
121 if ( sign != SIGN_POS ) 118 if (exponent < 0)
122 { 119 shr_Xsig(&Denom, -exponent);
123 /* The argument is negative, use the identity: 120 else if (exponent > 0) {
124 f(-x) = -f(x) / (1 + f(x)) 121 /* exponent must be 1 here */
125 */ 122 XSIG_LL(Denom) <<= 1;
126 Denom.lsw = accumulator.lsw; 123 if (Denom.lsw & 0x80000000)
127 XSIG_LL(Denom) = XSIG_LL(accumulator); 124 XSIG_LL(Denom) |= 1;
128 if ( exponent < 0 ) 125 (Denom.lsw) <<= 1;
129 shr_Xsig(&Denom, - exponent); 126 }
130 else if ( exponent > 0 ) 127 Denom.msw |= 0x80000000; /* add 1.0 */
131 { 128 div_Xsig(&accumulator, &Denom, &accumulator);
132 /* exponent must be 1 here */
133 XSIG_LL(Denom) <<= 1;
134 if ( Denom.lsw & 0x80000000 )
135 XSIG_LL(Denom) |= 1;
136 (Denom.lsw) <<= 1;
137 } 129 }
138 Denom.msw |= 0x80000000; /* add 1.0 */
139 div_Xsig(&accumulator, &Denom, &accumulator);
140 }
141 130
142 /* Convert to 64 bit signed-compatible */ 131 /* Convert to 64 bit signed-compatible */
143 exponent += round_Xsig(&accumulator); 132 exponent += round_Xsig(&accumulator);
144 133
145 result = &st(0); 134 result = &st(0);
146 significand(result) = XSIG_LL(accumulator); 135 significand(result) = XSIG_LL(accumulator);
147 setexponent16(result, exponent); 136 setexponent16(result, exponent);
148 137
149 tag = FPU_round(result, 1, 0, FULL_PRECISION, sign); 138 tag = FPU_round(result, 1, 0, FULL_PRECISION, sign);
150 139
151 setsign(result, sign); 140 setsign(result, sign);
152 FPU_settag0(tag); 141 FPU_settag0(tag);
153 142
154 return 0; 143 return 0;
155 144
156} 145}
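
poly_2xm1() evaluates f(x) = 2^x - 1 for |x| < 1 by a polynomial in a reduced argument and then undoes the reduction with the two identities quoted in the comments. Written out precisely (the shiftterm constants are 2^a for a = 1/4, 1/2, 3/4):

	f(x+a) = 2^{a}\,\bigl(f(x)+1\bigr) - 1, \qquad  f(-x) = -\frac{f(x)}{1+f(x)}

Both follow directly from 2^{x+a} = 2^a \cdot 2^x and 2^{-x} = 1/2^x.
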
diff --git a/arch/x86/math-emu/poly_atan.c b/arch/x86/math-emu/poly_atan.c
index 82f702952f6..20c28e58e2d 100644
--- a/arch/x86/math-emu/poly_atan.c
+++ b/arch/x86/math-emu/poly_atan.c
@@ -18,28 +18,25 @@
18#include "control_w.h" 18#include "control_w.h"
19#include "poly.h" 19#include "poly.h"
20 20
21
22#define HIPOWERon 6 /* odd poly, negative terms */ 21#define HIPOWERon 6 /* odd poly, negative terms */
23static const unsigned long long oddnegterms[HIPOWERon] = 22static const unsigned long long oddnegterms[HIPOWERon] = {
24{ 23 0x0000000000000000LL, /* Dummy (not for - 1.0) */
25 0x0000000000000000LL, /* Dummy (not for - 1.0) */ 24 0x015328437f756467LL,
26 0x015328437f756467LL, 25 0x0005dda27b73dec6LL,
27 0x0005dda27b73dec6LL, 26 0x0000226bf2bfb91aLL,
28 0x0000226bf2bfb91aLL, 27 0x000000ccc439c5f7LL,
29 0x000000ccc439c5f7LL, 28 0x0000000355438407LL
30 0x0000000355438407LL 29};
31} ;
32 30
33#define HIPOWERop 6 /* odd poly, positive terms */ 31#define HIPOWERop 6 /* odd poly, positive terms */
34static const unsigned long long oddplterms[HIPOWERop] = 32static const unsigned long long oddplterms[HIPOWERop] = {
35{
36/* 0xaaaaaaaaaaaaaaabLL, transferred to fixedpterm[] */ 33/* 0xaaaaaaaaaaaaaaabLL, transferred to fixedpterm[] */
37 0x0db55a71875c9ac2LL, 34 0x0db55a71875c9ac2LL,
38 0x0029fce2d67880b0LL, 35 0x0029fce2d67880b0LL,
39 0x0000dfd3908b4596LL, 36 0x0000dfd3908b4596LL,
40 0x00000550fd61dab4LL, 37 0x00000550fd61dab4LL,
41 0x0000001c9422b3f9LL, 38 0x0000001c9422b3f9LL,
42 0x000000003e3301e1LL 39 0x000000003e3301e1LL
43}; 40};
44 41
45static const unsigned long long denomterm = 0xebd9b842c5c53a0eLL; 42static const unsigned long long denomterm = 0xebd9b842c5c53a0eLL;
@@ -48,182 +45,164 @@ static const Xsig fixedpterm = MK_XSIG(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa);
48 45
49static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b); 46static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b);
50 47
51
52/*--- poly_atan() -----------------------------------------------------------+ 48/*--- poly_atan() -----------------------------------------------------------+
53 | | 49 | |
54 +---------------------------------------------------------------------------*/ 50 +---------------------------------------------------------------------------*/
55void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, 51void poly_atan(FPU_REG *st0_ptr, u_char st0_tag,
56 FPU_REG *st1_ptr, u_char st1_tag) 52 FPU_REG *st1_ptr, u_char st1_tag)
57{ 53{
58 u_char transformed, inverted, 54 u_char transformed, inverted, sign1, sign2;
59 sign1, sign2; 55 int exponent;
60 int exponent; 56 long int dummy_exp;
61 long int dummy_exp; 57 Xsig accumulator, Numer, Denom, accumulatore, argSignif, argSq, argSqSq;
62 Xsig accumulator, Numer, Denom, accumulatore, argSignif, 58 u_char tag;
63 argSq, argSqSq; 59
64 u_char tag; 60 sign1 = getsign(st0_ptr);
65 61 sign2 = getsign(st1_ptr);
66 sign1 = getsign(st0_ptr); 62 if (st0_tag == TAG_Valid) {
67 sign2 = getsign(st1_ptr); 63 exponent = exponent(st0_ptr);
68 if ( st0_tag == TAG_Valid ) 64 } else {
69 { 65 /* This gives non-compatible stack contents... */
70 exponent = exponent(st0_ptr); 66 FPU_to_exp16(st0_ptr, st0_ptr);
71 } 67 exponent = exponent16(st0_ptr);
72 else 68 }
73 { 69 if (st1_tag == TAG_Valid) {
74 /* This gives non-compatible stack contents... */ 70 exponent -= exponent(st1_ptr);
75 FPU_to_exp16(st0_ptr, st0_ptr); 71 } else {
76 exponent = exponent16(st0_ptr); 72 /* This gives non-compatible stack contents... */
77 } 73 FPU_to_exp16(st1_ptr, st1_ptr);
78 if ( st1_tag == TAG_Valid ) 74 exponent -= exponent16(st1_ptr);
79 { 75 }
80 exponent -= exponent(st1_ptr); 76
81 } 77 if ((exponent < 0) || ((exponent == 0) &&
82 else 78 ((st0_ptr->sigh < st1_ptr->sigh) ||
83 { 79 ((st0_ptr->sigh == st1_ptr->sigh) &&
84 /* This gives non-compatible stack contents... */ 80 (st0_ptr->sigl < st1_ptr->sigl))))) {
85 FPU_to_exp16(st1_ptr, st1_ptr); 81 inverted = 1;
86 exponent -= exponent16(st1_ptr); 82 Numer.lsw = Denom.lsw = 0;
87 } 83 XSIG_LL(Numer) = significand(st0_ptr);
88 84 XSIG_LL(Denom) = significand(st1_ptr);
89 if ( (exponent < 0) || ((exponent == 0) && 85 } else {
90 ((st0_ptr->sigh < st1_ptr->sigh) || 86 inverted = 0;
91 ((st0_ptr->sigh == st1_ptr->sigh) && 87 exponent = -exponent;
92 (st0_ptr->sigl < st1_ptr->sigl))) ) ) 88 Numer.lsw = Denom.lsw = 0;
93 { 89 XSIG_LL(Numer) = significand(st1_ptr);
94 inverted = 1; 90 XSIG_LL(Denom) = significand(st0_ptr);
95 Numer.lsw = Denom.lsw = 0; 91 }
96 XSIG_LL(Numer) = significand(st0_ptr); 92 div_Xsig(&Numer, &Denom, &argSignif);
97 XSIG_LL(Denom) = significand(st1_ptr); 93 exponent += norm_Xsig(&argSignif);
98 } 94
99 else 95 if ((exponent >= -1)
100 { 96 || ((exponent == -2) && (argSignif.msw > 0xd413ccd0))) {
101 inverted = 0; 97 /* The argument is greater than sqrt(2)-1 (=0.414213562...) */
102 exponent = -exponent; 98 /* Convert the argument by an identity for atan */
103 Numer.lsw = Denom.lsw = 0; 99 transformed = 1;
104 XSIG_LL(Numer) = significand(st1_ptr); 100
105 XSIG_LL(Denom) = significand(st0_ptr); 101 if (exponent >= 0) {
106 }
107 div_Xsig(&Numer, &Denom, &argSignif);
108 exponent += norm_Xsig(&argSignif);
109
110 if ( (exponent >= -1)
111 || ((exponent == -2) && (argSignif.msw > 0xd413ccd0)) )
112 {
113 /* The argument is greater than sqrt(2)-1 (=0.414213562...) */
114 /* Convert the argument by an identity for atan */
115 transformed = 1;
116
117 if ( exponent >= 0 )
118 {
119#ifdef PARANOID 102#ifdef PARANOID
120 if ( !( (exponent == 0) && 103 if (!((exponent == 0) &&
121 (argSignif.lsw == 0) && (argSignif.midw == 0) && 104 (argSignif.lsw == 0) && (argSignif.midw == 0) &&
122 (argSignif.msw == 0x80000000) ) ) 105 (argSignif.msw == 0x80000000))) {
123 { 106 EXCEPTION(EX_INTERNAL | 0x104); /* There must be a logic error */
124 EXCEPTION(EX_INTERNAL|0x104); /* There must be a logic error */ 107 return;
125 return; 108 }
126 }
127#endif /* PARANOID */ 109#endif /* PARANOID */
128 argSignif.msw = 0; /* Make the transformed arg -> 0.0 */ 110 argSignif.msw = 0; /* Make the transformed arg -> 0.0 */
111 } else {
112 Numer.lsw = Denom.lsw = argSignif.lsw;
113 XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif);
114
115 if (exponent < -1)
116 shr_Xsig(&Numer, -1 - exponent);
117 negate_Xsig(&Numer);
118
119 shr_Xsig(&Denom, -exponent);
120 Denom.msw |= 0x80000000;
121
122 div_Xsig(&Numer, &Denom, &argSignif);
123
124 exponent = -1 + norm_Xsig(&argSignif);
125 }
126 } else {
127 transformed = 0;
128 }
129
130 argSq.lsw = argSignif.lsw;
131 argSq.midw = argSignif.midw;
132 argSq.msw = argSignif.msw;
133 mul_Xsig_Xsig(&argSq, &argSq);
134
135 argSqSq.lsw = argSq.lsw;
136 argSqSq.midw = argSq.midw;
137 argSqSq.msw = argSq.msw;
138 mul_Xsig_Xsig(&argSqSq, &argSqSq);
139
140 accumulatore.lsw = argSq.lsw;
141 XSIG_LL(accumulatore) = XSIG_LL(argSq);
142
143 shr_Xsig(&argSq, 2 * (-1 - exponent - 1));
144 shr_Xsig(&argSqSq, 4 * (-1 - exponent - 1));
145
146 /* Now have argSq etc with binary point at the left
147 .1xxxxxxxx */
148
149 /* Do the basic fixed point polynomial evaluation */
150 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
151 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq),
152 oddplterms, HIPOWERop - 1);
153 mul64_Xsig(&accumulator, &XSIG_LL(argSq));
154 negate_Xsig(&accumulator);
155 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms,
156 HIPOWERon - 1);
157 negate_Xsig(&accumulator);
158 add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp);
159
160 mul64_Xsig(&accumulatore, &denomterm);
161 shr_Xsig(&accumulatore, 1 + 2 * (-1 - exponent));
162 accumulatore.msw |= 0x80000000;
163
164 div_Xsig(&accumulator, &accumulatore, &accumulator);
165
166 mul_Xsig_Xsig(&accumulator, &argSignif);
167 mul_Xsig_Xsig(&accumulator, &argSq);
168
169 shr_Xsig(&accumulator, 3);
170 negate_Xsig(&accumulator);
171 add_Xsig_Xsig(&accumulator, &argSignif);
172
173 if (transformed) {
174 /* compute pi/4 - accumulator */
175 shr_Xsig(&accumulator, -1 - exponent);
176 negate_Xsig(&accumulator);
177 add_Xsig_Xsig(&accumulator, &pi_signif);
178 exponent = -1;
179 }
180
181 if (inverted) {
182 /* compute pi/2 - accumulator */
183 shr_Xsig(&accumulator, -exponent);
184 negate_Xsig(&accumulator);
185 add_Xsig_Xsig(&accumulator, &pi_signif);
186 exponent = 0;
129 } 187 }
130 else 188
131 { 189 if (sign1) {
132 Numer.lsw = Denom.lsw = argSignif.lsw; 190 /* compute pi - accumulator */
133 XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif); 191 shr_Xsig(&accumulator, 1 - exponent);
134 192 negate_Xsig(&accumulator);
135 if ( exponent < -1 ) 193 add_Xsig_Xsig(&accumulator, &pi_signif);
136 shr_Xsig(&Numer, -1-exponent); 194 exponent = 1;
137 negate_Xsig(&Numer);
138
139 shr_Xsig(&Denom, -exponent);
140 Denom.msw |= 0x80000000;
141
142 div_Xsig(&Numer, &Denom, &argSignif);
143
144 exponent = -1 + norm_Xsig(&argSignif);
145 } 195 }
146 } 196
147 else 197 exponent += round_Xsig(&accumulator);
148 { 198
149 transformed = 0; 199 significand(st1_ptr) = XSIG_LL(accumulator);
150 } 200 setexponent16(st1_ptr, exponent);
151 201
152 argSq.lsw = argSignif.lsw; argSq.midw = argSignif.midw; 202 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2);
153 argSq.msw = argSignif.msw; 203 FPU_settagi(1, tag);
154 mul_Xsig_Xsig(&argSq, &argSq); 204
155 205 set_precision_flag_up(); /* We do not really know if up or down,
156 argSqSq.lsw = argSq.lsw; argSqSq.midw = argSq.midw; argSqSq.msw = argSq.msw; 206 use this as the default. */
157 mul_Xsig_Xsig(&argSqSq, &argSqSq);
158
159 accumulatore.lsw = argSq.lsw;
160 XSIG_LL(accumulatore) = XSIG_LL(argSq);
161
162 shr_Xsig(&argSq, 2*(-1-exponent-1));
163 shr_Xsig(&argSqSq, 4*(-1-exponent-1));
164
165 /* Now have argSq etc with binary point at the left
166 .1xxxxxxxx */
167
168 /* Do the basic fixed point polynomial evaluation */
169 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
170 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq),
171 oddplterms, HIPOWERop-1);
172 mul64_Xsig(&accumulator, &XSIG_LL(argSq));
173 negate_Xsig(&accumulator);
174 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms, HIPOWERon-1);
175 negate_Xsig(&accumulator);
176 add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp);
177
178 mul64_Xsig(&accumulatore, &denomterm);
179 shr_Xsig(&accumulatore, 1 + 2*(-1-exponent));
180 accumulatore.msw |= 0x80000000;
181
182 div_Xsig(&accumulator, &accumulatore, &accumulator);
183
184 mul_Xsig_Xsig(&accumulator, &argSignif);
185 mul_Xsig_Xsig(&accumulator, &argSq);
186
187 shr_Xsig(&accumulator, 3);
188 negate_Xsig(&accumulator);
189 add_Xsig_Xsig(&accumulator, &argSignif);
190
191 if ( transformed )
192 {
193 /* compute pi/4 - accumulator */
194 shr_Xsig(&accumulator, -1-exponent);
195 negate_Xsig(&accumulator);
196 add_Xsig_Xsig(&accumulator, &pi_signif);
197 exponent = -1;
198 }
199
200 if ( inverted )
201 {
202 /* compute pi/2 - accumulator */
203 shr_Xsig(&accumulator, -exponent);
204 negate_Xsig(&accumulator);
205 add_Xsig_Xsig(&accumulator, &pi_signif);
206 exponent = 0;
207 }
208
209 if ( sign1 )
210 {
211 /* compute pi - accumulator */
212 shr_Xsig(&accumulator, 1 - exponent);
213 negate_Xsig(&accumulator);
214 add_Xsig_Xsig(&accumulator, &pi_signif);
215 exponent = 1;
216 }
217
218 exponent += round_Xsig(&accumulator);
219
220 significand(st1_ptr) = XSIG_LL(accumulator);
221 setexponent16(st1_ptr, exponent);
222
223 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2);
224 FPU_settagi(1, tag);
225
226 set_precision_flag_up(); /* We do not really know if up or down,
227 use this as the default. */
228 207
229} 208}
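
poly_atan() implements fpatan: it forms the ratio of the smaller operand over the larger, reduces it below sqrt(2)-1, evaluates an odd polynomial, and the "compute pi/4 - ...", "pi/2 - ..." and "pi - ..." steps then undo the reductions. Stated for reference, they correspond to the usual arctangent identities (the fixed-point code applies them with the sign of st(1) folded in at the end):

	\arctan r = \frac{\pi}{4} - \arctan\frac{1-r}{1+r} \quad (0 < r \le 1), \qquad
	\arctan\frac{y}{x} = \frac{\pi}{2} - \arctan\frac{x}{y} \quad (0 < x < y)

and a negative st(0) moves the result into the second quadrant, i.e. \pi minus the reduced angle.
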
diff --git a/arch/x86/math-emu/poly_l2.c b/arch/x86/math-emu/poly_l2.c
index dd00e1d5b07..8e2ff4b28a0 100644
--- a/arch/x86/math-emu/poly_l2.c
+++ b/arch/x86/math-emu/poly_l2.c
@@ -10,7 +10,6 @@
10 | | 10 | |
11 +---------------------------------------------------------------------------*/ 11 +---------------------------------------------------------------------------*/
12 12
13
14#include "exception.h" 13#include "exception.h"
15#include "reg_constant.h" 14#include "reg_constant.h"
16#include "fpu_emu.h" 15#include "fpu_emu.h"
@@ -18,184 +17,163 @@
18#include "control_w.h" 17#include "control_w.h"
19#include "poly.h" 18#include "poly.h"
20 19
21
22static void log2_kernel(FPU_REG const *arg, u_char argsign, 20static void log2_kernel(FPU_REG const *arg, u_char argsign,
23 Xsig *accum_result, long int *expon); 21 Xsig * accum_result, long int *expon);
24
25 22
26/*--- poly_l2() -------------------------------------------------------------+ 23/*--- poly_l2() -------------------------------------------------------------+
27 | Base 2 logarithm by a polynomial approximation. | 24 | Base 2 logarithm by a polynomial approximation. |
28 +---------------------------------------------------------------------------*/ 25 +---------------------------------------------------------------------------*/
29void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign) 26void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign)
30{ 27{
31 long int exponent, expon, expon_expon; 28 long int exponent, expon, expon_expon;
32 Xsig accumulator, expon_accum, yaccum; 29 Xsig accumulator, expon_accum, yaccum;
33 u_char sign, argsign; 30 u_char sign, argsign;
34 FPU_REG x; 31 FPU_REG x;
35 int tag; 32 int tag;
36 33
37 exponent = exponent16(st0_ptr); 34 exponent = exponent16(st0_ptr);
38 35
39 /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */ 36 /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */
40 if ( st0_ptr->sigh > (unsigned)0xb504f334 ) 37 if (st0_ptr->sigh > (unsigned)0xb504f334) {
41 { 38 /* Treat as sqrt(2)/2 < st0_ptr < 1 */
42 /* Treat as sqrt(2)/2 < st0_ptr < 1 */ 39 significand(&x) = -significand(st0_ptr);
43 significand(&x) = - significand(st0_ptr); 40 setexponent16(&x, -1);
44 setexponent16(&x, -1); 41 exponent++;
45 exponent++; 42 argsign = SIGN_NEG;
46 argsign = SIGN_NEG; 43 } else {
47 } 44 /* Treat as 1 <= st0_ptr < sqrt(2) */
48 else 45 x.sigh = st0_ptr->sigh - 0x80000000;
49 { 46 x.sigl = st0_ptr->sigl;
50 /* Treat as 1 <= st0_ptr < sqrt(2) */ 47 setexponent16(&x, 0);
51 x.sigh = st0_ptr->sigh - 0x80000000; 48 argsign = SIGN_POS;
52 x.sigl = st0_ptr->sigl; 49 }
53 setexponent16(&x, 0); 50 tag = FPU_normalize_nuo(&x);
54 argsign = SIGN_POS;
55 }
56 tag = FPU_normalize_nuo(&x);
57
58 if ( tag == TAG_Zero )
59 {
60 expon = 0;
61 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
62 }
63 else
64 {
65 log2_kernel(&x, argsign, &accumulator, &expon);
66 }
67
68 if ( exponent < 0 )
69 {
70 sign = SIGN_NEG;
71 exponent = -exponent;
72 }
73 else
74 sign = SIGN_POS;
75 expon_accum.msw = exponent; expon_accum.midw = expon_accum.lsw = 0;
76 if ( exponent )
77 {
78 expon_expon = 31 + norm_Xsig(&expon_accum);
79 shr_Xsig(&accumulator, expon_expon - expon);
80
81 if ( sign ^ argsign )
82 negate_Xsig(&accumulator);
83 add_Xsig_Xsig(&accumulator, &expon_accum);
84 }
85 else
86 {
87 expon_expon = expon;
88 sign = argsign;
89 }
90
91 yaccum.lsw = 0; XSIG_LL(yaccum) = significand(st1_ptr);
92 mul_Xsig_Xsig(&accumulator, &yaccum);
93
94 expon_expon += round_Xsig(&accumulator);
95
96 if ( accumulator.msw == 0 )
97 {
98 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
99 return;
100 }
101
102 significand(st1_ptr) = XSIG_LL(accumulator);
103 setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1);
104
105 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign);
106 FPU_settagi(1, tag);
107
108 set_precision_flag_up(); /* 80486 appears to always do this */
109
110 return;
111 51
112} 52 if (tag == TAG_Zero) {
53 expon = 0;
54 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
55 } else {
56 log2_kernel(&x, argsign, &accumulator, &expon);
57 }
58
59 if (exponent < 0) {
60 sign = SIGN_NEG;
61 exponent = -exponent;
62 } else
63 sign = SIGN_POS;
64 expon_accum.msw = exponent;
65 expon_accum.midw = expon_accum.lsw = 0;
66 if (exponent) {
67 expon_expon = 31 + norm_Xsig(&expon_accum);
68 shr_Xsig(&accumulator, expon_expon - expon);
69
70 if (sign ^ argsign)
71 negate_Xsig(&accumulator);
72 add_Xsig_Xsig(&accumulator, &expon_accum);
73 } else {
74 expon_expon = expon;
75 sign = argsign;
76 }
77
78 yaccum.lsw = 0;
79 XSIG_LL(yaccum) = significand(st1_ptr);
80 mul_Xsig_Xsig(&accumulator, &yaccum);
81
82 expon_expon += round_Xsig(&accumulator);
83
84 if (accumulator.msw == 0) {
85 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
86 return;
87 }
88
89 significand(st1_ptr) = XSIG_LL(accumulator);
90 setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1);
113 91
92 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign);
93 FPU_settagi(1, tag);
94
95 set_precision_flag_up(); /* 80486 appears to always do this */
96
97 return;
98
99}
114 100
115/*--- poly_l2p1() -----------------------------------------------------------+ 101/*--- poly_l2p1() -----------------------------------------------------------+
116 | Base 2 logarithm by a polynomial approximation. | 102 | Base 2 logarithm by a polynomial approximation. |
117 | log2(x+1) | 103 | log2(x+1) |
118 +---------------------------------------------------------------------------*/ 104 +---------------------------------------------------------------------------*/
119int poly_l2p1(u_char sign0, u_char sign1, 105int poly_l2p1(u_char sign0, u_char sign1,
120 FPU_REG *st0_ptr, FPU_REG *st1_ptr, FPU_REG *dest) 106 FPU_REG * st0_ptr, FPU_REG * st1_ptr, FPU_REG * dest)
121{ 107{
122 u_char tag; 108 u_char tag;
123 long int exponent; 109 long int exponent;
124 Xsig accumulator, yaccum; 110 Xsig accumulator, yaccum;
125 111
126 if ( exponent16(st0_ptr) < 0 ) 112 if (exponent16(st0_ptr) < 0) {
127 { 113 log2_kernel(st0_ptr, sign0, &accumulator, &exponent);
128 log2_kernel(st0_ptr, sign0, &accumulator, &exponent);
129 114
130 yaccum.lsw = 0; 115 yaccum.lsw = 0;
131 XSIG_LL(yaccum) = significand(st1_ptr); 116 XSIG_LL(yaccum) = significand(st1_ptr);
132 mul_Xsig_Xsig(&accumulator, &yaccum); 117 mul_Xsig_Xsig(&accumulator, &yaccum);
133 118
134 exponent += round_Xsig(&accumulator); 119 exponent += round_Xsig(&accumulator);
135 120
136 exponent += exponent16(st1_ptr) + 1; 121 exponent += exponent16(st1_ptr) + 1;
137 if ( exponent < EXP_WAY_UNDER ) exponent = EXP_WAY_UNDER; 122 if (exponent < EXP_WAY_UNDER)
123 exponent = EXP_WAY_UNDER;
138 124
139 significand(dest) = XSIG_LL(accumulator); 125 significand(dest) = XSIG_LL(accumulator);
140 setexponent16(dest, exponent); 126 setexponent16(dest, exponent);
141 127
142 tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1); 128 tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1);
143 FPU_settagi(1, tag); 129 FPU_settagi(1, tag);
144 130
145 if ( tag == TAG_Valid ) 131 if (tag == TAG_Valid)
146 set_precision_flag_up(); /* 80486 appears to always do this */ 132 set_precision_flag_up(); /* 80486 appears to always do this */
147 } 133 } else {
148 else 134 /* The magnitude of st0_ptr is far too large. */
149 {
150 /* The magnitude of st0_ptr is far too large. */
151 135
152 if ( sign0 != SIGN_POS ) 136 if (sign0 != SIGN_POS) {
153 { 137 /* Trying to get the log of a negative number. */
154 /* Trying to get the log of a negative number. */ 138#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
155#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ 139 changesign(st1_ptr);
156 changesign(st1_ptr);
157#else 140#else
158 if ( arith_invalid(1) < 0 ) 141 if (arith_invalid(1) < 0)
159 return 1; 142 return 1;
160#endif /* PECULIAR_486 */ 143#endif /* PECULIAR_486 */
161 } 144 }
162 145
163 /* 80486 appears to do this */ 146 /* 80486 appears to do this */
164 if ( sign0 == SIGN_NEG ) 147 if (sign0 == SIGN_NEG)
165 set_precision_flag_down(); 148 set_precision_flag_down();
166 else 149 else
167 set_precision_flag_up(); 150 set_precision_flag_up();
168 } 151 }
169 152
170 if ( exponent(dest) <= EXP_UNDER ) 153 if (exponent(dest) <= EXP_UNDER)
171 EXCEPTION(EX_Underflow); 154 EXCEPTION(EX_Underflow);
172 155
173 return 0; 156 return 0;
174 157
175} 158}
176 159
177
178
179
180#undef HIPOWER 160#undef HIPOWER
181#define HIPOWER 10 161#define HIPOWER 10
182static const unsigned long long logterms[HIPOWER] = 162static const unsigned long long logterms[HIPOWER] = {
183{ 163 0x2a8eca5705fc2ef0LL,
184 0x2a8eca5705fc2ef0LL, 164 0xf6384ee1d01febceLL,
185 0xf6384ee1d01febceLL, 165 0x093bb62877cdf642LL,
186 0x093bb62877cdf642LL, 166 0x006985d8a9ec439bLL,
187 0x006985d8a9ec439bLL, 167 0x0005212c4f55a9c8LL,
188 0x0005212c4f55a9c8LL, 168 0x00004326a16927f0LL,
189 0x00004326a16927f0LL, 169 0x0000038d1d80a0e7LL,
190 0x0000038d1d80a0e7LL, 170 0x0000003141cc80c6LL,
191 0x0000003141cc80c6LL, 171 0x00000002b1668c9fLL,
192 0x00000002b1668c9fLL, 172 0x000000002c7a46aaLL
193 0x000000002c7a46aaLL
194}; 173};
195 174
196static const unsigned long leadterm = 0xb8000000; 175static const unsigned long leadterm = 0xb8000000;
197 176
198
199/*--- log2_kernel() ---------------------------------------------------------+ 177/*--- log2_kernel() ---------------------------------------------------------+
200 | Base 2 logarithm by a polynomial approximation. | 178 | Base 2 logarithm by a polynomial approximation. |
201 | log2(x+1) | 179 | log2(x+1) |
@@ -203,70 +181,64 @@ static const unsigned long leadterm = 0xb8000000;
203static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result, 181static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result,
204 long int *expon) 182 long int *expon)
205{ 183{
206 long int exponent, adj; 184 long int exponent, adj;
207 unsigned long long Xsq; 185 unsigned long long Xsq;
208 Xsig accumulator, Numer, Denom, argSignif, arg_signif; 186 Xsig accumulator, Numer, Denom, argSignif, arg_signif;
209 187
210 exponent = exponent16(arg); 188 exponent = exponent16(arg);
211 Numer.lsw = Denom.lsw = 0; 189 Numer.lsw = Denom.lsw = 0;
212 XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg); 190 XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg);
213 if ( argsign == SIGN_POS ) 191 if (argsign == SIGN_POS) {
214 { 192 shr_Xsig(&Denom, 2 - (1 + exponent));
215 shr_Xsig(&Denom, 2 - (1 + exponent)); 193 Denom.msw |= 0x80000000;
216 Denom.msw |= 0x80000000; 194 div_Xsig(&Numer, &Denom, &argSignif);
217 div_Xsig(&Numer, &Denom, &argSignif); 195 } else {
218 } 196 shr_Xsig(&Denom, 1 - (1 + exponent));
219 else 197 negate_Xsig(&Denom);
220 { 198 if (Denom.msw & 0x80000000) {
221 shr_Xsig(&Denom, 1 - (1 + exponent)); 199 div_Xsig(&Numer, &Denom, &argSignif);
222 negate_Xsig(&Denom); 200 exponent++;
223 if ( Denom.msw & 0x80000000 ) 201 } else {
224 { 202 /* Denom must be 1.0 */
225 div_Xsig(&Numer, &Denom, &argSignif); 203 argSignif.lsw = Numer.lsw;
226 exponent ++; 204 argSignif.midw = Numer.midw;
227 } 205 argSignif.msw = Numer.msw;
228 else 206 }
229 {
230 /* Denom must be 1.0 */
231 argSignif.lsw = Numer.lsw; argSignif.midw = Numer.midw;
232 argSignif.msw = Numer.msw;
233 } 207 }
234 }
235 208
236#ifndef PECULIAR_486 209#ifndef PECULIAR_486
237 /* Should check here that |local_arg| is within the valid range */ 210 /* Should check here that |local_arg| is within the valid range */
238 if ( exponent >= -2 ) 211 if (exponent >= -2) {
239 { 212 if ((exponent > -2) || (argSignif.msw > (unsigned)0xafb0ccc0)) {
240 if ( (exponent > -2) || 213 /* The argument is too large */
241 (argSignif.msw > (unsigned)0xafb0ccc0) ) 214 }
242 {
243 /* The argument is too large */
244 } 215 }
245 }
246#endif /* PECULIAR_486 */ 216#endif /* PECULIAR_486 */
247 217
248 arg_signif.lsw = argSignif.lsw; XSIG_LL(arg_signif) = XSIG_LL(argSignif); 218 arg_signif.lsw = argSignif.lsw;
249 adj = norm_Xsig(&argSignif); 219 XSIG_LL(arg_signif) = XSIG_LL(argSignif);
250 accumulator.lsw = argSignif.lsw; XSIG_LL(accumulator) = XSIG_LL(argSignif); 220 adj = norm_Xsig(&argSignif);
251 mul_Xsig_Xsig(&accumulator, &accumulator); 221 accumulator.lsw = argSignif.lsw;
252 shr_Xsig(&accumulator, 2*(-1 - (1 + exponent + adj))); 222 XSIG_LL(accumulator) = XSIG_LL(argSignif);
253 Xsq = XSIG_LL(accumulator); 223 mul_Xsig_Xsig(&accumulator, &accumulator);
254 if ( accumulator.lsw & 0x80000000 ) 224 shr_Xsig(&accumulator, 2 * (-1 - (1 + exponent + adj)));
255 Xsq++; 225 Xsq = XSIG_LL(accumulator);
256 226 if (accumulator.lsw & 0x80000000)
257 accumulator.msw = accumulator.midw = accumulator.lsw = 0; 227 Xsq++;
258 /* Do the basic fixed point polynomial evaluation */ 228
259 polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER-1); 229 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
260 230 /* Do the basic fixed point polynomial evaluation */
261 mul_Xsig_Xsig(&accumulator, &argSignif); 231 polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER - 1);
262 shr_Xsig(&accumulator, 6 - adj); 232
263 233 mul_Xsig_Xsig(&accumulator, &argSignif);
264 mul32_Xsig(&arg_signif, leadterm); 234 shr_Xsig(&accumulator, 6 - adj);
265 add_two_Xsig(&accumulator, &arg_signif, &exponent); 235
266 236 mul32_Xsig(&arg_signif, leadterm);
267 *expon = exponent + 1; 237 add_two_Xsig(&accumulator, &arg_signif, &exponent);
268 accum_result->lsw = accumulator.lsw; 238
269 accum_result->midw = accumulator.midw; 239 *expon = exponent + 1;
270 accum_result->msw = accumulator.msw; 240 accum_result->lsw = accumulator.lsw;
241 accum_result->midw = accumulator.midw;
242 accum_result->msw = accumulator.msw;
271 243
272} 244}
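
The hunks above only re-indent log2_kernel(); the algorithm itself is untouched: the argument is folded into a ratio, squared, pushed through a fixed-point odd polynomial held in the extended Xsig accumulator, and recombined with the leading term. As a rough orientation, here is a minimal user-space sketch of the same shape in double precision; it relies on the usual atanh-series identity rather than the kernel's 96-bit arithmetic, and log2_kernel_sketch() is a made-up name, not a kernel function.

/*
 * Minimal double-precision sketch of the log2_kernel() idea above:
 * write log2(1+x) as 2*atanh(t)/ln(2) with t = x/(2+x), then evaluate
 * the odd series t + t^3/3 + t^5/5 + ... as a polynomial in t^2.
 * Illustration only, not the kernel's Xsig fixed-point code.
 */
#include <math.h>
#include <stdio.h>

static double log2_kernel_sketch(double x)	/* small |x| */
{
	double t = x / (2.0 + x);	/* plays the role of Numer/Denom */
	double tsq = t * t;		/* the "Xsq" of the kernel code */
	double sum = 0.0;
	int k;

	/* Horner evaluation of the odd-term polynomial in t^2 */
	for (k = 19; k >= 3; k -= 2)
		sum = (sum + 1.0 / k) * tsq;
	sum = (sum + 1.0) * t;		/* leading term, cf. "leadterm" */

	return 2.0 * sum / M_LN2;	/* convert ln() result to log2() */
}

int main(void)
{
	double x = 0.25;
	printf("%.15f %.15f\n", log2_kernel_sketch(x), log2(1.0 + x));
	return 0;
}

Built with -lm, the two printed values agree for small x, which is the regime log2_kernel() is asked to cover.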
diff --git a/arch/x86/math-emu/poly_sin.c b/arch/x86/math-emu/poly_sin.c
index a36313fb06f..b862039c728 100644
--- a/arch/x86/math-emu/poly_sin.c
+++ b/arch/x86/math-emu/poly_sin.c
@@ -11,7 +11,6 @@
11 | | 11 | |
12 +---------------------------------------------------------------------------*/ 12 +---------------------------------------------------------------------------*/
13 13
14
15#include "exception.h" 14#include "exception.h"
16#include "reg_constant.h" 15#include "reg_constant.h"
17#include "fpu_emu.h" 16#include "fpu_emu.h"
@@ -19,379 +18,361 @@
19#include "control_w.h" 18#include "control_w.h"
20#include "poly.h" 19#include "poly.h"
21 20
22
23#define N_COEFF_P 4 21#define N_COEFF_P 4
24#define N_COEFF_N 4 22#define N_COEFF_N 4
25 23
26static const unsigned long long pos_terms_l[N_COEFF_P] = 24static const unsigned long long pos_terms_l[N_COEFF_P] = {
27{ 25 0xaaaaaaaaaaaaaaabLL,
28 0xaaaaaaaaaaaaaaabLL, 26 0x00d00d00d00cf906LL,
29 0x00d00d00d00cf906LL, 27 0x000006b99159a8bbLL,
30 0x000006b99159a8bbLL, 28 0x000000000d7392e6LL
31 0x000000000d7392e6LL
32}; 29};
33 30
34static const unsigned long long neg_terms_l[N_COEFF_N] = 31static const unsigned long long neg_terms_l[N_COEFF_N] = {
35{ 32 0x2222222222222167LL,
36 0x2222222222222167LL, 33 0x0002e3bc74aab624LL,
37 0x0002e3bc74aab624LL, 34 0x0000000b09229062LL,
38 0x0000000b09229062LL, 35 0x00000000000c7973LL
39 0x00000000000c7973LL
40}; 36};
41 37
42
43
44#define N_COEFF_PH 4 38#define N_COEFF_PH 4
45#define N_COEFF_NH 4 39#define N_COEFF_NH 4
46static const unsigned long long pos_terms_h[N_COEFF_PH] = 40static const unsigned long long pos_terms_h[N_COEFF_PH] = {
47{ 41 0x0000000000000000LL,
48 0x0000000000000000LL, 42 0x05b05b05b05b0406LL,
49 0x05b05b05b05b0406LL, 43 0x000049f93edd91a9LL,
50 0x000049f93edd91a9LL, 44 0x00000000c9c9ed62LL
51 0x00000000c9c9ed62LL
52}; 45};
53 46
54static const unsigned long long neg_terms_h[N_COEFF_NH] = 47static const unsigned long long neg_terms_h[N_COEFF_NH] = {
55{ 48 0xaaaaaaaaaaaaaa98LL,
56 0xaaaaaaaaaaaaaa98LL, 49 0x001a01a01a019064LL,
57 0x001a01a01a019064LL, 50 0x0000008f76c68a77LL,
58 0x0000008f76c68a77LL, 51 0x0000000000d58f5eLL
59 0x0000000000d58f5eLL
60}; 52};
61 53
62
63/*--- poly_sine() -----------------------------------------------------------+ 54/*--- poly_sine() -----------------------------------------------------------+
64 | | 55 | |
65 +---------------------------------------------------------------------------*/ 56 +---------------------------------------------------------------------------*/
66void poly_sine(FPU_REG *st0_ptr) 57void poly_sine(FPU_REG *st0_ptr)
67{ 58{
68 int exponent, echange; 59 int exponent, echange;
69 Xsig accumulator, argSqrd, argTo4; 60 Xsig accumulator, argSqrd, argTo4;
70 unsigned long fix_up, adj; 61 unsigned long fix_up, adj;
71 unsigned long long fixed_arg; 62 unsigned long long fixed_arg;
72 FPU_REG result; 63 FPU_REG result;
73 64
74 exponent = exponent(st0_ptr); 65 exponent = exponent(st0_ptr);
75 66
76 accumulator.lsw = accumulator.midw = accumulator.msw = 0; 67 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
77 68
78 /* Split into two ranges, for arguments below and above 1.0 */ 69 /* Split into two ranges, for arguments below and above 1.0 */
79 /* The boundary between upper and lower is approx 0.88309101259 */ 70 /* The boundary between upper and lower is approx 0.88309101259 */
80 if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa)) ) 71 if ((exponent < -1)
81 { 72 || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa))) {
82 /* The argument is <= 0.88309101259 */ 73 /* The argument is <= 0.88309101259 */
74
75 argSqrd.msw = st0_ptr->sigh;
76 argSqrd.midw = st0_ptr->sigl;
77 argSqrd.lsw = 0;
78 mul64_Xsig(&argSqrd, &significand(st0_ptr));
79 shr_Xsig(&argSqrd, 2 * (-1 - exponent));
80 argTo4.msw = argSqrd.msw;
81 argTo4.midw = argSqrd.midw;
82 argTo4.lsw = argSqrd.lsw;
83 mul_Xsig_Xsig(&argTo4, &argTo4);
83 84
84 argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; argSqrd.lsw = 0; 85 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
85 mul64_Xsig(&argSqrd, &significand(st0_ptr)); 86 N_COEFF_N - 1);
86 shr_Xsig(&argSqrd, 2*(-1-exponent)); 87 mul_Xsig_Xsig(&accumulator, &argSqrd);
87 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; 88 negate_Xsig(&accumulator);
88 argTo4.lsw = argSqrd.lsw;
89 mul_Xsig_Xsig(&argTo4, &argTo4);
90 89
91 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, 90 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
92 N_COEFF_N-1); 91 N_COEFF_P - 1);
93 mul_Xsig_Xsig(&accumulator, &argSqrd);
94 negate_Xsig(&accumulator);
95 92
96 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, 93 shr_Xsig(&accumulator, 2); /* Divide by four */
97 N_COEFF_P-1); 94 accumulator.msw |= 0x80000000; /* Add 1.0 */
98 95
99 shr_Xsig(&accumulator, 2); /* Divide by four */ 96 mul64_Xsig(&accumulator, &significand(st0_ptr));
100 accumulator.msw |= 0x80000000; /* Add 1.0 */ 97 mul64_Xsig(&accumulator, &significand(st0_ptr));
98 mul64_Xsig(&accumulator, &significand(st0_ptr));
101 99
102 mul64_Xsig(&accumulator, &significand(st0_ptr)); 100 /* Divide by four, FPU_REG compatible, etc */
103 mul64_Xsig(&accumulator, &significand(st0_ptr)); 101 exponent = 3 * exponent;
104 mul64_Xsig(&accumulator, &significand(st0_ptr));
105 102
106 /* Divide by four, FPU_REG compatible, etc */ 103 /* The minimum exponent difference is 3 */
107 exponent = 3*exponent; 104 shr_Xsig(&accumulator, exponent(st0_ptr) - exponent);
108 105
109 /* The minimum exponent difference is 3 */ 106 negate_Xsig(&accumulator);
110 shr_Xsig(&accumulator, exponent(st0_ptr) - exponent); 107 XSIG_LL(accumulator) += significand(st0_ptr);
111 108
112 negate_Xsig(&accumulator); 109 echange = round_Xsig(&accumulator);
113 XSIG_LL(accumulator) += significand(st0_ptr);
114 110
115 echange = round_Xsig(&accumulator); 111 setexponentpos(&result, exponent(st0_ptr) + echange);
112 } else {
113 /* The argument is > 0.88309101259 */
114 /* We use sin(st(0)) = cos(pi/2-st(0)) */
116 115
117 setexponentpos(&result, exponent(st0_ptr) + echange); 116 fixed_arg = significand(st0_ptr);
118 }
119 else
120 {
121 /* The argument is > 0.88309101259 */
122 /* We use sin(st(0)) = cos(pi/2-st(0)) */
123 117
124 fixed_arg = significand(st0_ptr); 118 if (exponent == 0) {
119 /* The argument is >= 1.0 */
125 120
126 if ( exponent == 0 ) 121 /* Put the binary point at the left. */
127 { 122 fixed_arg <<= 1;
128 /* The argument is >= 1.0 */ 123 }
124 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
125 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
126 /* There is a special case which arises due to rounding, to fix here. */
127 if (fixed_arg == 0xffffffffffffffffLL)
128 fixed_arg = 0;
129 129
130 /* Put the binary point at the left. */ 130 XSIG_LL(argSqrd) = fixed_arg;
131 fixed_arg <<= 1; 131 argSqrd.lsw = 0;
132 } 132 mul64_Xsig(&argSqrd, &fixed_arg);
133 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
134 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
135 /* There is a special case which arises due to rounding, to fix here. */
136 if ( fixed_arg == 0xffffffffffffffffLL )
137 fixed_arg = 0;
138 133
139 XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0; 134 XSIG_LL(argTo4) = XSIG_LL(argSqrd);
140 mul64_Xsig(&argSqrd, &fixed_arg); 135 argTo4.lsw = argSqrd.lsw;
136 mul_Xsig_Xsig(&argTo4, &argTo4);
141 137
142 XSIG_LL(argTo4) = XSIG_LL(argSqrd); argTo4.lsw = argSqrd.lsw; 138 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
143 mul_Xsig_Xsig(&argTo4, &argTo4); 139 N_COEFF_NH - 1);
140 mul_Xsig_Xsig(&accumulator, &argSqrd);
141 negate_Xsig(&accumulator);
144 142
145 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, 143 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
146 N_COEFF_NH-1); 144 N_COEFF_PH - 1);
147 mul_Xsig_Xsig(&accumulator, &argSqrd); 145 negate_Xsig(&accumulator);
148 negate_Xsig(&accumulator);
149 146
150 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, 147 mul64_Xsig(&accumulator, &fixed_arg);
151 N_COEFF_PH-1); 148 mul64_Xsig(&accumulator, &fixed_arg);
152 negate_Xsig(&accumulator);
153 149
154 mul64_Xsig(&accumulator, &fixed_arg); 150 shr_Xsig(&accumulator, 3);
155 mul64_Xsig(&accumulator, &fixed_arg); 151 negate_Xsig(&accumulator);
156 152
157 shr_Xsig(&accumulator, 3); 153 add_Xsig_Xsig(&accumulator, &argSqrd);
158 negate_Xsig(&accumulator);
159 154
160 add_Xsig_Xsig(&accumulator, &argSqrd); 155 shr_Xsig(&accumulator, 1);
161 156
162 shr_Xsig(&accumulator, 1); 157 accumulator.lsw |= 1; /* A zero accumulator here would cause problems */
158 negate_Xsig(&accumulator);
163 159
164 accumulator.lsw |= 1; /* A zero accumulator here would cause problems */ 160 /* The basic computation is complete. Now fix the answer to
165 negate_Xsig(&accumulator); 161 compensate for the error due to the approximation used for
162 pi/2
163 */
166 164
167 /* The basic computation is complete. Now fix the answer to 165 /* This has an exponent of -65 */
168 compensate for the error due to the approximation used for 166 fix_up = 0x898cc517;
169 pi/2 167 /* The fix-up needs to be improved for larger args */
170 */ 168 if (argSqrd.msw & 0xffc00000) {
169 /* Get about 32 bit precision in these: */
170 fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6;
171 }
172 fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg));
171 173
172 /* This has an exponent of -65 */ 174 adj = accumulator.lsw; /* temp save */
173 fix_up = 0x898cc517; 175 accumulator.lsw -= fix_up;
174 /* The fix-up needs to be improved for larger args */ 176 if (accumulator.lsw > adj)
175 if ( argSqrd.msw & 0xffc00000 ) 177 XSIG_LL(accumulator)--;
176 {
177 /* Get about 32 bit precision in these: */
178 fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6;
179 }
180 fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg));
181 178
182 adj = accumulator.lsw; /* temp save */ 179 echange = round_Xsig(&accumulator);
183 accumulator.lsw -= fix_up;
184 if ( accumulator.lsw > adj )
185 XSIG_LL(accumulator) --;
186 180
187 echange = round_Xsig(&accumulator); 181 setexponentpos(&result, echange - 1);
188 182 }
189 setexponentpos(&result, echange - 1);
190 }
191 183
192 significand(&result) = XSIG_LL(accumulator); 184 significand(&result) = XSIG_LL(accumulator);
193 setsign(&result, getsign(st0_ptr)); 185 setsign(&result, getsign(st0_ptr));
194 FPU_copy_to_reg0(&result, TAG_Valid); 186 FPU_copy_to_reg0(&result, TAG_Valid);
195 187
196#ifdef PARANOID 188#ifdef PARANOID
197 if ( (exponent(&result) >= 0) 189 if ((exponent(&result) >= 0)
198 && (significand(&result) > 0x8000000000000000LL) ) 190 && (significand(&result) > 0x8000000000000000LL)) {
199 { 191 EXCEPTION(EX_INTERNAL | 0x150);
200 EXCEPTION(EX_INTERNAL|0x150); 192 }
201 }
202#endif /* PARANOID */ 193#endif /* PARANOID */
203 194
204} 195}
205 196
206
207
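
Both the large-argument branch of poly_sine() above and poly_cos() below perform the same range reduction against the top 64 fraction bits of pi/2, including the patch for the one all-ones value that the truncated constant can produce. Below is a standalone sketch of just that step, using the same 0x921fb54442d18469 constant; reduce_by_pi_2() is a hypothetical name, not a kernel helper.

/*
 * Sketch of the pi/2 range-reduction step shared by poly_sine() and
 * poly_cos(): subtract the 64-bit fixed-point argument from the top 64
 * fraction bits of pi/2 and patch the single all-ones case produced by
 * rounding of the truncated constant.  Illustration only.
 */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

static uint64_t reduce_by_pi_2(uint64_t fixed_arg, int exponent)
{
	if (exponent == 0)		/* argument >= 1.0: binary point left */
		fixed_arg <<= 1;

	/* pi/2 = 1.921fb54442d18469 898cc517... in hex */
	fixed_arg = 0x921fb54442d18469ULL - fixed_arg;

	/* special case arising from rounding, as in the code above */
	if (fixed_arg == 0xffffffffffffffffULL)
		fixed_arg = 0;

	return fixed_arg;
}

int main(void)
{
	/* argument 0.e21240aa... (the boundary case, exponent -1) */
	printf("%" PRIx64 "\n", reduce_by_pi_2(0xe21240aa00000000ULL, -1));
	return 0;
}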
208/*--- poly_cos() ------------------------------------------------------------+ 197/*--- poly_cos() ------------------------------------------------------------+
209 | | 198 | |
210 +---------------------------------------------------------------------------*/ 199 +---------------------------------------------------------------------------*/
211void poly_cos(FPU_REG *st0_ptr) 200void poly_cos(FPU_REG *st0_ptr)
212{ 201{
213 FPU_REG result; 202 FPU_REG result;
214 long int exponent, exp2, echange; 203 long int exponent, exp2, echange;
215 Xsig accumulator, argSqrd, fix_up, argTo4; 204 Xsig accumulator, argSqrd, fix_up, argTo4;
216 unsigned long long fixed_arg; 205 unsigned long long fixed_arg;
217 206
218#ifdef PARANOID 207#ifdef PARANOID
219 if ( (exponent(st0_ptr) > 0) 208 if ((exponent(st0_ptr) > 0)
220 || ((exponent(st0_ptr) == 0) 209 || ((exponent(st0_ptr) == 0)
221 && (significand(st0_ptr) > 0xc90fdaa22168c234LL)) ) 210 && (significand(st0_ptr) > 0xc90fdaa22168c234LL))) {
222 { 211 EXCEPTION(EX_Invalid);
223 EXCEPTION(EX_Invalid); 212 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
224 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); 213 return;
225 return;
226 }
227#endif /* PARANOID */
228
229 exponent = exponent(st0_ptr);
230
231 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
232
233 if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54)) )
234 {
235 /* arg is < 0.687705 */
236
237 argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl;
238 argSqrd.lsw = 0;
239 mul64_Xsig(&argSqrd, &significand(st0_ptr));
240
241 if ( exponent < -1 )
242 {
243 /* shift the argument right by the required places */
244 shr_Xsig(&argSqrd, 2*(-1-exponent));
245 }
246
247 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
248 argTo4.lsw = argSqrd.lsw;
249 mul_Xsig_Xsig(&argTo4, &argTo4);
250
251 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
252 N_COEFF_NH-1);
253 mul_Xsig_Xsig(&accumulator, &argSqrd);
254 negate_Xsig(&accumulator);
255
256 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
257 N_COEFF_PH-1);
258 negate_Xsig(&accumulator);
259
260 mul64_Xsig(&accumulator, &significand(st0_ptr));
261 mul64_Xsig(&accumulator, &significand(st0_ptr));
262 shr_Xsig(&accumulator, -2*(1+exponent));
263
264 shr_Xsig(&accumulator, 3);
265 negate_Xsig(&accumulator);
266
267 add_Xsig_Xsig(&accumulator, &argSqrd);
268
269 shr_Xsig(&accumulator, 1);
270
271 /* It doesn't matter if accumulator is all zero here, the
272 following code will work ok */
273 negate_Xsig(&accumulator);
274
275 if ( accumulator.lsw & 0x80000000 )
276 XSIG_LL(accumulator) ++;
277 if ( accumulator.msw == 0 )
278 {
279 /* The result is 1.0 */
280 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
281 return;
282 }
283 else
284 {
285 significand(&result) = XSIG_LL(accumulator);
286
287 /* will be a valid positive nr with expon = -1 */
288 setexponentpos(&result, -1);
289 }
290 }
291 else
292 {
293 fixed_arg = significand(st0_ptr);
294
295 if ( exponent == 0 )
296 {
297 /* The argument is >= 1.0 */
298
299 /* Put the binary point at the left. */
300 fixed_arg <<= 1;
301 }
302 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
303 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
304 /* There is a special case which arises due to rounding, to fix here. */
305 if ( fixed_arg == 0xffffffffffffffffLL )
306 fixed_arg = 0;
307
308 exponent = -1;
309 exp2 = -1;
310
311 /* A shift is needed here only for a narrow range of arguments,
312 i.e. for fixed_arg approx 2^-32, but we pick up more... */
313 if ( !(LL_MSW(fixed_arg) & 0xffff0000) )
314 {
315 fixed_arg <<= 16;
316 exponent -= 16;
317 exp2 -= 16;
318 } 214 }
215#endif /* PARANOID */
319 216
320 XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0; 217 exponent = exponent(st0_ptr);
321 mul64_Xsig(&argSqrd, &fixed_arg); 218
322 219 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
323 if ( exponent < -1 ) 220
324 { 221 if ((exponent < -1)
325 /* shift the argument right by the required places */ 222 || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54))) {
326 shr_Xsig(&argSqrd, 2*(-1-exponent)); 223 /* arg is < 0.687705 */
327 } 224
328 225 argSqrd.msw = st0_ptr->sigh;
329 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; 226 argSqrd.midw = st0_ptr->sigl;
330 argTo4.lsw = argSqrd.lsw; 227 argSqrd.lsw = 0;
331 mul_Xsig_Xsig(&argTo4, &argTo4); 228 mul64_Xsig(&argSqrd, &significand(st0_ptr));
332 229
333 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, 230 if (exponent < -1) {
334 N_COEFF_N-1); 231 /* shift the argument right by the required places */
335 mul_Xsig_Xsig(&accumulator, &argSqrd); 232 shr_Xsig(&argSqrd, 2 * (-1 - exponent));
336 negate_Xsig(&accumulator); 233 }
337 234
338 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, 235 argTo4.msw = argSqrd.msw;
339 N_COEFF_P-1); 236 argTo4.midw = argSqrd.midw;
340 237 argTo4.lsw = argSqrd.lsw;
341 shr_Xsig(&accumulator, 2); /* Divide by four */ 238 mul_Xsig_Xsig(&argTo4, &argTo4);
342 accumulator.msw |= 0x80000000; /* Add 1.0 */ 239
343 240 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
344 mul64_Xsig(&accumulator, &fixed_arg); 241 N_COEFF_NH - 1);
345 mul64_Xsig(&accumulator, &fixed_arg); 242 mul_Xsig_Xsig(&accumulator, &argSqrd);
346 mul64_Xsig(&accumulator, &fixed_arg); 243 negate_Xsig(&accumulator);
347 244
348 /* Divide by four, FPU_REG compatible, etc */ 245 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
349 exponent = 3*exponent; 246 N_COEFF_PH - 1);
350 247 negate_Xsig(&accumulator);
351 /* The minimum exponent difference is 3 */ 248
352 shr_Xsig(&accumulator, exp2 - exponent); 249 mul64_Xsig(&accumulator, &significand(st0_ptr));
353 250 mul64_Xsig(&accumulator, &significand(st0_ptr));
354 negate_Xsig(&accumulator); 251 shr_Xsig(&accumulator, -2 * (1 + exponent));
355 XSIG_LL(accumulator) += fixed_arg; 252
356 253 shr_Xsig(&accumulator, 3);
357 /* The basic computation is complete. Now fix the answer to 254 negate_Xsig(&accumulator);
358 compensate for the error due to the approximation used for 255
359 pi/2 256 add_Xsig_Xsig(&accumulator, &argSqrd);
360 */ 257
361 258 shr_Xsig(&accumulator, 1);
362 /* This has an exponent of -65 */ 259
363 XSIG_LL(fix_up) = 0x898cc51701b839a2ll; 260 /* It doesn't matter if accumulator is all zero here, the
364 fix_up.lsw = 0; 261 following code will work ok */
365 262 negate_Xsig(&accumulator);
366 /* The fix-up needs to be improved for larger args */ 263
367 if ( argSqrd.msw & 0xffc00000 ) 264 if (accumulator.lsw & 0x80000000)
368 { 265 XSIG_LL(accumulator)++;
369 /* Get about 32 bit precision in these: */ 266 if (accumulator.msw == 0) {
370 fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2; 267 /* The result is 1.0 */
371 fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24; 268 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
269 return;
270 } else {
271 significand(&result) = XSIG_LL(accumulator);
272
273 /* will be a valid positive nr with expon = -1 */
274 setexponentpos(&result, -1);
275 }
276 } else {
277 fixed_arg = significand(st0_ptr);
278
279 if (exponent == 0) {
280 /* The argument is >= 1.0 */
281
282 /* Put the binary point at the left. */
283 fixed_arg <<= 1;
284 }
285 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
286 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
287 /* There is a special case which arises due to rounding, to fix here. */
288 if (fixed_arg == 0xffffffffffffffffLL)
289 fixed_arg = 0;
290
291 exponent = -1;
292 exp2 = -1;
293
294 /* A shift is needed here only for a narrow range of arguments,
295 i.e. for fixed_arg approx 2^-32, but we pick up more... */
296 if (!(LL_MSW(fixed_arg) & 0xffff0000)) {
297 fixed_arg <<= 16;
298 exponent -= 16;
299 exp2 -= 16;
300 }
301
302 XSIG_LL(argSqrd) = fixed_arg;
303 argSqrd.lsw = 0;
304 mul64_Xsig(&argSqrd, &fixed_arg);
305
306 if (exponent < -1) {
307 /* shift the argument right by the required places */
308 shr_Xsig(&argSqrd, 2 * (-1 - exponent));
309 }
310
311 argTo4.msw = argSqrd.msw;
312 argTo4.midw = argSqrd.midw;
313 argTo4.lsw = argSqrd.lsw;
314 mul_Xsig_Xsig(&argTo4, &argTo4);
315
316 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
317 N_COEFF_N - 1);
318 mul_Xsig_Xsig(&accumulator, &argSqrd);
319 negate_Xsig(&accumulator);
320
321 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
322 N_COEFF_P - 1);
323
324 shr_Xsig(&accumulator, 2); /* Divide by four */
325 accumulator.msw |= 0x80000000; /* Add 1.0 */
326
327 mul64_Xsig(&accumulator, &fixed_arg);
328 mul64_Xsig(&accumulator, &fixed_arg);
329 mul64_Xsig(&accumulator, &fixed_arg);
330
331 /* Divide by four, FPU_REG compatible, etc */
332 exponent = 3 * exponent;
333
334 /* The minimum exponent difference is 3 */
335 shr_Xsig(&accumulator, exp2 - exponent);
336
337 negate_Xsig(&accumulator);
338 XSIG_LL(accumulator) += fixed_arg;
339
340 /* The basic computation is complete. Now fix the answer to
341 compensate for the error due to the approximation used for
342 pi/2
343 */
344
345 /* This has an exponent of -65 */
346 XSIG_LL(fix_up) = 0x898cc51701b839a2ll;
347 fix_up.lsw = 0;
348
349 /* The fix-up needs to be improved for larger args */
350 if (argSqrd.msw & 0xffc00000) {
351 /* Get about 32 bit precision in these: */
352 fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2;
353 fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24;
354 }
355
356 exp2 += norm_Xsig(&accumulator);
357 shr_Xsig(&accumulator, 1); /* Prevent overflow */
358 exp2++;
359 shr_Xsig(&fix_up, 65 + exp2);
360
361 add_Xsig_Xsig(&accumulator, &fix_up);
362
363 echange = round_Xsig(&accumulator);
364
365 setexponentpos(&result, exp2 + echange);
366 significand(&result) = XSIG_LL(accumulator);
372 } 367 }
373 368
374 exp2 += norm_Xsig(&accumulator); 369 FPU_copy_to_reg0(&result, TAG_Valid);
375 shr_Xsig(&accumulator, 1); /* Prevent overflow */
376 exp2++;
377 shr_Xsig(&fix_up, 65 + exp2);
378
379 add_Xsig_Xsig(&accumulator, &fix_up);
380
381 echange = round_Xsig(&accumulator);
382
383 setexponentpos(&result, exp2 + echange);
384 significand(&result) = XSIG_LL(accumulator);
385 }
386
387 FPU_copy_to_reg0(&result, TAG_Valid);
388 370
389#ifdef PARANOID 371#ifdef PARANOID
390 if ( (exponent(&result) >= 0) 372 if ((exponent(&result) >= 0)
391 && (significand(&result) > 0x8000000000000000LL) ) 373 && (significand(&result) > 0x8000000000000000LL)) {
392 { 374 EXCEPTION(EX_INTERNAL | 0x151);
393 EXCEPTION(EX_INTERNAL|0x151); 375 }
394 }
395#endif /* PARANOID */ 376#endif /* PARANOID */
396 377
397} 378}
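
The whole file above is a whitespace and brace cleanup; the structure that survives it is the split at roughly 0.883: small arguments go straight into a polynomial in the squared argument, larger ones are folded through sin(x) = cos(pi/2 - x). Here is a double-precision sketch of that control flow, with short Taylor series standing in for the kernel's fixed-point coefficient tables; the series lengths are illustrative only.

/*
 * Double-precision sketch of the poly_sine() split above: a polynomial
 * in x^2 for small arguments, the identity sin(x) = cos(pi/2 - x) for
 * larger ones.  Not the kernel's fixed-point evaluation.
 */
#include <math.h>
#include <stdio.h>

static double sin_small(double x)	/* |x| well below 1 */
{
	double xsq = x * x, term = x, sum = x;
	int k;

	for (k = 3; k <= 13; k += 2) {
		term *= -xsq / ((k - 1) * k);	/* next Taylor term */
		sum += term;
	}
	return sum;
}

static double cos_small(double x)
{
	double xsq = x * x, term = 1.0, sum = 1.0;
	int k;

	for (k = 2; k <= 14; k += 2) {
		term *= -xsq / ((k - 1) * k);
		sum += term;
	}
	return sum;
}

static double poly_sine_sketch(double x)	/* 0 <= x <= pi/2 */
{
	if (x <= 0.88309101259)		/* same boundary as the comment above */
		return sin_small(x);
	return cos_small(M_PI_2 - x);	/* sin(x) = cos(pi/2 - x) */
}

int main(void)
{
	printf("%.15f %.15f\n", poly_sine_sketch(1.2), sin(1.2));
	return 0;
}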
diff --git a/arch/x86/math-emu/poly_tan.c b/arch/x86/math-emu/poly_tan.c
index 8df3e03b6e6..1875763e0c0 100644
--- a/arch/x86/math-emu/poly_tan.c
+++ b/arch/x86/math-emu/poly_tan.c
@@ -17,206 +17,196 @@
17#include "control_w.h" 17#include "control_w.h"
18#include "poly.h" 18#include "poly.h"
19 19
20
21#define HiPOWERop 3 /* odd poly, positive terms */ 20#define HiPOWERop 3 /* odd poly, positive terms */
22static const unsigned long long oddplterm[HiPOWERop] = 21static const unsigned long long oddplterm[HiPOWERop] = {
23{ 22 0x0000000000000000LL,
24 0x0000000000000000LL, 23 0x0051a1cf08fca228LL,
25 0x0051a1cf08fca228LL, 24 0x0000000071284ff7LL
26 0x0000000071284ff7LL
27}; 25};
28 26
29#define HiPOWERon 2 /* odd poly, negative terms */ 27#define HiPOWERon 2 /* odd poly, negative terms */
30static const unsigned long long oddnegterm[HiPOWERon] = 28static const unsigned long long oddnegterm[HiPOWERon] = {
31{ 29 0x1291a9a184244e80LL,
32 0x1291a9a184244e80LL, 30 0x0000583245819c21LL
33 0x0000583245819c21LL
34}; 31};
35 32
36#define HiPOWERep 2 /* even poly, positive terms */ 33#define HiPOWERep 2 /* even poly, positive terms */
37static const unsigned long long evenplterm[HiPOWERep] = 34static const unsigned long long evenplterm[HiPOWERep] = {
38{ 35 0x0e848884b539e888LL,
39 0x0e848884b539e888LL, 36 0x00003c7f18b887daLL
40 0x00003c7f18b887daLL
41}; 37};
42 38
43#define HiPOWERen 2 /* even poly, negative terms */ 39#define HiPOWERen 2 /* even poly, negative terms */
44static const unsigned long long evennegterm[HiPOWERen] = 40static const unsigned long long evennegterm[HiPOWERen] = {
45{ 41 0xf1f0200fd51569ccLL,
46 0xf1f0200fd51569ccLL, 42 0x003afb46105c4432LL
47 0x003afb46105c4432LL
48}; 43};
49 44
50static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL; 45static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL;
51 46
52
53/*--- poly_tan() ------------------------------------------------------------+ 47/*--- poly_tan() ------------------------------------------------------------+
54 | | 48 | |
55 +---------------------------------------------------------------------------*/ 49 +---------------------------------------------------------------------------*/
56void poly_tan(FPU_REG *st0_ptr) 50void poly_tan(FPU_REG *st0_ptr)
57{ 51{
58 long int exponent; 52 long int exponent;
59 int invert; 53 int invert;
60 Xsig argSq, argSqSq, accumulatoro, accumulatore, accum, 54 Xsig argSq, argSqSq, accumulatoro, accumulatore, accum,
61 argSignif, fix_up; 55 argSignif, fix_up;
62 unsigned long adj; 56 unsigned long adj;
63 57
64 exponent = exponent(st0_ptr); 58 exponent = exponent(st0_ptr);
65 59
66#ifdef PARANOID 60#ifdef PARANOID
67 if ( signnegative(st0_ptr) ) /* Can't hack a number < 0.0 */ 61 if (signnegative(st0_ptr)) { /* Can't hack a number < 0.0 */
68 { arith_invalid(0); return; } /* Need a positive number */ 62 arith_invalid(0);
63 return;
64 } /* Need a positive number */
69#endif /* PARANOID */ 65#endif /* PARANOID */
70 66
71 /* Split the problem into two domains, smaller and larger than pi/4 */ 67 /* Split the problem into two domains, smaller and larger than pi/4 */
72 if ( (exponent == 0) || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2)) ) 68 if ((exponent == 0)
73 { 69 || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2))) {
74 /* The argument is greater than (approx) pi/4 */ 70 /* The argument is greater than (approx) pi/4 */
75 invert = 1; 71 invert = 1;
76 accum.lsw = 0; 72 accum.lsw = 0;
77 XSIG_LL(accum) = significand(st0_ptr); 73 XSIG_LL(accum) = significand(st0_ptr);
78 74
79 if ( exponent == 0 ) 75 if (exponent == 0) {
80 { 76 /* The argument is >= 1.0 */
81 /* The argument is >= 1.0 */ 77 /* Put the binary point at the left. */
82 /* Put the binary point at the left. */ 78 XSIG_LL(accum) <<= 1;
83 XSIG_LL(accum) <<= 1; 79 }
84 } 80 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
85 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ 81 XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum);
86 XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum); 82 /* This is a special case which arises due to rounding. */
87 /* This is a special case which arises due to rounding. */ 83 if (XSIG_LL(accum) == 0xffffffffffffffffLL) {
88 if ( XSIG_LL(accum) == 0xffffffffffffffffLL ) 84 FPU_settag0(TAG_Valid);
89 { 85 significand(st0_ptr) = 0x8a51e04daabda360LL;
90 FPU_settag0(TAG_Valid); 86 setexponent16(st0_ptr,
91 significand(st0_ptr) = 0x8a51e04daabda360LL; 87 (0x41 + EXTENDED_Ebias) | SIGN_Negative);
92 setexponent16(st0_ptr, (0x41 + EXTENDED_Ebias) | SIGN_Negative); 88 return;
93 return; 89 }
90
91 argSignif.lsw = accum.lsw;
92 XSIG_LL(argSignif) = XSIG_LL(accum);
93 exponent = -1 + norm_Xsig(&argSignif);
94 } else {
95 invert = 0;
96 argSignif.lsw = 0;
97 XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr);
98
99 if (exponent < -1) {
100 /* shift the argument right by the required places */
101 if (FPU_shrx(&XSIG_LL(accum), -1 - exponent) >=
102 0x80000000U)
103 XSIG_LL(accum)++; /* round up */
104 }
94 } 105 }
95 106
96 argSignif.lsw = accum.lsw; 107 XSIG_LL(argSq) = XSIG_LL(accum);
97 XSIG_LL(argSignif) = XSIG_LL(accum); 108 argSq.lsw = accum.lsw;
98 exponent = -1 + norm_Xsig(&argSignif); 109 mul_Xsig_Xsig(&argSq, &argSq);
99 } 110 XSIG_LL(argSqSq) = XSIG_LL(argSq);
100 else 111 argSqSq.lsw = argSq.lsw;
101 { 112 mul_Xsig_Xsig(&argSqSq, &argSqSq);
102 invert = 0; 113
103 argSignif.lsw = 0; 114 /* Compute the negative terms for the numerator polynomial */
104 XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr); 115 accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0;
105 116 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm,
106 if ( exponent < -1 ) 117 HiPOWERon - 1);
107 { 118 mul_Xsig_Xsig(&accumulatoro, &argSq);
108 /* shift the argument right by the required places */ 119 negate_Xsig(&accumulatoro);
109 if ( FPU_shrx(&XSIG_LL(accum), -1-exponent) >= 0x80000000U ) 120 /* Add the positive terms */
110 XSIG_LL(accum) ++; /* round up */ 121 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm,
111 } 122 HiPOWERop - 1);
112 } 123
113 124 /* Compute the positive terms for the denominator polynomial */
114 XSIG_LL(argSq) = XSIG_LL(accum); argSq.lsw = accum.lsw; 125 accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0;
115 mul_Xsig_Xsig(&argSq, &argSq); 126 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm,
116 XSIG_LL(argSqSq) = XSIG_LL(argSq); argSqSq.lsw = argSq.lsw; 127 HiPOWERep - 1);
117 mul_Xsig_Xsig(&argSqSq, &argSqSq); 128 mul_Xsig_Xsig(&accumulatore, &argSq);
118 129 negate_Xsig(&accumulatore);
119 /* Compute the negative terms for the numerator polynomial */ 130 /* Add the negative terms */
120 accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0; 131 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm,
121 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm, HiPOWERon-1); 132 HiPOWERen - 1);
122 mul_Xsig_Xsig(&accumulatoro, &argSq); 133 /* Multiply by arg^2 */
123 negate_Xsig(&accumulatoro); 134 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
124 /* Add the positive terms */ 135 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
125 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm, HiPOWERop-1); 136 /* de-normalize and divide by 2 */
126 137 shr_Xsig(&accumulatore, -2 * (1 + exponent) + 1);
127 138 negate_Xsig(&accumulatore); /* This does 1 - accumulator */
128 /* Compute the positive terms for the denominator polynomial */ 139
129 accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0; 140 /* Now find the ratio. */
130 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm, HiPOWERep-1); 141 if (accumulatore.msw == 0) {
131 mul_Xsig_Xsig(&accumulatore, &argSq); 142 /* accumulatoro must contain 1.0 here, (actually, 0) but it
132 negate_Xsig(&accumulatore); 143 really doesn't matter what value we use because it will
133 /* Add the negative terms */ 144 have negligible effect in later calculations
134 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm, HiPOWERen-1); 145 */
135 /* Multiply by arg^2 */ 146 XSIG_LL(accum) = 0x8000000000000000LL;
136 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); 147 accum.lsw = 0;
137 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); 148 } else {
138 /* de-normalize and divide by 2 */ 149 div_Xsig(&accumulatoro, &accumulatore, &accum);
139 shr_Xsig(&accumulatore, -2*(1+exponent) + 1);
140 negate_Xsig(&accumulatore); /* This does 1 - accumulator */
141
142 /* Now find the ratio. */
143 if ( accumulatore.msw == 0 )
144 {
145 /* accumulatoro must contain 1.0 here, (actually, 0) but it
146 really doesn't matter what value we use because it will
147 have negligible effect in later calculations
148 */
149 XSIG_LL(accum) = 0x8000000000000000LL;
150 accum.lsw = 0;
151 }
152 else
153 {
154 div_Xsig(&accumulatoro, &accumulatore, &accum);
155 }
156
157 /* Multiply by 1/3 * arg^3 */
158 mul64_Xsig(&accum, &XSIG_LL(argSignif));
159 mul64_Xsig(&accum, &XSIG_LL(argSignif));
160 mul64_Xsig(&accum, &XSIG_LL(argSignif));
161 mul64_Xsig(&accum, &twothirds);
162 shr_Xsig(&accum, -2*(exponent+1));
163
164 /* tan(arg) = arg + accum */
165 add_two_Xsig(&accum, &argSignif, &exponent);
166
167 if ( invert )
168 {
169 /* We now have the value of tan(pi_2 - arg) where pi_2 is an
170 approximation for pi/2
171 */
172 /* The next step is to fix the answer to compensate for the
173 error due to the approximation used for pi/2
174 */
175
176 /* This is (approx) delta, the error in our approx for pi/2
177 (see above). It has an exponent of -65
178 */
179 XSIG_LL(fix_up) = 0x898cc51701b839a2LL;
180 fix_up.lsw = 0;
181
182 if ( exponent == 0 )
183 adj = 0xffffffff; /* We want approx 1.0 here, but
184 this is close enough. */
185 else if ( exponent > -30 )
186 {
187 adj = accum.msw >> -(exponent+1); /* tan */
188 adj = mul_32_32(adj, adj); /* tan^2 */
189 } 150 }
190 else 151
191 adj = 0; 152 /* Multiply by 1/3 * arg^3 */
192 adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */ 153 mul64_Xsig(&accum, &XSIG_LL(argSignif));
193 154 mul64_Xsig(&accum, &XSIG_LL(argSignif));
194 fix_up.msw += adj; 155 mul64_Xsig(&accum, &XSIG_LL(argSignif));
195 if ( !(fix_up.msw & 0x80000000) ) /* did fix_up overflow ? */ 156 mul64_Xsig(&accum, &twothirds);
196 { 157 shr_Xsig(&accum, -2 * (exponent + 1));
197 /* Yes, we need to add an msb */ 158
198 shr_Xsig(&fix_up, 1); 159 /* tan(arg) = arg + accum */
199 fix_up.msw |= 0x80000000; 160 add_two_Xsig(&accum, &argSignif, &exponent);
200 shr_Xsig(&fix_up, 64 + exponent); 161
162 if (invert) {
163 /* We now have the value of tan(pi_2 - arg) where pi_2 is an
164 approximation for pi/2
165 */
166 /* The next step is to fix the answer to compensate for the
167 error due to the approximation used for pi/2
168 */
169
170 /* This is (approx) delta, the error in our approx for pi/2
171 (see above). It has an exponent of -65
172 */
173 XSIG_LL(fix_up) = 0x898cc51701b839a2LL;
174 fix_up.lsw = 0;
175
176 if (exponent == 0)
177 adj = 0xffffffff; /* We want approx 1.0 here, but
178 this is close enough. */
179 else if (exponent > -30) {
180 adj = accum.msw >> -(exponent + 1); /* tan */
181 adj = mul_32_32(adj, adj); /* tan^2 */
182 } else
183 adj = 0;
184 adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */
185
186 fix_up.msw += adj;
187 if (!(fix_up.msw & 0x80000000)) { /* did fix_up overflow ? */
188 /* Yes, we need to add an msb */
189 shr_Xsig(&fix_up, 1);
190 fix_up.msw |= 0x80000000;
191 shr_Xsig(&fix_up, 64 + exponent);
192 } else
193 shr_Xsig(&fix_up, 65 + exponent);
194
195 add_two_Xsig(&accum, &fix_up, &exponent);
196
197 /* accum now contains tan(pi/2 - arg).
198 Use tan(arg) = 1.0 / tan(pi/2 - arg)
199 */
200 accumulatoro.lsw = accumulatoro.midw = 0;
201 accumulatoro.msw = 0x80000000;
202 div_Xsig(&accumulatoro, &accum, &accum);
203 exponent = -exponent - 1;
201 } 204 }
202 else 205
203 shr_Xsig(&fix_up, 65 + exponent); 206 /* Transfer the result */
204 207 round_Xsig(&accum);
205 add_two_Xsig(&accum, &fix_up, &exponent); 208 FPU_settag0(TAG_Valid);
206 209 significand(st0_ptr) = XSIG_LL(accum);
207 /* accum now contains tan(pi/2 - arg). 210 setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */
208 Use tan(arg) = 1.0 / tan(pi/2 - arg)
209 */
210 accumulatoro.lsw = accumulatoro.midw = 0;
211 accumulatoro.msw = 0x80000000;
212 div_Xsig(&accumulatoro, &accum, &accum);
213 exponent = - exponent - 1;
214 }
215
216 /* Transfer the result */
217 round_Xsig(&accum);
218 FPU_settag0(TAG_Valid);
219 significand(st0_ptr) = XSIG_LL(accum);
220 setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */
221 211
222} 212}
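
Again only the formatting changes above; the method is unchanged: arguments above about pi/4 are inverted through tan(x) = 1/tan(pi/2 - x), and the remaining range is computed as the argument plus a correction built from a ratio of two fixed-point polynomials. The sketch below keeps only that branch structure, with a plain sin/cos Taylor quotient standing in for the kernel's oddplterm/evenplterm tables.

/*
 * Sketch of the poly_tan() branch structure above.  tan_small() is a
 * stand-in for the fixed-point polynomial ratio; only the split at
 * pi/4 and the inversion identity mirror the kernel code.
 */
#include <math.h>
#include <stdio.h>

static double tan_small(double x)	/* 0 <= x <= ~pi/4 */
{
	double xsq = x * x;
	/* short nested Taylor series for sin and cos */
	double s = x * (1.0 - xsq / 6.0 * (1.0 - xsq / 20.0 * (1.0 - xsq / 42.0)));
	double c = 1.0 - xsq / 2.0 * (1.0 - xsq / 12.0 * (1.0 - xsq / 30.0));
	return s / c;
}

static double poly_tan_sketch(double x)	/* 0 <= x < pi/2 */
{
	if (x > M_PI_4)			/* the "invert" branch above */
		return 1.0 / tan_small(M_PI_2 - x);
	return tan_small(x);
}

int main(void)
{
	printf("%.9f %.9f\n", poly_tan_sketch(1.0), tan(1.0));
	return 0;
}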
diff --git a/arch/x86/math-emu/reg_add_sub.c b/arch/x86/math-emu/reg_add_sub.c
index 7cd3b37ac08..deea48b9f13 100644
--- a/arch/x86/math-emu/reg_add_sub.c
+++ b/arch/x86/math-emu/reg_add_sub.c
@@ -27,7 +27,7 @@
27static 27static
28int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, 28int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
29 FPU_REG const *b, u_char tagb, u_char signb, 29 FPU_REG const *b, u_char tagb, u_char signb,
30 FPU_REG *dest, int deststnr, int control_w); 30 FPU_REG * dest, int deststnr, int control_w);
31 31
32/* 32/*
33 Operates on st(0) and st(n), or on st(0) and temporary data. 33 Operates on st(0) and st(n), or on st(0) and temporary data.
@@ -35,340 +35,299 @@ int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
35 */ 35 */
36int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w) 36int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
37{ 37{
38 FPU_REG *a = &st(0); 38 FPU_REG *a = &st(0);
39 FPU_REG *dest = &st(deststnr); 39 FPU_REG *dest = &st(deststnr);
40 u_char signb = getsign(b); 40 u_char signb = getsign(b);
41 u_char taga = FPU_gettag0(); 41 u_char taga = FPU_gettag0();
42 u_char signa = getsign(a); 42 u_char signa = getsign(a);
43 u_char saved_sign = getsign(dest); 43 u_char saved_sign = getsign(dest);
44 int diff, tag, expa, expb; 44 int diff, tag, expa, expb;
45 45
46 if ( !(taga | tagb) ) 46 if (!(taga | tagb)) {
47 { 47 expa = exponent(a);
48 expa = exponent(a); 48 expb = exponent(b);
49 expb = exponent(b); 49
50 50 valid_add:
51 valid_add: 51 /* Both registers are valid */
52 /* Both registers are valid */ 52 if (!(signa ^ signb)) {
53 if (!(signa ^ signb)) 53 /* signs are the same */
54 { 54 tag =
55 /* signs are the same */ 55 FPU_u_add(a, b, dest, control_w, signa, expa, expb);
56 tag = FPU_u_add(a, b, dest, control_w, signa, expa, expb); 56 } else {
57 } 57 /* The signs are different, so do a subtraction */
58 else 58 diff = expa - expb;
59 { 59 if (!diff) {
60 /* The signs are different, so do a subtraction */ 60 diff = a->sigh - b->sigh; /* This works only if the ms bits
61 diff = expa - expb; 61 are identical. */
62 if (!diff) 62 if (!diff) {
63 { 63 diff = a->sigl > b->sigl;
64 diff = a->sigh - b->sigh; /* This works only if the ms bits 64 if (!diff)
65 are identical. */ 65 diff = -(a->sigl < b->sigl);
66 if (!diff) 66 }
67 { 67 }
68 diff = a->sigl > b->sigl; 68
69 if (!diff) 69 if (diff > 0) {
70 diff = -(a->sigl < b->sigl); 70 tag =
71 FPU_u_sub(a, b, dest, control_w, signa,
72 expa, expb);
73 } else if (diff < 0) {
74 tag =
75 FPU_u_sub(b, a, dest, control_w, signb,
76 expb, expa);
77 } else {
78 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
79 /* sign depends upon rounding mode */
80 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
81 ? SIGN_POS : SIGN_NEG);
82 return TAG_Zero;
83 }
71 } 84 }
72 }
73
74 if (diff > 0)
75 {
76 tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb);
77 }
78 else if ( diff < 0 )
79 {
80 tag = FPU_u_sub(b, a, dest, control_w, signb, expb, expa);
81 }
82 else
83 {
84 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
85 /* sign depends upon rounding mode */
86 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
87 ? SIGN_POS : SIGN_NEG);
88 return TAG_Zero;
89 }
90 }
91 85
92 if ( tag < 0 ) 86 if (tag < 0) {
93 { 87 setsign(dest, saved_sign);
94 setsign(dest, saved_sign); 88 return tag;
95 return tag; 89 }
90 FPU_settagi(deststnr, tag);
91 return tag;
96 } 92 }
97 FPU_settagi(deststnr, tag);
98 return tag;
99 }
100 93
101 if ( taga == TAG_Special ) 94 if (taga == TAG_Special)
102 taga = FPU_Special(a); 95 taga = FPU_Special(a);
103 if ( tagb == TAG_Special ) 96 if (tagb == TAG_Special)
104 tagb = FPU_Special(b); 97 tagb = FPU_Special(b);
105 98
106 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) 99 if (((taga == TAG_Valid) && (tagb == TW_Denormal))
107 || ((taga == TW_Denormal) && (tagb == TAG_Valid)) 100 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
108 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) 101 || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
109 { 102 FPU_REG x, y;
110 FPU_REG x, y; 103
104 if (denormal_operand() < 0)
105 return FPU_Exception;
106
107 FPU_to_exp16(a, &x);
108 FPU_to_exp16(b, &y);
109 a = &x;
110 b = &y;
111 expa = exponent16(a);
112 expb = exponent16(b);
113 goto valid_add;
114 }
111 115
112 if ( denormal_operand() < 0 ) 116 if ((taga == TW_NaN) || (tagb == TW_NaN)) {
113 return FPU_Exception; 117 if (deststnr == 0)
118 return real_2op_NaN(b, tagb, deststnr, a);
119 else
120 return real_2op_NaN(a, taga, deststnr, a);
121 }
114 122
115 FPU_to_exp16(a, &x); 123 return add_sub_specials(a, taga, signa, b, tagb, signb,
116 FPU_to_exp16(b, &y); 124 dest, deststnr, control_w);
117 a = &x;
118 b = &y;
119 expa = exponent16(a);
120 expb = exponent16(b);
121 goto valid_add;
122 }
123
124 if ( (taga == TW_NaN) || (tagb == TW_NaN) )
125 {
126 if ( deststnr == 0 )
127 return real_2op_NaN(b, tagb, deststnr, a);
128 else
129 return real_2op_NaN(a, taga, deststnr, a);
130 }
131
132 return add_sub_specials(a, taga, signa, b, tagb, signb,
133 dest, deststnr, control_w);
134} 125}
135 126
136
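
FPU_add() above (and FPU_sub() below) decide which operand goes first into FPU_u_sub() with the same three-stage magnitude test: exponent, then the high word of the significand, then the low word. Here is a self-contained sketch of that test; the struct is a stand-in for illustration, not the kernel's FPU_REG layout.

/*
 * Sketch of the operand-ordering test used by FPU_add()/FPU_sub():
 * compare exponents, then the high and low 32 bits of the significand,
 * so the unsigned subtraction is always big - small.
 */
#include <stdio.h>

struct reg {
	unsigned int sigl, sigh;	/* 64-bit significand in two words */
	int exp;
};

static int magnitude_diff(const struct reg *a, const struct reg *b)
{
	int diff = a->exp - b->exp;

	if (!diff) {
		/* works only because the normalised ms bits are identical */
		diff = a->sigh - b->sigh;
		if (!diff) {
			diff = a->sigl > b->sigl;
			if (!diff)
				diff = -(a->sigl < b->sigl);
		}
	}
	return diff;	/* >0: |a|>|b|, 0: equal, <0: |a|<|b| */
}

int main(void)
{
	struct reg a = { 0x1, 0x80000000u, 3 }, b = { 0x2, 0x80000000u, 3 };
	printf("%d\n", magnitude_diff(&a, &b));	/* prints a negative value */
	return 0;
}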
137/* Subtract b from a. (a-b) -> dest */ 127/* Subtract b from a. (a-b) -> dest */
138int FPU_sub(int flags, int rm, int control_w) 128int FPU_sub(int flags, int rm, int control_w)
139{ 129{
140 FPU_REG const *a, *b; 130 FPU_REG const *a, *b;
141 FPU_REG *dest; 131 FPU_REG *dest;
142 u_char taga, tagb, signa, signb, saved_sign, sign; 132 u_char taga, tagb, signa, signb, saved_sign, sign;
143 int diff, tag = 0, expa, expb, deststnr; 133 int diff, tag = 0, expa, expb, deststnr;
144 134
145 a = &st(0); 135 a = &st(0);
146 taga = FPU_gettag0(); 136 taga = FPU_gettag0();
147 137
148 deststnr = 0; 138 deststnr = 0;
149 if ( flags & LOADED ) 139 if (flags & LOADED) {
150 { 140 b = (FPU_REG *) rm;
151 b = (FPU_REG *)rm; 141 tagb = flags & 0x0f;
152 tagb = flags & 0x0f; 142 } else {
153 } 143 b = &st(rm);
154 else 144 tagb = FPU_gettagi(rm);
155 { 145
156 b = &st(rm); 146 if (flags & DEST_RM)
157 tagb = FPU_gettagi(rm); 147 deststnr = rm;
158
159 if ( flags & DEST_RM )
160 deststnr = rm;
161 }
162
163 signa = getsign(a);
164 signb = getsign(b);
165
166 if ( flags & REV )
167 {
168 signa ^= SIGN_NEG;
169 signb ^= SIGN_NEG;
170 }
171
172 dest = &st(deststnr);
173 saved_sign = getsign(dest);
174
175 if ( !(taga | tagb) )
176 {
177 expa = exponent(a);
178 expb = exponent(b);
179
180 valid_subtract:
181 /* Both registers are valid */
182
183 diff = expa - expb;
184
185 if (!diff)
186 {
187 diff = a->sigh - b->sigh; /* Works only if ms bits are identical */
188 if (!diff)
189 {
190 diff = a->sigl > b->sigl;
191 if (!diff)
192 diff = -(a->sigl < b->sigl);
193 }
194 } 148 }
195 149
196 switch ( (((int)signa)*2 + signb) / SIGN_NEG ) 150 signa = getsign(a);
197 { 151 signb = getsign(b);
198 case 0: /* P - P */ 152
199 case 3: /* N - N */ 153 if (flags & REV) {
200 if (diff > 0) 154 signa ^= SIGN_NEG;
201 { 155 signb ^= SIGN_NEG;
202 /* |a| > |b| */ 156 }
203 tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb); 157
204 } 158 dest = &st(deststnr);
205 else if ( diff == 0 ) 159 saved_sign = getsign(dest);
206 { 160
207 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); 161 if (!(taga | tagb)) {
208 162 expa = exponent(a);
209 /* sign depends upon rounding mode */ 163 expb = exponent(b);
210 setsign(dest, ((control_w & CW_RC) != RC_DOWN) 164
211 ? SIGN_POS : SIGN_NEG); 165 valid_subtract:
212 return TAG_Zero; 166 /* Both registers are valid */
213 } 167
214 else 168 diff = expa - expb;
215 { 169
216 sign = signa ^ SIGN_NEG; 170 if (!diff) {
217 tag = FPU_u_sub(b, a, dest, control_w, sign, expb, expa); 171 diff = a->sigh - b->sigh; /* Works only if ms bits are identical */
218 } 172 if (!diff) {
219 break; 173 diff = a->sigl > b->sigl;
220 case 1: /* P - N */ 174 if (!diff)
221 tag = FPU_u_add(a, b, dest, control_w, SIGN_POS, expa, expb); 175 diff = -(a->sigl < b->sigl);
222 break; 176 }
223 case 2: /* N - P */ 177 }
224 tag = FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa, expb); 178
225 break; 179 switch ((((int)signa) * 2 + signb) / SIGN_NEG) {
180 case 0: /* P - P */
181 case 3: /* N - N */
182 if (diff > 0) {
183 /* |a| > |b| */
184 tag =
185 FPU_u_sub(a, b, dest, control_w, signa,
186 expa, expb);
187 } else if (diff == 0) {
188 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
189
190 /* sign depends upon rounding mode */
191 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
192 ? SIGN_POS : SIGN_NEG);
193 return TAG_Zero;
194 } else {
195 sign = signa ^ SIGN_NEG;
196 tag =
197 FPU_u_sub(b, a, dest, control_w, sign, expb,
198 expa);
199 }
200 break;
201 case 1: /* P - N */
202 tag =
203 FPU_u_add(a, b, dest, control_w, SIGN_POS, expa,
204 expb);
205 break;
206 case 2: /* N - P */
207 tag =
208 FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa,
209 expb);
210 break;
226#ifdef PARANOID 211#ifdef PARANOID
227 default: 212 default:
228 EXCEPTION(EX_INTERNAL|0x111); 213 EXCEPTION(EX_INTERNAL | 0x111);
229 return -1; 214 return -1;
230#endif 215#endif
216 }
217 if (tag < 0) {
218 setsign(dest, saved_sign);
219 return tag;
220 }
221 FPU_settagi(deststnr, tag);
222 return tag;
231 } 223 }
232 if ( tag < 0 )
233 {
234 setsign(dest, saved_sign);
235 return tag;
236 }
237 FPU_settagi(deststnr, tag);
238 return tag;
239 }
240 224
241 if ( taga == TAG_Special ) 225 if (taga == TAG_Special)
242 taga = FPU_Special(a); 226 taga = FPU_Special(a);
243 if ( tagb == TAG_Special ) 227 if (tagb == TAG_Special)
244 tagb = FPU_Special(b); 228 tagb = FPU_Special(b);
245 229
246 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) 230 if (((taga == TAG_Valid) && (tagb == TW_Denormal))
247 || ((taga == TW_Denormal) && (tagb == TAG_Valid)) 231 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
248 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) 232 || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
249 { 233 FPU_REG x, y;
250 FPU_REG x, y;
251 234
252 if ( denormal_operand() < 0 ) 235 if (denormal_operand() < 0)
253 return FPU_Exception; 236 return FPU_Exception;
237
238 FPU_to_exp16(a, &x);
239 FPU_to_exp16(b, &y);
240 a = &x;
241 b = &y;
242 expa = exponent16(a);
243 expb = exponent16(b);
254 244
255 FPU_to_exp16(a, &x); 245 goto valid_subtract;
256 FPU_to_exp16(b, &y);
257 a = &x;
258 b = &y;
259 expa = exponent16(a);
260 expb = exponent16(b);
261
262 goto valid_subtract;
263 }
264
265 if ( (taga == TW_NaN) || (tagb == TW_NaN) )
266 {
267 FPU_REG const *d1, *d2;
268 if ( flags & REV )
269 {
270 d1 = b;
271 d2 = a;
272 } 246 }
273 else 247
274 { 248 if ((taga == TW_NaN) || (tagb == TW_NaN)) {
275 d1 = a; 249 FPU_REG const *d1, *d2;
276 d2 = b; 250 if (flags & REV) {
251 d1 = b;
252 d2 = a;
253 } else {
254 d1 = a;
255 d2 = b;
256 }
257 if (flags & LOADED)
258 return real_2op_NaN(b, tagb, deststnr, d1);
259 if (flags & DEST_RM)
260 return real_2op_NaN(a, taga, deststnr, d2);
261 else
262 return real_2op_NaN(b, tagb, deststnr, d2);
277 } 263 }
278 if ( flags & LOADED )
279 return real_2op_NaN(b, tagb, deststnr, d1);
280 if ( flags & DEST_RM )
281 return real_2op_NaN(a, taga, deststnr, d2);
282 else
283 return real_2op_NaN(b, tagb, deststnr, d2);
284 }
285
286 return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG,
287 dest, deststnr, control_w);
288}
289 264
265 return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG,
266 dest, deststnr, control_w);
267}
290 268
291static 269static
292int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, 270int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
293 FPU_REG const *b, u_char tagb, u_char signb, 271 FPU_REG const *b, u_char tagb, u_char signb,
294 FPU_REG *dest, int deststnr, int control_w) 272 FPU_REG * dest, int deststnr, int control_w)
295{ 273{
296 if ( ((taga == TW_Denormal) || (tagb == TW_Denormal)) 274 if (((taga == TW_Denormal) || (tagb == TW_Denormal))
297 && (denormal_operand() < 0) ) 275 && (denormal_operand() < 0))
298 return FPU_Exception; 276 return FPU_Exception;
299 277
300 if (taga == TAG_Zero) 278 if (taga == TAG_Zero) {
301 { 279 if (tagb == TAG_Zero) {
302 if (tagb == TAG_Zero) 280 /* Both are zero, result will be zero. */
303 { 281 u_char different_signs = signa ^ signb;
304 /* Both are zero, result will be zero. */ 282
305 u_char different_signs = signa ^ signb; 283 FPU_copy_to_regi(a, TAG_Zero, deststnr);
306 284 if (different_signs) {
307 FPU_copy_to_regi(a, TAG_Zero, deststnr); 285 /* Signs are different. */
308 if ( different_signs ) 286 /* Sign of answer depends upon rounding mode. */
309 { 287 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
310 /* Signs are different. */ 288 ? SIGN_POS : SIGN_NEG);
311 /* Sign of answer depends upon rounding mode. */ 289 } else
312 setsign(dest, ((control_w & CW_RC) != RC_DOWN) 290 setsign(dest, signa); /* signa may differ from the sign of a. */
313 ? SIGN_POS : SIGN_NEG); 291 return TAG_Zero;
314 } 292 } else {
315 else 293 reg_copy(b, dest);
316 setsign(dest, signa); /* signa may differ from the sign of a. */ 294 if ((tagb == TW_Denormal) && (b->sigh & 0x80000000)) {
317 return TAG_Zero; 295 /* A pseudoDenormal, convert it. */
318 } 296 addexponent(dest, 1);
319 else 297 tagb = TAG_Valid;
320 { 298 } else if (tagb > TAG_Empty)
321 reg_copy(b, dest); 299 tagb = TAG_Special;
322 if ( (tagb == TW_Denormal) && (b->sigh & 0x80000000) ) 300 setsign(dest, signb); /* signb may differ from the sign of b. */
323 { 301 FPU_settagi(deststnr, tagb);
324 /* A pseudoDenormal, convert it. */ 302 return tagb;
325 addexponent(dest, 1); 303 }
326 tagb = TAG_Valid; 304 } else if (tagb == TAG_Zero) {
327 } 305 reg_copy(a, dest);
328 else if ( tagb > TAG_Empty ) 306 if ((taga == TW_Denormal) && (a->sigh & 0x80000000)) {
329 tagb = TAG_Special; 307 /* A pseudoDenormal */
330 setsign(dest, signb); /* signb may differ from the sign of b. */ 308 addexponent(dest, 1);
331 FPU_settagi(deststnr, tagb); 309 taga = TAG_Valid;
332 return tagb; 310 } else if (taga > TAG_Empty)
333 } 311 taga = TAG_Special;
334 } 312 setsign(dest, signa); /* signa may differ from the sign of a. */
335 else if (tagb == TAG_Zero) 313 FPU_settagi(deststnr, taga);
336 { 314 return taga;
337 reg_copy(a, dest); 315 } else if (taga == TW_Infinity) {
338 if ( (taga == TW_Denormal) && (a->sigh & 0x80000000) ) 316 if ((tagb != TW_Infinity) || (signa == signb)) {
339 { 317 FPU_copy_to_regi(a, TAG_Special, deststnr);
340 /* A pseudoDenormal */ 318 setsign(dest, signa); /* signa may differ from the sign of a. */
341 addexponent(dest, 1); 319 return taga;
342 taga = TAG_Valid; 320 }
343 } 321 /* Infinity-Infinity is undefined. */
344 else if ( taga > TAG_Empty ) 322 return arith_invalid(deststnr);
345 taga = TAG_Special; 323 } else if (tagb == TW_Infinity) {
346 setsign(dest, signa); /* signa may differ from the sign of a. */ 324 FPU_copy_to_regi(b, TAG_Special, deststnr);
347 FPU_settagi(deststnr, taga); 325 setsign(dest, signb); /* signb may differ from the sign of b. */
348 return taga; 326 return tagb;
349 }
350 else if (taga == TW_Infinity)
351 {
352 if ( (tagb != TW_Infinity) || (signa == signb) )
353 {
354 FPU_copy_to_regi(a, TAG_Special, deststnr);
355 setsign(dest, signa); /* signa may differ from the sign of a. */
356 return taga;
357 } 327 }
358 /* Infinity-Infinity is undefined. */
359 return arith_invalid(deststnr);
360 }
361 else if (tagb == TW_Infinity)
362 {
363 FPU_copy_to_regi(b, TAG_Special, deststnr);
364 setsign(dest, signb); /* signb may differ from the sign of b. */
365 return tagb;
366 }
367
368#ifdef PARANOID 328#ifdef PARANOID
369 EXCEPTION(EX_INTERNAL|0x101); 329 EXCEPTION(EX_INTERNAL | 0x101);
370#endif 330#endif
371 331
372 return FPU_Exception; 332 return FPU_Exception;
373} 333}
374
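
One behavioural detail worth noting in the file above: whenever opposite-signed operands cancel exactly, in FPU_add(), FPU_sub() and add_sub_specials() alike, the sign of the resulting zero is taken from the rounding-control field and is negative only for round-toward-minus-infinity. A tiny sketch of that rule follows; the rounding-control macros follow the x86 control-word layout, but the SIGN_* values are local stand-ins rather than the emulator's own definitions.

/*
 * Sketch of the signed-zero rule that appears three times above: an
 * exact-cancellation zero is negative only in round-down mode.
 */
#include <stdio.h>

#define RC_RND  0x0000
#define RC_DOWN 0x0400
#define RC_UP   0x0800
#define RC_CHOP 0x0c00
#define CW_RC   0x0c00		/* rounding-control bits of the control word */

#define SIGN_POS 0		/* stand-in values */
#define SIGN_NEG 1

static int zero_result_sign(int control_w)
{
	/* sign of an exact-cancellation zero depends on rounding mode */
	return ((control_w & CW_RC) != RC_DOWN) ? SIGN_POS : SIGN_NEG;
}

int main(void)
{
	printf("%d %d\n", zero_result_sign(RC_RND), zero_result_sign(RC_DOWN));
	return 0;	/* prints "0 1": +0 normally, -0 when rounding down */
}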
diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c
index f37c5b5a35a..ecce55fc2e2 100644
--- a/arch/x86/math-emu/reg_compare.c
+++ b/arch/x86/math-emu/reg_compare.c
@@ -20,362 +20,331 @@
20#include "control_w.h" 20#include "control_w.h"
21#include "status_w.h" 21#include "status_w.h"
22 22
23
24static int compare(FPU_REG const *b, int tagb) 23static int compare(FPU_REG const *b, int tagb)
25{ 24{
26 int diff, exp0, expb; 25 int diff, exp0, expb;
27 u_char st0_tag; 26 u_char st0_tag;
28 FPU_REG *st0_ptr; 27 FPU_REG *st0_ptr;
29 FPU_REG x, y; 28 FPU_REG x, y;
30 u_char st0_sign, signb = getsign(b); 29 u_char st0_sign, signb = getsign(b);
31 30
32 st0_ptr = &st(0); 31 st0_ptr = &st(0);
33 st0_tag = FPU_gettag0(); 32 st0_tag = FPU_gettag0();
34 st0_sign = getsign(st0_ptr); 33 st0_sign = getsign(st0_ptr);
35 34
36 if ( tagb == TAG_Special ) 35 if (tagb == TAG_Special)
37 tagb = FPU_Special(b); 36 tagb = FPU_Special(b);
38 if ( st0_tag == TAG_Special ) 37 if (st0_tag == TAG_Special)
39 st0_tag = FPU_Special(st0_ptr); 38 st0_tag = FPU_Special(st0_ptr);
40 39
41 if ( ((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal)) 40 if (((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal))
42 || ((tagb != TAG_Valid) && (tagb != TW_Denormal)) ) 41 || ((tagb != TAG_Valid) && (tagb != TW_Denormal))) {
43 { 42 if (st0_tag == TAG_Zero) {
44 if ( st0_tag == TAG_Zero ) 43 if (tagb == TAG_Zero)
45 { 44 return COMP_A_eq_B;
46 if ( tagb == TAG_Zero ) return COMP_A_eq_B; 45 if (tagb == TAG_Valid)
47 if ( tagb == TAG_Valid ) 46 return ((signb ==
48 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); 47 SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
49 if ( tagb == TW_Denormal ) 48 if (tagb == TW_Denormal)
50 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) 49 return ((signb ==
51 | COMP_Denormal; 50 SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
52 } 51 | COMP_Denormal;
53 else if ( tagb == TAG_Zero ) 52 } else if (tagb == TAG_Zero) {
54 { 53 if (st0_tag == TAG_Valid)
55 if ( st0_tag == TAG_Valid ) 54 return ((st0_sign ==
56 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); 55 SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
57 if ( st0_tag == TW_Denormal ) 56 if (st0_tag == TW_Denormal)
58 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) 57 return ((st0_sign ==
59 | COMP_Denormal; 58 SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
59 | COMP_Denormal;
60 }
61
62 if (st0_tag == TW_Infinity) {
63 if ((tagb == TAG_Valid) || (tagb == TAG_Zero))
64 return ((st0_sign ==
65 SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
66 else if (tagb == TW_Denormal)
67 return ((st0_sign ==
68 SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
69 | COMP_Denormal;
70 else if (tagb == TW_Infinity) {
71 /* The 80486 book says that infinities can be equal! */
72 return (st0_sign == signb) ? COMP_A_eq_B :
73 ((st0_sign ==
74 SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
75 }
76 /* Fall through to the NaN code */
77 } else if (tagb == TW_Infinity) {
78 if ((st0_tag == TAG_Valid) || (st0_tag == TAG_Zero))
79 return ((signb ==
80 SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
81 if (st0_tag == TW_Denormal)
82 return ((signb ==
83 SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
84 | COMP_Denormal;
85 /* Fall through to the NaN code */
86 }
87
88 /* The only possibility now should be that one of the arguments
89 is a NaN */
90 if ((st0_tag == TW_NaN) || (tagb == TW_NaN)) {
91 int signalling = 0, unsupported = 0;
92 if (st0_tag == TW_NaN) {
93 signalling =
94 (st0_ptr->sigh & 0xc0000000) == 0x80000000;
95 unsupported = !((exponent(st0_ptr) == EXP_OVER)
96 && (st0_ptr->
97 sigh & 0x80000000));
98 }
99 if (tagb == TW_NaN) {
100 signalling |=
101 (b->sigh & 0xc0000000) == 0x80000000;
102 unsupported |= !((exponent(b) == EXP_OVER)
103 && (b->sigh & 0x80000000));
104 }
105 if (signalling || unsupported)
106 return COMP_No_Comp | COMP_SNaN | COMP_NaN;
107 else
108 /* Neither is a signaling NaN */
109 return COMP_No_Comp | COMP_NaN;
110 }
111
112 EXCEPTION(EX_Invalid);
60 } 113 }
61 114
62 if ( st0_tag == TW_Infinity ) 115 if (st0_sign != signb) {
63 { 116 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
64 if ( (tagb == TAG_Valid) || (tagb == TAG_Zero) ) 117 | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
65 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); 118 COMP_Denormal : 0);
66 else if ( tagb == TW_Denormal )
67 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
68 | COMP_Denormal;
69 else if ( tagb == TW_Infinity )
70 {
71 /* The 80486 book says that infinities can be equal! */
72 return (st0_sign == signb) ? COMP_A_eq_B :
73 ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
74 }
75 /* Fall through to the NaN code */
76 }
77 else if ( tagb == TW_Infinity )
78 {
79 if ( (st0_tag == TAG_Valid) || (st0_tag == TAG_Zero) )
80 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
81 if ( st0_tag == TW_Denormal )
82 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
83 | COMP_Denormal;
84 /* Fall through to the NaN code */
85 } 119 }
86 120
87 /* The only possibility now should be that one of the arguments 121 if ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) {
88 is a NaN */ 122 FPU_to_exp16(st0_ptr, &x);
89 if ( (st0_tag == TW_NaN) || (tagb == TW_NaN) ) 123 FPU_to_exp16(b, &y);
90 { 124 st0_ptr = &x;
91 int signalling = 0, unsupported = 0; 125 b = &y;
92 if ( st0_tag == TW_NaN ) 126 exp0 = exponent16(st0_ptr);
93 { 127 expb = exponent16(b);
94 signalling = (st0_ptr->sigh & 0xc0000000) == 0x80000000; 128 } else {
95 unsupported = !((exponent(st0_ptr) == EXP_OVER) 129 exp0 = exponent(st0_ptr);
96 && (st0_ptr->sigh & 0x80000000)); 130 expb = exponent(b);
97 }
98 if ( tagb == TW_NaN )
99 {
100 signalling |= (b->sigh & 0xc0000000) == 0x80000000;
101 unsupported |= !((exponent(b) == EXP_OVER)
102 && (b->sigh & 0x80000000));
103 }
104 if ( signalling || unsupported )
105 return COMP_No_Comp | COMP_SNaN | COMP_NaN;
106 else
107 /* Neither is a signaling NaN */
108 return COMP_No_Comp | COMP_NaN;
109 } 131 }
110
111 EXCEPTION(EX_Invalid);
112 }
113
114 if (st0_sign != signb)
115 {
116 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
117 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
118 COMP_Denormal : 0);
119 }
120
121 if ( (st0_tag == TW_Denormal) || (tagb == TW_Denormal) )
122 {
123 FPU_to_exp16(st0_ptr, &x);
124 FPU_to_exp16(b, &y);
125 st0_ptr = &x;
126 b = &y;
127 exp0 = exponent16(st0_ptr);
128 expb = exponent16(b);
129 }
130 else
131 {
132 exp0 = exponent(st0_ptr);
133 expb = exponent(b);
134 }
135 132
136#ifdef PARANOID 133#ifdef PARANOID
137 if (!(st0_ptr->sigh & 0x80000000)) EXCEPTION(EX_Invalid); 134 if (!(st0_ptr->sigh & 0x80000000))
138 if (!(b->sigh & 0x80000000)) EXCEPTION(EX_Invalid); 135 EXCEPTION(EX_Invalid);
136 if (!(b->sigh & 0x80000000))
137 EXCEPTION(EX_Invalid);
139#endif /* PARANOID */ 138#endif /* PARANOID */
140 139
141 diff = exp0 - expb; 140 diff = exp0 - expb;
142 if ( diff == 0 ) 141 if (diff == 0) {
143 { 142 diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are
144 diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are 143 identical */
145 identical */ 144 if (diff == 0) {
146 if ( diff == 0 ) 145 diff = st0_ptr->sigl > b->sigl;
147 { 146 if (diff == 0)
148 diff = st0_ptr->sigl > b->sigl; 147 diff = -(st0_ptr->sigl < b->sigl);
149 if ( diff == 0 ) 148 }
150 diff = -(st0_ptr->sigl < b->sigl);
151 } 149 }
152 }
153
154 if ( diff > 0 )
155 {
156 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
157 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
158 COMP_Denormal : 0);
159 }
160 if ( diff < 0 )
161 {
162 return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
163 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
164 COMP_Denormal : 0);
165 }
166
167 return COMP_A_eq_B
168 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
169 COMP_Denormal : 0);
170 150
171} 151 if (diff > 0) {
152 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
153 | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
154 COMP_Denormal : 0);
155 }
156 if (diff < 0) {
157 return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
158 | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
159 COMP_Denormal : 0);
160 }
172 161
162 return COMP_A_eq_B
163 | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
164 COMP_Denormal : 0);
165
166}
173 167
174/* This function requires that st(0) is not empty */ 168/* This function requires that st(0) is not empty */
175int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) 169int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag)
176{ 170{
177 int f = 0, c; 171 int f = 0, c;
178 172
179 c = compare(loaded_data, loaded_tag); 173 c = compare(loaded_data, loaded_tag);
180 174
181 if (c & COMP_NaN) 175 if (c & COMP_NaN) {
182 { 176 EXCEPTION(EX_Invalid);
183 EXCEPTION(EX_Invalid); 177 f = SW_C3 | SW_C2 | SW_C0;
184 f = SW_C3 | SW_C2 | SW_C0; 178 } else
185 } 179 switch (c & 7) {
186 else 180 case COMP_A_lt_B:
187 switch (c & 7) 181 f = SW_C0;
188 { 182 break;
189 case COMP_A_lt_B: 183 case COMP_A_eq_B:
190 f = SW_C0; 184 f = SW_C3;
191 break; 185 break;
192 case COMP_A_eq_B: 186 case COMP_A_gt_B:
193 f = SW_C3; 187 f = 0;
194 break; 188 break;
195 case COMP_A_gt_B: 189 case COMP_No_Comp:
196 f = 0; 190 f = SW_C3 | SW_C2 | SW_C0;
197 break; 191 break;
198 case COMP_No_Comp:
199 f = SW_C3 | SW_C2 | SW_C0;
200 break;
201#ifdef PARANOID 192#ifdef PARANOID
202 default: 193 default:
203 EXCEPTION(EX_INTERNAL|0x121); 194 EXCEPTION(EX_INTERNAL | 0x121);
204 f = SW_C3 | SW_C2 | SW_C0; 195 f = SW_C3 | SW_C2 | SW_C0;
205 break; 196 break;
206#endif /* PARANOID */ 197#endif /* PARANOID */
207 } 198 }
208 setcc(f); 199 setcc(f);
209 if (c & COMP_Denormal) 200 if (c & COMP_Denormal) {
210 { 201 return denormal_operand() < 0;
211 return denormal_operand() < 0; 202 }
212 } 203 return 0;
213 return 0;
214} 204}
215 205
216
217static int compare_st_st(int nr) 206static int compare_st_st(int nr)
218{ 207{
219 int f = 0, c; 208 int f = 0, c;
220 FPU_REG *st_ptr; 209 FPU_REG *st_ptr;
221 210
222 if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) ) 211 if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
223 { 212 setcc(SW_C3 | SW_C2 | SW_C0);
224 setcc(SW_C3 | SW_C2 | SW_C0); 213 /* Stack fault */
225 /* Stack fault */ 214 EXCEPTION(EX_StackUnder);
226 EXCEPTION(EX_StackUnder); 215 return !(control_word & CW_Invalid);
227 return !(control_word & CW_Invalid); 216 }
228 } 217
229 218 st_ptr = &st(nr);
230 st_ptr = &st(nr); 219 c = compare(st_ptr, FPU_gettagi(nr));
231 c = compare(st_ptr, FPU_gettagi(nr)); 220 if (c & COMP_NaN) {
232 if (c & COMP_NaN) 221 setcc(SW_C3 | SW_C2 | SW_C0);
233 { 222 EXCEPTION(EX_Invalid);
234 setcc(SW_C3 | SW_C2 | SW_C0); 223 return !(control_word & CW_Invalid);
235 EXCEPTION(EX_Invalid); 224 } else
236 return !(control_word & CW_Invalid); 225 switch (c & 7) {
237 } 226 case COMP_A_lt_B:
238 else 227 f = SW_C0;
239 switch (c & 7) 228 break;
240 { 229 case COMP_A_eq_B:
241 case COMP_A_lt_B: 230 f = SW_C3;
242 f = SW_C0; 231 break;
243 break; 232 case COMP_A_gt_B:
244 case COMP_A_eq_B: 233 f = 0;
245 f = SW_C3; 234 break;
246 break; 235 case COMP_No_Comp:
247 case COMP_A_gt_B: 236 f = SW_C3 | SW_C2 | SW_C0;
248 f = 0; 237 break;
249 break;
250 case COMP_No_Comp:
251 f = SW_C3 | SW_C2 | SW_C0;
252 break;
253#ifdef PARANOID 238#ifdef PARANOID
254 default: 239 default:
255 EXCEPTION(EX_INTERNAL|0x122); 240 EXCEPTION(EX_INTERNAL | 0x122);
256 f = SW_C3 | SW_C2 | SW_C0; 241 f = SW_C3 | SW_C2 | SW_C0;
257 break; 242 break;
258#endif /* PARANOID */ 243#endif /* PARANOID */
259 } 244 }
260 setcc(f); 245 setcc(f);
261 if (c & COMP_Denormal) 246 if (c & COMP_Denormal) {
262 { 247 return denormal_operand() < 0;
263 return denormal_operand() < 0; 248 }
264 } 249 return 0;
265 return 0;
266} 250}
267 251
268
269static int compare_u_st_st(int nr) 252static int compare_u_st_st(int nr)
270{ 253{
271 int f = 0, c; 254 int f = 0, c;
272 FPU_REG *st_ptr; 255 FPU_REG *st_ptr;
273 256
274 if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) ) 257 if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
275 { 258 setcc(SW_C3 | SW_C2 | SW_C0);
276 setcc(SW_C3 | SW_C2 | SW_C0); 259 /* Stack fault */
277 /* Stack fault */ 260 EXCEPTION(EX_StackUnder);
278 EXCEPTION(EX_StackUnder); 261 return !(control_word & CW_Invalid);
279 return !(control_word & CW_Invalid);
280 }
281
282 st_ptr = &st(nr);
283 c = compare(st_ptr, FPU_gettagi(nr));
284 if (c & COMP_NaN)
285 {
286 setcc(SW_C3 | SW_C2 | SW_C0);
287 if (c & COMP_SNaN) /* This is the only difference between
288 un-ordered and ordinary comparisons */
289 {
290 EXCEPTION(EX_Invalid);
291 return !(control_word & CW_Invalid);
292 } 262 }
293 return 0; 263
294 } 264 st_ptr = &st(nr);
295 else 265 c = compare(st_ptr, FPU_gettagi(nr));
296 switch (c & 7) 266 if (c & COMP_NaN) {
297 { 267 setcc(SW_C3 | SW_C2 | SW_C0);
298 case COMP_A_lt_B: 268 if (c & COMP_SNaN) { /* This is the only difference between
299 f = SW_C0; 269 un-ordered and ordinary comparisons */
300 break; 270 EXCEPTION(EX_Invalid);
301 case COMP_A_eq_B: 271 return !(control_word & CW_Invalid);
302 f = SW_C3; 272 }
303 break; 273 return 0;
304 case COMP_A_gt_B: 274 } else
305 f = 0; 275 switch (c & 7) {
306 break; 276 case COMP_A_lt_B:
307 case COMP_No_Comp: 277 f = SW_C0;
308 f = SW_C3 | SW_C2 | SW_C0; 278 break;
309 break; 279 case COMP_A_eq_B:
280 f = SW_C3;
281 break;
282 case COMP_A_gt_B:
283 f = 0;
284 break;
285 case COMP_No_Comp:
286 f = SW_C3 | SW_C2 | SW_C0;
287 break;
310#ifdef PARANOID 288#ifdef PARANOID
311 default: 289 default:
312 EXCEPTION(EX_INTERNAL|0x123); 290 EXCEPTION(EX_INTERNAL | 0x123);
313 f = SW_C3 | SW_C2 | SW_C0; 291 f = SW_C3 | SW_C2 | SW_C0;
314 break; 292 break;
315#endif /* PARANOID */ 293#endif /* PARANOID */
316 } 294 }
317 setcc(f); 295 setcc(f);
318 if (c & COMP_Denormal) 296 if (c & COMP_Denormal) {
319 { 297 return denormal_operand() < 0;
320 return denormal_operand() < 0; 298 }
321 } 299 return 0;
322 return 0;
323} 300}
324 301
325/*---------------------------------------------------------------------------*/ 302/*---------------------------------------------------------------------------*/
326 303
327void fcom_st(void) 304void fcom_st(void)
328{ 305{
329 /* fcom st(i) */ 306 /* fcom st(i) */
330 compare_st_st(FPU_rm); 307 compare_st_st(FPU_rm);
331} 308}
332 309
333
334void fcompst(void) 310void fcompst(void)
335{ 311{
336 /* fcomp st(i) */ 312 /* fcomp st(i) */
337 if ( !compare_st_st(FPU_rm) ) 313 if (!compare_st_st(FPU_rm))
338 FPU_pop(); 314 FPU_pop();
339} 315}
340 316
341
342void fcompp(void) 317void fcompp(void)
343{ 318{
344 /* fcompp */ 319 /* fcompp */
345 if (FPU_rm != 1) 320 if (FPU_rm != 1) {
346 { 321 FPU_illegal();
347 FPU_illegal(); 322 return;
348 return; 323 }
349 } 324 if (!compare_st_st(1))
350 if ( !compare_st_st(1) ) 325 poppop();
351 poppop();
352} 326}
353 327
354
355void fucom_(void) 328void fucom_(void)
356{ 329{
357 /* fucom st(i) */ 330 /* fucom st(i) */
358 compare_u_st_st(FPU_rm); 331 compare_u_st_st(FPU_rm);
359 332
360} 333}
361 334
362
363void fucomp(void) 335void fucomp(void)
364{ 336{
365 /* fucomp st(i) */ 337 /* fucomp st(i) */
366 if ( !compare_u_st_st(FPU_rm) ) 338 if (!compare_u_st_st(FPU_rm))
367 FPU_pop(); 339 FPU_pop();
368} 340}
369 341
370
371void fucompp(void) 342void fucompp(void)
372{ 343{
373 /* fucompp */ 344 /* fucompp */
374 if (FPU_rm == 1) 345 if (FPU_rm == 1) {
375 { 346 if (!compare_u_st_st(1))
376 if ( !compare_u_st_st(1) ) 347 poppop();
377 poppop(); 348 } else
378 } 349 FPU_illegal();
379 else
380 FPU_illegal();
381} 350}
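
The comparison helpers above all funnel their result through the same mapping from the internal COMP_* outcome onto the i387 condition codes C3/C2/C0 in the status word. The stand-alone sketch below restates that mapping with placeholder names; the enum, the helper and its constants are illustrative stand-ins rather than the emulator's own definitions, though the bit positions used are the conventional x87 ones.

/* Illustrative sketch only: how a compare() outcome is folded into the
 * i387 condition codes C3/C2/C0, mirroring the switch statements above.
 * The names and constants are stand-ins, not the emulator's own. */
#include <stdio.h>

enum cmp { CMP_LT, CMP_EQ, CMP_GT, CMP_UNORDERED };

#define C0 0x0100	/* conventional x87 status-word bit positions */
#define C2 0x0400
#define C3 0x4000

static unsigned int cc_from_compare(enum cmp c)
{
	switch (c) {
	case CMP_LT:	return C0;		/* st(0) <  operand */
	case CMP_EQ:	return C3;		/* st(0) == operand */
	case CMP_GT:	return 0;		/* st(0) >  operand */
	default:	return C3 | C2 | C0;	/* unordered (NaN)  */
	}
}

int main(void)
{
	printf("lt=%#x eq=%#x gt=%#x unordered=%#x\n",
	       cc_from_compare(CMP_LT), cc_from_compare(CMP_EQ),
	       cc_from_compare(CMP_GT), cc_from_compare(CMP_UNORDERED));
	return 0;
}

The unordered pattern (all three flags set) is also what the stack-underflow paths above write before raising EX_StackUnder.
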
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
index a8501580196..04869e64b18 100644
--- a/arch/x86/math-emu/reg_constant.c
+++ b/arch/x86/math-emu/reg_constant.c
@@ -16,29 +16,28 @@
16#include "reg_constant.h" 16#include "reg_constant.h"
17#include "control_w.h" 17#include "control_w.h"
18 18
19
20#define MAKE_REG(s,e,l,h) { l, h, \ 19#define MAKE_REG(s,e,l,h) { l, h, \
21 ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } 20 ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) }
22 21
23FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); 22FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000);
24#if 0 23#if 0
25FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000); 24FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000);
26FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000); 25FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000);
27#endif /* 0 */ 26#endif /* 0 */
28static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b); 27static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b);
29static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29); 28static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29);
30FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2); 29FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2);
31FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2); 30FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2);
32FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2); 31FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2);
33static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84); 32static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84);
34static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7); 33static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7);
35 34
36/* Extra bits to take pi/2 to more than 128 bits precision. */ 35/* Extra bits to take pi/2 to more than 128 bits precision. */
37FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66, 36FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66,
38 0xfc8f8cbb, 0xece675d1); 37 0xfc8f8cbb, 0xece675d1);
39 38
40/* Only the sign (and tag) is used in internal zeroes */ 39/* Only the sign (and tag) is used in internal zeroes */
41FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0); 40FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0);
42 41
43/* Only the sign and significand (and tag) are used in internal NaNs */ 42/* Only the sign and significand (and tag) are used in internal NaNs */
44/* The 80486 never generates one of these 43/* The 80486 never generates one of these
@@ -48,24 +47,22 @@ FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000);
48FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000); 47FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000);
49 48
50/* Only the sign (and tag) is used in internal infinities */ 49/* Only the sign (and tag) is used in internal infinities */
51FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000); 50FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000);
52
53 51
54static void fld_const(FPU_REG const *c, int adj, u_char tag) 52static void fld_const(FPU_REG const *c, int adj, u_char tag)
55{ 53{
56 FPU_REG *st_new_ptr; 54 FPU_REG *st_new_ptr;
57 55
58 if ( STACK_OVERFLOW ) 56 if (STACK_OVERFLOW) {
59 { 57 FPU_stack_overflow();
60 FPU_stack_overflow(); 58 return;
61 return; 59 }
62 } 60 push();
63 push(); 61 reg_copy(c, st_new_ptr);
64 reg_copy(c, st_new_ptr); 62 st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to
65 st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to 63 borrow or carry. */
66 borrow or carry. */ 64 FPU_settag0(tag);
67 FPU_settag0(tag); 65 clear_C1();
68 clear_C1();
69} 66}
70 67
71/* A fast way to find out whether x is one of RC_DOWN or RC_CHOP 68/* A fast way to find out whether x is one of RC_DOWN or RC_CHOP
@@ -75,46 +72,46 @@ static void fld_const(FPU_REG const *c, int adj, u_char tag)
75 72
76static void fld1(int rc) 73static void fld1(int rc)
77{ 74{
78 fld_const(&CONST_1, 0, TAG_Valid); 75 fld_const(&CONST_1, 0, TAG_Valid);
79} 76}
80 77
81static void fldl2t(int rc) 78static void fldl2t(int rc)
82{ 79{
83 fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid); 80 fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid);
84} 81}
85 82
86static void fldl2e(int rc) 83static void fldl2e(int rc)
87{ 84{
88 fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); 85 fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
89} 86}
90 87
91static void fldpi(int rc) 88static void fldpi(int rc)
92{ 89{
93 fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); 90 fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
94} 91}
95 92
96static void fldlg2(int rc) 93static void fldlg2(int rc)
97{ 94{
98 fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); 95 fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
99} 96}
100 97
101static void fldln2(int rc) 98static void fldln2(int rc)
102{ 99{
103 fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); 100 fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
104} 101}
105 102
106static void fldz(int rc) 103static void fldz(int rc)
107{ 104{
108 fld_const(&CONST_Z, 0, TAG_Zero); 105 fld_const(&CONST_Z, 0, TAG_Zero);
109} 106}
110 107
111typedef void (*FUNC_RC)(int); 108typedef void (*FUNC_RC) (int);
112 109
113static FUNC_RC constants_table[] = { 110static FUNC_RC constants_table[] = {
114 fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC)FPU_illegal 111 fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC) FPU_illegal
115}; 112};
116 113
117void fconst(void) 114void fconst(void)
118{ 115{
119 (constants_table[FPU_rm])(control_word & CW_RC); 116 (constants_table[FPU_rm]) (control_word & CW_RC);
120} 117}
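
MAKE_REG above packs each constant as { sigl, sigh, 16-bit exponent word }, with the biased exponent in the low 15 bits and the sign in bit 15. The following minimal sketch shows that layout for CONST_PI, assuming the standard extended-precision bias 0x3fff for EXTENDED_Ebias; the struct and macro names only mirror, and are not, the real FPU_REG definitions.

/* Sketch of the MAKE_REG layout with stand-in names; assumes
 * EXTENDED_Ebias == 0x3fff, the usual 80-bit extended bias. */
#include <stdio.h>

struct xreg {			/* stand-in for FPU_REG */
	unsigned int sigl;	/* low 32 bits of the significand */
	unsigned int sigh;	/* high 32 bits, ms bit = explicit integer bit */
	unsigned short exp;	/* biased exponent, sign in bit 15 */
};

#define EBIAS 0x3fff
#define MK(neg, e, l, h) { (l), (h), (unsigned short)((EBIAS + (e)) | ((neg) ? 0x8000 : 0)) }

static const struct xreg pi = MK(0, 1, 0x2168c235, 0xc90fdaa2);

int main(void)
{
	/* pi = 1.1001001...b * 2^1, so the exponent word is 0x3fff + 1 */
	printf("pi: exp=%#x sig=%08x%08x\n", pi.exp, pi.sigh, pi.sigl);
	return 0;
}

fld_const then nudges sigl by +1 or -1 depending on the rounding mode; the code's own comment notes that none of these constants can borrow or carry out of sigl, which is why that shortcut is safe.
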
diff --git a/arch/x86/math-emu/reg_convert.c b/arch/x86/math-emu/reg_convert.c
index 45a25875270..10806077997 100644
--- a/arch/x86/math-emu/reg_convert.c
+++ b/arch/x86/math-emu/reg_convert.c
@@ -13,41 +13,34 @@
13#include "exception.h" 13#include "exception.h"
14#include "fpu_emu.h" 14#include "fpu_emu.h"
15 15
16
17int FPU_to_exp16(FPU_REG const *a, FPU_REG *x) 16int FPU_to_exp16(FPU_REG const *a, FPU_REG *x)
18{ 17{
19 int sign = getsign(a); 18 int sign = getsign(a);
20 19
21 *(long long *)&(x->sigl) = *(const long long *)&(a->sigl); 20 *(long long *)&(x->sigl) = *(const long long *)&(a->sigl);
22 21
23 /* Set up the exponent as a 16 bit quantity. */ 22 /* Set up the exponent as a 16 bit quantity. */
24 setexponent16(x, exponent(a)); 23 setexponent16(x, exponent(a));
25 24
26 if ( exponent16(x) == EXP_UNDER ) 25 if (exponent16(x) == EXP_UNDER) {
27 { 26 /* The number is a de-normal or pseudodenormal. */
28 /* The number is a de-normal or pseudodenormal. */ 27 /* We only deal with the significand and exponent. */
29 /* We only deal with the significand and exponent. */ 28
30 29 if (x->sigh & 0x80000000) {
31 if (x->sigh & 0x80000000) 30 /* Is a pseudodenormal. */
32 { 31 /* This is non-80486 behaviour because the number
33 /* Is a pseudodenormal. */ 32 loses its 'denormal' identity. */
34 /* This is non-80486 behaviour because the number 33 addexponent(x, 1);
35 loses its 'denormal' identity. */ 34 } else {
36 addexponent(x, 1); 35 /* Is a denormal. */
37 } 36 addexponent(x, 1);
38 else 37 FPU_normalize_nuo(x);
39 { 38 }
40 /* Is a denormal. */
41 addexponent(x, 1);
42 FPU_normalize_nuo(x);
43 } 39 }
44 }
45 40
46 if ( !(x->sigh & 0x80000000) ) 41 if (!(x->sigh & 0x80000000)) {
47 { 42 EXCEPTION(EX_INTERNAL | 0x180);
48 EXCEPTION(EX_INTERNAL | 0x180); 43 }
49 }
50 44
51 return sign; 45 return sign;
52} 46}
53
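
FPU_to_exp16 above widens the exponent to 16 bits and, for a denormal, bumps it by one and re-normalizes the significand so that bit 63 ends up set. The normalization itself lives in FPU_normalize_nuo; the sketch below only restates the idea on a plain 64-bit significand and is not the emulator's code.

/* Minimal normalization sketch: shift the significand left until the
 * top bit is set, decrementing a non-saturating exponent as we go.
 * Mirrors the idea behind FPU_normalize_nuo, not its implementation. */
#include <stdint.h>
#include <stdio.h>

static int normalize64(uint64_t *sig, int *exp)
{
	if (*sig == 0)
		return 0;			/* a true zero has nothing to normalize */
	while (!(*sig & (1ULL << 63))) {
		*sig <<= 1;
		(*exp)--;
	}
	return 1;
}

int main(void)
{
	uint64_t sig = 0x0000000000abcdefULL;	/* denormal-style significand */
	int exp = -16382;			/* extended-precision Emin */

	normalize64(&sig, &exp);
	printf("sig=%#018llx exp=%d\n", (unsigned long long)sig, exp);
	return 0;
}

The internal-error check at the end of FPU_to_exp16 (EX_INTERNAL | 0x180) only verifies that this post-condition, a set top bit, actually holds.
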
diff --git a/arch/x86/math-emu/reg_divide.c b/arch/x86/math-emu/reg_divide.c
index 5cee7ff920d..6827012db34 100644
--- a/arch/x86/math-emu/reg_divide.c
+++ b/arch/x86/math-emu/reg_divide.c
@@ -26,182 +26,157 @@
26 */ 26 */
27int FPU_div(int flags, int rm, int control_w) 27int FPU_div(int flags, int rm, int control_w)
28{ 28{
29 FPU_REG x, y; 29 FPU_REG x, y;
30 FPU_REG const *a, *b, *st0_ptr, *st_ptr; 30 FPU_REG const *a, *b, *st0_ptr, *st_ptr;
31 FPU_REG *dest; 31 FPU_REG *dest;
32 u_char taga, tagb, signa, signb, sign, saved_sign; 32 u_char taga, tagb, signa, signb, sign, saved_sign;
33 int tag, deststnr; 33 int tag, deststnr;
34 34
35 if ( flags & DEST_RM ) 35 if (flags & DEST_RM)
36 deststnr = rm; 36 deststnr = rm;
37 else 37 else
38 deststnr = 0; 38 deststnr = 0;
39 39
40 if ( flags & REV ) 40 if (flags & REV) {
41 { 41 b = &st(0);
42 b = &st(0); 42 st0_ptr = b;
43 st0_ptr = b; 43 tagb = FPU_gettag0();
44 tagb = FPU_gettag0(); 44 if (flags & LOADED) {
45 if ( flags & LOADED ) 45 a = (FPU_REG *) rm;
46 { 46 taga = flags & 0x0f;
47 a = (FPU_REG *)rm; 47 } else {
48 taga = flags & 0x0f; 48 a = &st(rm);
49 st_ptr = a;
50 taga = FPU_gettagi(rm);
51 }
52 } else {
53 a = &st(0);
54 st0_ptr = a;
55 taga = FPU_gettag0();
56 if (flags & LOADED) {
57 b = (FPU_REG *) rm;
58 tagb = flags & 0x0f;
59 } else {
60 b = &st(rm);
61 st_ptr = b;
62 tagb = FPU_gettagi(rm);
63 }
49 } 64 }
50 else
51 {
52 a = &st(rm);
53 st_ptr = a;
54 taga = FPU_gettagi(rm);
55 }
56 }
57 else
58 {
59 a = &st(0);
60 st0_ptr = a;
61 taga = FPU_gettag0();
62 if ( flags & LOADED )
63 {
64 b = (FPU_REG *)rm;
65 tagb = flags & 0x0f;
66 }
67 else
68 {
69 b = &st(rm);
70 st_ptr = b;
71 tagb = FPU_gettagi(rm);
72 }
73 }
74 65
75 signa = getsign(a); 66 signa = getsign(a);
76 signb = getsign(b); 67 signb = getsign(b);
77 68
78 sign = signa ^ signb; 69 sign = signa ^ signb;
79 70
80 dest = &st(deststnr); 71 dest = &st(deststnr);
81 saved_sign = getsign(dest); 72 saved_sign = getsign(dest);
82 73
83 if ( !(taga | tagb) ) 74 if (!(taga | tagb)) {
84 { 75 /* Both regs Valid, this should be the most common case. */
85 /* Both regs Valid, this should be the most common case. */ 76 reg_copy(a, &x);
86 reg_copy(a, &x); 77 reg_copy(b, &y);
87 reg_copy(b, &y); 78 setpositive(&x);
88 setpositive(&x); 79 setpositive(&y);
89 setpositive(&y); 80 tag = FPU_u_div(&x, &y, dest, control_w, sign);
90 tag = FPU_u_div(&x, &y, dest, control_w, sign);
91 81
92 if ( tag < 0 ) 82 if (tag < 0)
93 return tag; 83 return tag;
94 84
95 FPU_settagi(deststnr, tag); 85 FPU_settagi(deststnr, tag);
96 return tag; 86 return tag;
97 } 87 }
98 88
99 if ( taga == TAG_Special ) 89 if (taga == TAG_Special)
100 taga = FPU_Special(a); 90 taga = FPU_Special(a);
101 if ( tagb == TAG_Special ) 91 if (tagb == TAG_Special)
102 tagb = FPU_Special(b); 92 tagb = FPU_Special(b);
103 93
104 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) 94 if (((taga == TAG_Valid) && (tagb == TW_Denormal))
105 || ((taga == TW_Denormal) && (tagb == TAG_Valid)) 95 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
106 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) 96 || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
107 { 97 if (denormal_operand() < 0)
108 if ( denormal_operand() < 0 ) 98 return FPU_Exception;
109 return FPU_Exception; 99
110 100 FPU_to_exp16(a, &x);
111 FPU_to_exp16(a, &x); 101 FPU_to_exp16(b, &y);
112 FPU_to_exp16(b, &y); 102 tag = FPU_u_div(&x, &y, dest, control_w, sign);
113 tag = FPU_u_div(&x, &y, dest, control_w, sign); 103 if (tag < 0)
114 if ( tag < 0 ) 104 return tag;
115 return tag; 105
116 106 FPU_settagi(deststnr, tag);
117 FPU_settagi(deststnr, tag); 107 return tag;
118 return tag; 108 } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) {
119 } 109 if (tagb != TAG_Zero) {
120 else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) ) 110 /* Want to find Zero/Valid */
121 { 111 if (tagb == TW_Denormal) {
122 if ( tagb != TAG_Zero ) 112 if (denormal_operand() < 0)
123 { 113 return FPU_Exception;
124 /* Want to find Zero/Valid */ 114 }
125 if ( tagb == TW_Denormal ) 115
126 { 116 /* The result is zero. */
127 if ( denormal_operand() < 0 ) 117 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
128 return FPU_Exception; 118 setsign(dest, sign);
129 } 119 return TAG_Zero;
130 120 }
131 /* The result is zero. */ 121 /* We have an exception condition, either 0/0 or Valid/Zero. */
132 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); 122 if (taga == TAG_Zero) {
133 setsign(dest, sign); 123 /* 0/0 */
134 return TAG_Zero; 124 return arith_invalid(deststnr);
125 }
126 /* Valid/Zero */
127 return FPU_divide_by_zero(deststnr, sign);
135 } 128 }
136 /* We have an exception condition, either 0/0 or Valid/Zero. */ 129 /* Must have infinities, NaNs, etc */
137 if ( taga == TAG_Zero ) 130 else if ((taga == TW_NaN) || (tagb == TW_NaN)) {
138 { 131 if (flags & LOADED)
139 /* 0/0 */ 132 return real_2op_NaN((FPU_REG *) rm, flags & 0x0f, 0,
140 return arith_invalid(deststnr); 133 st0_ptr);
134
135 if (flags & DEST_RM) {
136 int tag;
137 tag = FPU_gettag0();
138 if (tag == TAG_Special)
139 tag = FPU_Special(st0_ptr);
140 return real_2op_NaN(st0_ptr, tag, rm,
141 (flags & REV) ? st0_ptr : &st(rm));
142 } else {
143 int tag;
144 tag = FPU_gettagi(rm);
145 if (tag == TAG_Special)
146 tag = FPU_Special(&st(rm));
147 return real_2op_NaN(&st(rm), tag, 0,
148 (flags & REV) ? st0_ptr : &st(rm));
149 }
150 } else if (taga == TW_Infinity) {
151 if (tagb == TW_Infinity) {
152 /* infinity/infinity */
153 return arith_invalid(deststnr);
154 } else {
155 /* tagb must be Valid or Zero */
156 if ((tagb == TW_Denormal) && (denormal_operand() < 0))
157 return FPU_Exception;
158
159 /* Infinity divided by Zero or Valid does
 160			   not raise an exception, but returns Infinity */
161 FPU_copy_to_regi(a, TAG_Special, deststnr);
162 setsign(dest, sign);
163 return taga;
164 }
165 } else if (tagb == TW_Infinity) {
166 if ((taga == TW_Denormal) && (denormal_operand() < 0))
167 return FPU_Exception;
168
169 /* The result is zero. */
170 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
171 setsign(dest, sign);
172 return TAG_Zero;
141 } 173 }
142 /* Valid/Zero */
143 return FPU_divide_by_zero(deststnr, sign);
144 }
145 /* Must have infinities, NaNs, etc */
146 else if ( (taga == TW_NaN) || (tagb == TW_NaN) )
147 {
148 if ( flags & LOADED )
149 return real_2op_NaN((FPU_REG *)rm, flags & 0x0f, 0, st0_ptr);
150
151 if ( flags & DEST_RM )
152 {
153 int tag;
154 tag = FPU_gettag0();
155 if ( tag == TAG_Special )
156 tag = FPU_Special(st0_ptr);
157 return real_2op_NaN(st0_ptr, tag, rm, (flags & REV) ? st0_ptr : &st(rm));
158 }
159 else
160 {
161 int tag;
162 tag = FPU_gettagi(rm);
163 if ( tag == TAG_Special )
164 tag = FPU_Special(&st(rm));
165 return real_2op_NaN(&st(rm), tag, 0, (flags & REV) ? st0_ptr : &st(rm));
166 }
167 }
168 else if (taga == TW_Infinity)
169 {
170 if (tagb == TW_Infinity)
171 {
172 /* infinity/infinity */
173 return arith_invalid(deststnr);
174 }
175 else
176 {
177 /* tagb must be Valid or Zero */
178 if ( (tagb == TW_Denormal) && (denormal_operand() < 0) )
179 return FPU_Exception;
180
181 /* Infinity divided by Zero or Valid does
182	 not raise an exception, but returns Infinity */
183 FPU_copy_to_regi(a, TAG_Special, deststnr);
184 setsign(dest, sign);
185 return taga;
186 }
187 }
188 else if (tagb == TW_Infinity)
189 {
190 if ( (taga == TW_Denormal) && (denormal_operand() < 0) )
191 return FPU_Exception;
192
193 /* The result is zero. */
194 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
195 setsign(dest, sign);
196 return TAG_Zero;
197 }
198#ifdef PARANOID 174#ifdef PARANOID
199 else 175 else {
200 { 176 EXCEPTION(EX_INTERNAL | 0x102);
201 EXCEPTION(EX_INTERNAL|0x102); 177 return FPU_Exception;
202 return FPU_Exception; 178 }
203 } 179#endif /* PARANOID */
204#endif /* PARANOID */
205 180
206 return 0; 181 return 0;
207} 182}
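
Once both operand tags are known, the special-case half of FPU_div above reduces to a small decision table, with the quotient's sign computed up front as signa ^ signb. The sketch below restates that table with made-up names; the real code additionally raises the denormal, invalid-operation and divide-by-zero exceptions as it goes, and denormal operands take the ordinary quotient path once the denormal exception has been handled.

/* Illustrative decision table for the special cases in FPU_div;
 * names are stand-ins, not the emulator's tags. */
#include <stdio.h>

enum tag { T_VALID, T_ZERO, T_DENORM, T_INF, T_NAN };
enum outcome { R_QUOTIENT, R_ZERO, R_INF, R_INVALID, R_DIVBYZERO, R_QNAN };

static enum outcome div_case(enum tag a, enum tag b)	/* a / b */
{
	if (a == T_NAN || b == T_NAN)
		return R_QNAN;			/* propagate / quiet a NaN */
	if (a == T_INF)
		return (b == T_INF) ? R_INVALID : R_INF;
	if (b == T_INF)
		return R_ZERO;			/* finite / inf == 0       */
	if (b == T_ZERO)
		return (a == T_ZERO) ? R_INVALID : R_DIVBYZERO;
	if (a == T_ZERO)
		return R_ZERO;			/* 0 / finite == 0         */
	return R_QUOTIENT;			/* plain FPU_u_div() path  */
}

int main(void)
{
	printf("inf/inf -> %d, 0/0 -> %d, 1/0 -> %d\n",
	       div_case(T_INF, T_INF), div_case(T_ZERO, T_ZERO),
	       div_case(T_VALID, T_ZERO));
	return 0;
}
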
diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c
index e976caef649..799d4af5be6 100644
--- a/arch/x86/math-emu/reg_ld_str.c
+++ b/arch/x86/math-emu/reg_ld_str.c
@@ -27,1084 +27,938 @@
27#include "control_w.h" 27#include "control_w.h"
28#include "status_w.h" 28#include "status_w.h"
29 29
30 30#define DOUBLE_Emax 1023 /* largest valid exponent */
31#define DOUBLE_Emax 1023 /* largest valid exponent */
32#define DOUBLE_Ebias 1023 31#define DOUBLE_Ebias 1023
33#define DOUBLE_Emin (-1022) /* smallest valid exponent */ 32#define DOUBLE_Emin (-1022) /* smallest valid exponent */
34 33
35#define SINGLE_Emax 127 /* largest valid exponent */ 34#define SINGLE_Emax 127 /* largest valid exponent */
36#define SINGLE_Ebias 127 35#define SINGLE_Ebias 127
37#define SINGLE_Emin (-126) /* smallest valid exponent */ 36#define SINGLE_Emin (-126) /* smallest valid exponent */
38
39 37
40static u_char normalize_no_excep(FPU_REG *r, int exp, int sign) 38static u_char normalize_no_excep(FPU_REG *r, int exp, int sign)
41{ 39{
42 u_char tag; 40 u_char tag;
43 41
44 setexponent16(r, exp); 42 setexponent16(r, exp);
45 43
46 tag = FPU_normalize_nuo(r); 44 tag = FPU_normalize_nuo(r);
47 stdexp(r); 45 stdexp(r);
48 if ( sign ) 46 if (sign)
49 setnegative(r); 47 setnegative(r);
50 48
51 return tag; 49 return tag;
52} 50}
53 51
54
55int FPU_tagof(FPU_REG *ptr) 52int FPU_tagof(FPU_REG *ptr)
56{ 53{
57 int exp; 54 int exp;
58 55
59 exp = exponent16(ptr) & 0x7fff; 56 exp = exponent16(ptr) & 0x7fff;
60 if ( exp == 0 ) 57 if (exp == 0) {
61 { 58 if (!(ptr->sigh | ptr->sigl)) {
62 if ( !(ptr->sigh | ptr->sigl) ) 59 return TAG_Zero;
63 { 60 }
64 return TAG_Zero; 61 /* The number is a de-normal or pseudodenormal. */
62 return TAG_Special;
63 }
64
65 if (exp == 0x7fff) {
66 /* Is an Infinity, a NaN, or an unsupported data type. */
67 return TAG_Special;
65 } 68 }
66 /* The number is a de-normal or pseudodenormal. */
67 return TAG_Special;
68 }
69
70 if ( exp == 0x7fff )
71 {
72 /* Is an Infinity, a NaN, or an unsupported data type. */
73 return TAG_Special;
74 }
75
76 if ( !(ptr->sigh & 0x80000000) )
77 {
78 /* Unsupported data type. */
79 /* Valid numbers have the ms bit set to 1. */
80 /* Unnormal. */
81 return TAG_Special;
82 }
83
84 return TAG_Valid;
85}
86 69
70 if (!(ptr->sigh & 0x80000000)) {
71 /* Unsupported data type. */
72 /* Valid numbers have the ms bit set to 1. */
73 /* Unnormal. */
74 return TAG_Special;
75 }
76
77 return TAG_Valid;
78}
87 79
88/* Get a long double from user memory */ 80/* Get a long double from user memory */
89int FPU_load_extended(long double __user *s, int stnr) 81int FPU_load_extended(long double __user *s, int stnr)
90{ 82{
91 FPU_REG *sti_ptr = &st(stnr); 83 FPU_REG *sti_ptr = &st(stnr);
92 84
93 RE_ENTRANT_CHECK_OFF; 85 RE_ENTRANT_CHECK_OFF;
94 FPU_access_ok(VERIFY_READ, s, 10); 86 FPU_access_ok(VERIFY_READ, s, 10);
95 __copy_from_user(sti_ptr, s, 10); 87 __copy_from_user(sti_ptr, s, 10);
96 RE_ENTRANT_CHECK_ON; 88 RE_ENTRANT_CHECK_ON;
97 89
98 return FPU_tagof(sti_ptr); 90 return FPU_tagof(sti_ptr);
99} 91}
100 92
101
102/* Get a double from user memory */ 93/* Get a double from user memory */
103int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data) 94int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data)
104{ 95{
105 int exp, tag, negative; 96 int exp, tag, negative;
106 unsigned m64, l64; 97 unsigned m64, l64;
107 98
108 RE_ENTRANT_CHECK_OFF; 99 RE_ENTRANT_CHECK_OFF;
109 FPU_access_ok(VERIFY_READ, dfloat, 8); 100 FPU_access_ok(VERIFY_READ, dfloat, 8);
110 FPU_get_user(m64, 1 + (unsigned long __user *) dfloat); 101 FPU_get_user(m64, 1 + (unsigned long __user *)dfloat);
111 FPU_get_user(l64, (unsigned long __user *) dfloat); 102 FPU_get_user(l64, (unsigned long __user *)dfloat);
112 RE_ENTRANT_CHECK_ON; 103 RE_ENTRANT_CHECK_ON;
113 104
114 negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive; 105 negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
115 exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias; 106 exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias;
116 m64 &= 0xfffff; 107 m64 &= 0xfffff;
117 if ( exp > DOUBLE_Emax + EXTENDED_Ebias ) 108 if (exp > DOUBLE_Emax + EXTENDED_Ebias) {
118 { 109 /* Infinity or NaN */
119 /* Infinity or NaN */ 110 if ((m64 == 0) && (l64 == 0)) {
120 if ((m64 == 0) && (l64 == 0)) 111 /* +- infinity */
121 { 112 loaded_data->sigh = 0x80000000;
122 /* +- infinity */ 113 loaded_data->sigl = 0x00000000;
123 loaded_data->sigh = 0x80000000; 114 exp = EXP_Infinity + EXTENDED_Ebias;
124 loaded_data->sigl = 0x00000000; 115 tag = TAG_Special;
125 exp = EXP_Infinity + EXTENDED_Ebias; 116 } else {
126 tag = TAG_Special; 117 /* Must be a signaling or quiet NaN */
127 } 118 exp = EXP_NaN + EXTENDED_Ebias;
128 else 119 loaded_data->sigh = (m64 << 11) | 0x80000000;
129 { 120 loaded_data->sigh |= l64 >> 21;
130 /* Must be a signaling or quiet NaN */ 121 loaded_data->sigl = l64 << 11;
131 exp = EXP_NaN + EXTENDED_Ebias; 122 tag = TAG_Special; /* The calling function must look for NaNs */
132 loaded_data->sigh = (m64 << 11) | 0x80000000; 123 }
133 loaded_data->sigh |= l64 >> 21; 124 } else if (exp < DOUBLE_Emin + EXTENDED_Ebias) {
134 loaded_data->sigl = l64 << 11; 125 /* Zero or de-normal */
135 tag = TAG_Special; /* The calling function must look for NaNs */ 126 if ((m64 == 0) && (l64 == 0)) {
136 } 127 /* Zero */
137 } 128 reg_copy(&CONST_Z, loaded_data);
138 else if ( exp < DOUBLE_Emin + EXTENDED_Ebias ) 129 exp = 0;
139 { 130 tag = TAG_Zero;
140 /* Zero or de-normal */ 131 } else {
141 if ((m64 == 0) && (l64 == 0)) 132 /* De-normal */
142 { 133 loaded_data->sigh = m64 << 11;
143 /* Zero */ 134 loaded_data->sigh |= l64 >> 21;
144 reg_copy(&CONST_Z, loaded_data); 135 loaded_data->sigl = l64 << 11;
145 exp = 0; 136
146 tag = TAG_Zero; 137 return normalize_no_excep(loaded_data, DOUBLE_Emin,
147 } 138 negative)
148 else 139 | (denormal_operand() < 0 ? FPU_Exception : 0);
149 { 140 }
150 /* De-normal */ 141 } else {
151 loaded_data->sigh = m64 << 11; 142 loaded_data->sigh = (m64 << 11) | 0x80000000;
152 loaded_data->sigh |= l64 >> 21; 143 loaded_data->sigh |= l64 >> 21;
153 loaded_data->sigl = l64 << 11; 144 loaded_data->sigl = l64 << 11;
154
155 return normalize_no_excep(loaded_data, DOUBLE_Emin, negative)
156 | (denormal_operand() < 0 ? FPU_Exception : 0);
157 }
158 }
159 else
160 {
161 loaded_data->sigh = (m64 << 11) | 0x80000000;
162 loaded_data->sigh |= l64 >> 21;
163 loaded_data->sigl = l64 << 11;
164 145
165 tag = TAG_Valid; 146 tag = TAG_Valid;
166 } 147 }
167 148
168 setexponent16(loaded_data, exp | negative); 149 setexponent16(loaded_data, exp | negative);
169 150
170 return tag; 151 return tag;
171} 152}
172 153
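
FPU_load_double above turns the two 32-bit halves of a double into an extended-precision significand by rebiasing the 11-bit exponent and shifting the 52 fraction bits up by 11 so the explicit integer bit lands in bit 63. The worked sketch below covers only that bit surgery for an ordinary (normal, non-zero) value, assuming the standard biases of 1023 and 0x3fff; the real code reads the halves with FPU_get_user and handles zeros, denormals, infinities and NaNs separately, as shown in the diff.

/* Bit-manipulation sketch only, not the emulator's code: convert a
 * normal IEEE-754 double into sign / extended exponent / 64-bit
 * significand the same way FPU_load_double does. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	double d = 1.5;
	uint64_t bits;
	memcpy(&bits, &d, sizeof(bits));	/* raw IEEE-754 bit pattern */

	uint32_t m64 = (uint32_t)(bits >> 32);	/* high word */
	uint32_t l64 = (uint32_t)bits;		/* low word  */

	int sign = m64 >> 31;
	int exp  = (int)((m64 >> 20) & 0x7ff) - 1023 + 0x3fff;
	m64 &= 0xfffff;				/* keep the 20 high fraction bits */

	uint32_t sigh = (m64 << 11) | 0x80000000 | (l64 >> 21);
	uint32_t sigl = l64 << 11;

	/* 1.5 -> sign 0, exponent word 0x3fff, significand 0xc000000000000000 */
	printf("sign=%d exp=%#x sig=%08x%08x\n", sign, exp, sigh, sigl);
	return 0;
}
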
173
174/* Get a float from user memory */ 154/* Get a float from user memory */
175int FPU_load_single(float __user *single, FPU_REG *loaded_data) 155int FPU_load_single(float __user *single, FPU_REG *loaded_data)
176{ 156{
177 unsigned m32; 157 unsigned m32;
178 int exp, tag, negative; 158 int exp, tag, negative;
179 159
180 RE_ENTRANT_CHECK_OFF; 160 RE_ENTRANT_CHECK_OFF;
181 FPU_access_ok(VERIFY_READ, single, 4); 161 FPU_access_ok(VERIFY_READ, single, 4);
182 FPU_get_user(m32, (unsigned long __user *) single); 162 FPU_get_user(m32, (unsigned long __user *)single);
183 RE_ENTRANT_CHECK_ON; 163 RE_ENTRANT_CHECK_ON;
184 164
185 negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive; 165 negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
186 166
187 if (!(m32 & 0x7fffffff)) 167 if (!(m32 & 0x7fffffff)) {
188 { 168 /* Zero */
189 /* Zero */ 169 reg_copy(&CONST_Z, loaded_data);
190 reg_copy(&CONST_Z, loaded_data); 170 addexponent(loaded_data, negative);
191 addexponent(loaded_data, negative); 171 return TAG_Zero;
192 return TAG_Zero;
193 }
194 exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias;
195 m32 = (m32 & 0x7fffff) << 8;
196 if ( exp < SINGLE_Emin + EXTENDED_Ebias )
197 {
198 /* De-normals */
199 loaded_data->sigh = m32;
200 loaded_data->sigl = 0;
201
202 return normalize_no_excep(loaded_data, SINGLE_Emin, negative)
203 | (denormal_operand() < 0 ? FPU_Exception : 0);
204 }
205 else if ( exp > SINGLE_Emax + EXTENDED_Ebias )
206 {
207 /* Infinity or NaN */
208 if ( m32 == 0 )
209 {
210 /* +- infinity */
211 loaded_data->sigh = 0x80000000;
212 loaded_data->sigl = 0x00000000;
213 exp = EXP_Infinity + EXTENDED_Ebias;
214 tag = TAG_Special;
215 } 172 }
216 else 173 exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias;
217 { 174 m32 = (m32 & 0x7fffff) << 8;
218 /* Must be a signaling or quiet NaN */ 175 if (exp < SINGLE_Emin + EXTENDED_Ebias) {
219 exp = EXP_NaN + EXTENDED_Ebias; 176 /* De-normals */
220 loaded_data->sigh = m32 | 0x80000000; 177 loaded_data->sigh = m32;
221 loaded_data->sigl = 0; 178 loaded_data->sigl = 0;
222 tag = TAG_Special; /* The calling function must look for NaNs */ 179
180 return normalize_no_excep(loaded_data, SINGLE_Emin, negative)
181 | (denormal_operand() < 0 ? FPU_Exception : 0);
182 } else if (exp > SINGLE_Emax + EXTENDED_Ebias) {
183 /* Infinity or NaN */
184 if (m32 == 0) {
185 /* +- infinity */
186 loaded_data->sigh = 0x80000000;
187 loaded_data->sigl = 0x00000000;
188 exp = EXP_Infinity + EXTENDED_Ebias;
189 tag = TAG_Special;
190 } else {
191 /* Must be a signaling or quiet NaN */
192 exp = EXP_NaN + EXTENDED_Ebias;
193 loaded_data->sigh = m32 | 0x80000000;
194 loaded_data->sigl = 0;
195 tag = TAG_Special; /* The calling function must look for NaNs */
196 }
197 } else {
198 loaded_data->sigh = m32 | 0x80000000;
199 loaded_data->sigl = 0;
200 tag = TAG_Valid;
223 } 201 }
224 }
225 else
226 {
227 loaded_data->sigh = m32 | 0x80000000;
228 loaded_data->sigl = 0;
229 tag = TAG_Valid;
230 }
231 202
232 setexponent16(loaded_data, exp | negative); /* Set the sign. */ 203 setexponent16(loaded_data, exp | negative); /* Set the sign. */
233 204
234 return tag; 205 return tag;
235} 206}
236 207
237
238/* Get a long long from user memory */ 208/* Get a long long from user memory */
239int FPU_load_int64(long long __user *_s) 209int FPU_load_int64(long long __user *_s)
240{ 210{
241 long long s; 211 long long s;
242 int sign; 212 int sign;
243 FPU_REG *st0_ptr = &st(0); 213 FPU_REG *st0_ptr = &st(0);
244 214
245 RE_ENTRANT_CHECK_OFF; 215 RE_ENTRANT_CHECK_OFF;
246 FPU_access_ok(VERIFY_READ, _s, 8); 216 FPU_access_ok(VERIFY_READ, _s, 8);
247 if (copy_from_user(&s,_s,8)) 217 if (copy_from_user(&s, _s, 8))
248 FPU_abort; 218 FPU_abort;
249 RE_ENTRANT_CHECK_ON; 219 RE_ENTRANT_CHECK_ON;
250 220
251 if (s == 0) 221 if (s == 0) {
252 { 222 reg_copy(&CONST_Z, st0_ptr);
253 reg_copy(&CONST_Z, st0_ptr); 223 return TAG_Zero;
254 return TAG_Zero; 224 }
255 } 225
256 226 if (s > 0)
257 if (s > 0) 227 sign = SIGN_Positive;
258 sign = SIGN_Positive; 228 else {
259 else 229 s = -s;
260 { 230 sign = SIGN_Negative;
261 s = -s; 231 }
262 sign = SIGN_Negative;
263 }
264
265 significand(st0_ptr) = s;
266
267 return normalize_no_excep(st0_ptr, 63, sign);
268}
269 232
233 significand(st0_ptr) = s;
234
235 return normalize_no_excep(st0_ptr, 63, sign);
236}
270 237
271/* Get a long from user memory */ 238/* Get a long from user memory */
272int FPU_load_int32(long __user *_s, FPU_REG *loaded_data) 239int FPU_load_int32(long __user *_s, FPU_REG *loaded_data)
273{ 240{
274 long s; 241 long s;
275 int negative; 242 int negative;
276 243
277 RE_ENTRANT_CHECK_OFF; 244 RE_ENTRANT_CHECK_OFF;
278 FPU_access_ok(VERIFY_READ, _s, 4); 245 FPU_access_ok(VERIFY_READ, _s, 4);
279 FPU_get_user(s, _s); 246 FPU_get_user(s, _s);
280 RE_ENTRANT_CHECK_ON; 247 RE_ENTRANT_CHECK_ON;
281 248
282 if (s == 0) 249 if (s == 0) {
283 { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; } 250 reg_copy(&CONST_Z, loaded_data);
251 return TAG_Zero;
252 }
284 253
285 if (s > 0) 254 if (s > 0)
286 negative = SIGN_Positive; 255 negative = SIGN_Positive;
287 else 256 else {
288 { 257 s = -s;
289 s = -s; 258 negative = SIGN_Negative;
290 negative = SIGN_Negative; 259 }
291 }
292 260
293 loaded_data->sigh = s; 261 loaded_data->sigh = s;
294 loaded_data->sigl = 0; 262 loaded_data->sigl = 0;
295 263
296 return normalize_no_excep(loaded_data, 31, negative); 264 return normalize_no_excep(loaded_data, 31, negative);
297} 265}
298 266
299
300/* Get a short from user memory */ 267/* Get a short from user memory */
301int FPU_load_int16(short __user *_s, FPU_REG *loaded_data) 268int FPU_load_int16(short __user *_s, FPU_REG *loaded_data)
302{ 269{
303 int s, negative; 270 int s, negative;
304 271
305 RE_ENTRANT_CHECK_OFF; 272 RE_ENTRANT_CHECK_OFF;
306 FPU_access_ok(VERIFY_READ, _s, 2); 273 FPU_access_ok(VERIFY_READ, _s, 2);
307 /* Cast as short to get the sign extended. */ 274 /* Cast as short to get the sign extended. */
308 FPU_get_user(s, _s); 275 FPU_get_user(s, _s);
309 RE_ENTRANT_CHECK_ON; 276 RE_ENTRANT_CHECK_ON;
310 277
311 if (s == 0) 278 if (s == 0) {
312 { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; } 279 reg_copy(&CONST_Z, loaded_data);
280 return TAG_Zero;
281 }
313 282
314 if (s > 0) 283 if (s > 0)
315 negative = SIGN_Positive; 284 negative = SIGN_Positive;
316 else 285 else {
317 { 286 s = -s;
318 s = -s; 287 negative = SIGN_Negative;
319 negative = SIGN_Negative; 288 }
320 }
321 289
322 loaded_data->sigh = s << 16; 290 loaded_data->sigh = s << 16;
323 loaded_data->sigl = 0; 291 loaded_data->sigl = 0;
324 292
325 return normalize_no_excep(loaded_data, 15, negative); 293 return normalize_no_excep(loaded_data, 15, negative);
326} 294}
327 295
328
329/* Get a packed bcd array from user memory */ 296/* Get a packed bcd array from user memory */
330int FPU_load_bcd(u_char __user *s) 297int FPU_load_bcd(u_char __user *s)
331{ 298{
332 FPU_REG *st0_ptr = &st(0); 299 FPU_REG *st0_ptr = &st(0);
333 int pos; 300 int pos;
334 u_char bcd; 301 u_char bcd;
335 long long l=0; 302 long long l = 0;
336 int sign; 303 int sign;
337 304
338 RE_ENTRANT_CHECK_OFF; 305 RE_ENTRANT_CHECK_OFF;
339 FPU_access_ok(VERIFY_READ, s, 10); 306 FPU_access_ok(VERIFY_READ, s, 10);
340 RE_ENTRANT_CHECK_ON; 307 RE_ENTRANT_CHECK_ON;
341 for ( pos = 8; pos >= 0; pos--) 308 for (pos = 8; pos >= 0; pos--) {
342 { 309 l *= 10;
343 l *= 10; 310 RE_ENTRANT_CHECK_OFF;
344 RE_ENTRANT_CHECK_OFF; 311 FPU_get_user(bcd, s + pos);
345 FPU_get_user(bcd, s+pos); 312 RE_ENTRANT_CHECK_ON;
346 RE_ENTRANT_CHECK_ON; 313 l += bcd >> 4;
347 l += bcd >> 4; 314 l *= 10;
348 l *= 10; 315 l += bcd & 0x0f;
349 l += bcd & 0x0f; 316 }
350 } 317
351 318 RE_ENTRANT_CHECK_OFF;
352 RE_ENTRANT_CHECK_OFF; 319 FPU_get_user(sign, s + 9);
353 FPU_get_user(sign, s+9); 320 sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive;
354 sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive; 321 RE_ENTRANT_CHECK_ON;
355 RE_ENTRANT_CHECK_ON; 322
356 323 if (l == 0) {
357 if ( l == 0 ) 324 reg_copy(&CONST_Z, st0_ptr);
358 { 325 addexponent(st0_ptr, sign); /* Set the sign. */
359 reg_copy(&CONST_Z, st0_ptr); 326 return TAG_Zero;
360 addexponent(st0_ptr, sign); /* Set the sign. */ 327 } else {
361 return TAG_Zero; 328 significand(st0_ptr) = l;
362 } 329 return normalize_no_excep(st0_ptr, 63, sign);
363 else 330 }
364 {
365 significand(st0_ptr) = l;
366 return normalize_no_excep(st0_ptr, 63, sign);
367 }
368} 331}
369 332
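
FPU_load_bcd above decodes the ten-byte packed-BCD format by walking the nine value bytes from the most significant end, peeling two decimal digits per byte, then taking the sign from bit 7 of the tenth byte. A plain user-space sketch of the same decode follows (no FPU_get_user or access checks, and the zero/normalize step is left out).

/* Packed-BCD decode sketch mirroring the loop in FPU_load_bcd. */
#include <stdio.h>

static long long bcd_to_ll(const unsigned char s[10], int *negative)
{
	long long l = 0;
	int pos;

	for (pos = 8; pos >= 0; pos--) {
		l = l * 10 + (s[pos] >> 4);	/* high-nibble digit */
		l = l * 10 + (s[pos] & 0x0f);	/* low-nibble digit  */
	}
	*negative = (s[9] & 0x80) != 0;
	return l;
}

int main(void)
{
	/* digit pairs stored least significant byte first: 0x56 0x34 0x12 */
	unsigned char buf[10] = { 0x56, 0x34, 0x12, 0, 0, 0, 0, 0, 0, 0x00 };
	int neg;
	long long v = bcd_to_ll(buf, &neg);

	printf("%s%lld\n", neg ? "-" : "", v);	/* prints 123456 */
	return 0;
}
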
370/*===========================================================================*/ 333/*===========================================================================*/
371 334
372/* Put a long double into user memory */ 335/* Put a long double into user memory */
373int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, long double __user *d) 336int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
337 long double __user * d)
374{ 338{
375 /* 339 /*
376 The only exception raised by an attempt to store to an 340 The only exception raised by an attempt to store to an
377 extended format is the Invalid Stack exception, i.e. 341 extended format is the Invalid Stack exception, i.e.
378 attempting to store from an empty register. 342 attempting to store from an empty register.
379 */ 343 */
380 344
381 if ( st0_tag != TAG_Empty ) 345 if (st0_tag != TAG_Empty) {
382 { 346 RE_ENTRANT_CHECK_OFF;
383 RE_ENTRANT_CHECK_OFF; 347 FPU_access_ok(VERIFY_WRITE, d, 10);
384 FPU_access_ok(VERIFY_WRITE, d, 10); 348
385 349 FPU_put_user(st0_ptr->sigl, (unsigned long __user *)d);
386 FPU_put_user(st0_ptr->sigl, (unsigned long __user *) d); 350 FPU_put_user(st0_ptr->sigh,
387 FPU_put_user(st0_ptr->sigh, (unsigned long __user *) ((u_char __user *)d + 4)); 351 (unsigned long __user *)((u_char __user *) d + 4));
388 FPU_put_user(exponent16(st0_ptr), (unsigned short __user *) ((u_char __user *)d + 8)); 352 FPU_put_user(exponent16(st0_ptr),
389 RE_ENTRANT_CHECK_ON; 353 (unsigned short __user *)((u_char __user *) d +
390 354 8));
391 return 1; 355 RE_ENTRANT_CHECK_ON;
392 } 356
393 357 return 1;
394 /* Empty register (stack underflow) */ 358 }
395 EXCEPTION(EX_StackUnder);
396 if ( control_word & CW_Invalid )
397 {
398 /* The masked response */
399 /* Put out the QNaN indefinite */
400 RE_ENTRANT_CHECK_OFF;
401 FPU_access_ok(VERIFY_WRITE,d,10);
402 FPU_put_user(0, (unsigned long __user *) d);
403 FPU_put_user(0xc0000000, 1 + (unsigned long __user *) d);
404 FPU_put_user(0xffff, 4 + (short __user *) d);
405 RE_ENTRANT_CHECK_ON;
406 return 1;
407 }
408 else
409 return 0;
410 359
411} 360 /* Empty register (stack underflow) */
361 EXCEPTION(EX_StackUnder);
362 if (control_word & CW_Invalid) {
363 /* The masked response */
364 /* Put out the QNaN indefinite */
365 RE_ENTRANT_CHECK_OFF;
366 FPU_access_ok(VERIFY_WRITE, d, 10);
367 FPU_put_user(0, (unsigned long __user *)d);
368 FPU_put_user(0xc0000000, 1 + (unsigned long __user *)d);
369 FPU_put_user(0xffff, 4 + (short __user *)d);
370 RE_ENTRANT_CHECK_ON;
371 return 1;
372 } else
373 return 0;
412 374
375}
413 376
414/* Put a double into user memory */ 377/* Put a double into user memory */
415int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat) 378int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
416{ 379{
417 unsigned long l[2]; 380 unsigned long l[2];
418 unsigned long increment = 0; /* avoid gcc warnings */ 381 unsigned long increment = 0; /* avoid gcc warnings */
419 int precision_loss; 382 int precision_loss;
420 int exp; 383 int exp;
421 FPU_REG tmp; 384 FPU_REG tmp;
422 385
423 if ( st0_tag == TAG_Valid ) 386 if (st0_tag == TAG_Valid) {
424 { 387 reg_copy(st0_ptr, &tmp);
425 reg_copy(st0_ptr, &tmp); 388 exp = exponent(&tmp);
426 exp = exponent(&tmp);
427 389
428 if ( exp < DOUBLE_Emin ) /* It may be a denormal */ 390 if (exp < DOUBLE_Emin) { /* It may be a denormal */
429 { 391 addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */
430 addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */
431 392
432 denormal_arg: 393 denormal_arg:
433 394
434 if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) ) 395 if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) {
435 {
436#ifdef PECULIAR_486 396#ifdef PECULIAR_486
437 /* Did it round to a non-denormal ? */ 397 /* Did it round to a non-denormal ? */
438 /* This behaviour might be regarded as peculiar, it appears 398 /* This behaviour might be regarded as peculiar, it appears
439 that the 80486 rounds to the dest precision, then 399 that the 80486 rounds to the dest precision, then
440 converts to decide underflow. */ 400 converts to decide underflow. */
441 if ( !((tmp.sigh == 0x00100000) && (tmp.sigl == 0) && 401 if (!
442 (st0_ptr->sigl & 0x000007ff)) ) 402 ((tmp.sigh == 0x00100000) && (tmp.sigl == 0)
403 && (st0_ptr->sigl & 0x000007ff)))
443#endif /* PECULIAR_486 */ 404#endif /* PECULIAR_486 */
444 { 405 {
445 EXCEPTION(EX_Underflow); 406 EXCEPTION(EX_Underflow);
446 /* This is a special case: see sec 16.2.5.1 of 407 /* This is a special case: see sec 16.2.5.1 of
447 the 80486 book */ 408 the 80486 book */
448 if ( !(control_word & CW_Underflow) ) 409 if (!(control_word & CW_Underflow))
449 return 0; 410 return 0;
450 } 411 }
451 EXCEPTION(precision_loss); 412 EXCEPTION(precision_loss);
452 if ( !(control_word & CW_Precision) ) 413 if (!(control_word & CW_Precision))
453 return 0; 414 return 0;
454 }
455 l[0] = tmp.sigl;
456 l[1] = tmp.sigh;
457 }
458 else
459 {
460 if ( tmp.sigl & 0x000007ff )
461 {
462 precision_loss = 1;
463 switch (control_word & CW_RC)
464 {
465 case RC_RND:
466 /* Rounding can get a little messy.. */
467 increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */
468 ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */
469 break;
470 case RC_DOWN: /* towards -infinity */
471 increment = signpositive(&tmp) ? 0 : tmp.sigl & 0x7ff;
472 break;
473 case RC_UP: /* towards +infinity */
474 increment = signpositive(&tmp) ? tmp.sigl & 0x7ff : 0;
475 break;
476 case RC_CHOP:
477 increment = 0;
478 break;
479 }
480
481 /* Truncate the mantissa */
482 tmp.sigl &= 0xfffff800;
483
484 if ( increment )
485 {
486 if ( tmp.sigl >= 0xfffff800 )
487 {
488 /* the sigl part overflows */
489 if ( tmp.sigh == 0xffffffff )
490 {
491 /* The sigh part overflows */
492 tmp.sigh = 0x80000000;
493 exp++;
494 if (exp >= EXP_OVER)
495 goto overflow;
496 } 415 }
497 else 416 l[0] = tmp.sigl;
498 { 417 l[1] = tmp.sigh;
499 tmp.sigh ++; 418 } else {
419 if (tmp.sigl & 0x000007ff) {
420 precision_loss = 1;
421 switch (control_word & CW_RC) {
422 case RC_RND:
423 /* Rounding can get a little messy.. */
424 increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */
425 ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */
426 break;
427 case RC_DOWN: /* towards -infinity */
428 increment =
429 signpositive(&tmp) ? 0 : tmp.
430 sigl & 0x7ff;
431 break;
432 case RC_UP: /* towards +infinity */
433 increment =
434 signpositive(&tmp) ? tmp.
435 sigl & 0x7ff : 0;
436 break;
437 case RC_CHOP:
438 increment = 0;
439 break;
440 }
441
442 /* Truncate the mantissa */
443 tmp.sigl &= 0xfffff800;
444
445 if (increment) {
446 if (tmp.sigl >= 0xfffff800) {
447 /* the sigl part overflows */
448 if (tmp.sigh == 0xffffffff) {
449 /* The sigh part overflows */
450 tmp.sigh = 0x80000000;
451 exp++;
452 if (exp >= EXP_OVER)
453 goto overflow;
454 } else {
455 tmp.sigh++;
456 }
457 tmp.sigl = 0x00000000;
458 } else {
459 /* We only need to increment sigl */
460 tmp.sigl += 0x00000800;
461 }
462 }
463 } else
464 precision_loss = 0;
465
466 l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21);
467 l[1] = ((tmp.sigh >> 11) & 0xfffff);
468
469 if (exp > DOUBLE_Emax) {
470 overflow:
471 EXCEPTION(EX_Overflow);
472 if (!(control_word & CW_Overflow))
473 return 0;
474 set_precision_flag_up();
475 if (!(control_word & CW_Precision))
476 return 0;
477
478 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
479 /* Overflow to infinity */
480 l[0] = 0x00000000; /* Set to */
481 l[1] = 0x7ff00000; /* + INF */
482 } else {
483 if (precision_loss) {
484 if (increment)
485 set_precision_flag_up();
486 else
487 set_precision_flag_down();
488 }
489 /* Add the exponent */
490 l[1] |= (((exp + DOUBLE_Ebias) & 0x7ff) << 20);
500 } 491 }
501 tmp.sigl = 0x00000000;
502 }
503 else
504 {
505 /* We only need to increment sigl */
506 tmp.sigl += 0x00000800;
507 }
508 }
509 }
510 else
511 precision_loss = 0;
512
513 l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21);
514 l[1] = ((tmp.sigh >> 11) & 0xfffff);
515
516 if ( exp > DOUBLE_Emax )
517 {
518 overflow:
519 EXCEPTION(EX_Overflow);
520 if ( !(control_word & CW_Overflow) )
521 return 0;
522 set_precision_flag_up();
523 if ( !(control_word & CW_Precision) )
524 return 0;
525
526 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
527 /* Overflow to infinity */
528 l[0] = 0x00000000; /* Set to */
529 l[1] = 0x7ff00000; /* + INF */
530 }
531 else
532 {
533 if ( precision_loss )
534 {
535 if ( increment )
536 set_precision_flag_up();
537 else
538 set_precision_flag_down();
539 } 492 }
540 /* Add the exponent */ 493 } else if (st0_tag == TAG_Zero) {
541 l[1] |= (((exp+DOUBLE_Ebias) & 0x7ff) << 20); 494 /* Number is zero */
542 } 495 l[0] = 0;
543 } 496 l[1] = 0;
544 } 497 } else if (st0_tag == TAG_Special) {
545 else if (st0_tag == TAG_Zero) 498 st0_tag = FPU_Special(st0_ptr);
546 { 499 if (st0_tag == TW_Denormal) {
547 /* Number is zero */ 500 /* A denormal will always underflow. */
548 l[0] = 0;
549 l[1] = 0;
550 }
551 else if ( st0_tag == TAG_Special )
552 {
553 st0_tag = FPU_Special(st0_ptr);
554 if ( st0_tag == TW_Denormal )
555 {
556 /* A denormal will always underflow. */
557#ifndef PECULIAR_486 501#ifndef PECULIAR_486
558 /* An 80486 is supposed to be able to generate 502 /* An 80486 is supposed to be able to generate
559 a denormal exception here, but... */ 503 a denormal exception here, but... */
560 /* Underflow has priority. */ 504 /* Underflow has priority. */
561 if ( control_word & CW_Underflow ) 505 if (control_word & CW_Underflow)
562 denormal_operand(); 506 denormal_operand();
563#endif /* PECULIAR_486 */ 507#endif /* PECULIAR_486 */
564 reg_copy(st0_ptr, &tmp); 508 reg_copy(st0_ptr, &tmp);
565 goto denormal_arg; 509 goto denormal_arg;
566 } 510 } else if (st0_tag == TW_Infinity) {
567 else if (st0_tag == TW_Infinity) 511 l[0] = 0;
568 { 512 l[1] = 0x7ff00000;
569 l[0] = 0; 513 } else if (st0_tag == TW_NaN) {
570 l[1] = 0x7ff00000; 514 /* Is it really a NaN ? */
571 } 515 if ((exponent(st0_ptr) == EXP_OVER)
572 else if (st0_tag == TW_NaN) 516 && (st0_ptr->sigh & 0x80000000)) {
573 { 517 /* See if we can get a valid NaN from the FPU_REG */
574 /* Is it really a NaN ? */ 518 l[0] =
575 if ( (exponent(st0_ptr) == EXP_OVER) 519 (st0_ptr->sigl >> 11) | (st0_ptr->
576 && (st0_ptr->sigh & 0x80000000) ) 520 sigh << 21);
577 { 521 l[1] = ((st0_ptr->sigh >> 11) & 0xfffff);
578 /* See if we can get a valid NaN from the FPU_REG */ 522 if (!(st0_ptr->sigh & 0x40000000)) {
579 l[0] = (st0_ptr->sigl >> 11) | (st0_ptr->sigh << 21); 523 /* It is a signalling NaN */
580 l[1] = ((st0_ptr->sigh >> 11) & 0xfffff); 524 EXCEPTION(EX_Invalid);
581 if ( !(st0_ptr->sigh & 0x40000000) ) 525 if (!(control_word & CW_Invalid))
582 { 526 return 0;
583 /* It is a signalling NaN */ 527 l[1] |= (0x40000000 >> 11);
584 EXCEPTION(EX_Invalid); 528 }
585 if ( !(control_word & CW_Invalid) ) 529 l[1] |= 0x7ff00000;
586 return 0; 530 } else {
587 l[1] |= (0x40000000 >> 11); 531 /* It is an unsupported data type */
532 EXCEPTION(EX_Invalid);
533 if (!(control_word & CW_Invalid))
534 return 0;
535 l[0] = 0;
536 l[1] = 0xfff80000;
537 }
588 } 538 }
589 l[1] |= 0x7ff00000; 539 } else if (st0_tag == TAG_Empty) {
590 } 540 /* Empty register (stack underflow) */
591 else 541 EXCEPTION(EX_StackUnder);
592 { 542 if (control_word & CW_Invalid) {
593 /* It is an unsupported data type */ 543 /* The masked response */
594 EXCEPTION(EX_Invalid); 544 /* Put out the QNaN indefinite */
595 if ( !(control_word & CW_Invalid) ) 545 RE_ENTRANT_CHECK_OFF;
596 return 0; 546 FPU_access_ok(VERIFY_WRITE, dfloat, 8);
597 l[0] = 0; 547 FPU_put_user(0, (unsigned long __user *)dfloat);
598 l[1] = 0xfff80000; 548 FPU_put_user(0xfff80000,
599 } 549 1 + (unsigned long __user *)dfloat);
550 RE_ENTRANT_CHECK_ON;
551 return 1;
552 } else
553 return 0;
600 } 554 }
601 } 555 if (getsign(st0_ptr))
602 else if ( st0_tag == TAG_Empty ) 556 l[1] |= 0x80000000;
603 {
604 /* Empty register (stack underflow) */
605 EXCEPTION(EX_StackUnder);
606 if ( control_word & CW_Invalid )
607 {
608 /* The masked response */
609 /* Put out the QNaN indefinite */
610 RE_ENTRANT_CHECK_OFF;
611 FPU_access_ok(VERIFY_WRITE,dfloat,8);
612 FPU_put_user(0, (unsigned long __user *) dfloat);
613 FPU_put_user(0xfff80000, 1 + (unsigned long __user *) dfloat);
614 RE_ENTRANT_CHECK_ON;
615 return 1;
616 }
617 else
618 return 0;
619 }
620 if ( getsign(st0_ptr) )
621 l[1] |= 0x80000000;
622
623 RE_ENTRANT_CHECK_OFF;
624 FPU_access_ok(VERIFY_WRITE,dfloat,8);
625 FPU_put_user(l[0], (unsigned long __user *)dfloat);
626 FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat);
627 RE_ENTRANT_CHECK_ON;
628
629 return 1;
630}
631 557
558 RE_ENTRANT_CHECK_OFF;
559 FPU_access_ok(VERIFY_WRITE, dfloat, 8);
560 FPU_put_user(l[0], (unsigned long __user *)dfloat);
561 FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat);
562 RE_ENTRANT_CHECK_ON;
563
564 return 1;
565}
632 566
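
The RC_RND case in FPU_store_double above ("Rounding can get a little messy..") is round-to-nearest-even applied to the 11 low significand bits that a double cannot hold: round up when the dropped bits are worth more than half an ulp, or exactly half an ulp with the lowest kept bit (bit 11) also set. A stand-alone restatement of that predicate, as a sketch:

/* Round-to-nearest-even test for narrowing a 64-bit significand to the
 * 53 bits of a double, as in the RC_RND case above; sketch only. */
#include <stdio.h>

static int round_up_nearest_even(unsigned int sigl)
{
	return ((sigl & 0x7ff) > 0x400) ||	/* dropped bits above half an ulp */
	       ((sigl & 0xc00) == 0xc00);	/* exactly half, kept value odd   */
}

int main(void)
{
	printf("%d %d %d %d\n",
	       round_up_nearest_even(0x3ff),	/* below half -> 0 */
	       round_up_nearest_even(0x400),	/* tie, even  -> 0 */
	       round_up_nearest_even(0xc00),	/* tie, odd   -> 1 */
	       round_up_nearest_even(0x401));	/* above half -> 1 */
	return 0;
}

The RC_DOWN and RC_UP cases increment only for operands whose sign makes truncation move away from the chosen infinity (any non-zero dropped bits then force a bump), and RC_CHOP never increments.
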
633/* Put a float into user memory */ 567/* Put a float into user memory */
634int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single) 568int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single)
635{ 569{
636 long templ = 0; 570 long templ = 0;
637 unsigned long increment = 0; /* avoid gcc warnings */ 571 unsigned long increment = 0; /* avoid gcc warnings */
638 int precision_loss; 572 int precision_loss;
639 int exp; 573 int exp;
640 FPU_REG tmp; 574 FPU_REG tmp;
641 575
642 if ( st0_tag == TAG_Valid ) 576 if (st0_tag == TAG_Valid) {
643 {
644 577
645 reg_copy(st0_ptr, &tmp); 578 reg_copy(st0_ptr, &tmp);
646 exp = exponent(&tmp); 579 exp = exponent(&tmp);
647 580
648 if ( exp < SINGLE_Emin ) 581 if (exp < SINGLE_Emin) {
649 { 582 addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */
650 addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */
651 583
652 denormal_arg: 584 denormal_arg:
653 585
654 if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) ) 586 if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) {
655 {
656#ifdef PECULIAR_486 587#ifdef PECULIAR_486
657 /* Did it round to a non-denormal ? */ 588 /* Did it round to a non-denormal ? */
658 /* This behaviour might be regarded as peculiar, it appears 589 /* This behaviour might be regarded as peculiar, it appears
659 that the 80486 rounds to the dest precision, then 590 that the 80486 rounds to the dest precision, then
660 converts to decide underflow. */ 591 converts to decide underflow. */
661 if ( !((tmp.sigl == 0x00800000) && 592 if (!((tmp.sigl == 0x00800000) &&
662 ((st0_ptr->sigh & 0x000000ff) || st0_ptr->sigl)) ) 593 ((st0_ptr->sigh & 0x000000ff)
594 || st0_ptr->sigl)))
663#endif /* PECULIAR_486 */ 595#endif /* PECULIAR_486 */
664 { 596 {
665 EXCEPTION(EX_Underflow); 597 EXCEPTION(EX_Underflow);
666 /* This is a special case: see sec 16.2.5.1 of 598 /* This is a special case: see sec 16.2.5.1 of
667 the 80486 book */ 599 the 80486 book */
668 if ( !(control_word & CW_Underflow) ) 600 if (!(control_word & CW_Underflow))
669 return 0; 601 return 0;
670 } 602 }
671 EXCEPTION(precision_loss); 603 EXCEPTION(precision_loss);
672 if ( !(control_word & CW_Precision) ) 604 if (!(control_word & CW_Precision))
673 return 0; 605 return 0;
674 } 606 }
675 templ = tmp.sigl; 607 templ = tmp.sigl;
676 } 608 } else {
677 else 609 if (tmp.sigl | (tmp.sigh & 0x000000ff)) {
678 { 610 unsigned long sigh = tmp.sigh;
679 if ( tmp.sigl | (tmp.sigh & 0x000000ff) ) 611 unsigned long sigl = tmp.sigl;
680 { 612
681 unsigned long sigh = tmp.sigh; 613 precision_loss = 1;
682 unsigned long sigl = tmp.sigl; 614 switch (control_word & CW_RC) {
683 615 case RC_RND:
684 precision_loss = 1; 616 increment = ((sigh & 0xff) > 0x80) /* more than half */
685 switch (control_word & CW_RC) 617 ||(((sigh & 0xff) == 0x80) && sigl) /* more than half */
686 { 618 ||((sigh & 0x180) == 0x180); /* round to even */
687 case RC_RND: 619 break;
688 increment = ((sigh & 0xff) > 0x80) /* more than half */ 620 case RC_DOWN: /* towards -infinity */
689 || (((sigh & 0xff) == 0x80) && sigl) /* more than half */ 621 increment = signpositive(&tmp)
690 || ((sigh & 0x180) == 0x180); /* round to even */ 622 ? 0 : (sigl | (sigh & 0xff));
691 break; 623 break;
692 case RC_DOWN: /* towards -infinity */ 624 case RC_UP: /* towards +infinity */
693 increment = signpositive(&tmp) 625 increment = signpositive(&tmp)
694 ? 0 : (sigl | (sigh & 0xff)); 626 ? (sigl | (sigh & 0xff)) : 0;
695 break; 627 break;
696 case RC_UP: /* towards +infinity */ 628 case RC_CHOP:
697 increment = signpositive(&tmp) 629 increment = 0;
698 ? (sigl | (sigh & 0xff)) : 0; 630 break;
699 break; 631 }
700 case RC_CHOP: 632
701 increment = 0; 633 /* Truncate part of the mantissa */
702 break; 634 tmp.sigl = 0;
703 } 635
704 636 if (increment) {
705 /* Truncate part of the mantissa */ 637 if (sigh >= 0xffffff00) {
706 tmp.sigl = 0; 638 /* The sigh part overflows */
707 639 tmp.sigh = 0x80000000;
708 if (increment) 640 exp++;
709 { 641 if (exp >= EXP_OVER)
710 if ( sigh >= 0xffffff00 ) 642 goto overflow;
711 { 643 } else {
712 /* The sigh part overflows */ 644 tmp.sigh &= 0xffffff00;
713 tmp.sigh = 0x80000000; 645 tmp.sigh += 0x100;
714 exp++; 646 }
715 if ( exp >= EXP_OVER ) 647 } else {
716 goto overflow; 648 tmp.sigh &= 0xffffff00; /* Finish the truncation */
717 } 649 }
718 else 650 } else
719 { 651 precision_loss = 0;
720 tmp.sigh &= 0xffffff00; 652
721 tmp.sigh += 0x100; 653 templ = (tmp.sigh >> 8) & 0x007fffff;
722 } 654
723 } 655 if (exp > SINGLE_Emax) {
724 else 656 overflow:
725 { 657 EXCEPTION(EX_Overflow);
726 tmp.sigh &= 0xffffff00; /* Finish the truncation */ 658 if (!(control_word & CW_Overflow))
727 } 659 return 0;
728 } 660 set_precision_flag_up();
729 else 661 if (!(control_word & CW_Precision))
730 precision_loss = 0; 662 return 0;
731 663
732 templ = (tmp.sigh >> 8) & 0x007fffff; 664 /* This is a special case: see sec 16.2.5.1 of the 80486 book. */
733 665 /* Masked response is overflow to infinity. */
734 if ( exp > SINGLE_Emax ) 666 templ = 0x7f800000;
735 { 667 } else {
736 overflow: 668 if (precision_loss) {
737 EXCEPTION(EX_Overflow); 669 if (increment)
738 if ( !(control_word & CW_Overflow) ) 670 set_precision_flag_up();
739 return 0; 671 else
740 set_precision_flag_up(); 672 set_precision_flag_down();
741 if ( !(control_word & CW_Precision) ) 673 }
742 return 0; 674 /* Add the exponent */
743 675 templ |= ((exp + SINGLE_Ebias) & 0xff) << 23;
744 /* This is a special case: see sec 16.2.5.1 of the 80486 book. */ 676 }
745 /* Masked response is overflow to infinity. */
746 templ = 0x7f800000;
747 }
748 else
749 {
750 if ( precision_loss )
751 {
752 if ( increment )
753 set_precision_flag_up();
754 else
755 set_precision_flag_down();
756 } 677 }
757 /* Add the exponent */ 678 } else if (st0_tag == TAG_Zero) {
758 templ |= ((exp+SINGLE_Ebias) & 0xff) << 23; 679 templ = 0;
759 } 680 } else if (st0_tag == TAG_Special) {
760 } 681 st0_tag = FPU_Special(st0_ptr);
761 } 682 if (st0_tag == TW_Denormal) {
762 else if (st0_tag == TAG_Zero) 683 reg_copy(st0_ptr, &tmp);
763 { 684
764 templ = 0; 685 /* A denormal will always underflow. */
765 }
766 else if ( st0_tag == TAG_Special )
767 {
768 st0_tag = FPU_Special(st0_ptr);
769 if (st0_tag == TW_Denormal)
770 {
771 reg_copy(st0_ptr, &tmp);
772
773 /* A denormal will always underflow. */
774#ifndef PECULIAR_486 686#ifndef PECULIAR_486
775 /* An 80486 is supposed to be able to generate 687 /* An 80486 is supposed to be able to generate
776 a denormal exception here, but... */ 688 a denormal exception here, but... */
777 /* Underflow has priority. */ 689 /* Underflow has priority. */
778 if ( control_word & CW_Underflow ) 690 if (control_word & CW_Underflow)
779 denormal_operand(); 691 denormal_operand();
780#endif /* PECULIAR_486 */ 692#endif /* PECULIAR_486 */
781 goto denormal_arg; 693 goto denormal_arg;
782 } 694 } else if (st0_tag == TW_Infinity) {
783 else if (st0_tag == TW_Infinity) 695 templ = 0x7f800000;
784 { 696 } else if (st0_tag == TW_NaN) {
785 templ = 0x7f800000; 697 /* Is it really a NaN ? */
786 } 698 if ((exponent(st0_ptr) == EXP_OVER)
787 else if (st0_tag == TW_NaN) 699 && (st0_ptr->sigh & 0x80000000)) {
788 { 700 /* See if we can get a valid NaN from the FPU_REG */
789 /* Is it really a NaN ? */ 701 templ = st0_ptr->sigh >> 8;
790 if ( (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000) ) 702 if (!(st0_ptr->sigh & 0x40000000)) {
791 { 703 /* It is a signalling NaN */
792 /* See if we can get a valid NaN from the FPU_REG */ 704 EXCEPTION(EX_Invalid);
793 templ = st0_ptr->sigh >> 8; 705 if (!(control_word & CW_Invalid))
794 if ( !(st0_ptr->sigh & 0x40000000) ) 706 return 0;
795 { 707 templ |= (0x40000000 >> 8);
796 /* It is a signalling NaN */ 708 }
797 EXCEPTION(EX_Invalid); 709 templ |= 0x7f800000;
798 if ( !(control_word & CW_Invalid) ) 710 } else {
799 return 0; 711 /* It is an unsupported data type */
800 templ |= (0x40000000 >> 8); 712 EXCEPTION(EX_Invalid);
713 if (!(control_word & CW_Invalid))
714 return 0;
715 templ = 0xffc00000;
716 }
801 } 717 }
802 templ |= 0x7f800000;
803 }
804 else
805 {
806 /* It is an unsupported data type */
807 EXCEPTION(EX_Invalid);
808 if ( !(control_word & CW_Invalid) )
809 return 0;
810 templ = 0xffc00000;
811 }
812 }
813#ifdef PARANOID 718#ifdef PARANOID
814 else 719 else {
815 { 720 EXCEPTION(EX_INTERNAL | 0x164);
816 EXCEPTION(EX_INTERNAL|0x164); 721 return 0;
817 return 0; 722 }
818 }
819#endif 723#endif
820 } 724 } else if (st0_tag == TAG_Empty) {
821 else if ( st0_tag == TAG_Empty ) 725 /* Empty register (stack underflow) */
822 { 726 EXCEPTION(EX_StackUnder);
823 /* Empty register (stack underflow) */ 727 if (control_word & EX_Invalid) {
824 EXCEPTION(EX_StackUnder); 728 /* The masked response */
825 if ( control_word & EX_Invalid ) 729 /* Put out the QNaN indefinite */
826 { 730 RE_ENTRANT_CHECK_OFF;
827 /* The masked response */ 731 FPU_access_ok(VERIFY_WRITE, single, 4);
828 /* Put out the QNaN indefinite */ 732 FPU_put_user(0xffc00000,
829 RE_ENTRANT_CHECK_OFF; 733 (unsigned long __user *)single);
830 FPU_access_ok(VERIFY_WRITE,single,4); 734 RE_ENTRANT_CHECK_ON;
831 FPU_put_user(0xffc00000, (unsigned long __user *) single); 735 return 1;
832 RE_ENTRANT_CHECK_ON; 736 } else
833 return 1; 737 return 0;
834 } 738 }
835 else
836 return 0;
837 }
838#ifdef PARANOID 739#ifdef PARANOID
839 else 740 else {
840 { 741 EXCEPTION(EX_INTERNAL | 0x163);
841 EXCEPTION(EX_INTERNAL|0x163); 742 return 0;
842 return 0; 743 }
843 }
844#endif 744#endif
845 if ( getsign(st0_ptr) ) 745 if (getsign(st0_ptr))
846 templ |= 0x80000000; 746 templ |= 0x80000000;
847 747
848 RE_ENTRANT_CHECK_OFF; 748 RE_ENTRANT_CHECK_OFF;
849 FPU_access_ok(VERIFY_WRITE,single,4); 749 FPU_access_ok(VERIFY_WRITE, single, 4);
850 FPU_put_user(templ,(unsigned long __user *) single); 750 FPU_put_user(templ, (unsigned long __user *)single);
851 RE_ENTRANT_CHECK_ON; 751 RE_ENTRANT_CHECK_ON;
852 752
853 return 1; 753 return 1;
854} 754}
855 755
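FPU_store_single() above ends by packing the sign, the biased exponent and 23 mantissa bits into the templ word it writes to user space. Below is a minimal user-space sketch of that packing, assuming the usual IEEE-754 single-precision bias of 127 (the SINGLE_Ebias constant itself is not shown in this hunk).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 23 mantissa bits, then the biased 8-bit exponent at bit 23, then the
 * sign at bit 31, mirroring the templ assembly in the hunk above. */
static uint32_t pack_single(int sign, int exp, uint32_t mant23)
{
	uint32_t w = mant23 & 0x007fffff;

	w |= ((uint32_t)(exp + 127) & 0xff) << 23;
	if (sign)
		w |= 0x80000000u;
	return w;
}

int main(void)
{
	uint32_t w = pack_single(0, 0, 0);	/* 1.0f */
	float f;

	memcpy(&f, &w, sizeof(f));
	printf("%f\n", f);			/* prints 1.000000 */
	return 0;
}
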
856
857/* Put a long long into user memory */ 756/* Put a long long into user memory */
858int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d) 757int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
859{ 758{
860 FPU_REG t; 759 FPU_REG t;
861 long long tll; 760 long long tll;
862 int precision_loss; 761 int precision_loss;
863 762
864 if ( st0_tag == TAG_Empty ) 763 if (st0_tag == TAG_Empty) {
865 { 764 /* Empty register (stack underflow) */
866 /* Empty register (stack underflow) */ 765 EXCEPTION(EX_StackUnder);
867 EXCEPTION(EX_StackUnder); 766 goto invalid_operand;
868 goto invalid_operand; 767 } else if (st0_tag == TAG_Special) {
869 } 768 st0_tag = FPU_Special(st0_ptr);
870 else if ( st0_tag == TAG_Special ) 769 if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
871 { 770 EXCEPTION(EX_Invalid);
872 st0_tag = FPU_Special(st0_ptr); 771 goto invalid_operand;
873 if ( (st0_tag == TW_Infinity) || 772 }
874 (st0_tag == TW_NaN) )
875 {
876 EXCEPTION(EX_Invalid);
877 goto invalid_operand;
878 } 773 }
879 } 774
880 775 reg_copy(st0_ptr, &t);
881 reg_copy(st0_ptr, &t); 776 precision_loss = FPU_round_to_int(&t, st0_tag);
882 precision_loss = FPU_round_to_int(&t, st0_tag); 777 ((long *)&tll)[0] = t.sigl;
883 ((long *)&tll)[0] = t.sigl; 778 ((long *)&tll)[1] = t.sigh;
884 ((long *)&tll)[1] = t.sigh; 779 if ((precision_loss == 1) ||
885 if ( (precision_loss == 1) || 780 ((t.sigh & 0x80000000) &&
886 ((t.sigh & 0x80000000) && 781 !((t.sigh == 0x80000000) && (t.sigl == 0) && signnegative(&t)))) {
887 !((t.sigh == 0x80000000) && (t.sigl == 0) && 782 EXCEPTION(EX_Invalid);
888 signnegative(&t))) ) 783 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
889 { 784 invalid_operand:
890 EXCEPTION(EX_Invalid); 785 if (control_word & EX_Invalid) {
891 /* This is a special case: see sec 16.2.5.1 of the 80486 book */ 786 /* Produce something like QNaN "indefinite" */
892 invalid_operand: 787 tll = 0x8000000000000000LL;
893 if ( control_word & EX_Invalid ) 788 } else
894 { 789 return 0;
895 /* Produce something like QNaN "indefinite" */ 790 } else {
896 tll = 0x8000000000000000LL; 791 if (precision_loss)
792 set_precision_flag(precision_loss);
793 if (signnegative(&t))
794 tll = -tll;
897 } 795 }
898 else
899 return 0;
900 }
901 else
902 {
903 if ( precision_loss )
904 set_precision_flag(precision_loss);
905 if ( signnegative(&t) )
906 tll = - tll;
907 }
908
909 RE_ENTRANT_CHECK_OFF;
910 FPU_access_ok(VERIFY_WRITE,d,8);
911 if (copy_to_user(d, &tll, 8))
912 FPU_abort;
913 RE_ENTRANT_CHECK_ON;
914
915 return 1;
916}
917 796
797 RE_ENTRANT_CHECK_OFF;
798 FPU_access_ok(VERIFY_WRITE, d, 8);
799 if (copy_to_user(d, &tll, 8))
800 FPU_abort;
801 RE_ENTRANT_CHECK_ON;
802
803 return 1;
804}
918 805
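The magnitude test in FPU_store_int64() above rejects any rounded value whose top bit is set, with the single exception of the value that maps to -2^63. The same check, standalone and runnable (the function name is made up for illustration):

#include <stdint.h>
#include <stdio.h>

/* A sign/magnitude value fits a signed 64-bit integer unless bit 63 of the
 * magnitude is set, except for exactly -2^63. */
static int fits_in_s64(uint64_t magnitude, int negative)
{
	if (!(magnitude & 0x8000000000000000ULL))
		return 1;
	return negative && magnitude == 0x8000000000000000ULL;
}

int main(void)
{
	printf("%d %d %d\n",
	       fits_in_s64(0x7fffffffffffffffULL, 0),	/* 1: INT64_MAX */
	       fits_in_s64(0x8000000000000000ULL, 1),	/* 1: INT64_MIN */
	       fits_in_s64(0x8000000000000000ULL, 0));	/* 0: 2^63 overflows */
	return 0;
}
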
919/* Put a long into user memory */ 806/* Put a long into user memory */
920int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d) 807int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d)
921{ 808{
922 FPU_REG t; 809 FPU_REG t;
923 int precision_loss; 810 int precision_loss;
924 811
925 if ( st0_tag == TAG_Empty ) 812 if (st0_tag == TAG_Empty) {
926 { 813 /* Empty register (stack underflow) */
927 /* Empty register (stack underflow) */ 814 EXCEPTION(EX_StackUnder);
928 EXCEPTION(EX_StackUnder); 815 goto invalid_operand;
929 goto invalid_operand; 816 } else if (st0_tag == TAG_Special) {
930 } 817 st0_tag = FPU_Special(st0_ptr);
931 else if ( st0_tag == TAG_Special ) 818 if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
932 { 819 EXCEPTION(EX_Invalid);
933 st0_tag = FPU_Special(st0_ptr); 820 goto invalid_operand;
934 if ( (st0_tag == TW_Infinity) || 821 }
935 (st0_tag == TW_NaN) )
936 {
937 EXCEPTION(EX_Invalid);
938 goto invalid_operand;
939 } 822 }
940 } 823
941 824 reg_copy(st0_ptr, &t);
942 reg_copy(st0_ptr, &t); 825 precision_loss = FPU_round_to_int(&t, st0_tag);
943 precision_loss = FPU_round_to_int(&t, st0_tag); 826 if (t.sigh ||
944 if (t.sigh || 827 ((t.sigl & 0x80000000) &&
945 ((t.sigl & 0x80000000) && 828 !((t.sigl == 0x80000000) && signnegative(&t)))) {
946 !((t.sigl == 0x80000000) && signnegative(&t))) ) 829 EXCEPTION(EX_Invalid);
947 { 830 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
948 EXCEPTION(EX_Invalid); 831 invalid_operand:
949 /* This is a special case: see sec 16.2.5.1 of the 80486 book */ 832 if (control_word & EX_Invalid) {
950 invalid_operand: 833 /* Produce something like QNaN "indefinite" */
951 if ( control_word & EX_Invalid ) 834 t.sigl = 0x80000000;
952 { 835 } else
953 /* Produce something like QNaN "indefinite" */ 836 return 0;
954 t.sigl = 0x80000000; 837 } else {
838 if (precision_loss)
839 set_precision_flag(precision_loss);
840 if (signnegative(&t))
841 t.sigl = -(long)t.sigl;
955 } 842 }
956 else
957 return 0;
958 }
959 else
960 {
961 if ( precision_loss )
962 set_precision_flag(precision_loss);
963 if ( signnegative(&t) )
964 t.sigl = -(long)t.sigl;
965 }
966
967 RE_ENTRANT_CHECK_OFF;
968 FPU_access_ok(VERIFY_WRITE,d,4);
969 FPU_put_user(t.sigl, (unsigned long __user *) d);
970 RE_ENTRANT_CHECK_ON;
971
972 return 1;
973}
974 843
844 RE_ENTRANT_CHECK_OFF;
845 FPU_access_ok(VERIFY_WRITE, d, 4);
846 FPU_put_user(t.sigl, (unsigned long __user *)d);
847 RE_ENTRANT_CHECK_ON;
848
849 return 1;
850}
975 851
976/* Put a short into user memory */ 852/* Put a short into user memory */
977int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d) 853int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d)
978{ 854{
979 FPU_REG t; 855 FPU_REG t;
980 int precision_loss; 856 int precision_loss;
981 857
982 if ( st0_tag == TAG_Empty ) 858 if (st0_tag == TAG_Empty) {
983 { 859 /* Empty register (stack underflow) */
984 /* Empty register (stack underflow) */ 860 EXCEPTION(EX_StackUnder);
985 EXCEPTION(EX_StackUnder); 861 goto invalid_operand;
986 goto invalid_operand; 862 } else if (st0_tag == TAG_Special) {
987 } 863 st0_tag = FPU_Special(st0_ptr);
988 else if ( st0_tag == TAG_Special ) 864 if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
989 { 865 EXCEPTION(EX_Invalid);
990 st0_tag = FPU_Special(st0_ptr); 866 goto invalid_operand;
991 if ( (st0_tag == TW_Infinity) || 867 }
992 (st0_tag == TW_NaN) )
993 {
994 EXCEPTION(EX_Invalid);
995 goto invalid_operand;
996 } 868 }
997 } 869
998 870 reg_copy(st0_ptr, &t);
999 reg_copy(st0_ptr, &t); 871 precision_loss = FPU_round_to_int(&t, st0_tag);
1000 precision_loss = FPU_round_to_int(&t, st0_tag); 872 if (t.sigh ||
1001 if (t.sigh || 873 ((t.sigl & 0xffff8000) &&
1002 ((t.sigl & 0xffff8000) && 874 !((t.sigl == 0x8000) && signnegative(&t)))) {
1003 !((t.sigl == 0x8000) && signnegative(&t))) ) 875 EXCEPTION(EX_Invalid);
1004 { 876 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
1005 EXCEPTION(EX_Invalid); 877 invalid_operand:
1006 /* This is a special case: see sec 16.2.5.1 of the 80486 book */ 878 if (control_word & EX_Invalid) {
1007 invalid_operand: 879 /* Produce something like QNaN "indefinite" */
1008 if ( control_word & EX_Invalid ) 880 t.sigl = 0x8000;
1009 { 881 } else
1010 /* Produce something like QNaN "indefinite" */ 882 return 0;
1011 t.sigl = 0x8000; 883 } else {
884 if (precision_loss)
885 set_precision_flag(precision_loss);
886 if (signnegative(&t))
887 t.sigl = -t.sigl;
1012 } 888 }
1013 else
1014 return 0;
1015 }
1016 else
1017 {
1018 if ( precision_loss )
1019 set_precision_flag(precision_loss);
1020 if ( signnegative(&t) )
1021 t.sigl = -t.sigl;
1022 }
1023
1024 RE_ENTRANT_CHECK_OFF;
1025 FPU_access_ok(VERIFY_WRITE,d,2);
1026 FPU_put_user((short)t.sigl, d);
1027 RE_ENTRANT_CHECK_ON;
1028
1029 return 1;
1030}
1031 889
890 RE_ENTRANT_CHECK_OFF;
891 FPU_access_ok(VERIFY_WRITE, d, 2);
892 FPU_put_user((short)t.sigl, d);
893 RE_ENTRANT_CHECK_ON;
894
895 return 1;
896}
1032 897
1033/* Put a packed bcd array into user memory */ 898/* Put a packed bcd array into user memory */
1034int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) 899int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
1035{ 900{
1036 FPU_REG t; 901 FPU_REG t;
1037 unsigned long long ll; 902 unsigned long long ll;
1038 u_char b; 903 u_char b;
1039 int i, precision_loss; 904 int i, precision_loss;
1040 u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0; 905 u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0;
1041 906
1042 if ( st0_tag == TAG_Empty ) 907 if (st0_tag == TAG_Empty) {
1043 { 908 /* Empty register (stack underflow) */
1044 /* Empty register (stack underflow) */ 909 EXCEPTION(EX_StackUnder);
1045 EXCEPTION(EX_StackUnder); 910 goto invalid_operand;
1046 goto invalid_operand; 911 } else if (st0_tag == TAG_Special) {
1047 } 912 st0_tag = FPU_Special(st0_ptr);
1048 else if ( st0_tag == TAG_Special ) 913 if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
1049 { 914 EXCEPTION(EX_Invalid);
1050 st0_tag = FPU_Special(st0_ptr); 915 goto invalid_operand;
1051 if ( (st0_tag == TW_Infinity) || 916 }
1052 (st0_tag == TW_NaN) ) 917 }
1053 { 918
1054 EXCEPTION(EX_Invalid); 919 reg_copy(st0_ptr, &t);
1055 goto invalid_operand; 920 precision_loss = FPU_round_to_int(&t, st0_tag);
921 ll = significand(&t);
922
923 /* Check for overflow, by comparing with 999999999999999999 decimal. */
924 if ((t.sigh > 0x0de0b6b3) ||
925 ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff))) {
926 EXCEPTION(EX_Invalid);
927 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
928 invalid_operand:
929 if (control_word & CW_Invalid) {
930 /* Produce the QNaN "indefinite" */
931 RE_ENTRANT_CHECK_OFF;
932 FPU_access_ok(VERIFY_WRITE, d, 10);
933 for (i = 0; i < 7; i++)
934 FPU_put_user(0, d + i); /* These bytes "undefined" */
935 FPU_put_user(0xc0, d + 7); /* This byte "undefined" */
936 FPU_put_user(0xff, d + 8);
937 FPU_put_user(0xff, d + 9);
938 RE_ENTRANT_CHECK_ON;
939 return 1;
940 } else
941 return 0;
942 } else if (precision_loss) {
943 /* Precision loss doesn't stop the data transfer */
944 set_precision_flag(precision_loss);
1056 } 945 }
1057 } 946
1058 947 RE_ENTRANT_CHECK_OFF;
1059 reg_copy(st0_ptr, &t); 948 FPU_access_ok(VERIFY_WRITE, d, 10);
1060 precision_loss = FPU_round_to_int(&t, st0_tag); 949 RE_ENTRANT_CHECK_ON;
1061 ll = significand(&t); 950 for (i = 0; i < 9; i++) {
1062 951 b = FPU_div_small(&ll, 10);
1063 /* Check for overflow, by comparing with 999999999999999999 decimal. */ 952 b |= (FPU_div_small(&ll, 10)) << 4;
1064 if ( (t.sigh > 0x0de0b6b3) || 953 RE_ENTRANT_CHECK_OFF;
1065 ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff)) ) 954 FPU_put_user(b, d + i);
1066 { 955 RE_ENTRANT_CHECK_ON;
1067 EXCEPTION(EX_Invalid);
1068 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
1069 invalid_operand:
1070 if ( control_word & CW_Invalid )
1071 {
1072 /* Produce the QNaN "indefinite" */
1073 RE_ENTRANT_CHECK_OFF;
1074 FPU_access_ok(VERIFY_WRITE,d,10);
1075 for ( i = 0; i < 7; i++)
1076 FPU_put_user(0, d+i); /* These bytes "undefined" */
1077 FPU_put_user(0xc0, d+7); /* This byte "undefined" */
1078 FPU_put_user(0xff, d+8);
1079 FPU_put_user(0xff, d+9);
1080 RE_ENTRANT_CHECK_ON;
1081 return 1;
1082 } 956 }
1083 else 957 RE_ENTRANT_CHECK_OFF;
1084 return 0; 958 FPU_put_user(sign, d + 9);
1085 } 959 RE_ENTRANT_CHECK_ON;
1086 else if ( precision_loss ) 960
1087 { 961 return 1;
1088 /* Precision loss doesn't stop the data transfer */
1089 set_precision_flag(precision_loss);
1090 }
1091
1092 RE_ENTRANT_CHECK_OFF;
1093 FPU_access_ok(VERIFY_WRITE,d,10);
1094 RE_ENTRANT_CHECK_ON;
1095 for ( i = 0; i < 9; i++)
1096 {
1097 b = FPU_div_small(&ll, 10);
1098 b |= (FPU_div_small(&ll, 10)) << 4;
1099 RE_ENTRANT_CHECK_OFF;
1100 FPU_put_user(b, d+i);
1101 RE_ENTRANT_CHECK_ON;
1102 }
1103 RE_ENTRANT_CHECK_OFF;
1104 FPU_put_user(sign, d+9);
1105 RE_ENTRANT_CHECK_ON;
1106
1107 return 1;
1108} 962}
1109 963
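FPU_store_bcd() above emits the 64-bit significand as 18 packed decimal digits, two per byte with the low digit in the low nibble, and the sign in bit 7 of the tenth byte, peeling off one digit at a time with FPU_div_small(). A user-space sketch of the same packing; div_small() here is a plain C stand-in, not the kernel helper.

#include <stdint.h>
#include <stdio.h>

/* Divide *x by base in place and return the remainder. */
static unsigned div_small(uint64_t *x, unsigned base)
{
	unsigned rem = (unsigned)(*x % base);

	*x /= base;
	return rem;
}

/* Pack an integer magnitude into the 10-byte x87 packed-BCD format. */
static void pack_bcd(uint64_t val, int negative, unsigned char out[10])
{
	int i;

	for (i = 0; i < 9; i++) {
		unsigned char b = div_small(&val, 10);

		b |= div_small(&val, 10) << 4;
		out[i] = b;
	}
	out[9] = negative ? 0x80 : 0;
}

int main(void)
{
	unsigned char d[10];
	int i;

	pack_bcd(123456789ULL, 1, d);
	for (i = 9; i >= 0; i--)
		printf("%02x", d[i]);
	printf("\n");	/* prints 80000000000123456789 */
	return 0;
}
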
1110/*===========================================================================*/ 964/*===========================================================================*/
@@ -1119,59 +973,56 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
1119 largest possible value */ 973 largest possible value */
1120int FPU_round_to_int(FPU_REG *r, u_char tag) 974int FPU_round_to_int(FPU_REG *r, u_char tag)
1121{ 975{
1122 u_char very_big; 976 u_char very_big;
1123 unsigned eax; 977 unsigned eax;
1124 978
1125 if (tag == TAG_Zero) 979 if (tag == TAG_Zero) {
1126 { 980 /* Make sure that zero is returned */
1127 /* Make sure that zero is returned */ 981 significand(r) = 0;
1128 significand(r) = 0; 982 return 0; /* o.k. */
1129 return 0; /* o.k. */ 983 }
1130 } 984
1131 985 if (exponent(r) > 63) {
1132 if (exponent(r) > 63) 986 r->sigl = r->sigh = ~0; /* The largest representable number */
1133 { 987 return 1; /* overflow */
1134 r->sigl = r->sigh = ~0; /* The largest representable number */ 988 }
1135 return 1; /* overflow */ 989
1136 } 990 eax = FPU_shrxs(&r->sigl, 63 - exponent(r));
1137 991 very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */
1138 eax = FPU_shrxs(&r->sigl, 63 - exponent(r));
1139 very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */
1140#define half_or_more (eax & 0x80000000) 992#define half_or_more (eax & 0x80000000)
1141#define frac_part (eax) 993#define frac_part (eax)
1142#define more_than_half ((eax & 0x80000001) == 0x80000001) 994#define more_than_half ((eax & 0x80000001) == 0x80000001)
1143 switch (control_word & CW_RC) 995 switch (control_word & CW_RC) {
1144 { 996 case RC_RND:
1145 case RC_RND: 997 if (more_than_half /* nearest */
1146 if ( more_than_half /* nearest */ 998 || (half_or_more && (r->sigl & 1))) { /* odd -> even */
1147 || (half_or_more && (r->sigl & 1)) ) /* odd -> even */ 999 if (very_big)
1148 { 1000 return 1; /* overflow */
1149 if ( very_big ) return 1; /* overflow */ 1001 significand(r)++;
1150 significand(r) ++; 1002 return PRECISION_LOST_UP;
1151 return PRECISION_LOST_UP; 1003 }
1152 } 1004 break;
1153 break; 1005 case RC_DOWN:
1154 case RC_DOWN: 1006 if (frac_part && getsign(r)) {
1155 if (frac_part && getsign(r)) 1007 if (very_big)
1156 { 1008 return 1; /* overflow */
1157 if ( very_big ) return 1; /* overflow */ 1009 significand(r)++;
1158 significand(r) ++; 1010 return PRECISION_LOST_UP;
1159 return PRECISION_LOST_UP; 1011 }
1160 } 1012 break;
1161 break; 1013 case RC_UP:
1162 case RC_UP: 1014 if (frac_part && !getsign(r)) {
1163 if (frac_part && !getsign(r)) 1015 if (very_big)
1164 { 1016 return 1; /* overflow */
1165 if ( very_big ) return 1; /* overflow */ 1017 significand(r)++;
1166 significand(r) ++; 1018 return PRECISION_LOST_UP;
1167 return PRECISION_LOST_UP; 1019 }
1020 break;
1021 case RC_CHOP:
1022 break;
1168 } 1023 }
1169 break;
1170 case RC_CHOP:
1171 break;
1172 }
1173 1024
1174 return eax ? PRECISION_LOST_DOWN : 0; 1025 return eax ? PRECISION_LOST_DOWN : 0;
1175 1026
1176} 1027}
1177 1028
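FPU_round_to_int() above applies the four x87 rounding-control modes to the fraction bits that FPU_shrxs() shifted out. Below is a simplified, runnable model of those decisions on a sign/magnitude value; frac here stands in for the whole 32-bit fraction rather than the emulator's exact guard/sticky packing, and the enum values are illustrative, not the CW_RC encodings.

#include <stdint.h>
#include <stdio.h>

enum rc { RC_RND, RC_DOWN, RC_UP, RC_CHOP };	/* illustrative values only */

/* Round a sign/magnitude value (ipart + frac/2^32, negated if 'negative')
 * to an integer magnitude.  Incrementing the magnitude of a negative number
 * moves it toward -infinity, which is why RC_DOWN and RC_UP test the sign
 * the same way the kernel code does. */
static uint64_t round_rc(uint64_t ipart, uint32_t frac, int negative,
			 enum rc mode)
{
	int half_or_more = (frac & 0x80000000u) != 0;
	int more_than_half = half_or_more && (frac & 0x7fffffffu);

	switch (mode) {
	case RC_RND:		/* nearest, ties to even */
		if (more_than_half || (half_or_more && (ipart & 1)))
			ipart++;
		break;
	case RC_DOWN:		/* toward -infinity */
		if (frac && negative)
			ipart++;
		break;
	case RC_UP:		/* toward +infinity */
		if (frac && !negative)
			ipart++;
		break;
	case RC_CHOP:		/* toward zero */
		break;
	}
	return ipart;
}

int main(void)
{
	/* 2.5 stays 2 under nearest-even but becomes 3 when rounding up */
	printf("%llu %llu\n",
	       (unsigned long long)round_rc(2, 0x80000000u, 0, RC_RND),
	       (unsigned long long)round_rc(2, 0x80000000u, 0, RC_UP));
	return 0;
}
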
@@ -1179,197 +1030,195 @@ int FPU_round_to_int(FPU_REG *r, u_char tag)
1179 1030
1180u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s) 1031u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s)
1181{ 1032{
1182 unsigned short tag_word = 0; 1033 unsigned short tag_word = 0;
1183 u_char tag; 1034 u_char tag;
1184 int i; 1035 int i;
1185 1036
1186 if ( (addr_modes.default_mode == VM86) || 1037 if ((addr_modes.default_mode == VM86) ||
1187 ((addr_modes.default_mode == PM16) 1038 ((addr_modes.default_mode == PM16)
1188 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) ) 1039 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) {
1189 { 1040 RE_ENTRANT_CHECK_OFF;
1190 RE_ENTRANT_CHECK_OFF; 1041 FPU_access_ok(VERIFY_READ, s, 0x0e);
1191 FPU_access_ok(VERIFY_READ, s, 0x0e); 1042 FPU_get_user(control_word, (unsigned short __user *)s);
1192 FPU_get_user(control_word, (unsigned short __user *) s); 1043 FPU_get_user(partial_status, (unsigned short __user *)(s + 2));
1193 FPU_get_user(partial_status, (unsigned short __user *) (s+2)); 1044 FPU_get_user(tag_word, (unsigned short __user *)(s + 4));
1194 FPU_get_user(tag_word, (unsigned short __user *) (s+4)); 1045 FPU_get_user(instruction_address.offset,
1195 FPU_get_user(instruction_address.offset, (unsigned short __user *) (s+6)); 1046 (unsigned short __user *)(s + 6));
1196 FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+8)); 1047 FPU_get_user(instruction_address.selector,
1197 FPU_get_user(operand_address.offset, (unsigned short __user *) (s+0x0a)); 1048 (unsigned short __user *)(s + 8));
1198 FPU_get_user(operand_address.selector, (unsigned short __user *) (s+0x0c)); 1049 FPU_get_user(operand_address.offset,
1199 RE_ENTRANT_CHECK_ON; 1050 (unsigned short __user *)(s + 0x0a));
1200 s += 0x0e; 1051 FPU_get_user(operand_address.selector,
1201 if ( addr_modes.default_mode == VM86 ) 1052 (unsigned short __user *)(s + 0x0c));
1202 { 1053 RE_ENTRANT_CHECK_ON;
1203 instruction_address.offset 1054 s += 0x0e;
1204 += (instruction_address.selector & 0xf000) << 4; 1055 if (addr_modes.default_mode == VM86) {
1205 operand_address.offset += (operand_address.selector & 0xf000) << 4; 1056 instruction_address.offset
1057 += (instruction_address.selector & 0xf000) << 4;
1058 operand_address.offset +=
1059 (operand_address.selector & 0xf000) << 4;
1060 }
1061 } else {
1062 RE_ENTRANT_CHECK_OFF;
1063 FPU_access_ok(VERIFY_READ, s, 0x1c);
1064 FPU_get_user(control_word, (unsigned short __user *)s);
1065 FPU_get_user(partial_status, (unsigned short __user *)(s + 4));
1066 FPU_get_user(tag_word, (unsigned short __user *)(s + 8));
1067 FPU_get_user(instruction_address.offset,
1068 (unsigned long __user *)(s + 0x0c));
1069 FPU_get_user(instruction_address.selector,
1070 (unsigned short __user *)(s + 0x10));
1071 FPU_get_user(instruction_address.opcode,
1072 (unsigned short __user *)(s + 0x12));
1073 FPU_get_user(operand_address.offset,
1074 (unsigned long __user *)(s + 0x14));
1075 FPU_get_user(operand_address.selector,
1076 (unsigned long __user *)(s + 0x18));
1077 RE_ENTRANT_CHECK_ON;
1078 s += 0x1c;
1206 } 1079 }
1207 }
1208 else
1209 {
1210 RE_ENTRANT_CHECK_OFF;
1211 FPU_access_ok(VERIFY_READ, s, 0x1c);
1212 FPU_get_user(control_word, (unsigned short __user *) s);
1213 FPU_get_user(partial_status, (unsigned short __user *) (s+4));
1214 FPU_get_user(tag_word, (unsigned short __user *) (s+8));
1215 FPU_get_user(instruction_address.offset, (unsigned long __user *) (s+0x0c));
1216 FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+0x10));
1217 FPU_get_user(instruction_address.opcode, (unsigned short __user *) (s+0x12));
1218 FPU_get_user(operand_address.offset, (unsigned long __user *) (s+0x14));
1219 FPU_get_user(operand_address.selector, (unsigned long __user *) (s+0x18));
1220 RE_ENTRANT_CHECK_ON;
1221 s += 0x1c;
1222 }
1223 1080
1224#ifdef PECULIAR_486 1081#ifdef PECULIAR_486
1225 control_word &= ~0xe080; 1082 control_word &= ~0xe080;
1226#endif /* PECULIAR_486 */ 1083#endif /* PECULIAR_486 */
1227 1084
1228 top = (partial_status >> SW_Top_Shift) & 7; 1085 top = (partial_status >> SW_Top_Shift) & 7;
1229 1086
1230 if ( partial_status & ~control_word & CW_Exceptions ) 1087 if (partial_status & ~control_word & CW_Exceptions)
1231 partial_status |= (SW_Summary | SW_Backward); 1088 partial_status |= (SW_Summary | SW_Backward);
1232 else 1089 else
1233 partial_status &= ~(SW_Summary | SW_Backward); 1090 partial_status &= ~(SW_Summary | SW_Backward);
1234 1091
1235 for ( i = 0; i < 8; i++ ) 1092 for (i = 0; i < 8; i++) {
1236 { 1093 tag = tag_word & 3;
1237 tag = tag_word & 3; 1094 tag_word >>= 2;
1238 tag_word >>= 2; 1095
1239 1096 if (tag == TAG_Empty)
1240 if ( tag == TAG_Empty ) 1097 /* New tag is empty. Accept it */
1241 /* New tag is empty. Accept it */ 1098 FPU_settag(i, TAG_Empty);
1242 FPU_settag(i, TAG_Empty); 1099 else if (FPU_gettag(i) == TAG_Empty) {
1243 else if ( FPU_gettag(i) == TAG_Empty ) 1100 /* Old tag is empty and new tag is not empty. New tag is determined
1244 { 1101 by old reg contents */
1245 /* Old tag is empty and new tag is not empty. New tag is determined 1102 if (exponent(&fpu_register(i)) == -EXTENDED_Ebias) {
1246 by old reg contents */ 1103 if (!
1247 if ( exponent(&fpu_register(i)) == - EXTENDED_Ebias ) 1104 (fpu_register(i).sigl | fpu_register(i).
1248 { 1105 sigh))
1249 if ( !(fpu_register(i).sigl | fpu_register(i).sigh) ) 1106 FPU_settag(i, TAG_Zero);
1250 FPU_settag(i, TAG_Zero); 1107 else
1251 else 1108 FPU_settag(i, TAG_Special);
1252 FPU_settag(i, TAG_Special); 1109 } else if (exponent(&fpu_register(i)) ==
1253 } 1110 0x7fff - EXTENDED_Ebias) {
1254 else if ( exponent(&fpu_register(i)) == 0x7fff - EXTENDED_Ebias ) 1111 FPU_settag(i, TAG_Special);
1255 { 1112 } else if (fpu_register(i).sigh & 0x80000000)
1256 FPU_settag(i, TAG_Special); 1113 FPU_settag(i, TAG_Valid);
1257 } 1114 else
1258 else if ( fpu_register(i).sigh & 0x80000000 ) 1115 FPU_settag(i, TAG_Special); /* An Un-normal */
1259 FPU_settag(i, TAG_Valid); 1116 }
1260 else 1117 /* Else old tag is not empty and new tag is not empty. Old tag
1261 FPU_settag(i, TAG_Special); /* An Un-normal */ 1118 remains correct */
1262 } 1119 }
1263 /* Else old tag is not empty and new tag is not empty. Old tag
1264 remains correct */
1265 }
1266
1267 return s;
1268}
1269 1120
1121 return s;
1122}
1270 1123
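The second branch of fldenv() above reads a 28-byte protected-mode environment image field by field at fixed offsets, and fstenv() writes the same image back. For reference, a struct that mirrors that layout; the type and field names are descriptive, not the kernel's.

#include <stdint.h>

/* Byte layout implied by the 32-bit protected-mode branch above
 * (offsets 0x00..0x1b, 28 bytes in total). */
struct i387_env32 {
	uint16_t control_word;		/* 0x00 */
	uint16_t pad0;
	uint16_t status_word;		/* 0x04 */
	uint16_t pad1;
	uint16_t tag_word;		/* 0x08 */
	uint16_t pad2;
	uint32_t fip_offset;		/* 0x0c  last instruction pointer */
	uint16_t fip_selector;		/* 0x10 */
	uint16_t fop;			/* 0x12  last opcode */
	uint32_t foo_offset;		/* 0x14  last operand pointer */
	uint32_t foo_selector;		/* 0x18 */
};

_Static_assert(sizeof(struct i387_env32) == 0x1c,
	       "must match the 0x1c bytes fldenv() consumes");
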
1271void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) 1124void frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
1272{ 1125{
1273 int i, regnr; 1126 int i, regnr;
1274 u_char __user *s = fldenv(addr_modes, data_address); 1127 u_char __user *s = fldenv(addr_modes, data_address);
1275 int offset = (top & 7) * 10, other = 80 - offset; 1128 int offset = (top & 7) * 10, other = 80 - offset;
1276 1129
1277 /* Copy all registers in stack order. */ 1130 /* Copy all registers in stack order. */
1278 RE_ENTRANT_CHECK_OFF; 1131 RE_ENTRANT_CHECK_OFF;
1279 FPU_access_ok(VERIFY_READ,s,80); 1132 FPU_access_ok(VERIFY_READ, s, 80);
1280 __copy_from_user(register_base+offset, s, other); 1133 __copy_from_user(register_base + offset, s, other);
1281 if ( offset ) 1134 if (offset)
1282 __copy_from_user(register_base, s+other, offset); 1135 __copy_from_user(register_base, s + other, offset);
1283 RE_ENTRANT_CHECK_ON; 1136 RE_ENTRANT_CHECK_ON;
1284 1137
1285 for ( i = 0; i < 8; i++ ) 1138 for (i = 0; i < 8; i++) {
1286 { 1139 regnr = (i + top) & 7;
1287 regnr = (i+top) & 7; 1140 if (FPU_gettag(regnr) != TAG_Empty)
1288 if ( FPU_gettag(regnr) != TAG_Empty ) 1141 /* The loaded data over-rides all other cases. */
1289 /* The loaded data over-rides all other cases. */ 1142 FPU_settag(regnr, FPU_tagof(&st(i)));
1290 FPU_settag(regnr, FPU_tagof(&st(i))); 1143 }
1291 }
1292 1144
1293} 1145}
1294 1146
1295
1296u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d) 1147u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
1297{ 1148{
1298 if ( (addr_modes.default_mode == VM86) || 1149 if ((addr_modes.default_mode == VM86) ||
1299 ((addr_modes.default_mode == PM16) 1150 ((addr_modes.default_mode == PM16)
1300 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) ) 1151 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) {
1301 { 1152 RE_ENTRANT_CHECK_OFF;
1302 RE_ENTRANT_CHECK_OFF; 1153 FPU_access_ok(VERIFY_WRITE, d, 14);
1303 FPU_access_ok(VERIFY_WRITE,d,14);
1304#ifdef PECULIAR_486 1154#ifdef PECULIAR_486
1305 FPU_put_user(control_word & ~0xe080, (unsigned long __user *) d); 1155 FPU_put_user(control_word & ~0xe080, (unsigned long __user *)d);
1306#else 1156#else
1307 FPU_put_user(control_word, (unsigned short __user *) d); 1157 FPU_put_user(control_word, (unsigned short __user *)d);
1308#endif /* PECULIAR_486 */ 1158#endif /* PECULIAR_486 */
1309 FPU_put_user(status_word(), (unsigned short __user *) (d+2)); 1159 FPU_put_user(status_word(), (unsigned short __user *)(d + 2));
1310 FPU_put_user(fpu_tag_word, (unsigned short __user *) (d+4)); 1160 FPU_put_user(fpu_tag_word, (unsigned short __user *)(d + 4));
1311 FPU_put_user(instruction_address.offset, (unsigned short __user *) (d+6)); 1161 FPU_put_user(instruction_address.offset,
1312 FPU_put_user(operand_address.offset, (unsigned short __user *) (d+0x0a)); 1162 (unsigned short __user *)(d + 6));
1313 if ( addr_modes.default_mode == VM86 ) 1163 FPU_put_user(operand_address.offset,
1314 { 1164 (unsigned short __user *)(d + 0x0a));
1315 FPU_put_user((instruction_address.offset & 0xf0000) >> 4, 1165 if (addr_modes.default_mode == VM86) {
1316 (unsigned short __user *) (d+8)); 1166 FPU_put_user((instruction_address.
1317 FPU_put_user((operand_address.offset & 0xf0000) >> 4, 1167 offset & 0xf0000) >> 4,
1318 (unsigned short __user *) (d+0x0c)); 1168 (unsigned short __user *)(d + 8));
1319 } 1169 FPU_put_user((operand_address.offset & 0xf0000) >> 4,
1320 else 1170 (unsigned short __user *)(d + 0x0c));
1321 { 1171 } else {
1322 FPU_put_user(instruction_address.selector, (unsigned short __user *) (d+8)); 1172 FPU_put_user(instruction_address.selector,
1323 FPU_put_user(operand_address.selector, (unsigned short __user *) (d+0x0c)); 1173 (unsigned short __user *)(d + 8));
1324 } 1174 FPU_put_user(operand_address.selector,
1325 RE_ENTRANT_CHECK_ON; 1175 (unsigned short __user *)(d + 0x0c));
1326 d += 0x0e; 1176 }
1327 } 1177 RE_ENTRANT_CHECK_ON;
1328 else 1178 d += 0x0e;
1329 { 1179 } else {
1330 RE_ENTRANT_CHECK_OFF; 1180 RE_ENTRANT_CHECK_OFF;
1331 FPU_access_ok(VERIFY_WRITE, d, 7*4); 1181 FPU_access_ok(VERIFY_WRITE, d, 7 * 4);
1332#ifdef PECULIAR_486 1182#ifdef PECULIAR_486
1333 control_word &= ~0xe080; 1183 control_word &= ~0xe080;
1334 /* An 80486 sets nearly all of the reserved bits to 1. */ 1184 /* An 80486 sets nearly all of the reserved bits to 1. */
1335 control_word |= 0xffff0040; 1185 control_word |= 0xffff0040;
1336 partial_status = status_word() | 0xffff0000; 1186 partial_status = status_word() | 0xffff0000;
1337 fpu_tag_word |= 0xffff0000; 1187 fpu_tag_word |= 0xffff0000;
1338 I387.soft.fcs &= ~0xf8000000; 1188 I387.soft.fcs &= ~0xf8000000;
1339 I387.soft.fos |= 0xffff0000; 1189 I387.soft.fos |= 0xffff0000;
1340#endif /* PECULIAR_486 */ 1190#endif /* PECULIAR_486 */
1341 if (__copy_to_user(d, &control_word, 7*4)) 1191 if (__copy_to_user(d, &control_word, 7 * 4))
1342 FPU_abort; 1192 FPU_abort;
1343 RE_ENTRANT_CHECK_ON; 1193 RE_ENTRANT_CHECK_ON;
1344 d += 0x1c; 1194 d += 0x1c;
1345 } 1195 }
1346
1347 control_word |= CW_Exceptions;
1348 partial_status &= ~(SW_Summary | SW_Backward);
1349
1350 return d;
1351}
1352 1196
1197 control_word |= CW_Exceptions;
1198 partial_status &= ~(SW_Summary | SW_Backward);
1199
1200 return d;
1201}
1353 1202
1354void fsave(fpu_addr_modes addr_modes, u_char __user *data_address) 1203void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
1355{ 1204{
1356 u_char __user *d; 1205 u_char __user *d;
1357 int offset = (top & 7) * 10, other = 80 - offset; 1206 int offset = (top & 7) * 10, other = 80 - offset;
1358 1207
1359 d = fstenv(addr_modes, data_address); 1208 d = fstenv(addr_modes, data_address);
1360 1209
1361 RE_ENTRANT_CHECK_OFF; 1210 RE_ENTRANT_CHECK_OFF;
1362 FPU_access_ok(VERIFY_WRITE,d,80); 1211 FPU_access_ok(VERIFY_WRITE, d, 80);
1363 1212
1364 /* Copy all registers in stack order. */ 1213 /* Copy all registers in stack order. */
1365 if (__copy_to_user(d, register_base+offset, other)) 1214 if (__copy_to_user(d, register_base + offset, other))
1366 FPU_abort; 1215 FPU_abort;
1367 if ( offset ) 1216 if (offset)
1368 if (__copy_to_user(d+other, register_base, offset)) 1217 if (__copy_to_user(d + other, register_base, offset))
1369 FPU_abort; 1218 FPU_abort;
1370 RE_ENTRANT_CHECK_ON; 1219 RE_ENTRANT_CHECK_ON;
1371 1220
1372 finit(); 1221 finit();
1373} 1222}
1374 1223
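frstor() and fsave() above copy the 80-byte register file "in stack order": the 10-byte slot of the current top-of-stack register first, then the remaining slots, wrapping around. A user-space model of that two-part copy:

#include <stdio.h>
#include <string.h>

/* Rotate the 8 x 10-byte register file so that the slot for st(0)
 * (physical register 'top') comes first, as fsave()/frstor() do. */
static void copy_in_stack_order(const unsigned char regs[80], int top,
				unsigned char out[80])
{
	int offset = (top & 7) * 10, other = 80 - offset;

	memcpy(out, regs + offset, other);
	if (offset)
		memcpy(out + other, regs, offset);
}

int main(void)
{
	unsigned char regs[80], out[80];
	int i;

	for (i = 0; i < 80; i++)
		regs[i] = i / 10;	/* physical register number in each byte */
	copy_in_stack_order(regs, 3, out);
	printf("st(0) bytes come from physical reg %d\n", out[0]);	/* 3 */
	return 0;
}
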
1375/*===========================================================================*/ 1224/*===========================================================================*/
diff --git a/arch/x86/math-emu/reg_mul.c b/arch/x86/math-emu/reg_mul.c
index 40f50b61bc6..36c37f71f71 100644
--- a/arch/x86/math-emu/reg_mul.c
+++ b/arch/x86/math-emu/reg_mul.c
@@ -20,7 +20,6 @@
20#include "reg_constant.h" 20#include "reg_constant.h"
21#include "fpu_system.h" 21#include "fpu_system.h"
22 22
23
24/* 23/*
25 Multiply two registers to give a register result. 24 Multiply two registers to give a register result.
26 The sources are st(deststnr) and (b,tagb,signb). 25 The sources are st(deststnr) and (b,tagb,signb).
@@ -29,104 +28,88 @@
29/* This routine must be called with non-empty source registers */ 28/* This routine must be called with non-empty source registers */
30int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w) 29int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
31{ 30{
32 FPU_REG *a = &st(deststnr); 31 FPU_REG *a = &st(deststnr);
33 FPU_REG *dest = a; 32 FPU_REG *dest = a;
34 u_char taga = FPU_gettagi(deststnr); 33 u_char taga = FPU_gettagi(deststnr);
35 u_char saved_sign = getsign(dest); 34 u_char saved_sign = getsign(dest);
36 u_char sign = (getsign(a) ^ getsign(b)); 35 u_char sign = (getsign(a) ^ getsign(b));
37 int tag; 36 int tag;
38
39 37
40 if ( !(taga | tagb) ) 38 if (!(taga | tagb)) {
41 { 39 /* Both regs Valid, this should be the most common case. */
42 /* Both regs Valid, this should be the most common case. */
43 40
44 tag = FPU_u_mul(a, b, dest, control_w, sign, exponent(a) + exponent(b)); 41 tag =
45 if ( tag < 0 ) 42 FPU_u_mul(a, b, dest, control_w, sign,
46 { 43 exponent(a) + exponent(b));
47 setsign(dest, saved_sign); 44 if (tag < 0) {
48 return tag; 45 setsign(dest, saved_sign);
46 return tag;
47 }
48 FPU_settagi(deststnr, tag);
49 return tag;
49 } 50 }
50 FPU_settagi(deststnr, tag);
51 return tag;
52 }
53 51
54 if ( taga == TAG_Special ) 52 if (taga == TAG_Special)
55 taga = FPU_Special(a); 53 taga = FPU_Special(a);
56 if ( tagb == TAG_Special ) 54 if (tagb == TAG_Special)
57 tagb = FPU_Special(b); 55 tagb = FPU_Special(b);
58 56
59 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) 57 if (((taga == TAG_Valid) && (tagb == TW_Denormal))
60 || ((taga == TW_Denormal) && (tagb == TAG_Valid)) 58 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
61 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) 59 || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
62 { 60 FPU_REG x, y;
63 FPU_REG x, y; 61 if (denormal_operand() < 0)
64 if ( denormal_operand() < 0 ) 62 return FPU_Exception;
65 return FPU_Exception;
66
67 FPU_to_exp16(a, &x);
68 FPU_to_exp16(b, &y);
69 tag = FPU_u_mul(&x, &y, dest, control_w, sign,
70 exponent16(&x) + exponent16(&y));
71 if ( tag < 0 )
72 {
73 setsign(dest, saved_sign);
74 return tag;
75 }
76 FPU_settagi(deststnr, tag);
77 return tag;
78 }
79 else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) )
80 {
81 if ( ((tagb == TW_Denormal) || (taga == TW_Denormal))
82 && (denormal_operand() < 0) )
83 return FPU_Exception;
84 63
85 /* Must have either both arguments == zero, or 64 FPU_to_exp16(a, &x);
86 one valid and the other zero. 65 FPU_to_exp16(b, &y);
87 The result is therefore zero. */ 66 tag = FPU_u_mul(&x, &y, dest, control_w, sign,
88 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); 67 exponent16(&x) + exponent16(&y));
89 /* The 80486 book says that the answer is +0, but a real 68 if (tag < 0) {
90 80486 behaves this way. 69 setsign(dest, saved_sign);
91 IEEE-754 apparently says it should be this way. */ 70 return tag;
92 setsign(dest, sign); 71 }
93 return TAG_Zero; 72 FPU_settagi(deststnr, tag);
94 } 73 return tag;
95 /* Must have infinities, NaNs, etc */ 74 } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) {
96 else if ( (taga == TW_NaN) || (tagb == TW_NaN) ) 75 if (((tagb == TW_Denormal) || (taga == TW_Denormal))
97 { 76 && (denormal_operand() < 0))
98 return real_2op_NaN(b, tagb, deststnr, &st(0)); 77 return FPU_Exception;
99 }
100 else if ( ((taga == TW_Infinity) && (tagb == TAG_Zero))
101 || ((tagb == TW_Infinity) && (taga == TAG_Zero)) )
102 {
103 return arith_invalid(deststnr); /* Zero*Infinity is invalid */
104 }
105 else if ( ((taga == TW_Denormal) || (tagb == TW_Denormal))
106 && (denormal_operand() < 0) )
107 {
108 return FPU_Exception;
109 }
110 else if (taga == TW_Infinity)
111 {
112 FPU_copy_to_regi(a, TAG_Special, deststnr);
113 setsign(dest, sign);
114 return TAG_Special;
115 }
116 else if (tagb == TW_Infinity)
117 {
118 FPU_copy_to_regi(b, TAG_Special, deststnr);
119 setsign(dest, sign);
120 return TAG_Special;
121 }
122 78
79 /* Must have either both arguments == zero, or
80 one valid and the other zero.
81 The result is therefore zero. */
82 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
83 /* The 80486 book says that the answer is +0, but a real
84 80486 behaves this way.
85 IEEE-754 apparently says it should be this way. */
86 setsign(dest, sign);
87 return TAG_Zero;
88 }
89 /* Must have infinities, NaNs, etc */
90 else if ((taga == TW_NaN) || (tagb == TW_NaN)) {
91 return real_2op_NaN(b, tagb, deststnr, &st(0));
92 } else if (((taga == TW_Infinity) && (tagb == TAG_Zero))
93 || ((tagb == TW_Infinity) && (taga == TAG_Zero))) {
94 return arith_invalid(deststnr); /* Zero*Infinity is invalid */
95 } else if (((taga == TW_Denormal) || (tagb == TW_Denormal))
96 && (denormal_operand() < 0)) {
97 return FPU_Exception;
98 } else if (taga == TW_Infinity) {
99 FPU_copy_to_regi(a, TAG_Special, deststnr);
100 setsign(dest, sign);
101 return TAG_Special;
102 } else if (tagb == TW_Infinity) {
103 FPU_copy_to_regi(b, TAG_Special, deststnr);
104 setsign(dest, sign);
105 return TAG_Special;
106 }
123#ifdef PARANOID 107#ifdef PARANOID
124 else 108 else {
125 { 109 EXCEPTION(EX_INTERNAL | 0x102);
126 EXCEPTION(EX_INTERNAL|0x102); 110 return FPU_Exception;
127 return FPU_Exception; 111 }
128 } 112#endif /* PARANOID */
129#endif /* PARANOID */
130 113
131 return 0; 114 return 0;
132} 115}
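In FPU_mul() above the result sign is computed once as the XOR of the operand signs and is applied even to a zero result, per the comment about real 80486 behaviour. The rule in isolation, as a tiny runnable sketch:

#include <stdio.h>

enum fpsign { SIGN_POS = 0, SIGN_NEG = 1 };

/* The product's sign is the XOR of the operand signs; a zero result keeps
 * that computed sign rather than being forced to +0. */
static enum fpsign product_sign(enum fpsign a, enum fpsign b)
{
	return a ^ b;
}

int main(void)
{
	printf("%d %d %d\n",
	       product_sign(SIGN_POS, SIGN_POS),	/* 0: +x * +y -> + */
	       product_sign(SIGN_NEG, SIGN_POS),	/* 1: -x * +y -> - */
	       product_sign(SIGN_NEG, SIGN_NEG));	/* 0: -x * -y -> + */
	return 0;
}
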
diff --git a/arch/x86/math-emu/status_w.h b/arch/x86/math-emu/status_w.h
index 59e73302aa6..54a3f226982 100644
--- a/arch/x86/math-emu/status_w.h
+++ b/arch/x86/math-emu/status_w.h
@@ -10,7 +10,7 @@
10#ifndef _STATUS_H_ 10#ifndef _STATUS_H_
11#define _STATUS_H_ 11#define _STATUS_H_
12 12
13#include "fpu_emu.h" /* for definition of PECULIAR_486 */ 13#include "fpu_emu.h" /* for definition of PECULIAR_486 */
14 14
15#ifdef __ASSEMBLY__ 15#ifdef __ASSEMBLY__
16#define Const__(x) $##x 16#define Const__(x) $##x
@@ -34,7 +34,7 @@
34#define SW_Denorm_Op Const__(0x0002) /* denormalized operand */ 34#define SW_Denorm_Op Const__(0x0002) /* denormalized operand */
35#define SW_Invalid Const__(0x0001) /* invalid operation */ 35#define SW_Invalid Const__(0x0001) /* invalid operation */
36 36
37#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */ 37#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */
38 38
39#ifndef __ASSEMBLY__ 39#ifndef __ASSEMBLY__
40 40
@@ -50,8 +50,8 @@
50 ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top)) 50 ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top))
51static inline void setcc(int cc) 51static inline void setcc(int cc)
52{ 52{
53 partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); 53 partial_status &= ~(SW_C0 | SW_C1 | SW_C2 | SW_C3);
54 partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); 54 partial_status |= (cc) & (SW_C0 | SW_C1 | SW_C2 | SW_C3);
55} 55}
56 56
57#ifdef PECULIAR_486 57#ifdef PECULIAR_486
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
index 362b4ad082d..c36ae88bb54 100644
--- a/arch/x86/mm/Makefile_32
+++ b/arch/x86/mm/Makefile_32
@@ -2,9 +2,8 @@
2# Makefile for the linux i386-specific parts of the memory manager. 2# Makefile for the linux i386-specific parts of the memory manager.
3# 3#
4 4
5obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o 5obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o
6 6
7obj-$(CONFIG_NUMA) += discontig_32.o 7obj-$(CONFIG_NUMA) += discontig_32.o
8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64
index 6bcb47945b8..688c8c28ac8 100644
--- a/arch/x86/mm/Makefile_64
+++ b/arch/x86/mm/Makefile_64
@@ -2,9 +2,8 @@
2# Makefile for the linux x86_64-specific parts of the memory manager. 2# Makefile for the linux x86_64-specific parts of the memory manager.
3# 3#
4 4
5obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o 5obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
7obj-$(CONFIG_NUMA) += numa_64.o 7obj-$(CONFIG_NUMA) += numa_64.o
8obj-$(CONFIG_K8_NUMA) += k8topology_64.o 8obj-$(CONFIG_K8_NUMA) += k8topology_64.o
9obj-$(CONFIG_ACPI_NUMA) += srat_64.o 9obj-$(CONFIG_ACPI_NUMA) += srat_64.o
10
diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c
deleted file mode 100644
index f14da2a53ec..00000000000
--- a/arch/x86/mm/boot_ioremap_32.c
+++ /dev/null
@@ -1,100 +0,0 @@
1/*
2 * arch/i386/mm/boot_ioremap.c
3 *
4 * Re-map functions for early boot-time before paging_init() when the
5 * boot-time pagetables are still in use
6 *
7 * Written by Dave Hansen <haveblue@us.ibm.com>
8 */
9
10
11/*
12 * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
13 * keeps that from happening. If anyone has a better way, I'm listening.
14 *
15 * boot_pte_t is defined only if this all works correctly
16 */
17
18#undef CONFIG_X86_PAE
19#undef CONFIG_PARAVIRT
20#include <asm/page.h>
21#include <asm/pgtable.h>
22#include <asm/tlbflush.h>
23#include <linux/init.h>
24#include <linux/stddef.h>
25
26/*
27 * I'm cheating here. It is known that the two boot PTE pages are
28 * allocated next to each other. I'm pretending that they're just
29 * one big array.
30 */
31
32#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
33
34static unsigned long boot_pte_index(unsigned long vaddr)
35{
36 return __pa(vaddr) >> PAGE_SHIFT;
37}
38
39static inline boot_pte_t* boot_vaddr_to_pte(void *address)
40{
41 boot_pte_t* boot_pg = (boot_pte_t*)pg0;
42 return &boot_pg[boot_pte_index((unsigned long)address)];
43}
44
45/*
46 * This is only for a caller who is clever enough to page-align
47 * phys_addr and virtual_source, and who also has a preference
48 * about which virtual address from which to steal ptes
49 */
50static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages,
51 void* virtual_source)
52{
53 boot_pte_t* pte;
54 int i;
55 char *vaddr = virtual_source;
56
57 pte = boot_vaddr_to_pte(virtual_source);
58 for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
59 set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
60 __flush_tlb_one(&vaddr[i*PAGE_SIZE]);
61 }
62}
63
64/* the virtual space we're going to remap comes from this array */
65#define BOOT_IOREMAP_PAGES 4
66#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
67static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
68 __attribute__ ((aligned (PAGE_SIZE)));
69
70/*
71 * This only applies to things which need to ioremap before paging_init()
72 * bt_ioremap() and plain ioremap() are both useless at this point.
73 *
74 * When used, we're still using the boot-time pagetables, which only
75 * have 2 PTE pages mapping the first 8MB
76 *
77 * There is no unmap. The boot-time PTE pages aren't used after boot.
78 * If you really want the space back, just remap it yourself.
79 * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
80 */
81__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
82{
83 unsigned long last_addr, offset;
84 unsigned int nrpages;
85
86 last_addr = phys_addr + size - 1;
87
88 /* page align the requested address */
89 offset = phys_addr & ~PAGE_MASK;
90 phys_addr &= PAGE_MASK;
91 size = PAGE_ALIGN(last_addr) - phys_addr;
92
93 nrpages = size >> PAGE_SHIFT;
94 if (nrpages > BOOT_IOREMAP_PAGES)
95 return NULL;
96
97 __boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
98
99 return &boot_ioremap_space[offset];
100}
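The removed file above existed so that very early boot code could map and read a physical range while the boot-time page tables are still in use. A hypothetical caller, sketched from the file's own comments; the function, buffer and table address are invented for illustration.

#include <linux/init.h>
#include <linux/string.h>

extern void *boot_ioremap(unsigned long phys_addr, unsigned long size);

static char early_copy[128] __initdata;

/* Map a firmware table with the boot-time page tables, copy what is needed,
 * and never unmap -- the boot PTE pages are discarded after boot anyway. */
static void __init peek_firmware_table(unsigned long table_phys)
{
	void *p = boot_ioremap(table_phys, sizeof(early_copy));

	if (p)
		memcpy(early_copy, p, sizeof(early_copy));
}
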
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 13a474d3c6e..04b1d20e261 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -32,6 +32,7 @@
32#include <linux/kexec.h> 32#include <linux/kexec.h>
33#include <linux/pfn.h> 33#include <linux/pfn.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/acpi.h>
35 36
36#include <asm/e820.h> 37#include <asm/e820.h>
37#include <asm/setup.h> 38#include <asm/setup.h>
@@ -103,14 +104,10 @@ extern unsigned long highend_pfn, highstart_pfn;
103 104
104#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
105 106
106static unsigned long node_remap_start_pfn[MAX_NUMNODES];
107unsigned long node_remap_size[MAX_NUMNODES]; 107unsigned long node_remap_size[MAX_NUMNODES];
108static unsigned long node_remap_offset[MAX_NUMNODES];
109static void *node_remap_start_vaddr[MAX_NUMNODES]; 108static void *node_remap_start_vaddr[MAX_NUMNODES];
110void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 109void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
111 110
112static void *node_remap_end_vaddr[MAX_NUMNODES];
113static void *node_remap_alloc_vaddr[MAX_NUMNODES];
114static unsigned long kva_start_pfn; 111static unsigned long kva_start_pfn;
115static unsigned long kva_pages; 112static unsigned long kva_pages;
116/* 113/*
@@ -167,6 +164,22 @@ static void __init allocate_pgdat(int nid)
167 } 164 }
168} 165}
169 166
167#ifdef CONFIG_DISCONTIGMEM
168/*
169 * In the discontig memory model, a portion of the kernel virtual area (KVA)
170 * is reserved and portions of nodes are mapped using it. This is to allow
171 * node-local memory to be allocated for structures that would normally require
172 * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
173 * should be prepared to allocate from the bootmem allocator instead. This KVA
174 * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
175 * layout of memory that are broken if alloc_remap() succeeds for some of the
176 * map and fails for others
177 */
178static unsigned long node_remap_start_pfn[MAX_NUMNODES];
179static void *node_remap_end_vaddr[MAX_NUMNODES];
180static void *node_remap_alloc_vaddr[MAX_NUMNODES];
181static unsigned long node_remap_offset[MAX_NUMNODES];
182
170void *alloc_remap(int nid, unsigned long size) 183void *alloc_remap(int nid, unsigned long size)
171{ 184{
172 void *allocation = node_remap_alloc_vaddr[nid]; 185 void *allocation = node_remap_alloc_vaddr[nid];
@@ -263,11 +276,46 @@ static unsigned long calculate_numa_remap_pages(void)
263 return reserve_pages; 276 return reserve_pages;
264} 277}
265 278
279static void init_remap_allocator(int nid)
280{
281 node_remap_start_vaddr[nid] = pfn_to_kaddr(
282 kva_start_pfn + node_remap_offset[nid]);
283 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
284 (node_remap_size[nid] * PAGE_SIZE);
285 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
286 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
287
288 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
289 (ulong) node_remap_start_vaddr[nid],
290 (ulong) pfn_to_kaddr(highstart_pfn
291 + node_remap_offset[nid] + node_remap_size[nid]));
292}
293#else
294void *alloc_remap(int nid, unsigned long size)
295{
296 return NULL;
297}
298
299static unsigned long calculate_numa_remap_pages(void)
300{
301 return 0;
302}
303
304static void init_remap_allocator(int nid)
305{
306}
307
308void __init remap_numa_kva(void)
309{
310}
311#endif /* CONFIG_DISCONTIGMEM */
312
266extern void setup_bootmem_allocator(void); 313extern void setup_bootmem_allocator(void);
267unsigned long __init setup_memory(void) 314unsigned long __init setup_memory(void)
268{ 315{
269 int nid; 316 int nid;
270 unsigned long system_start_pfn, system_max_low_pfn; 317 unsigned long system_start_pfn, system_max_low_pfn;
318 unsigned long wasted_pages;
271 319
272 /* 320 /*
273 * When mapping a NUMA machine we allocate the node_mem_map arrays 321 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -288,11 +336,18 @@ unsigned long __init setup_memory(void)
288 336
289#ifdef CONFIG_BLK_DEV_INITRD 337#ifdef CONFIG_BLK_DEV_INITRD
290 /* Numa kva area is below the initrd */ 338 /* Numa kva area is below the initrd */
291 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) 339 if (initrd_start)
292 kva_start_pfn = PFN_DOWN(boot_params.hdr.ramdisk_image) 340 kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
293 - kva_pages; 341 - kva_pages;
294#endif 342#endif
295 kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1); 343
344 /*
345 * We waste pages past at the end of the KVA for no good reason other
346 * than how it is located. This is bad.
347 */
348 wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
349 kva_start_pfn -= wasted_pages;
350 kva_pages += wasted_pages;
296 351
297 system_max_low_pfn = max_low_pfn = find_max_low_pfn(); 352 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
298 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", 353 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
@@ -318,19 +373,9 @@ unsigned long __init setup_memory(void)
318 printk("Low memory ends at vaddr %08lx\n", 373 printk("Low memory ends at vaddr %08lx\n",
319 (ulong) pfn_to_kaddr(max_low_pfn)); 374 (ulong) pfn_to_kaddr(max_low_pfn));
320 for_each_online_node(nid) { 375 for_each_online_node(nid) {
321 node_remap_start_vaddr[nid] = pfn_to_kaddr( 376 init_remap_allocator(nid);
322 kva_start_pfn + node_remap_offset[nid]);
323 /* Init the node remap allocator */
324 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
325 (node_remap_size[nid] * PAGE_SIZE);
326 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
327 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
328 377
329 allocate_pgdat(nid); 378 allocate_pgdat(nid);
330 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
331 (ulong) node_remap_start_vaddr[nid],
332 (ulong) pfn_to_kaddr(highstart_pfn
333 + node_remap_offset[nid] + node_remap_size[nid]));
334 } 379 }
335 printk("High memory starts at vaddr %08lx\n", 380 printk("High memory starts at vaddr %08lx\n",
336 (ulong) pfn_to_kaddr(highstart_pfn)); 381 (ulong) pfn_to_kaddr(highstart_pfn));
@@ -345,7 +390,8 @@ unsigned long __init setup_memory(void)
345 390
346void __init numa_kva_reserve(void) 391void __init numa_kva_reserve(void)
347{ 392{
348 reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages)); 393 if (kva_pages)
394 reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages));
349} 395}
350 396
351void __init zone_sizes_init(void) 397void __init zone_sizes_init(void)
@@ -430,3 +476,29 @@ int memory_add_physaddr_to_nid(u64 addr)
430 476
431EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 477EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
432#endif 478#endif
479
480#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT
481/*
482 * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA
483 *
484 * These stub functions are needed to compile 32-bit NUMA when SRAT is
485 * not set. There are functions in srat_64.c for parsing this table
486 * and it may be possible to make them common functions.
487 */
488void acpi_numa_slit_init (struct acpi_table_slit *slit)
489{
490 printk(KERN_INFO "ACPI: No support for parsing SLIT table\n");
491}
492
493void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa)
494{
495}
496
497void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma)
498{
499}
500
501void acpi_numa_arch_fixup(void)
502{
503}
504#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */
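The comment introduced above stresses that alloc_remap() may fail and that callers must be prepared to fall back to the bootmem allocator. A sketch of that caller pattern; the wrapper itself is hypothetical, and it assumes the bootmem interface of this era.

#include <linux/bootmem.h>
#include <linux/mmzone.h>

/* Try the node-local KVA remap area first, fall back to bootmem otherwise. */
static void * __init node_local_alloc(int nid, unsigned long size)
{
	void *p = alloc_remap(nid, size);

	if (!p)
		p = alloc_bootmem_pages_node(NODE_DATA(nid), size);
	return p;
}
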
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
new file mode 100644
index 00000000000..7e8db53528a
--- /dev/null
+++ b/arch/x86/mm/extable.c
@@ -0,0 +1,62 @@
1#include <linux/module.h>
2#include <linux/spinlock.h>
3#include <asm/uaccess.h>
4
5
6int fixup_exception(struct pt_regs *regs)
7{
8 const struct exception_table_entry *fixup;
9
10#ifdef CONFIG_PNPBIOS
11 if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
12 extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
13 extern u32 pnp_bios_is_utter_crap;
14 pnp_bios_is_utter_crap = 1;
15 printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
16 __asm__ volatile(
17 "movl %0, %%esp\n\t"
18 "jmp *%1\n\t"
19 : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
20 panic("do_trap: can't hit this");
21 }
22#endif
23
24 fixup = search_exception_tables(regs->ip);
25 if (fixup) {
26 regs->ip = fixup->fixup;
27 return 1;
28 }
29
30 return 0;
31}
32
33#ifdef CONFIG_X86_64
34/*
 35 * Need to define our own search_extable() on X86_64 to work around
 36 * a B-stepping K8 bug.
37 */
38const struct exception_table_entry *
39search_extable(const struct exception_table_entry *first,
40 const struct exception_table_entry *last,
41 unsigned long value)
42{
43 /* B stepping K8 bug */
44 if ((value >> 32) == 0)
45 value |= 0xffffffffUL << 32;
46
47 while (first <= last) {
48 const struct exception_table_entry *mid;
49 long diff;
50
51 mid = (last - first) / 2 + first;
52 diff = mid->insn - value;
53 if (diff == 0)
54 return mid;
55 else if (diff < 0)
56 first = mid+1;
57 else
58 last = mid-1;
59 }
60 return NULL;
61}
62#endif
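
The biased lookup in the unified search_extable() above boils down to two steps: sign-extend a faulting address whose upper 32 bits a B-stepping K8 may have dropped, then run an ordinary binary search over the sorted exception table. Below is a minimal user-space sketch of that idea, assuming a 64-bit unsigned long; the struct layout, table contents and function names are illustrative only, not the kernel's.

#include <stdio.h>

/* Illustrative stand-in for the kernel's exception_table_entry. */
struct extable_entry {
	unsigned long insn;	/* address of the faulting instruction */
	unsigned long fixup;	/* address to resume execution at */
};

/* Sample table, sorted by insn as the kernel's exception table is. */
static const struct extable_entry table[] = {
	{ 0xffffffff81000010UL, 0xffffffff81000100UL },
	{ 0xffffffff81000020UL, 0xffffffff81000200UL },
	{ 0xffffffff81000030UL, 0xffffffff81000300UL },
};

static const struct extable_entry *
search_extable_demo(const struct extable_entry *first,
		    const struct extable_entry *last,
		    unsigned long value)
{
	/* A B-stepping K8 may report the address with the upper half
	 * cleared: force it back into the canonical kernel range before
	 * searching (assumes 64-bit unsigned long, as on x86_64). */
	if ((value >> 32) == 0)
		value |= 0xffffffffUL << 32;

	while (first <= last) {
		const struct extable_entry *mid = (last - first) / 2 + first;

		if (mid->insn == value)
			return mid;
		if (mid->insn < value)
			first = mid + 1;
		else
			last = mid - 1;
	}
	return NULL;
}

int main(void)
{
	/* Upper 32 bits deliberately cleared, as the erratum would do. */
	unsigned long truncated = 0x81000020UL;
	const struct extable_entry *e =
		search_extable_demo(table, table + 2, truncated);

	if (e)
		printf("fixup at %#lx\n", e->fixup);
	return 0;
}
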
diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c
deleted file mode 100644
index 0ce4f22a263..00000000000
--- a/arch/x86/mm/extable_32.c
+++ /dev/null
@@ -1,35 +0,0 @@
1/*
2 * linux/arch/i386/mm/extable.c
3 */
4
5#include <linux/module.h>
6#include <linux/spinlock.h>
7#include <asm/uaccess.h>
8
9int fixup_exception(struct pt_regs *regs)
10{
11 const struct exception_table_entry *fixup;
12
13#ifdef CONFIG_PNPBIOS
14 if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
15 {
16 extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
17 extern u32 pnp_bios_is_utter_crap;
18 pnp_bios_is_utter_crap = 1;
19 printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
20 __asm__ volatile(
21 "movl %0, %%esp\n\t"
22 "jmp *%1\n\t"
23 : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
24 panic("do_trap: can't hit this");
25 }
26#endif
27
28 fixup = search_exception_tables(regs->eip);
29 if (fixup) {
30 regs->eip = fixup->fixup;
31 return 1;
32 }
33
34 return 0;
35}
diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c
deleted file mode 100644
index 79ac6e7100a..00000000000
--- a/arch/x86/mm/extable_64.c
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * linux/arch/x86_64/mm/extable.c
3 */
4
5#include <linux/module.h>
6#include <linux/spinlock.h>
7#include <linux/init.h>
8#include <asm/uaccess.h>
9
10/* Simple binary search */
11const struct exception_table_entry *
12search_extable(const struct exception_table_entry *first,
13 const struct exception_table_entry *last,
14 unsigned long value)
15{
16 /* Work around a B stepping K8 bug */
17 if ((value >> 32) == 0)
18 value |= 0xffffffffUL << 32;
19
20 while (first <= last) {
21 const struct exception_table_entry *mid;
22 long diff;
23
24 mid = (last - first) / 2 + first;
25 diff = mid->insn - value;
26 if (diff == 0)
27 return mid;
28 else if (diff < 0)
29 first = mid+1;
30 else
31 last = mid-1;
32 }
33 return NULL;
34}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
new file mode 100644
index 00000000000..621afb6343d
--- /dev/null
+++ b/arch/x86/mm/fault.c
@@ -0,0 +1,986 @@
1/*
2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4 */
5
6#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mman.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
16#include <linux/interrupt.h>
17#include <linux/init.h>
18#include <linux/tty.h>
19#include <linux/vt_kern.h> /* For unblank_screen() */
20#include <linux/compiler.h>
21#include <linux/highmem.h>
22#include <linux/bootmem.h> /* for max_low_pfn */
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28
29#include <asm/system.h>
30#include <asm/desc.h>
31#include <asm/segment.h>
32#include <asm/pgalloc.h>
33#include <asm/smp.h>
34#include <asm/tlbflush.h>
35#include <asm/proto.h>
36#include <asm-generic/sections.h>
37
38/*
39 * Page fault error code bits
40 * bit 0 == 0 means no page found, 1 means protection fault
41 * bit 1 == 0 means read, 1 means write
42 * bit 2 == 0 means kernel, 1 means user-mode
43 * bit 3 == 1 means use of reserved bit detected
44 * bit 4 == 1 means fault was an instruction fetch
45 */
46#define PF_PROT (1<<0)
47#define PF_WRITE (1<<1)
48#define PF_USER (1<<2)
49#define PF_RSVD (1<<3)
50#define PF_INSTR (1<<4)
51
52static inline int notify_page_fault(struct pt_regs *regs)
53{
54#ifdef CONFIG_KPROBES
55 int ret = 0;
56
57 /* kprobe_running() needs smp_processor_id() */
58#ifdef CONFIG_X86_32
59 if (!user_mode_vm(regs)) {
60#else
61 if (!user_mode(regs)) {
62#endif
63 preempt_disable();
64 if (kprobe_running() && kprobe_fault_handler(regs, 14))
65 ret = 1;
66 preempt_enable();
67 }
68
69 return ret;
70#else
71 return 0;
72#endif
73}
74
75/*
76 * X86_32
77 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
78 * Check that here and ignore it.
79 *
80 * X86_64
81 * Sometimes the CPU reports invalid exceptions on prefetch.
82 * Check that here and ignore it.
83 *
84 * Opcode checker based on code by Richard Brunner
85 */
86static int is_prefetch(struct pt_regs *regs, unsigned long addr,
87 unsigned long error_code)
88{
89 unsigned char *instr;
90 int scan_more = 1;
91 int prefetch = 0;
92 unsigned char *max_instr;
93
94#ifdef CONFIG_X86_32
95 if (!(__supported_pte_mask & _PAGE_NX))
96 return 0;
97#endif
98
 99 /* If it was an exec fault on NX page, ignore */
100 if (error_code & PF_INSTR)
101 return 0;
102
103 instr = (unsigned char *)convert_ip_to_linear(current, regs);
104 max_instr = instr + 15;
105
106 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
107 return 0;
108
109 while (scan_more && instr < max_instr) {
110 unsigned char opcode;
111 unsigned char instr_hi;
112 unsigned char instr_lo;
113
114 if (probe_kernel_address(instr, opcode))
115 break;
116
117 instr_hi = opcode & 0xf0;
118 instr_lo = opcode & 0x0f;
119 instr++;
120
121 switch (instr_hi) {
122 case 0x20:
123 case 0x30:
124 /*
125 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
126 * In X86_64 long mode, the CPU will signal invalid
127 * opcode if some of these prefixes are present so
128 * X86_64 will never get here anyway
129 */
130 scan_more = ((instr_lo & 7) == 0x6);
131 break;
132#ifdef CONFIG_X86_64
133 case 0x40:
134 /*
135 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
136 * Need to figure out under what instruction mode the
137 * instruction was issued. Could check the LDT for lm,
138 * but for now it's good enough to assume that long
139 * mode only uses well known segments or kernel.
140 */
141 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
142 break;
143#endif
144 case 0x60:
145 /* 0x64 thru 0x67 are valid prefixes in all modes. */
146 scan_more = (instr_lo & 0xC) == 0x4;
147 break;
148 case 0xF0:
149 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
150 scan_more = !instr_lo || (instr_lo>>1) == 1;
151 break;
152 case 0x00:
153 /* Prefetch instruction is 0x0F0D or 0x0F18 */
154 scan_more = 0;
155
156 if (probe_kernel_address(instr, opcode))
157 break;
158 prefetch = (instr_lo == 0xF) &&
159 (opcode == 0x0D || opcode == 0x18);
160 break;
161 default:
162 scan_more = 0;
163 break;
164 }
165 }
166 return prefetch;
167}
168
169static void force_sig_info_fault(int si_signo, int si_code,
170 unsigned long address, struct task_struct *tsk)
171{
172 siginfo_t info;
173
174 info.si_signo = si_signo;
175 info.si_errno = 0;
176 info.si_code = si_code;
177 info.si_addr = (void __user *)address;
178 force_sig_info(si_signo, &info, tsk);
179}
180
181#ifdef CONFIG_X86_64
182static int bad_address(void *p)
183{
184 unsigned long dummy;
185 return probe_kernel_address((unsigned long *)p, dummy);
186}
187#endif
188
189void dump_pagetable(unsigned long address)
190{
191#ifdef CONFIG_X86_32
192 __typeof__(pte_val(__pte(0))) page;
193
194 page = read_cr3();
195 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
196#ifdef CONFIG_X86_PAE
197 printk("*pdpt = %016Lx ", page);
198 if ((page >> PAGE_SHIFT) < max_low_pfn
199 && page & _PAGE_PRESENT) {
200 page &= PAGE_MASK;
201 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
202 & (PTRS_PER_PMD - 1)];
203 printk(KERN_CONT "*pde = %016Lx ", page);
204 page &= ~_PAGE_NX;
205 }
206#else
207 printk("*pde = %08lx ", page);
208#endif
209
210 /*
211 * We must not directly access the pte in the highpte
212 * case if the page table is located in highmem.
213 * And let's rather not kmap-atomic the pte, just in case
214 * it's allocated already.
215 */
216 if ((page >> PAGE_SHIFT) < max_low_pfn
217 && (page & _PAGE_PRESENT)
218 && !(page & _PAGE_PSE)) {
219 page &= PAGE_MASK;
220 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
221 & (PTRS_PER_PTE - 1)];
222 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
223 }
224
225 printk("\n");
226#else /* CONFIG_X86_64 */
227 pgd_t *pgd;
228 pud_t *pud;
229 pmd_t *pmd;
230 pte_t *pte;
231
232 pgd = (pgd_t *)read_cr3();
233
234 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
235 pgd += pgd_index(address);
236 if (bad_address(pgd)) goto bad;
237 printk("PGD %lx ", pgd_val(*pgd));
238 if (!pgd_present(*pgd)) goto ret;
239
240 pud = pud_offset(pgd, address);
241 if (bad_address(pud)) goto bad;
242 printk("PUD %lx ", pud_val(*pud));
243 if (!pud_present(*pud) || pud_large(*pud))
244 goto ret;
245
246 pmd = pmd_offset(pud, address);
247 if (bad_address(pmd)) goto bad;
248 printk("PMD %lx ", pmd_val(*pmd));
249 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
250
251 pte = pte_offset_kernel(pmd, address);
252 if (bad_address(pte)) goto bad;
253 printk("PTE %lx", pte_val(*pte));
254ret:
255 printk("\n");
256 return;
257bad:
258 printk("BAD\n");
259#endif
260}
261
262#ifdef CONFIG_X86_32
263static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
264{
265 unsigned index = pgd_index(address);
266 pgd_t *pgd_k;
267 pud_t *pud, *pud_k;
268 pmd_t *pmd, *pmd_k;
269
270 pgd += index;
271 pgd_k = init_mm.pgd + index;
272
273 if (!pgd_present(*pgd_k))
274 return NULL;
275
276 /*
277 * set_pgd(pgd, *pgd_k); here would be useless on PAE
278 * and redundant with the set_pmd() on non-PAE. As would
279 * set_pud.
280 */
281
282 pud = pud_offset(pgd, address);
283 pud_k = pud_offset(pgd_k, address);
284 if (!pud_present(*pud_k))
285 return NULL;
286
287 pmd = pmd_offset(pud, address);
288 pmd_k = pmd_offset(pud_k, address);
289 if (!pmd_present(*pmd_k))
290 return NULL;
291 if (!pmd_present(*pmd)) {
292 set_pmd(pmd, *pmd_k);
293 arch_flush_lazy_mmu_mode();
294 } else
295 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
296 return pmd_k;
297}
298#endif
299
300#ifdef CONFIG_X86_64
301static const char errata93_warning[] =
302KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
303KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
304KERN_ERR "******* Please consider a BIOS update.\n"
305KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
306#endif
307
308/* Workaround for K8 erratum #93 & buggy BIOS.
309 BIOS SMM functions are required to use a specific workaround
310 to avoid corruption of the 64bit RIP register on C stepping K8.
 311 A lot of BIOSes that didn't get tested properly miss this.
312 The OS sees this as a page fault with the upper 32bits of RIP cleared.
313 Try to work around it here.
314 Note we only handle faults in kernel here.
315 Does nothing for X86_32
316 */
317static int is_errata93(struct pt_regs *regs, unsigned long address)
318{
319#ifdef CONFIG_X86_64
320 static int warned;
321 if (address != regs->ip)
322 return 0;
323 if ((address >> 32) != 0)
324 return 0;
325 address |= 0xffffffffUL << 32;
326 if ((address >= (u64)_stext && address <= (u64)_etext) ||
327 (address >= MODULES_VADDR && address <= MODULES_END)) {
328 if (!warned) {
329 printk(errata93_warning);
330 warned = 1;
331 }
332 regs->ip = address;
333 return 1;
334 }
335#endif
336 return 0;
337}
338
339/*
 340 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
341 * addresses >4GB. We catch this in the page fault handler because these
342 * addresses are not reachable. Just detect this case and return. Any code
343 * segment in LDT is compatibility mode.
344 */
345static int is_errata100(struct pt_regs *regs, unsigned long address)
346{
347#ifdef CONFIG_X86_64
348 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
349 (address >> 32))
350 return 1;
351#endif
352 return 0;
353}
354
355void do_invalid_op(struct pt_regs *, unsigned long);
356
357static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
358{
359#ifdef CONFIG_X86_F00F_BUG
360 unsigned long nr;
361 /*
362 * Pentium F0 0F C7 C8 bug workaround.
363 */
364 if (boot_cpu_data.f00f_bug) {
365 nr = (address - idt_descr.address) >> 3;
366
367 if (nr == 6) {
368 do_invalid_op(regs, 0);
369 return 1;
370 }
371 }
372#endif
373 return 0;
374}
375
376static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
377 unsigned long address)
378{
379#ifdef CONFIG_X86_32
380 if (!oops_may_print())
381 return;
382#endif
383
384#ifdef CONFIG_X86_PAE
385 if (error_code & PF_INSTR) {
386 unsigned int level;
387 pte_t *pte = lookup_address(address, &level);
388
389 if (pte && pte_present(*pte) && !pte_exec(*pte))
390 printk(KERN_CRIT "kernel tried to execute "
391 "NX-protected page - exploit attempt? "
392 "(uid: %d)\n", current->uid);
393 }
394#endif
395
396 printk(KERN_ALERT "BUG: unable to handle kernel ");
397 if (address < PAGE_SIZE)
398 printk(KERN_CONT "NULL pointer dereference");
399 else
400 printk(KERN_CONT "paging request");
401#ifdef CONFIG_X86_32
402 printk(KERN_CONT " at %08lx\n", address);
403#else
404 printk(KERN_CONT " at %016lx\n", address);
405#endif
406 printk(KERN_ALERT "IP:");
407 printk_address(regs->ip, 1);
408 dump_pagetable(address);
409}
410
411#ifdef CONFIG_X86_64
412static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
413 unsigned long error_code)
414{
415 unsigned long flags = oops_begin();
416 struct task_struct *tsk;
417
418 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
419 current->comm, address);
420 dump_pagetable(address);
421 tsk = current;
422 tsk->thread.cr2 = address;
423 tsk->thread.trap_no = 14;
424 tsk->thread.error_code = error_code;
425 if (__die("Bad pagetable", regs, error_code))
426 regs = NULL;
427 oops_end(flags, regs, SIGKILL);
428}
429#endif
430
431static int spurious_fault_check(unsigned long error_code, pte_t *pte)
432{
433 if ((error_code & PF_WRITE) && !pte_write(*pte))
434 return 0;
435 if ((error_code & PF_INSTR) && !pte_exec(*pte))
436 return 0;
437
438 return 1;
439}
440
441/*
442 * Handle a spurious fault caused by a stale TLB entry. This allows
443 * us to lazily refresh the TLB when increasing the permissions of a
444 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
445 * expensive since that implies doing a full cross-processor TLB
446 * flush, even if no stale TLB entries exist on other processors.
447 * There are no security implications to leaving a stale TLB when
448 * increasing the permissions on a page.
449 */
450static int spurious_fault(unsigned long address,
451 unsigned long error_code)
452{
453 pgd_t *pgd;
454 pud_t *pud;
455 pmd_t *pmd;
456 pte_t *pte;
457
458 /* Reserved-bit violation or user access to kernel space? */
459 if (error_code & (PF_USER | PF_RSVD))
460 return 0;
461
462 pgd = init_mm.pgd + pgd_index(address);
463 if (!pgd_present(*pgd))
464 return 0;
465
466 pud = pud_offset(pgd, address);
467 if (!pud_present(*pud))
468 return 0;
469
470 if (pud_large(*pud))
471 return spurious_fault_check(error_code, (pte_t *) pud);
472
473 pmd = pmd_offset(pud, address);
474 if (!pmd_present(*pmd))
475 return 0;
476
477 if (pmd_large(*pmd))
478 return spurious_fault_check(error_code, (pte_t *) pmd);
479
480 pte = pte_offset_kernel(pmd, address);
481 if (!pte_present(*pte))
482 return 0;
483
484 return spurious_fault_check(error_code, pte);
485}
486
487/*
488 * X86_32
489 * Handle a fault on the vmalloc or module mapping area
490 *
491 * X86_64
492 * Handle a fault on the vmalloc area
493 *
494 * This assumes no large pages in there.
495 */
496static int vmalloc_fault(unsigned long address)
497{
498#ifdef CONFIG_X86_32
499 unsigned long pgd_paddr;
500 pmd_t *pmd_k;
501 pte_t *pte_k;
502 /*
503 * Synchronize this task's top level page-table
504 * with the 'reference' page table.
505 *
506 * Do _not_ use "current" here. We might be inside
507 * an interrupt in the middle of a task switch..
508 */
509 pgd_paddr = read_cr3();
510 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
511 if (!pmd_k)
512 return -1;
513 pte_k = pte_offset_kernel(pmd_k, address);
514 if (!pte_present(*pte_k))
515 return -1;
516 return 0;
517#else
518 pgd_t *pgd, *pgd_ref;
519 pud_t *pud, *pud_ref;
520 pmd_t *pmd, *pmd_ref;
521 pte_t *pte, *pte_ref;
522
523 /* Make sure we are in vmalloc area */
524 if (!(address >= VMALLOC_START && address < VMALLOC_END))
525 return -1;
526
527 /* Copy kernel mappings over when needed. This can also
 528 happen within a race in page table update. In the latter
529 case just flush. */
530
531 pgd = pgd_offset(current->mm ?: &init_mm, address);
532 pgd_ref = pgd_offset_k(address);
533 if (pgd_none(*pgd_ref))
534 return -1;
535 if (pgd_none(*pgd))
536 set_pgd(pgd, *pgd_ref);
537 else
538 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
539
540 /* Below here mismatches are bugs because these lower tables
541 are shared */
542
543 pud = pud_offset(pgd, address);
544 pud_ref = pud_offset(pgd_ref, address);
545 if (pud_none(*pud_ref))
546 return -1;
547 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
548 BUG();
549 pmd = pmd_offset(pud, address);
550 pmd_ref = pmd_offset(pud_ref, address);
551 if (pmd_none(*pmd_ref))
552 return -1;
553 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
554 BUG();
555 pte_ref = pte_offset_kernel(pmd_ref, address);
556 if (!pte_present(*pte_ref))
557 return -1;
558 pte = pte_offset_kernel(pmd, address);
559 /* Don't use pte_page here, because the mappings can point
560 outside mem_map, and the NUMA hash lookup cannot handle
561 that. */
562 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
563 BUG();
564 return 0;
565#endif
566}
567
568int show_unhandled_signals = 1;
569
570/*
571 * This routine handles page faults. It determines the address,
572 * and the problem, and then passes it off to one of the appropriate
573 * routines.
574 */
575#ifdef CONFIG_X86_64
576asmlinkage
577#endif
578void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
579{
580 struct task_struct *tsk;
581 struct mm_struct *mm;
582 struct vm_area_struct *vma;
583 unsigned long address;
584 int write, si_code;
585 int fault;
586#ifdef CONFIG_X86_64
587 unsigned long flags;
588#endif
589
590 /*
591 * We can fault from pretty much anywhere, with unknown IRQ state.
592 */
593 trace_hardirqs_fixup();
594
595 tsk = current;
596 mm = tsk->mm;
597 prefetchw(&mm->mmap_sem);
598
599 /* get the address */
600 address = read_cr2();
601
602 si_code = SEGV_MAPERR;
603
604 if (notify_page_fault(regs))
605 return;
606
607 /*
608 * We fault-in kernel-space virtual memory on-demand. The
609 * 'reference' page table is init_mm.pgd.
610 *
611 * NOTE! We MUST NOT take any locks for this case. We may
612 * be in an interrupt or a critical region, and should
613 * only copy the information from the master page table,
614 * nothing more.
615 *
616 * This verifies that the fault happens in kernel space
617 * (error_code & 4) == 0, and that the fault was not a
618 * protection error (error_code & 9) == 0.
619 */
620#ifdef CONFIG_X86_32
621 if (unlikely(address >= TASK_SIZE)) {
622#else
623 if (unlikely(address >= TASK_SIZE64)) {
624#endif
625 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
626 vmalloc_fault(address) >= 0)
627 return;
628
629 /* Can handle a stale RO->RW TLB */
630 if (spurious_fault(address, error_code))
631 return;
632
633 /*
634 * Don't take the mm semaphore here. If we fixup a prefetch
635 * fault we could otherwise deadlock.
636 */
637 goto bad_area_nosemaphore;
638 }
639
640
641#ifdef CONFIG_X86_32
642 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
643 fault has been handled. */
644 if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
645 local_irq_enable();
646
647 /*
648 * If we're in an interrupt, have no user context or are running in an
649 * atomic region then we must not take the fault.
650 */
651 if (in_atomic() || !mm)
652 goto bad_area_nosemaphore;
653#else /* CONFIG_X86_64 */
654 if (likely(regs->flags & X86_EFLAGS_IF))
655 local_irq_enable();
656
657 if (unlikely(error_code & PF_RSVD))
658 pgtable_bad(address, regs, error_code);
659
660 /*
661 * If we're in an interrupt, have no user context or are running in an
662 * atomic region then we must not take the fault.
663 */
664 if (unlikely(in_atomic() || !mm))
665 goto bad_area_nosemaphore;
666
667 /*
668 * User-mode registers count as a user access even for any
669 * potential system fault or CPU buglet.
670 */
671 if (user_mode_vm(regs))
672 error_code |= PF_USER;
673again:
674#endif
675 /* When running in the kernel we expect faults to occur only to
676 * addresses in user space. All other faults represent errors in the
677 * kernel and should generate an OOPS. Unfortunately, in the case of an
678 * erroneous fault occurring in a code path which already holds mmap_sem
679 * we will deadlock attempting to validate the fault against the
680 * address space. Luckily the kernel only validly references user
681 * space from well defined areas of code, which are listed in the
682 * exceptions table.
683 *
684 * As the vast majority of faults will be valid we will only perform
685 * the source reference check when there is a possibility of a deadlock.
686 * Attempt to lock the address space, if we cannot we then validate the
687 * source. If this is invalid we can skip the address space check,
688 * thus avoiding the deadlock.
689 */
690 if (!down_read_trylock(&mm->mmap_sem)) {
691 if ((error_code & PF_USER) == 0 &&
692 !search_exception_tables(regs->ip))
693 goto bad_area_nosemaphore;
694 down_read(&mm->mmap_sem);
695 }
696
697 vma = find_vma(mm, address);
698 if (!vma)
699 goto bad_area;
700 if (vma->vm_start <= address)
701 goto good_area;
702 if (!(vma->vm_flags & VM_GROWSDOWN))
703 goto bad_area;
704 if (error_code & PF_USER) {
705 /*
706 * Accessing the stack below %sp is always a bug.
707 * The large cushion allows instructions like enter
708 * and pusha to work. ("enter $65535,$31" pushes
709 * 32 pointers and then decrements %sp by 65535.)
710 */
711 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
712 goto bad_area;
713 }
714 if (expand_stack(vma, address))
715 goto bad_area;
716/*
717 * Ok, we have a good vm_area for this memory access, so
718 * we can handle it..
719 */
720good_area:
721 si_code = SEGV_ACCERR;
722 write = 0;
723 switch (error_code & (PF_PROT|PF_WRITE)) {
724 default: /* 3: write, present */
725 /* fall through */
726 case PF_WRITE: /* write, not present */
727 if (!(vma->vm_flags & VM_WRITE))
728 goto bad_area;
729 write++;
730 break;
731 case PF_PROT: /* read, present */
732 goto bad_area;
733 case 0: /* read, not present */
734 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
735 goto bad_area;
736 }
737
738#ifdef CONFIG_X86_32
739survive:
740#endif
741 /*
742 * If for any reason at all we couldn't handle the fault,
743 * make sure we exit gracefully rather than endlessly redo
744 * the fault.
745 */
746 fault = handle_mm_fault(mm, vma, address, write);
747 if (unlikely(fault & VM_FAULT_ERROR)) {
748 if (fault & VM_FAULT_OOM)
749 goto out_of_memory;
750 else if (fault & VM_FAULT_SIGBUS)
751 goto do_sigbus;
752 BUG();
753 }
754 if (fault & VM_FAULT_MAJOR)
755 tsk->maj_flt++;
756 else
757 tsk->min_flt++;
758
759#ifdef CONFIG_X86_32
760 /*
761 * Did it hit the DOS screen memory VA from vm86 mode?
762 */
763 if (v8086_mode(regs)) {
764 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
765 if (bit < 32)
766 tsk->thread.screen_bitmap |= 1 << bit;
767 }
768#endif
769 up_read(&mm->mmap_sem);
770 return;
771
772/*
773 * Something tried to access memory that isn't in our memory map..
774 * Fix it, but check if it's kernel or user first..
775 */
776bad_area:
777 up_read(&mm->mmap_sem);
778
779bad_area_nosemaphore:
780 /* User mode accesses just cause a SIGSEGV */
781 if (error_code & PF_USER) {
782 /*
783 * It's possible to have interrupts off here.
784 */
785 local_irq_enable();
786
787 /*
788 * Valid to do another page fault here because this one came
789 * from user space.
790 */
791 if (is_prefetch(regs, address, error_code))
792 return;
793
794 if (is_errata100(regs, address))
795 return;
796
797 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
798 printk_ratelimit()) {
799 printk(
800#ifdef CONFIG_X86_32
801 "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
802#else
803 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
804#endif
805 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
806 tsk->comm, task_pid_nr(tsk), address, regs->ip,
807 regs->sp, error_code);
808 print_vma_addr(" in ", regs->ip);
809 printk("\n");
810 }
811
812 tsk->thread.cr2 = address;
813 /* Kernel addresses are always protection faults */
814 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
815 tsk->thread.trap_no = 14;
816 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
817 return;
818 }
819
820 if (is_f00f_bug(regs, address))
821 return;
822
823no_context:
824 /* Are we prepared to handle this kernel fault? */
825 if (fixup_exception(regs))
826 return;
827
828 /*
829 * X86_32
830 * Valid to do another page fault here, because if this fault
831 * had been triggered by is_prefetch fixup_exception would have
832 * handled it.
833 *
834 * X86_64
835 * Hall of shame of CPU/BIOS bugs.
836 */
837 if (is_prefetch(regs, address, error_code))
838 return;
839
840 if (is_errata93(regs, address))
841 return;
842
843/*
844 * Oops. The kernel tried to access some bad page. We'll have to
845 * terminate things with extreme prejudice.
846 */
847#ifdef CONFIG_X86_32
848 bust_spinlocks(1);
849#else
850 flags = oops_begin();
851#endif
852
853 show_fault_oops(regs, error_code, address);
854
855 tsk->thread.cr2 = address;
856 tsk->thread.trap_no = 14;
857 tsk->thread.error_code = error_code;
858
859#ifdef CONFIG_X86_32
860 die("Oops", regs, error_code);
861 bust_spinlocks(0);
862 do_exit(SIGKILL);
863#else
864 if (__die("Oops", regs, error_code))
865 regs = NULL;
866 /* Executive summary in case the body of the oops scrolled away */
867 printk(KERN_EMERG "CR2: %016lx\n", address);
868 oops_end(flags, regs, SIGKILL);
869#endif
870
871/*
872 * We ran out of memory, or some other thing happened to us that made
873 * us unable to handle the page fault gracefully.
874 */
875out_of_memory:
876 up_read(&mm->mmap_sem);
877 if (is_global_init(tsk)) {
878 yield();
879#ifdef CONFIG_X86_32
880 down_read(&mm->mmap_sem);
881 goto survive;
882#else
883 goto again;
884#endif
885 }
886
887 printk("VM: killing process %s\n", tsk->comm);
888 if (error_code & PF_USER)
889 do_group_exit(SIGKILL);
890 goto no_context;
891
892do_sigbus:
893 up_read(&mm->mmap_sem);
894
895 /* Kernel mode? Handle exceptions or die */
896 if (!(error_code & PF_USER))
897 goto no_context;
898#ifdef CONFIG_X86_32
899 /* User space => ok to do another page fault */
900 if (is_prefetch(regs, address, error_code))
901 return;
902#endif
903 tsk->thread.cr2 = address;
904 tsk->thread.error_code = error_code;
905 tsk->thread.trap_no = 14;
906 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
907}
908
909DEFINE_SPINLOCK(pgd_lock);
910LIST_HEAD(pgd_list);
911
912void vmalloc_sync_all(void)
913{
914#ifdef CONFIG_X86_32
915 /*
916 * Note that races in the updates of insync and start aren't
917 * problematic: insync can only get set bits added, and updates to
918 * start are only improving performance (without affecting correctness
919 * if undone).
920 */
921 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
922 static unsigned long start = TASK_SIZE;
923 unsigned long address;
924
925 if (SHARED_KERNEL_PMD)
926 return;
927
928 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
929 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
930 if (!test_bit(pgd_index(address), insync)) {
931 unsigned long flags;
932 struct page *page;
933
934 spin_lock_irqsave(&pgd_lock, flags);
935 list_for_each_entry(page, &pgd_list, lru) {
936 if (!vmalloc_sync_one(page_address(page),
937 address))
938 break;
939 }
940 spin_unlock_irqrestore(&pgd_lock, flags);
941 if (!page)
942 set_bit(pgd_index(address), insync);
943 }
944 if (address == start && test_bit(pgd_index(address), insync))
945 start = address + PGDIR_SIZE;
946 }
947#else /* CONFIG_X86_64 */
948 /*
949 * Note that races in the updates of insync and start aren't
950 * problematic: insync can only get set bits added, and updates to
951 * start are only improving performance (without affecting correctness
952 * if undone).
953 */
954 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
955 static unsigned long start = VMALLOC_START & PGDIR_MASK;
956 unsigned long address;
957
958 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
959 if (!test_bit(pgd_index(address), insync)) {
960 const pgd_t *pgd_ref = pgd_offset_k(address);
961 unsigned long flags;
962 struct page *page;
963
964 if (pgd_none(*pgd_ref))
965 continue;
966 spin_lock_irqsave(&pgd_lock, flags);
967 list_for_each_entry(page, &pgd_list, lru) {
968 pgd_t *pgd;
969 pgd = (pgd_t *)page_address(page) + pgd_index(address);
970 if (pgd_none(*pgd))
971 set_pgd(pgd, *pgd_ref);
972 else
973 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
974 }
975 spin_unlock_irqrestore(&pgd_lock, flags);
976 set_bit(pgd_index(address), insync);
977 }
978 if (address == start)
979 start = address + PGDIR_SIZE;
980 }
981 /* Check that there is no need to do the same for the modules area. */
982 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
983 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
984 (__START_KERNEL & PGDIR_MASK)));
985#endif
986}
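
The PF_* bits defined at the top of the new fault.c describe the hardware page-fault error code, and decoding that bitmask is usually the first step when reading one of the oopses this file prints. The following is a small stand-alone C sketch of such a decoder; the helper name and sample values are made up for illustration.

#include <stdio.h>

/* Same bit layout as the error code documented at the top of
 * arch/x86/mm/fault.c. */
#define PF_PROT		(1 << 0)	/* 0: no page found, 1: protection fault */
#define PF_WRITE	(1 << 1)	/* 0: read access,   1: write access */
#define PF_USER		(1 << 2)	/* 0: kernel mode,   1: user mode */
#define PF_RSVD		(1 << 3)	/* reserved bit set in a page-table entry */
#define PF_INSTR	(1 << 4)	/* fault was an instruction fetch */

/* Illustrative helper, not a kernel function. */
static void decode_pf_error_code(unsigned long error_code)
{
	printf("error %#lx: %s of a %s page in %s mode%s%s\n",
	       error_code,
	       (error_code & PF_WRITE) ? "write" : "read",
	       (error_code & PF_PROT) ? "present (protection fault)" : "not-present",
	       (error_code & PF_USER) ? "user" : "kernel",
	       (error_code & PF_RSVD) ? ", reserved bit set" : "",
	       (error_code & PF_INSTR) ? ", instruction fetch" : "");
}

int main(void)
{
	decode_pf_error_code(0x6);	/* sample: user-mode write to a not-present page */
	decode_pf_error_code(0x11);	/* sample: kernel instruction fetch hitting NX */
	return 0;
}
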
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
deleted file mode 100644
index a2273d44aa2..00000000000
--- a/arch/x86/mm/fault_32.c
+++ /dev/null
@@ -1,659 +0,0 @@
1/*
2 * linux/arch/i386/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 */
6
7#include <linux/signal.h>
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/string.h>
12#include <linux/types.h>
13#include <linux/ptrace.h>
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/interrupt.h>
18#include <linux/init.h>
19#include <linux/tty.h>
20#include <linux/vt_kern.h> /* For unblank_screen() */
21#include <linux/highmem.h>
22#include <linux/bootmem.h> /* for max_low_pfn */
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28#include <linux/kprobes.h>
29
30#include <asm/system.h>
31#include <asm/desc.h>
32#include <asm/segment.h>
33
34extern void die(const char *,struct pt_regs *,long);
35
36#ifdef CONFIG_KPROBES
37static inline int notify_page_fault(struct pt_regs *regs)
38{
39 int ret = 0;
40
41 /* kprobe_running() needs smp_processor_id() */
42 if (!user_mode_vm(regs)) {
43 preempt_disable();
44 if (kprobe_running() && kprobe_fault_handler(regs, 14))
45 ret = 1;
46 preempt_enable();
47 }
48
49 return ret;
50}
51#else
52static inline int notify_page_fault(struct pt_regs *regs)
53{
54 return 0;
55}
56#endif
57
58/*
59 * Return EIP plus the CS segment base. The segment limit is also
60 * adjusted, clamped to the kernel/user address space (whichever is
61 * appropriate), and returned in *eip_limit.
62 *
63 * The segment is checked, because it might have been changed by another
64 * task between the original faulting instruction and here.
65 *
66 * If CS is no longer a valid code segment, or if EIP is beyond the
67 * limit, or if it is a kernel address when CS is not a kernel segment,
68 * then the returned value will be greater than *eip_limit.
69 *
70 * This is slow, but is very rarely executed.
71 */
72static inline unsigned long get_segment_eip(struct pt_regs *regs,
73 unsigned long *eip_limit)
74{
75 unsigned long eip = regs->eip;
76 unsigned seg = regs->xcs & 0xffff;
77 u32 seg_ar, seg_limit, base, *desc;
78
79 /* Unlikely, but must come before segment checks. */
80 if (unlikely(regs->eflags & VM_MASK)) {
81 base = seg << 4;
82 *eip_limit = base + 0xffff;
83 return base + (eip & 0xffff);
84 }
85
86 /* The standard kernel/user address space limit. */
87 *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
88
89 /* By far the most common cases. */
90 if (likely(SEGMENT_IS_FLAT_CODE(seg)))
91 return eip;
92
93 /* Check the segment exists, is within the current LDT/GDT size,
94 that kernel/user (ring 0..3) has the appropriate privilege,
95 that it's a code segment, and get the limit. */
96 __asm__ ("larl %3,%0; lsll %3,%1"
97 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
98 if ((~seg_ar & 0x9800) || eip > seg_limit) {
99 *eip_limit = 0;
100 return 1; /* So that returned eip > *eip_limit. */
101 }
102
103 /* Get the GDT/LDT descriptor base.
104 When you look for races in this code remember that
105 LDT and other horrors are only used in user space. */
106 if (seg & (1<<2)) {
107 /* Must lock the LDT while reading it. */
108 mutex_lock(&current->mm->context.lock);
109 desc = current->mm->context.ldt;
110 desc = (void *)desc + (seg & ~7);
111 } else {
112 /* Must disable preemption while reading the GDT. */
113 desc = (u32 *)get_cpu_gdt_table(get_cpu());
114 desc = (void *)desc + (seg & ~7);
115 }
116
117 /* Decode the code segment base from the descriptor */
118 base = get_desc_base((unsigned long *)desc);
119
120 if (seg & (1<<2)) {
121 mutex_unlock(&current->mm->context.lock);
122 } else
123 put_cpu();
124
125 /* Adjust EIP and segment limit, and clamp at the kernel limit.
126 It's legitimate for segments to wrap at 0xffffffff. */
127 seg_limit += base;
128 if (seg_limit < *eip_limit && seg_limit >= base)
129 *eip_limit = seg_limit;
130 return eip + base;
131}
132
133/*
134 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
135 * Check that here and ignore it.
136 */
137static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
138{
139 unsigned long limit;
140 unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
141 int scan_more = 1;
142 int prefetch = 0;
143 int i;
144
145 for (i = 0; scan_more && i < 15; i++) {
146 unsigned char opcode;
147 unsigned char instr_hi;
148 unsigned char instr_lo;
149
150 if (instr > (unsigned char *)limit)
151 break;
152 if (probe_kernel_address(instr, opcode))
153 break;
154
155 instr_hi = opcode & 0xf0;
156 instr_lo = opcode & 0x0f;
157 instr++;
158
159 switch (instr_hi) {
160 case 0x20:
161 case 0x30:
162 /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
163 scan_more = ((instr_lo & 7) == 0x6);
164 break;
165
166 case 0x60:
167 /* 0x64 thru 0x67 are valid prefixes in all modes. */
168 scan_more = (instr_lo & 0xC) == 0x4;
169 break;
170 case 0xF0:
171 /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
172 scan_more = !instr_lo || (instr_lo>>1) == 1;
173 break;
174 case 0x00:
175 /* Prefetch instruction is 0x0F0D or 0x0F18 */
176 scan_more = 0;
177 if (instr > (unsigned char *)limit)
178 break;
179 if (probe_kernel_address(instr, opcode))
180 break;
181 prefetch = (instr_lo == 0xF) &&
182 (opcode == 0x0D || opcode == 0x18);
183 break;
184 default:
185 scan_more = 0;
186 break;
187 }
188 }
189 return prefetch;
190}
191
192static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
193 unsigned long error_code)
194{
195 if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
196 boot_cpu_data.x86 >= 6)) {
197 /* Catch an obscure case of prefetch inside an NX page. */
198 if (nx_enabled && (error_code & 16))
199 return 0;
200 return __is_prefetch(regs, addr);
201 }
202 return 0;
203}
204
205static noinline void force_sig_info_fault(int si_signo, int si_code,
206 unsigned long address, struct task_struct *tsk)
207{
208 siginfo_t info;
209
210 info.si_signo = si_signo;
211 info.si_errno = 0;
212 info.si_code = si_code;
213 info.si_addr = (void __user *)address;
214 force_sig_info(si_signo, &info, tsk);
215}
216
217fastcall void do_invalid_op(struct pt_regs *, unsigned long);
218
219static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
220{
221 unsigned index = pgd_index(address);
222 pgd_t *pgd_k;
223 pud_t *pud, *pud_k;
224 pmd_t *pmd, *pmd_k;
225
226 pgd += index;
227 pgd_k = init_mm.pgd + index;
228
229 if (!pgd_present(*pgd_k))
230 return NULL;
231
232 /*
233 * set_pgd(pgd, *pgd_k); here would be useless on PAE
234 * and redundant with the set_pmd() on non-PAE. As would
235 * set_pud.
236 */
237
238 pud = pud_offset(pgd, address);
239 pud_k = pud_offset(pgd_k, address);
240 if (!pud_present(*pud_k))
241 return NULL;
242
243 pmd = pmd_offset(pud, address);
244 pmd_k = pmd_offset(pud_k, address);
245 if (!pmd_present(*pmd_k))
246 return NULL;
247 if (!pmd_present(*pmd)) {
248 set_pmd(pmd, *pmd_k);
249 arch_flush_lazy_mmu_mode();
250 } else
251 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
252 return pmd_k;
253}
254
255/*
256 * Handle a fault on the vmalloc or module mapping area
257 *
258 * This assumes no large pages in there.
259 */
260static inline int vmalloc_fault(unsigned long address)
261{
262 unsigned long pgd_paddr;
263 pmd_t *pmd_k;
264 pte_t *pte_k;
265 /*
266 * Synchronize this task's top level page-table
267 * with the 'reference' page table.
268 *
269 * Do _not_ use "current" here. We might be inside
270 * an interrupt in the middle of a task switch..
271 */
272 pgd_paddr = read_cr3();
273 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
274 if (!pmd_k)
275 return -1;
276 pte_k = pte_offset_kernel(pmd_k, address);
277 if (!pte_present(*pte_k))
278 return -1;
279 return 0;
280}
281
282int show_unhandled_signals = 1;
283
284/*
285 * This routine handles page faults. It determines the address,
286 * and the problem, and then passes it off to one of the appropriate
287 * routines.
288 *
289 * error_code:
290 * bit 0 == 0 means no page found, 1 means protection fault
291 * bit 1 == 0 means read, 1 means write
292 * bit 2 == 0 means kernel, 1 means user-mode
293 * bit 3 == 1 means use of reserved bit detected
294 * bit 4 == 1 means fault was an instruction fetch
295 */
296fastcall void __kprobes do_page_fault(struct pt_regs *regs,
297 unsigned long error_code)
298{
299 struct task_struct *tsk;
300 struct mm_struct *mm;
301 struct vm_area_struct * vma;
302 unsigned long address;
303 int write, si_code;
304 int fault;
305
306 /*
307 * We can fault from pretty much anywhere, with unknown IRQ state.
308 */
309 trace_hardirqs_fixup();
310
311 /* get the address */
312 address = read_cr2();
313
314 tsk = current;
315
316 si_code = SEGV_MAPERR;
317
318 /*
319 * We fault-in kernel-space virtual memory on-demand. The
320 * 'reference' page table is init_mm.pgd.
321 *
322 * NOTE! We MUST NOT take any locks for this case. We may
323 * be in an interrupt or a critical region, and should
324 * only copy the information from the master page table,
325 * nothing more.
326 *
327 * This verifies that the fault happens in kernel space
328 * (error_code & 4) == 0, and that the fault was not a
329 * protection error (error_code & 9) == 0.
330 */
331 if (unlikely(address >= TASK_SIZE)) {
332 if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
333 return;
334 if (notify_page_fault(regs))
335 return;
336 /*
337 * Don't take the mm semaphore here. If we fixup a prefetch
338 * fault we could otherwise deadlock.
339 */
340 goto bad_area_nosemaphore;
341 }
342
343 if (notify_page_fault(regs))
344 return;
345
346 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
347 fault has been handled. */
348 if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
349 local_irq_enable();
350
351 mm = tsk->mm;
352
353 /*
354 * If we're in an interrupt, have no user context or are running in an
355 * atomic region then we must not take the fault..
356 */
357 if (in_atomic() || !mm)
358 goto bad_area_nosemaphore;
359
360 /* When running in the kernel we expect faults to occur only to
361 * addresses in user space. All other faults represent errors in the
362 * kernel and should generate an OOPS. Unfortunately, in the case of an
363 * erroneous fault occurring in a code path which already holds mmap_sem
364 * we will deadlock attempting to validate the fault against the
365 * address space. Luckily the kernel only validly references user
366 * space from well defined areas of code, which are listed in the
367 * exceptions table.
368 *
369 * As the vast majority of faults will be valid we will only perform
370 * the source reference check when there is a possibility of a deadlock.
371 * Attempt to lock the address space, if we cannot we then validate the
372 * source. If this is invalid we can skip the address space check,
373 * thus avoiding the deadlock.
374 */
375 if (!down_read_trylock(&mm->mmap_sem)) {
376 if ((error_code & 4) == 0 &&
377 !search_exception_tables(regs->eip))
378 goto bad_area_nosemaphore;
379 down_read(&mm->mmap_sem);
380 }
381
382 vma = find_vma(mm, address);
383 if (!vma)
384 goto bad_area;
385 if (vma->vm_start <= address)
386 goto good_area;
387 if (!(vma->vm_flags & VM_GROWSDOWN))
388 goto bad_area;
389 if (error_code & 4) {
390 /*
391 * Accessing the stack below %esp is always a bug.
392 * The large cushion allows instructions like enter
393 * and pusha to work. ("enter $65535,$31" pushes
394 * 32 pointers and then decrements %esp by 65535.)
395 */
396 if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
397 goto bad_area;
398 }
399 if (expand_stack(vma, address))
400 goto bad_area;
401/*
402 * Ok, we have a good vm_area for this memory access, so
403 * we can handle it..
404 */
405good_area:
406 si_code = SEGV_ACCERR;
407 write = 0;
408 switch (error_code & 3) {
409 default: /* 3: write, present */
410 /* fall through */
411 case 2: /* write, not present */
412 if (!(vma->vm_flags & VM_WRITE))
413 goto bad_area;
414 write++;
415 break;
416 case 1: /* read, present */
417 goto bad_area;
418 case 0: /* read, not present */
419 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
420 goto bad_area;
421 }
422
423 survive:
424 /*
425 * If for any reason at all we couldn't handle the fault,
426 * make sure we exit gracefully rather than endlessly redo
427 * the fault.
428 */
429 fault = handle_mm_fault(mm, vma, address, write);
430 if (unlikely(fault & VM_FAULT_ERROR)) {
431 if (fault & VM_FAULT_OOM)
432 goto out_of_memory;
433 else if (fault & VM_FAULT_SIGBUS)
434 goto do_sigbus;
435 BUG();
436 }
437 if (fault & VM_FAULT_MAJOR)
438 tsk->maj_flt++;
439 else
440 tsk->min_flt++;
441
442 /*
443 * Did it hit the DOS screen memory VA from vm86 mode?
444 */
445 if (regs->eflags & VM_MASK) {
446 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
447 if (bit < 32)
448 tsk->thread.screen_bitmap |= 1 << bit;
449 }
450 up_read(&mm->mmap_sem);
451 return;
452
453/*
454 * Something tried to access memory that isn't in our memory map..
455 * Fix it, but check if it's kernel or user first..
456 */
457bad_area:
458 up_read(&mm->mmap_sem);
459
460bad_area_nosemaphore:
461 /* User mode accesses just cause a SIGSEGV */
462 if (error_code & 4) {
463 /*
464 * It's possible to have interrupts off here.
465 */
466 local_irq_enable();
467
468 /*
469 * Valid to do another page fault here because this one came
470 * from user space.
471 */
472 if (is_prefetch(regs, address, error_code))
473 return;
474
475 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
476 printk_ratelimit()) {
477 printk("%s%s[%d]: segfault at %08lx eip %08lx "
478 "esp %08lx error %lx\n",
479 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
480 tsk->comm, task_pid_nr(tsk), address, regs->eip,
481 regs->esp, error_code);
482 }
483 tsk->thread.cr2 = address;
484 /* Kernel addresses are always protection faults */
485 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
486 tsk->thread.trap_no = 14;
487 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
488 return;
489 }
490
491#ifdef CONFIG_X86_F00F_BUG
492 /*
493 * Pentium F0 0F C7 C8 bug workaround.
494 */
495 if (boot_cpu_data.f00f_bug) {
496 unsigned long nr;
497
498 nr = (address - idt_descr.address) >> 3;
499
500 if (nr == 6) {
501 do_invalid_op(regs, 0);
502 return;
503 }
504 }
505#endif
506
507no_context:
508 /* Are we prepared to handle this kernel fault? */
509 if (fixup_exception(regs))
510 return;
511
512 /*
513 * Valid to do another page fault here, because if this fault
514 * had been triggered by is_prefetch fixup_exception would have
515 * handled it.
516 */
517 if (is_prefetch(regs, address, error_code))
518 return;
519
520/*
521 * Oops. The kernel tried to access some bad page. We'll have to
522 * terminate things with extreme prejudice.
523 */
524
525 bust_spinlocks(1);
526
527 if (oops_may_print()) {
528 __typeof__(pte_val(__pte(0))) page;
529
530#ifdef CONFIG_X86_PAE
531 if (error_code & 16) {
532 pte_t *pte = lookup_address(address);
533
534 if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
535 printk(KERN_CRIT "kernel tried to execute "
536 "NX-protected page - exploit attempt? "
537 "(uid: %d)\n", current->uid);
538 }
539#endif
540 if (address < PAGE_SIZE)
541 printk(KERN_ALERT "BUG: unable to handle kernel NULL "
542 "pointer dereference");
543 else
544 printk(KERN_ALERT "BUG: unable to handle kernel paging"
545 " request");
546 printk(" at virtual address %08lx\n",address);
547 printk(KERN_ALERT "printing eip: %08lx ", regs->eip);
548
549 page = read_cr3();
550 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
551#ifdef CONFIG_X86_PAE
552 printk("*pdpt = %016Lx ", page);
553 if ((page >> PAGE_SHIFT) < max_low_pfn
554 && page & _PAGE_PRESENT) {
555 page &= PAGE_MASK;
556 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
557 & (PTRS_PER_PMD - 1)];
558 printk(KERN_CONT "*pde = %016Lx ", page);
559 page &= ~_PAGE_NX;
560 }
561#else
562 printk("*pde = %08lx ", page);
563#endif
564
565 /*
566 * We must not directly access the pte in the highpte
567 * case if the page table is located in highmem.
568 * And let's rather not kmap-atomic the pte, just in case
569 * it's allocated already.
570 */
571 if ((page >> PAGE_SHIFT) < max_low_pfn
572 && (page & _PAGE_PRESENT)
573 && !(page & _PAGE_PSE)) {
574 page &= PAGE_MASK;
575 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
576 & (PTRS_PER_PTE - 1)];
577 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
578 }
579
580 printk("\n");
581 }
582
583 tsk->thread.cr2 = address;
584 tsk->thread.trap_no = 14;
585 tsk->thread.error_code = error_code;
586 die("Oops", regs, error_code);
587 bust_spinlocks(0);
588 do_exit(SIGKILL);
589
590/*
591 * We ran out of memory, or some other thing happened to us that made
592 * us unable to handle the page fault gracefully.
593 */
594out_of_memory:
595 up_read(&mm->mmap_sem);
596 if (is_global_init(tsk)) {
597 yield();
598 down_read(&mm->mmap_sem);
599 goto survive;
600 }
601 printk("VM: killing process %s\n", tsk->comm);
602 if (error_code & 4)
603 do_group_exit(SIGKILL);
604 goto no_context;
605
606do_sigbus:
607 up_read(&mm->mmap_sem);
608
609 /* Kernel mode? Handle exceptions or die */
610 if (!(error_code & 4))
611 goto no_context;
612
613 /* User space => ok to do another page fault */
614 if (is_prefetch(regs, address, error_code))
615 return;
616
617 tsk->thread.cr2 = address;
618 tsk->thread.error_code = error_code;
619 tsk->thread.trap_no = 14;
620 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
621}
622
623void vmalloc_sync_all(void)
624{
625 /*
626 * Note that races in the updates of insync and start aren't
627 * problematic: insync can only get set bits added, and updates to
628 * start are only improving performance (without affecting correctness
629 * if undone).
630 */
631 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
632 static unsigned long start = TASK_SIZE;
633 unsigned long address;
634
635 if (SHARED_KERNEL_PMD)
636 return;
637
638 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
639 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
640 if (!test_bit(pgd_index(address), insync)) {
641 unsigned long flags;
642 struct page *page;
643
644 spin_lock_irqsave(&pgd_lock, flags);
645 for (page = pgd_list; page; page =
646 (struct page *)page->index)
647 if (!vmalloc_sync_one(page_address(page),
648 address)) {
649 BUG_ON(page != pgd_list);
650 break;
651 }
652 spin_unlock_irqrestore(&pgd_lock, flags);
653 if (!page)
654 set_bit(pgd_index(address), insync);
655 }
656 if (address == start && test_bit(pgd_index(address), insync))
657 start = address + PGDIR_SIZE;
658 }
659}
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
deleted file mode 100644
index 0e26230669c..00000000000
--- a/arch/x86/mm/fault_64.c
+++ /dev/null
@@ -1,623 +0,0 @@
1/*
2 * linux/arch/x86-64/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
6 */
7
8#include <linux/signal.h>
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/errno.h>
12#include <linux/string.h>
13#include <linux/types.h>
14#include <linux/ptrace.h>
15#include <linux/mman.h>
16#include <linux/mm.h>
17#include <linux/smp.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/tty.h>
21#include <linux/vt_kern.h> /* For unblank_screen() */
22#include <linux/compiler.h>
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28#include <linux/kprobes.h>
29
30#include <asm/system.h>
31#include <asm/pgalloc.h>
32#include <asm/smp.h>
33#include <asm/tlbflush.h>
34#include <asm/proto.h>
35#include <asm-generic/sections.h>
36
37/* Page fault error code bits */
38#define PF_PROT (1<<0) /* or no page found */
39#define PF_WRITE (1<<1)
40#define PF_USER (1<<2)
41#define PF_RSVD (1<<3)
42#define PF_INSTR (1<<4)
43
44#ifdef CONFIG_KPROBES
45static inline int notify_page_fault(struct pt_regs *regs)
46{
47 int ret = 0;
48
49 /* kprobe_running() needs smp_processor_id() */
50 if (!user_mode(regs)) {
51 preempt_disable();
52 if (kprobe_running() && kprobe_fault_handler(regs, 14))
53 ret = 1;
54 preempt_enable();
55 }
56
57 return ret;
58}
59#else
60static inline int notify_page_fault(struct pt_regs *regs)
61{
62 return 0;
63}
64#endif
65
66/* Sometimes the CPU reports invalid exceptions on prefetch.
67 Check that here and ignore.
68 Opcode checker based on code by Richard Brunner */
69static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
70 unsigned long error_code)
71{
72 unsigned char *instr;
73 int scan_more = 1;
74 int prefetch = 0;
75 unsigned char *max_instr;
76
77 /* If it was a exec fault ignore */
78 if (error_code & PF_INSTR)
79 return 0;
80
81 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
82 max_instr = instr + 15;
83
84 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
85 return 0;
86
87 while (scan_more && instr < max_instr) {
88 unsigned char opcode;
89 unsigned char instr_hi;
90 unsigned char instr_lo;
91
92 if (probe_kernel_address(instr, opcode))
93 break;
94
95 instr_hi = opcode & 0xf0;
96 instr_lo = opcode & 0x0f;
97 instr++;
98
99 switch (instr_hi) {
100 case 0x20:
101 case 0x30:
102 /* Values 0x26,0x2E,0x36,0x3E are valid x86
103 prefixes. In long mode, the CPU will signal
104 invalid opcode if some of these prefixes are
105 present so we will never get here anyway */
106 scan_more = ((instr_lo & 7) == 0x6);
107 break;
108
109 case 0x40:
110 /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
111 Need to figure out under what instruction mode the
112 instruction was issued ... */
113 /* Could check the LDT for lm, but for now it's good
114 enough to assume that long mode only uses well known
115 segments or kernel. */
116 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
117 break;
118
119 case 0x60:
120 /* 0x64 thru 0x67 are valid prefixes in all modes. */
121 scan_more = (instr_lo & 0xC) == 0x4;
122 break;
123 case 0xF0:
124 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
125 scan_more = !instr_lo || (instr_lo>>1) == 1;
126 break;
127 case 0x00:
128 /* Prefetch instruction is 0x0F0D or 0x0F18 */
129 scan_more = 0;
130 if (probe_kernel_address(instr, opcode))
131 break;
132 prefetch = (instr_lo == 0xF) &&
133 (opcode == 0x0D || opcode == 0x18);
134 break;
135 default:
136 scan_more = 0;
137 break;
138 }
139 }
140 return prefetch;
141}
142
143static int bad_address(void *p)
144{
145 unsigned long dummy;
146 return probe_kernel_address((unsigned long *)p, dummy);
147}
148
149void dump_pagetable(unsigned long address)
150{
151 pgd_t *pgd;
152 pud_t *pud;
153 pmd_t *pmd;
154 pte_t *pte;
155
156 pgd = (pgd_t *)read_cr3();
157
158 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
159 pgd += pgd_index(address);
160 if (bad_address(pgd)) goto bad;
161 printk("PGD %lx ", pgd_val(*pgd));
162 if (!pgd_present(*pgd)) goto ret;
163
164 pud = pud_offset(pgd, address);
165 if (bad_address(pud)) goto bad;
166 printk("PUD %lx ", pud_val(*pud));
167 if (!pud_present(*pud)) goto ret;
168
169 pmd = pmd_offset(pud, address);
170 if (bad_address(pmd)) goto bad;
171 printk("PMD %lx ", pmd_val(*pmd));
172 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
173
174 pte = pte_offset_kernel(pmd, address);
175 if (bad_address(pte)) goto bad;
176 printk("PTE %lx", pte_val(*pte));
177ret:
178 printk("\n");
179 return;
180bad:
181 printk("BAD\n");
182}
183
184static const char errata93_warning[] =
185KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
186KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
187KERN_ERR "******* Please consider a BIOS update.\n"
188KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
189
190/* Workaround for K8 erratum #93 & buggy BIOS.
191 BIOS SMM functions are required to use a specific workaround
192 to avoid corruption of the 64bit RIP register on C stepping K8.
193 A lot of BIOS that didn't get tested properly miss this.
194 The OS sees this as a page fault with the upper 32bits of RIP cleared.
195 Try to work around it here.
196 Note we only handle faults in kernel here. */
197
198static int is_errata93(struct pt_regs *regs, unsigned long address)
199{
200 static int warned;
201 if (address != regs->rip)
202 return 0;
203 if ((address >> 32) != 0)
204 return 0;
205 address |= 0xffffffffUL << 32;
206 if ((address >= (u64)_stext && address <= (u64)_etext) ||
207 (address >= MODULES_VADDR && address <= MODULES_END)) {
208 if (!warned) {
209 printk(errata93_warning);
210 warned = 1;
211 }
212 regs->rip = address;
213 return 1;
214 }
215 return 0;
216}
217
218static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
219 unsigned long error_code)
220{
221 unsigned long flags = oops_begin();
222 struct task_struct *tsk;
223
224 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
225 current->comm, address);
226 dump_pagetable(address);
227 tsk = current;
228 tsk->thread.cr2 = address;
229 tsk->thread.trap_no = 14;
230 tsk->thread.error_code = error_code;
231 __die("Bad pagetable", regs, error_code);
232 oops_end(flags);
233 do_exit(SIGKILL);
234}
235
236/*
237 * Handle a fault on the vmalloc area
238 *
239 * This assumes no large pages in there.
240 */
241static int vmalloc_fault(unsigned long address)
242{
243 pgd_t *pgd, *pgd_ref;
244 pud_t *pud, *pud_ref;
245 pmd_t *pmd, *pmd_ref;
246 pte_t *pte, *pte_ref;
247
248 /* Copy kernel mappings over when needed. This can also
249 happen within a race in page table update. In the later
250 case just flush. */
251
252 pgd = pgd_offset(current->mm ?: &init_mm, address);
253 pgd_ref = pgd_offset_k(address);
254 if (pgd_none(*pgd_ref))
255 return -1;
256 if (pgd_none(*pgd))
257 set_pgd(pgd, *pgd_ref);
258 else
259 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
260
261 /* Below here mismatches are bugs because these lower tables
262 are shared */
263
264 pud = pud_offset(pgd, address);
265 pud_ref = pud_offset(pgd_ref, address);
266 if (pud_none(*pud_ref))
267 return -1;
268 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
269 BUG();
270 pmd = pmd_offset(pud, address);
271 pmd_ref = pmd_offset(pud_ref, address);
272 if (pmd_none(*pmd_ref))
273 return -1;
274 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
275 BUG();
276 pte_ref = pte_offset_kernel(pmd_ref, address);
277 if (!pte_present(*pte_ref))
278 return -1;
279 pte = pte_offset_kernel(pmd, address);
280 /* Don't use pte_page here, because the mappings can point
281 outside mem_map, and the NUMA hash lookup cannot handle
282 that. */
283 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
284 BUG();
285 return 0;
286}
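/*
 * [Illustrative aside, not part of the original fault_64.c]
 * vmalloc_fault() above is essentially a guarded copy of the reference
 * walk through init_mm's page tables.  A minimal sketch of the same
 * four-level walk used purely as an "is this kernel address mapped?"
 * check -- the accessors are the ones used above; the helper name is
 * hypothetical:
 */
static int kernel_address_is_mapped(unsigned long address)
{
	pgd_t *pgd = pgd_offset_k(address);	/* reference (init_mm) PGD */
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none(*pgd))
		return 0;
	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		return 0;
	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return 1;			/* 2MB mapping, no PTE level */
	return pte_present(*pte_offset_kernel(pmd, address));
}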
287
288int show_unhandled_signals = 1;
289
290/*
291 * This routine handles page faults. It determines the address,
292 * and the problem, and then passes it off to one of the appropriate
293 * routines.
294 */
295asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
296 unsigned long error_code)
297{
298 struct task_struct *tsk;
299 struct mm_struct *mm;
300 struct vm_area_struct * vma;
301 unsigned long address;
302 const struct exception_table_entry *fixup;
303 int write, fault;
304 unsigned long flags;
305 siginfo_t info;
306
307 /*
308 * We can fault from pretty much anywhere, with unknown IRQ state.
309 */
310 trace_hardirqs_fixup();
311
312 tsk = current;
313 mm = tsk->mm;
314 prefetchw(&mm->mmap_sem);
315
316 /* get the address */
317 address = read_cr2();
318
319 info.si_code = SEGV_MAPERR;
320
321
322 /*
323 * We fault-in kernel-space virtual memory on-demand. The
324 * 'reference' page table is init_mm.pgd.
325 *
326 * NOTE! We MUST NOT take any locks for this case. We may
327 * be in an interrupt or a critical region, and should
328 * only copy the information from the master page table,
329 * nothing more.
330 *
331 * This verifies that the fault happens in kernel space
332 * (error_code & 4) == 0, and that the fault was not a
333 * protection error (error_code & 9) == 0.
334 */
335 if (unlikely(address >= TASK_SIZE64)) {
336 /*
337 * Don't check for the module range here: its PML4
338 * is always initialized because it's shared with the main
339 * kernel text. Only vmalloc may need PML4 syncups.
340 */
341 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
342 ((address >= VMALLOC_START && address < VMALLOC_END))) {
343 if (vmalloc_fault(address) >= 0)
344 return;
345 }
346 if (notify_page_fault(regs))
347 return;
348 /*
349 * Don't take the mm semaphore here. If we fixup a prefetch
350 * fault we could otherwise deadlock.
351 */
352 goto bad_area_nosemaphore;
353 }
354
355 if (notify_page_fault(regs))
356 return;
357
358 if (likely(regs->eflags & X86_EFLAGS_IF))
359 local_irq_enable();
360
361 if (unlikely(error_code & PF_RSVD))
362 pgtable_bad(address, regs, error_code);
363
364 /*
365 * If we're in an interrupt or have no user
366 * context, we must not take the fault..
367 */
368 if (unlikely(in_atomic() || !mm))
369 goto bad_area_nosemaphore;
370
371 /*
372 * User-mode registers count as a user access even for any
373 * potential system fault or CPU buglet.
374 */
375 if (user_mode_vm(regs))
376 error_code |= PF_USER;
377
378 again:
379 /* When running in the kernel we expect faults to occur only to
380 * addresses in user space. All other faults represent errors in the
381 * kernel and should generate an OOPS. Unfortunately, in the case of an
382 * erroneous fault occurring in a code path which already holds mmap_sem
383 * we will deadlock attempting to validate the fault against the
384 * address space. Luckily the kernel only validly references user
385 * space from well defined areas of code, which are listed in the
386 * exceptions table.
387 *
388 * As the vast majority of faults will be valid we will only perform
389 * the source reference check when there is a possibility of a deadlock.
390 * Attempt to lock the address space, if we cannot we then validate the
391 * source. If this is invalid we can skip the address space check,
392 * thus avoiding the deadlock.
393 */
394 if (!down_read_trylock(&mm->mmap_sem)) {
395 if ((error_code & PF_USER) == 0 &&
396 !search_exception_tables(regs->rip))
397 goto bad_area_nosemaphore;
398 down_read(&mm->mmap_sem);
399 }
400
401 vma = find_vma(mm, address);
402 if (!vma)
403 goto bad_area;
404 if (likely(vma->vm_start <= address))
405 goto good_area;
406 if (!(vma->vm_flags & VM_GROWSDOWN))
407 goto bad_area;
408 if (error_code & 4) {
409 /* Allow userspace just enough access below the stack pointer
410 * to let the 'enter' instruction work.
411 */
412 if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
413 goto bad_area;
414 }
415 if (expand_stack(vma, address))
416 goto bad_area;
417/*
418 * Ok, we have a good vm_area for this memory access, so
419 * we can handle it..
420 */
421good_area:
422 info.si_code = SEGV_ACCERR;
423 write = 0;
424 switch (error_code & (PF_PROT|PF_WRITE)) {
425 default: /* 3: write, present */
426 /* fall through */
427 case PF_WRITE: /* write, not present */
428 if (!(vma->vm_flags & VM_WRITE))
429 goto bad_area;
430 write++;
431 break;
432 case PF_PROT: /* read, present */
433 goto bad_area;
434 case 0: /* read, not present */
435 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
436 goto bad_area;
437 }
438
439 /*
440 * If for any reason at all we couldn't handle the fault,
441 * make sure we exit gracefully rather than endlessly redo
442 * the fault.
443 */
444 fault = handle_mm_fault(mm, vma, address, write);
445 if (unlikely(fault & VM_FAULT_ERROR)) {
446 if (fault & VM_FAULT_OOM)
447 goto out_of_memory;
448 else if (fault & VM_FAULT_SIGBUS)
449 goto do_sigbus;
450 BUG();
451 }
452 if (fault & VM_FAULT_MAJOR)
453 tsk->maj_flt++;
454 else
455 tsk->min_flt++;
456 up_read(&mm->mmap_sem);
457 return;
458
459/*
460 * Something tried to access memory that isn't in our memory map..
461 * Fix it, but check if it's kernel or user first..
462 */
463bad_area:
464 up_read(&mm->mmap_sem);
465
466bad_area_nosemaphore:
467 /* User mode accesses just cause a SIGSEGV */
468 if (error_code & PF_USER) {
469
470 /*
471 * It's possible to have interrupts off here.
472 */
473 local_irq_enable();
474
475 if (is_prefetch(regs, address, error_code))
476 return;
477
478		/* Work around K8 erratum #100: the K8 in compat mode
479		   occasionally jumps to illegal addresses >4GB.  We
480		   catch this here in the page fault handler because
481		   these addresses are not reachable. Just detect this
482		   case and return.  Any code segment in the LDT is
483		   compatibility mode. */
484 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
485 (address >> 32))
486 return;
487
488 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
489 printk_ratelimit()) {
490 printk(
491 "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
492 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
493 tsk->comm, tsk->pid, address, regs->rip,
494 regs->rsp, error_code);
495 }
496
497 tsk->thread.cr2 = address;
498 /* Kernel addresses are always protection faults */
499 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
500 tsk->thread.trap_no = 14;
501 info.si_signo = SIGSEGV;
502 info.si_errno = 0;
503 /* info.si_code has been set above */
504 info.si_addr = (void __user *)address;
505 force_sig_info(SIGSEGV, &info, tsk);
506 return;
507 }
508
509no_context:
510
511 /* Are we prepared to handle this kernel fault? */
512 fixup = search_exception_tables(regs->rip);
513 if (fixup) {
514 regs->rip = fixup->fixup;
515 return;
516 }
517
518 /*
519 * Hall of shame of CPU/BIOS bugs.
520 */
521
522 if (is_prefetch(regs, address, error_code))
523 return;
524
525 if (is_errata93(regs, address))
526 return;
527
528/*
529 * Oops. The kernel tried to access some bad page. We'll have to
530 * terminate things with extreme prejudice.
531 */
532
533 flags = oops_begin();
534
535 if (address < PAGE_SIZE)
536 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
537 else
538 printk(KERN_ALERT "Unable to handle kernel paging request");
539	printk(" at %016lx RIP: \n" KERN_ALERT, address);
540 printk_address(regs->rip);
541 dump_pagetable(address);
542 tsk->thread.cr2 = address;
543 tsk->thread.trap_no = 14;
544 tsk->thread.error_code = error_code;
545 __die("Oops", regs, error_code);
546 /* Executive summary in case the body of the oops scrolled away */
547 printk(KERN_EMERG "CR2: %016lx\n", address);
548 oops_end(flags);
549 do_exit(SIGKILL);
550
551/*
552 * We ran out of memory, or some other thing happened to us that made
553 * us unable to handle the page fault gracefully.
554 */
555out_of_memory:
556 up_read(&mm->mmap_sem);
557 if (is_global_init(current)) {
558 yield();
559 goto again;
560 }
561 printk("VM: killing process %s\n", tsk->comm);
562 if (error_code & 4)
563 do_group_exit(SIGKILL);
564 goto no_context;
565
566do_sigbus:
567 up_read(&mm->mmap_sem);
568
569 /* Kernel mode? Handle exceptions or die */
570 if (!(error_code & PF_USER))
571 goto no_context;
572
573 tsk->thread.cr2 = address;
574 tsk->thread.error_code = error_code;
575 tsk->thread.trap_no = 14;
576 info.si_signo = SIGBUS;
577 info.si_errno = 0;
578 info.si_code = BUS_ADRERR;
579 info.si_addr = (void __user *)address;
580 force_sig_info(SIGBUS, &info, tsk);
581 return;
582}
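/*
 * [Illustrative aside, not part of the original fault_64.c] The PF_* tests
 * in do_page_fault() decode the hardware error code pushed for #PF:
 * bit 0 (PF_PROT) distinguishes a protection violation from a not-present
 * page, bit 1 (PF_WRITE) marks a write access, bit 2 (PF_USER) a fault
 * taken in user mode, and bit 3 (PF_RSVD) a reserved bit set in a
 * page-table entry.  A hedged sketch of a decoder built on those names;
 * the function itself is hypothetical:
 */
static const char *describe_fault(unsigned long error_code)
{
	if (error_code & PF_RSVD)
		return "reserved bit set: corrupted page table";
	if (!(error_code & PF_PROT))
		return (error_code & PF_WRITE) ?
			"write to a not-present page" :
			"read from a not-present page";
	return (error_code & PF_USER) ?
		"user-mode protection fault" :
		"kernel-mode protection fault";
}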
583
584DEFINE_SPINLOCK(pgd_lock);
585LIST_HEAD(pgd_list);
586
587void vmalloc_sync_all(void)
588{
589 /* Note that races in the updates of insync and start aren't
590 problematic:
591 insync can only get set bits added, and updates to start are only
592 improving performance (without affecting correctness if undone). */
593 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
594 static unsigned long start = VMALLOC_START & PGDIR_MASK;
595 unsigned long address;
596
597 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
598 if (!test_bit(pgd_index(address), insync)) {
599 const pgd_t *pgd_ref = pgd_offset_k(address);
600 struct page *page;
601
602 if (pgd_none(*pgd_ref))
603 continue;
604 spin_lock(&pgd_lock);
605 list_for_each_entry(page, &pgd_list, lru) {
606 pgd_t *pgd;
607 pgd = (pgd_t *)page_address(page) + pgd_index(address);
608 if (pgd_none(*pgd))
609 set_pgd(pgd, *pgd_ref);
610 else
611 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
612 }
613 spin_unlock(&pgd_lock);
614 set_bit(pgd_index(address), insync);
615 }
616 if (address == start)
617 start = address + PGDIR_SIZE;
618 }
619 /* Check that there is no need to do the same for the modules area. */
620 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
621 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
622 (__START_KERNEL & PGDIR_MASK)));
623}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 1c3bf95f735..3d936f23270 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -18,6 +18,49 @@ void kunmap(struct page *page)
18 kunmap_high(page); 18 kunmap_high(page);
19} 19}
20 20
21static void debug_kmap_atomic_prot(enum km_type type)
22{
23#ifdef CONFIG_DEBUG_HIGHMEM
24 static unsigned warn_count = 10;
25
26 if (unlikely(warn_count == 0))
27 return;
28
29 if (unlikely(in_interrupt())) {
30 if (in_irq()) {
31 if (type != KM_IRQ0 && type != KM_IRQ1 &&
32 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
33 type != KM_BOUNCE_READ) {
34 WARN_ON(1);
35 warn_count--;
36 }
37 } else if (!irqs_disabled()) { /* softirq */
38 if (type != KM_IRQ0 && type != KM_IRQ1 &&
39 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
40 type != KM_SKB_SUNRPC_DATA &&
41 type != KM_SKB_DATA_SOFTIRQ &&
42 type != KM_BOUNCE_READ) {
43 WARN_ON(1);
44 warn_count--;
45 }
46 }
47 }
48
49 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
50 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
51 if (!irqs_disabled()) {
52 WARN_ON(1);
53 warn_count--;
54 }
55 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
56 if (irq_count() == 0 && !irqs_disabled()) {
57 WARN_ON(1);
58 warn_count--;
59 }
60 }
61#endif
62}
63
21/* 64/*
22 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because 65 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
23 * no global lock is needed and because the kmap code must perform a global TLB 66 * no global lock is needed and because the kmap code must perform a global TLB
@@ -30,8 +73,10 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
30{ 73{
31 enum fixed_addresses idx; 74 enum fixed_addresses idx;
32 unsigned long vaddr; 75 unsigned long vaddr;
33
34 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 76 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
77
78 debug_kmap_atomic_prot(type);
79
35 pagefault_disable(); 80 pagefault_disable();
36 81
37 if (!PageHighMem(page)) 82 if (!PageHighMem(page))
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 6c06d9c0488..4fbafb4bc2f 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -15,6 +15,7 @@
15#include <asm/mman.h> 15#include <asm/mman.h>
16#include <asm/tlb.h> 16#include <asm/tlb.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include <asm/pgalloc.h>
18 19
19static unsigned long page_table_shareable(struct vm_area_struct *svma, 20static unsigned long page_table_shareable(struct vm_area_struct *svma,
20 struct vm_area_struct *vma, 21 struct vm_area_struct *vma,
@@ -88,7 +89,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
88 89
89 spin_lock(&mm->page_table_lock); 90 spin_lock(&mm->page_table_lock);
90 if (pud_none(*pud)) 91 if (pud_none(*pud))
91 pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK); 92 pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
92 else 93 else
93 put_page(virt_to_page(spte)); 94 put_page(virt_to_page(spte));
94 spin_unlock(&mm->page_table_lock); 95 spin_unlock(&mm->page_table_lock);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c7d19471261..d1bc04006d1 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -27,11 +27,11 @@
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
30#include <linux/efi.h>
31#include <linux/memory_hotplug.h> 30#include <linux/memory_hotplug.h>
32#include <linux/initrd.h> 31#include <linux/initrd.h>
33#include <linux/cpumask.h> 32#include <linux/cpumask.h>
34 33
34#include <asm/asm.h>
35#include <asm/processor.h> 35#include <asm/processor.h>
36#include <asm/system.h> 36#include <asm/system.h>
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -40,8 +40,10 @@
40#include <asm/fixmap.h> 40#include <asm/fixmap.h>
41#include <asm/e820.h> 41#include <asm/e820.h>
42#include <asm/apic.h> 42#include <asm/apic.h>
43#include <asm/bugs.h>
43#include <asm/tlb.h> 44#include <asm/tlb.h>
44#include <asm/tlbflush.h> 45#include <asm/tlbflush.h>
46#include <asm/pgalloc.h>
45#include <asm/sections.h> 47#include <asm/sections.h>
46#include <asm/paravirt.h> 48#include <asm/paravirt.h>
47 49
@@ -50,7 +52,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20;
50DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 52DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
51unsigned long highstart_pfn, highend_pfn; 53unsigned long highstart_pfn, highend_pfn;
52 54
53static int noinline do_test_wp_bit(void); 55static noinline int do_test_wp_bit(void);
54 56
55/* 57/*
56 * Creates a middle page table and puts a pointer to it in the 58 * Creates a middle page table and puts a pointer to it in the
@@ -61,26 +63,26 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
61{ 63{
62 pud_t *pud; 64 pud_t *pud;
63 pmd_t *pmd_table; 65 pmd_t *pmd_table;
64 66
65#ifdef CONFIG_X86_PAE 67#ifdef CONFIG_X86_PAE
66 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 68 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
67 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 69 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
68 70
69 paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); 71 paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
70 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 72 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
71 pud = pud_offset(pgd, 0); 73 pud = pud_offset(pgd, 0);
72 if (pmd_table != pmd_offset(pud, 0)) 74 BUG_ON(pmd_table != pmd_offset(pud, 0));
73 BUG();
74 } 75 }
75#endif 76#endif
76 pud = pud_offset(pgd, 0); 77 pud = pud_offset(pgd, 0);
77 pmd_table = pmd_offset(pud, 0); 78 pmd_table = pmd_offset(pud, 0);
79
78 return pmd_table; 80 return pmd_table;
79} 81}
80 82
81/* 83/*
82 * Create a page table and place a pointer to it in a middle page 84 * Create a page table and place a pointer to it in a middle page
83 * directory entry. 85 * directory entry:
84 */ 86 */
85static pte_t * __init one_page_table_init(pmd_t *pmd) 87static pte_t * __init one_page_table_init(pmd_t *pmd)
86{ 88{
@@ -90,9 +92,10 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
90#ifdef CONFIG_DEBUG_PAGEALLOC 92#ifdef CONFIG_DEBUG_PAGEALLOC
91 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 93 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
92#endif 94#endif
93 if (!page_table) 95 if (!page_table) {
94 page_table = 96 page_table =
95 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 97 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
98 }
96 99
97 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); 100 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
98 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 101 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -103,22 +106,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
103} 106}
104 107
105/* 108/*
106 * This function initializes a certain range of kernel virtual memory 109 * This function initializes a certain range of kernel virtual memory
107 * with new bootmem page tables, everywhere page tables are missing in 110 * with new bootmem page tables, everywhere page tables are missing in
108 * the given range. 111 * the given range.
109 */ 112 *
110 113 * NOTE: The pagetables are allocated contiguous on the physical space
111/* 114 * so we can cache the place of the first one and move around without
112 * NOTE: The pagetables are allocated contiguous on the physical space
113 * so we can cache the place of the first one and move around without
114 * checking the pgd every time. 115 * checking the pgd every time.
115 */ 116 */
116static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) 117static void __init
118page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
117{ 119{
118 pgd_t *pgd;
119 pmd_t *pmd;
120 int pgd_idx, pmd_idx; 120 int pgd_idx, pmd_idx;
121 unsigned long vaddr; 121 unsigned long vaddr;
122 pgd_t *pgd;
123 pmd_t *pmd;
122 124
123 vaddr = start; 125 vaddr = start;
124 pgd_idx = pgd_index(vaddr); 126 pgd_idx = pgd_index(vaddr);
@@ -128,7 +130,8 @@ static void __init page_table_range_init (unsigned long start, unsigned long end
128 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { 130 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
129 pmd = one_md_table_init(pgd); 131 pmd = one_md_table_init(pgd);
130 pmd = pmd + pmd_index(vaddr); 132 pmd = pmd + pmd_index(vaddr);
131 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { 133 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
134 pmd++, pmd_idx++) {
132 one_page_table_init(pmd); 135 one_page_table_init(pmd);
133 136
134 vaddr += PMD_SIZE; 137 vaddr += PMD_SIZE;
@@ -145,17 +148,17 @@ static inline int is_kernel_text(unsigned long addr)
145} 148}
146 149
147/* 150/*
148 * This maps the physical memory to kernel virtual address space, a total 151 * This maps the physical memory to kernel virtual address space, a total
149 * of max_low_pfn pages, by creating page tables starting from address 152 * of max_low_pfn pages, by creating page tables starting from address
150 * PAGE_OFFSET. 153 * PAGE_OFFSET:
151 */ 154 */
152static void __init kernel_physical_mapping_init(pgd_t *pgd_base) 155static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
153{ 156{
157 int pgd_idx, pmd_idx, pte_ofs;
154 unsigned long pfn; 158 unsigned long pfn;
155 pgd_t *pgd; 159 pgd_t *pgd;
156 pmd_t *pmd; 160 pmd_t *pmd;
157 pte_t *pte; 161 pte_t *pte;
158 int pgd_idx, pmd_idx, pte_ofs;
159 162
160 pgd_idx = pgd_index(PAGE_OFFSET); 163 pgd_idx = pgd_index(PAGE_OFFSET);
161 pgd = pgd_base + pgd_idx; 164 pgd = pgd_base + pgd_idx;
@@ -165,29 +168,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
165 pmd = one_md_table_init(pgd); 168 pmd = one_md_table_init(pgd);
166 if (pfn >= max_low_pfn) 169 if (pfn >= max_low_pfn)
167 continue; 170 continue;
168 for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
169 unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
170 171
171 /* Map with big pages if possible, otherwise create normal page tables. */ 172 for (pmd_idx = 0;
173 pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
174 pmd++, pmd_idx++) {
175 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
176
177 /*
178 * Map with big pages if possible, otherwise
179 * create normal page tables:
180 */
172 if (cpu_has_pse) { 181 if (cpu_has_pse) {
173 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; 182 unsigned int addr2;
174 if (is_kernel_text(address) || is_kernel_text(address2)) 183 pgprot_t prot = PAGE_KERNEL_LARGE;
175 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); 184
176 else 185 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
177 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); 186 PAGE_OFFSET + PAGE_SIZE-1;
187
188 if (is_kernel_text(addr) ||
189 is_kernel_text(addr2))
190 prot = PAGE_KERNEL_LARGE_EXEC;
191
192 set_pmd(pmd, pfn_pmd(pfn, prot));
178 193
179 pfn += PTRS_PER_PTE; 194 pfn += PTRS_PER_PTE;
180 } else { 195 continue;
181 pte = one_page_table_init(pmd); 196 }
182 197 pte = one_page_table_init(pmd);
183 for (pte_ofs = 0; 198
184 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 199 for (pte_ofs = 0;
185 pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { 200 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
186 if (is_kernel_text(address)) 201 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
187 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); 202 pgprot_t prot = PAGE_KERNEL;
188 else 203
189 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); 204 if (is_kernel_text(addr))
190 } 205 prot = PAGE_KERNEL_EXEC;
206
207 set_pte(pte, pfn_pte(pfn, prot));
191 } 208 }
192 } 209 }
193 } 210 }
@@ -200,57 +217,23 @@ static inline int page_kills_ppro(unsigned long pagenr)
200 return 0; 217 return 0;
201} 218}
202 219
203int page_is_ram(unsigned long pagenr)
204{
205 int i;
206 unsigned long addr, end;
207
208 if (efi_enabled) {
209 efi_memory_desc_t *md;
210 void *p;
211
212 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
213 md = p;
214 if (!is_available_memory(md))
215 continue;
216 addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
217 end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
218
219 if ((pagenr >= addr) && (pagenr < end))
220 return 1;
221 }
222 return 0;
223 }
224
225 for (i = 0; i < e820.nr_map; i++) {
226
227 if (e820.map[i].type != E820_RAM) /* not usable memory */
228 continue;
229 /*
230 * !!!FIXME!!! Some BIOSen report areas as RAM that
231 * are not. Notably the 640->1Mb area. We need a sanity
232 * check here.
233 */
234 addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
235 end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
236 if ((pagenr >= addr) && (pagenr < end))
237 return 1;
238 }
239 return 0;
240}
241
242#ifdef CONFIG_HIGHMEM 220#ifdef CONFIG_HIGHMEM
243pte_t *kmap_pte; 221pte_t *kmap_pte;
244pgprot_t kmap_prot; 222pgprot_t kmap_prot;
245 223
246#define kmap_get_fixmap_pte(vaddr) \ 224static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
247 pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) 225{
226 return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
227 vaddr), vaddr), vaddr);
228}
248 229
249static void __init kmap_init(void) 230static void __init kmap_init(void)
250{ 231{
251 unsigned long kmap_vstart; 232 unsigned long kmap_vstart;
252 233
253 /* cache the first kmap pte */ 234 /*
235 * Cache the first kmap pte:
236 */
254 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); 237 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
255 kmap_pte = kmap_get_fixmap_pte(kmap_vstart); 238 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
256 239
@@ -259,11 +242,11 @@ static void __init kmap_init(void)
259 242
260static void __init permanent_kmaps_init(pgd_t *pgd_base) 243static void __init permanent_kmaps_init(pgd_t *pgd_base)
261{ 244{
245 unsigned long vaddr;
262 pgd_t *pgd; 246 pgd_t *pgd;
263 pud_t *pud; 247 pud_t *pud;
264 pmd_t *pmd; 248 pmd_t *pmd;
265 pte_t *pte; 249 pte_t *pte;
266 unsigned long vaddr;
267 250
268 vaddr = PKMAP_BASE; 251 vaddr = PKMAP_BASE;
269 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); 252 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
@@ -272,7 +255,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
272 pud = pud_offset(pgd, vaddr); 255 pud = pud_offset(pgd, vaddr);
273 pmd = pmd_offset(pud, vaddr); 256 pmd = pmd_offset(pud, vaddr);
274 pte = pte_offset_kernel(pmd, vaddr); 257 pte = pte_offset_kernel(pmd, vaddr);
275 pkmap_page_table = pte; 258 pkmap_page_table = pte;
276} 259}
277 260
278static void __meminit free_new_highpage(struct page *page) 261static void __meminit free_new_highpage(struct page *page)
@@ -291,7 +274,8 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
291 SetPageReserved(page); 274 SetPageReserved(page);
292} 275}
293 276
294static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn) 277static int __meminit
278add_one_highpage_hotplug(struct page *page, unsigned long pfn)
295{ 279{
296 free_new_highpage(page); 280 free_new_highpage(page);
297 totalram_pages++; 281 totalram_pages++;
@@ -299,6 +283,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p
299 max_mapnr = max(pfn, max_mapnr); 283 max_mapnr = max(pfn, max_mapnr);
300#endif 284#endif
301 num_physpages++; 285 num_physpages++;
286
302 return 0; 287 return 0;
303} 288}
304 289
@@ -306,7 +291,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p
306 * Not currently handling the NUMA case. 291 * Not currently handling the NUMA case.
307 * Assuming single node and all memory that 292 * Assuming single node and all memory that
308 * has been added dynamically that would be 293 * has been added dynamically that would be
309 * onlined here is in HIGHMEM 294 * onlined here is in HIGHMEM.
310 */ 295 */
311void __meminit online_page(struct page *page) 296void __meminit online_page(struct page *page)
312{ 297{
@@ -314,34 +299,32 @@ void __meminit online_page(struct page *page)
314 add_one_highpage_hotplug(page, page_to_pfn(page)); 299 add_one_highpage_hotplug(page, page_to_pfn(page));
315} 300}
316 301
317 302#ifndef CONFIG_NUMA
318#ifdef CONFIG_NUMA
319extern void set_highmem_pages_init(int);
320#else
321static void __init set_highmem_pages_init(int bad_ppro) 303static void __init set_highmem_pages_init(int bad_ppro)
322{ 304{
323 int pfn; 305 int pfn;
324 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) 306
325 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); 307 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
308 /*
309		 * Holes under sparsemem might have no mem_map[]:
310 */
311 if (pfn_valid(pfn))
312 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
313 }
326 totalram_pages += totalhigh_pages; 314 totalram_pages += totalhigh_pages;
327} 315}
328#endif /* CONFIG_FLATMEM */ 316#endif /* !CONFIG_NUMA */
329 317
330#else 318#else
331#define kmap_init() do { } while (0) 319# define kmap_init() do { } while (0)
332#define permanent_kmaps_init(pgd_base) do { } while (0) 320# define permanent_kmaps_init(pgd_base) do { } while (0)
333#define set_highmem_pages_init(bad_ppro) do { } while (0) 321# define set_highmem_pages_init(bad_ppro) do { } while (0)
334#endif /* CONFIG_HIGHMEM */ 322#endif /* CONFIG_HIGHMEM */
335 323
336unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; 324pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
337EXPORT_SYMBOL(__PAGE_KERNEL); 325EXPORT_SYMBOL(__PAGE_KERNEL);
338unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
339 326
340#ifdef CONFIG_NUMA 327pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
341extern void __init remap_numa_kva(void);
342#else
343#define remap_numa_kva() do {} while (0)
344#endif
345 328
346void __init native_pagetable_setup_start(pgd_t *base) 329void __init native_pagetable_setup_start(pgd_t *base)
347{ 330{
@@ -367,7 +350,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
367 memset(&base[USER_PTRS_PER_PGD], 0, 350 memset(&base[USER_PTRS_PER_PGD], 0,
368 KERNEL_PGD_PTRS * sizeof(pgd_t)); 351 KERNEL_PGD_PTRS * sizeof(pgd_t));
369#else 352#else
370 paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); 353 paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
371#endif 354#endif
372} 355}
373 356
@@ -405,10 +388,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
405 * be partially populated, and so it avoids stomping on any existing 388 * be partially populated, and so it avoids stomping on any existing
406 * mappings. 389 * mappings.
407 */ 390 */
408static void __init pagetable_init (void) 391static void __init pagetable_init(void)
409{ 392{
410 unsigned long vaddr, end;
411 pgd_t *pgd_base = swapper_pg_dir; 393 pgd_t *pgd_base = swapper_pg_dir;
394 unsigned long vaddr, end;
412 395
413 paravirt_pagetable_setup_start(pgd_base); 396 paravirt_pagetable_setup_start(pgd_base);
414 397
@@ -430,34 +413,36 @@ static void __init pagetable_init (void)
430 * Fixed mappings, only the page table structure has to be 413 * Fixed mappings, only the page table structure has to be
431 * created - mappings will be set by set_fixmap(): 414 * created - mappings will be set by set_fixmap():
432 */ 415 */
416 early_ioremap_clear();
433 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; 417 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
434 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 418 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
435 page_table_range_init(vaddr, end, pgd_base); 419 page_table_range_init(vaddr, end, pgd_base);
420 early_ioremap_reset();
436 421
437 permanent_kmaps_init(pgd_base); 422 permanent_kmaps_init(pgd_base);
438 423
439 paravirt_pagetable_setup_done(pgd_base); 424 paravirt_pagetable_setup_done(pgd_base);
440} 425}
441 426
442#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI) 427#ifdef CONFIG_ACPI_SLEEP
443/* 428/*
444 * Swap suspend & friends need this for resume because things like the intel-agp 429 * ACPI suspend needs this for resume, because things like the intel-agp
445 * driver might have split up a kernel 4MB mapping. 430 * driver might have split up a kernel 4MB mapping.
446 */ 431 */
447char __nosavedata swsusp_pg_dir[PAGE_SIZE] 432char swsusp_pg_dir[PAGE_SIZE]
448 __attribute__ ((aligned (PAGE_SIZE))); 433 __attribute__ ((aligned(PAGE_SIZE)));
449 434
450static inline void save_pg_dir(void) 435static inline void save_pg_dir(void)
451{ 436{
452 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); 437 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
453} 438}
454#else 439#else /* !CONFIG_ACPI_SLEEP */
455static inline void save_pg_dir(void) 440static inline void save_pg_dir(void)
456{ 441{
457} 442}
458#endif 443#endif /* !CONFIG_ACPI_SLEEP */
459 444
460void zap_low_mappings (void) 445void zap_low_mappings(void)
461{ 446{
462 int i; 447 int i;
463 448
@@ -469,22 +454,24 @@ void zap_low_mappings (void)
469 * Note that "pgd_clear()" doesn't do it for 454 * Note that "pgd_clear()" doesn't do it for
470 * us, because pgd_clear() is a no-op on i386. 455 * us, because pgd_clear() is a no-op on i386.
471 */ 456 */
472 for (i = 0; i < USER_PTRS_PER_PGD; i++) 457 for (i = 0; i < USER_PTRS_PER_PGD; i++) {
473#ifdef CONFIG_X86_PAE 458#ifdef CONFIG_X86_PAE
474 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); 459 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
475#else 460#else
476 set_pgd(swapper_pg_dir+i, __pgd(0)); 461 set_pgd(swapper_pg_dir+i, __pgd(0));
477#endif 462#endif
463 }
478 flush_tlb_all(); 464 flush_tlb_all();
479} 465}
480 466
481int nx_enabled = 0; 467int nx_enabled;
468
469pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
470EXPORT_SYMBOL_GPL(__supported_pte_mask);
482 471
483#ifdef CONFIG_X86_PAE 472#ifdef CONFIG_X86_PAE
484 473
485static int disable_nx __initdata = 0; 474static int disable_nx __initdata;
486u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
487EXPORT_SYMBOL_GPL(__supported_pte_mask);
488 475
489/* 476/*
490 * noexec = on|off 477 * noexec = on|off
@@ -501,11 +488,14 @@ static int __init noexec_setup(char *str)
501 __supported_pte_mask |= _PAGE_NX; 488 __supported_pte_mask |= _PAGE_NX;
502 disable_nx = 0; 489 disable_nx = 0;
503 } 490 }
504 } else if (!strcmp(str,"off")) { 491 } else {
505 disable_nx = 1; 492 if (!strcmp(str, "off")) {
506 __supported_pte_mask &= ~_PAGE_NX; 493 disable_nx = 1;
507 } else 494 __supported_pte_mask &= ~_PAGE_NX;
508 return -EINVAL; 495 } else {
496 return -EINVAL;
497 }
498 }
509 499
510 return 0; 500 return 0;
511} 501}
@@ -517,6 +507,7 @@ static void __init set_nx(void)
517 507
518 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { 508 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
519 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); 509 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
510
520 if ((v[3] & (1 << 20)) && !disable_nx) { 511 if ((v[3] & (1 << 20)) && !disable_nx) {
521 rdmsr(MSR_EFER, l, h); 512 rdmsr(MSR_EFER, l, h);
522 l |= EFER_NX; 513 l |= EFER_NX;
@@ -526,35 +517,6 @@ static void __init set_nx(void)
526 } 517 }
527 } 518 }
528} 519}
529
530/*
531 * Enables/disables executability of a given kernel page and
532 * returns the previous setting.
533 */
534int __init set_kernel_exec(unsigned long vaddr, int enable)
535{
536 pte_t *pte;
537 int ret = 1;
538
539 if (!nx_enabled)
540 goto out;
541
542 pte = lookup_address(vaddr);
543 BUG_ON(!pte);
544
545 if (!pte_exec_kernel(*pte))
546 ret = 0;
547
548 if (enable)
549 pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
550 else
551 pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
552 pte_update_defer(&init_mm, vaddr, pte);
553 __flush_tlb_all();
554out:
555 return ret;
556}
557
558#endif 520#endif
559 521
560/* 522/*
@@ -569,9 +531,8 @@ void __init paging_init(void)
569#ifdef CONFIG_X86_PAE 531#ifdef CONFIG_X86_PAE
570 set_nx(); 532 set_nx();
571 if (nx_enabled) 533 if (nx_enabled)
572 printk("NX (Execute Disable) protection: active\n"); 534 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
573#endif 535#endif
574
575 pagetable_init(); 536 pagetable_init();
576 537
577 load_cr3(swapper_pg_dir); 538 load_cr3(swapper_pg_dir);
@@ -595,10 +556,10 @@ void __init paging_init(void)
595 * used to involve black magic jumps to work around some nasty CPU bugs, 556 * used to involve black magic jumps to work around some nasty CPU bugs,
596 * but fortunately the switch to using exceptions got rid of all that. 557 * but fortunately the switch to using exceptions got rid of all that.
597 */ 558 */
598
599static void __init test_wp_bit(void) 559static void __init test_wp_bit(void)
600{ 560{
601 printk("Checking if this processor honours the WP bit even in supervisor mode... "); 561 printk(KERN_INFO
562 "Checking if this processor honours the WP bit even in supervisor mode...");
602 563
603 /* Any page-aligned address will do, the test is non-destructive */ 564 /* Any page-aligned address will do, the test is non-destructive */
604 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); 565 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
@@ -606,47 +567,46 @@ static void __init test_wp_bit(void)
606 clear_fixmap(FIX_WP_TEST); 567 clear_fixmap(FIX_WP_TEST);
607 568
608 if (!boot_cpu_data.wp_works_ok) { 569 if (!boot_cpu_data.wp_works_ok) {
609 printk("No.\n"); 570 printk(KERN_CONT "No.\n");
610#ifdef CONFIG_X86_WP_WORKS_OK 571#ifdef CONFIG_X86_WP_WORKS_OK
611 panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); 572 panic(
573 "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
612#endif 574#endif
613 } else { 575 } else {
614 printk("Ok.\n"); 576 printk(KERN_CONT "Ok.\n");
615 } 577 }
616} 578}
617 579
618static struct kcore_list kcore_mem, kcore_vmalloc; 580static struct kcore_list kcore_mem, kcore_vmalloc;
619 581
620void __init mem_init(void) 582void __init mem_init(void)
621{ 583{
622 extern int ppro_with_ram_bug(void);
623 int codesize, reservedpages, datasize, initsize; 584 int codesize, reservedpages, datasize, initsize;
624 int tmp; 585 int tmp, bad_ppro;
625 int bad_ppro;
626 586
627#ifdef CONFIG_FLATMEM 587#ifdef CONFIG_FLATMEM
628 BUG_ON(!mem_map); 588 BUG_ON(!mem_map);
629#endif 589#endif
630
631 bad_ppro = ppro_with_ram_bug(); 590 bad_ppro = ppro_with_ram_bug();
632 591
633#ifdef CONFIG_HIGHMEM 592#ifdef CONFIG_HIGHMEM
634 /* check that fixmap and pkmap do not overlap */ 593 /* check that fixmap and pkmap do not overlap */
635 if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { 594 if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
636 printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); 595 printk(KERN_ERR
596 "fixmap and kmap areas overlap - this will crash\n");
637 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", 597 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
638 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); 598 PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
599 FIXADDR_START);
639 BUG(); 600 BUG();
640 } 601 }
641#endif 602#endif
642
643 /* this will put all low memory onto the freelists */ 603 /* this will put all low memory onto the freelists */
644 totalram_pages += free_all_bootmem(); 604 totalram_pages += free_all_bootmem();
645 605
646 reservedpages = 0; 606 reservedpages = 0;
647 for (tmp = 0; tmp < max_low_pfn; tmp++) 607 for (tmp = 0; tmp < max_low_pfn; tmp++)
648 /* 608 /*
649 * Only count reserved RAM pages 609 * Only count reserved RAM pages:
650 */ 610 */
651 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 611 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
652 reservedpages++; 612 reservedpages++;
@@ -657,11 +617,12 @@ void __init mem_init(void)
657 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 617 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
658 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 618 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
659 619
660 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 620 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
661 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 621 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
662 VMALLOC_END-VMALLOC_START); 622 VMALLOC_END-VMALLOC_START);
663 623
664 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", 624 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
625 "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
665 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 626 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
666 num_physpages << (PAGE_SHIFT-10), 627 num_physpages << (PAGE_SHIFT-10),
667 codesize >> 10, 628 codesize >> 10,
@@ -672,45 +633,46 @@ void __init mem_init(void)
672 ); 633 );
673 634
674#if 1 /* double-sanity-check paranoia */ 635#if 1 /* double-sanity-check paranoia */
675 printk("virtual kernel memory layout:\n" 636 printk(KERN_INFO "virtual kernel memory layout:\n"
676 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 637 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
677#ifdef CONFIG_HIGHMEM 638#ifdef CONFIG_HIGHMEM
678 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 639 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
679#endif 640#endif
680 " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" 641 " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
681 " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" 642 " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
682 " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" 643 " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
683 " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" 644 " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
684 " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", 645 " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
685 FIXADDR_START, FIXADDR_TOP, 646 FIXADDR_START, FIXADDR_TOP,
686 (FIXADDR_TOP - FIXADDR_START) >> 10, 647 (FIXADDR_TOP - FIXADDR_START) >> 10,
687 648
688#ifdef CONFIG_HIGHMEM 649#ifdef CONFIG_HIGHMEM
689 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, 650 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
690 (LAST_PKMAP*PAGE_SIZE) >> 10, 651 (LAST_PKMAP*PAGE_SIZE) >> 10,
691#endif 652#endif
692 653
693 VMALLOC_START, VMALLOC_END, 654 VMALLOC_START, VMALLOC_END,
694 (VMALLOC_END - VMALLOC_START) >> 20, 655 (VMALLOC_END - VMALLOC_START) >> 20,
695 656
696 (unsigned long)__va(0), (unsigned long)high_memory, 657 (unsigned long)__va(0), (unsigned long)high_memory,
697 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, 658 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
698 659
699 (unsigned long)&__init_begin, (unsigned long)&__init_end, 660 (unsigned long)&__init_begin, (unsigned long)&__init_end,
700 ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, 661 ((unsigned long)&__init_end -
662 (unsigned long)&__init_begin) >> 10,
701 663
702 (unsigned long)&_etext, (unsigned long)&_edata, 664 (unsigned long)&_etext, (unsigned long)&_edata,
703 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, 665 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
704 666
705 (unsigned long)&_text, (unsigned long)&_etext, 667 (unsigned long)&_text, (unsigned long)&_etext,
706 ((unsigned long)&_etext - (unsigned long)&_text) >> 10); 668 ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
707 669
708#ifdef CONFIG_HIGHMEM 670#ifdef CONFIG_HIGHMEM
709 BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); 671 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
710 BUG_ON(VMALLOC_END > PKMAP_BASE); 672 BUG_ON(VMALLOC_END > PKMAP_BASE);
711#endif 673#endif
712 BUG_ON(VMALLOC_START > VMALLOC_END); 674 BUG_ON(VMALLOC_START > VMALLOC_END);
713 BUG_ON((unsigned long)high_memory > VMALLOC_START); 675 BUG_ON((unsigned long)high_memory > VMALLOC_START);
714#endif /* double-sanity-check paranoia */ 676#endif /* double-sanity-check paranoia */
715 677
716#ifdef CONFIG_X86_PAE 678#ifdef CONFIG_X86_PAE
@@ -741,49 +703,35 @@ int arch_add_memory(int nid, u64 start, u64 size)
741 703
742 return __add_pages(zone, start_pfn, nr_pages); 704 return __add_pages(zone, start_pfn, nr_pages);
743} 705}
744
745#endif 706#endif
746 707
747struct kmem_cache *pmd_cache;
748
749void __init pgtable_cache_init(void)
750{
751 if (PTRS_PER_PMD > 1)
752 pmd_cache = kmem_cache_create("pmd",
753 PTRS_PER_PMD*sizeof(pmd_t),
754 PTRS_PER_PMD*sizeof(pmd_t),
755 SLAB_PANIC,
756 pmd_ctor);
757}
758
759/* 708/*
760 * This function cannot be __init, since exceptions don't work in that 709 * This function cannot be __init, since exceptions don't work in that
761 * section. Put this after the callers, so that it cannot be inlined. 710 * section. Put this after the callers, so that it cannot be inlined.
762 */ 711 */
763static int noinline do_test_wp_bit(void) 712static noinline int do_test_wp_bit(void)
764{ 713{
765 char tmp_reg; 714 char tmp_reg;
766 int flag; 715 int flag;
767 716
768 __asm__ __volatile__( 717 __asm__ __volatile__(
769 " movb %0,%1 \n" 718 " movb %0, %1 \n"
770 "1: movb %1,%0 \n" 719 "1: movb %1, %0 \n"
771 " xorl %2,%2 \n" 720 " xorl %2, %2 \n"
772 "2: \n" 721 "2: \n"
773 ".section __ex_table,\"a\"\n" 722 _ASM_EXTABLE(1b,2b)
774 " .align 4 \n"
775 " .long 1b,2b \n"
776 ".previous \n"
777 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), 723 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
778 "=q" (tmp_reg), 724 "=q" (tmp_reg),
779 "=r" (flag) 725 "=r" (flag)
780 :"2" (1) 726 :"2" (1)
781 :"memory"); 727 :"memory");
782 728
783 return flag; 729 return flag;
784} 730}
785 731
786#ifdef CONFIG_DEBUG_RODATA 732#ifdef CONFIG_DEBUG_RODATA
733const int rodata_test_data = 0xC3;
734EXPORT_SYMBOL_GPL(rodata_test_data);
787 735
788void mark_rodata_ro(void) 736void mark_rodata_ro(void)
789{ 737{
@@ -796,32 +744,58 @@ void mark_rodata_ro(void)
796 if (num_possible_cpus() <= 1) 744 if (num_possible_cpus() <= 1)
797#endif 745#endif
798 { 746 {
799 change_page_attr(virt_to_page(start), 747 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
800 size >> PAGE_SHIFT, PAGE_KERNEL_RX); 748 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
801 printk("Write protecting the kernel text: %luk\n", size >> 10); 749 size >> 10);
750
751#ifdef CONFIG_CPA_DEBUG
752 printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
753 start, start+size);
754 set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
755
756 printk(KERN_INFO "Testing CPA: write protecting again\n");
757 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
758#endif
802 } 759 }
803#endif 760#endif
804 start += size; 761 start += size;
805 size = (unsigned long)__end_rodata - start; 762 size = (unsigned long)__end_rodata - start;
806 change_page_attr(virt_to_page(start), 763 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
807 size >> PAGE_SHIFT, PAGE_KERNEL_RO); 764 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
808 printk("Write protecting the kernel read-only data: %luk\n", 765 size >> 10);
809 size >> 10); 766 rodata_test();
810 767
811 /* 768#ifdef CONFIG_CPA_DEBUG
812 * change_page_attr() requires a global_flush_tlb() call after it. 769 printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
813 * We do this after the printk so that if something went wrong in the 770 set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
814 * change, the printk gets out at least to give a better debug hint 771
815 * of who is the culprit. 772 printk(KERN_INFO "Testing CPA: write protecting again\n");
816 */ 773 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
817 global_flush_tlb(); 774#endif
818} 775}
819#endif 776#endif
820 777
821void free_init_pages(char *what, unsigned long begin, unsigned long end) 778void free_init_pages(char *what, unsigned long begin, unsigned long end)
822{ 779{
780#ifdef CONFIG_DEBUG_PAGEALLOC
781 /*
782 * If debugging page accesses then do not free this memory but
783 * mark them not present - any buggy init-section access will
784 * create a kernel page fault:
785 */
786 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
787 begin, PAGE_ALIGN(end));
788 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
789#else
823 unsigned long addr; 790 unsigned long addr;
824 791
792 /*
793 * We just marked the kernel text read only above, now that
794 * we are going to free part of that, we need to make that
795 * writeable first.
796 */
797 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
798
825 for (addr = begin; addr < end; addr += PAGE_SIZE) { 799 for (addr = begin; addr < end; addr += PAGE_SIZE) {
826 ClearPageReserved(virt_to_page(addr)); 800 ClearPageReserved(virt_to_page(addr));
827 init_page_count(virt_to_page(addr)); 801 init_page_count(virt_to_page(addr));
@@ -830,6 +804,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
830 totalram_pages++; 804 totalram_pages++;
831 } 805 }
832 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 806 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
807#endif
833} 808}
834 809
835void free_initmem(void) 810void free_initmem(void)
@@ -845,4 +820,3 @@ void free_initrd_mem(unsigned long start, unsigned long end)
845 free_init_pages("initrd memory", start, end); 820 free_init_pages("initrd memory", start, end);
846} 821}
847#endif 822#endif
848
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0f9c8c89065..9b61c75a235 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -43,12 +43,10 @@
43#include <asm/proto.h> 43#include <asm/proto.h>
44#include <asm/smp.h> 44#include <asm/smp.h>
45#include <asm/sections.h> 45#include <asm/sections.h>
46#include <asm/kdebug.h>
47#include <asm/numa.h>
46 48
47#ifndef Dprintk 49const struct dma_mapping_ops *dma_ops;
48#define Dprintk(x...)
49#endif
50
51const struct dma_mapping_ops* dma_ops;
52EXPORT_SYMBOL(dma_ops); 50EXPORT_SYMBOL(dma_ops);
53 51
54static unsigned long dma_reserve __initdata; 52static unsigned long dma_reserve __initdata;
@@ -65,22 +63,26 @@ void show_mem(void)
65{ 63{
66 long i, total = 0, reserved = 0; 64 long i, total = 0, reserved = 0;
67 long shared = 0, cached = 0; 65 long shared = 0, cached = 0;
68 pg_data_t *pgdat;
69 struct page *page; 66 struct page *page;
67 pg_data_t *pgdat;
70 68
71 printk(KERN_INFO "Mem-info:\n"); 69 printk(KERN_INFO "Mem-info:\n");
72 show_free_areas(); 70 show_free_areas();
73 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 71 printk(KERN_INFO "Free swap: %6ldkB\n",
72 nr_swap_pages << (PAGE_SHIFT-10));
74 73
75 for_each_online_pgdat(pgdat) { 74 for_each_online_pgdat(pgdat) {
76 for (i = 0; i < pgdat->node_spanned_pages; ++i) { 75 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
77 /* this loop can take a while with 256 GB and 4k pages 76 /*
78 so update the NMI watchdog */ 77 * This loop can take a while with 256 GB and
79 if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { 78 * 4k pages so defer the NMI watchdog:
79 */
80 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
80 touch_nmi_watchdog(); 81 touch_nmi_watchdog();
81 } 82
82 if (!pfn_valid(pgdat->node_start_pfn + i)) 83 if (!pfn_valid(pgdat->node_start_pfn + i))
83 continue; 84 continue;
85
84 page = pfn_to_page(pgdat->node_start_pfn + i); 86 page = pfn_to_page(pgdat->node_start_pfn + i);
85 total++; 87 total++;
86 if (PageReserved(page)) 88 if (PageReserved(page))
@@ -89,51 +91,58 @@ void show_mem(void)
89 cached++; 91 cached++;
90 else if (page_count(page)) 92 else if (page_count(page))
91 shared += page_count(page) - 1; 93 shared += page_count(page) - 1;
92 } 94 }
93 } 95 }
94 printk(KERN_INFO "%lu pages of RAM\n", total); 96 printk(KERN_INFO "%lu pages of RAM\n", total);
95 printk(KERN_INFO "%lu reserved pages\n",reserved); 97 printk(KERN_INFO "%lu reserved pages\n", reserved);
96 printk(KERN_INFO "%lu pages shared\n",shared); 98 printk(KERN_INFO "%lu pages shared\n", shared);
97 printk(KERN_INFO "%lu pages swap cached\n",cached); 99 printk(KERN_INFO "%lu pages swap cached\n", cached);
98} 100}
99 101
100int after_bootmem; 102int after_bootmem;
101 103
102static __init void *spp_getpage(void) 104static __init void *spp_getpage(void)
103{ 105{
104 void *ptr; 106 void *ptr;
107
105 if (after_bootmem) 108 if (after_bootmem)
106 ptr = (void *) get_zeroed_page(GFP_ATOMIC); 109 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
107 else 110 else
108 ptr = alloc_bootmem_pages(PAGE_SIZE); 111 ptr = alloc_bootmem_pages(PAGE_SIZE);
109 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
110 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
111 112
112 Dprintk("spp_getpage %p\n", ptr); 113 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
114 panic("set_pte_phys: cannot allocate page data %s\n",
115 after_bootmem ? "after bootmem" : "");
116 }
117
118 pr_debug("spp_getpage %p\n", ptr);
119
113 return ptr; 120 return ptr;
114} 121}
115 122
116static __init void set_pte_phys(unsigned long vaddr, 123static __init void
117 unsigned long phys, pgprot_t prot) 124set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
118{ 125{
119 pgd_t *pgd; 126 pgd_t *pgd;
120 pud_t *pud; 127 pud_t *pud;
121 pmd_t *pmd; 128 pmd_t *pmd;
122 pte_t *pte, new_pte; 129 pte_t *pte, new_pte;
123 130
124 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); 131 pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
125 132
126 pgd = pgd_offset_k(vaddr); 133 pgd = pgd_offset_k(vaddr);
127 if (pgd_none(*pgd)) { 134 if (pgd_none(*pgd)) {
128 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); 135 printk(KERN_ERR
136 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
129 return; 137 return;
130 } 138 }
131 pud = pud_offset(pgd, vaddr); 139 pud = pud_offset(pgd, vaddr);
132 if (pud_none(*pud)) { 140 if (pud_none(*pud)) {
133 pmd = (pmd_t *) spp_getpage(); 141 pmd = (pmd_t *) spp_getpage();
134 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); 142 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
135 if (pmd != pmd_offset(pud, 0)) { 143 if (pmd != pmd_offset(pud, 0)) {
136 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); 144 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
145 pmd, pmd_offset(pud, 0));
137 return; 146 return;
138 } 147 }
139 } 148 }
@@ -142,7 +151,7 @@ static __init void set_pte_phys(unsigned long vaddr,
142 pte = (pte_t *) spp_getpage(); 151 pte = (pte_t *) spp_getpage();
143 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); 152 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
144 if (pte != pte_offset_kernel(pmd, 0)) { 153 if (pte != pte_offset_kernel(pmd, 0)) {
145 printk("PAGETABLE BUG #02!\n"); 154 printk(KERN_ERR "PAGETABLE BUG #02!\n");
146 return; 155 return;
147 } 156 }
148 } 157 }
@@ -162,33 +171,35 @@ static __init void set_pte_phys(unsigned long vaddr,
162} 171}
163 172
164/* NOTE: this is meant to be run only at boot */ 173/* NOTE: this is meant to be run only at boot */
165void __init 174void __init
166__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) 175__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
167{ 176{
168 unsigned long address = __fix_to_virt(idx); 177 unsigned long address = __fix_to_virt(idx);
169 178
170 if (idx >= __end_of_fixed_addresses) { 179 if (idx >= __end_of_fixed_addresses) {
171 printk("Invalid __set_fixmap\n"); 180 printk(KERN_ERR "Invalid __set_fixmap\n");
172 return; 181 return;
173 } 182 }
174 set_pte_phys(address, phys, prot); 183 set_pte_phys(address, phys, prot);
175} 184}
176 185
177unsigned long __meminitdata table_start, table_end; 186static unsigned long __initdata table_start;
187static unsigned long __meminitdata table_end;
178 188
179static __meminit void *alloc_low_page(unsigned long *phys) 189static __meminit void *alloc_low_page(unsigned long *phys)
180{ 190{
181 unsigned long pfn = table_end++; 191 unsigned long pfn = table_end++;
182 void *adr; 192 void *adr;
183 193
184 if (after_bootmem) { 194 if (after_bootmem) {
185 adr = (void *)get_zeroed_page(GFP_ATOMIC); 195 adr = (void *)get_zeroed_page(GFP_ATOMIC);
186 *phys = __pa(adr); 196 *phys = __pa(adr);
197
187 return adr; 198 return adr;
188 } 199 }
189 200
190 if (pfn >= end_pfn) 201 if (pfn >= end_pfn)
191 panic("alloc_low_page: ran out of memory"); 202 panic("alloc_low_page: ran out of memory");
192 203
193 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 204 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
194 memset(adr, 0, PAGE_SIZE); 205 memset(adr, 0, PAGE_SIZE);
@@ -197,44 +208,49 @@ static __meminit void *alloc_low_page(unsigned long *phys)
197} 208}
198 209
199static __meminit void unmap_low_page(void *adr) 210static __meminit void unmap_low_page(void *adr)
200{ 211{
201
202 if (after_bootmem) 212 if (after_bootmem)
203 return; 213 return;
204 214
205 early_iounmap(adr, PAGE_SIZE); 215 early_iounmap(adr, PAGE_SIZE);
206} 216}
207 217
208/* Must run before zap_low_mappings */ 218/* Must run before zap_low_mappings */
209__meminit void *early_ioremap(unsigned long addr, unsigned long size) 219__meminit void *early_ioremap(unsigned long addr, unsigned long size)
210{ 220{
211 unsigned long vaddr;
212 pmd_t *pmd, *last_pmd; 221 pmd_t *pmd, *last_pmd;
222 unsigned long vaddr;
213 int i, pmds; 223 int i, pmds;
214 224
215 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 225 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
216 vaddr = __START_KERNEL_map; 226 vaddr = __START_KERNEL_map;
217 pmd = level2_kernel_pgt; 227 pmd = level2_kernel_pgt;
218 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; 228 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
229
219 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { 230 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
220 for (i = 0; i < pmds; i++) { 231 for (i = 0; i < pmds; i++) {
221 if (pmd_present(pmd[i])) 232 if (pmd_present(pmd[i]))
222 goto next; 233 goto continue_outer_loop;
223 } 234 }
224 vaddr += addr & ~PMD_MASK; 235 vaddr += addr & ~PMD_MASK;
225 addr &= PMD_MASK; 236 addr &= PMD_MASK;
237
226 for (i = 0; i < pmds; i++, addr += PMD_SIZE) 238 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
227 set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); 239 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
228 __flush_tlb(); 240 __flush_tlb_all();
241
229 return (void *)vaddr; 242 return (void *)vaddr;
230 next: 243continue_outer_loop:
231 ; 244 ;
232 } 245 }
233 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); 246 printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
247
234 return NULL; 248 return NULL;
235} 249}
236 250
237/* To avoid virtual aliases later */ 251/*
252 * To avoid virtual aliases later:
253 */
238__meminit void early_iounmap(void *addr, unsigned long size) 254__meminit void early_iounmap(void *addr, unsigned long size)
239{ 255{
240 unsigned long vaddr; 256 unsigned long vaddr;
@@ -244,9 +260,11 @@ __meminit void early_iounmap(void *addr, unsigned long size)
244 vaddr = (unsigned long)addr; 260 vaddr = (unsigned long)addr;
245 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 261 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
246 pmd = level2_kernel_pgt + pmd_index(vaddr); 262 pmd = level2_kernel_pgt + pmd_index(vaddr);
263
247 for (i = 0; i < pmds; i++) 264 for (i = 0; i < pmds; i++)
248 pmd_clear(pmd + i); 265 pmd_clear(pmd + i);
249 __flush_tlb(); 266
267 __flush_tlb_all();
250} 268}
251 269
252static void __meminit 270static void __meminit
@@ -255,41 +273,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
255 int i = pmd_index(address); 273 int i = pmd_index(address);
256 274
257 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { 275 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
258 unsigned long entry;
259 pmd_t *pmd = pmd_page + pmd_index(address); 276 pmd_t *pmd = pmd_page + pmd_index(address);
260 277
261 if (address >= end) { 278 if (address >= end) {
262 if (!after_bootmem) 279 if (!after_bootmem) {
263 for (; i < PTRS_PER_PMD; i++, pmd++) 280 for (; i < PTRS_PER_PMD; i++, pmd++)
264 set_pmd(pmd, __pmd(0)); 281 set_pmd(pmd, __pmd(0));
282 }
265 break; 283 break;
266 } 284 }
267 285
268 if (pmd_val(*pmd)) 286 if (pmd_val(*pmd))
269 continue; 287 continue;
270 288
271 entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; 289 set_pte((pte_t *)pmd,
272 entry &= __supported_pte_mask; 290 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
273 set_pmd(pmd, __pmd(entry));
274 } 291 }
275} 292}
276 293
277static void __meminit 294static void __meminit
278phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 295phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
279{ 296{
280 pmd_t *pmd = pmd_offset(pud,0); 297 pmd_t *pmd = pmd_offset(pud, 0);
281 spin_lock(&init_mm.page_table_lock); 298 spin_lock(&init_mm.page_table_lock);
282 phys_pmd_init(pmd, address, end); 299 phys_pmd_init(pmd, address, end);
283 spin_unlock(&init_mm.page_table_lock); 300 spin_unlock(&init_mm.page_table_lock);
284 __flush_tlb_all(); 301 __flush_tlb_all();
285} 302}
286 303
287static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 304static void __meminit
288{ 305phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
306{
289 int i = pud_index(addr); 307 int i = pud_index(addr);
290 308
291 309 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
292 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
293 unsigned long pmd_phys; 310 unsigned long pmd_phys;
294 pud_t *pud = pud_page + pud_index(addr); 311 pud_t *pud = pud_page + pud_index(addr);
295 pmd_t *pmd; 312 pmd_t *pmd;
@@ -297,10 +314,11 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
297 if (addr >= end) 314 if (addr >= end)
298 break; 315 break;
299 316
300 if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { 317 if (!after_bootmem &&
301 set_pud(pud, __pud(0)); 318 !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
319 set_pud(pud, __pud(0));
302 continue; 320 continue;
303 } 321 }
304 322
305 if (pud_val(*pud)) { 323 if (pud_val(*pud)) {
306 phys_pmd_update(pud, addr, end); 324 phys_pmd_update(pud, addr, end);
@@ -308,14 +326,16 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
308 } 326 }
309 327
310 pmd = alloc_low_page(&pmd_phys); 328 pmd = alloc_low_page(&pmd_phys);
329
311 spin_lock(&init_mm.page_table_lock); 330 spin_lock(&init_mm.page_table_lock);
312 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 331 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
313 phys_pmd_init(pmd, addr, end); 332 phys_pmd_init(pmd, addr, end);
314 spin_unlock(&init_mm.page_table_lock); 333 spin_unlock(&init_mm.page_table_lock);
334
315 unmap_low_page(pmd); 335 unmap_low_page(pmd);
316 } 336 }
317 __flush_tlb(); 337 __flush_tlb_all();
318} 338}
319 339
320static void __init find_early_table_space(unsigned long end) 340static void __init find_early_table_space(unsigned long end)
321{ 341{
@@ -326,11 +346,13 @@ static void __init find_early_table_space(unsigned long end)
326 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + 346 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
327 round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 347 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
328 348
329 /* RED-PEN putting page tables only on node 0 could 349 /*
330 cause a hotspot and fill up ZONE_DMA. The page tables 350 * RED-PEN putting page tables only on node 0 could
331 need roughly 0.5KB per GB. */ 351 * cause a hotspot and fill up ZONE_DMA. The page tables
332 start = 0x8000; 352 * need roughly 0.5KB per GB.
333 table_start = find_e820_area(start, end, tables); 353 */
354 start = 0x8000;
355 table_start = find_e820_area(start, end, tables, PAGE_SIZE);
334 if (table_start == -1UL) 356 if (table_start == -1UL)
335 panic("Cannot find space for the kernel page tables"); 357 panic("Cannot find space for the kernel page tables");
336 358
@@ -342,20 +364,23 @@ static void __init find_early_table_space(unsigned long end)
342 (table_start << PAGE_SHIFT) + tables); 364 (table_start << PAGE_SHIFT) + tables);
343} 365}
344 366
345/* Setup the direct mapping of the physical memory at PAGE_OFFSET. 367/*
346 This runs before bootmem is initialized and gets pages directly from the 368 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
347 physical memory. To access them they are temporarily mapped. */ 369 * This runs before bootmem is initialized and gets pages directly from
370 * the physical memory. To access them they are temporarily mapped.
371 */
348void __init_refok init_memory_mapping(unsigned long start, unsigned long end) 372void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
349{ 373{
350 unsigned long next; 374 unsigned long next;
351 375
352 Dprintk("init_memory_mapping\n"); 376 pr_debug("init_memory_mapping\n");
353 377
354 /* 378 /*
355 * Find space for the kernel direct mapping tables. 379 * Find space for the kernel direct mapping tables.
356 * Later we should allocate these tables in the local node of the memory 380 *
357 * mapped. Unfortunately this is done currently before the nodes are 381 * Later we should allocate these tables in the local node of the
358 * discovered. 382 * memory mapped. Unfortunately this is done currently before the
383 * nodes are discovered.
359 */ 384 */
360 if (!after_bootmem) 385 if (!after_bootmem)
361 find_early_table_space(end); 386 find_early_table_space(end);
@@ -364,8 +389,8 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
364 end = (unsigned long)__va(end); 389 end = (unsigned long)__va(end);
365 390
366 for (; start < end; start = next) { 391 for (; start < end; start = next) {
367 unsigned long pud_phys;
368 pgd_t *pgd = pgd_offset_k(start); 392 pgd_t *pgd = pgd_offset_k(start);
393 unsigned long pud_phys;
369 pud_t *pud; 394 pud_t *pud;
370 395
371 if (after_bootmem) 396 if (after_bootmem)
@@ -374,23 +399,28 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
374 pud = alloc_low_page(&pud_phys); 399 pud = alloc_low_page(&pud_phys);
375 400
376 next = start + PGDIR_SIZE; 401 next = start + PGDIR_SIZE;
377 if (next > end) 402 if (next > end)
378 next = end; 403 next = end;
379 phys_pud_init(pud, __pa(start), __pa(next)); 404 phys_pud_init(pud, __pa(start), __pa(next));
380 if (!after_bootmem) 405 if (!after_bootmem)
381 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); 406 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
382 unmap_low_page(pud); 407 unmap_low_page(pud);
383 } 408 }
384 409
385 if (!after_bootmem) 410 if (!after_bootmem)
386 mmu_cr4_features = read_cr4(); 411 mmu_cr4_features = read_cr4();
387 __flush_tlb_all(); 412 __flush_tlb_all();
413
414 if (!after_bootmem)
415 reserve_early(table_start << PAGE_SHIFT,
416 table_end << PAGE_SHIFT, "PGTABLE");
388} 417}
389 418
390#ifndef CONFIG_NUMA 419#ifndef CONFIG_NUMA
391void __init paging_init(void) 420void __init paging_init(void)
392{ 421{
393 unsigned long max_zone_pfns[MAX_NR_ZONES]; 422 unsigned long max_zone_pfns[MAX_NR_ZONES];
423
394 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 424 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
395 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 425 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
396 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 426 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -402,40 +432,6 @@ void __init paging_init(void)
402} 432}
403#endif 433#endif
404 434
405/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
406 from the CPU leading to inconsistent cache lines. address and size
407 must be aligned to 2MB boundaries.
408 Does nothing when the mapping doesn't exist. */
409void __init clear_kernel_mapping(unsigned long address, unsigned long size)
410{
411 unsigned long end = address + size;
412
413 BUG_ON(address & ~LARGE_PAGE_MASK);
414 BUG_ON(size & ~LARGE_PAGE_MASK);
415
416 for (; address < end; address += LARGE_PAGE_SIZE) {
417 pgd_t *pgd = pgd_offset_k(address);
418 pud_t *pud;
419 pmd_t *pmd;
420 if (pgd_none(*pgd))
421 continue;
422 pud = pud_offset(pgd, address);
423 if (pud_none(*pud))
424 continue;
425 pmd = pmd_offset(pud, address);
426 if (!pmd || pmd_none(*pmd))
427 continue;
428 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
429 /* Could handle this, but it should not happen currently. */
430 printk(KERN_ERR
431 "clear_kernel_mapping: mapping has been split. will leak memory\n");
432 pmd_ERROR(*pmd);
433 }
434 set_pmd(pmd, __pmd(0));
435 }
436 __flush_tlb_all();
437}
438
439/* 435/*
440 * Memory hotplug specific functions 436 * Memory hotplug specific functions
441 */ 437 */
@@ -461,16 +457,12 @@ int arch_add_memory(int nid, u64 start, u64 size)
461 unsigned long nr_pages = size >> PAGE_SHIFT; 457 unsigned long nr_pages = size >> PAGE_SHIFT;
462 int ret; 458 int ret;
463 459
464 init_memory_mapping(start, (start + size -1)); 460 init_memory_mapping(start, start + size-1);
465 461
466 ret = __add_pages(zone, start_pfn, nr_pages); 462 ret = __add_pages(zone, start_pfn, nr_pages);
467 if (ret) 463 WARN_ON(1);
468 goto error;
469 464
470 return ret; 465 return ret;
471error:
472 printk("%s: Problem encountered in __add_pages!\n", __func__);
473 return ret;
474} 466}
475EXPORT_SYMBOL_GPL(arch_add_memory); 467EXPORT_SYMBOL_GPL(arch_add_memory);
476 468
@@ -484,36 +476,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
484 476
485#endif /* CONFIG_MEMORY_HOTPLUG */ 477#endif /* CONFIG_MEMORY_HOTPLUG */
486 478
487#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 479static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
488/* 480 kcore_modules, kcore_vsyscall;
489 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
490 * just online the pages.
491 */
492int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
493{
494 int err = -EIO;
495 unsigned long pfn;
496 unsigned long total = 0, mem = 0;
497 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
498 if (pfn_valid(pfn)) {
499 online_page(pfn_to_page(pfn));
500 err = 0;
501 mem++;
502 }
503 total++;
504 }
505 if (!err) {
506 z->spanned_pages += total;
507 z->present_pages += mem;
508 z->zone_pgdat->node_spanned_pages += total;
509 z->zone_pgdat->node_present_pages += mem;
510 }
511 return err;
512}
513#endif
514
515static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
516 kcore_vsyscall;
517 481
518void __init mem_init(void) 482void __init mem_init(void)
519{ 483{
@@ -521,8 +485,15 @@ void __init mem_init(void)
521 485
522 pci_iommu_alloc(); 486 pci_iommu_alloc();
523 487
524 /* clear the zero-page */ 488 /* clear_bss() already clear the empty_zero_page */
525 memset(empty_zero_page, 0, PAGE_SIZE); 489
490 /* temporary debugging - double check it's true: */
491 {
492 int i;
493
494 for (i = 0; i < 1024; i++)
495 WARN_ON_ONCE(empty_zero_page[i]);
496 }
526 497
527 reservedpages = 0; 498 reservedpages = 0;
528 499
@@ -534,7 +505,6 @@ void __init mem_init(void)
534#endif 505#endif
535 reservedpages = end_pfn - totalram_pages - 506 reservedpages = end_pfn - totalram_pages -
536 absent_pages_in_range(0, end_pfn); 507 absent_pages_in_range(0, end_pfn);
537
538 after_bootmem = 1; 508 after_bootmem = 1;
539 509
540 codesize = (unsigned long) &_etext - (unsigned long) &_text; 510 codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -542,15 +512,16 @@ void __init mem_init(void)
542 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 512 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
543 513
544 /* Register memory areas for /proc/kcore */ 514 /* Register memory areas for /proc/kcore */
545 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 515 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
546 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 516 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
547 VMALLOC_END-VMALLOC_START); 517 VMALLOC_END-VMALLOC_START);
548 kclist_add(&kcore_kernel, &_stext, _end - _stext); 518 kclist_add(&kcore_kernel, &_stext, _end - _stext);
549 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); 519 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
550 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 520 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
551 VSYSCALL_END - VSYSCALL_START); 521 VSYSCALL_END - VSYSCALL_START);
552 522
553 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", 523 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
524 "%ldk reserved, %ldk data, %ldk init)\n",
554 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 525 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
555 end_pfn << (PAGE_SHIFT-10), 526 end_pfn << (PAGE_SHIFT-10),
556 codesize >> 10, 527 codesize >> 10,
@@ -566,19 +537,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
566 if (begin >= end) 537 if (begin >= end)
567 return; 538 return;
568 539
540 /*
541 * If debugging page accesses then do not free this memory but
542 * mark them not present - any buggy init-section access will
543 * create a kernel page fault:
544 */
545#ifdef CONFIG_DEBUG_PAGEALLOC
546 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
547 begin, PAGE_ALIGN(end));
548 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
549#else
569 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 550 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
551
570 for (addr = begin; addr < end; addr += PAGE_SIZE) { 552 for (addr = begin; addr < end; addr += PAGE_SIZE) {
571 ClearPageReserved(virt_to_page(addr)); 553 ClearPageReserved(virt_to_page(addr));
572 init_page_count(virt_to_page(addr)); 554 init_page_count(virt_to_page(addr));
573 memset((void *)(addr & ~(PAGE_SIZE-1)), 555 memset((void *)(addr & ~(PAGE_SIZE-1)),
574 POISON_FREE_INITMEM, PAGE_SIZE); 556 POISON_FREE_INITMEM, PAGE_SIZE);
575 if (addr >= __START_KERNEL_map)
576 change_page_attr_addr(addr, 1, __pgprot(0));
577 free_page(addr); 557 free_page(addr);
578 totalram_pages++; 558 totalram_pages++;
579 } 559 }
580 if (addr > __START_KERNEL_map) 560#endif
581 global_flush_tlb();
582} 561}
583 562
584void free_initmem(void) 563void free_initmem(void)
@@ -589,6 +568,8 @@ void free_initmem(void)
589} 568}
590 569
591#ifdef CONFIG_DEBUG_RODATA 570#ifdef CONFIG_DEBUG_RODATA
571const int rodata_test_data = 0xC3;
572EXPORT_SYMBOL_GPL(rodata_test_data);
592 573
593void mark_rodata_ro(void) 574void mark_rodata_ro(void)
594{ 575{
@@ -603,25 +584,34 @@ void mark_rodata_ro(void)
603#ifdef CONFIG_KPROBES 584#ifdef CONFIG_KPROBES
604 start = (unsigned long)__start_rodata; 585 start = (unsigned long)__start_rodata;
605#endif 586#endif
606 587
607 end = (unsigned long)__end_rodata; 588 end = (unsigned long)__end_rodata;
608 start = (start + PAGE_SIZE - 1) & PAGE_MASK; 589 start = (start + PAGE_SIZE - 1) & PAGE_MASK;
609 end &= PAGE_MASK; 590 end &= PAGE_MASK;
610 if (end <= start) 591 if (end <= start)
611 return; 592 return;
612 593
613 change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
614 594
615 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 595 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
616 (end - start) >> 10); 596 (end - start) >> 10);
597 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
617 598
618 /* 599 /*
619 * change_page_attr_addr() requires a global_flush_tlb() call after it. 600 * The rodata section (but not the kernel text!) should also be
620 * We do this after the printk so that if something went wrong in the 601 * not-executable.
621 * change, the printk gets out at least to give a better debug hint
622 * of who is the culprit.
623 */ 602 */
624 global_flush_tlb(); 603 start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
604 set_memory_nx(start, (end - start) >> PAGE_SHIFT);
605
606 rodata_test();
607
608#ifdef CONFIG_CPA_DEBUG
609 printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
610 set_memory_rw(start, (end-start) >> PAGE_SHIFT);
611
612 printk(KERN_INFO "Testing CPA: again\n");
613 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
614#endif
625} 615}
626#endif 616#endif
627 617
@@ -632,17 +622,21 @@ void free_initrd_mem(unsigned long start, unsigned long end)
632} 622}
633#endif 623#endif
634 624
635void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 625void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
636{ 626{
637#ifdef CONFIG_NUMA 627#ifdef CONFIG_NUMA
638 int nid = phys_to_nid(phys); 628 int nid = phys_to_nid(phys);
639#endif 629#endif
640 unsigned long pfn = phys >> PAGE_SHIFT; 630 unsigned long pfn = phys >> PAGE_SHIFT;
631
641 if (pfn >= end_pfn) { 632 if (pfn >= end_pfn) {
642 /* This can happen with kdump kernels when accessing firmware 633 /*
643 tables. */ 634 * This can happen with kdump kernels when accessing
635 * firmware tables:
636 */
644 if (pfn < end_pfn_map) 637 if (pfn < end_pfn_map)
645 return; 638 return;
639
646 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", 640 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
647 phys, len); 641 phys, len);
648 return; 642 return;
@@ -650,9 +644,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
650 644
651 /* Should check here against the e820 map to avoid double free */ 645 /* Should check here against the e820 map to avoid double free */
652#ifdef CONFIG_NUMA 646#ifdef CONFIG_NUMA
653 reserve_bootmem_node(NODE_DATA(nid), phys, len); 647 reserve_bootmem_node(NODE_DATA(nid), phys, len);
654#else 648#else
655 reserve_bootmem(phys, len); 649 reserve_bootmem(phys, len);
656#endif 650#endif
657 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { 651 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
658 dma_reserve += len / PAGE_SIZE; 652 dma_reserve += len / PAGE_SIZE;
@@ -660,46 +654,49 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
660 } 654 }
661} 655}
662 656
663int kern_addr_valid(unsigned long addr) 657int kern_addr_valid(unsigned long addr)
664{ 658{
665 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; 659 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
666 pgd_t *pgd; 660 pgd_t *pgd;
667 pud_t *pud; 661 pud_t *pud;
668 pmd_t *pmd; 662 pmd_t *pmd;
669 pte_t *pte; 663 pte_t *pte;
670 664
671 if (above != 0 && above != -1UL) 665 if (above != 0 && above != -1UL)
672 return 0; 666 return 0;
673 667
674 pgd = pgd_offset_k(addr); 668 pgd = pgd_offset_k(addr);
675 if (pgd_none(*pgd)) 669 if (pgd_none(*pgd))
676 return 0; 670 return 0;
677 671
678 pud = pud_offset(pgd, addr); 672 pud = pud_offset(pgd, addr);
679 if (pud_none(*pud)) 673 if (pud_none(*pud))
680 return 0; 674 return 0;
681 675
682 pmd = pmd_offset(pud, addr); 676 pmd = pmd_offset(pud, addr);
683 if (pmd_none(*pmd)) 677 if (pmd_none(*pmd))
684 return 0; 678 return 0;
679
685 if (pmd_large(*pmd)) 680 if (pmd_large(*pmd))
686 return pfn_valid(pmd_pfn(*pmd)); 681 return pfn_valid(pmd_pfn(*pmd));
687 682
688 pte = pte_offset_kernel(pmd, addr); 683 pte = pte_offset_kernel(pmd, addr);
689 if (pte_none(*pte)) 684 if (pte_none(*pte))
690 return 0; 685 return 0;
686
691 return pfn_valid(pte_pfn(*pte)); 687 return pfn_valid(pte_pfn(*pte));
692} 688}
693 689
694/* A pseudo VMA to allow ptrace access for the vsyscall page. This only 690/*
695 covers the 64bit vsyscall page now. 32bit has a real VMA now and does 691 * A pseudo VMA to allow ptrace access for the vsyscall page. This only
696 not need special handling anymore. */ 692 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
697 693 * not need special handling anymore:
694 */
698static struct vm_area_struct gate_vma = { 695static struct vm_area_struct gate_vma = {
699 .vm_start = VSYSCALL_START, 696 .vm_start = VSYSCALL_START,
700 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), 697 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
701 .vm_page_prot = PAGE_READONLY_EXEC, 698 .vm_page_prot = PAGE_READONLY_EXEC,
702 .vm_flags = VM_READ | VM_EXEC 699 .vm_flags = VM_READ | VM_EXEC
703}; 700};
704 701
705struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 702struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
@@ -714,14 +711,17 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
714int in_gate_area(struct task_struct *task, unsigned long addr) 711int in_gate_area(struct task_struct *task, unsigned long addr)
715{ 712{
716 struct vm_area_struct *vma = get_gate_vma(task); 713 struct vm_area_struct *vma = get_gate_vma(task);
714
717 if (!vma) 715 if (!vma)
718 return 0; 716 return 0;
717
719 return (addr >= vma->vm_start) && (addr < vma->vm_end); 718 return (addr >= vma->vm_start) && (addr < vma->vm_end);
720} 719}
721 720
722/* Use this when you have no reliable task/vma, typically from interrupt 721/*
723 * context. It is less reliable than using the task's vma and may give 722 * Use this when you have no reliable task/vma, typically from interrupt
724 * false positives. 723 * context. It is less reliable than using the task's vma and may give
724 * false positives:
725 */ 725 */
726int in_gate_area_no_task(unsigned long addr) 726int in_gate_area_no_task(unsigned long addr)
727{ 727{
@@ -741,8 +741,8 @@ const char *arch_vma_name(struct vm_area_struct *vma)
741/* 741/*
742 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 742 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
743 */ 743 */
744int __meminit vmemmap_populate(struct page *start_page, 744int __meminit
745 unsigned long size, int node) 745vmemmap_populate(struct page *start_page, unsigned long size, int node)
746{ 746{
747 unsigned long addr = (unsigned long)start_page; 747 unsigned long addr = (unsigned long)start_page;
748 unsigned long end = (unsigned long)(start_page + size); 748 unsigned long end = (unsigned long)(start_page + size);
@@ -757,6 +757,7 @@ int __meminit vmemmap_populate(struct page *start_page,
757 pgd = vmemmap_pgd_populate(addr, node); 757 pgd = vmemmap_pgd_populate(addr, node);
758 if (!pgd) 758 if (!pgd)
759 return -ENOMEM; 759 return -ENOMEM;
760
760 pud = vmemmap_pud_populate(pgd, addr, node); 761 pud = vmemmap_pud_populate(pgd, addr, node);
761 if (!pud) 762 if (!pud)
762 return -ENOMEM; 763 return -ENOMEM;
@@ -764,20 +765,22 @@ int __meminit vmemmap_populate(struct page *start_page,
764 pmd = pmd_offset(pud, addr); 765 pmd = pmd_offset(pud, addr);
765 if (pmd_none(*pmd)) { 766 if (pmd_none(*pmd)) {
766 pte_t entry; 767 pte_t entry;
767 void *p = vmemmap_alloc_block(PMD_SIZE, node); 768 void *p;
769
770 p = vmemmap_alloc_block(PMD_SIZE, node);
768 if (!p) 771 if (!p)
769 return -ENOMEM; 772 return -ENOMEM;
770 773
771 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 774 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
772 mk_pte_huge(entry); 775 PAGE_KERNEL_LARGE);
773 set_pmd(pmd, __pmd(pte_val(entry))); 776 set_pmd(pmd, __pmd(pte_val(entry)));
774 777
775 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", 778 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
776 addr, addr + PMD_SIZE - 1, p, node); 779 addr, addr + PMD_SIZE - 1, p, node);
777 } else 780 } else {
778 vmemmap_verify((pte_t *)pmd, node, addr, next); 781 vmemmap_verify((pte_t *)pmd, node, addr, next);
782 }
779 } 783 }
780
781 return 0; 784 return 0;
782} 785}
783#endif 786#endif
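Aside (not part of the patch): the PMD-count expression used by early_ioremap()/early_iounmap() in the init_64.c hunks above, ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE, simply rounds an arbitrary byte range up to whole 2 MB PMD entries. A minimal stand-alone sketch, assuming the usual x86-64 constants:

/* Sketch only: verify the PMD rounding arithmetic from early_ioremap(). */
#include <assert.h>

#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)	/* 2 MB */
#define PMD_MASK	(~(PMD_SIZE - 1))

static unsigned long pmds_for(unsigned long addr, unsigned long size)
{
	/* offset into the first PMD plus length, rounded up to PMD_SIZE */
	return ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
}

int main(void)
{
	/* 3 MB starting 4 KB into a 2 MB frame spans exactly two PMDs */
	assert(pmds_for(0x1000, 0x300000) == 2);
	/* a fully aligned 2 MB request needs just one */
	assert(pmds_for(0x200000, 0x200000) == 1);
	return 0;
}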
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
new file mode 100644
index 00000000000..ee6648fe6b1
--- /dev/null
+++ b/arch/x86/mm/ioremap.c
@@ -0,0 +1,485 @@
1/*
2 * Re-map IO memory to kernel address space so that we can access it.
3 * This is needed for high PCI addresses that aren't mapped in the
4 * 640k-1MB IO memory area on PC's
5 *
6 * (C) Copyright 1995 1996 Linus Torvalds
7 */
8
9#include <linux/bootmem.h>
10#include <linux/init.h>
11#include <linux/io.h>
12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/vmalloc.h>
15
16#include <asm/cacheflush.h>
17#include <asm/e820.h>
18#include <asm/fixmap.h>
19#include <asm/pgtable.h>
20#include <asm/tlbflush.h>
21#include <asm/pgalloc.h>
22
23enum ioremap_mode {
24 IOR_MODE_UNCACHED,
25 IOR_MODE_CACHED,
26};
27
28#ifdef CONFIG_X86_64
29
30unsigned long __phys_addr(unsigned long x)
31{
32 if (x >= __START_KERNEL_map)
33 return x - __START_KERNEL_map + phys_base;
34 return x - PAGE_OFFSET;
35}
36EXPORT_SYMBOL(__phys_addr);
37
38#endif
39
40int page_is_ram(unsigned long pagenr)
41{
42 unsigned long addr, end;
43 int i;
44
45 for (i = 0; i < e820.nr_map; i++) {
46 /*
47 * Not usable memory:
48 */
49 if (e820.map[i].type != E820_RAM)
50 continue;
51 addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
52 end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
53
54 /*
55 * Sanity check: Some BIOSen report areas as RAM that
56 * are not. Notably the 640->1Mb area, which is the
57 * PCI BIOS area.
58 */
59 if (addr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
60 end < (BIOS_END >> PAGE_SHIFT))
61 continue;
62
63 if ((pagenr >= addr) && (pagenr < end))
64 return 1;
65 }
66 return 0;
67}
68
69/*
70 * Fix up the linear direct mapping of the kernel to avoid cache attribute
71 * conflicts.
72 */
73static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
74 enum ioremap_mode mode)
75{
76 unsigned long nrpages = size >> PAGE_SHIFT;
77 int err;
78
79 switch (mode) {
80 case IOR_MODE_UNCACHED:
81 default:
82 err = set_memory_uc(vaddr, nrpages);
83 break;
84 case IOR_MODE_CACHED:
85 err = set_memory_wb(vaddr, nrpages);
86 break;
87 }
88
89 return err;
90}
91
92/*
93 * Remap an arbitrary physical address space into the kernel virtual
94 * address space. Needed when the kernel wants to access high addresses
95 * directly.
96 *
97 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
98 * have to convert them into an offset in a page-aligned mapping, but the
99 * caller shouldn't need to know that small detail.
100 */
101static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
102 enum ioremap_mode mode)
103{
104 unsigned long pfn, offset, last_addr, vaddr;
105 struct vm_struct *area;
106 pgprot_t prot;
107
108 /* Don't allow wraparound or zero size */
109 last_addr = phys_addr + size - 1;
110 if (!size || last_addr < phys_addr)
111 return NULL;
112
113 /*
114 * Don't remap the low PCI/ISA area, it's always mapped..
115 */
116 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
117 return (__force void __iomem *)phys_to_virt(phys_addr);
118
119 /*
120 * Don't allow anybody to remap normal RAM that we're using..
121 */
122 for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped &&
123 (pfn << PAGE_SHIFT) < last_addr; pfn++) {
124 if (page_is_ram(pfn) && pfn_valid(pfn) &&
125 !PageReserved(pfn_to_page(pfn)))
126 return NULL;
127 }
128
129 switch (mode) {
130 case IOR_MODE_UNCACHED:
131 default:
132 prot = PAGE_KERNEL_NOCACHE;
133 break;
134 case IOR_MODE_CACHED:
135 prot = PAGE_KERNEL;
136 break;
137 }
138
139 /*
140 * Mappings have to be page-aligned
141 */
142 offset = phys_addr & ~PAGE_MASK;
143 phys_addr &= PAGE_MASK;
144 size = PAGE_ALIGN(last_addr+1) - phys_addr;
145
146 /*
147 * Ok, go for it..
148 */
149 area = get_vm_area(size, VM_IOREMAP);
150 if (!area)
151 return NULL;
152 area->phys_addr = phys_addr;
153 vaddr = (unsigned long) area->addr;
154 if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
155 remove_vm_area((void *)(vaddr & PAGE_MASK));
156 return NULL;
157 }
158
159 if (ioremap_change_attr(vaddr, size, mode) < 0) {
160 vunmap(area->addr);
161 return NULL;
162 }
163
164 return (void __iomem *) (vaddr + offset);
165}
166
167/**
168 * ioremap_nocache - map bus memory into CPU space
169 * @offset: bus address of the memory
170 * @size: size of the resource to map
171 *
172 * ioremap_nocache performs a platform specific sequence of operations to
173 * make bus memory CPU accessible via the readb/readw/readl/writeb/
174 * writew/writel functions and the other mmio helpers. The returned
175 * address is not guaranteed to be usable directly as a virtual
176 * address.
177 *
178 * This version of ioremap ensures that the memory is marked uncachable
179 * on the CPU as well as honouring existing caching rules from things like
180 * the PCI bus. Note that there are other caches and buffers on many
181 * busses. In particular driver authors should read up on PCI writes
182 *
183 * It's useful if some control registers are in such an area and
184 * write combining or read caching is not desirable:
185 *
186 * Must be freed with iounmap.
187 */
188void __iomem *ioremap_nocache(unsigned long phys_addr, unsigned long size)
189{
190 return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
191}
192EXPORT_SYMBOL(ioremap_nocache);
193
194void __iomem *ioremap_cache(unsigned long phys_addr, unsigned long size)
195{
196 return __ioremap(phys_addr, size, IOR_MODE_CACHED);
197}
198EXPORT_SYMBOL(ioremap_cache);
199
200/**
201 * iounmap - Free a IO remapping
202 * @addr: virtual address from ioremap_*
203 *
204 * Caller must ensure there is only one unmapping for the same pointer.
205 */
206void iounmap(volatile void __iomem *addr)
207{
208 struct vm_struct *p, *o;
209
210 if ((void __force *)addr <= high_memory)
211 return;
212
213 /*
214 * __ioremap special-cases the PCI/ISA range by not instantiating a
215 * vm_area and by simply returning an address into the kernel mapping
216 * of ISA space. So handle that here.
217 */
218 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
219 addr < phys_to_virt(ISA_END_ADDRESS))
220 return;
221
222 addr = (volatile void __iomem *)
223 (PAGE_MASK & (unsigned long __force)addr);
224
225 /* Use the vm area unlocked, assuming the caller
226 ensures there isn't another iounmap for the same address
227 in parallel. Reuse of the virtual address is prevented by
228 leaving it in the global lists until we're done with it.
229 cpa takes care of the direct mappings. */
230 read_lock(&vmlist_lock);
231 for (p = vmlist; p; p = p->next) {
232 if (p->addr == addr)
233 break;
234 }
235 read_unlock(&vmlist_lock);
236
237 if (!p) {
238 printk(KERN_ERR "iounmap: bad address %p\n", addr);
239 dump_stack();
240 return;
241 }
242
243 /* Finally remove it */
244 o = remove_vm_area((void *)addr);
245 BUG_ON(p != o || o == NULL);
246 kfree(p);
247}
248EXPORT_SYMBOL(iounmap);
249
250#ifdef CONFIG_X86_32
251
252int __initdata early_ioremap_debug;
253
254static int __init early_ioremap_debug_setup(char *str)
255{
256 early_ioremap_debug = 1;
257
258 return 0;
259}
260early_param("early_ioremap_debug", early_ioremap_debug_setup);
261
262static __initdata int after_paging_init;
263static __initdata unsigned long bm_pte[1024]
264 __attribute__((aligned(PAGE_SIZE)));
265
266static inline unsigned long * __init early_ioremap_pgd(unsigned long addr)
267{
268 return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023);
269}
270
271static inline unsigned long * __init early_ioremap_pte(unsigned long addr)
272{
273 return bm_pte + ((addr >> PAGE_SHIFT) & 1023);
274}
275
276void __init early_ioremap_init(void)
277{
278 unsigned long *pgd;
279
280 if (early_ioremap_debug)
281 printk(KERN_INFO "early_ioremap_init()\n");
282
283 pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
284 *pgd = __pa(bm_pte) | _PAGE_TABLE;
285 memset(bm_pte, 0, sizeof(bm_pte));
286 /*
287 * The boot-ioremap range spans multiple pgds, for which
288 * we are not prepared:
289 */
290 if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) {
291 WARN_ON(1);
292 printk(KERN_WARNING "pgd %p != %p\n",
293 pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END)));
294 printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
295 fix_to_virt(FIX_BTMAP_BEGIN));
296 printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
297 fix_to_virt(FIX_BTMAP_END));
298
299 printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
300 printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
301 FIX_BTMAP_BEGIN);
302 }
303}
304
305void __init early_ioremap_clear(void)
306{
307 unsigned long *pgd;
308
309 if (early_ioremap_debug)
310 printk(KERN_INFO "early_ioremap_clear()\n");
311
312 pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
313 *pgd = 0;
314 paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT);
315 __flush_tlb_all();
316}
317
318void __init early_ioremap_reset(void)
319{
320 enum fixed_addresses idx;
321 unsigned long *pte, phys, addr;
322
323 after_paging_init = 1;
324 for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
325 addr = fix_to_virt(idx);
326 pte = early_ioremap_pte(addr);
327 if (*pte & _PAGE_PRESENT) {
328 phys = *pte & PAGE_MASK;
329 set_fixmap(idx, phys);
330 }
331 }
332}
333
334static void __init __early_set_fixmap(enum fixed_addresses idx,
335 unsigned long phys, pgprot_t flags)
336{
337 unsigned long *pte, addr = __fix_to_virt(idx);
338
339 if (idx >= __end_of_fixed_addresses) {
340 BUG();
341 return;
342 }
343 pte = early_ioremap_pte(addr);
344 if (pgprot_val(flags))
345 *pte = (phys & PAGE_MASK) | pgprot_val(flags);
346 else
347 *pte = 0;
348 __flush_tlb_one(addr);
349}
350
351static inline void __init early_set_fixmap(enum fixed_addresses idx,
352 unsigned long phys)
353{
354 if (after_paging_init)
355 set_fixmap(idx, phys);
356 else
357 __early_set_fixmap(idx, phys, PAGE_KERNEL);
358}
359
360static inline void __init early_clear_fixmap(enum fixed_addresses idx)
361{
362 if (after_paging_init)
363 clear_fixmap(idx);
364 else
365 __early_set_fixmap(idx, 0, __pgprot(0));
366}
367
368
369int __initdata early_ioremap_nested;
370
371static int __init check_early_ioremap_leak(void)
372{
373 if (!early_ioremap_nested)
374 return 0;
375
376 printk(KERN_WARNING
377 "Debug warning: early ioremap leak of %d areas detected.\n",
378 early_ioremap_nested);
379 printk(KERN_WARNING
380 "please boot with early_ioremap_debug and report the dmesg.\n");
381 WARN_ON(1);
382
383 return 1;
384}
385late_initcall(check_early_ioremap_leak);
386
387void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
388{
389 unsigned long offset, last_addr;
390 unsigned int nrpages, nesting;
391 enum fixed_addresses idx0, idx;
392
393 WARN_ON(system_state != SYSTEM_BOOTING);
394
395 nesting = early_ioremap_nested;
396 if (early_ioremap_debug) {
397 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
398 phys_addr, size, nesting);
399 dump_stack();
400 }
401
402 /* Don't allow wraparound or zero size */
403 last_addr = phys_addr + size - 1;
404 if (!size || last_addr < phys_addr) {
405 WARN_ON(1);
406 return NULL;
407 }
408
409 if (nesting >= FIX_BTMAPS_NESTING) {
410 WARN_ON(1);
411 return NULL;
412 }
413 early_ioremap_nested++;
414 /*
415 * Mappings have to be page-aligned
416 */
417 offset = phys_addr & ~PAGE_MASK;
418 phys_addr &= PAGE_MASK;
419 size = PAGE_ALIGN(last_addr) - phys_addr;
420
421 /*
422 * Mappings have to fit in the FIX_BTMAP area.
423 */
424 nrpages = size >> PAGE_SHIFT;
425 if (nrpages > NR_FIX_BTMAPS) {
426 WARN_ON(1);
427 return NULL;
428 }
429
430 /*
431 * Ok, go for it..
432 */
433 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
434 idx = idx0;
435 while (nrpages > 0) {
436 early_set_fixmap(idx, phys_addr);
437 phys_addr += PAGE_SIZE;
438 --idx;
439 --nrpages;
440 }
441 if (early_ioremap_debug)
442 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
443
444 return (void *) (offset + fix_to_virt(idx0));
445}
446
447void __init early_iounmap(void *addr, unsigned long size)
448{
449 unsigned long virt_addr;
450 unsigned long offset;
451 unsigned int nrpages;
452 enum fixed_addresses idx;
453 unsigned int nesting;
454
455 nesting = --early_ioremap_nested;
456 WARN_ON(nesting < 0);
457
458 if (early_ioremap_debug) {
459 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
460 size, nesting);
461 dump_stack();
462 }
463
464 virt_addr = (unsigned long)addr;
465 if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
466 WARN_ON(1);
467 return;
468 }
469 offset = virt_addr & ~PAGE_MASK;
470 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
471
472 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
473 while (nrpages > 0) {
474 early_clear_fixmap(idx);
475 --idx;
476 --nrpages;
477 }
478}
479
480void __this_fixmap_does_not_exist(void)
481{
482 WARN_ON(1);
483}
484
485#endif /* CONFIG_X86_32 */
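The kerneldoc in the new ioremap.c above describes the driver-facing contract of ioremap_nocache()/iounmap(). A minimal usage sketch follows; the physical address, window size and register offset are made-up placeholders, not values from the patch:

/* Hypothetical driver fragment: map an MMIO window uncached, poke a
 * register, then tear the mapping down again. */
#include <linux/io.h>
#include <linux/errno.h>

static void __iomem *regs;

static int example_probe(void)
{
	regs = ioremap_nocache(0xfebf0000UL, 0x1000);	/* phys addr, size */
	if (!regs)
		return -ENOMEM;

	writel(0x1, regs + 0x10);	/* device-specific register (assumed) */
	(void)readl(regs + 0x10);	/* read back to flush the posted write */
	return 0;
}

static void example_remove(void)
{
	iounmap(regs);			/* exactly one unmap per mapping */
}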
diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c
deleted file mode 100644
index 0b278315d73..00000000000
--- a/arch/x86/mm/ioremap_32.c
+++ /dev/null
@@ -1,274 +0,0 @@
1/*
2 * arch/i386/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PC's
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/io.h>
16#include <asm/fixmap.h>
17#include <asm/cacheflush.h>
18#include <asm/tlbflush.h>
19#include <asm/pgtable.h>
20
21#define ISA_START_ADDRESS 0xa0000
22#define ISA_END_ADDRESS 0x100000
23
24/*
25 * Generic mapping function (not visible outside):
26 */
27
28/*
29 * Remap an arbitrary physical address space into the kernel virtual
30 * address space. Needed when the kernel wants to access high addresses
31 * directly.
32 *
33 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
34 * have to convert them into an offset in a page-aligned mapping, but the
35 * caller shouldn't need to know that small detail.
36 */
37void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
38{
39 void __iomem * addr;
40 struct vm_struct * area;
41 unsigned long offset, last_addr;
42 pgprot_t prot;
43
44 /* Don't allow wraparound or zero size */
45 last_addr = phys_addr + size - 1;
46 if (!size || last_addr < phys_addr)
47 return NULL;
48
49 /*
50 * Don't remap the low PCI/ISA area, it's always mapped..
51 */
52 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
53 return (void __iomem *) phys_to_virt(phys_addr);
54
55 /*
56 * Don't allow anybody to remap normal RAM that we're using..
57 */
58 if (phys_addr <= virt_to_phys(high_memory - 1)) {
59 char *t_addr, *t_end;
60 struct page *page;
61
62 t_addr = __va(phys_addr);
63 t_end = t_addr + (size - 1);
64
65 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
66 if(!PageReserved(page))
67 return NULL;
68 }
69
70 prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
71 | _PAGE_ACCESSED | flags);
72
73 /*
74 * Mappings have to be page-aligned
75 */
76 offset = phys_addr & ~PAGE_MASK;
77 phys_addr &= PAGE_MASK;
78 size = PAGE_ALIGN(last_addr+1) - phys_addr;
79
80 /*
81 * Ok, go for it..
82 */
83 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
84 if (!area)
85 return NULL;
86 area->phys_addr = phys_addr;
87 addr = (void __iomem *) area->addr;
88 if (ioremap_page_range((unsigned long) addr,
89 (unsigned long) addr + size, phys_addr, prot)) {
90 vunmap((void __force *) addr);
91 return NULL;
92 }
93 return (void __iomem *) (offset + (char __iomem *)addr);
94}
95EXPORT_SYMBOL(__ioremap);
96
97/**
98 * ioremap_nocache - map bus memory into CPU space
99 * @offset: bus address of the memory
100 * @size: size of the resource to map
101 *
102 * ioremap_nocache performs a platform specific sequence of operations to
103 * make bus memory CPU accessible via the readb/readw/readl/writeb/
104 * writew/writel functions and the other mmio helpers. The returned
105 * address is not guaranteed to be usable directly as a virtual
106 * address.
107 *
108 * This version of ioremap ensures that the memory is marked uncachable
109 * on the CPU as well as honouring existing caching rules from things like
110 * the PCI bus. Note that there are other caches and buffers on many
111 * busses. In particular driver authors should read up on PCI writes
112 *
113 * It's useful if some control registers are in such an area and
114 * write combining or read caching is not desirable:
115 *
116 * Must be freed with iounmap.
117 */
118
119void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
120{
121 unsigned long last_addr;
122 void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
123 if (!p)
124 return p;
125
126 /* Guaranteed to be > phys_addr, as per __ioremap() */
127 last_addr = phys_addr + size - 1;
128
129 if (last_addr < virt_to_phys(high_memory) - 1) {
130 struct page *ppage = virt_to_page(__va(phys_addr));
131 unsigned long npages;
132
133 phys_addr &= PAGE_MASK;
134
135 /* This might overflow and become zero.. */
136 last_addr = PAGE_ALIGN(last_addr);
137
138 /* .. but that's ok, because modulo-2**n arithmetic will make
139 * the page-aligned "last - first" come out right.
140 */
141 npages = (last_addr - phys_addr) >> PAGE_SHIFT;
142
143 if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
144 iounmap(p);
145 p = NULL;
146 }
147 global_flush_tlb();
148 }
149
150 return p;
151}
152EXPORT_SYMBOL(ioremap_nocache);
153
154/**
155 * iounmap - Free a IO remapping
156 * @addr: virtual address from ioremap_*
157 *
158 * Caller must ensure there is only one unmapping for the same pointer.
159 */
160void iounmap(volatile void __iomem *addr)
161{
162 struct vm_struct *p, *o;
163
164 if ((void __force *)addr <= high_memory)
165 return;
166
167 /*
168 * __ioremap special-cases the PCI/ISA range by not instantiating a
169 * vm_area and by simply returning an address into the kernel mapping
170 * of ISA space. So handle that here.
171 */
172 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
173 addr < phys_to_virt(ISA_END_ADDRESS))
174 return;
175
176 addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
177
178 /* Use the vm area unlocked, assuming the caller
179 ensures there isn't another iounmap for the same address
180 in parallel. Reuse of the virtual address is prevented by
181 leaving it in the global lists until we're done with it.
182 cpa takes care of the direct mappings. */
183 read_lock(&vmlist_lock);
184 for (p = vmlist; p; p = p->next) {
185 if (p->addr == addr)
186 break;
187 }
188 read_unlock(&vmlist_lock);
189
190 if (!p) {
191 printk("iounmap: bad address %p\n", addr);
192 dump_stack();
193 return;
194 }
195
196 /* Reset the direct mapping. Can block */
197 if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
198 change_page_attr(virt_to_page(__va(p->phys_addr)),
199 get_vm_area_size(p) >> PAGE_SHIFT,
200 PAGE_KERNEL);
201 global_flush_tlb();
202 }
203
204 /* Finally remove it */
205 o = remove_vm_area((void *)addr);
206 BUG_ON(p != o || o == NULL);
207 kfree(p);
208}
209EXPORT_SYMBOL(iounmap);
210
211void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
212{
213 unsigned long offset, last_addr;
214 unsigned int nrpages;
215 enum fixed_addresses idx;
216
217 /* Don't allow wraparound or zero size */
218 last_addr = phys_addr + size - 1;
219 if (!size || last_addr < phys_addr)
220 return NULL;
221
222 /*
223 * Don't remap the low PCI/ISA area, it's always mapped..
224 */
225 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
226 return phys_to_virt(phys_addr);
227
228 /*
229 * Mappings have to be page-aligned
230 */
231 offset = phys_addr & ~PAGE_MASK;
232 phys_addr &= PAGE_MASK;
233 size = PAGE_ALIGN(last_addr) - phys_addr;
234
235 /*
236 * Mappings have to fit in the FIX_BTMAP area.
237 */
238 nrpages = size >> PAGE_SHIFT;
239 if (nrpages > NR_FIX_BTMAPS)
240 return NULL;
241
242 /*
243 * Ok, go for it..
244 */
245 idx = FIX_BTMAP_BEGIN;
246 while (nrpages > 0) {
247 set_fixmap(idx, phys_addr);
248 phys_addr += PAGE_SIZE;
249 --idx;
250 --nrpages;
251 }
252 return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
253}
254
255void __init bt_iounmap(void *addr, unsigned long size)
256{
257 unsigned long virt_addr;
258 unsigned long offset;
259 unsigned int nrpages;
260 enum fixed_addresses idx;
261
262 virt_addr = (unsigned long)addr;
263 if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
264 return;
265 offset = virt_addr & ~PAGE_MASK;
266 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
267
268 idx = FIX_BTMAP_BEGIN;
269 while (nrpages > 0) {
270 clear_fixmap(idx);
271 --idx;
272 --nrpages;
273 }
274}
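The bt_ioremap()/bt_iounmap() helpers deleted above are superseded by the fixmap-based early_ioremap()/early_iounmap() in the new ioremap.c. A hedged sketch of the boot-time usage pattern; the firmware-table address and length are illustrative only:

/* Boot-time fragment: temporarily map a firmware table before the real
 * ioremap() machinery is up, read it, and balance the mapping. */
#include <linux/init.h>
#include <linux/io.h>

static void __init example_early_use(void)
{
	void *tbl;

	tbl = early_ioremap(0x000f5a40UL, 64);	/* phys addr, length */
	if (!tbl)
		return;

	/* ... inspect the table contents here ... */

	early_iounmap(tbl, 64);	/* must balance the map; nesting is capped
				 * at FIX_BTMAPS_NESTING levels */
}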
diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c
deleted file mode 100644
index 6cac90aa503..00000000000
--- a/arch/x86/mm/ioremap_64.c
+++ /dev/null
@@ -1,210 +0,0 @@
1/*
2 * arch/x86_64/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PC's
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17#include <asm/pgalloc.h>
18#include <asm/fixmap.h>
19#include <asm/tlbflush.h>
20#include <asm/cacheflush.h>
21#include <asm/proto.h>
22
23unsigned long __phys_addr(unsigned long x)
24{
25 if (x >= __START_KERNEL_map)
26 return x - __START_KERNEL_map + phys_base;
27 return x - PAGE_OFFSET;
28}
29EXPORT_SYMBOL(__phys_addr);
30
31#define ISA_START_ADDRESS 0xa0000
32#define ISA_END_ADDRESS 0x100000
33
34/*
35 * Fix up the linear direct mapping of the kernel to avoid cache attribute
36 * conflicts.
37 */
38static int
39ioremap_change_attr(unsigned long phys_addr, unsigned long size,
40 unsigned long flags)
41{
42 int err = 0;
43 if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
44 unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
45 unsigned long vaddr = (unsigned long) __va(phys_addr);
46
47 /*
48 * Must use a address here and not struct page because the phys addr
49 * can be a in hole between nodes and not have an memmap entry.
50 */
51 err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
52 if (!err)
53 global_flush_tlb();
54 }
55 return err;
56}
57
58/*
59 * Generic mapping function
60 */
61
62/*
63 * Remap an arbitrary physical address space into the kernel virtual
64 * address space. Needed when the kernel wants to access high addresses
65 * directly.
66 *
67 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
68 * have to convert them into an offset in a page-aligned mapping, but the
69 * caller shouldn't need to know that small detail.
70 */
71void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
72{
73 void * addr;
74 struct vm_struct * area;
75 unsigned long offset, last_addr;
76 pgprot_t pgprot;
77
78 /* Don't allow wraparound or zero size */
79 last_addr = phys_addr + size - 1;
80 if (!size || last_addr < phys_addr)
81 return NULL;
82
83 /*
84 * Don't remap the low PCI/ISA area, it's always mapped..
85 */
86 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
87 return (__force void __iomem *)phys_to_virt(phys_addr);
88
89#ifdef CONFIG_FLATMEM
90 /*
91 * Don't allow anybody to remap normal RAM that we're using..
92 */
93 if (last_addr < virt_to_phys(high_memory)) {
94 char *t_addr, *t_end;
95 struct page *page;
96
97 t_addr = __va(phys_addr);
98 t_end = t_addr + (size - 1);
99
100 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
101 if(!PageReserved(page))
102 return NULL;
103 }
104#endif
105
106 pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
107 | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
108 /*
109 * Mappings have to be page-aligned
110 */
111 offset = phys_addr & ~PAGE_MASK;
112 phys_addr &= PAGE_MASK;
113 size = PAGE_ALIGN(last_addr+1) - phys_addr;
114
115 /*
116 * Ok, go for it..
117 */
118 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
119 if (!area)
120 return NULL;
121 area->phys_addr = phys_addr;
122 addr = area->addr;
123 if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
124 phys_addr, pgprot)) {
125 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
126 return NULL;
127 }
128 if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
129 area->flags &= 0xffffff;
130 vunmap(addr);
131 return NULL;
132 }
133 return (__force void __iomem *) (offset + (char *)addr);
134}
135EXPORT_SYMBOL(__ioremap);
136
137/**
138 * ioremap_nocache - map bus memory into CPU space
139 * @offset: bus address of the memory
140 * @size: size of the resource to map
141 *
142 * ioremap_nocache performs a platform specific sequence of operations to
143 * make bus memory CPU accessible via the readb/readw/readl/writeb/
144 * writew/writel functions and the other mmio helpers. The returned
145 * address is not guaranteed to be usable directly as a virtual
146 * address.
147 *
148 * This version of ioremap ensures that the memory is marked uncachable
149 * on the CPU as well as honouring existing caching rules from things like
150 * the PCI bus. Note that there are other caches and buffers on many
151 * busses. In particular driver authors should read up on PCI writes
152 *
153 * It's useful if some control registers are in such an area and
154 * write combining or read caching is not desirable:
155 *
156 * Must be freed with iounmap.
157 */
158
159void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
160{
161 return __ioremap(phys_addr, size, _PAGE_PCD);
162}
163EXPORT_SYMBOL(ioremap_nocache);
164
165/**
166 * iounmap - Free a IO remapping
167 * @addr: virtual address from ioremap_*
168 *
169 * Caller must ensure there is only one unmapping for the same pointer.
170 */
171void iounmap(volatile void __iomem *addr)
172{
173 struct vm_struct *p, *o;
174
175 if (addr <= high_memory)
176 return;
177 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
178 addr < phys_to_virt(ISA_END_ADDRESS))
179 return;
180
181 addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
182 /* Use the vm area unlocked, assuming the caller
183 ensures there isn't another iounmap for the same address
184 in parallel. Reuse of the virtual address is prevented by
185 leaving it in the global lists until we're done with it.
186 cpa takes care of the direct mappings. */
187 read_lock(&vmlist_lock);
188 for (p = vmlist; p; p = p->next) {
189 if (p->addr == addr)
190 break;
191 }
192 read_unlock(&vmlist_lock);
193
194 if (!p) {
195 printk("iounmap: bad address %p\n", addr);
196 dump_stack();
197 return;
198 }
199
200 /* Reset the direct mapping. Can block */
201 if (p->flags >> 20)
202 ioremap_change_attr(p->phys_addr, p->size, 0);
203
204 /* Finally remove it */
205 o = remove_vm_area((void *)addr);
206 BUG_ON(p != o || o == NULL);
207 kfree(p);
208}
209EXPORT_SYMBOL(iounmap);
210
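One idiom worth spelling out from the two deleted files above: __ioremap() parked the caching flags above bit 20 of vm_struct->flags, next to VM_IOREMAP, and iounmap() recovered them with a shift to decide whether the direct mapping needed its attributes reset. The rewritten ioremap.c drops this trick in favour of set_memory_uc()/set_memory_wb(). A small sketch of the old encoding, assuming _PAGE_PCD is bit 4 (0x10) as on x86:

/* Sketch only: the flags<<20 encoding used by the deleted code. */
#include <assert.h>

#define VM_IOREMAP	0x00000001UL
#define _PAGE_PCD	0x010UL

int main(void)
{
	unsigned long vm_flags = VM_IOREMAP | (_PAGE_PCD << 20);

	/* iounmap() keyed "reset the direct mapping?" off this shift: */
	assert((vm_flags >> 20) == _PAGE_PCD);
	return 0;
}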
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index a96006f7ae0..7a2ebce87df 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -1,9 +1,9 @@
1/* 1/*
2 * AMD K8 NUMA support. 2 * AMD K8 NUMA support.
3 * Discover the memory map and associated nodes. 3 * Discover the memory map and associated nodes.
4 * 4 *
5 * This version reads it directly from the K8 northbridge. 5 * This version reads it directly from the K8 northbridge.
6 * 6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */ 8 */
9#include <linux/kernel.h> 9#include <linux/kernel.h>
@@ -22,132 +22,135 @@
22 22
23static __init int find_northbridge(void) 23static __init int find_northbridge(void)
24{ 24{
25 int num; 25 int num;
26 26
27 for (num = 0; num < 32; num++) { 27 for (num = 0; num < 32; num++) {
28 u32 header; 28 u32 header;
29 29
30 header = read_pci_config(0, num, 0, 0x00); 30 header = read_pci_config(0, num, 0, 0x00);
31 if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) 31 if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
32 continue; 32 header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
33 33 header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
34 header = read_pci_config(0, num, 1, 0x00); 34 continue;
35 if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) 35
36 continue; 36 header = read_pci_config(0, num, 1, 0x00);
37 return num; 37 if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
38 } 38 header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
39 39 header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
40 return -1; 40 continue;
41 return num;
42 }
43
44 return -1;
41} 45}
42 46
43int __init k8_scan_nodes(unsigned long start, unsigned long end) 47int __init k8_scan_nodes(unsigned long start, unsigned long end)
44{ 48{
45 unsigned long prevbase; 49 unsigned long prevbase;
46 struct bootnode nodes[8]; 50 struct bootnode nodes[8];
47 int nodeid, i, j, nb; 51 int nodeid, i, nb;
48 unsigned char nodeids[8]; 52 unsigned char nodeids[8];
49 int found = 0; 53 int found = 0;
50 u32 reg; 54 u32 reg;
51 unsigned numnodes; 55 unsigned numnodes;
52 unsigned num_cores; 56 unsigned cores;
57 unsigned bits;
58 int j;
53 59
54 if (!early_pci_allowed()) 60 if (!early_pci_allowed())
55 return -1; 61 return -1;
56 62
57 nb = find_northbridge(); 63 nb = find_northbridge();
58 if (nb < 0) 64 if (nb < 0)
59 return nb; 65 return nb;
60 66
61 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 67 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
62
63 num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
64 printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
65 68
66 reg = read_pci_config(0, nb, 0, 0x60); 69 reg = read_pci_config(0, nb, 0, 0x60);
67 numnodes = ((reg >> 4) & 0xF) + 1; 70 numnodes = ((reg >> 4) & 0xF) + 1;
68 if (numnodes <= 1) 71 if (numnodes <= 1)
69 return -1; 72 return -1;
70 73
71 printk(KERN_INFO "Number of nodes %d\n", numnodes); 74 printk(KERN_INFO "Number of nodes %d\n", numnodes);
72 75
73 memset(&nodes,0,sizeof(nodes)); 76 memset(&nodes, 0, sizeof(nodes));
74 prevbase = 0; 77 prevbase = 0;
75 for (i = 0; i < 8; i++) { 78 for (i = 0; i < 8; i++) {
76 unsigned long base,limit; 79 unsigned long base, limit;
77 u32 nodeid; 80 u32 nodeid;
78 81
79 base = read_pci_config(0, nb, 1, 0x40 + i*8); 82 base = read_pci_config(0, nb, 1, 0x40 + i*8);
80 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 83 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
81 84
82 nodeid = limit & 7; 85 nodeid = limit & 7;
83 nodeids[i] = nodeid; 86 nodeids[i] = nodeid;
84 if ((base & 3) == 0) { 87 if ((base & 3) == 0) {
85 if (i < numnodes) 88 if (i < numnodes)
86 printk("Skipping disabled node %d\n", i); 89 printk("Skipping disabled node %d\n", i);
87 continue; 90 continue;
88 } 91 }
89 if (nodeid >= numnodes) { 92 if (nodeid >= numnodes) {
90 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, 93 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
91 base, limit); 94 base, limit);
92 continue; 95 continue;
93 } 96 }
94 97
95 if (!limit) { 98 if (!limit) {
96 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, 99 printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
97 base); 100 i, base);
98 continue; 101 continue;
99 } 102 }
100 if ((base >> 8) & 3 || (limit >> 8) & 3) { 103 if ((base >> 8) & 3 || (limit >> 8) & 3) {
101 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 104 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
102 nodeid, (base>>8)&3, (limit>>8) & 3); 105 nodeid, (base>>8)&3, (limit>>8) & 3);
103 return -1; 106 return -1;
104 } 107 }
105 if (node_isset(nodeid, node_possible_map)) { 108 if (node_isset(nodeid, node_possible_map)) {
106 printk(KERN_INFO "Node %d already present. Skipping\n", 109 printk(KERN_INFO "Node %d already present. Skipping\n",
107 nodeid); 110 nodeid);
108 continue; 111 continue;
109 } 112 }
110 113
111 limit >>= 16; 114 limit >>= 16;
112 limit <<= 24; 115 limit <<= 24;
113 limit |= (1<<24)-1; 116 limit |= (1<<24)-1;
114 limit++; 117 limit++;
115 118
116 if (limit > end_pfn << PAGE_SHIFT) 119 if (limit > end_pfn << PAGE_SHIFT)
117 limit = end_pfn << PAGE_SHIFT; 120 limit = end_pfn << PAGE_SHIFT;
118 if (limit <= base) 121 if (limit <= base)
119 continue; 122 continue;
120 123
121 base >>= 16; 124 base >>= 16;
122 base <<= 24; 125 base <<= 24;
123 126
124 if (base < start) 127 if (base < start)
125 base = start; 128 base = start;
126 if (limit > end) 129 if (limit > end)
127 limit = end; 130 limit = end;
128 if (limit == base) { 131 if (limit == base) {
129 printk(KERN_ERR "Empty node %d\n", nodeid); 132 printk(KERN_ERR "Empty node %d\n", nodeid);
130 continue; 133 continue;
131 } 134 }
132 if (limit < base) { 135 if (limit < base) {
133 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", 136 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
134 nodeid, base, limit); 137 nodeid, base, limit);
135 continue; 138 continue;
136 } 139 }
137 140
138 /* Could sort here, but punt for now. Should not happen anyway. */ 141 /* Could sort here, but punt for now. Should not happen anyway. */
139 if (prevbase > base) { 142 if (prevbase > base) {
140 printk(KERN_ERR "Node map not sorted %lx,%lx\n", 143 printk(KERN_ERR "Node map not sorted %lx,%lx\n",
141 prevbase,base); 144 prevbase, base);
142 return -1; 145 return -1;
143 } 146 }
144 147
145 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 148 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
146 nodeid, base, limit); 149 nodeid, base, limit);
147 150
148 found++; 151 found++;
149 152
150 nodes[nodeid].start = base; 153 nodes[nodeid].start = base;
151 nodes[nodeid].end = limit; 154 nodes[nodeid].end = limit;
152 e820_register_active_regions(nodeid, 155 e820_register_active_regions(nodeid,
153 nodes[nodeid].start >> PAGE_SHIFT, 156 nodes[nodeid].start >> PAGE_SHIFT,
@@ -156,27 +159,31 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
156 prevbase = base; 159 prevbase = base;
157 160
158 node_set(nodeid, node_possible_map); 161 node_set(nodeid, node_possible_map);
159 } 162 }
160 163
161 if (!found) 164 if (!found)
162 return -1; 165 return -1;
163 166
164 memnode_shift = compute_hash_shift(nodes, 8); 167 memnode_shift = compute_hash_shift(nodes, 8);
165 if (memnode_shift < 0) { 168 if (memnode_shift < 0) {
166 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 169 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
167 return -1; 170 return -1;
168 } 171 }
169 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 172 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
173
174 /* use the coreid bits from early_identify_cpu */
175 bits = boot_cpu_data.x86_coreid_bits;
176 cores = (1<<bits);
170 177
171 for (i = 0; i < 8; i++) { 178 for (i = 0; i < 8; i++) {
172 if (nodes[i].start != nodes[i].end) { 179 if (nodes[i].start != nodes[i].end) {
173 nodeid = nodeids[i]; 180 nodeid = nodeids[i];
174 for (j = 0; j < num_cores; j++) 181 for (j = 0; j < cores; j++)
175 apicid_to_node[(nodeid * num_cores) + j] = i; 182 apicid_to_node[(nodeid << bits) + j] = i;
176 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 183 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
177 } 184 }
178 } 185 }
179 186
180 numa_init_array(); 187 numa_init_array();
181 return 0; 188 return 0;
182} 189}
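
The k8_scan_nodes() change above stops deriving the APIC-ID-to-node mapping from a core count read via CPUID and instead shifts by x86_coreid_bits, so each node owns a power-of-two block of APIC IDs. A minimal userspace sketch of that arithmetic (values and names are illustrative, not from the commit):

/* Illustrative sketch: with bits == x86_coreid_bits, each node owns a
 * 2^bits-sized block of APIC IDs. */
#include <stdio.h>

int main(void)
{
	unsigned int bits = 2;			/* assume 2 core-id bits, i.e. quad core */
	unsigned int cores = 1u << bits;	/* 4 cores per node */
	int apicid_to_node[32];
	unsigned int node, core;

	for (node = 0; node < 4; node++)
		for (core = 0; core < cores; core++)
			apicid_to_node[(node << bits) + core] = node;

	printf("apicid 6 -> node %d\n", apicid_to_node[6]);	/* node 1, core 2 */
	return 0;
}
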
diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap.c
index 552e0847375..56fe7124fbe 100644
--- a/arch/x86/mm/mmap_32.c
+++ b/arch/x86/mm/mmap.c
@@ -1,10 +1,13 @@
1/* 1/*
2 * linux/arch/i386/mm/mmap.c 2 * Flexible mmap layout support
3 * 3 *
4 * flexible mmap layout support 4 * Based on code by Ingo Molnar and Andi Kleen, copyrighted
5 * as follows:
5 * 6 *
6 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 7 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
7 * All Rights Reserved. 8 * All Rights Reserved.
9 * Copyright 2005 Andi Kleen, SUSE Labs.
10 * Copyright 2007 Jiri Kosina, SUSE Labs.
8 * 11 *
9 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by 13 * it under the terms of the GNU General Public License as published by
@@ -19,14 +22,12 @@
19 * You should have received a copy of the GNU General Public License 22 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software 23 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 *
24 * Started by Ingo Molnar <mingo@elte.hu>
25 */ 25 */
26 26
27#include <linux/personality.h> 27#include <linux/personality.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/random.h> 29#include <linux/random.h>
30#include <linux/limits.h>
30#include <linux/sched.h> 31#include <linux/sched.h>
31 32
32/* 33/*
@@ -37,20 +38,71 @@
37#define MIN_GAP (128*1024*1024) 38#define MIN_GAP (128*1024*1024)
38#define MAX_GAP (TASK_SIZE/6*5) 39#define MAX_GAP (TASK_SIZE/6*5)
39 40
40static inline unsigned long mmap_base(struct mm_struct *mm) 41/*
42 * True on X86_32 or when emulating IA32 on X86_64
43 */
44static int mmap_is_ia32(void)
41{ 45{
42 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; 46#ifdef CONFIG_X86_32
43 unsigned long random_factor = 0; 47 return 1;
48#endif
49#ifdef CONFIG_IA32_EMULATION
50 if (test_thread_flag(TIF_IA32))
51 return 1;
52#endif
53 return 0;
54}
44 55
45 if (current->flags & PF_RANDOMIZE) 56static int mmap_is_legacy(void)
46 random_factor = get_random_int() % (1024*1024); 57{
58 if (current->personality & ADDR_COMPAT_LAYOUT)
59 return 1;
60
61 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
62 return 1;
63
64 return sysctl_legacy_va_layout;
65}
66
67static unsigned long mmap_rnd(void)
68{
69 unsigned long rnd = 0;
70
71 /*
72 * 8 bits of randomness in 32bit mmaps, 20 address space bits
73 * 28 bits of randomness in 64bit mmaps, 40 address space bits
74 */
75 if (current->flags & PF_RANDOMIZE) {
76 if (mmap_is_ia32())
77 rnd = (long)get_random_int() % (1<<8);
78 else
79 rnd = (long)(get_random_int() % (1<<28));
80 }
81 return rnd << PAGE_SHIFT;
82}
83
84static unsigned long mmap_base(void)
85{
86 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
47 87
48 if (gap < MIN_GAP) 88 if (gap < MIN_GAP)
49 gap = MIN_GAP; 89 gap = MIN_GAP;
50 else if (gap > MAX_GAP) 90 else if (gap > MAX_GAP)
51 gap = MAX_GAP; 91 gap = MAX_GAP;
52 92
53 return PAGE_ALIGN(TASK_SIZE - gap - random_factor); 93 return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
94}
95
96/*
97 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
98 * does, but not when emulating X86_32
99 */
100static unsigned long mmap_legacy_base(void)
101{
102 if (mmap_is_ia32())
103 return TASK_UNMAPPED_BASE;
104 else
105 return TASK_UNMAPPED_BASE + mmap_rnd();
54} 106}
55 107
56/* 108/*
@@ -59,18 +111,12 @@ static inline unsigned long mmap_base(struct mm_struct *mm)
59 */ 111 */
60void arch_pick_mmap_layout(struct mm_struct *mm) 112void arch_pick_mmap_layout(struct mm_struct *mm)
61{ 113{
62 /* 114 if (mmap_is_legacy()) {
63 * Fall back to the standard layout if the personality 115 mm->mmap_base = mmap_legacy_base();
64 * bit is set, or if the expected stack growth is unlimited:
65 */
66 if (sysctl_legacy_va_layout ||
67 (current->personality & ADDR_COMPAT_LAYOUT) ||
68 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
69 mm->mmap_base = TASK_UNMAPPED_BASE;
70 mm->get_unmapped_area = arch_get_unmapped_area; 116 mm->get_unmapped_area = arch_get_unmapped_area;
71 mm->unmap_area = arch_unmap_area; 117 mm->unmap_area = arch_unmap_area;
72 } else { 118 } else {
73 mm->mmap_base = mmap_base(mm); 119 mm->mmap_base = mmap_base();
74 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 120 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
75 mm->unmap_area = arch_unmap_area_topdown; 121 mm->unmap_area = arch_unmap_area_topdown;
76 } 122 }
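
The consolidated mmap.c above picks the randomisation width from the process flavour: 8 bits of page-granular randomness for 32-bit (or IA32-emulated) tasks and 28 bits for native 64-bit tasks. A small sketch, assuming 4 KiB pages, of how much address space those widths cover:

/* Illustrative sketch: span of mmap-base randomisation for the two cases
 * handled by mmap_rnd(), assuming PAGE_SHIFT == 12. */
#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;					/* 4 KiB pages */
	unsigned long long span_ia32 = 1ULL << (8 + page_shift);	/* 2^20 bytes */
	unsigned long long span_64   = 1ULL << (28 + page_shift);	/* 2^40 bytes */

	printf("ia32 mmap base varies over %llu MiB\n", span_ia32 >> 20);
	printf("64-bit mmap base varies over %llu GiB\n", span_64 >> 30);
	return 0;
}
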
diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c
deleted file mode 100644
index 80bba0dc000..00000000000
--- a/arch/x86/mm/mmap_64.c
+++ /dev/null
@@ -1,29 +0,0 @@
1/* Copyright 2005 Andi Kleen, SuSE Labs.
2 * Licensed under GPL, v.2
3 */
4#include <linux/mm.h>
5#include <linux/sched.h>
6#include <linux/random.h>
7#include <asm/ia32.h>
8
9/* Notebook: move the mmap code from sys_x86_64.c over here. */
10
11void arch_pick_mmap_layout(struct mm_struct *mm)
12{
13#ifdef CONFIG_IA32_EMULATION
14 if (current_thread_info()->flags & _TIF_IA32)
15 return ia32_pick_mmap_layout(mm);
16#endif
17 mm->mmap_base = TASK_UNMAPPED_BASE;
18 if (current->flags & PF_RANDOMIZE) {
19 /* Add 28bit randomness which is about 40bits of address space
20 because mmap base has to be page aligned.
21 or ~1/128 of the total user VM
22 (total user address space is 47bits) */
23 unsigned rnd = get_random_int() & 0xfffffff;
24 mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
25 }
26 mm->get_unmapped_area = arch_get_unmapped_area;
27 mm->unmap_area = arch_unmap_area;
28}
29
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 3d6926ba899..5a02bf4c91e 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Generic VM initialization for x86-64 NUMA setups. 2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */ 4 */
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/string.h> 7#include <linux/string.h>
@@ -11,35 +11,45 @@
11#include <linux/ctype.h> 11#include <linux/ctype.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/sched.h>
14 15
15#include <asm/e820.h> 16#include <asm/e820.h>
16#include <asm/proto.h> 17#include <asm/proto.h>
17#include <asm/dma.h> 18#include <asm/dma.h>
18#include <asm/numa.h> 19#include <asm/numa.h>
19#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h>
20 22
21#ifndef Dprintk 23#ifndef Dprintk
22#define Dprintk(x...) 24#define Dprintk(x...)
23#endif 25#endif
24 26
25struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data);
29
26bootmem_data_t plat_node_bdata[MAX_NUMNODES]; 30bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27 31
28struct memnode memnode; 32struct memnode memnode;
29 33
30unsigned char cpu_to_node[NR_CPUS] __read_mostly = { 34int x86_cpu_to_node_map_init[NR_CPUS] = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE 35 [0 ... NR_CPUS-1] = NUMA_NO_NODE
32}; 36};
33unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 37void *x86_cpu_to_node_map_early_ptr;
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 38DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
39EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
40EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
41
42s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
43 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
35}; 44};
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; 45
46cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
47EXPORT_SYMBOL(node_to_cpumask_map);
37 48
38int numa_off __initdata; 49int numa_off __initdata;
39unsigned long __initdata nodemap_addr; 50unsigned long __initdata nodemap_addr;
40unsigned long __initdata nodemap_size; 51unsigned long __initdata nodemap_size;
41 52
42
43/* 53/*
44 * Given a shift value, try to populate memnodemap[] 54 * Given a shift value, try to populate memnodemap[]
45 * Returns : 55 * Returns :
@@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size;
47 * 0 if memnodemap[] too small (or shift too small) 57 * 0 if memnodemap[] too small (or shift too small)
48 * -1 if node overlap or lost ram (shift too big) 58 * -1 if node overlap or lost ram (shift too big)
49 */ 59 */
50static int __init 60static int __init populate_memnodemap(const struct bootnode *nodes,
51populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) 61 int numnodes, int shift)
52{ 62{
53 int i;
54 int res = -1;
55 unsigned long addr, end; 63 unsigned long addr, end;
64 int i, res = -1;
56 65
57 memset(memnodemap, 0xff, memnodemapsize); 66 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
58 for (i = 0; i < numnodes; i++) { 67 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start; 68 addr = nodes[i].start;
60 end = nodes[i].end; 69 end = nodes[i].end;
@@ -63,37 +72,36 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
63 if ((end >> shift) >= memnodemapsize) 72 if ((end >> shift) >= memnodemapsize)
64 return 0; 73 return 0;
65 do { 74 do {
66 if (memnodemap[addr >> shift] != 0xff) 75 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
67 return -1; 76 return -1;
68 memnodemap[addr >> shift] = i; 77 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift); 78 addr += (1UL << shift);
70 } while (addr < end); 79 } while (addr < end);
71 res = 1; 80 res = 1;
72 } 81 }
73 return res; 82 return res;
74} 83}
75 84
76static int __init allocate_cachealigned_memnodemap(void) 85static int __init allocate_cachealigned_memnodemap(void)
77{ 86{
78 unsigned long pad, pad_addr; 87 unsigned long addr;
79 88
80 memnodemap = memnode.embedded_map; 89 memnodemap = memnode.embedded_map;
81 if (memnodemapsize <= 48) 90 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
82 return 0; 91 return 0;
83 92
84 pad = L1_CACHE_BYTES - 1; 93 addr = 0x8000;
85 pad_addr = 0x8000; 94 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
86 nodemap_size = pad + memnodemapsize; 95 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT,
87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, 96 nodemap_size, L1_CACHE_BYTES);
88 nodemap_size);
89 if (nodemap_addr == -1UL) { 97 if (nodemap_addr == -1UL) {
90 printk(KERN_ERR 98 printk(KERN_ERR
91 "NUMA: Unable to allocate Memory to Node hash map\n"); 99 "NUMA: Unable to allocate Memory to Node hash map\n");
92 nodemap_addr = nodemap_size = 0; 100 nodemap_addr = nodemap_size = 0;
93 return -1; 101 return -1;
94 } 102 }
95 pad_addr = (nodemap_addr + pad) & ~pad; 103 memnodemap = phys_to_virt(nodemap_addr);
96 memnodemap = phys_to_virt(pad_addr); 104 reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
97 105
98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", 106 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
99 nodemap_addr, nodemap_addr + nodemap_size); 107 nodemap_addr, nodemap_addr + nodemap_size);
@@ -104,8 +112,8 @@ static int __init allocate_cachealigned_memnodemap(void)
104 * The LSB of all start and end addresses in the node map is the value of the 112 * The LSB of all start and end addresses in the node map is the value of the
105 * maximum possible shift. 113 * maximum possible shift.
106 */ 114 */
107static int __init 115static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
108extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) 116 int numnodes)
109{ 117{
110 int i, nodes_used = 0; 118 int i, nodes_used = 0;
111 unsigned long start, end; 119 unsigned long start, end;
@@ -140,59 +148,62 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
140 shift); 148 shift);
141 149
142 if (populate_memnodemap(nodes, numnodes, shift) != 1) { 150 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
143 printk(KERN_INFO 151 printk(KERN_INFO "Your memory is not aligned you need to "
144 "Your memory is not aligned you need to rebuild your kernel " 152 "rebuild your kernel with a bigger NODEMAPSIZE "
145 "with a bigger NODEMAPSIZE shift=%d\n", 153 "shift=%d\n", shift);
146 shift);
147 return -1; 154 return -1;
148 } 155 }
149 return shift; 156 return shift;
150} 157}
151 158
152#ifdef CONFIG_SPARSEMEM
153int early_pfn_to_nid(unsigned long pfn) 159int early_pfn_to_nid(unsigned long pfn)
154{ 160{
155 return phys_to_nid(pfn << PAGE_SHIFT); 161 return phys_to_nid(pfn << PAGE_SHIFT);
156} 162}
157#endif
158 163
159static void * __init 164static void * __init early_node_mem(int nodeid, unsigned long start,
160early_node_mem(int nodeid, unsigned long start, unsigned long end, 165 unsigned long end, unsigned long size,
161 unsigned long size) 166 unsigned long align)
162{ 167{
163 unsigned long mem = find_e820_area(start, end, size); 168 unsigned long mem = find_e820_area(start, end, size, align);
164 void *ptr; 169 void *ptr;
170
165 if (mem != -1L) 171 if (mem != -1L)
166 return __va(mem); 172 return __va(mem);
167 ptr = __alloc_bootmem_nopanic(size, 173
168 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); 174 ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
169 if (ptr == NULL) { 175 if (ptr == NULL) {
170 printk(KERN_ERR "Cannot find %lu bytes in node %d\n", 176 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
171 size, nodeid); 177 size, nodeid);
172 return NULL; 178 return NULL;
173 } 179 }
174 return ptr; 180 return ptr;
175} 181}
176 182
177/* Initialize bootmem allocator for a node */ 183/* Initialize bootmem allocator for a node */
178void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 184void __init setup_node_bootmem(int nodeid, unsigned long start,
179{ 185 unsigned long end)
180 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 186{
181 unsigned long nodedata_phys; 187 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size;
188 unsigned long bootmap_start, nodedata_phys;
182 void *bootmap; 189 void *bootmap;
183 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 190 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
184 191
185 start = round_up(start, ZONE_ALIGN); 192 start = round_up(start, ZONE_ALIGN);
186 193
187 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); 194 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
195 start, end);
188 196
189 start_pfn = start >> PAGE_SHIFT; 197 start_pfn = start >> PAGE_SHIFT;
190 end_pfn = end >> PAGE_SHIFT; 198 end_pfn = end >> PAGE_SHIFT;
191 199
192 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); 200 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
201 SMP_CACHE_BYTES);
193 if (node_data[nodeid] == NULL) 202 if (node_data[nodeid] == NULL)
194 return; 203 return;
195 nodedata_phys = __pa(node_data[nodeid]); 204 nodedata_phys = __pa(node_data[nodeid]);
205 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
206 nodedata_phys + pgdat_size - 1);
196 207
197 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 208 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
198 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 209 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
@@ -200,75 +211,62 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
200 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 211 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
201 212
202 /* Find a place for the bootmem map */ 213 /* Find a place for the bootmem map */
203 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 214 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
204 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 215 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
216 /*
217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes
218 * to use that to align to PAGE_SIZE
219 */
205 bootmap = early_node_mem(nodeid, bootmap_start, end, 220 bootmap = early_node_mem(nodeid, bootmap_start, end,
206 bootmap_pages<<PAGE_SHIFT); 221 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
207 if (bootmap == NULL) { 222 if (bootmap == NULL) {
208 if (nodedata_phys < start || nodedata_phys >= end) 223 if (nodedata_phys < start || nodedata_phys >= end)
209 free_bootmem((unsigned long)node_data[nodeid],pgdat_size); 224 free_bootmem((unsigned long)node_data[nodeid],
225 pgdat_size);
210 node_data[nodeid] = NULL; 226 node_data[nodeid] = NULL;
211 return; 227 return;
212 } 228 }
213 bootmap_start = __pa(bootmap); 229 bootmap_start = __pa(bootmap);
214 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 230
215
216 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 231 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
217 bootmap_start >> PAGE_SHIFT, 232 bootmap_start >> PAGE_SHIFT,
218 start_pfn, end_pfn); 233 start_pfn, end_pfn);
234
235 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
236 bootmap_start, bootmap_start + bootmap_size - 1,
237 bootmap_pages);
219 238
220 free_bootmem_with_active_regions(nodeid, end); 239 free_bootmem_with_active_regions(nodeid, end);
221 240
222 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 241 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
223 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 242 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
243 bootmap_pages<<PAGE_SHIFT);
224#ifdef CONFIG_ACPI_NUMA 244#ifdef CONFIG_ACPI_NUMA
225 srat_reserve_add_area(nodeid); 245 srat_reserve_add_area(nodeid);
226#endif 246#endif
227 node_set_online(nodeid); 247 node_set_online(nodeid);
228} 248}
229
230/* Initialize final allocator for a zone */
231void __init setup_node_zones(int nodeid)
232{
233 unsigned long start_pfn, end_pfn, memmapsize, limit;
234
235 start_pfn = node_start_pfn(nodeid);
236 end_pfn = node_end_pfn(nodeid);
237
238 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
239 nodeid, start_pfn, end_pfn);
240
241 /* Try to allocate mem_map at end to not fill up precious <4GB
242 memory. */
243 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
244 limit = end_pfn << PAGE_SHIFT;
245#ifdef CONFIG_FLAT_NODE_MEM_MAP
246 NODE_DATA(nodeid)->node_mem_map =
247 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
248 memmapsize, SMP_CACHE_BYTES,
249 round_down(limit - memmapsize, PAGE_SIZE),
250 limit);
251#endif
252}
253 249
250/*
251 * There are unfortunately some poorly designed mainboards around that
252 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
253 * mapping. To avoid this fill in the mapping for all possible CPUs,
254 * as the number of CPUs is not known yet. We round robin the existing
255 * nodes.
256 */
254void __init numa_init_array(void) 257void __init numa_init_array(void)
255{ 258{
256 int rr, i; 259 int rr, i;
257 /* There are unfortunately some poorly designed mainboards around 260
258 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
259 mapping. To avoid this fill in the mapping for all possible
260 CPUs, as the number of CPUs is not known yet.
261 We round robin the existing nodes. */
262 rr = first_node(node_online_map); 261 rr = first_node(node_online_map);
263 for (i = 0; i < NR_CPUS; i++) { 262 for (i = 0; i < NR_CPUS; i++) {
264 if (cpu_to_node(i) != NUMA_NO_NODE) 263 if (early_cpu_to_node(i) != NUMA_NO_NODE)
265 continue; 264 continue;
266 numa_set_node(i, rr); 265 numa_set_node(i, rr);
267 rr = next_node(rr, node_online_map); 266 rr = next_node(rr, node_online_map);
268 if (rr == MAX_NUMNODES) 267 if (rr == MAX_NUMNODES)
269 rr = first_node(node_online_map); 268 rr = first_node(node_online_map);
270 } 269 }
271
272} 270}
273 271
274#ifdef CONFIG_NUMA_EMU 272#ifdef CONFIG_NUMA_EMU
@@ -276,15 +274,17 @@ void __init numa_init_array(void)
276char *cmdline __initdata; 274char *cmdline __initdata;
277 275
278/* 276/*
279 * Sets up nid to range from addr to addr + size. If the end boundary is 277 * Sets up nid to range from addr to addr + size. If the end
280 * greater than max_addr, then max_addr is used instead. The return value is 0 278 * boundary is greater than max_addr, then max_addr is used instead.
281 * if there is additional memory left for allocation past addr and -1 otherwise. 279 * The return value is 0 if there is additional memory left for
282 * addr is adjusted to be at the end of the node. 280 * allocation past addr and -1 otherwise. addr is adjusted to be at
281 * the end of the node.
283 */ 282 */
284static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, 283static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
285 u64 size, u64 max_addr) 284 u64 size, u64 max_addr)
286{ 285{
287 int ret = 0; 286 int ret = 0;
287
288 nodes[nid].start = *addr; 288 nodes[nid].start = *addr;
289 *addr += size; 289 *addr += size;
290 if (*addr >= max_addr) { 290 if (*addr >= max_addr) {
@@ -335,6 +335,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
335 335
336 for (i = node_start; i < num_nodes + node_start; i++) { 336 for (i = node_start; i < num_nodes + node_start; i++) {
337 u64 end = *addr + size; 337 u64 end = *addr + size;
338
338 if (i < big) 339 if (i < big)
339 end += FAKE_NODE_MIN_SIZE; 340 end += FAKE_NODE_MIN_SIZE;
340 /* 341 /*
@@ -380,14 +381,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
380static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 381static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
381{ 382{
382 struct bootnode nodes[MAX_NUMNODES]; 383 struct bootnode nodes[MAX_NUMNODES];
383 u64 addr = start_pfn << PAGE_SHIFT; 384 u64 size, addr = start_pfn << PAGE_SHIFT;
384 u64 max_addr = end_pfn << PAGE_SHIFT; 385 u64 max_addr = end_pfn << PAGE_SHIFT;
385 int num_nodes = 0; 386 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
386 int coeff_flag;
387 int coeff = -1;
388 int num = 0;
389 u64 size;
390 int i;
391 387
392 memset(&nodes, 0, sizeof(nodes)); 388 memset(&nodes, 0, sizeof(nodes));
393 /* 389 /*
@@ -395,8 +391,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
395 * system RAM into N fake nodes. 391 * system RAM into N fake nodes.
396 */ 392 */
397 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 393 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
398 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, 394 long n = simple_strtol(cmdline, NULL, 0);
399 simple_strtol(cmdline, NULL, 0)); 395
396 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
400 if (num_nodes < 0) 397 if (num_nodes < 0)
401 return num_nodes; 398 return num_nodes;
402 goto out; 399 goto out;
@@ -483,46 +480,47 @@ out:
483 for_each_node_mask(i, node_possible_map) { 480 for_each_node_mask(i, node_possible_map) {
484 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 481 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
485 nodes[i].end >> PAGE_SHIFT); 482 nodes[i].end >> PAGE_SHIFT);
486 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 483 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
487 } 484 }
488 acpi_fake_nodes(nodes, num_nodes); 485 acpi_fake_nodes(nodes, num_nodes);
489 numa_init_array(); 486 numa_init_array();
490 return 0; 487 return 0;
491} 488}
492#endif /* CONFIG_NUMA_EMU */ 489#endif /* CONFIG_NUMA_EMU */
493 490
494void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 491void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
495{ 492{
496 int i; 493 int i;
497 494
498 nodes_clear(node_possible_map); 495 nodes_clear(node_possible_map);
499 496
500#ifdef CONFIG_NUMA_EMU 497#ifdef CONFIG_NUMA_EMU
501 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 498 if (cmdline && !numa_emulation(start_pfn, end_pfn))
502 return; 499 return;
503 nodes_clear(node_possible_map); 500 nodes_clear(node_possible_map);
504#endif 501#endif
505 502
506#ifdef CONFIG_ACPI_NUMA 503#ifdef CONFIG_ACPI_NUMA
507 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 504 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
508 end_pfn << PAGE_SHIFT)) 505 end_pfn << PAGE_SHIFT))
509 return; 506 return;
510 nodes_clear(node_possible_map); 507 nodes_clear(node_possible_map);
511#endif 508#endif
512 509
513#ifdef CONFIG_K8_NUMA 510#ifdef CONFIG_K8_NUMA
514 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) 511 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
512 end_pfn<<PAGE_SHIFT))
515 return; 513 return;
516 nodes_clear(node_possible_map); 514 nodes_clear(node_possible_map);
517#endif 515#endif
518 printk(KERN_INFO "%s\n", 516 printk(KERN_INFO "%s\n",
519 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 517 numa_off ? "NUMA turned off" : "No NUMA configuration found");
520 518
521 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 519 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
522 start_pfn << PAGE_SHIFT, 520 start_pfn << PAGE_SHIFT,
523 end_pfn << PAGE_SHIFT); 521 end_pfn << PAGE_SHIFT);
524 /* setup dummy node covering all memory */ 522 /* setup dummy node covering all memory */
525 memnode_shift = 63; 523 memnode_shift = 63;
526 memnodemap = memnode.embedded_map; 524 memnodemap = memnode.embedded_map;
527 memnodemap[0] = 0; 525 memnodemap[0] = 0;
528 nodes_clear(node_online_map); 526 nodes_clear(node_online_map);
@@ -530,36 +528,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
530 node_set(0, node_possible_map); 528 node_set(0, node_possible_map);
531 for (i = 0; i < NR_CPUS; i++) 529 for (i = 0; i < NR_CPUS; i++)
532 numa_set_node(i, 0); 530 numa_set_node(i, 0);
533 node_to_cpumask[0] = cpumask_of_cpu(0); 531 /* cpumask_of_cpu() may not be available during early startup */
532 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
533 cpu_set(0, node_to_cpumask_map[0]);
534 e820_register_active_regions(0, start_pfn, end_pfn); 534 e820_register_active_regions(0, start_pfn, end_pfn);
535 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 535 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
536} 536}
537 537
538__cpuinit void numa_add_cpu(int cpu) 538__cpuinit void numa_add_cpu(int cpu)
539{ 539{
540 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); 540 set_bit(cpu,
541} 541 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
542}
542 543
543void __cpuinit numa_set_node(int cpu, int node) 544void __cpuinit numa_set_node(int cpu, int node)
544{ 545{
546 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
547
545 cpu_pda(cpu)->nodenumber = node; 548 cpu_pda(cpu)->nodenumber = node;
546 cpu_to_node(cpu) = node; 549
550 if(cpu_to_node_map)
551 cpu_to_node_map[cpu] = node;
552 else if(per_cpu_offset(cpu))
553 per_cpu(x86_cpu_to_node_map, cpu) = node;
554 else
555 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
547} 556}
548 557
549unsigned long __init numa_free_all_bootmem(void) 558unsigned long __init numa_free_all_bootmem(void)
550{ 559{
551 int i;
552 unsigned long pages = 0; 560 unsigned long pages = 0;
553 for_each_online_node(i) { 561 int i;
562
563 for_each_online_node(i)
554 pages += free_all_bootmem_node(NODE_DATA(i)); 564 pages += free_all_bootmem_node(NODE_DATA(i));
555 } 565
556 return pages; 566 return pages;
557} 567}
558 568
559void __init paging_init(void) 569void __init paging_init(void)
560{ 570{
561 int i;
562 unsigned long max_zone_pfns[MAX_NR_ZONES]; 571 unsigned long max_zone_pfns[MAX_NR_ZONES];
572
563 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 573 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
564 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 574 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
565 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 575 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -568,32 +578,27 @@ void __init paging_init(void)
568 sparse_memory_present_with_active_regions(MAX_NUMNODES); 578 sparse_memory_present_with_active_regions(MAX_NUMNODES);
569 sparse_init(); 579 sparse_init();
570 580
571 for_each_online_node(i) {
572 setup_node_zones(i);
573 }
574
575 free_area_init_nodes(max_zone_pfns); 581 free_area_init_nodes(max_zone_pfns);
576} 582}
577 583
578static __init int numa_setup(char *opt) 584static __init int numa_setup(char *opt)
579{ 585{
580 if (!opt) 586 if (!opt)
581 return -EINVAL; 587 return -EINVAL;
582 if (!strncmp(opt,"off",3)) 588 if (!strncmp(opt, "off", 3))
583 numa_off = 1; 589 numa_off = 1;
584#ifdef CONFIG_NUMA_EMU 590#ifdef CONFIG_NUMA_EMU
585 if (!strncmp(opt, "fake=", 5)) 591 if (!strncmp(opt, "fake=", 5))
586 cmdline = opt + 5; 592 cmdline = opt + 5;
587#endif 593#endif
588#ifdef CONFIG_ACPI_NUMA 594#ifdef CONFIG_ACPI_NUMA
589 if (!strncmp(opt,"noacpi",6)) 595 if (!strncmp(opt, "noacpi", 6))
590 acpi_numa = -1; 596 acpi_numa = -1;
591 if (!strncmp(opt,"hotadd=", 7)) 597 if (!strncmp(opt, "hotadd=", 7))
592 hotadd_percent = simple_strtoul(opt+7, NULL, 10); 598 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
593#endif 599#endif
594 return 0; 600 return 0;
595} 601}
596
597early_param("numa", numa_setup); 602early_param("numa", numa_setup);
598 603
599/* 604/*
@@ -611,38 +616,16 @@ early_param("numa", numa_setup);
611void __init init_cpu_to_node(void) 616void __init init_cpu_to_node(void)
612{ 617{
613 int i; 618 int i;
614 for (i = 0; i < NR_CPUS; i++) { 619
615 u8 apicid = x86_cpu_to_apicid_init[i]; 620 for (i = 0; i < NR_CPUS; i++) {
621 u16 apicid = x86_cpu_to_apicid_init[i];
622
616 if (apicid == BAD_APICID) 623 if (apicid == BAD_APICID)
617 continue; 624 continue;
618 if (apicid_to_node[apicid] == NUMA_NO_NODE) 625 if (apicid_to_node[apicid] == NUMA_NO_NODE)
619 continue; 626 continue;
620 numa_set_node(i,apicid_to_node[apicid]); 627 numa_set_node(i, apicid_to_node[apicid]);
621 } 628 }
622} 629}
623 630
624EXPORT_SYMBOL(cpu_to_node);
625EXPORT_SYMBOL(node_to_cpumask);
626EXPORT_SYMBOL(memnode);
627EXPORT_SYMBOL(node_data);
628 631
629#ifdef CONFIG_DISCONTIGMEM
630/*
631 * Functions to convert PFNs from/to per node page addresses.
632 * These are out of line because they are quite big.
633 * They could be all tuned by pre caching more state.
634 * Should do that.
635 */
636
637int pfn_valid(unsigned long pfn)
638{
639 unsigned nid;
640 if (pfn >= num_physpages)
641 return 0;
642 nid = pfn_to_nid(pfn);
643 if (nid == 0xff)
644 return 0;
645 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
646}
647EXPORT_SYMBOL(pfn_valid);
648#endif
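
In numa_64.c the memory-to-node lookup stays a simple hash: an address shifted right by memnode_shift indexes memnodemap[], which populate_memnodemap() fills per node (now as s16 entries). A self-contained sketch of that lookup, with an invented single-node layout:

/* Illustrative sketch: resolving an address via memnodemap[addr >> shift]. */
#include <stdio.h>

#define MAPSIZE 64

int main(void)
{
	short memnodemap[MAPSIZE];		/* stands in for the kernel's s16 map */
	int shift = 24;				/* 16 MiB granularity */
	unsigned long node0_start = 0x0UL;
	unsigned long node0_end   = 0x2000000UL;	/* node 0 covers 0-32 MiB */
	unsigned long addr;
	int i;

	for (i = 0; i < MAPSIZE; i++)
		memnodemap[i] = -1;		/* NUMA_NO_NODE */
	for (addr = node0_start; addr < node0_end; addr += 1UL << shift)
		memnodemap[addr >> shift] = 0;

	printf("0x1800000 -> node %d\n", memnodemap[0x1800000UL >> shift]);
	return 0;
}
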
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
new file mode 100644
index 00000000000..ed820160035
--- /dev/null
+++ b/arch/x86/mm/pageattr-test.c
@@ -0,0 +1,262 @@
1/*
2 * self test for change_page_attr.
3 *
4 * Clears the global bit on random pages in the direct mapping, then reverts
5 * and compares page tables forwards and afterwards.
6 */
7#include <linux/bootmem.h>
8#include <linux/kthread.h>
9#include <linux/random.h>
10#include <linux/kernel.h>
11#include <linux/init.h>
12#include <linux/mm.h>
13
14#include <asm/cacheflush.h>
15#include <asm/pgtable.h>
16#include <asm/kdebug.h>
17
18/*
19 * Only print the results of the first pass:
20 */
21static __read_mostly int print = 1;
22
23enum {
24 NTEST = 400,
25#ifdef CONFIG_X86_64
26 LPS = (1 << PMD_SHIFT),
27#elif defined(CONFIG_X86_PAE)
28 LPS = (1 << PMD_SHIFT),
29#else
30 LPS = (1 << 22),
31#endif
32 GPS = (1<<30)
33};
34
35struct split_state {
36 long lpg, gpg, spg, exec;
37 long min_exec, max_exec;
38};
39
40static int print_split(struct split_state *s)
41{
42 long i, expected, missed = 0;
43 int printed = 0;
44 int err = 0;
45
46 s->lpg = s->gpg = s->spg = s->exec = 0;
47 s->min_exec = ~0UL;
48 s->max_exec = 0;
49 for (i = 0; i < max_pfn_mapped; ) {
50 unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT);
51 unsigned int level;
52 pte_t *pte;
53
54 pte = lookup_address(addr, &level);
55 if (!pte) {
56 if (!printed) {
57 dump_pagetable(addr);
58 printk(KERN_INFO "CPA %lx no pte level %d\n",
59 addr, level);
60 printed = 1;
61 }
62 missed++;
63 i++;
64 continue;
65 }
66
67 if (level == PG_LEVEL_1G && sizeof(long) == 8) {
68 s->gpg++;
69 i += GPS/PAGE_SIZE;
70 } else if (level == PG_LEVEL_2M) {
71 if (!(pte_val(*pte) & _PAGE_PSE)) {
72 printk(KERN_ERR
73 "%lx level %d but not PSE %Lx\n",
74 addr, level, (u64)pte_val(*pte));
75 err = 1;
76 }
77 s->lpg++;
78 i += LPS/PAGE_SIZE;
79 } else {
80 s->spg++;
81 i++;
82 }
83 if (!(pte_val(*pte) & _PAGE_NX)) {
84 s->exec++;
85 if (addr < s->min_exec)
86 s->min_exec = addr;
87 if (addr > s->max_exec)
88 s->max_exec = addr;
89 }
90 }
91 if (print) {
92 printk(KERN_INFO
93 " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
94 s->spg, s->lpg, s->gpg, s->exec,
95 s->min_exec != ~0UL ? s->min_exec : 0,
96 s->max_exec, missed);
97 }
98
99 expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
100 if (expected != i) {
101 printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n",
102 max_pfn_mapped, expected);
103 return 1;
104 }
105 return err;
106}
107
108static unsigned long addr[NTEST];
109static unsigned int len[NTEST];
110
111/* Change the global bit on random pages in the direct mapping */
112static int pageattr_test(void)
113{
114 struct split_state sa, sb, sc;
115 unsigned long *bm;
116 pte_t *pte, pte0;
117 int failed = 0;
118 unsigned int level;
119 int i, k;
120 int err;
121
122 if (print)
123 printk(KERN_INFO "CPA self-test:\n");
124
125 bm = vmalloc((max_pfn_mapped + 7) / 8);
126 if (!bm) {
127 printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
128 return -ENOMEM;
129 }
130 memset(bm, 0, (max_pfn_mapped + 7) / 8);
131
132 failed += print_split(&sa);
133 srandom32(100);
134
135 for (i = 0; i < NTEST; i++) {
136 unsigned long pfn = random32() % max_pfn_mapped;
137
138 addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
139 len[i] = random32() % 100;
140 len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
141
142 if (len[i] == 0)
143 len[i] = 1;
144
145 pte = NULL;
146 pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */
147
148 for (k = 0; k < len[i]; k++) {
149 pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
150 if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 ||
151 !(pte_val(*pte) & _PAGE_PRESENT)) {
152 addr[i] = 0;
153 break;
154 }
155 if (k == 0) {
156 pte0 = *pte;
157 } else {
158 if (pgprot_val(pte_pgprot(*pte)) !=
159 pgprot_val(pte_pgprot(pte0))) {
160 len[i] = k;
161 break;
162 }
163 }
164 if (test_bit(pfn + k, bm)) {
165 len[i] = k;
166 break;
167 }
168 __set_bit(pfn + k, bm);
169 }
170 if (!addr[i] || !pte || !k) {
171 addr[i] = 0;
172 continue;
173 }
174
175 err = change_page_attr_clear(addr[i], len[i],
176 __pgprot(_PAGE_GLOBAL));
177 if (err < 0) {
178 printk(KERN_ERR "CPA %d failed %d\n", i, err);
179 failed++;
180 }
181
182 pte = lookup_address(addr[i], &level);
183 if (!pte || pte_global(*pte) || pte_huge(*pte)) {
184 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
185 pte ? (u64)pte_val(*pte) : 0ULL);
186 failed++;
187 }
188 if (level != PG_LEVEL_4K) {
189 printk(KERN_ERR "CPA %lx: unexpected level %d\n",
190 addr[i], level);
191 failed++;
192 }
193
194 }
195 vfree(bm);
196
197 failed += print_split(&sb);
198
199 for (i = 0; i < NTEST; i++) {
200 if (!addr[i])
201 continue;
202 pte = lookup_address(addr[i], &level);
203 if (!pte) {
204 printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]);
205 failed++;
206 continue;
207 }
208 err = change_page_attr_set(addr[i], len[i],
209 __pgprot(_PAGE_GLOBAL));
210 if (err < 0) {
211 printk(KERN_ERR "CPA reverting failed: %d\n", err);
212 failed++;
213 }
214 pte = lookup_address(addr[i], &level);
215 if (!pte || !pte_global(*pte)) {
216 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
217 addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
218 failed++;
219 }
220
221 }
222
223 failed += print_split(&sc);
224
225 if (failed) {
226 printk(KERN_ERR "NOT PASSED. Please report.\n");
227 WARN_ON(1);
228 return -EINVAL;
229 } else {
230 if (print)
231 printk(KERN_INFO "ok.\n");
232 }
233
234 return 0;
235}
236
237static int do_pageattr_test(void *__unused)
238{
239 while (!kthread_should_stop()) {
240 schedule_timeout_interruptible(HZ*30);
241 if (pageattr_test() < 0)
242 break;
243 if (print)
244 print--;
245 }
246 return 0;
247}
248
249static int start_pageattr_test(void)
250{
251 struct task_struct *p;
252
253 p = kthread_create(do_pageattr_test, NULL, "pageattr-test");
254 if (!IS_ERR(p))
255 wake_up_process(p);
256 else
257 WARN_ON(1);
258
259 return 0;
260}
261
262module_init(start_pageattr_test);
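
The self-test above exercises the new CPA interface by clearing and then restoring _PAGE_GLOBAL on random ranges; pageattr.c below implements the requests by applying a clear mask and then a set mask to each pte's protection bits. A rough illustration of that mask arithmetic, using made-up bit values rather than the kernel's definitions:

/* Illustrative sketch: how a cpa_data request's mask_clr/mask_set transform
 * a pte's protection bits. Bit values are invented for the example. */
#include <stdio.h>

#define FAKE_PAGE_RW		0x002UL
#define FAKE_PAGE_GLOBAL	0x100UL

int main(void)
{
	unsigned long prot     = FAKE_PAGE_RW | FAKE_PAGE_GLOBAL;	/* current pte bits */
	unsigned long mask_clr = FAKE_PAGE_GLOBAL;	/* like change_page_attr_clear() */
	unsigned long mask_set = 0;

	prot &= ~mask_clr;	/* clear first ... */
	prot |= mask_set;	/* ... then set, mirroring __change_page_attr() */

	printf("new prot %#lx, GLOBAL now %s\n", prot,
	       (prot & FAKE_PAGE_GLOBAL) ? "set" : "clear");
	return 0;
}
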
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
new file mode 100644
index 00000000000..8493c855582
--- /dev/null
+++ b/arch/x86/mm/pageattr.c
@@ -0,0 +1,782 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5#include <linux/highmem.h>
6#include <linux/bootmem.h>
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/mm.h>
11
12#include <asm/e820.h>
13#include <asm/processor.h>
14#include <asm/tlbflush.h>
15#include <asm/sections.h>
16#include <asm/uaccess.h>
17#include <asm/pgalloc.h>
18
19/*
20 * The current flushing context - we pass it instead of 5 arguments:
21 */
22struct cpa_data {
23 unsigned long vaddr;
24 pgprot_t mask_set;
25 pgprot_t mask_clr;
26 int numpages;
27 int flushtlb;
28};
29
30static inline int
31within(unsigned long addr, unsigned long start, unsigned long end)
32{
33 return addr >= start && addr < end;
34}
35
36/*
37 * Flushing functions
38 */
39
40/**
41 * clflush_cache_range - flush a cache range with clflush
42 * @addr: virtual start address
43 * @size: number of bytes to flush
44 *
45 * clflush is an unordered instruction which needs fencing with mfence
46 * to avoid ordering issues.
47 */
48void clflush_cache_range(void *vaddr, unsigned int size)
49{
50 void *vend = vaddr + size - 1;
51
52 mb();
53
54 for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
55 clflush(vaddr);
56 /*
57 * Flush any possible final partial cacheline:
58 */
59 clflush(vend);
60
61 mb();
62}
63
64static void __cpa_flush_all(void *arg)
65{
66 unsigned long cache = (unsigned long)arg;
67
68 /*
69 * Flush all to work around Errata in early athlons regarding
70 * large page flushing.
71 */
72 __flush_tlb_all();
73
74 if (cache && boot_cpu_data.x86_model >= 4)
75 wbinvd();
76}
77
78static void cpa_flush_all(unsigned long cache)
79{
80 BUG_ON(irqs_disabled());
81
82 on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
83}
84
85static void __cpa_flush_range(void *arg)
86{
87 /*
88 * We could optimize that further and do individual per page
89 * tlb invalidates for a low number of pages. Caveat: we must
90 * flush the high aliases on 64bit as well.
91 */
92 __flush_tlb_all();
93}
94
95static void cpa_flush_range(unsigned long start, int numpages, int cache)
96{
97 unsigned int i, level;
98 unsigned long addr;
99
100 BUG_ON(irqs_disabled());
101 WARN_ON(PAGE_ALIGN(start) != start);
102
103 on_each_cpu(__cpa_flush_range, NULL, 1, 1);
104
105 if (!cache)
106 return;
107
108 /*
109 * We only need to flush on one CPU,
110 * clflush is a MESI-coherent instruction that
111 * will cause all other CPUs to flush the same
112 * cachelines:
113 */
114 for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
115 pte_t *pte = lookup_address(addr, &level);
116
117 /*
118 * Only flush present addresses:
119 */
120 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
121 clflush_cache_range((void *) addr, PAGE_SIZE);
122 }
123}
124
125#define HIGH_MAP_START __START_KERNEL_map
126#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE)
127
128
129/*
130 * Converts a virtual address to a X86-64 highmap address
131 */
132static unsigned long virt_to_highmap(void *address)
133{
134#ifdef CONFIG_X86_64
135 return __pa((unsigned long)address) + HIGH_MAP_START - phys_base;
136#else
137 return (unsigned long)address;
138#endif
139}
140
141/*
142 * Certain areas of memory on x86 require very specific protection flags,
143 * for example the BIOS area or kernel text. Callers don't always get this
144 * right (again, ioremap() on BIOS memory is not uncommon) so this function
145 * checks and fixes these known static required protection bits.
146 */
147static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
148{
149 pgprot_t forbidden = __pgprot(0);
150
151 /*
152 * The BIOS area between 640k and 1Mb needs to be executable for
153 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
154 */
155 if (within(__pa(address), BIOS_BEGIN, BIOS_END))
156 pgprot_val(forbidden) |= _PAGE_NX;
157
158 /*
159 * The kernel text needs to be executable for obvious reasons
160 * Does not cover __inittext since that is gone later on
161 */
162 if (within(address, (unsigned long)_text, (unsigned long)_etext))
163 pgprot_val(forbidden) |= _PAGE_NX;
164 /*
165 * Do the same for the x86-64 high kernel mapping
166 */
167 if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext)))
168 pgprot_val(forbidden) |= _PAGE_NX;
169
170 /* The .rodata section needs to be read-only */
171 if (within(address, (unsigned long)__start_rodata,
172 (unsigned long)__end_rodata))
173 pgprot_val(forbidden) |= _PAGE_RW;
174 /*
175 * Do the same for the x86-64 high kernel mapping
176 */
177 if (within(address, virt_to_highmap(__start_rodata),
178 virt_to_highmap(__end_rodata)))
179 pgprot_val(forbidden) |= _PAGE_RW;
180
181 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
182
183 return prot;
184}
185
186/*
187 * Lookup the page table entry for a virtual address. Return a pointer
188 * to the entry and the level of the mapping.
189 *
190 * Note: We return pud and pmd either when the entry is marked large
191 * or when the present bit is not set. Otherwise we would return a
192 * pointer to a nonexisting mapping.
193 */
194pte_t *lookup_address(unsigned long address, int *level)
195{
196 pgd_t *pgd = pgd_offset_k(address);
197 pud_t *pud;
198 pmd_t *pmd;
199
200 *level = PG_LEVEL_NONE;
201
202 if (pgd_none(*pgd))
203 return NULL;
204
205 pud = pud_offset(pgd, address);
206 if (pud_none(*pud))
207 return NULL;
208
209 *level = PG_LEVEL_1G;
210 if (pud_large(*pud) || !pud_present(*pud))
211 return (pte_t *)pud;
212
213 pmd = pmd_offset(pud, address);
214 if (pmd_none(*pmd))
215 return NULL;
216
217 *level = PG_LEVEL_2M;
218 if (pmd_large(*pmd) || !pmd_present(*pmd))
219 return (pte_t *)pmd;
220
221 *level = PG_LEVEL_4K;
222
223 return pte_offset_kernel(pmd, address);
224}
225
226/*
227 * Set the new pmd in all the pgds we know about:
228 */
229static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
230{
231 /* change init_mm */
232 set_pte_atomic(kpte, pte);
233#ifdef CONFIG_X86_32
234 if (!SHARED_KERNEL_PMD) {
235 struct page *page;
236
237 list_for_each_entry(page, &pgd_list, lru) {
238 pgd_t *pgd;
239 pud_t *pud;
240 pmd_t *pmd;
241
242 pgd = (pgd_t *)page_address(page) + pgd_index(address);
243 pud = pud_offset(pgd, address);
244 pmd = pmd_offset(pud, address);
245 set_pte_atomic((pte_t *)pmd, pte);
246 }
247 }
248#endif
249}
250
251static int
252try_preserve_large_page(pte_t *kpte, unsigned long address,
253 struct cpa_data *cpa)
254{
255 unsigned long nextpage_addr, numpages, pmask, psize, flags;
256 pte_t new_pte, old_pte, *tmp;
257 pgprot_t old_prot, new_prot;
258 int level, do_split = 1;
259
260 spin_lock_irqsave(&pgd_lock, flags);
261 /*
262 * Check for races, another CPU might have split this page
263 * up already:
264 */
265 tmp = lookup_address(address, &level);
266 if (tmp != kpte)
267 goto out_unlock;
268
269 switch (level) {
270 case PG_LEVEL_2M:
271 psize = PMD_PAGE_SIZE;
272 pmask = PMD_PAGE_MASK;
273 break;
274#ifdef CONFIG_X86_64
275 case PG_LEVEL_1G:
276 psize = PMD_PAGE_SIZE;
277 pmask = PMD_PAGE_MASK;
278 break;
279#endif
280 default:
281 do_split = -EINVAL;
282 goto out_unlock;
283 }
284
285 /*
286 * Calculate the number of pages, which fit into this large
287 * page starting at address:
288 */
289 nextpage_addr = (address + psize) & pmask;
290 numpages = (nextpage_addr - address) >> PAGE_SHIFT;
291 if (numpages < cpa->numpages)
292 cpa->numpages = numpages;
293
294 /*
295 * We are safe now. Check whether the new pgprot is the same:
296 */
297 old_pte = *kpte;
298 old_prot = new_prot = pte_pgprot(old_pte);
299
300 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
301 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
302 new_prot = static_protections(new_prot, address);
303
304 /*
305 * If there are no changes, return. maxpages has been updated
306 * above:
307 */
308 if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
309 do_split = 0;
310 goto out_unlock;
311 }
312
313 /*
314 * We need to change the attributes. Check, whether we can
315 * change the large page in one go. We request a split, when
316 * the address is not aligned and the number of pages is
317 * smaller than the number of pages in the large page. Note
318 * that we limited the number of possible pages already to
319 * the number of pages in the large page.
320 */
321 if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
322 /*
323 * The address is aligned and the number of pages
324 * covers the full page.
325 */
326 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
327 __set_pmd_pte(kpte, address, new_pte);
328 cpa->flushtlb = 1;
329 do_split = 0;
330 }
331
332out_unlock:
333 spin_unlock_irqrestore(&pgd_lock, flags);
334
335 return do_split;
336}
337
338static int split_large_page(pte_t *kpte, unsigned long address)
339{
340 unsigned long flags, pfn, pfninc = 1;
341 gfp_t gfp_flags = GFP_KERNEL;
342 unsigned int i, level;
343 pte_t *pbase, *tmp;
344 pgprot_t ref_prot;
345 struct page *base;
346
347#ifdef CONFIG_DEBUG_PAGEALLOC
348 gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
349#endif
350 base = alloc_pages(gfp_flags, 0);
351 if (!base)
352 return -ENOMEM;
353
354 spin_lock_irqsave(&pgd_lock, flags);
355 /*
356 * Check for races, another CPU might have split this page
357 * up for us already:
358 */
359 tmp = lookup_address(address, &level);
360 if (tmp != kpte)
361 goto out_unlock;
362
363 pbase = (pte_t *)page_address(base);
364#ifdef CONFIG_X86_32
365 paravirt_alloc_pt(&init_mm, page_to_pfn(base));
366#endif
367 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
368
369#ifdef CONFIG_X86_64
370 if (level == PG_LEVEL_1G) {
371 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
372 pgprot_val(ref_prot) |= _PAGE_PSE;
373 }
374#endif
375
376 /*
377 * Get the target pfn from the original entry:
378 */
379 pfn = pte_pfn(*kpte);
380 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
381 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
382
383 /*
384 * Install the new, split up pagetable. Important details here:
385 *
386 * On Intel the NX bit of all levels must be cleared to make a
387 * page executable. See section 4.13.2 of Intel 64 and IA-32
388 * Architectures Software Developer's Manual).
389 *
390 * Mark the entry present. The current mapping might be
391 * set to not present, which we preserved above.
392 */
393 ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
394 pgprot_val(ref_prot) |= _PAGE_PRESENT;
395 __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
396 base = NULL;
397
398out_unlock:
399 spin_unlock_irqrestore(&pgd_lock, flags);
400
401 if (base)
402 __free_pages(base, 0);
403
404 return 0;
405}
406
407static int __change_page_attr(unsigned long address, struct cpa_data *cpa)
408{
409 int level, do_split, err;
410 struct page *kpte_page;
411 pte_t *kpte;
412
413repeat:
414 kpte = lookup_address(address, &level);
415 if (!kpte)
416 return -EINVAL;
417
418 kpte_page = virt_to_page(kpte);
419 BUG_ON(PageLRU(kpte_page));
420 BUG_ON(PageCompound(kpte_page));
421
422 if (level == PG_LEVEL_4K) {
423 pte_t new_pte, old_pte = *kpte;
424 pgprot_t new_prot = pte_pgprot(old_pte);
425
426 if(!pte_val(old_pte)) {
427 printk(KERN_WARNING "CPA: called for zero pte. "
428 "vaddr = %lx cpa->vaddr = %lx\n", address,
429 cpa->vaddr);
430 WARN_ON(1);
431 return -EINVAL;
432 }
433
434 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
435 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
436
437 new_prot = static_protections(new_prot, address);
438
439 /*
440 * We need to keep the pfn from the existing PTE,
441 * after all we're only going to change its attributes
442 * not the memory it points to
443 */
444 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
445
446 /*
447 * Do we really change anything ?
448 */
449 if (pte_val(old_pte) != pte_val(new_pte)) {
450 set_pte_atomic(kpte, new_pte);
451 cpa->flushtlb = 1;
452 }
453 cpa->numpages = 1;
454 return 0;
455 }
456
457 /*
458 * Check, whether we can keep the large page intact
459 * and just change the pte:
460 */
461 do_split = try_preserve_large_page(kpte, address, cpa);
462 /*
463 * When the range fits into the existing large page,
464 * return. cpa->numpages and cpa->flushtlb have been updated in
465 * try_preserve_large_page():
466 */
467 if (do_split <= 0)
468 return do_split;
469
470 /*
471 * We have to split the large page:
472 */
473 err = split_large_page(kpte, address);
474 if (!err) {
475 cpa->flushtlb = 1;
476 goto repeat;
477 }
478
479 return err;
480}
481
482/**
483 * change_page_attr_addr - Change page table attributes in linear mapping
484 * @address: Virtual address in linear mapping.
485 * @prot: New page table attribute (PAGE_*)
486 *
487 * Change page attributes of a page in the direct mapping. This is a variant
488 * of change_page_attr() that also works on memory holes that do not have
489 * mem_map entry (pfn_valid() is false).
490 *
491 * See change_page_attr() documentation for more details.
492 *
493 * Modules and drivers should use the set_memory_* APIs instead.
494 */
495static int change_page_attr_addr(struct cpa_data *cpa)
496{
497 int err;
498 unsigned long address = cpa->vaddr;
499
500#ifdef CONFIG_X86_64
501 unsigned long phys_addr = __pa(address);
502
503 /*
504 * If we are inside the high mapped kernel range, then we
505 * fixup the low mapping first. __va() returns the virtual
506 * address in the linear mapping:
507 */
508 if (within(address, HIGH_MAP_START, HIGH_MAP_END))
509 address = (unsigned long) __va(phys_addr);
510#endif
511
512 err = __change_page_attr(address, cpa);
513 if (err)
514 return err;
515
516#ifdef CONFIG_X86_64
517 /*
518 * If the physical address is inside the kernel map, we need
519 * to touch the high mapped kernel as well:
520 */
521 if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) {
522 /*
523 * Calc the high mapping address. See __phys_addr()
524 * for the non obvious details.
525 *
526 * Note that NX and other required permissions are
527 * checked in static_protections().
528 */
529 address = phys_addr + HIGH_MAP_START - phys_base;
530
531 /*
532 * Our high aliases are imprecise, because we check
533 * everything between 0 and KERNEL_TEXT_SIZE, so do
534 * not propagate lookup failures back to users:
535 */
536 __change_page_attr(address, cpa);
537 }
538#endif
539 return err;
540}
541
542static int __change_page_attr_set_clr(struct cpa_data *cpa)
543{
544 int ret, numpages = cpa->numpages;
545
546 while (numpages) {
547 /*
548 * Store the remaining nr of pages for the large page
549 * preservation check.
550 */
551 cpa->numpages = numpages;
552 ret = change_page_attr_addr(cpa);
553 if (ret)
554 return ret;
555
556 /*
557 * Adjust the number of pages with the result of the
558 * CPA operation. Either a large page has been
559 * preserved or a single page update happened.
560 */
561 BUG_ON(cpa->numpages > numpages);
562 numpages -= cpa->numpages;
563 cpa->vaddr += cpa->numpages * PAGE_SIZE;
564 }
565 return 0;
566}
567
568static inline int cache_attr(pgprot_t attr)
569{
570 return pgprot_val(attr) &
571 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
572}
573
574static int change_page_attr_set_clr(unsigned long addr, int numpages,
575 pgprot_t mask_set, pgprot_t mask_clr)
576{
577 struct cpa_data cpa;
578 int ret, cache;
579
580 /*
581 * Check, if we are requested to change a not supported
582 * feature:
583 */
584 mask_set = canon_pgprot(mask_set);
585 mask_clr = canon_pgprot(mask_clr);
586 if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
587 return 0;
588
589 cpa.vaddr = addr;
590 cpa.numpages = numpages;
591 cpa.mask_set = mask_set;
592 cpa.mask_clr = mask_clr;
593 cpa.flushtlb = 0;
594
595 ret = __change_page_attr_set_clr(&cpa);
596
597 /*
598 * Check whether we really changed something:
599 */
600 if (!cpa.flushtlb)
601 return ret;
602
603 /*
604 * No need to flush, when we did not set any of the caching
605 * attributes:
606 */
607 cache = cache_attr(mask_set);
608
609 /*
610 * On success we use clflush, when the CPU supports it to
611 * avoid the wbinvd. If the CPU does not support it and in the
612 * error case we fall back to cpa_flush_all (which uses
613 * wbindv):
614 */
615 if (!ret && cpu_has_clflush)
616 cpa_flush_range(addr, numpages, cache);
617 else
618 cpa_flush_all(cache);
619
620 return ret;
621}
622
623static inline int change_page_attr_set(unsigned long addr, int numpages,
624 pgprot_t mask)
625{
626 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
627}
628
629static inline int change_page_attr_clear(unsigned long addr, int numpages,
630 pgprot_t mask)
631{
632 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
633}
634
635int set_memory_uc(unsigned long addr, int numpages)
636{
637 return change_page_attr_set(addr, numpages,
638 __pgprot(_PAGE_PCD | _PAGE_PWT));
639}
640EXPORT_SYMBOL(set_memory_uc);
641
642int set_memory_wb(unsigned long addr, int numpages)
643{
644 return change_page_attr_clear(addr, numpages,
645 __pgprot(_PAGE_PCD | _PAGE_PWT));
646}
647EXPORT_SYMBOL(set_memory_wb);
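/*
 * A minimal usage sketch of the exported set_memory_uc()/set_memory_wb()
 * pair, as a driver might call it (illustrative only, not part of this
 * file; the helper names are hypothetical and the declarations come from
 * <asm/cacheflush.h> and <linux/gfp.h>):
 */
static void *example_alloc_uncached(unsigned int order)
{
	unsigned long addr = __get_free_pages(GFP_KERNEL, order);

	if (!addr)
		return NULL;

	/* Switch the kernel linear mapping of this buffer to uncached. */
	if (set_memory_uc(addr, 1 << order)) {
		free_pages(addr, order);
		return NULL;
	}
	return (void *)addr;
}

static void example_free_uncached(void *buf, unsigned int order)
{
	/* Restore write-back caching before the pages are reused. */
	set_memory_wb((unsigned long)buf, 1 << order);
	free_pages((unsigned long)buf, order);
}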
648
649int set_memory_x(unsigned long addr, int numpages)
650{
651 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
652}
653EXPORT_SYMBOL(set_memory_x);
654
655int set_memory_nx(unsigned long addr, int numpages)
656{
657 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
658}
659EXPORT_SYMBOL(set_memory_nx);
660
661int set_memory_ro(unsigned long addr, int numpages)
662{
663 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
664}
665
666int set_memory_rw(unsigned long addr, int numpages)
667{
668 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
669}
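/*
 * Sketch of how set_memory_ro()/set_memory_rw() can bracket an update to a
 * normally write-protected, page-aligned kernel buffer (illustrative only;
 * the callback and buffer are hypothetical):
 */
static int example_update_protected(unsigned long buf, int pages,
				    void (*update)(void))
{
	int ret = set_memory_rw(buf, pages);

	if (ret)
		return ret;

	update();				/* modify the buffer */

	return set_memory_ro(buf, pages);	/* write-protect it again */
}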
670
671int set_memory_np(unsigned long addr, int numpages)
672{
673 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
674}
675
676int set_pages_uc(struct page *page, int numpages)
677{
678 unsigned long addr = (unsigned long)page_address(page);
679
680 return set_memory_uc(addr, numpages);
681}
682EXPORT_SYMBOL(set_pages_uc);
683
684int set_pages_wb(struct page *page, int numpages)
685{
686 unsigned long addr = (unsigned long)page_address(page);
687
688 return set_memory_wb(addr, numpages);
689}
690EXPORT_SYMBOL(set_pages_wb);
691
692int set_pages_x(struct page *page, int numpages)
693{
694 unsigned long addr = (unsigned long)page_address(page);
695
696 return set_memory_x(addr, numpages);
697}
698EXPORT_SYMBOL(set_pages_x);
699
700int set_pages_nx(struct page *page, int numpages)
701{
702 unsigned long addr = (unsigned long)page_address(page);
703
704 return set_memory_nx(addr, numpages);
705}
706EXPORT_SYMBOL(set_pages_nx);
707
708int set_pages_ro(struct page *page, int numpages)
709{
710 unsigned long addr = (unsigned long)page_address(page);
711
712 return set_memory_ro(addr, numpages);
713}
714
715int set_pages_rw(struct page *page, int numpages)
716{
717 unsigned long addr = (unsigned long)page_address(page);
718
719 return set_memory_rw(addr, numpages);
720}
721
722#ifdef CONFIG_DEBUG_PAGEALLOC
723
724static int __set_pages_p(struct page *page, int numpages)
725{
726 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
727 .numpages = numpages,
728 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
729 .mask_clr = __pgprot(0)};
730
731 return __change_page_attr_set_clr(&cpa);
732}
733
734static int __set_pages_np(struct page *page, int numpages)
735{
736 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
737 .numpages = numpages,
738 .mask_set = __pgprot(0),
739 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
740
741 return __change_page_attr_set_clr(&cpa);
742}
743
744void kernel_map_pages(struct page *page, int numpages, int enable)
745{
746 if (PageHighMem(page))
747 return;
748 if (!enable) {
749 debug_check_no_locks_freed(page_address(page),
750 numpages * PAGE_SIZE);
751 }
752
753 /*
754 * If page allocator is not up yet then do not call c_p_a():
755 */
756 if (!debug_pagealloc_enabled)
757 return;
758
759 /*
760	 * The return value is ignored - the calls cannot fail because
761	 * large pages are disabled at boot time:
762 */
763 if (enable)
764 __set_pages_p(page, numpages);
765 else
766 __set_pages_np(page, numpages);
767
768 /*
769	 * We should perform an IPI and flush all TLBs, but
770	 * that can deadlock, so flush only the current CPU:
771 */
772 __flush_tlb_all();
773}
774#endif
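/*
 * For context, a reduced sketch (not taken from this file) of how the page
 * allocator is expected to drive kernel_map_pages() when
 * CONFIG_DEBUG_PAGEALLOC is enabled: freed pages are unmapped so that a
 * use-after-free turns into an immediate page fault; the hook names are
 * hypothetical.
 */
static inline void debug_pagealloc_free_hook(struct page *page, int order)
{
	kernel_map_pages(page, 1 << order, 0);	/* unmap on free */
}

static inline void debug_pagealloc_alloc_hook(struct page *page, int order)
{
	kernel_map_pages(page, 1 << order, 1);	/* map again on allocation */
}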
775
776/*
777 * The testcases use internal knowledge of the implementation that shouldn't
778 * be exposed to the rest of the kernel. Include these directly here.
779 */
780#ifdef CONFIG_CPA_DEBUG
781#include "pageattr-test.c"
782#endif
diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c
deleted file mode 100644
index 260073c0760..00000000000
--- a/arch/x86/mm/pageattr_32.c
+++ /dev/null
@@ -1,278 +0,0 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/highmem.h>
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <asm/uaccess.h>
12#include <asm/processor.h>
13#include <asm/tlbflush.h>
14#include <asm/pgalloc.h>
15#include <asm/sections.h>
16
17static DEFINE_SPINLOCK(cpa_lock);
18static struct list_head df_list = LIST_HEAD_INIT(df_list);
19
20
21pte_t *lookup_address(unsigned long address)
22{
23 pgd_t *pgd = pgd_offset_k(address);
24 pud_t *pud;
25 pmd_t *pmd;
26 if (pgd_none(*pgd))
27 return NULL;
28 pud = pud_offset(pgd, address);
29 if (pud_none(*pud))
30 return NULL;
31 pmd = pmd_offset(pud, address);
32 if (pmd_none(*pmd))
33 return NULL;
34 if (pmd_large(*pmd))
35 return (pte_t *)pmd;
36 return pte_offset_kernel(pmd, address);
37}
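/*
 * Illustrative only (not part of the original file): lookup_address() above
 * returns either a 4k pte or, for a 2M/4M mapping, the pmd entry cast to a
 * pte pointer, so callers that care about the distinction re-check the
 * large-page bit:
 *
 *	static int addr_is_large_mapped(unsigned long addr)
 *	{
 *		pte_t *pte = lookup_address(addr);
 *
 *		return pte && pte_huge(*pte);
 *	}
 */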
38
39static struct page *split_large_page(unsigned long address, pgprot_t prot,
40 pgprot_t ref_prot)
41{
42 int i;
43 unsigned long addr;
44 struct page *base;
45 pte_t *pbase;
46
47 spin_unlock_irq(&cpa_lock);
48 base = alloc_pages(GFP_KERNEL, 0);
49 spin_lock_irq(&cpa_lock);
50 if (!base)
51 return NULL;
52
53 /*
54 * page_private is used to track the number of entries in
55	 * the page table page that have non-standard attributes.
56 */
57 SetPagePrivate(base);
58 page_private(base) = 0;
59
60 address = __pa(address);
61 addr = address & LARGE_PAGE_MASK;
62 pbase = (pte_t *)page_address(base);
63 paravirt_alloc_pt(&init_mm, page_to_pfn(base));
64 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
65 set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
66 addr == address ? prot : ref_prot));
67 }
68 return base;
69}
70
71static void cache_flush_page(struct page *p)
72{
73 void *adr = page_address(p);
74 int i;
75 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
76 clflush(adr+i);
77}
78
79static void flush_kernel_map(void *arg)
80{
81 struct list_head *lh = (struct list_head *)arg;
82 struct page *p;
83
84 /* High level code is not ready for clflush yet */
85 if (0 && cpu_has_clflush) {
86 list_for_each_entry (p, lh, lru)
87 cache_flush_page(p);
88 } else if (boot_cpu_data.x86_model >= 4)
89 wbinvd();
90
91	/* Flush all to work around errata in early Athlons regarding
92	 * large page flushing.
93 */
94 __flush_tlb_all();
95}
96
97static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
98{
99 struct page *page;
100 unsigned long flags;
101
102 set_pte_atomic(kpte, pte); /* change init_mm */
103 if (SHARED_KERNEL_PMD)
104 return;
105
106 spin_lock_irqsave(&pgd_lock, flags);
107 for (page = pgd_list; page; page = (struct page *)page->index) {
108 pgd_t *pgd;
109 pud_t *pud;
110 pmd_t *pmd;
111 pgd = (pgd_t *)page_address(page) + pgd_index(address);
112 pud = pud_offset(pgd, address);
113 pmd = pmd_offset(pud, address);
114 set_pte_atomic((pte_t *)pmd, pte);
115 }
116 spin_unlock_irqrestore(&pgd_lock, flags);
117}
118
119/*
120 * No more special protections in this 2/4MB area - revert to a
121 * large page again.
122 */
123static inline void revert_page(struct page *kpte_page, unsigned long address)
124{
125 pgprot_t ref_prot;
126 pte_t *linear;
127
128 ref_prot =
129 ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
130 ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
131
132 linear = (pte_t *)
133 pmd_offset(pud_offset(pgd_offset_k(address), address), address);
134 set_pmd_pte(linear, address,
135 pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
136 ref_prot));
137}
138
139static inline void save_page(struct page *kpte_page)
140{
141 if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
142 list_add(&kpte_page->lru, &df_list);
143}
144
145static int
146__change_page_attr(struct page *page, pgprot_t prot)
147{
148 pte_t *kpte;
149 unsigned long address;
150 struct page *kpte_page;
151
152 BUG_ON(PageHighMem(page));
153 address = (unsigned long)page_address(page);
154
155 kpte = lookup_address(address);
156 if (!kpte)
157 return -EINVAL;
158 kpte_page = virt_to_page(kpte);
159 BUG_ON(PageLRU(kpte_page));
160 BUG_ON(PageCompound(kpte_page));
161
162 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
163 if (!pte_huge(*kpte)) {
164 set_pte_atomic(kpte, mk_pte(page, prot));
165 } else {
166 pgprot_t ref_prot;
167 struct page *split;
168
169 ref_prot =
170 ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
171 ? PAGE_KERNEL_EXEC : PAGE_KERNEL;
172 split = split_large_page(address, prot, ref_prot);
173 if (!split)
174 return -ENOMEM;
175 set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
176 kpte_page = split;
177 }
178 page_private(kpte_page)++;
179 } else if (!pte_huge(*kpte)) {
180 set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
181 BUG_ON(page_private(kpte_page) == 0);
182 page_private(kpte_page)--;
183 } else
184 BUG();
185
186 /*
187 * If the pte was reserved, it means it was created at boot
188 * time (not via split_large_page) and in turn we must not
189 * replace it with a largepage.
190 */
191
192 save_page(kpte_page);
193 if (!PageReserved(kpte_page)) {
194 if (cpu_has_pse && (page_private(kpte_page) == 0)) {
195 paravirt_release_pt(page_to_pfn(kpte_page));
196 revert_page(kpte_page, address);
197 }
198 }
199 return 0;
200}
201
202static inline void flush_map(struct list_head *l)
203{
204 on_each_cpu(flush_kernel_map, l, 1, 1);
205}
206
207/*
208 * Change the page attributes of a page in the linear mapping.
209 *
210 * This should be used when a page is mapped with a different caching policy
211 * than write-back somewhere - some CPUs do not like it when mappings with
212 * different caching policies exist. This changes the page attributes of the
213 * in kernel linear mapping too.
214 *
215 * The caller needs to ensure that there are no conflicting mappings elsewhere.
216 * This function only deals with the kernel linear map.
217 *
218 * Caller must call global_flush_tlb() after this.
219 */
220int change_page_attr(struct page *page, int numpages, pgprot_t prot)
221{
222 int err = 0;
223 int i;
224 unsigned long flags;
225
226 spin_lock_irqsave(&cpa_lock, flags);
227 for (i = 0; i < numpages; i++, page++) {
228 err = __change_page_attr(page, prot);
229 if (err)
230 break;
231 }
232 spin_unlock_irqrestore(&cpa_lock, flags);
233 return err;
234}
235
236void global_flush_tlb(void)
237{
238 struct list_head l;
239 struct page *pg, *next;
240
241 BUG_ON(irqs_disabled());
242
243 spin_lock_irq(&cpa_lock);
244 list_replace_init(&df_list, &l);
245 spin_unlock_irq(&cpa_lock);
246 flush_map(&l);
247 list_for_each_entry_safe(pg, next, &l, lru) {
248 list_del(&pg->lru);
249 clear_bit(PG_arch_1, &pg->flags);
250 if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
251 continue;
252 ClearPagePrivate(pg);
253 __free_page(pg);
254 }
255}
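/*
 * For reference, the calling convention this (now removed) API required
 * from its users - change the attributes, then flush explicitly via
 * global_flush_tlb(). A sketch with a hypothetical caller:
 *
 *	static int make_pages_nocache(struct page *pg, int n)
 *	{
 *		int ret = change_page_attr(pg, n, PAGE_KERNEL_NOCACHE);
 *
 *		global_flush_tlb();
 *		return ret;
 *	}
 */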
256
257#ifdef CONFIG_DEBUG_PAGEALLOC
258void kernel_map_pages(struct page *page, int numpages, int enable)
259{
260 if (PageHighMem(page))
261 return;
262 if (!enable)
263 debug_check_no_locks_freed(page_address(page),
264 numpages * PAGE_SIZE);
265
266 /* the return value is ignored - the calls cannot fail,
267 * large pages are disabled at boot time.
268 */
269 change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
270 /* we should perform an IPI and flush all tlbs,
271 * but that can deadlock->flush only current cpu.
272 */
273 __flush_tlb_all();
274}
275#endif
276
277EXPORT_SYMBOL(change_page_attr);
278EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
deleted file mode 100644
index c40afbaaf93..00000000000
--- a/arch/x86/mm/pageattr_64.c
+++ /dev/null
@@ -1,255 +0,0 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/highmem.h>
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <asm/uaccess.h>
12#include <asm/processor.h>
13#include <asm/tlbflush.h>
14#include <asm/io.h>
15
16pte_t *lookup_address(unsigned long address)
17{
18 pgd_t *pgd = pgd_offset_k(address);
19 pud_t *pud;
20 pmd_t *pmd;
21 pte_t *pte;
22 if (pgd_none(*pgd))
23 return NULL;
24 pud = pud_offset(pgd, address);
25 if (!pud_present(*pud))
26 return NULL;
27 pmd = pmd_offset(pud, address);
28 if (!pmd_present(*pmd))
29 return NULL;
30 if (pmd_large(*pmd))
31 return (pte_t *)pmd;
32 pte = pte_offset_kernel(pmd, address);
33 if (pte && !pte_present(*pte))
34 pte = NULL;
35 return pte;
36}
37
38static struct page *split_large_page(unsigned long address, pgprot_t prot,
39 pgprot_t ref_prot)
40{
41 int i;
42 unsigned long addr;
43 struct page *base = alloc_pages(GFP_KERNEL, 0);
44 pte_t *pbase;
45 if (!base)
46 return NULL;
47 /*
48 * page_private is used to track the number of entries in
49 * the page table page that have non-standard attributes.
50 */
51 SetPagePrivate(base);
52 page_private(base) = 0;
53
54 address = __pa(address);
55 addr = address & LARGE_PAGE_MASK;
56 pbase = (pte_t *)page_address(base);
57 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
58 pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
59 addr == address ? prot : ref_prot);
60 }
61 return base;
62}
63
64void clflush_cache_range(void *adr, int size)
65{
66 int i;
67 for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
68 clflush(adr+i);
69}
70
71static void flush_kernel_map(void *arg)
72{
73 struct list_head *l = (struct list_head *)arg;
74 struct page *pg;
75
76 /* When clflush is available always use it because it is
77 much cheaper than WBINVD. */
78 /* clflush is still broken. Disable for now. */
79 if (1 || !cpu_has_clflush)
80 asm volatile("wbinvd" ::: "memory");
81 else list_for_each_entry(pg, l, lru) {
82 void *adr = page_address(pg);
83 clflush_cache_range(adr, PAGE_SIZE);
84 }
85 __flush_tlb_all();
86}
87
88static inline void flush_map(struct list_head *l)
89{
90 on_each_cpu(flush_kernel_map, l, 1, 1);
91}
92
93static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
94
95static inline void save_page(struct page *fpage)
96{
97 if (!test_and_set_bit(PG_arch_1, &fpage->flags))
98 list_add(&fpage->lru, &deferred_pages);
99}
100
101/*
102 * No more special protections in this 2/4MB area - revert to a
103 * large page again.
104 */
105static void revert_page(unsigned long address, pgprot_t ref_prot)
106{
107 pgd_t *pgd;
108 pud_t *pud;
109 pmd_t *pmd;
110 pte_t large_pte;
111 unsigned long pfn;
112
113 pgd = pgd_offset_k(address);
114 BUG_ON(pgd_none(*pgd));
115 pud = pud_offset(pgd,address);
116 BUG_ON(pud_none(*pud));
117 pmd = pmd_offset(pud, address);
118 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
119 pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
120 large_pte = pfn_pte(pfn, ref_prot);
121 large_pte = pte_mkhuge(large_pte);
122 set_pte((pte_t *)pmd, large_pte);
123}
124
125static int
126__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
127 pgprot_t ref_prot)
128{
129 pte_t *kpte;
130 struct page *kpte_page;
131 pgprot_t ref_prot2;
132
133 kpte = lookup_address(address);
134 if (!kpte) return 0;
135 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
136 BUG_ON(PageLRU(kpte_page));
137 BUG_ON(PageCompound(kpte_page));
138 if (pgprot_val(prot) != pgprot_val(ref_prot)) {
139 if (!pte_huge(*kpte)) {
140 set_pte(kpte, pfn_pte(pfn, prot));
141 } else {
142 /*
143 * split_large_page will take the reference for this
144 * change_page_attr on the split page.
145 */
146 struct page *split;
147 ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
148 split = split_large_page(address, prot, ref_prot2);
149 if (!split)
150 return -ENOMEM;
151 pgprot_val(ref_prot2) &= ~_PAGE_NX;
152 set_pte(kpte, mk_pte(split, ref_prot2));
153 kpte_page = split;
154 }
155 page_private(kpte_page)++;
156 } else if (!pte_huge(*kpte)) {
157 set_pte(kpte, pfn_pte(pfn, ref_prot));
158 BUG_ON(page_private(kpte_page) == 0);
159 page_private(kpte_page)--;
160 } else
161 BUG();
162
163 /* on x86-64 the direct mapping set at boot is not using 4k pages */
164 BUG_ON(PageReserved(kpte_page));
165
166 save_page(kpte_page);
167 if (page_private(kpte_page) == 0)
168 revert_page(address, ref_prot);
169 return 0;
170}
171
172/*
173 * Change the page attributes of a page in the linear mapping.
174 *
175 * This should be used when a page is mapped with a different caching policy
176 * than write-back somewhere - some CPUs do not like it when mappings with
177 * different caching policies exist. This changes the page attributes of the
178 * in kernel linear mapping too.
179 *
180 * The caller needs to ensure that there are no conflicting mappings elsewhere.
181 * This function only deals with the kernel linear map.
182 *
183 * Caller must call global_flush_tlb() after this.
184 */
185int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
186{
187 int err = 0, kernel_map = 0;
188 int i;
189
190 if (address >= __START_KERNEL_map
191 && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
192 address = (unsigned long)__va(__pa(address));
193 kernel_map = 1;
194 }
195
196 down_write(&init_mm.mmap_sem);
197 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
198 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
199
200 if (!kernel_map || pte_present(pfn_pte(0, prot))) {
201 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
202 if (err)
203 break;
204 }
205		/* Handle the kernel mapping too, which aliases part of
206		 * lowmem */
207 if (__pa(address) < KERNEL_TEXT_SIZE) {
208 unsigned long addr2;
209 pgprot_t prot2;
210 addr2 = __START_KERNEL_map + __pa(address);
211 /* Make sure the kernel mappings stay executable */
212 prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
213 err = __change_page_attr(addr2, pfn, prot2,
214 PAGE_KERNEL_EXEC);
215 }
216 }
217 up_write(&init_mm.mmap_sem);
218 return err;
219}
220
221/* Don't call this for MMIO areas that may not have a mem_map entry */
222int change_page_attr(struct page *page, int numpages, pgprot_t prot)
223{
224 unsigned long addr = (unsigned long)page_address(page);
225 return change_page_attr_addr(addr, numpages, prot);
226}
227
228void global_flush_tlb(void)
229{
230 struct page *pg, *next;
231 struct list_head l;
232
233 /*
234 * Write-protect the semaphore, to exclude two contexts
235 * doing a list_replace_init() call in parallel and to
236 * exclude new additions to the deferred_pages list:
237 */
238 down_write(&init_mm.mmap_sem);
239 list_replace_init(&deferred_pages, &l);
240 up_write(&init_mm.mmap_sem);
241
242 flush_map(&l);
243
244 list_for_each_entry_safe(pg, next, &l, lru) {
245 list_del(&pg->lru);
246 clear_bit(PG_arch_1, &pg->flags);
247 if (page_private(pg) != 0)
248 continue;
249 ClearPagePrivate(pg);
250 __free_page(pg);
251 }
252}
253
254EXPORT_SYMBOL(change_page_attr);
255EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index be61a1d845a..6c1914622a8 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -195,11 +195,6 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
195 return pte; 195 return pte;
196} 196}
197 197
198void pmd_ctor(struct kmem_cache *cache, void *pmd)
199{
200 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
201}
202
203/* 198/*
204 * List of all pgd's needed for non-PAE so it can invalidate entries 199 * List of all pgd's needed for non-PAE so it can invalidate entries
205 * in both cached and uncached pgd's; not needed for PAE since the 200 * in both cached and uncached pgd's; not needed for PAE since the
@@ -210,73 +205,53 @@ void pmd_ctor(struct kmem_cache *cache, void *pmd)
210 * vmalloc faults work because attached pagetables are never freed. 205 * vmalloc faults work because attached pagetables are never freed.
211 * -- wli 206 * -- wli
212 */ 207 */
213DEFINE_SPINLOCK(pgd_lock);
214struct page *pgd_list;
215
216static inline void pgd_list_add(pgd_t *pgd) 208static inline void pgd_list_add(pgd_t *pgd)
217{ 209{
218 struct page *page = virt_to_page(pgd); 210 struct page *page = virt_to_page(pgd);
219 page->index = (unsigned long)pgd_list; 211
220 if (pgd_list) 212 list_add(&page->lru, &pgd_list);
221 set_page_private(pgd_list, (unsigned long)&page->index);
222 pgd_list = page;
223 set_page_private(page, (unsigned long)&pgd_list);
224} 213}
225 214
226static inline void pgd_list_del(pgd_t *pgd) 215static inline void pgd_list_del(pgd_t *pgd)
227{ 216{
228 struct page *next, **pprev, *page = virt_to_page(pgd); 217 struct page *page = virt_to_page(pgd);
229 next = (struct page *)page->index;
230 pprev = (struct page **)page_private(page);
231 *pprev = next;
232 if (next)
233 set_page_private(next, (unsigned long)pprev);
234}
235 218
219 list_del(&page->lru);
220}
236 221
222#define UNSHARED_PTRS_PER_PGD \
223 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
237 224
238#if (PTRS_PER_PMD == 1) 225static void pgd_ctor(void *p)
239/* Non-PAE pgd constructor */
240static void pgd_ctor(void *pgd)
241{ 226{
227 pgd_t *pgd = p;
242 unsigned long flags; 228 unsigned long flags;
243 229
244 /* !PAE, no pagetable sharing */ 230 /* Clear usermode parts of PGD */
245 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); 231 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
246 232
247 spin_lock_irqsave(&pgd_lock, flags); 233 spin_lock_irqsave(&pgd_lock, flags);
248 234
249 /* must happen under lock */ 235 /* If the pgd points to a shared pagetable level (either the
250 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, 236 ptes in non-PAE, or shared PMD in PAE), then just copy the
251 swapper_pg_dir + USER_PTRS_PER_PGD, 237 references from swapper_pg_dir. */
252 KERNEL_PGD_PTRS); 238 if (PAGETABLE_LEVELS == 2 ||
253 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, 239 (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
254 __pa(swapper_pg_dir) >> PAGE_SHIFT, 240 clone_pgd_range(pgd + USER_PTRS_PER_PGD,
255 USER_PTRS_PER_PGD,
256 KERNEL_PGD_PTRS);
257 pgd_list_add(pgd);
258 spin_unlock_irqrestore(&pgd_lock, flags);
259}
260#else /* PTRS_PER_PMD > 1 */
261/* PAE pgd constructor */
262static void pgd_ctor(void *pgd)
263{
264 /* PAE, kernel PMD may be shared */
265
266 if (SHARED_KERNEL_PMD) {
267 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
268 swapper_pg_dir + USER_PTRS_PER_PGD, 241 swapper_pg_dir + USER_PTRS_PER_PGD,
269 KERNEL_PGD_PTRS); 242 KERNEL_PGD_PTRS);
270 } else { 243 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
271 unsigned long flags; 244 __pa(swapper_pg_dir) >> PAGE_SHIFT,
245 USER_PTRS_PER_PGD,
246 KERNEL_PGD_PTRS);
247 }
272 248
273 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); 249 /* list required to sync kernel mapping updates */
274 spin_lock_irqsave(&pgd_lock, flags); 250 if (!SHARED_KERNEL_PMD)
275 pgd_list_add(pgd); 251 pgd_list_add(pgd);
276 spin_unlock_irqrestore(&pgd_lock, flags); 252
277 } 253 spin_unlock_irqrestore(&pgd_lock, flags);
278} 254}
279#endif /* PTRS_PER_PMD */
280 255
281static void pgd_dtor(void *pgd) 256static void pgd_dtor(void *pgd)
282{ 257{
@@ -285,86 +260,101 @@ static void pgd_dtor(void *pgd)
285 if (SHARED_KERNEL_PMD) 260 if (SHARED_KERNEL_PMD)
286 return; 261 return;
287 262
288 paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
289 spin_lock_irqsave(&pgd_lock, flags); 263 spin_lock_irqsave(&pgd_lock, flags);
290 pgd_list_del(pgd); 264 pgd_list_del(pgd);
291 spin_unlock_irqrestore(&pgd_lock, flags); 265 spin_unlock_irqrestore(&pgd_lock, flags);
292} 266}
293 267
294#define UNSHARED_PTRS_PER_PGD \ 268#ifdef CONFIG_X86_PAE
295 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) 269/*
270 * Mop up any pmd pages which may still be attached to the pgd.
271 * Normally they will be freed by munmap/exit_mmap, but any pmd we
272 * preallocate which never got a corresponding vma will need to be
273 * freed manually.
274 */
275static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
276{
277 int i;
278
279 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
280 pgd_t pgd = pgdp[i];
281
282 if (pgd_val(pgd) != 0) {
283 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
284
285 pgdp[i] = native_make_pgd(0);
286
287 paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
288 pmd_free(mm, pmd);
289 }
290 }
291}
296 292
297/* If we allocate a pmd for part of the kernel address space, then 293/*
298 make sure its initialized with the appropriate kernel mappings. 294 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
299 Otherwise use a cached zeroed pmd. */ 295 * updating the top-level pagetable entries to guarantee the
300static pmd_t *pmd_cache_alloc(int idx) 296 * processor notices the update. Since this is expensive, and
297 * all 4 top-level entries are used almost immediately in a
298 * new process's life, we just pre-populate them here.
299 *
300 * Also, if we're in a paravirt environment where the kernel pmd is
301 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
302 * and initialize the kernel pmds here.
303 */
304static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
301{ 305{
302 pmd_t *pmd; 306 pud_t *pud;
307 unsigned long addr;
308 int i;
303 309
304 if (idx >= USER_PTRS_PER_PGD) { 310 pud = pud_offset(pgd, 0);
305 pmd = (pmd_t *)__get_free_page(GFP_KERNEL); 311 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
312 i++, pud++, addr += PUD_SIZE) {
313 pmd_t *pmd = pmd_alloc_one(mm, addr);
314
315 if (!pmd) {
316 pgd_mop_up_pmds(mm, pgd);
317 return 0;
318 }
306 319
307 if (pmd) 320 if (i >= USER_PTRS_PER_PGD)
308 memcpy(pmd, 321 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
309 (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
310 sizeof(pmd_t) * PTRS_PER_PMD); 322 sizeof(pmd_t) * PTRS_PER_PMD);
311 } else
312 pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
313 323
314 return pmd; 324 pud_populate(mm, pud, pmd);
325 }
326
327 return 1;
328}
329#else /* !CONFIG_X86_PAE */
330/* No need to prepopulate any pagetable entries in non-PAE modes. */
331static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
332{
333 return 1;
315} 334}
316 335
317static void pmd_cache_free(pmd_t *pmd, int idx) 336static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
318{ 337{
319 if (idx >= USER_PTRS_PER_PGD)
320 free_page((unsigned long)pmd);
321 else
322 kmem_cache_free(pmd_cache, pmd);
323} 338}
339#endif /* CONFIG_X86_PAE */
324 340
325pgd_t *pgd_alloc(struct mm_struct *mm) 341pgd_t *pgd_alloc(struct mm_struct *mm)
326{ 342{
327 int i;
328 pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); 343 pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
329 344
330 if (PTRS_PER_PMD == 1 || !pgd) 345 mm->pgd = pgd; /* so that alloc_pd can use it */
331 return pgd;
332
333 for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
334 pmd_t *pmd = pmd_cache_alloc(i);
335
336 if (!pmd)
337 goto out_oom;
338 346
339 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); 347 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
340 set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); 348 quicklist_free(0, pgd_dtor, pgd);
349 pgd = NULL;
341 } 350 }
342 return pgd;
343 351
344out_oom: 352 return pgd;
345 for (i--; i >= 0; i--) {
346 pgd_t pgdent = pgd[i];
347 void* pmd = (void *)__va(pgd_val(pgdent)-1);
348 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
349 pmd_cache_free(pmd, i);
350 }
351 quicklist_free(0, pgd_dtor, pgd);
352 return NULL;
353} 353}
354 354
355void pgd_free(pgd_t *pgd) 355void pgd_free(struct mm_struct *mm, pgd_t *pgd)
356{ 356{
357 int i; 357 pgd_mop_up_pmds(mm, pgd);
358
359 /* in the PAE case user pgd entries are overwritten before usage */
360 if (PTRS_PER_PMD > 1)
361 for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
362 pgd_t pgdent = pgd[i];
363 void* pmd = (void *)__va(pgd_val(pgdent)-1);
364 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
365 pmd_cache_free(pmd, i);
366 }
367 /* in the non-PAE case, free_pgtables() clears user pgd entries */
368 quicklist_free(0, pgd_dtor, pgd); 358 quicklist_free(0, pgd_dtor, pgd);
369} 359}
370 360
@@ -373,3 +363,18 @@ void check_pgt_cache(void)
373 quicklist_trim(0, pgd_dtor, 25, 16); 363 quicklist_trim(0, pgd_dtor, 25, 16);
374} 364}
375 365
366void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
367{
368 paravirt_release_pt(page_to_pfn(pte));
369 tlb_remove_page(tlb, pte);
370}
371
372#ifdef CONFIG_X86_PAE
373
374void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
375{
376 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
377 tlb_remove_page(tlb, virt_to_page(pmd));
378}
379
380#endif
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index ea85172fc0c..65416f843e5 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -130,6 +130,9 @@ void __init
130acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) 130acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
131{ 131{
132 int pxm, node; 132 int pxm, node;
133 int apic_id;
134
135 apic_id = pa->apic_id;
133 if (srat_disabled()) 136 if (srat_disabled())
134 return; 137 return;
135 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { 138 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
@@ -145,68 +148,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
145 bad_srat(); 148 bad_srat();
146 return; 149 return;
147 } 150 }
148 apicid_to_node[pa->apic_id] = node; 151 apicid_to_node[apic_id] = node;
149 acpi_numa = 1; 152 acpi_numa = 1;
150 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", 153 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
151 pxm, pa->apic_id, node); 154 pxm, apic_id, node);
152}
153
154#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
155/*
156 * Protect against too large hotadd areas that would fill up memory.
157 */
158static int hotadd_enough_memory(struct bootnode *nd)
159{
160 static unsigned long allocated;
161 static unsigned long last_area_end;
162 unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
163 long mem = pages * sizeof(struct page);
164 unsigned long addr;
165 unsigned long allowed;
166 unsigned long oldpages = pages;
167
168 if (mem < 0)
169 return 0;
170 allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
171 allowed = (allowed / 100) * hotadd_percent;
172 if (allocated + mem > allowed) {
173 unsigned long range;
174		/* Give them at least part of their hotadd memory, up to hotadd_percent.
175 It would be better to spread the limit out
176 over multiple hotplug areas, but that is too complicated
177 right now */
178 if (allocated >= allowed)
179 return 0;
180 range = allowed - allocated;
181 pages = (range / PAGE_SIZE);
182 mem = pages * sizeof(struct page);
183 nd->end = nd->start + range;
184 }
185 /* Not completely fool proof, but a good sanity check */
186 addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
187 if (addr == -1UL)
188 return 0;
189 if (pages != oldpages)
190 printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
191 pages << PAGE_SHIFT);
192 last_area_end = addr + mem;
193 allocated += mem;
194 return 1;
195}
196
197static int update_end_of_memory(unsigned long end)
198{
199 found_add_area = 1;
200 if ((end >> PAGE_SHIFT) > end_pfn)
201 end_pfn = end >> PAGE_SHIFT;
202 return 1;
203} 155}
204 156
205static inline int save_add_info(void)
206{
207 return hotadd_percent > 0;
208}
209#else
210int update_end_of_memory(unsigned long end) {return -1;} 157int update_end_of_memory(unsigned long end) {return -1;}
211static int hotadd_enough_memory(struct bootnode *nd) {return 1;} 158static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
212#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 159#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
@@ -214,10 +161,9 @@ static inline int save_add_info(void) {return 1;}
214#else 161#else
215static inline int save_add_info(void) {return 0;} 162static inline int save_add_info(void) {return 0;}
216#endif 163#endif
217#endif
218/* 164/*
219 * Update nodes_add and decide if to include add are in the zone. 165 * Update nodes_add and decide if to include add are in the zone.
220 * Both SPARSE and RESERVE need nodes_add infomation. 166 * Both SPARSE and RESERVE need nodes_add information.
221 * This code supports one contiguous hot add area per node. 167 * This code supports one contiguous hot add area per node.
222 */ 168 */
223static int reserve_hotadd(int node, unsigned long start, unsigned long end) 169static int reserve_hotadd(int node, unsigned long start, unsigned long end)
@@ -377,7 +323,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
377 return 1; 323 return 1;
378} 324}
379 325
380static void unparse_node(int node) 326static void __init unparse_node(int node)
381{ 327{
382 int i; 328 int i;
383 node_clear(node, nodes_parsed); 329 node_clear(node, nodes_parsed);
@@ -400,7 +346,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
400 /* First clean up the node list */ 346 /* First clean up the node list */
401 for (i = 0; i < MAX_NUMNODES; i++) { 347 for (i = 0; i < MAX_NUMNODES; i++) {
402 cutoff_node(i, start, end); 348 cutoff_node(i, start, end);
403 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { 349 /*
350 * don't confuse VM with a node that doesn't have the
351 * minimum memory.
352 */
353 if (nodes[i].end &&
354 (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
404 unparse_node(i); 355 unparse_node(i);
405 node_set_offline(i); 356 node_set_offline(i);
406 } 357 }
@@ -431,9 +382,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
431 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 382 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
432 383
433 for (i = 0; i < NR_CPUS; i++) { 384 for (i = 0; i < NR_CPUS; i++) {
434 if (cpu_to_node(i) == NUMA_NO_NODE) 385 int node = early_cpu_to_node(i);
386
387 if (node == NUMA_NO_NODE)
435 continue; 388 continue;
436 if (!node_isset(cpu_to_node(i), node_possible_map)) 389 if (!node_isset(node, node_possible_map))
437 numa_set_node(i, NUMA_NO_NODE); 390 numa_set_node(i, NUMA_NO_NODE);
438 } 391 }
439 numa_init_array(); 392 numa_init_array();
@@ -441,6 +394,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
441} 394}
442 395
443#ifdef CONFIG_NUMA_EMU 396#ifdef CONFIG_NUMA_EMU
397static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
398 [0 ... MAX_NUMNODES-1] = PXM_INVAL
399};
400static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
401 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
402};
444static int __init find_node_by_addr(unsigned long addr) 403static int __init find_node_by_addr(unsigned long addr)
445{ 404{
446 int ret = NUMA_NO_NODE; 405 int ret = NUMA_NO_NODE;
@@ -457,7 +416,7 @@ static int __init find_node_by_addr(unsigned long addr)
457 break; 416 break;
458 } 417 }
459 } 418 }
460 return i; 419 return ret;
461} 420}
462 421
463/* 422/*
@@ -471,12 +430,6 @@ static int __init find_node_by_addr(unsigned long addr)
471void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) 430void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
472{ 431{
473 int i, j; 432 int i, j;
474 int fake_node_to_pxm_map[MAX_NUMNODES] = {
475 [0 ... MAX_NUMNODES-1] = PXM_INVAL
476 };
477 unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
478 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
479 };
480 433
481 printk(KERN_INFO "Faking PXM affinity for fake nodes on real " 434 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
482 "topology.\n"); 435 "topology.\n");
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 0ed046a187f..e2095cba409 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -32,7 +32,7 @@ static int backtrace_stack(void *data, char *name)
32 return 0; 32 return 0;
33} 33}
34 34
35static void backtrace_address(void *data, unsigned long addr) 35static void backtrace_address(void *data, unsigned long addr, int reliable)
36{ 36{
37 unsigned int *depth = data; 37 unsigned int *depth = data;
38 38
@@ -48,7 +48,7 @@ static struct stacktrace_ops backtrace_ops = {
48}; 48};
49 49
50struct frame_head { 50struct frame_head {
51 struct frame_head *ebp; 51 struct frame_head *bp;
52 unsigned long ret; 52 unsigned long ret;
53} __attribute__((packed)); 53} __attribute__((packed));
54 54
@@ -67,21 +67,21 @@ dump_user_backtrace(struct frame_head * head)
67 67
68 /* frame pointers should strictly progress back up the stack 68 /* frame pointers should strictly progress back up the stack
69 * (towards higher addresses) */ 69 * (towards higher addresses) */
70 if (head >= bufhead[0].ebp) 70 if (head >= bufhead[0].bp)
71 return NULL; 71 return NULL;
72 72
73 return bufhead[0].ebp; 73 return bufhead[0].bp;
74} 74}
75 75
76void 76void
77x86_backtrace(struct pt_regs * const regs, unsigned int depth) 77x86_backtrace(struct pt_regs * const regs, unsigned int depth)
78{ 78{
79 struct frame_head *head = (struct frame_head *)frame_pointer(regs); 79 struct frame_head *head = (struct frame_head *)frame_pointer(regs);
80 unsigned long stack = stack_pointer(regs); 80 unsigned long stack = kernel_trap_sp(regs);
81 81
82 if (!user_mode_vm(regs)) { 82 if (!user_mode_vm(regs)) {
83 if (depth) 83 if (depth)
84 dump_trace(NULL, regs, (unsigned long *)stack, 84 dump_trace(NULL, regs, (unsigned long *)stack, 0,
85 &backtrace_ops, &depth); 85 &backtrace_ops, &depth);
86 return; 86 return;
87 } 87 }
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2d0eeac7251..1f11cf0a307 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -18,11 +18,11 @@
18#include <asm/nmi.h> 18#include <asm/nmi.h>
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/apic.h> 20#include <asm/apic.h>
21 21
22#include "op_counter.h" 22#include "op_counter.h"
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24 24
25static struct op_x86_model_spec const * model; 25static struct op_x86_model_spec const *model;
26static struct op_msrs cpu_msrs[NR_CPUS]; 26static struct op_msrs cpu_msrs[NR_CPUS];
27static unsigned long saved_lvtpc[NR_CPUS]; 27static unsigned long saved_lvtpc[NR_CPUS];
28 28
@@ -41,7 +41,6 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state)
41 return 0; 41 return 0;
42} 42}
43 43
44
45static int nmi_resume(struct sys_device *dev) 44static int nmi_resume(struct sys_device *dev)
46{ 45{
47 if (nmi_enabled == 1) 46 if (nmi_enabled == 1)
@@ -49,29 +48,27 @@ static int nmi_resume(struct sys_device *dev)
49 return 0; 48 return 0;
50} 49}
51 50
52
53static struct sysdev_class oprofile_sysclass = { 51static struct sysdev_class oprofile_sysclass = {
54 set_kset_name("oprofile"), 52 .name = "oprofile",
55 .resume = nmi_resume, 53 .resume = nmi_resume,
56 .suspend = nmi_suspend, 54 .suspend = nmi_suspend,
57}; 55};
58 56
59
60static struct sys_device device_oprofile = { 57static struct sys_device device_oprofile = {
61 .id = 0, 58 .id = 0,
62 .cls = &oprofile_sysclass, 59 .cls = &oprofile_sysclass,
63}; 60};
64 61
65
66static int __init init_sysfs(void) 62static int __init init_sysfs(void)
67{ 63{
68 int error; 64 int error;
69 if (!(error = sysdev_class_register(&oprofile_sysclass))) 65
66 error = sysdev_class_register(&oprofile_sysclass);
67 if (!error)
70 error = sysdev_register(&device_oprofile); 68 error = sysdev_register(&device_oprofile);
71 return error; 69 return error;
72} 70}
73 71
74
75static void exit_sysfs(void) 72static void exit_sysfs(void)
76{ 73{
77 sysdev_unregister(&device_oprofile); 74 sysdev_unregister(&device_oprofile);
@@ -90,7 +87,7 @@ static int profile_exceptions_notify(struct notifier_block *self,
90 int ret = NOTIFY_DONE; 87 int ret = NOTIFY_DONE;
91 int cpu = smp_processor_id(); 88 int cpu = smp_processor_id();
92 89
93 switch(val) { 90 switch (val) {
94 case DIE_NMI: 91 case DIE_NMI:
95 if (model->check_ctrs(args->regs, &cpu_msrs[cpu])) 92 if (model->check_ctrs(args->regs, &cpu_msrs[cpu]))
96 ret = NOTIFY_STOP; 93 ret = NOTIFY_STOP;
@@ -101,24 +98,24 @@ static int profile_exceptions_notify(struct notifier_block *self,
101 return ret; 98 return ret;
102} 99}
103 100
104static void nmi_cpu_save_registers(struct op_msrs * msrs) 101static void nmi_cpu_save_registers(struct op_msrs *msrs)
105{ 102{
106 unsigned int const nr_ctrs = model->num_counters; 103 unsigned int const nr_ctrs = model->num_counters;
107 unsigned int const nr_ctrls = model->num_controls; 104 unsigned int const nr_ctrls = model->num_controls;
108 struct op_msr * counters = msrs->counters; 105 struct op_msr *counters = msrs->counters;
109 struct op_msr * controls = msrs->controls; 106 struct op_msr *controls = msrs->controls;
110 unsigned int i; 107 unsigned int i;
111 108
112 for (i = 0; i < nr_ctrs; ++i) { 109 for (i = 0; i < nr_ctrs; ++i) {
113 if (counters[i].addr){ 110 if (counters[i].addr) {
114 rdmsr(counters[i].addr, 111 rdmsr(counters[i].addr,
115 counters[i].saved.low, 112 counters[i].saved.low,
116 counters[i].saved.high); 113 counters[i].saved.high);
117 } 114 }
118 } 115 }
119 116
120 for (i = 0; i < nr_ctrls; ++i) { 117 for (i = 0; i < nr_ctrls; ++i) {
121 if (controls[i].addr){ 118 if (controls[i].addr) {
122 rdmsr(controls[i].addr, 119 rdmsr(controls[i].addr,
123 controls[i].saved.low, 120 controls[i].saved.low,
124 controls[i].saved.high); 121 controls[i].saved.high);
@@ -126,15 +123,13 @@ static void nmi_cpu_save_registers(struct op_msrs * msrs)
126 } 123 }
127} 124}
128 125
129 126static void nmi_save_registers(void *dummy)
130static void nmi_save_registers(void * dummy)
131{ 127{
132 int cpu = smp_processor_id(); 128 int cpu = smp_processor_id();
133 struct op_msrs * msrs = &cpu_msrs[cpu]; 129 struct op_msrs *msrs = &cpu_msrs[cpu];
134 nmi_cpu_save_registers(msrs); 130 nmi_cpu_save_registers(msrs);
135} 131}
136 132
137
138static void free_msrs(void) 133static void free_msrs(void)
139{ 134{
140 int i; 135 int i;
@@ -146,7 +141,6 @@ static void free_msrs(void)
146 } 141 }
147} 142}
148 143
149
150static int allocate_msrs(void) 144static int allocate_msrs(void)
151{ 145{
152 int success = 1; 146 int success = 1;
@@ -173,11 +167,10 @@ static int allocate_msrs(void)
173 return success; 167 return success;
174} 168}
175 169
176 170static void nmi_cpu_setup(void *dummy)
177static void nmi_cpu_setup(void * dummy)
178{ 171{
179 int cpu = smp_processor_id(); 172 int cpu = smp_processor_id();
180 struct op_msrs * msrs = &cpu_msrs[cpu]; 173 struct op_msrs *msrs = &cpu_msrs[cpu];
181 spin_lock(&oprofilefs_lock); 174 spin_lock(&oprofilefs_lock);
182 model->setup_ctrs(msrs); 175 model->setup_ctrs(msrs);
183 spin_unlock(&oprofilefs_lock); 176 spin_unlock(&oprofilefs_lock);
@@ -193,13 +186,14 @@ static struct notifier_block profile_exceptions_nb = {
193 186
194static int nmi_setup(void) 187static int nmi_setup(void)
195{ 188{
196 int err=0; 189 int err = 0;
197 int cpu; 190 int cpu;
198 191
199 if (!allocate_msrs()) 192 if (!allocate_msrs())
200 return -ENOMEM; 193 return -ENOMEM;
201 194
202 if ((err = register_die_notifier(&profile_exceptions_nb))){ 195 err = register_die_notifier(&profile_exceptions_nb);
196 if (err) {
203 free_msrs(); 197 free_msrs();
204 return err; 198 return err;
205 } 199 }
@@ -210,7 +204,7 @@ static int nmi_setup(void)
210 204
211 /* Assume saved/restored counters are the same on all CPUs */ 205 /* Assume saved/restored counters are the same on all CPUs */
212 model->fill_in_addresses(&cpu_msrs[0]); 206 model->fill_in_addresses(&cpu_msrs[0]);
213 for_each_possible_cpu (cpu) { 207 for_each_possible_cpu(cpu) {
214 if (cpu != 0) { 208 if (cpu != 0) {
215 memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters, 209 memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters,
216 sizeof(struct op_msr) * model->num_counters); 210 sizeof(struct op_msr) * model->num_counters);
@@ -226,39 +220,37 @@ static int nmi_setup(void)
226 return 0; 220 return 0;
227} 221}
228 222
229 223static void nmi_restore_registers(struct op_msrs *msrs)
230static void nmi_restore_registers(struct op_msrs * msrs)
231{ 224{
232 unsigned int const nr_ctrs = model->num_counters; 225 unsigned int const nr_ctrs = model->num_counters;
233 unsigned int const nr_ctrls = model->num_controls; 226 unsigned int const nr_ctrls = model->num_controls;
234 struct op_msr * counters = msrs->counters; 227 struct op_msr *counters = msrs->counters;
235 struct op_msr * controls = msrs->controls; 228 struct op_msr *controls = msrs->controls;
236 unsigned int i; 229 unsigned int i;
237 230
238 for (i = 0; i < nr_ctrls; ++i) { 231 for (i = 0; i < nr_ctrls; ++i) {
239 if (controls[i].addr){ 232 if (controls[i].addr) {
240 wrmsr(controls[i].addr, 233 wrmsr(controls[i].addr,
241 controls[i].saved.low, 234 controls[i].saved.low,
242 controls[i].saved.high); 235 controls[i].saved.high);
243 } 236 }
244 } 237 }
245 238
246 for (i = 0; i < nr_ctrs; ++i) { 239 for (i = 0; i < nr_ctrs; ++i) {
247 if (counters[i].addr){ 240 if (counters[i].addr) {
248 wrmsr(counters[i].addr, 241 wrmsr(counters[i].addr,
249 counters[i].saved.low, 242 counters[i].saved.low,
250 counters[i].saved.high); 243 counters[i].saved.high);
251 } 244 }
252 } 245 }
253} 246}
254
255 247
256static void nmi_cpu_shutdown(void * dummy) 248static void nmi_cpu_shutdown(void *dummy)
257{ 249{
258 unsigned int v; 250 unsigned int v;
259 int cpu = smp_processor_id(); 251 int cpu = smp_processor_id();
260 struct op_msrs * msrs = &cpu_msrs[cpu]; 252 struct op_msrs *msrs = &cpu_msrs[cpu];
261 253
262 /* restoring APIC_LVTPC can trigger an apic error because the delivery 254 /* restoring APIC_LVTPC can trigger an apic error because the delivery
263 * mode and vector nr combination can be illegal. That's by design: on 255 * mode and vector nr combination can be illegal. That's by design: on
264 * power on apic lvt contain a zero vector nr which are legal only for 256 * power on apic lvt contain a zero vector nr which are legal only for
@@ -271,7 +263,6 @@ static void nmi_cpu_shutdown(void * dummy)
271 nmi_restore_registers(msrs); 263 nmi_restore_registers(msrs);
272} 264}
273 265
274
275static void nmi_shutdown(void) 266static void nmi_shutdown(void)
276{ 267{
277 nmi_enabled = 0; 268 nmi_enabled = 0;
@@ -281,45 +272,40 @@ static void nmi_shutdown(void)
281 free_msrs(); 272 free_msrs();
282} 273}
283 274
284 275static void nmi_cpu_start(void *dummy)
285static void nmi_cpu_start(void * dummy)
286{ 276{
287 struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()]; 277 struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()];
288 model->start(msrs); 278 model->start(msrs);
289} 279}
290
291 280
292static int nmi_start(void) 281static int nmi_start(void)
293{ 282{
294 on_each_cpu(nmi_cpu_start, NULL, 0, 1); 283 on_each_cpu(nmi_cpu_start, NULL, 0, 1);
295 return 0; 284 return 0;
296} 285}
297 286
298 287static void nmi_cpu_stop(void *dummy)
299static void nmi_cpu_stop(void * dummy)
300{ 288{
301 struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()]; 289 struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()];
302 model->stop(msrs); 290 model->stop(msrs);
303} 291}
304 292
305
306static void nmi_stop(void) 293static void nmi_stop(void)
307{ 294{
308 on_each_cpu(nmi_cpu_stop, NULL, 0, 1); 295 on_each_cpu(nmi_cpu_stop, NULL, 0, 1);
309} 296}
310 297
311
312struct op_counter_config counter_config[OP_MAX_COUNTER]; 298struct op_counter_config counter_config[OP_MAX_COUNTER];
313 299
314static int nmi_create_files(struct super_block * sb, struct dentry * root) 300static int nmi_create_files(struct super_block *sb, struct dentry *root)
315{ 301{
316 unsigned int i; 302 unsigned int i;
317 303
318 for (i = 0; i < model->num_counters; ++i) { 304 for (i = 0; i < model->num_counters; ++i) {
319 struct dentry * dir; 305 struct dentry *dir;
320 char buf[4]; 306 char buf[4];
321 307
322 /* quick little hack to _not_ expose a counter if it is not 308 /* quick little hack to _not_ expose a counter if it is not
323 * available for use. This should protect userspace app. 309 * available for use. This should protect userspace app.
324 * NOTE: assumes 1:1 mapping here (that counters are organized 310 * NOTE: assumes 1:1 mapping here (that counters are organized
325 * sequentially in their struct assignment). 311 * sequentially in their struct assignment).
@@ -329,21 +315,21 @@ static int nmi_create_files(struct super_block * sb, struct dentry * root)
329 315
330 snprintf(buf, sizeof(buf), "%d", i); 316 snprintf(buf, sizeof(buf), "%d", i);
331 dir = oprofilefs_mkdir(sb, root, buf); 317 dir = oprofilefs_mkdir(sb, root, buf);
332 oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); 318 oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
333 oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); 319 oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
334 oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); 320 oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
335 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); 321 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
336 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); 322 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
337 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); 323 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
338 } 324 }
339 325
340 return 0; 326 return 0;
341} 327}
342 328
343static int p4force; 329static int p4force;
344module_param(p4force, int, 0); 330module_param(p4force, int, 0);
345 331
346static int __init p4_init(char ** cpu_type) 332static int __init p4_init(char **cpu_type)
347{ 333{
348 __u8 cpu_model = boot_cpu_data.x86_model; 334 __u8 cpu_model = boot_cpu_data.x86_model;
349 335
@@ -356,15 +342,15 @@ static int __init p4_init(char ** cpu_type)
356 return 1; 342 return 1;
357#else 343#else
358 switch (smp_num_siblings) { 344 switch (smp_num_siblings) {
359 case 1: 345 case 1:
360 *cpu_type = "i386/p4"; 346 *cpu_type = "i386/p4";
361 model = &op_p4_spec; 347 model = &op_p4_spec;
362 return 1; 348 return 1;
363 349
364 case 2: 350 case 2:
365 *cpu_type = "i386/p4-ht"; 351 *cpu_type = "i386/p4-ht";
366 model = &op_p4_ht2_spec; 352 model = &op_p4_ht2_spec;
367 return 1; 353 return 1;
368 } 354 }
369#endif 355#endif
370 356
@@ -373,14 +359,13 @@ static int __init p4_init(char ** cpu_type)
373 return 0; 359 return 0;
374} 360}
375 361
376 362static int __init ppro_init(char **cpu_type)
377static int __init ppro_init(char ** cpu_type)
378{ 363{
379 __u8 cpu_model = boot_cpu_data.x86_model; 364 __u8 cpu_model = boot_cpu_data.x86_model;
380 365
381 if (cpu_model == 14) 366 if (cpu_model == 14)
382 *cpu_type = "i386/core"; 367 *cpu_type = "i386/core";
383 else if (cpu_model == 15) 368 else if (cpu_model == 15 || cpu_model == 23)
384 *cpu_type = "i386/core_2"; 369 *cpu_type = "i386/core_2";
385 else if (cpu_model > 0xd) 370 else if (cpu_model > 0xd)
386 return 0; 371 return 0;
@@ -409,52 +394,52 @@ int __init op_nmi_init(struct oprofile_operations *ops)
409 394
410 if (!cpu_has_apic) 395 if (!cpu_has_apic)
411 return -ENODEV; 396 return -ENODEV;
412 397
413 switch (vendor) { 398 switch (vendor) {
414 case X86_VENDOR_AMD: 399 case X86_VENDOR_AMD:
415 /* Needs to be at least an Athlon (or hammer in 32bit mode) */ 400 /* Needs to be at least an Athlon (or hammer in 32bit mode) */
416 401
417 switch (family) { 402 switch (family) {
418 default: 403 default:
404 return -ENODEV;
405 case 6:
406 model = &op_athlon_spec;
407 cpu_type = "i386/athlon";
408 break;
409 case 0xf:
410 model = &op_athlon_spec;
411 /* Actually it could be i386/hammer too, but give
412			   user space a consistent name. */
413 cpu_type = "x86-64/hammer";
414 break;
415 case 0x10:
416 model = &op_athlon_spec;
417 cpu_type = "x86-64/family10";
418 break;
419 }
420 break;
421
422 case X86_VENDOR_INTEL:
423 switch (family) {
424 /* Pentium IV */
425 case 0xf:
426 if (!p4_init(&cpu_type))
419 return -ENODEV; 427 return -ENODEV;
420 case 6:
421 model = &op_athlon_spec;
422 cpu_type = "i386/athlon";
423 break;
424 case 0xf:
425 model = &op_athlon_spec;
426 /* Actually it could be i386/hammer too, but give
427			   user space a consistent name. */
428 cpu_type = "x86-64/hammer";
429 break;
430 case 0x10:
431 model = &op_athlon_spec;
432 cpu_type = "x86-64/family10";
433 break;
434 }
435 break; 428 break;
436 429
437 case X86_VENDOR_INTEL: 430 /* A P6-class processor */
438 switch (family) { 431 case 6:
439 /* Pentium IV */ 432 if (!ppro_init(&cpu_type))
440 case 0xf: 433 return -ENODEV;
441 if (!p4_init(&cpu_type))
442 return -ENODEV;
443 break;
444
445 /* A P6-class processor */
446 case 6:
447 if (!ppro_init(&cpu_type))
448 return -ENODEV;
449 break;
450
451 default:
452 return -ENODEV;
453 }
454 break; 434 break;
455 435
456 default: 436 default:
457 return -ENODEV; 437 return -ENODEV;
438 }
439 break;
440
441 default:
442 return -ENODEV;
458 } 443 }
459 444
460 init_sysfs(); 445 init_sysfs();
@@ -469,7 +454,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
469 return 0; 454 return 0;
470} 455}
471 456
472
473void op_nmi_exit(void) 457void op_nmi_exit(void)
474{ 458{
475 if (using_nmi) 459 if (using_nmi)
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 86274639066..52deabc72a6 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -109,6 +109,19 @@ static void __devinit pcibios_fixup_ghosts(struct pci_bus *b)
109 } 109 }
110} 110}
111 111
112static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
113{
114 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
115
116 if (rom_r->parent)
117 return;
118 if (rom_r->start)
119 /* we deal with BIOS assigned ROM later */
120 return;
121 if (!(pci_probe & PCI_ASSIGN_ROMS))
122 rom_r->start = rom_r->end = rom_r->flags = 0;
123}
124
112/* 125/*
113 * Called after each bus is probed, but before its children 126 * Called after each bus is probed, but before its children
114 * are examined. 127 * are examined.
@@ -116,8 +129,12 @@ static void __devinit pcibios_fixup_ghosts(struct pci_bus *b)
116 129
117void __devinit pcibios_fixup_bus(struct pci_bus *b) 130void __devinit pcibios_fixup_bus(struct pci_bus *b)
118{ 131{
132 struct pci_dev *dev;
133
119 pcibios_fixup_ghosts(b); 134 pcibios_fixup_ghosts(b);
120 pci_read_bridge_bases(b); 135 pci_read_bridge_bases(b);
136 list_for_each_entry(dev, &b->devices, bus_list)
137 pcibios_fixup_device_resources(dev);
121} 138}
122 139
123/* 140/*
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 6cff66dd0c9..74d30ff33c4 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -17,9 +17,9 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
17 int pxb, reg; 17 int pxb, reg;
18 u8 busno, suba, subb; 18 u8 busno, suba, subb;
19 19
20 printk(KERN_WARNING "PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); 20 dev_warn(&d->dev, "Searching for i450NX host bridges\n");
21 reg = 0xd0; 21 reg = 0xd0;
22 for(pxb=0; pxb<2; pxb++) { 22 for(pxb = 0; pxb < 2; pxb++) {
23 pci_read_config_byte(d, reg++, &busno); 23 pci_read_config_byte(d, reg++, &busno);
24 pci_read_config_byte(d, reg++, &suba); 24 pci_read_config_byte(d, reg++, &suba);
25 pci_read_config_byte(d, reg++, &subb); 25 pci_read_config_byte(d, reg++, &subb);
@@ -41,7 +41,7 @@ static void __devinit pci_fixup_i450gx(struct pci_dev *d)
41 */ 41 */
42 u8 busno; 42 u8 busno;
43 pci_read_config_byte(d, 0x4a, &busno); 43 pci_read_config_byte(d, 0x4a, &busno);
44 printk(KERN_INFO "PCI: i440KX/GX host bridge %s: secondary bus %02x\n", pci_name(d), busno); 44 dev_info(&d->dev, "i440KX/GX host bridge; secondary bus %02x\n", busno);
45 pci_scan_bus_with_sysdata(busno); 45 pci_scan_bus_with_sysdata(busno);
46 pcibios_last_bus = -1; 46 pcibios_last_bus = -1;
47} 47}
@@ -55,8 +55,8 @@ static void __devinit pci_fixup_umc_ide(struct pci_dev *d)
55 */ 55 */
56 int i; 56 int i;
57 57
58 printk(KERN_WARNING "PCI: Fixing base address flags for device %s\n", pci_name(d)); 58 dev_warn(&d->dev, "Fixing base address flags\n");
59 for(i=0; i<4; i++) 59 for(i = 0; i < 4; i++)
60 d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO; 60 d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO;
61} 61}
62DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide); 62DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide);
@@ -68,7 +68,7 @@ static void __devinit pci_fixup_ncr53c810(struct pci_dev *d)
68 * Fix class to be PCI_CLASS_STORAGE_SCSI 68 * Fix class to be PCI_CLASS_STORAGE_SCSI
69 */ 69 */
70 if (!d->class) { 70 if (!d->class) {
71 printk(KERN_WARNING "PCI: fixing NCR 53C810 class code for %s\n", pci_name(d)); 71 dev_warn(&d->dev, "Fixing NCR 53C810 class code\n");
72 d->class = PCI_CLASS_STORAGE_SCSI << 8; 72 d->class = PCI_CLASS_STORAGE_SCSI << 8;
73 } 73 }
74} 74}
@@ -80,7 +80,7 @@ static void __devinit pci_fixup_latency(struct pci_dev *d)
80 * SiS 5597 and 5598 chipsets require latency timer set to 80 * SiS 5597 and 5598 chipsets require latency timer set to
81 * at most 32 to avoid lockups. 81 * at most 32 to avoid lockups.
82 */ 82 */
83 DBG("PCI: Setting max latency to 32\n"); 83 dev_dbg(&d->dev, "Setting max latency to 32\n");
84 pcibios_max_latency = 32; 84 pcibios_max_latency = 32;
85} 85}
86DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5597, pci_fixup_latency); 86DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5597, pci_fixup_latency);
@@ -127,7 +127,7 @@ static void pci_fixup_via_northbridge_bug(struct pci_dev *d)
127 NB latency to zero */ 127 NB latency to zero */
128 pci_write_config_byte(d, PCI_LATENCY_TIMER, 0); 128 pci_write_config_byte(d, PCI_LATENCY_TIMER, 0);
129 129
130 where = 0x95; /* the memory write queue timer register is 130 where = 0x95; /* the memory write queue timer register is
131 different for the KT266x's: 0x95 not 0x55 */ 131 different for the KT266x's: 0x95 not 0x55 */
132 } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 && 132 } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 &&
133 (d->revision == VIA_8363_KL133_REVISION_ID || 133 (d->revision == VIA_8363_KL133_REVISION_ID ||
@@ -138,7 +138,7 @@ static void pci_fixup_via_northbridge_bug(struct pci_dev *d)
138 138
139 pci_read_config_byte(d, where, &v); 139 pci_read_config_byte(d, where, &v);
140 if (v & ~mask) { 140 if (v & ~mask) {
141 printk(KERN_WARNING "Disabling VIA memory write queue (PCI ID %04x, rev %02x): [%02x] %02x & %02x -> %02x\n", \ 141 dev_warn(&d->dev, "Disabling VIA memory write queue (PCI ID %04x, rev %02x): [%02x] %02x & %02x -> %02x\n", \
142 d->device, d->revision, where, v, mask, v & mask); 142 d->device, d->revision, where, v, mask, v & mask);
143 v &= mask; 143 v &= mask;
144 pci_write_config_byte(d, where, v); 144 pci_write_config_byte(d, where, v);
@@ -200,7 +200,7 @@ static void pci_fixup_nforce2(struct pci_dev *dev)
200 * Apply fixup if needed, but don't touch disconnect state 200 * Apply fixup if needed, but don't touch disconnect state
201 */ 201 */
202 if ((val & 0x00FF0000) != 0x00010000) { 202 if ((val & 0x00FF0000) != 0x00010000) {
203 printk(KERN_WARNING "PCI: nForce2 C1 Halt Disconnect fixup\n"); 203 dev_warn(&dev->dev, "nForce2 C1 Halt Disconnect fixup\n");
204 pci_write_config_dword(dev, 0x6c, (val & 0xFF00FFFF) | 0x00010000); 204 pci_write_config_dword(dev, 0x6c, (val & 0xFF00FFFF) | 0x00010000);
205 } 205 }
206} 206}
@@ -230,7 +230,7 @@ static int quirk_pcie_aspm_write(struct pci_bus *bus, unsigned int devfn, int wh
230 230
231 if ((offset) && (where == offset)) 231 if ((offset) && (where == offset))
232 value = value & 0xfffffffc; 232 value = value & 0xfffffffc;
233 233
234 return raw_pci_ops->write(0, bus->number, devfn, where, size, value); 234 return raw_pci_ops->write(0, bus->number, devfn, where, size, value);
235} 235}
236 236
@@ -271,8 +271,8 @@ static void pcie_rootport_aspm_quirk(struct pci_dev *pdev)
271 * after hot-remove, the pbus->devices is empty and this code 271 * after hot-remove, the pbus->devices is empty and this code
272 * will set the offsets to zero and the bus ops to parent's bus 272 * will set the offsets to zero and the bus ops to parent's bus
273 * ops, which is unmodified. 273 * ops, which is unmodified.
274 */ 274 */
275 for (i= GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i) 275 for (i = GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i)
276 quirk_aspm_offset[i] = 0; 276 quirk_aspm_offset[i] = 0;
277 277
278 pbus->ops = pbus->parent->ops; 278 pbus->ops = pbus->parent->ops;
@@ -286,17 +286,17 @@ static void pcie_rootport_aspm_quirk(struct pci_dev *pdev)
286 list_for_each_entry(dev, &pbus->devices, bus_list) { 286 list_for_each_entry(dev, &pbus->devices, bus_list) {
287 /* There are 0 to 8 devices attached to this bus */ 287 /* There are 0 to 8 devices attached to this bus */
288 cap_base = pci_find_capability(dev, PCI_CAP_ID_EXP); 288 cap_base = pci_find_capability(dev, PCI_CAP_ID_EXP);
289 quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)]= cap_base + 0x10; 289 quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)] = cap_base + 0x10;
290 } 290 }
291 pbus->ops = &quirk_pcie_aspm_ops; 291 pbus->ops = &quirk_pcie_aspm_ops;
292 } 292 }
293} 293}
294DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk ); 294DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk);
295DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk ); 295DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk);
296DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk ); 296DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk);
297DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk ); 297DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk);
298DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk ); 298DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk);
299DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk ); 299DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk);
300 300
301/* 301/*
302 * Fixup to mark boot BIOS video selected by BIOS before it changes 302 * Fixup to mark boot BIOS video selected by BIOS before it changes
@@ -336,8 +336,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
336 * PCI header type NORMAL. 336 * PCI header type NORMAL.
337 */ 337 */
338 if (bridge 338 if (bridge
339 &&((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE) 339 && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
340 ||(bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) { 340 || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
341 pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, 341 pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
342 &config); 342 &config);
343 if (!(config & PCI_BRIDGE_CTL_VGA)) 343 if (!(config & PCI_BRIDGE_CTL_VGA))
@@ -348,7 +348,7 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
348 pci_read_config_word(pdev, PCI_COMMAND, &config); 348 pci_read_config_word(pdev, PCI_COMMAND, &config);
349 if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { 349 if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
350 pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; 350 pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
351 printk(KERN_DEBUG "Boot video device is %s\n", pci_name(pdev)); 351 dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n");
352 } 352 }
353} 353}
354DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); 354DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
@@ -388,11 +388,11 @@ static void __devinit pci_fixup_msi_k8t_onboard_sound(struct pci_dev *dev)
388 /* verify the change for status output */ 388 /* verify the change for status output */
389 pci_read_config_byte(dev, 0x50, &val); 389 pci_read_config_byte(dev, 0x50, &val);
390 if (val & 0x40) 390 if (val & 0x40)
391 printk(KERN_INFO "PCI: Detected MSI K8T Neo2-FIR, " 391 dev_info(&dev->dev, "Detected MSI K8T Neo2-FIR; "
392 "can't enable onboard soundcard!\n"); 392 "can't enable onboard soundcard!\n");
393 else 393 else
394 printk(KERN_INFO "PCI: Detected MSI K8T Neo2-FIR, " 394 dev_info(&dev->dev, "Detected MSI K8T Neo2-FIR; "
395 "enabled onboard soundcard.\n"); 395 "enabled onboard soundcard\n");
396 } 396 }
397} 397}
398DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, 398DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
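The fixup.c hunks above convert printk(KERN_*)+pci_name() logging to the dev_warn()/dev_info()/dev_dbg() helpers, which take the struct device embedded in the pci_dev and prefix each message with the device's bus ID automatically. A minimal, purely illustrative quirk in that style (the IDs, register and message below are made up for the example, not taken from this patch):

/* Illustrative only: a PCI header fixup written in the dev_* logging
 * style adopted above.  The vendor/device IDs are placeholders. */
static void __devinit pci_fixup_example(struct pci_dev *d)
{
	u8 rev;

	pci_read_config_byte(d, PCI_REVISION_ID, &rev);
	/* dev_warn() emits "pci 0000:BB:DD.F: ..." without needing pci_name() */
	dev_warn(&d->dev, "applying example fixup (rev %02x)\n", rev);
}
DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_example);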
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 42ba0e2da1a..103b9dff121 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -72,7 +72,7 @@ pcibios_align_resource(void *data, struct resource *res,
72 } 72 }
73 } 73 }
74} 74}
75 75EXPORT_SYMBOL(pcibios_align_resource);
76 76
77/* 77/*
78 * Handle resources of PCI devices. If the world were perfect, we could 78 * Handle resources of PCI devices. If the world were perfect, we could
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 88d8f5c0ecb..ed07ce6c171 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -200,6 +200,7 @@ static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
200{ 200{
201 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; 201 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
202 202
203 WARN_ON_ONCE(pirq >= 16);
203 return irqmap[read_config_nybble(router, 0x48, pirq-1)]; 204 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
204} 205}
205 206
@@ -207,7 +208,8 @@ static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
207{ 208{
208 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; 209 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
209 unsigned int val = irqmap[irq]; 210 unsigned int val = irqmap[irq];
210 211
212 WARN_ON_ONCE(pirq >= 16);
211 if (val) { 213 if (val) {
212 write_config_nybble(router, 0x48, pirq-1, val); 214 write_config_nybble(router, 0x48, pirq-1, val);
213 return 1; 215 return 1;
@@ -257,12 +259,16 @@ static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
257static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 259static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
258{ 260{
259 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; 261 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
262
263 WARN_ON_ONCE(pirq >= 5);
260 return read_config_nybble(router, 0x55, pirqmap[pirq-1]); 264 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
261} 265}
262 266
263static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 267static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
264{ 268{
265 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; 269 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
270
271 WARN_ON_ONCE(pirq >= 5);
266 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); 272 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
267 return 1; 273 return 1;
268} 274}
@@ -275,12 +281,16 @@ static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq
275static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 281static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
276{ 282{
277 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; 283 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
284
285 WARN_ON_ONCE(pirq >= 4);
278 return read_config_nybble(router,0x43, pirqmap[pirq-1]); 286 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
279} 287}
280 288
281static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 289static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
282{ 290{
283 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; 291 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
292
293 WARN_ON_ONCE(pirq >= 4);
284 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); 294 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
285 return 1; 295 return 1;
286} 296}
@@ -419,6 +429,7 @@ static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
419 429
420static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 430static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
421{ 431{
432 WARN_ON_ONCE(pirq >= 9);
422 if (pirq > 8) { 433 if (pirq > 8) {
423 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 434 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
424 return 0; 435 return 0;
@@ -428,6 +439,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
428 439
429static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 440static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
430{ 441{
442 WARN_ON_ONCE(pirq >= 9);
431 if (pirq > 8) { 443 if (pirq > 8) {
432 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 444 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
433 return 0; 445 return 0;
@@ -449,14 +461,14 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
449 */ 461 */
450static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 462static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
451{ 463{
452 outb_p(pirq, 0xc00); 464 outb(pirq, 0xc00);
453 return inb(0xc01) & 0xf; 465 return inb(0xc01) & 0xf;
454} 466}
455 467
456static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 468static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
457{ 469{
458 outb_p(pirq, 0xc00); 470 outb(pirq, 0xc00);
459 outb_p(irq, 0xc01); 471 outb(irq, 0xc01);
460 return 1; 472 return 1;
461} 473}
462 474
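The WARN_ON_ONCE() additions above sanity-check the pirq value before it is used to address the per-router mapping tables. For reference, the read_config_nybble()/write_config_nybble() helpers these routines call pack two PIRQ entries into each config-space byte; a rough sketch paraphrasing the existing irq.c helpers (not part of this diff):

/* Rough paraphrase of irq.c's nybble accessors: entry 'nr' lives in the
 * low or high nibble of config byte (offset + nr/2). */
static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
{
	u8 x;
	unsigned reg = offset + (nr >> 1);

	pci_read_config_byte(router, reg, &x);
	return (nr & 1) ? (x >> 4) : (x & 0xf);
}

static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
{
	u8 x;
	unsigned reg = offset + (nr >> 1);

	pci_read_config_byte(router, reg, &x);
	x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
	pci_write_config_byte(router, reg, x);
}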
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c
index f5f165f69e0..55270c26237 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numa.c
@@ -5,36 +5,62 @@
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/nodemask.h> 7#include <linux/nodemask.h>
8#include <mach_apic.h>
8#include "pci.h" 9#include "pci.h"
9 10
11#define XQUAD_PORTIO_BASE 0xfe400000
12#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
13
10#define BUS2QUAD(global) (mp_bus_id_to_node[global]) 14#define BUS2QUAD(global) (mp_bus_id_to_node[global])
11#define BUS2LOCAL(global) (mp_bus_id_to_local[global]) 15#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
12#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) 16#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
13 17
18extern void *xquad_portio; /* Where the IO area was mapped */
19#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
20
14#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ 21#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
15 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) 22 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
16 23
24static void write_cf8(unsigned bus, unsigned devfn, unsigned reg)
25{
26 unsigned val = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg);
27 if (xquad_portio)
28 writel(val, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus)));
29 else
30 outl(val, 0xCF8);
31}
32
17static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, 33static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
18 unsigned int devfn, int reg, int len, u32 *value) 34 unsigned int devfn, int reg, int len, u32 *value)
19{ 35{
20 unsigned long flags; 36 unsigned long flags;
37 void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
21 38
22 if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 39 if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
23 return -EINVAL; 40 return -EINVAL;
24 41
25 spin_lock_irqsave(&pci_config_lock, flags); 42 spin_lock_irqsave(&pci_config_lock, flags);
26 43
27 outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus)); 44 write_cf8(bus, devfn, reg);
28 45
29 switch (len) { 46 switch (len) {
30 case 1: 47 case 1:
31 *value = inb_quad(0xCFC + (reg & 3), BUS2QUAD(bus)); 48 if (xquad_portio)
49 *value = readb(adr + (reg & 3));
50 else
51 *value = inb(0xCFC + (reg & 3));
32 break; 52 break;
33 case 2: 53 case 2:
34 *value = inw_quad(0xCFC + (reg & 2), BUS2QUAD(bus)); 54 if (xquad_portio)
55 *value = readw(adr + (reg & 2));
56 else
57 *value = inw(0xCFC + (reg & 2));
35 break; 58 break;
36 case 4: 59 case 4:
37 *value = inl_quad(0xCFC, BUS2QUAD(bus)); 60 if (xquad_portio)
61 *value = readl(adr);
62 else
63 *value = inl(0xCFC);
38 break; 64 break;
39 } 65 }
40 66
@@ -47,23 +73,33 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
47 unsigned int devfn, int reg, int len, u32 value) 73 unsigned int devfn, int reg, int len, u32 value)
48{ 74{
49 unsigned long flags; 75 unsigned long flags;
76 void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
50 77
51 if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 78 if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
52 return -EINVAL; 79 return -EINVAL;
53 80
54 spin_lock_irqsave(&pci_config_lock, flags); 81 spin_lock_irqsave(&pci_config_lock, flags);
55 82
56 outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus)); 83 write_cf8(bus, devfn, reg);
57 84
58 switch (len) { 85 switch (len) {
59 case 1: 86 case 1:
60 outb_quad((u8)value, 0xCFC + (reg & 3), BUS2QUAD(bus)); 87 if (xquad_portio)
88 writeb(value, adr + (reg & 3));
89 else
90 outb((u8)value, 0xCFC + (reg & 3));
61 break; 91 break;
62 case 2: 92 case 2:
63 outw_quad((u16)value, 0xCFC + (reg & 2), BUS2QUAD(bus)); 93 if (xquad_portio)
94 writew(value, adr + (reg & 2));
95 else
96 outw((u16)value, 0xCFC + (reg & 2));
64 break; 97 break;
65 case 4: 98 case 4:
66 outl_quad((u32)value, 0xCFC, BUS2QUAD(bus)); 99 if (xquad_portio)
100 writel(value, adr + reg);
101 else
102 outl((u32)value, 0xCFC);
67 break; 103 break;
68 } 104 }
69 105
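The numa.c rewrite above replaces the old *_quad() accessors with an explicit choice between the mapped per-quad I/O window and ordinary port I/O. The selection logic repeated in each switch arm boils down to the pattern below, a condensed restatement of this hunk for one access width (locking and bounds checks omitted; all identifiers are the ones introduced above):

/* If the per-quad I/O window has been mapped (xquad_portio), the quad's
 * 0xCFC data port is reached through MMIO at XQUAD_PORT_ADDR(); otherwise
 * fall back to the legacy CF8/CFC port pair. */
static u32 quad_conf_read32(unsigned int bus, unsigned int devfn, int reg)
{
	write_cf8(bus, devfn, reg);		/* select the config address */

	if (xquad_portio)
		return readl(XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)));
	return inl(0xCFC);
}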
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 998fd3ec0d6..efcf620d143 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -19,7 +19,7 @@ unsigned long saved_context_esp, saved_context_ebp;
19unsigned long saved_context_esi, saved_context_edi; 19unsigned long saved_context_esi, saved_context_edi;
20unsigned long saved_context_eflags; 20unsigned long saved_context_eflags;
21 21
22void __save_processor_state(struct saved_context *ctxt) 22static void __save_processor_state(struct saved_context *ctxt)
23{ 23{
24 mtrr_save_fixed_ranges(NULL); 24 mtrr_save_fixed_ranges(NULL);
25 kernel_fpu_begin(); 25 kernel_fpu_begin();
@@ -74,19 +74,19 @@ static void fix_processor_context(void)
74 /* 74 /*
75 * Now maybe reload the debug registers 75 * Now maybe reload the debug registers
76 */ 76 */
77 if (current->thread.debugreg[7]){ 77 if (current->thread.debugreg7) {
78 set_debugreg(current->thread.debugreg[0], 0); 78 set_debugreg(current->thread.debugreg0, 0);
79 set_debugreg(current->thread.debugreg[1], 1); 79 set_debugreg(current->thread.debugreg1, 1);
80 set_debugreg(current->thread.debugreg[2], 2); 80 set_debugreg(current->thread.debugreg2, 2);
81 set_debugreg(current->thread.debugreg[3], 3); 81 set_debugreg(current->thread.debugreg3, 3);
82 /* no 4 and 5 */ 82 /* no 4 and 5 */
83 set_debugreg(current->thread.debugreg[6], 6); 83 set_debugreg(current->thread.debugreg6, 6);
84 set_debugreg(current->thread.debugreg[7], 7); 84 set_debugreg(current->thread.debugreg7, 7);
85 } 85 }
86 86
87} 87}
88 88
89void __restore_processor_state(struct saved_context *ctxt) 89static void __restore_processor_state(struct saved_context *ctxt)
90{ 90{
91 /* 91 /*
92 * control registers 92 * control registers
diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore
index f8b69d84238..60274d5746e 100644
--- a/arch/x86/vdso/.gitignore
+++ b/arch/x86/vdso/.gitignore
@@ -1 +1,6 @@
1vdso.lds 1vdso.lds
2vdso-syms.lds
3vdso32-syms.lds
4vdso32-syscall-syms.lds
5vdso32-sysenter-syms.lds
6vdso32-int80-syms.lds
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index e7bff0fbac2..d28dda57470 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -1,39 +1,37 @@
1# 1#
2# x86-64 vDSO. 2# Building vDSO images for x86.
3# 3#
4 4
5VDSO64-$(CONFIG_X86_64) := y
6VDSO32-$(CONFIG_X86_32) := y
7VDSO32-$(CONFIG_COMPAT) := y
8
9vdso-install-$(VDSO64-y) += vdso.so
10vdso-install-$(VDSO32-y) += $(vdso32-y:=.so)
11
12
5# files to link into the vdso 13# files to link into the vdso
6# vdso-start.o has to be first 14vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
7vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
8 15
9# files to link into kernel 16# files to link into kernel
10obj-y := vma.o vdso.o vdso-syms.o 17obj-$(VDSO64-y) += vma.o vdso.o
18obj-$(VDSO32-y) += vdso32.o vdso32-setup.o
11 19
12vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) 20vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
13 21
14$(obj)/vdso.o: $(obj)/vdso.so 22$(obj)/vdso.o: $(obj)/vdso.so
15 23
16targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) vdso-syms.o 24targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
17
18# The DSO images are built using a special linker script.
19quiet_cmd_syscall = SYSCALL $@
20 cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \
21 -Wl,-T,$(filter-out FORCE,$^) -o $@
22 25
23export CPPFLAGS_vdso.lds += -P -C 26export CPPFLAGS_vdso.lds += -P -C
24 27
25vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \ 28VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \
26 $(call ld-option, -Wl$(comma)--hash-style=sysv) \ 29 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
27 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
28SYSCFLAGS_vdso.so = $(vdso-flags)
29SYSCFLAGS_vdso.so.dbg = $(vdso-flags)
30 30
31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so 31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
32 32
33$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE
34
35$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE 33$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
36 $(call if_changed,syscall) 34 $(call if_changed,vdso)
37 35
38$(obj)/%.so: OBJCOPYFLAGS := -S 36$(obj)/%.so: OBJCOPYFLAGS := -S
39$(obj)/%.so: $(obj)/%.so.dbg FORCE 37$(obj)/%.so: $(obj)/%.so.dbg FORCE
@@ -41,24 +39,96 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
41 39
42CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64 40CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64
43 41
44$(obj)/vclock_gettime.o: KBUILD_CFLAGS = $(CFL) 42$(vobjs): KBUILD_CFLAGS = $(CFL)
45$(obj)/vgetcpu.o: KBUILD_CFLAGS = $(CFL) 43
44targets += vdso-syms.lds
45obj-$(VDSO64-y) += vdso-syms.lds
46
47#
48# Match symbols in the DSO that look like VDSO*; produce a file of constants.
49#
50sed-vdsosym := -e 's/^00*/0/' \
51 -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p'
52quiet_cmd_vdsosym = VDSOSYM $@
53 cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
54
55$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
56 $(call if_changed,vdsosym)
57
58#
59# Build multiple 32-bit vDSO images to choose from at boot time.
60#
61obj-$(VDSO32-y) += vdso32-syms.lds
62vdso32.so-$(CONFIG_X86_32) += int80
63vdso32.so-$(CONFIG_COMPAT) += syscall
64vdso32.so-$(VDSO32-y) += sysenter
65
66CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
67VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1
68
69# This makes sure the $(obj) subdirectory exists even though vdso32/
70# is not a kbuild sub-make subdirectory.
71override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
46 72
47# We also create a special relocatable object that should mirror the symbol 73targets += vdso32/vdso32.lds
48# table and layout of the linked DSO. With ld -R we can then refer to 74targets += $(vdso32.so-y:%=vdso32-%.so.dbg) $(vdso32.so-y:%=vdso32-%.so)
49# these symbols in the kernel code rather than hand-coded addresses. 75targets += vdso32/note.o $(vdso32.so-y:%=vdso32/%.o)
50extra-y += vdso-syms.o
51$(obj)/built-in.o: $(obj)/vdso-syms.o
52$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
53 76
54SYSCFLAGS_vdso-syms.o = -r -d 77extra-y += $(vdso32.so-y:%=vdso32-%.so)
55$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE
56 $(call if_changed,syscall)
57 78
79$(obj)/vdso32.o: $(vdso32.so-y:%=$(obj)/vdso32-%.so)
80
81KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
82$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
83$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): asflags-$(CONFIG_X86_64) += -m32
84
85$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
86 $(obj)/vdso32/vdso32.lds \
87 $(obj)/vdso32/note.o \
88 $(obj)/vdso32/%.o
89 $(call if_changed,vdso)
90
91# Make vdso32-*-syms.lds from each image, and then make sure they match.
92# The only difference should be that some do not define VDSO32_SYSENTER_RETURN.
93
94targets += vdso32-syms.lds $(vdso32.so-y:%=vdso32-%-syms.lds)
95
96quiet_cmd_vdso32sym = VDSOSYM $@
97define cmd_vdso32sym
98 if LC_ALL=C sort -u $(filter-out FORCE,$^) > $(@D)/.tmp_$(@F) && \
99 $(foreach H,$(filter-out FORCE,$^),\
100 if grep -q VDSO32_SYSENTER_RETURN $H; \
101 then diff -u $(@D)/.tmp_$(@F) $H; \
102 else sed /VDSO32_SYSENTER_RETURN/d $(@D)/.tmp_$(@F) | \
103 diff -u - $H; fi &&) : ;\
104 then mv -f $(@D)/.tmp_$(@F) $@; \
105 else rm -f $(@D)/.tmp_$(@F); exit 1; \
106 fi
107endef
108
109$(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
110 $(call if_changed,vdso32sym)
111
112#
113# The DSO images are built using a special linker script.
114#
115quiet_cmd_vdso = VDSO $@
116 cmd_vdso = $(CC) -nostdlib -o $@ \
117 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
118 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
119
120VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv)
121
122#
123# Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
124#
58quiet_cmd_vdso_install = INSTALL $@ 125quiet_cmd_vdso_install = INSTALL $@
59 cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ 126 cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
60vdso.so: 127$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE
61 @mkdir -p $(MODLIB)/vdso 128 @mkdir -p $(MODLIB)/vdso
62 $(call cmd,vdso_install) 129 $(call cmd,vdso_install)
63 130
64vdso_install: vdso.so 131PHONY += vdso_install $(vdso-install-y)
132vdso_install: $(vdso-install-y)
133
134clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 5b54cdfb2b0..23476c2ebfc 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -19,7 +19,6 @@
19#include <asm/hpet.h> 19#include <asm/hpet.h>
20#include <asm/unistd.h> 20#include <asm/unistd.h>
21#include <asm/io.h> 21#include <asm/io.h>
22#include <asm/vgtod.h>
23#include "vextern.h" 22#include "vextern.h"
24 23
25#define gtod vdso_vsyscall_gtod_data 24#define gtod vdso_vsyscall_gtod_data
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
new file mode 100644
index 00000000000..634a2cf6204
--- /dev/null
+++ b/arch/x86/vdso/vdso-layout.lds.S
@@ -0,0 +1,64 @@
1/*
2 * Linker script for vDSO. This is an ELF shared object prelinked to
3 * its virtual address, and with only one read-only segment.
4 * This script controls its layout.
5 */
6
7SECTIONS
8{
9 . = VDSO_PRELINK + SIZEOF_HEADERS;
10
11 .hash : { *(.hash) } :text
12 .gnu.hash : { *(.gnu.hash) }
13 .dynsym : { *(.dynsym) }
14 .dynstr : { *(.dynstr) }
15 .gnu.version : { *(.gnu.version) }
16 .gnu.version_d : { *(.gnu.version_d) }
17 .gnu.version_r : { *(.gnu.version_r) }
18
19 .note : { *(.note.*) } :text :note
20
21 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
22 .eh_frame : { KEEP (*(.eh_frame)) } :text
23
24 .dynamic : { *(.dynamic) } :text :dynamic
25
26 .rodata : { *(.rodata*) } :text
27 .data : {
28 *(.data*)
29 *(.sdata*)
30 *(.got.plt) *(.got)
31 *(.gnu.linkonce.d.*)
32 *(.bss*)
33 *(.dynbss*)
34 *(.gnu.linkonce.b.*)
35 }
36
37 .altinstructions : { *(.altinstructions) }
38 .altinstr_replacement : { *(.altinstr_replacement) }
39
40 /*
41 * Align the actual code well away from the non-instruction data.
42 * This is the best thing for the I-cache.
43 */
44 . = ALIGN(0x100);
45
46 .text : { *(.text*) } :text =0x90909090
47}
48
49/*
50 * Very old versions of ld do not recognize this name token; use the constant.
51 */
52#define PT_GNU_EH_FRAME 0x6474e550
53
54/*
55 * We must supply the ELF program headers explicitly to get just one
56 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
57 */
58PHDRS
59{
60 text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
61 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
62 note PT_NOTE FLAGS(4); /* PF_R */
63 eh_frame_hdr PT_GNU_EH_FRAME;
64}
diff --git a/arch/x86/vdso/vdso-start.S b/arch/x86/vdso/vdso-start.S
deleted file mode 100644
index 2dc2cdb84d6..00000000000
--- a/arch/x86/vdso/vdso-start.S
+++ /dev/null
@@ -1,2 +0,0 @@
1 .globl vdso_kernel_start
2vdso_kernel_start:
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 667d3245d97..4e5dd3b4de7 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -1,79 +1,37 @@
1/* 1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared 2 * Linker script for 64-bit vDSO.
3 * object prelinked to its virtual address, and with only one read-only 3 * We #include the file to define the layout details.
4 * segment (that fits in one page). This script controls its layout. 4 * Here we only choose the prelinked virtual address.
5 *
6 * This file defines the version script giving the user-exported symbols in
7 * the DSO. We can define local symbols here called VDSO* to make their
8 * values visible using the asm-x86/vdso.h macros from the kernel proper.
5 */ 9 */
6#include <asm/asm-offsets.h>
7#include "voffset.h"
8 10
9#define VDSO_PRELINK 0xffffffffff700000 11#define VDSO_PRELINK 0xffffffffff700000
10 12#include "vdso-layout.lds.S"
11SECTIONS
12{
13 . = VDSO_PRELINK + SIZEOF_HEADERS;
14
15 .hash : { *(.hash) } :text
16 .gnu.hash : { *(.gnu.hash) }
17 .dynsym : { *(.dynsym) }
18 .dynstr : { *(.dynstr) }
19 .gnu.version : { *(.gnu.version) }
20 .gnu.version_d : { *(.gnu.version_d) }
21 .gnu.version_r : { *(.gnu.version_r) }
22
23 /* This linker script is used both with -r and with -shared.
24 For the layouts to match, we need to skip more than enough
25 space for the dynamic symbol table et al. If this amount
26 is insufficient, ld -shared will barf. Just increase it here. */
27 . = VDSO_PRELINK + VDSO_TEXT_OFFSET;
28
29 .text : { *(.text*) } :text
30 .rodata : { *(.rodata*) } :text
31 .data : {
32 *(.data*)
33 *(.sdata*)
34 *(.bss*)
35 *(.dynbss*)
36 } :text
37
38 .altinstructions : { *(.altinstructions) } :text
39 .altinstr_replacement : { *(.altinstr_replacement) } :text
40
41 .note : { *(.note.*) } :text :note
42 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
43 .eh_frame : { KEEP (*(.eh_frame)) } :text
44 .dynamic : { *(.dynamic) } :text :dynamic
45 .useless : {
46 *(.got.plt) *(.got)
47 *(.gnu.linkonce.d.*)
48 *(.gnu.linkonce.b.*)
49 } :text
50}
51 13
52/* 14/*
53 * We must supply the ELF program headers explicitly to get just one 15 * This controls what userland symbols we export from the vDSO.
54 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
55 */ 16 */
56PHDRS 17VERSION {
57{ 18 LINUX_2.6 {
58 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ 19 global:
59 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ 20 clock_gettime;
60 note PT_NOTE FLAGS(4); /* PF_R */ 21 __vdso_clock_gettime;
61 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ 22 gettimeofday;
23 __vdso_gettimeofday;
24 getcpu;
25 __vdso_getcpu;
26 local: *;
27 };
62} 28}
63 29
30VDSO64_PRELINK = VDSO_PRELINK;
31
64/* 32/*
65 * This controls what symbols we export from the DSO. 33 * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
66 */ 34 */
67VERSION 35#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x;
68{ 36#include "vextern.h"
69 LINUX_2.6 { 37#undef VEXTERN
70 global:
71 clock_gettime;
72 __vdso_clock_gettime;
73 gettimeofday;
74 __vdso_gettimeofday;
75 getcpu;
76 __vdso_getcpu;
77 local: *;
78 };
79}
diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/vdso/vdso32-setup.c
index 5a2d951e260..348f1341e1c 100644
--- a/arch/x86/kernel/sysenter_32.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -23,6 +23,8 @@
23#include <asm/unistd.h> 23#include <asm/unistd.h>
24#include <asm/elf.h> 24#include <asm/elf.h>
25#include <asm/tlbflush.h> 25#include <asm/tlbflush.h>
26#include <asm/vdso.h>
27#include <asm/proto.h>
26 28
27enum { 29enum {
28 VDSO_DISABLED = 0, 30 VDSO_DISABLED = 0,
@@ -36,14 +38,24 @@ enum {
36#define VDSO_DEFAULT VDSO_ENABLED 38#define VDSO_DEFAULT VDSO_ENABLED
37#endif 39#endif
38 40
41#ifdef CONFIG_X86_64
42#define vdso_enabled sysctl_vsyscall32
43#define arch_setup_additional_pages syscall32_setup_pages
44#endif
45
46/*
47 * This is the difference between the prelinked addresses in the vDSO images
48 * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
49 * in the user address space.
50 */
51#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
52
39/* 53/*
40 * Should the kernel map a VDSO page into processes and pass its 54 * Should the kernel map a VDSO page into processes and pass its
41 * address down to glibc upon exec()? 55 * address down to glibc upon exec()?
42 */ 56 */
43unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; 57unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
44 58
45EXPORT_SYMBOL_GPL(vdso_enabled);
46
47static int __init vdso_setup(char *s) 59static int __init vdso_setup(char *s)
48{ 60{
49 vdso_enabled = simple_strtoul(s, NULL, 0); 61 vdso_enabled = simple_strtoul(s, NULL, 0);
@@ -51,9 +63,18 @@ static int __init vdso_setup(char *s)
51 return 1; 63 return 1;
52} 64}
53 65
54__setup("vdso=", vdso_setup); 66/*
67 * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
68 * behavior on both 64-bit and 32-bit kernels.
69 * On 32-bit kernels, vdso=[012] means the same thing.
70 */
71__setup("vdso32=", vdso_setup);
72
73#ifdef CONFIG_X86_32
74__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
55 75
56extern asmlinkage void sysenter_entry(void); 76EXPORT_SYMBOL_GPL(vdso_enabled);
77#endif
57 78
58static __init void reloc_symtab(Elf32_Ehdr *ehdr, 79static __init void reloc_symtab(Elf32_Ehdr *ehdr,
59 unsigned offset, unsigned size) 80 unsigned offset, unsigned size)
@@ -78,7 +99,7 @@ static __init void reloc_symtab(Elf32_Ehdr *ehdr,
78 case STT_FUNC: 99 case STT_FUNC:
79 case STT_SECTION: 100 case STT_SECTION:
80 case STT_FILE: 101 case STT_FILE:
81 sym->st_value += VDSO_HIGH_BASE; 102 sym->st_value += VDSO_ADDR_ADJUST;
82 } 103 }
83 } 104 }
84} 105}
@@ -104,7 +125,7 @@ static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
104 case DT_VERNEED: 125 case DT_VERNEED:
105 case DT_ADDRRNGLO ... DT_ADDRRNGHI: 126 case DT_ADDRRNGLO ... DT_ADDRRNGHI:
106 /* definitely pointers needing relocation */ 127 /* definitely pointers needing relocation */
107 dyn->d_un.d_ptr += VDSO_HIGH_BASE; 128 dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
108 break; 129 break;
109 130
110 case DT_ENCODING ... OLD_DT_LOOS-1: 131 case DT_ENCODING ... OLD_DT_LOOS-1:
@@ -113,7 +134,7 @@ static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
113 they're even */ 134 they're even */
114 if (dyn->d_tag >= DT_ENCODING && 135 if (dyn->d_tag >= DT_ENCODING &&
115 (dyn->d_tag & 1) == 0) 136 (dyn->d_tag & 1) == 0)
116 dyn->d_un.d_ptr += VDSO_HIGH_BASE; 137 dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
117 break; 138 break;
118 139
119 case DT_VERDEFNUM: 140 case DT_VERDEFNUM:
@@ -142,15 +163,15 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
142 int i; 163 int i;
143 164
144 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || 165 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
145 !elf_check_arch(ehdr) || 166 !elf_check_arch_ia32(ehdr) ||
146 ehdr->e_type != ET_DYN); 167 ehdr->e_type != ET_DYN);
147 168
148 ehdr->e_entry += VDSO_HIGH_BASE; 169 ehdr->e_entry += VDSO_ADDR_ADJUST;
149 170
150 /* rebase phdrs */ 171 /* rebase phdrs */
151 phdr = (void *)ehdr + ehdr->e_phoff; 172 phdr = (void *)ehdr + ehdr->e_phoff;
152 for (i = 0; i < ehdr->e_phnum; i++) { 173 for (i = 0; i < ehdr->e_phnum; i++) {
153 phdr[i].p_vaddr += VDSO_HIGH_BASE; 174 phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
154 175
155 /* relocate dynamic stuff */ 176 /* relocate dynamic stuff */
156 if (phdr[i].p_type == PT_DYNAMIC) 177 if (phdr[i].p_type == PT_DYNAMIC)
@@ -163,7 +184,7 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
163 if (!(shdr[i].sh_flags & SHF_ALLOC)) 184 if (!(shdr[i].sh_flags & SHF_ALLOC))
164 continue; 185 continue;
165 186
166 shdr[i].sh_addr += VDSO_HIGH_BASE; 187 shdr[i].sh_addr += VDSO_ADDR_ADJUST;
167 188
168 if (shdr[i].sh_type == SHT_SYMTAB || 189 if (shdr[i].sh_type == SHT_SYMTAB ||
169 shdr[i].sh_type == SHT_DYNSYM) 190 shdr[i].sh_type == SHT_DYNSYM)
@@ -172,6 +193,45 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
172 } 193 }
173} 194}
174 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1];
203
204#ifdef CONFIG_X86_64
205
206static int use_sysenter __read_mostly = -1;
207
208#define vdso32_sysenter() (use_sysenter > 0)
209
210/* May not be __init: called during resume */
211void syscall32_cpu_init(void)
212{
213 if (use_sysenter < 0)
214 use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
215
216 /* Load these always in case some future AMD CPU supports
217 SYSENTER from compat mode too. */
218 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
219 checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
220 checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
221
222 wrmsrl(MSR_CSTAR, ia32_cstar_target);
223}
224
225#define compat_uses_vma 1
226
227static inline void map_compat_vdso(int map)
228{
229}
230
231#else /* CONFIG_X86_32 */
232
233#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
234
175void enable_sep_cpu(void) 235void enable_sep_cpu(void)
176{ 236{
177 int cpu = get_cpu(); 237 int cpu = get_cpu();
@@ -183,10 +243,10 @@ void enable_sep_cpu(void)
183 } 243 }
184 244
185 tss->x86_tss.ss1 = __KERNEL_CS; 245 tss->x86_tss.ss1 = __KERNEL_CS;
186 tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss; 246 tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
187 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 247 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
188 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0); 248 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
189 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); 249 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
190 put_cpu(); 250 put_cpu();
191} 251}
192 252
@@ -209,13 +269,7 @@ static int __init gate_vma_init(void)
209 return 0; 269 return 0;
210} 270}
211 271
212/* 272#define compat_uses_vma 0
213 * These symbols are defined by vsyscall.o to mark the bounds
214 * of the ELF DSO images included therein.
215 */
216extern const char vsyscall_int80_start, vsyscall_int80_end;
217extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
218static struct page *syscall_pages[1];
219 273
220static void map_compat_vdso(int map) 274static void map_compat_vdso(int map)
221{ 275{
@@ -226,31 +280,35 @@ static void map_compat_vdso(int map)
226 280
227 vdso_mapped = map; 281 vdso_mapped = map;
228 282
229 __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT, 283 __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
230 map ? PAGE_READONLY_EXEC : PAGE_NONE); 284 map ? PAGE_READONLY_EXEC : PAGE_NONE);
231 285
232 /* flush stray tlbs */ 286 /* flush stray tlbs */
233 flush_tlb_all(); 287 flush_tlb_all();
234} 288}
235 289
290#endif /* CONFIG_X86_64 */
291
236int __init sysenter_setup(void) 292int __init sysenter_setup(void)
237{ 293{
238 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); 294 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
239 const void *vsyscall; 295 const void *vsyscall;
240 size_t vsyscall_len; 296 size_t vsyscall_len;
241 297
242 syscall_pages[0] = virt_to_page(syscall_page); 298 vdso32_pages[0] = virt_to_page(syscall_page);
243 299
300#ifdef CONFIG_X86_32
244 gate_vma_init(); 301 gate_vma_init();
245 302
246 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); 303 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
304#endif
247 305
248 if (!boot_cpu_has(X86_FEATURE_SEP)) { 306 if (!vdso32_sysenter()) {
249 vsyscall = &vsyscall_int80_start; 307 vsyscall = &vdso32_default_start;
250 vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start; 308 vsyscall_len = &vdso32_default_end - &vdso32_default_start;
251 } else { 309 } else {
252 vsyscall = &vsyscall_sysenter_start; 310 vsyscall = &vdso32_sysenter_start;
253 vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start; 311 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
254 } 312 }
255 313
256 memcpy(syscall_page, vsyscall, vsyscall_len); 314 memcpy(syscall_page, vsyscall, vsyscall_len);
@@ -259,9 +317,6 @@ int __init sysenter_setup(void)
259 return 0; 317 return 0;
260} 318}
261 319
262/* Defined in vsyscall-sysenter.S */
263extern void SYSENTER_RETURN;
264
265/* Setup a VMA at program startup for the vsyscall page */ 320/* Setup a VMA at program startup for the vsyscall page */
266int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) 321int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
267{ 322{
@@ -286,7 +341,9 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
286 ret = addr; 341 ret = addr;
287 goto up_fail; 342 goto up_fail;
288 } 343 }
344 }
289 345
346 if (compat_uses_vma || !compat) {
290 /* 347 /*
291 * MAYWRITE to allow gdb to COW and set breakpoints 348 * MAYWRITE to allow gdb to COW and set breakpoints
292 * 349 *
@@ -300,7 +357,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
300 VM_READ|VM_EXEC| 357 VM_READ|VM_EXEC|
301 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 358 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
302 VM_ALWAYSDUMP, 359 VM_ALWAYSDUMP,
303 syscall_pages); 360 vdso32_pages);
304 361
305 if (ret) 362 if (ret)
306 goto up_fail; 363 goto up_fail;
@@ -308,7 +365,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
308 365
309 current->mm->context.vdso = (void *)addr; 366 current->mm->context.vdso = (void *)addr;
310 current_thread_info()->sysenter_return = 367 current_thread_info()->sysenter_return =
311 (void *)VDSO_SYM(&SYSENTER_RETURN); 368 VDSO32_SYMBOL(addr, SYSENTER_RETURN);
312 369
313 up_fail: 370 up_fail:
314 up_write(&mm->mmap_sem); 371 up_write(&mm->mmap_sem);
@@ -316,6 +373,45 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
316 return ret; 373 return ret;
317} 374}
318 375
376#ifdef CONFIG_X86_64
377
378__initcall(sysenter_setup);
379
380#ifdef CONFIG_SYSCTL
381/* Register vsyscall32 into the ABI table */
382#include <linux/sysctl.h>
383
384static ctl_table abi_table2[] = {
385 {
386 .procname = "vsyscall32",
387 .data = &sysctl_vsyscall32,
388 .maxlen = sizeof(int),
389 .mode = 0644,
390 .proc_handler = proc_dointvec
391 },
392 {}
393};
394
395static ctl_table abi_root_table2[] = {
396 {
397 .ctl_name = CTL_ABI,
398 .procname = "abi",
399 .mode = 0555,
400 .child = abi_table2
401 },
402 {}
403};
404
405static __init int ia32_binfmt_init(void)
406{
407 register_sysctl_table(abi_root_table2);
408 return 0;
409}
410__initcall(ia32_binfmt_init);
411#endif
412
413#else /* CONFIG_X86_32 */
414
319const char *arch_vma_name(struct vm_area_struct *vma) 415const char *arch_vma_name(struct vm_area_struct *vma)
320{ 416{
321 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) 417 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
@@ -344,3 +440,5 @@ int in_gate_area_no_task(unsigned long addr)
344{ 440{
345 return 0; 441 return 0;
346} 442}
443
444#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
new file mode 100644
index 00000000000..1e36f72cab8
--- /dev/null
+++ b/arch/x86/vdso/vdso32.S
@@ -0,0 +1,19 @@
1#include <linux/init.h>
2
3__INITDATA
4
5 .globl vdso32_default_start, vdso32_default_end
6vdso32_default_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else
10 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif
12vdso32_default_end:
13
14 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start:
16 .incbin "arch/x86/vdso/vdso32-sysenter.so"
17vdso32_sysenter_end:
18
19__FINIT
diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/vdso/vdso32/.gitignore
new file mode 100644
index 00000000000..e45fba9d0ce
--- /dev/null
+++ b/arch/x86/vdso/vdso32/.gitignore
@@ -0,0 +1 @@
1vdso32.lds
diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/vdso/vdso32/int80.S
index 103cab6aa7c..b15b7c01aed 100644
--- a/arch/x86/kernel/vsyscall-int80_32.S
+++ b/arch/x86/vdso/vdso32/int80.S
@@ -1,15 +1,15 @@
1/* 1/*
2 * Code for the vsyscall page. This version uses the old int $0x80 method. 2 * Code for the vDSO. This version uses the old int $0x80 method.
3 * 3 *
4 * NOTE: 4 * First get the common code for the sigreturn entry points.
5 * 1) __kernel_vsyscall _must_ be first in this page. 5 * This must come first.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */ 6 */
7#include "sigreturn.S"
9 8
10 .text 9 .text
11 .globl __kernel_vsyscall 10 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function 11 .type __kernel_vsyscall,@function
12 ALIGN
13__kernel_vsyscall: 13__kernel_vsyscall:
14.LSTART_vsyscall: 14.LSTART_vsyscall:
15 int $0x80 15 int $0x80
@@ -47,7 +47,10 @@ __kernel_vsyscall:
47.LENDFDEDLSI: 47.LENDFDEDLSI:
48 .previous 48 .previous
49 49
50/* 50 /*
51 * Get the common code for the sigreturn entry points. 51 * Pad out the segment to match the size of the sysenter.S version.
52 */ 52 */
53#include "vsyscall-sigreturn_32.S" 53VDSO32_vsyscall_eh_frame_size = 0x40
54 .section .data,"aw",@progbits
55 .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0
56 .previous
diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/vdso/vdso32/note.S
index fcf376a37f7..c83f2573469 100644
--- a/arch/x86/kernel/vsyscall-note_32.S
+++ b/arch/x86/vdso/vdso32/note.S
@@ -33,12 +33,11 @@ ELFNOTE_END
33 * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. 33 * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
34 */ 34 */
35 35
36#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ 36#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
37 37
38 .globl VDSO_NOTE_MASK
39ELFNOTE_START(GNU, 2, "a") 38ELFNOTE_START(GNU, 2, "a")
40 .long 1 /* ncaps */ 39 .long 1 /* ncaps */
41VDSO_NOTE_MASK: 40VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */
42 .long 0 /* mask */ 41 .long 0 /* mask */
43 .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ 42 .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
44ELFNOTE_END 43ELFNOTE_END
diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/vdso/vdso32/sigreturn.S
index a92262f4165..31776d0efc8 100644
--- a/arch/x86/kernel/vsyscall-sigreturn_32.S
+++ b/arch/x86/vdso/vdso32/sigreturn.S
@@ -1,41 +1,42 @@
1/* 1/*
2 * Common code for the sigreturn entry points on the vsyscall page. 2 * Common code for the sigreturn entry points in vDSO images.
3 * So far this code is the same for both int80 and sysenter versions. 3 * So far this code is the same for both int80 and sysenter versions.
4 * This file is #include'd by vsyscall-*.S to define them after the 4 * This file is #include'd by int80.S et al to define them first thing.
5 * vsyscall entry point. The kernel assumes that the addresses of these 5 * The kernel assumes that the addresses of these routines are constant
6 * routines are constant for all vsyscall implementations. 6 * for all vDSO implementations.
7 */ 7 */
8 8
9#include <asm/unistd.h> 9#include <linux/linkage.h>
10#include <asm/unistd_32.h>
10#include <asm/asm-offsets.h> 11#include <asm/asm-offsets.h>
11 12
12 13#ifndef SYSCALL_ENTER_KERNEL
13/* XXX 14#define SYSCALL_ENTER_KERNEL int $0x80
14 Should these be named "_sigtramp" or something? 15#endif
15*/
16 16
17 .text 17 .text
18 .org __kernel_vsyscall+32,0x90
19 .globl __kernel_sigreturn 18 .globl __kernel_sigreturn
20 .type __kernel_sigreturn,@function 19 .type __kernel_sigreturn,@function
20 ALIGN
21__kernel_sigreturn: 21__kernel_sigreturn:
22.LSTART_sigreturn: 22.LSTART_sigreturn:
23 popl %eax /* XXX does this mean it needs unwind info? */ 23 popl %eax /* XXX does this mean it needs unwind info? */
24 movl $__NR_sigreturn, %eax 24 movl $__NR_sigreturn, %eax
25 int $0x80 25 SYSCALL_ENTER_KERNEL
26.LEND_sigreturn: 26.LEND_sigreturn:
27 nop
27 .size __kernel_sigreturn,.-.LSTART_sigreturn 28 .size __kernel_sigreturn,.-.LSTART_sigreturn
28 29
29 .balign 32
30 .globl __kernel_rt_sigreturn 30 .globl __kernel_rt_sigreturn
31 .type __kernel_rt_sigreturn,@function 31 .type __kernel_rt_sigreturn,@function
32 ALIGN
32__kernel_rt_sigreturn: 33__kernel_rt_sigreturn:
33.LSTART_rt_sigreturn: 34.LSTART_rt_sigreturn:
34 movl $__NR_rt_sigreturn, %eax 35 movl $__NR_rt_sigreturn, %eax
35 int $0x80 36 SYSCALL_ENTER_KERNEL
36.LEND_rt_sigreturn: 37.LEND_rt_sigreturn:
38 nop
37 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn 39 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
38 .balign 32
39 .previous 40 .previous
40 41
41 .section .eh_frame,"a",@progbits 42 .section .eh_frame,"a",@progbits
@@ -70,9 +71,9 @@ __kernel_rt_sigreturn:
70 be the value of the stack pointer in the caller. This means 71 be the value of the stack pointer in the caller. This means
71 that we must define the CFA of this body of code to be the 72 that we must define the CFA of this body of code to be the
72 saved value of the stack pointer in the sigcontext. Which 73 saved value of the stack pointer in the sigcontext. Which
73 also means that there is no fixed relation to the other 74 also means that there is no fixed relation to the other
74 saved registers, which means that we must use DW_CFA_expression 75 saved registers, which means that we must use DW_CFA_expression
75 to compute their addresses. It also means that when we 76 to compute their addresses. It also means that when we
76 adjust the stack with the popl, we have to do it all over again. */ 77 adjust the stack with the popl, we have to do it all over again. */
77 78
78#define do_cfa_expr(offset) \ 79#define do_cfa_expr(offset) \
@@ -91,27 +92,27 @@ __kernel_rt_sigreturn:
91 .sleb128 offset; /* offset */ \ 92 .sleb128 offset; /* offset */ \
921: 931:
93 94
94 do_cfa_expr(SIGCONTEXT_esp+4) 95 do_cfa_expr(IA32_SIGCONTEXT_sp+4)
95 do_expr(0, SIGCONTEXT_eax+4) 96 do_expr(0, IA32_SIGCONTEXT_ax+4)
96 do_expr(1, SIGCONTEXT_ecx+4) 97 do_expr(1, IA32_SIGCONTEXT_cx+4)
97 do_expr(2, SIGCONTEXT_edx+4) 98 do_expr(2, IA32_SIGCONTEXT_dx+4)
98 do_expr(3, SIGCONTEXT_ebx+4) 99 do_expr(3, IA32_SIGCONTEXT_bx+4)
99 do_expr(5, SIGCONTEXT_ebp+4) 100 do_expr(5, IA32_SIGCONTEXT_bp+4)
100 do_expr(6, SIGCONTEXT_esi+4) 101 do_expr(6, IA32_SIGCONTEXT_si+4)
101 do_expr(7, SIGCONTEXT_edi+4) 102 do_expr(7, IA32_SIGCONTEXT_di+4)
102 do_expr(8, SIGCONTEXT_eip+4) 103 do_expr(8, IA32_SIGCONTEXT_ip+4)
103 104
104 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ 105 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
105 106
106 do_cfa_expr(SIGCONTEXT_esp) 107 do_cfa_expr(IA32_SIGCONTEXT_sp)
107 do_expr(0, SIGCONTEXT_eax) 108 do_expr(0, IA32_SIGCONTEXT_ax)
108 do_expr(1, SIGCONTEXT_ecx) 109 do_expr(1, IA32_SIGCONTEXT_cx)
109 do_expr(2, SIGCONTEXT_edx) 110 do_expr(2, IA32_SIGCONTEXT_dx)
110 do_expr(3, SIGCONTEXT_ebx) 111 do_expr(3, IA32_SIGCONTEXT_bx)
111 do_expr(5, SIGCONTEXT_ebp) 112 do_expr(5, IA32_SIGCONTEXT_bp)
112 do_expr(6, SIGCONTEXT_esi) 113 do_expr(6, IA32_SIGCONTEXT_si)
113 do_expr(7, SIGCONTEXT_edi) 114 do_expr(7, IA32_SIGCONTEXT_di)
114 do_expr(8, SIGCONTEXT_eip) 115 do_expr(8, IA32_SIGCONTEXT_ip)
115 116
116 .align 4 117 .align 4
117.LENDFDEDLSI1: 118.LENDFDEDLSI1:
@@ -128,15 +129,15 @@ __kernel_rt_sigreturn:
128 slightly less complicated than the above, since we don't 129 slightly less complicated than the above, since we don't
129 modify the stack pointer in the process. */ 130 modify the stack pointer in the process. */
130 131
131 do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp) 132 do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp)
132 do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax) 133 do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax)
133 do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx) 134 do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx)
134 do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx) 135 do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx)
135 do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx) 136 do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx)
136 do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp) 137 do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp)
137 do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi) 138 do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si)
138 do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi) 139 do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di)
139 do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip) 140 do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip)
140 141
141 .align 4 142 .align 4
142.LENDFDEDLSI2: 143.LENDFDEDLSI2:
diff --git a/arch/x86/ia32/vsyscall-syscall.S b/arch/x86/vdso/vdso32/syscall.S
index cf9ef678de3..5415b5613d5 100644
--- a/arch/x86/ia32/vsyscall-syscall.S
+++ b/arch/x86/vdso/vdso32/syscall.S
@@ -1,16 +1,18 @@
1/* 1/*
2 * Code for the vsyscall page. This version uses the syscall instruction. 2 * Code for the vDSO. This version uses the syscall instruction.
3 *
4 * First get the common code for the sigreturn entry points.
5 * This must come first.
3 */ 6 */
7#define SYSCALL_ENTER_KERNEL syscall
8#include "sigreturn.S"
4 9
5#include <asm/ia32_unistd.h>
6#include <asm/asm-offsets.h>
7#include <asm/segment.h> 10#include <asm/segment.h>
8 11
9 .code32
10 .text 12 .text
11 .section .text.vsyscall,"ax"
12 .globl __kernel_vsyscall 13 .globl __kernel_vsyscall
13 .type __kernel_vsyscall,@function 14 .type __kernel_vsyscall,@function
15 ALIGN
14__kernel_vsyscall: 16__kernel_vsyscall:
15.LSTART_vsyscall: 17.LSTART_vsyscall:
16 push %ebp 18 push %ebp
@@ -64,6 +66,12 @@ __kernel_vsyscall:
64 .uleb128 4 66 .uleb128 4
65 .align 4 67 .align 4
66.LENDFDE1: 68.LENDFDE1:
69 .previous
67 70
68#define SYSCALL_ENTER_KERNEL syscall 71 /*
69#include "vsyscall-sigreturn.S" 72 * Pad out the segment to match the size of the sysenter.S version.
73 */
74VDSO32_vsyscall_eh_frame_size = 0x40
75 .section .data,"aw",@progbits
76 .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0
77 .previous
diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/vdso/vdso32/sysenter.S
index ed879bf4299..e2800affa75 100644
--- a/arch/x86/kernel/vsyscall-sysenter_32.S
+++ b/arch/x86/vdso/vdso32/sysenter.S
@@ -1,11 +1,10 @@
1/* 1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction. 2 * Code for the vDSO. This version uses the sysenter instruction.
3 * 3 *
4 * NOTE: 4 * First get the common code for the sigreturn entry points.
5 * 1) __kernel_vsyscall _must_ be first in this page. 5 * This must come first.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */ 6 */
7#include "sigreturn.S"
9 8
10/* 9/*
11 * The caller puts arg2 in %ecx, which gets pushed. The kernel will use 10 * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
@@ -23,11 +22,12 @@
23 * arg6 from the stack. 22 * arg6 from the stack.
24 * 23 *
25 * You can not use this vsyscall for the clone() syscall because the 24 * You can not use this vsyscall for the clone() syscall because the
26 * three dwords on the parent stack do not get copied to the child. 25 * three words on the parent stack do not get copied to the child.
27 */ 26 */
28 .text 27 .text
29 .globl __kernel_vsyscall 28 .globl __kernel_vsyscall
30 .type __kernel_vsyscall,@function 29 .type __kernel_vsyscall,@function
30 ALIGN
31__kernel_vsyscall: 31__kernel_vsyscall:
32.LSTART_vsyscall: 32.LSTART_vsyscall:
33 push %ecx 33 push %ecx
@@ -45,8 +45,7 @@ __kernel_vsyscall:
45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ 45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
46 jmp .Lenter_kernel 46 jmp .Lenter_kernel
47 /* 16: System call normal return point is here! */ 47 /* 16: System call normal return point is here! */
48 .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ 48VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */
49SYSENTER_RETURN:
50 pop %ebp 49 pop %ebp
51.Lpop_ebp: 50.Lpop_ebp:
52 pop %edx 51 pop %edx
@@ -85,38 +84,33 @@ SYSENTER_RETURN:
85 .uleb128 0 84 .uleb128 0
86 /* What follows are the instructions for the table generation. 85 /* What follows are the instructions for the table generation.
87 We have to record all changes of the stack pointer. */ 86 We have to record all changes of the stack pointer. */
88 .byte 0x04 /* DW_CFA_advance_loc4 */ 87 .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */
89 .long .Lpush_ecx-.LSTART_vsyscall
90 .byte 0x0e /* DW_CFA_def_cfa_offset */ 88 .byte 0x0e /* DW_CFA_def_cfa_offset */
91 .byte 0x08 /* RA at offset 8 now */ 89 .byte 0x08 /* RA at offset 8 now */
92 .byte 0x04 /* DW_CFA_advance_loc4 */ 90 .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */
93 .long .Lpush_edx-.Lpush_ecx
94 .byte 0x0e /* DW_CFA_def_cfa_offset */ 91 .byte 0x0e /* DW_CFA_def_cfa_offset */
95 .byte 0x0c /* RA at offset 12 now */ 92 .byte 0x0c /* RA at offset 12 now */
96 .byte 0x04 /* DW_CFA_advance_loc4 */ 93 .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */
97 .long .Lenter_kernel-.Lpush_edx
98 .byte 0x0e /* DW_CFA_def_cfa_offset */ 94 .byte 0x0e /* DW_CFA_def_cfa_offset */
99 .byte 0x10 /* RA at offset 16 now */ 95 .byte 0x10 /* RA at offset 16 now */
100 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ 96 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
101 /* Finally the epilogue. */ 97 /* Finally the epilogue. */
102 .byte 0x04 /* DW_CFA_advance_loc4 */ 98 .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */
103 .long .Lpop_ebp-.Lenter_kernel
104 .byte 0x0e /* DW_CFA_def_cfa_offset */ 99 .byte 0x0e /* DW_CFA_def_cfa_offset */
105 .byte 0x0c /* RA at offset 12 now */ 100 .byte 0x0c /* RA at offset 12 now */
106 .byte 0xc5 /* DW_CFA_restore %ebp */ 101 .byte 0xc5 /* DW_CFA_restore %ebp */
107 .byte 0x04 /* DW_CFA_advance_loc4 */ 102 .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */
108 .long .Lpop_edx-.Lpop_ebp
109 .byte 0x0e /* DW_CFA_def_cfa_offset */ 103 .byte 0x0e /* DW_CFA_def_cfa_offset */
110 .byte 0x08 /* RA at offset 8 now */ 104 .byte 0x08 /* RA at offset 8 now */
111 .byte 0x04 /* DW_CFA_advance_loc4 */ 105 .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */
112 .long .Lpop_ecx-.Lpop_edx
113 .byte 0x0e /* DW_CFA_def_cfa_offset */ 106 .byte 0x0e /* DW_CFA_def_cfa_offset */
114 .byte 0x04 /* RA at offset 4 now */ 107 .byte 0x04 /* RA at offset 4 now */
115 .align 4 108 .align 4
116.LENDFDEDLSI: 109.LENDFDEDLSI:
117 .previous 110 .previous
118 111
119/* 112 /*
120 * Get the common code for the sigreturn entry points. 113 * Emit a symbol with the size of this .eh_frame data,
121 */ 114 * to verify it matches the other versions.
122#include "vsyscall-sigreturn_32.S" 115 */
116VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI)
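
The unwind-table rewrite in sysenter.S swaps each four-byte DW_CFA_advance_loc4 record (opcode 0x04 followed by a 32-bit delta) for the compact DW_CFA_advance_loc form, where opcodes 0x40-0x7f carry the delta in their low six bits; that is why the new bytes read "0x40 + (label - label)". A small decoder sketch (editorial, not from the patch) covering only the opcodes that appear in the diff:

    #include <stdio.h>

    static void decode_cfa_byte(unsigned char op)
    {
            if ((op >> 6) == 0x1)            /* DW_CFA_advance_loc: primary opcode 01 in the top bits */
                    printf("advance location by %u bytes\n", op & 0x3f);
            else if (op == 0x0e)             /* DW_CFA_def_cfa_offset: operand byte follows */
                    printf("next byte gives the new CFA offset\n");
            else if (op == 0x04)             /* DW_CFA_advance_loc4: 4-byte delta follows */
                    printf("a 4-byte delta follows\n");
            else
                    printf("opcode 0x%02x\n", op);
    }

    int main(void)
    {
            decode_cfa_byte(0x40 + 3);       /* e.g. .Lpush_ecx - .LSTART_vsyscall == 3 */
            decode_cfa_byte(0x0e);
            return 0;
    }
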
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S
new file mode 100644
index 00000000000..976124bb5f9
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vdso32.lds.S
@@ -0,0 +1,37 @@
1/*
2 * Linker script for 32-bit vDSO.
3 * We #include the file to define the layout details.
4 * Here we only choose the prelinked virtual address.
5 *
6 * This file defines the version script giving the user-exported symbols in
7 * the DSO. We can define local symbols here called VDSO* to make their
8 * values visible using the asm-x86/vdso.h macros from the kernel proper.
9 */
10
11#define VDSO_PRELINK 0
12#include "../vdso-layout.lds.S"
13
14/* The ELF entry point can be used to set the AT_SYSINFO value. */
15ENTRY(__kernel_vsyscall);
16
17/*
18 * This controls what userland symbols we export from the vDSO.
19 */
20VERSION
21{
22 LINUX_2.5 {
23 global:
24 __kernel_vsyscall;
25 __kernel_sigreturn;
26 __kernel_rt_sigreturn;
27 local: *;
28 };
29}
30
31/*
32 * Symbols we define here called VDSO* get their values into vdso32-syms.h.
33 */
34VDSO32_PRELINK = VDSO_PRELINK;
35VDSO32_vsyscall = __kernel_vsyscall;
36VDSO32_sigreturn = __kernel_sigreturn;
37VDSO32_rt_sigreturn = __kernel_rt_sigreturn;
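
The VDSO32_* assignments at the bottom of this new linker script are what the build extracts into vdso32-syms.h, letting the kernel proper locate symbols inside the mapped vDSO image. As a hedged illustration only (the real macro lives in asm-x86/vdso.h and may differ), a VDSO32_SYMBOL()-style helper can be plain base-plus-offset arithmetic; the offsets below are invented:

    #include <stdio.h>

    #define VDSO32_PRELINK   0x0
    #define VDSO32_vsyscall  0x400          /* made-up offset of __kernel_vsyscall */

    #define VDSO32_SYMBOL(base, name) \
            ((void *)((char *)(base) + (VDSO32_##name - VDSO32_PRELINK)))

    int main(void)
    {
            char vdso_image[0x1000];        /* stand-in for the mapped vDSO pages */
            void *entry = VDSO32_SYMBOL(vdso_image, vsyscall);
            printf("__kernel_vsyscall would be at %p\n", entry);
            return 0;
    }
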
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 3b1ae1abfba..c8097f17f8a 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -15,11 +15,11 @@
15 15
16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) 16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
17{ 17{
18 unsigned int dummy, p; 18 unsigned int p;
19 19
20 if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { 20 if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
21 /* Load per CPU data from RDTSCP */ 21 /* Load per CPU data from RDTSCP */
22 rdtscp(dummy, dummy, p); 22 native_read_tscp(&p);
23 } else { 23 } else {
24 /* Load per CPU data from GDT */ 24 /* Load per CPU data from GDT */
25 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); 25 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
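
For context on the vgetcpu hunk: whichever path runs, p ends up holding the per-CPU value the kernel stored earlier, either via the IA32_TSC_AUX MSR that RDTSCP returns or via the limit of the per-CPU GDT segment read with lsl, and that value conventionally packs the CPU number in the low 12 bits with the NUMA node above it. A sketch of the unpacking step that follows later in __vdso_getcpu (not shown in this hunk), taking that packing as an assumption:

    #include <stdio.h>

    static void unpack_getcpu(unsigned int p, unsigned int *cpu, unsigned int *node)
    {
            if (cpu)
                    *cpu = p & 0xfff;        /* low 12 bits: CPU number */
            if (node)
                    *node = p >> 12;         /* remaining bits: NUMA node */
    }

    int main(void)
    {
            unsigned int cpu, node;
            unpack_getcpu((2u << 12) | 5u, &cpu, &node);
            printf("cpu=%u node=%u\n", cpu, node);   /* prints cpu=5 node=2 */
            return 0;
    }
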
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index ff9333e5fb0..3fdd51497a8 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -11,23 +11,20 @@
11#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
12#include <asm/vgtod.h> 12#include <asm/vgtod.h>
13#include <asm/proto.h> 13#include <asm/proto.h>
14#include "voffset.h" 14#include <asm/vdso.h>
15 15
16int vdso_enabled = 1; 16#include "vextern.h" /* Just for VMAGIC. */
17
18#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x;
19#include "vextern.h"
20#undef VEXTERN 17#undef VEXTERN
21 18
22extern char vdso_kernel_start[], vdso_start[], vdso_end[]; 19int vdso_enabled = 1;
20
21extern char vdso_start[], vdso_end[];
23extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
24 23
25struct page **vdso_pages; 24struct page **vdso_pages;
26 25
27static inline void *var_ref(void *vbase, char *var, char *name) 26static inline void *var_ref(void *p, char *name)
28{ 27{
29 unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET;
30 void *p = vbase + offset;
31 if (*(void **)p != (void *)VMAGIC) { 28 if (*(void **)p != (void *)VMAGIC) {
32 printk("VDSO: variable %s broken\n", name); 29 printk("VDSO: variable %s broken\n", name);
33 vdso_enabled = 0; 30 vdso_enabled = 0;
@@ -62,9 +59,8 @@ static int __init init_vdso_vars(void)
62 vdso_enabled = 0; 59 vdso_enabled = 0;
63 } 60 }
64 61
65#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x)
66#define VEXTERN(x) \ 62#define VEXTERN(x) \
67 V(vdso_ ## x) = &__ ## x; 63 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
68#include "vextern.h" 64#include "vextern.h"
69#undef VEXTERN 65#undef VEXTERN
70 return 0; 66 return 0;
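
The vma.c change simplifies var_ref() because vDSO symbol addresses now come straight from VDSO64_SYMBOL() rather than from a hand-computed text offset, but the underlying idiom is unchanged: each exported vDSO variable is built holding a magic placeholder, and at boot the kernel verifies the placeholder before overwriting it with the real kernel-side pointer. A self-contained sketch with invented names and magic value:

    #include <stdio.h>

    #define VMAGIC 0xfeedbabeUL                       /* stand-in for the real magic constant */

    static long kernel_side_var;                      /* what the vDSO slot should point at */
    static void *vdso_slot = (void *)VMAGIC;          /* slot inside a (here fake) vDSO image */
    static int vdso_enabled = 1;

    static void *var_ref(void *p, const char *name)
    {
            if (*(void **)p != (void *)VMAGIC) {      /* placeholder missing: layout broken */
                    printf("VDSO: variable %s broken\n", name);
                    vdso_enabled = 0;
            }
            return p;
    }

    int main(void)
    {
            *(long **)var_ref(&vdso_slot, "kernel_side_var") = &kernel_side_var;
            return vdso_enabled ? 0 : 1;
    }
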
diff --git a/arch/x86/vdso/voffset.h b/arch/x86/vdso/voffset.h
deleted file mode 100644
index 4af67c79085..00000000000
--- a/arch/x86/vdso/voffset.h
+++ /dev/null
@@ -1 +0,0 @@
1#define VDSO_TEXT_OFFSET 0x600
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index fbfa55ce0d5..4d5f2649bee 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,6 +5,7 @@
5config XEN 5config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 depends on X86_32
8 depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) 9 depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
9 help 10 help
10 This is the Linux Xen port. Enabling this will allow the 11 This is the Linux Xen port. Enabling this will allow the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b6af3ea43c7..de647bc6e74 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -95,7 +95,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
95 * 95 *
96 * 0: not available, 1: available 96 * 0: not available, 1: available
97 */ 97 */
98static int have_vcpu_info_placement = 1; 98static int have_vcpu_info_placement = 0;
99 99
100static void __init xen_vcpu_setup(int cpu) 100static void __init xen_vcpu_setup(int cpu)
101{ 101{
@@ -141,8 +141,8 @@ static void __init xen_banner(void)
141 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); 141 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
142} 142}
143 143
144static void xen_cpuid(unsigned int *eax, unsigned int *ebx, 144static void xen_cpuid(unsigned int *ax, unsigned int *bx,
145 unsigned int *ecx, unsigned int *edx) 145 unsigned int *cx, unsigned int *dx)
146{ 146{
147 unsigned maskedx = ~0; 147 unsigned maskedx = ~0;
148 148
@@ -150,18 +150,18 @@ static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
150 * Mask out inconvenient features, to try and disable as many 150 * Mask out inconvenient features, to try and disable as many
151 * unsupported kernel subsystems as possible. 151 * unsupported kernel subsystems as possible.
152 */ 152 */
153 if (*eax == 1) 153 if (*ax == 1)
154 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ 154 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
155 (1 << X86_FEATURE_ACPI) | /* disable ACPI */ 155 (1 << X86_FEATURE_ACPI) | /* disable ACPI */
156 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 156 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
157 157
158 asm(XEN_EMULATE_PREFIX "cpuid" 158 asm(XEN_EMULATE_PREFIX "cpuid"
159 : "=a" (*eax), 159 : "=a" (*ax),
160 "=b" (*ebx), 160 "=b" (*bx),
161 "=c" (*ecx), 161 "=c" (*cx),
162 "=d" (*edx) 162 "=d" (*dx)
163 : "0" (*eax), "2" (*ecx)); 163 : "0" (*ax), "2" (*cx));
164 *edx &= maskedx; 164 *dx &= maskedx;
165} 165}
166 166
167static void xen_set_debugreg(int reg, unsigned long val) 167static void xen_set_debugreg(int reg, unsigned long val)
@@ -275,19 +275,12 @@ static unsigned long xen_store_tr(void)
275 275
276static void xen_set_ldt(const void *addr, unsigned entries) 276static void xen_set_ldt(const void *addr, unsigned entries)
277{ 277{
278 unsigned long linear_addr = (unsigned long)addr;
279 struct mmuext_op *op; 278 struct mmuext_op *op;
280 struct multicall_space mcs = xen_mc_entry(sizeof(*op)); 279 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
281 280
282 op = mcs.args; 281 op = mcs.args;
283 op->cmd = MMUEXT_SET_LDT; 282 op->cmd = MMUEXT_SET_LDT;
284 if (linear_addr) { 283 op->arg1.linear_addr = (unsigned long)addr;
285 /* ldt my be vmalloced, use arbitrary_virt_to_machine */
286 xmaddr_t maddr;
287 maddr = arbitrary_virt_to_machine((unsigned long)addr);
288 linear_addr = (unsigned long)maddr.maddr;
289 }
290 op->arg1.linear_addr = linear_addr;
291 op->arg2.nr_ents = entries; 284 op->arg2.nr_ents = entries;
292 285
293 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 286 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
@@ -295,7 +288,7 @@ static void xen_set_ldt(const void *addr, unsigned entries)
295 xen_mc_issue(PARAVIRT_LAZY_CPU); 288 xen_mc_issue(PARAVIRT_LAZY_CPU);
296} 289}
297 290
298static void xen_load_gdt(const struct Xgt_desc_struct *dtr) 291static void xen_load_gdt(const struct desc_ptr *dtr)
299{ 292{
300 unsigned long *frames; 293 unsigned long *frames;
301 unsigned long va = dtr->address; 294 unsigned long va = dtr->address;
@@ -357,11 +350,11 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
357} 350}
358 351
359static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 352static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
360 u32 low, u32 high) 353 const void *ptr)
361{ 354{
362 unsigned long lp = (unsigned long)&dt[entrynum]; 355 unsigned long lp = (unsigned long)&dt[entrynum];
363 xmaddr_t mach_lp = virt_to_machine(lp); 356 xmaddr_t mach_lp = virt_to_machine(lp);
364 u64 entry = (u64)high << 32 | low; 357 u64 entry = *(u64 *)ptr;
365 358
366 preempt_disable(); 359 preempt_disable();
367 360
@@ -395,12 +388,11 @@ static int cvt_gate_to_trap(int vector, u32 low, u32 high,
395} 388}
396 389
397/* Locations of each CPU's IDT */ 390/* Locations of each CPU's IDT */
398static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); 391static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
399 392
400/* Set an IDT entry. If the entry is part of the current IDT, then 393/* Set an IDT entry. If the entry is part of the current IDT, then
401 also update Xen. */ 394 also update Xen. */
402static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, 395static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
403 u32 low, u32 high)
404{ 396{
405 unsigned long p = (unsigned long)&dt[entrynum]; 397 unsigned long p = (unsigned long)&dt[entrynum];
406 unsigned long start, end; 398 unsigned long start, end;
@@ -412,14 +404,15 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
412 404
413 xen_mc_flush(); 405 xen_mc_flush();
414 406
415 write_dt_entry(dt, entrynum, low, high); 407 native_write_idt_entry(dt, entrynum, g);
416 408
417 if (p >= start && (p + 8) <= end) { 409 if (p >= start && (p + 8) <= end) {
418 struct trap_info info[2]; 410 struct trap_info info[2];
411 u32 *desc = (u32 *)g;
419 412
420 info[1].address = 0; 413 info[1].address = 0;
421 414
422 if (cvt_gate_to_trap(entrynum, low, high, &info[0])) 415 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
423 if (HYPERVISOR_set_trap_table(info)) 416 if (HYPERVISOR_set_trap_table(info))
424 BUG(); 417 BUG();
425 } 418 }
@@ -427,7 +420,7 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
427 preempt_enable(); 420 preempt_enable();
428} 421}
429 422
430static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, 423static void xen_convert_trap_info(const struct desc_ptr *desc,
431 struct trap_info *traps) 424 struct trap_info *traps)
432{ 425{
433 unsigned in, out, count; 426 unsigned in, out, count;
@@ -446,7 +439,7 @@ static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
446 439
447void xen_copy_trap_info(struct trap_info *traps) 440void xen_copy_trap_info(struct trap_info *traps)
448{ 441{
449 const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc); 442 const struct desc_ptr *desc = &__get_cpu_var(idt_desc);
450 443
451 xen_convert_trap_info(desc, traps); 444 xen_convert_trap_info(desc, traps);
452} 445}
@@ -454,7 +447,7 @@ void xen_copy_trap_info(struct trap_info *traps)
454/* Load a new IDT into Xen. In principle this can be per-CPU, so we 447/* Load a new IDT into Xen. In principle this can be per-CPU, so we
455 hold a spinlock to protect the static traps[] array (static because 448 hold a spinlock to protect the static traps[] array (static because
456 it avoids allocation, and saves stack space). */ 449 it avoids allocation, and saves stack space). */
457static void xen_load_idt(const struct Xgt_desc_struct *desc) 450static void xen_load_idt(const struct desc_ptr *desc)
458{ 451{
459 static DEFINE_SPINLOCK(lock); 452 static DEFINE_SPINLOCK(lock);
460 static struct trap_info traps[257]; 453 static struct trap_info traps[257];
@@ -475,22 +468,21 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
475/* Write a GDT descriptor entry. Ignore LDT descriptors, since 468/* Write a GDT descriptor entry. Ignore LDT descriptors, since
476 they're handled differently. */ 469 they're handled differently. */
477static void xen_write_gdt_entry(struct desc_struct *dt, int entry, 470static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
478 u32 low, u32 high) 471 const void *desc, int type)
479{ 472{
480 preempt_disable(); 473 preempt_disable();
481 474
482 switch ((high >> 8) & 0xff) { 475 switch (type) {
483 case DESCTYPE_LDT: 476 case DESC_LDT:
484 case DESCTYPE_TSS: 477 case DESC_TSS:
485 /* ignore */ 478 /* ignore */
486 break; 479 break;
487 480
488 default: { 481 default: {
489 xmaddr_t maddr = virt_to_machine(&dt[entry]); 482 xmaddr_t maddr = virt_to_machine(&dt[entry]);
490 u64 desc = (u64)high << 32 | low;
491 483
492 xen_mc_flush(); 484 xen_mc_flush();
493 if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) 485 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
494 BUG(); 486 BUG();
495 } 487 }
496 488
@@ -499,11 +491,11 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
499 preempt_enable(); 491 preempt_enable();
500} 492}
501 493
502static void xen_load_esp0(struct tss_struct *tss, 494static void xen_load_sp0(struct tss_struct *tss,
503 struct thread_struct *thread) 495 struct thread_struct *thread)
504{ 496{
505 struct multicall_space mcs = xen_mc_entry(0); 497 struct multicall_space mcs = xen_mc_entry(0);
506 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); 498 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
507 xen_mc_issue(PARAVIRT_LAZY_CPU); 499 xen_mc_issue(PARAVIRT_LAZY_CPU);
508} 500}
509 501
@@ -521,12 +513,12 @@ static void xen_io_delay(void)
521} 513}
522 514
523#ifdef CONFIG_X86_LOCAL_APIC 515#ifdef CONFIG_X86_LOCAL_APIC
524static unsigned long xen_apic_read(unsigned long reg) 516static u32 xen_apic_read(unsigned long reg)
525{ 517{
526 return 0; 518 return 0;
527} 519}
528 520
529static void xen_apic_write(unsigned long reg, unsigned long val) 521static void xen_apic_write(unsigned long reg, u32 val)
530{ 522{
531 /* Warn to see if there's any stray references */ 523 /* Warn to see if there's any stray references */
532 WARN_ON(1); 524 WARN_ON(1);
@@ -666,6 +658,13 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
666 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 658 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
667} 659}
668 660
661/* Early release_pt assumes that all pts are pinned, since there's
662 only init_mm and anything attached to that is pinned. */
663static void xen_release_pt_init(u32 pfn)
664{
665 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
666}
667
669static void pin_pagetable_pfn(unsigned level, unsigned long pfn) 668static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
670{ 669{
671 struct mmuext_op op; 670 struct mmuext_op op;
@@ -677,7 +676,7 @@ static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
677 676
678/* This needs to make sure the new pte page is pinned iff its being 677/* This needs to make sure the new pte page is pinned iff its being
679 attached to a pinned pagetable. */ 678 attached to a pinned pagetable. */
680static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) 679static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
681{ 680{
682 struct page *page = pfn_to_page(pfn); 681 struct page *page = pfn_to_page(pfn);
683 682
@@ -686,7 +685,7 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
686 685
687 if (!PageHighMem(page)) { 686 if (!PageHighMem(page)) {
688 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 687 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
689 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 688 pin_pagetable_pfn(level, pfn);
690 } else 689 } else
691 /* make sure there are no stray mappings of 690 /* make sure there are no stray mappings of
692 this page */ 691 this page */
@@ -694,6 +693,16 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
694 } 693 }
695} 694}
696 695
696static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
697{
698 xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L1_TABLE);
699}
700
701static void xen_alloc_pd(struct mm_struct *mm, u32 pfn)
702{
703 xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L2_TABLE);
704}
705
697/* This should never happen until we're OK to use struct page */ 706/* This should never happen until we're OK to use struct page */
698static void xen_release_pt(u32 pfn) 707static void xen_release_pt(u32 pfn)
699{ 708{
@@ -796,6 +805,9 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
796 /* This will work as long as patching hasn't happened yet 805 /* This will work as long as patching hasn't happened yet
797 (which it hasn't) */ 806 (which it hasn't) */
798 pv_mmu_ops.alloc_pt = xen_alloc_pt; 807 pv_mmu_ops.alloc_pt = xen_alloc_pt;
808 pv_mmu_ops.alloc_pd = xen_alloc_pd;
809 pv_mmu_ops.release_pt = xen_release_pt;
810 pv_mmu_ops.release_pd = xen_release_pt;
799 pv_mmu_ops.set_pte = xen_set_pte; 811 pv_mmu_ops.set_pte = xen_set_pte;
800 812
801 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 813 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -953,7 +965,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
953 .read_pmc = native_read_pmc, 965 .read_pmc = native_read_pmc,
954 966
955 .iret = (void *)&hypercall_page[__HYPERVISOR_iret], 967 .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
956 .irq_enable_sysexit = NULL, /* never called */ 968 .irq_enable_syscall_ret = NULL, /* never called */
957 969
958 .load_tr_desc = paravirt_nop, 970 .load_tr_desc = paravirt_nop,
959 .set_ldt = xen_set_ldt, 971 .set_ldt = xen_set_ldt,
@@ -968,7 +980,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
968 .write_ldt_entry = xen_write_ldt_entry, 980 .write_ldt_entry = xen_write_ldt_entry,
969 .write_gdt_entry = xen_write_gdt_entry, 981 .write_gdt_entry = xen_write_gdt_entry,
970 .write_idt_entry = xen_write_idt_entry, 982 .write_idt_entry = xen_write_idt_entry,
971 .load_esp0 = xen_load_esp0, 983 .load_sp0 = xen_load_sp0,
972 984
973 .set_iopl_mask = xen_set_iopl_mask, 985 .set_iopl_mask = xen_set_iopl_mask,
974 .io_delay = xen_io_delay, 986 .io_delay = xen_io_delay,
@@ -1019,10 +1031,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1019 .pte_update_defer = paravirt_nop, 1031 .pte_update_defer = paravirt_nop,
1020 1032
1021 .alloc_pt = xen_alloc_pt_init, 1033 .alloc_pt = xen_alloc_pt_init,
1022 .release_pt = xen_release_pt, 1034 .release_pt = xen_release_pt_init,
1023 .alloc_pd = paravirt_nop, 1035 .alloc_pd = xen_alloc_pt_init,
1024 .alloc_pd_clone = paravirt_nop, 1036 .alloc_pd_clone = paravirt_nop,
1025 .release_pd = paravirt_nop, 1037 .release_pd = xen_release_pt_init,
1026 1038
1027#ifdef CONFIG_HIGHPTE 1039#ifdef CONFIG_HIGHPTE
1028 .kmap_atomic_pte = xen_kmap_atomic_pte, 1040 .kmap_atomic_pte = xen_kmap_atomic_pte,
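
One detail of the enlighten.c rework worth spelling out: the old write_gdt_entry hook decoded the descriptor type itself with (high >> 8) & 0xff, whereas the new signature receives the type as an explicit argument. The old extraction worked because in an 8-byte x86 segment descriptor the type/S/DPL/P bits occupy bits 40-47, i.e. bits 8-15 of the high dword. A purely illustrative sketch of that decoding:

    #include <stdint.h>
    #include <stdio.h>

    /* Bits 40-47 of a segment descriptor: type (4 bits), S, DPL (2 bits), P. */
    static unsigned desc_access_byte(uint64_t desc)
    {
            uint32_t high = (uint32_t)(desc >> 32);
            return (high >> 8) & 0xff;
    }

    int main(void)
    {
            uint64_t ldt_like = (uint64_t)0x82 << 40;   /* present, system, LDT type */
            printf("access byte = 0x%02x\n", desc_access_byte(ldt_like));
            return 0;
    }
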
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
index 6d1da5809e6..dcf613e1758 100644
--- a/arch/x86/xen/events.c
+++ b/arch/x86/xen/events.c
@@ -465,7 +465,7 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
465 * a bitset of words which contain pending event bits. The second 465 * a bitset of words which contain pending event bits. The second
466 * level is a bitset of pending events themselves. 466 * level is a bitset of pending events themselves.
467 */ 467 */
468fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) 468void xen_evtchn_do_upcall(struct pt_regs *regs)
469{ 469{
470 int cpu = get_cpu(); 470 int cpu = get_cpu();
471 struct shared_info *s = HYPERVISOR_shared_info; 471 struct shared_info *s = HYPERVISOR_shared_info;
@@ -487,7 +487,7 @@ fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
487 int irq = evtchn_to_irq[port]; 487 int irq = evtchn_to_irq[port];
488 488
489 if (irq != -1) { 489 if (irq != -1) {
490 regs->orig_eax = ~irq; 490 regs->orig_ax = ~irq;
491 do_IRQ(regs); 491 do_IRQ(regs);
492 } 492 }
493 } 493 }
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0ac6c5dc49b..45aa771e73a 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -58,7 +58,8 @@
58 58
59xmaddr_t arbitrary_virt_to_machine(unsigned long address) 59xmaddr_t arbitrary_virt_to_machine(unsigned long address)
60{ 60{
61 pte_t *pte = lookup_address(address); 61 int level;
62 pte_t *pte = lookup_address(address, &level);
62 unsigned offset = address & PAGE_MASK; 63 unsigned offset = address & PAGE_MASK;
63 64
64 BUG_ON(pte == NULL); 65 BUG_ON(pte == NULL);
@@ -70,8 +71,9 @@ void make_lowmem_page_readonly(void *vaddr)
70{ 71{
71 pte_t *pte, ptev; 72 pte_t *pte, ptev;
72 unsigned long address = (unsigned long)vaddr; 73 unsigned long address = (unsigned long)vaddr;
74 int level;
73 75
74 pte = lookup_address(address); 76 pte = lookup_address(address, &level);
75 BUG_ON(pte == NULL); 77 BUG_ON(pte == NULL);
76 78
77 ptev = pte_wrprotect(*pte); 79 ptev = pte_wrprotect(*pte);
@@ -84,8 +86,9 @@ void make_lowmem_page_readwrite(void *vaddr)
84{ 86{
85 pte_t *pte, ptev; 87 pte_t *pte, ptev;
86 unsigned long address = (unsigned long)vaddr; 88 unsigned long address = (unsigned long)vaddr;
89 int level;
87 90
88 pte = lookup_address(address); 91 pte = lookup_address(address, &level);
89 BUG_ON(pte == NULL); 92 BUG_ON(pte == NULL);
90 93
91 ptev = pte_mkwrite(*pte); 94 ptev = pte_mkwrite(*pte);
@@ -241,12 +244,12 @@ unsigned long long xen_pgd_val(pgd_t pgd)
241 244
242pte_t xen_make_pte(unsigned long long pte) 245pte_t xen_make_pte(unsigned long long pte)
243{ 246{
244 if (pte & 1) 247 if (pte & _PAGE_PRESENT) {
245 pte = phys_to_machine(XPADDR(pte)).maddr; 248 pte = phys_to_machine(XPADDR(pte)).maddr;
249 pte &= ~(_PAGE_PCD | _PAGE_PWT);
250 }
246 251
247 pte &= ~_PAGE_PCD; 252 return (pte_t){ .pte = pte };
248
249 return (pte_t){ pte, pte >> 32 };
250} 253}
251 254
252pmd_t xen_make_pmd(unsigned long long pmd) 255pmd_t xen_make_pmd(unsigned long long pmd)
@@ -290,10 +293,10 @@ unsigned long xen_pgd_val(pgd_t pgd)
290 293
291pte_t xen_make_pte(unsigned long pte) 294pte_t xen_make_pte(unsigned long pte)
292{ 295{
293 if (pte & _PAGE_PRESENT) 296 if (pte & _PAGE_PRESENT) {
294 pte = phys_to_machine(XPADDR(pte)).maddr; 297 pte = phys_to_machine(XPADDR(pte)).maddr;
295 298 pte &= ~(_PAGE_PCD | _PAGE_PWT);
296 pte &= ~_PAGE_PCD; 299 }
297 300
298 return (pte_t){ pte }; 301 return (pte_t){ pte };
299} 302}
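
The mmu.c hunks make both xen_make_pte() variants follow the same rule: only present ptes are translated from pseudo-physical to machine addresses, and the cache-attribute bits PCD and PWT are masked off for guest mappings. A hedged, self-contained restatement of that rule; the constants are the standard x86 bit values, and the translation is stubbed out here:

    #include <stdint.h>

    #define _PAGE_PRESENT 0x001
    #define _PAGE_PWT     0x008
    #define _PAGE_PCD     0x010

    /* Stand-in for phys_to_machine(): identity here, a p2m lookup under Xen. */
    static uint64_t phys_to_machine_stub(uint64_t paddr)
    {
            return paddr;
    }

    static uint64_t make_pte(uint64_t pte)
    {
            if (pte & _PAGE_PRESENT) {
                    pte = phys_to_machine_stub(pte);
                    pte &= ~(uint64_t)(_PAGE_PCD | _PAGE_PWT);
            }
            return pte;
    }

    int main(void)
    {
            /* present pte with PCD set: PCD is stripped, address kept */
            return make_pte(0x1000 | _PAGE_PRESENT | _PAGE_PCD) == 0x1001 ? 0 : 1;
    }
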
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index f84e7722664..3bad4773a2f 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -10,6 +10,7 @@
10#include <linux/pm.h> 10#include <linux/pm.h>
11 11
12#include <asm/elf.h> 12#include <asm/elf.h>
13#include <asm/vdso.h>
13#include <asm/e820.h> 14#include <asm/e820.h>
14#include <asm/setup.h> 15#include <asm/setup.h>
15#include <asm/xen/hypervisor.h> 16#include <asm/xen/hypervisor.h>
@@ -59,12 +60,10 @@ static void xen_idle(void)
59/* 60/*
60 * Set the bit indicating "nosegneg" library variants should be used. 61 * Set the bit indicating "nosegneg" library variants should be used.
61 */ 62 */
62static void fiddle_vdso(void) 63static void __init fiddle_vdso(void)
63{ 64{
64 extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */ 65 extern const char vdso32_default_start;
65 extern char vsyscall_int80_start; 66 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
66 u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
67 &vsyscall_int80_start);
68 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 67 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
69} 68}
70 69
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c1b131bcdcb..aafc5443740 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -146,7 +146,7 @@ void __init xen_smp_prepare_boot_cpu(void)
146 old memory can be recycled */ 146 old memory can be recycled */
147 make_lowmem_page_readwrite(&per_cpu__gdt_page); 147 make_lowmem_page_readwrite(&per_cpu__gdt_page);
148 148
149 for (cpu = 0; cpu < NR_CPUS; cpu++) { 149 for_each_possible_cpu(cpu) {
150 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 150 cpus_clear(per_cpu(cpu_sibling_map, cpu));
151 /* 151 /*
152 * cpu_core_map lives in a per cpu area that is cleared 152 * cpu_core_map lives in a per cpu area that is cleared
@@ -163,7 +163,7 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus)
163{ 163{
164 unsigned cpu; 164 unsigned cpu;
165 165
166 for (cpu = 0; cpu < NR_CPUS; cpu++) { 166 for_each_possible_cpu(cpu) {
167 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 167 cpus_clear(per_cpu(cpu_sibling_map, cpu));
168 /* 168 /*
169 * cpu_core_ map will be zeroed when the per 169 * cpu_core_ map will be zeroed when the per
@@ -239,10 +239,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
239 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 239 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
240 240
241 ctxt->user_regs.cs = __KERNEL_CS; 241 ctxt->user_regs.cs = __KERNEL_CS;
242 ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); 242 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
243 243
244 ctxt->kernel_ss = __KERNEL_DS; 244 ctxt->kernel_ss = __KERNEL_DS;
245 ctxt->kernel_sp = idle->thread.esp0; 245 ctxt->kernel_sp = idle->thread.sp0;
246 246
247 ctxt->event_callback_cs = __KERNEL_CS; 247 ctxt->event_callback_cs = __KERNEL_CS;
248 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; 248 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index d083ff5ef08..b3721fd6877 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -592,7 +592,7 @@ __init void xen_time_init(void)
592 set_normalized_timespec(&wall_to_monotonic, 592 set_normalized_timespec(&wall_to_monotonic,
593 -xtime.tv_sec, -xtime.tv_nsec); 593 -xtime.tv_sec, -xtime.tv_nsec);
594 594
595 tsc_disable = 0; 595 setup_force_cpu_cap(X86_FEATURE_TSC);
596 596
597 xen_setup_timer(cpu); 597 xen_setup_timer(cpu);
598 xen_setup_cpu_clockevents(); 598 xen_setup_cpu_clockevents();
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index f8d6937db2e..288d587ce73 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -4,16 +4,18 @@
4#ifdef CONFIG_XEN 4#ifdef CONFIG_XEN
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h>
7#include <asm/boot.h> 8#include <asm/boot.h>
8#include <xen/interface/elfnote.h> 9#include <xen/interface/elfnote.h>
9 10
10.pushsection .init.text 11 __INIT
11ENTRY(startup_xen) 12ENTRY(startup_xen)
12 movl %esi,xen_start_info 13 movl %esi,xen_start_info
13 cld 14 cld
14 movl $(init_thread_union+THREAD_SIZE),%esp 15 movl $(init_thread_union+THREAD_SIZE),%esp
15 jmp xen_start_kernel 16 jmp xen_start_kernel
16.popsection 17
18 __FINIT
17 19
18.pushsection .bss.page_aligned 20.pushsection .bss.page_aligned
19 .align PAGE_SIZE_asm 21 .align PAGE_SIZE_asm