path: root/arch/x86
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/boot/.gitignore5
-rw-r--r--arch/x86/boot/Makefile171
-rw-r--r--arch/x86/boot/a20.c161
-rw-r--r--arch/x86/boot/apm.c98
-rw-r--r--arch/x86/boot/bitops.h45
-rw-r--r--arch/x86/boot/boot.h296
-rw-r--r--arch/x86/boot/cmdline.c97
-rw-r--r--arch/x86/boot/code16gcc.h15
-rw-r--r--arch/x86/boot/compressed/.gitignore1
-rw-r--r--arch/x86/boot/compressed/Makefile5
-rw-r--r--arch/x86/boot/compressed/Makefile_3250
-rw-r--r--arch/x86/boot/compressed/Makefile_6430
-rw-r--r--arch/x86/boot/compressed/head_32.S180
-rw-r--r--arch/x86/boot/compressed/head_64.S311
-rw-r--r--arch/x86/boot/compressed/misc_32.c379
-rw-r--r--arch/x86/boot/compressed/misc_64.c371
-rw-r--r--arch/x86/boot/compressed/relocs.c631
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.lds43
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.scr10
-rw-r--r--arch/x86/boot/compressed/vmlinux_64.lds44
-rw-r--r--arch/x86/boot/compressed/vmlinux_64.scr10
-rw-r--r--arch/x86/boot/copy.S101
-rw-r--r--arch/x86/boot/cpu.c69
-rw-r--r--arch/x86/boot/cpucheck.c268
-rw-r--r--arch/x86/boot/edd.c167
-rw-r--r--arch/x86/boot/header.S283
-rw-r--r--arch/x86/boot/install.sh61
-rw-r--r--arch/x86/boot/main.c161
-rw-r--r--arch/x86/boot/mca.c43
-rw-r--r--arch/x86/boot/memory.c118
-rw-r--r--arch/x86/boot/mtools.conf.in17
-rw-r--r--arch/x86/boot/pm.c174
-rw-r--r--arch/x86/boot/pmjump.S54
-rw-r--r--arch/x86/boot/printf.c307
-rw-r--r--arch/x86/boot/setup.ld54
-rw-r--r--arch/x86/boot/string.c52
-rw-r--r--arch/x86/boot/tools/.gitignore1
-rw-r--r--arch/x86/boot/tools/build.c168
-rw-r--r--arch/x86/boot/tty.c112
-rw-r--r--arch/x86/boot/version.c23
-rw-r--r--arch/x86/boot/vesa.h79
-rw-r--r--arch/x86/boot/video-bios.c125
-rw-r--r--arch/x86/boot/video-vesa.c292
-rw-r--r--arch/x86/boot/video-vga.c261
-rw-r--r--arch/x86/boot/video.c467
-rw-r--r--arch/x86/boot/video.h152
-rw-r--r--arch/x86/boot/voyager.c46
-rw-r--r--arch/x86/crypto/Makefile5
-rw-r--r--arch/x86/crypto/Makefile_3212
-rw-r--r--arch/x86/crypto/Makefile_6412
-rw-r--r--arch/x86/crypto/aes-i586-asm_32.S373
-rw-r--r--arch/x86/crypto/aes-x86_64-asm_64.S190
-rw-r--r--arch/x86/crypto/aes_32.c515
-rw-r--r--arch/x86/crypto/aes_64.c336
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S335
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S324
-rw-r--r--arch/x86/crypto/twofish_32.c97
-rw-r--r--arch/x86/crypto/twofish_64.c97
-rw-r--r--arch/x86/ia32/Makefile35
-rw-r--r--arch/x86/ia32/audit.c42
-rw-r--r--arch/x86/ia32/fpu32.c183
-rw-r--r--arch/x86/ia32/ia32_aout.c528
-rw-r--r--arch/x86/ia32/ia32_binfmt.c320
-rw-r--r--arch/x86/ia32/ia32_signal.c617
-rw-r--r--arch/x86/ia32/ia32entry.S736
-rw-r--r--arch/x86/ia32/ipc32.c57
-rw-r--r--arch/x86/ia32/mmap32.c79
-rw-r--r--arch/x86/ia32/ptrace32.c404
-rw-r--r--arch/x86/ia32/sys_ia32.c889
-rw-r--r--arch/x86/ia32/syscall32.c83
-rw-r--r--arch/x86/ia32/syscall32_syscall.S17
-rw-r--r--arch/x86/ia32/tls32.c163
-rw-r--r--arch/x86/ia32/vsyscall-sigreturn.S143
-rw-r--r--arch/x86/ia32/vsyscall-syscall.S69
-rw-r--r--arch/x86/ia32/vsyscall-sysenter.S95
-rw-r--r--arch/x86/ia32/vsyscall.lds80
-rw-r--r--arch/x86/kernel/.gitignore1
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/Makefile_3286
-rw-r--r--arch/x86/kernel/Makefile_6454
-rw-r--r--arch/x86/kernel/acpi/Makefile5
-rw-r--r--arch/x86/kernel/acpi/Makefile_3210
-rw-r--r--arch/x86/kernel/acpi/Makefile_647
-rw-r--r--arch/x86/kernel/acpi/boot.c1326
-rw-r--r--arch/x86/kernel/acpi/cstate.c164
-rw-r--r--arch/x86/kernel/acpi/earlyquirk_32.c84
-rw-r--r--arch/x86/kernel/acpi/processor.c75
-rw-r--r--arch/x86/kernel/acpi/sleep_32.c110
-rw-r--r--arch/x86/kernel/acpi/sleep_64.c120
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S321
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S456
-rw-r--r--arch/x86/kernel/alternative.c450
-rw-r--r--arch/x86/kernel/aperture_64.c298
-rw-r--r--arch/x86/kernel/apic_32.c1566
-rw-r--r--arch/x86/kernel/apic_64.c1253
-rw-r--r--arch/x86/kernel/apm_32.c2403
-rw-r--r--arch/x86/kernel/asm-offsets.c5
-rw-r--r--arch/x86/kernel/asm-offsets_32.c147
-rw-r--r--arch/x86/kernel/asm-offsets_64.c85
-rw-r--r--arch/x86/kernel/audit_64.c81
-rw-r--r--arch/x86/kernel/bootflag.c98
-rw-r--r--arch/x86/kernel/bugs_64.c24
-rw-r--r--arch/x86/kernel/cpu/Makefile20
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c50
-rw-r--r--arch/x86/kernel/cpu/amd.c337
-rw-r--r--arch/x86/kernel/cpu/bugs.c192
-rw-r--r--arch/x86/kernel/cpu/centaur.c471
-rw-r--r--arch/x86/kernel/cpu/common.c733
-rw-r--r--arch/x86/kernel/cpu/cpu.h28
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig250
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile16
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c799
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c441
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c334
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c309
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c495
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c1024
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h353
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c325
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c316
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c256
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c703
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c1363
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h232
-rw-r--r--arch/x86/kernel/cpu/cpufreq/sc520_freq.c191
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c634
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c440
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c444
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h49
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c424
-rw-r--r--arch/x86/kernel/cpu/cyrix.c463
-rw-r--r--arch/x86/kernel/cpu/intel.c333
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c806
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c102
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c90
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.h14
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c91
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c253
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c53
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c119
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c186
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c36
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile3
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c121
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c224
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c380
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c509
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c439
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c768
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h98
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c79
-rw-r--r--arch/x86/kernel/cpu/nexgen.c60
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c713
-rw-r--r--arch/x86/kernel/cpu/proc.c192
-rw-r--r--arch/x86/kernel/cpu/transmeta.c116
-rw-r--r--arch/x86/kernel/cpu/umc.c26
-rw-r--r--arch/x86/kernel/cpufreq/Kconfig108
-rw-r--r--arch/x86/kernel/cpuid.c242
-rw-r--r--arch/x86/kernel/crash_32.c137
-rw-r--r--arch/x86/kernel/crash_64.c135
-rw-r--r--arch/x86/kernel/crash_dump_32.c74
-rw-r--r--arch/x86/kernel/crash_dump_64.c47
-rw-r--r--arch/x86/kernel/doublefault_32.c70
-rw-r--r--arch/x86/kernel/e820_32.c944
-rw-r--r--arch/x86/kernel/e820_64.c725
-rw-r--r--arch/x86/kernel/early-quirks_64.c127
-rw-r--r--arch/x86/kernel/early_printk.c259
-rw-r--r--arch/x86/kernel/efi_32.c712
-rw-r--r--arch/x86/kernel/efi_stub_32.S122
-rw-r--r--arch/x86/kernel/entry_32.S1112
-rw-r--r--arch/x86/kernel/entry_64.S1172
-rw-r--r--arch/x86/kernel/genapic_64.c66
-rw-r--r--arch/x86/kernel/genapic_flat_64.c194
-rw-r--r--arch/x86/kernel/geode_32.c155
-rw-r--r--arch/x86/kernel/head64.c86
-rw-r--r--arch/x86/kernel/head_32.S578
-rw-r--r--arch/x86/kernel/head_64.S416
-rw-r--r--arch/x86/kernel/hpet_32.c553
-rw-r--r--arch/x86/kernel/hpet_64.c493
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c30
-rw-r--r--arch/x86/kernel/i387_32.c546
-rw-r--r--arch/x86/kernel/i387_64.c151
-rw-r--r--arch/x86/kernel/i8237.c72
-rw-r--r--arch/x86/kernel/i8253_32.c206
-rw-r--r--arch/x86/kernel/i8259_32.c420
-rw-r--r--arch/x86/kernel/i8259_64.c544
-rw-r--r--arch/x86/kernel/init_task_32.c46
-rw-r--r--arch/x86/kernel/init_task_64.c54
-rw-r--r--arch/x86/kernel/io_apic_32.c2847
-rw-r--r--arch/x86/kernel/io_apic_64.c2202
-rw-r--r--arch/x86/kernel/ioport_32.c153
-rw-r--r--arch/x86/kernel/ioport_64.c119
-rw-r--r--arch/x86/kernel/irq_32.c341
-rw-r--r--arch/x86/kernel/irq_64.c212
-rw-r--r--arch/x86/kernel/k8.c123
-rw-r--r--arch/x86/kernel/kprobes_32.c751
-rw-r--r--arch/x86/kernel/kprobes_64.c749
-rw-r--r--arch/x86/kernel/ldt_32.c250
-rw-r--r--arch/x86/kernel/ldt_64.c252
-rw-r--r--arch/x86/kernel/machine_kexec_32.c171
-rw-r--r--arch/x86/kernel/machine_kexec_64.c259
-rw-r--r--arch/x86/kernel/mca_32.c470
-rw-r--r--arch/x86/kernel/mce_64.c875
-rw-r--r--arch/x86/kernel/mce_amd_64.c689
-rw-r--r--arch/x86/kernel/mce_intel_64.c89
-rw-r--r--arch/x86/kernel/microcode.c850
-rw-r--r--arch/x86/kernel/module_32.c152
-rw-r--r--arch/x86/kernel/module_64.c185
-rw-r--r--arch/x86/kernel/mpparse_32.c1132
-rw-r--r--arch/x86/kernel/mpparse_64.c852
-rw-r--r--arch/x86/kernel/msr.c224
-rw-r--r--arch/x86/kernel/nmi_32.c468
-rw-r--r--arch/x86/kernel/nmi_64.c483
-rw-r--r--arch/x86/kernel/numaq_32.c89
-rw-r--r--arch/x86/kernel/paravirt_32.c392
-rw-r--r--arch/x86/kernel/pci-calgary_64.c1578
-rw-r--r--arch/x86/kernel/pci-dma_32.c177
-rw-r--r--arch/x86/kernel/pci-dma_64.c346
-rw-r--r--arch/x86/kernel/pci-gart_64.c740
-rw-r--r--arch/x86/kernel/pci-nommu_64.c97
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c44
-rw-r--r--arch/x86/kernel/pcspeaker.c20
-rw-r--r--arch/x86/kernel/pmtimer_64.c69
-rw-r--r--arch/x86/kernel/process_32.c951
-rw-r--r--arch/x86/kernel/process_64.c903
-rw-r--r--arch/x86/kernel/ptrace_32.c723
-rw-r--r--arch/x86/kernel/ptrace_64.c627
-rw-r--r--arch/x86/kernel/quirks.c49
-rw-r--r--arch/x86/kernel/reboot_32.c413
-rw-r--r--arch/x86/kernel/reboot_64.c171
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c68
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S252
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S276
-rw-r--r--arch/x86/kernel/scx200_32.c131
-rw-r--r--arch/x86/kernel/setup64.c289
-rw-r--r--arch/x86/kernel/setup_32.c653
-rw-r--r--arch/x86/kernel/setup_64.c1117
-rw-r--r--arch/x86/kernel/sigframe_32.h21
-rw-r--r--arch/x86/kernel/signal_32.c667
-rw-r--r--arch/x86/kernel/signal_64.c495
-rw-r--r--arch/x86/kernel/smp_32.c707
-rw-r--r--arch/x86/kernel/smp_64.c523
-rw-r--r--arch/x86/kernel/smpboot_32.c1322
-rw-r--r--arch/x86/kernel/smpboot_64.c1085
-rw-r--r--arch/x86/kernel/smpcommon_32.c81
-rw-r--r--arch/x86/kernel/srat_32.c360
-rw-r--r--arch/x86/kernel/stacktrace.c54
-rw-r--r--arch/x86/kernel/summit_32.c180
-rw-r--r--arch/x86/kernel/suspend_64.c239
-rw-r--r--arch/x86/kernel/suspend_asm_64.S110
-rw-r--r--arch/x86/kernel/sys_i386_32.c265
-rw-r--r--arch/x86/kernel/sys_x86_64.c159
-rw-r--r--arch/x86/kernel/syscall_64.c26
-rw-r--r--arch/x86/kernel/syscall_table_32.S326
-rw-r--r--arch/x86/kernel/sysenter_32.c348
-rw-r--r--arch/x86/kernel/tce_64.c189
-rw-r--r--arch/x86/kernel/time_32.c236
-rw-r--r--arch/x86/kernel/time_64.c447
-rw-r--r--arch/x86/kernel/topology.c77
-rw-r--r--arch/x86/kernel/trampoline_32.S85
-rw-r--r--arch/x86/kernel/trampoline_64.S166
-rw-r--r--arch/x86/kernel/traps_32.c1250
-rw-r--r--arch/x86/kernel/traps_64.c1138
-rw-r--r--arch/x86/kernel/tsc_32.c413
-rw-r--r--arch/x86/kernel/tsc_64.c207
-rw-r--r--arch/x86/kernel/tsc_sync.c187
-rw-r--r--arch/x86/kernel/verify_cpu_64.S105
-rw-r--r--arch/x86/kernel/vm86_32.c843
-rw-r--r--arch/x86/kernel/vmi_32.c981
-rw-r--r--arch/x86/kernel/vmiclock_32.c320
-rw-r--r--arch/x86/kernel/vmlinux.lds.S5
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S213
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S235
-rw-r--r--arch/x86/kernel/vsmp_64.c49
-rw-r--r--arch/x86/kernel/vsyscall-int80_32.S53
-rw-r--r--arch/x86/kernel/vsyscall-note_32.S45
-rw-r--r--arch/x86/kernel/vsyscall-sigreturn_32.S143
-rw-r--r--arch/x86/kernel/vsyscall-sysenter_32.S122
-rw-r--r--arch/x86/kernel/vsyscall_32.S15
-rw-r--r--arch/x86/kernel/vsyscall_32.lds.S67
-rw-r--r--arch/x86/kernel/vsyscall_64.c349
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c62
-rw-r--r--arch/x86/lib/Makefile5
-rw-r--r--arch/x86/lib/Makefile_3211
-rw-r--r--arch/x86/lib/Makefile_6413
-rw-r--r--arch/x86/lib/bitops_32.c70
-rw-r--r--arch/x86/lib/bitops_64.c175
-rw-r--r--arch/x86/lib/bitstr_64.c28
-rw-r--r--arch/x86/lib/checksum_32.S546
-rw-r--r--arch/x86/lib/clear_page_64.S59
-rw-r--r--arch/x86/lib/copy_page_64.S119
-rw-r--r--arch/x86/lib/copy_user_64.S354
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S217
-rw-r--r--arch/x86/lib/csum-copy_64.S249
-rw-r--r--arch/x86/lib/csum-partial_64.c150
-rw-r--r--arch/x86/lib/csum-wrappers_64.c135
-rw-r--r--arch/x86/lib/delay_32.c103
-rw-r--r--arch/x86/lib/delay_64.c57
-rw-r--r--arch/x86/lib/getuser_32.S78
-rw-r--r--arch/x86/lib/getuser_64.S109
-rw-r--r--arch/x86/lib/io_64.c23
-rw-r--r--arch/x86/lib/iomap_copy_64.S30
-rw-r--r--arch/x86/lib/memcpy_32.c43
-rw-r--r--arch/x86/lib/memcpy_64.S131
-rw-r--r--arch/x86/lib/memmove_64.c21
-rw-r--r--arch/x86/lib/memset_64.S133
-rw-r--r--arch/x86/lib/mmx_32.c403
-rw-r--r--arch/x86/lib/msr-on-cpu.c119
-rw-r--r--arch/x86/lib/putuser_32.S98
-rw-r--r--arch/x86/lib/putuser_64.S106
-rw-r--r--arch/x86/lib/rwlock_64.S38
-rw-r--r--arch/x86/lib/semaphore_32.S219
-rw-r--r--arch/x86/lib/string_32.c257
-rw-r--r--arch/x86/lib/strstr_32.c31
-rw-r--r--arch/x86/lib/thunk_64.S67
-rw-r--r--arch/x86/lib/usercopy_32.c882
-rw-r--r--arch/x86/lib/usercopy_64.c166
-rw-r--r--arch/x86/mach-default/Makefile5
-rw-r--r--arch/x86/mach-default/setup.c180
-rw-r--r--arch/x86/mach-es7000/Makefile6
-rw-r--r--arch/x86/mach-es7000/es7000.h114
-rw-r--r--arch/x86/mach-es7000/es7000plat.c327
-rw-r--r--arch/x86/mach-generic/Makefile8
-rw-r--r--arch/x86/mach-generic/bigsmp.c57
-rw-r--r--arch/x86/mach-generic/default.c26
-rw-r--r--arch/x86/mach-generic/es7000.c69
-rw-r--r--arch/x86/mach-generic/probe.c125
-rw-r--r--arch/x86/mach-generic/summit.c27
-rw-r--r--arch/x86/mach-visws/Makefile8
-rw-r--r--arch/x86/mach-visws/mpparse.c101
-rw-r--r--arch/x86/mach-visws/reboot.c55
-rw-r--r--arch/x86/mach-visws/setup.c183
-rw-r--r--arch/x86/mach-visws/traps.c68
-rw-r--r--arch/x86/mach-visws/visws_apic.c299
-rw-r--r--arch/x86/mach-voyager/Makefile8
-rw-r--r--arch/x86/mach-voyager/setup.c125
-rw-r--r--arch/x86/mach-voyager/voyager_basic.c331
-rw-r--r--arch/x86/mach-voyager/voyager_cat.c1180
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c1952
-rw-r--r--arch/x86/mach-voyager/voyager_thread.c134
-rw-r--r--arch/x86/math-emu/Makefile30
-rw-r--r--arch/x86/math-emu/README427
-rw-r--r--arch/x86/math-emu/control_w.h45
-rw-r--r--arch/x86/math-emu/div_Xsig.S365
-rw-r--r--arch/x86/math-emu/div_small.S47
-rw-r--r--arch/x86/math-emu/errors.c739
-rw-r--r--arch/x86/math-emu/exception.h53
-rw-r--r--arch/x86/math-emu/fpu_arith.c174
-rw-r--r--arch/x86/math-emu/fpu_asm.h32
-rw-r--r--arch/x86/math-emu/fpu_aux.c204
-rw-r--r--arch/x86/math-emu/fpu_emu.h218
-rw-r--r--arch/x86/math-emu/fpu_entry.c761
-rw-r--r--arch/x86/math-emu/fpu_etc.c143
-rw-r--r--arch/x86/math-emu/fpu_proto.h140
-rw-r--r--arch/x86/math-emu/fpu_system.h90
-rw-r--r--arch/x86/math-emu/fpu_tags.c127
-rw-r--r--arch/x86/math-emu/fpu_trig.c1845
-rw-r--r--arch/x86/math-emu/get_address.c438
-rw-r--r--arch/x86/math-emu/load_store.c272
-rw-r--r--arch/x86/math-emu/mul_Xsig.S176
-rw-r--r--arch/x86/math-emu/poly.h121
-rw-r--r--arch/x86/math-emu/poly_2xm1.c156
-rw-r--r--arch/x86/math-emu/poly_atan.c229
-rw-r--r--arch/x86/math-emu/poly_l2.c272
-rw-r--r--arch/x86/math-emu/poly_sin.c397
-rw-r--r--arch/x86/math-emu/poly_tan.c222
-rw-r--r--arch/x86/math-emu/polynom_Xsig.S135
-rw-r--r--arch/x86/math-emu/reg_add_sub.c374
-rw-r--r--arch/x86/math-emu/reg_compare.c381
-rw-r--r--arch/x86/math-emu/reg_constant.c120
-rw-r--r--arch/x86/math-emu/reg_constant.h25
-rw-r--r--arch/x86/math-emu/reg_convert.c53
-rw-r--r--arch/x86/math-emu/reg_divide.c207
-rw-r--r--arch/x86/math-emu/reg_ld_str.c1375
-rw-r--r--arch/x86/math-emu/reg_mul.c132
-rw-r--r--arch/x86/math-emu/reg_norm.S147
-rw-r--r--arch/x86/math-emu/reg_round.S708
-rw-r--r--arch/x86/math-emu/reg_u_add.S167
-rw-r--r--arch/x86/math-emu/reg_u_div.S471
-rw-r--r--arch/x86/math-emu/reg_u_mul.S148
-rw-r--r--arch/x86/math-emu/reg_u_sub.S272
-rw-r--r--arch/x86/math-emu/round_Xsig.S141
-rw-r--r--arch/x86/math-emu/shr_Xsig.S87
-rw-r--r--arch/x86/math-emu/status_w.h67
-rw-r--r--arch/x86/math-emu/version.h12
-rw-r--r--arch/x86/math-emu/wm_shrx.S204
-rw-r--r--arch/x86/math-emu/wm_sqrt.S470
-rw-r--r--arch/x86/mm/Makefile5
-rw-r--r--arch/x86/mm/Makefile_3210
-rw-r--r--arch/x86/mm/Makefile_6410
-rw-r--r--arch/x86/mm/boot_ioremap_32.c100
-rw-r--r--arch/x86/mm/discontig_32.c431
-rw-r--r--arch/x86/mm/extable_32.c35
-rw-r--r--arch/x86/mm/extable_64.c34
-rw-r--r--arch/x86/mm/fault_32.c657
-rw-r--r--arch/x86/mm/fault_64.c636
-rw-r--r--arch/x86/mm/highmem_32.c113
-rw-r--r--arch/x86/mm/hugetlbpage.c391
-rw-r--r--arch/x86/mm/init_32.c858
-rw-r--r--arch/x86/mm/init_64.c750
-rw-r--r--arch/x86/mm/ioremap_32.c274
-rw-r--r--arch/x86/mm/ioremap_64.c210
-rw-r--r--arch/x86/mm/k8topology_64.c182
-rw-r--r--arch/x86/mm/mmap_32.c77
-rw-r--r--arch/x86/mm/mmap_64.c29
-rw-r--r--arch/x86/mm/numa_64.c648
-rw-r--r--arch/x86/mm/pageattr_32.c278
-rw-r--r--arch/x86/mm/pageattr_64.c249
-rw-r--r--arch/x86/mm/pgtable_32.c373
-rw-r--r--arch/x86/mm/srat_64.c566
-rw-r--r--arch/x86/oprofile/Kconfig17
-rw-r--r--arch/x86/oprofile/Makefile12
-rw-r--r--arch/x86/oprofile/backtrace.c127
-rw-r--r--arch/x86/oprofile/init.c48
-rw-r--r--arch/x86/oprofile/nmi_int.c477
-rw-r--r--arch/x86/oprofile/nmi_timer_int.c69
-rw-r--r--arch/x86/oprofile/op_counter.h29
-rw-r--r--arch/x86/oprofile/op_model_athlon.c180
-rw-r--r--arch/x86/oprofile/op_model_p4.c722
-rw-r--r--arch/x86/oprofile/op_model_ppro.c192
-rw-r--r--arch/x86/oprofile/op_x86_model.h51
-rw-r--r--arch/x86/pci/Makefile5
-rw-r--r--arch/x86/pci/Makefile_3214
-rw-r--r--arch/x86/pci/Makefile_6417
-rw-r--r--arch/x86/pci/acpi.c90
-rw-r--r--arch/x86/pci/common.c480
-rw-r--r--arch/x86/pci/direct.c302
-rw-r--r--arch/x86/pci/early.c59
-rw-r--r--arch/x86/pci/fixup.c446
-rw-r--r--arch/x86/pci/i386.c315
-rw-r--r--arch/x86/pci/init.c37
-rw-r--r--arch/x86/pci/irq.c1173
-rw-r--r--arch/x86/pci/k8-bus_64.c83
-rw-r--r--arch/x86/pci/legacy.c56
-rw-r--r--arch/x86/pci/mmconfig-shared.c315
-rw-r--r--arch/x86/pci/mmconfig_32.c148
-rw-r--r--arch/x86/pci/mmconfig_64.c157
-rw-r--r--arch/x86/pci/numa.c135
-rw-r--r--arch/x86/pci/pcbios.c492
-rw-r--r--arch/x86/pci/pci.h149
-rw-r--r--arch/x86/pci/visws.c111
-rw-r--r--arch/x86/power/Makefile2
-rw-r--r--arch/x86/power/cpu.c133
-rw-r--r--arch/x86/power/suspend.c172
-rw-r--r--arch/x86/power/swsusp.S78
-rw-r--r--arch/x86/vdso/.gitignore1
-rw-r--r--arch/x86/vdso/Makefile49
-rw-r--r--arch/x86/vdso/vclock_gettime.c121
-rw-r--r--arch/x86/vdso/vdso-note.S12
-rw-r--r--arch/x86/vdso/vdso-start.S2
-rw-r--r--arch/x86/vdso/vdso.S2
-rw-r--r--arch/x86/vdso/vdso.lds.S77
-rw-r--r--arch/x86/vdso/vextern.h16
-rw-r--r--arch/x86/vdso/vgetcpu.c50
-rw-r--r--arch/x86/vdso/vma.c140
-rw-r--r--arch/x86/vdso/voffset.h1
-rw-r--r--arch/x86/vdso/vvar.c12
-rw-r--r--arch/x86/video/Makefile1
-rw-r--r--arch/x86/video/fbdev.c32
-rw-r--r--arch/x86/xen/Kconfig11
-rw-r--r--arch/x86/xen/Makefile4
-rw-r--r--arch/x86/xen/enlighten.c1146
-rw-r--r--arch/x86/xen/events.c591
-rw-r--r--arch/x86/xen/features.c29
-rw-r--r--arch/x86/xen/manage.c143
-rw-r--r--arch/x86/xen/mmu.c567
-rw-r--r--arch/x86/xen/mmu.h60
-rw-r--r--arch/x86/xen/multicalls.c90
-rw-r--r--arch/x86/xen/multicalls.h45
-rw-r--r--arch/x86/xen/setup.c111
-rw-r--r--arch/x86/xen/smp.c404
-rw-r--r--arch/x86/xen/time.c593
-rw-r--r--arch/x86/xen/vdso.h4
-rw-r--r--arch/x86/xen/xen-asm.S291
-rw-r--r--arch/x86/xen/xen-head.S38
-rw-r--r--arch/x86/xen/xen-ops.h71
478 files changed, 133441 insertions, 0 deletions
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
new file mode 100644
index 000000000000..18465143cfa2
--- /dev/null
+++ b/arch/x86/boot/.gitignore
@@ -0,0 +1,5 @@
1bootsect
2bzImage
3setup
4setup.bin
5setup.elf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
new file mode 100644
index 000000000000..cb1035f2b7e9
--- /dev/null
+++ b/arch/x86/boot/Makefile
@@ -0,0 +1,171 @@
1#
2# arch/x86/boot/Makefile
3#
4# This file is subject to the terms and conditions of the GNU General Public
5# License. See the file "COPYING" in the main directory of this archive
6# for more details.
7#
8# Copyright (C) 1994 by Linus Torvalds
9#
10
11# ROOT_DEV specifies the default root-device when making the image.
12# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
13# the default of FLOPPY is used by 'build'.
14
15ROOT_DEV := CURRENT
16
17# If you want to preset the SVGA mode, uncomment the next line and
18# set SVGA_MODE to whatever number you want.
19# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
20# The number is the same as you would ordinarily press at bootup.
21
22SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
23
24# If you want the RAM disk device, define this to be the size in blocks.
25
26#RAMDISK := -DRAMDISK=512
27
28targets := vmlinux.bin setup.bin setup.elf zImage bzImage
29subdir- := compressed
30
31setup-y += a20.o apm.o cmdline.o copy.o cpu.o cpucheck.o edd.o
32setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
33setup-y += printf.o string.o tty.o video.o version.o voyager.o
34
35# The link order of the video-*.o modules can matter. In particular,
36# video-vga.o *must* be listed first, followed by video-vesa.o.
37# Hardware-specific drivers should follow in the order they should be
38# probed, and video-bios.o should typically be last.
39setup-y += video-vga.o
40setup-y += video-vesa.o
41setup-y += video-bios.o
42targets += $(setup-y)
43hostprogs-y := tools/build
44
45HOSTCFLAGS_build.o := $(LINUXINCLUDE)
46
47# ---------------------------------------------------------------------------
48
49# How to compile the 16-bit code. Note we always compile for -march=i386;
50# that way we can complain to the user if the CPU is insufficient.
51cflags-i386 :=
52cflags-x86_64 := -m32
53CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
54 $(cflags-$(ARCH)) \
55 -Wall -Wstrict-prototypes \
56 -march=i386 -mregparm=3 \
57 -include $(srctree)/$(src)/code16gcc.h \
58 -fno-strict-aliasing -fomit-frame-pointer \
59 $(call cc-option, -ffreestanding) \
60 $(call cc-option, -fno-toplevel-reorder,\
61 $(call cc-option, -fno-unit-at-a-time)) \
62 $(call cc-option, -fno-stack-protector) \
63 $(call cc-option, -mpreferred-stack-boundary=2)
64AFLAGS := $(CFLAGS) -D__ASSEMBLY__
65
66$(obj)/zImage: IMAGE_OFFSET := 0x1000
67$(obj)/zImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK)
68$(obj)/bzImage: IMAGE_OFFSET := 0x100000
69$(obj)/bzImage: EXTRA_CFLAGS := -D__BIG_KERNEL__
70$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
71$(obj)/bzImage: BUILDFLAGS := -b
72
73quiet_cmd_image = BUILD $@
74cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \
75 $(obj)/vmlinux.bin $(ROOT_DEV) > $@
76
77$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \
78 $(obj)/vmlinux.bin $(obj)/tools/build FORCE
79 $(call if_changed,image)
80 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
81
82$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
83 $(call if_changed,objcopy)
84
85SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
86
87LDFLAGS_setup.elf := -T
88$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
89 $(call if_changed,ld)
90
91OBJCOPYFLAGS_setup.bin := -O binary
92
93$(obj)/setup.bin: $(obj)/setup.elf FORCE
94 $(call if_changed,objcopy)
95
96$(obj)/compressed/vmlinux: FORCE
97 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
98
99# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
100FDARGS =
101# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel
102FDINITRD =
103
104image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
105
106$(obj)/mtools.conf: $(src)/mtools.conf.in
107 sed -e 's|@OBJ@|$(obj)|g' < $< > $@
108
109# This requires write access to /dev/fd0
110zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
111 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
112 syslinux /dev/fd0 ; sync
113 echo '$(image_cmdline)' | \
114 MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg
115 if [ -f '$(FDINITRD)' ] ; then \
116 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
117 fi
118 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync
119
120# These require being root or having syslinux 2.02 or higher installed
121fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
122 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
123 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
124 syslinux $(obj)/fdimage ; sync
125 echo '$(image_cmdline)' | \
126 MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg
127 if [ -f '$(FDINITRD)' ] ; then \
128 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
129 fi
130 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync
131
132fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
133 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
134 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
135 syslinux $(obj)/fdimage ; sync
136 echo '$(image_cmdline)' | \
137 MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg
138 if [ -f '$(FDINITRD)' ] ; then \
139 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
140 fi
141 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync
142
143isoimage: $(BOOTIMAGE)
144 -rm -rf $(obj)/isoimage
145 mkdir $(obj)/isoimage
146 for i in lib lib64 share end ; do \
147 if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \
148 cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \
149 break ; \
150 fi ; \
151 if [ $$i = end ] ; then exit 1 ; fi ; \
152 done
153 cp $(BOOTIMAGE) $(obj)/isoimage/linux
154 echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
155 if [ -f '$(FDINITRD)' ] ; then \
156 cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
157 fi
158 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
159 -no-emul-boot -boot-load-size 4 -boot-info-table \
160 $(obj)/isoimage
161 rm -rf $(obj)/isoimage
162
163zlilo: $(BOOTIMAGE)
164 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
165 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
166 cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz
167 cp System.map $(INSTALL_PATH)/
168 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
169
170install:
171 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
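Usage note (not part of the patch): the FDARGS and FDINITRD hooks above are meant to be set on the make command line, for example "make fdimage FDARGS='console=ttyS0,115200' FDINITRD=/boot/initrd.img" to build a bootable 1.44 MB floppy image with that command line and initrd; the arguments and initrd path here are purely illustrative, and "make isoimage" works the same way for a CD image.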
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
new file mode 100644
index 000000000000..31348d054fca
--- /dev/null
+++ b/arch/x86/boot/a20.c
@@ -0,0 +1,161 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/a20.c
13 *
14 * Enable A20 gate (return -1 on failure)
15 */
16
17#include "boot.h"
18
19#define MAX_8042_LOOPS 100000
20
21static int empty_8042(void)
22{
23 u8 status;
24 int loops = MAX_8042_LOOPS;
25
26 while (loops--) {
27 io_delay();
28
29 status = inb(0x64);
30 if (status & 1) {
31 /* Read and discard input data */
32 io_delay();
33 (void)inb(0x60);
34 } else if (!(status & 2)) {
35 /* Buffers empty, finished! */
36 return 0;
37 }
38 }
39
40 return -1;
41}
42
43/* Returns nonzero if the A20 line is enabled. The memory address
44 used as a test is the int $0x80 vector, which should be safe. */
45
46#define A20_TEST_ADDR (4*0x80)
47#define A20_TEST_SHORT 32
48#define A20_TEST_LONG 2097152 /* 2^21 */
49
50static int a20_test(int loops)
51{
52 int ok = 0;
53 int saved, ctr;
54
55 set_fs(0x0000);
56 set_gs(0xffff);
57
58 saved = ctr = rdfs32(A20_TEST_ADDR);
59
60 while (loops--) {
61 wrfs32(++ctr, A20_TEST_ADDR);
62 io_delay(); /* Serialize and make delay constant */
63 ok = rdgs32(A20_TEST_ADDR+0x10) ^ ctr;
64 if (ok)
65 break;
66 }
67
68 wrfs32(saved, A20_TEST_ADDR);
69 return ok;
70}
71
72/* Quick test to see if A20 is already enabled */
73static int a20_test_short(void)
74{
75 return a20_test(A20_TEST_SHORT);
76}
77
78/* Longer test that actually waits for A20 to come on line; this
79 is useful when dealing with the KBC or other slow external circuitry. */
80static int a20_test_long(void)
81{
82 return a20_test(A20_TEST_LONG);
83}
84
85static void enable_a20_bios(void)
86{
87 asm volatile("pushfl; int $0x15; popfl"
88 : : "a" ((u16)0x2401));
89}
90
91static void enable_a20_kbc(void)
92{
93 empty_8042();
94
95 outb(0xd1, 0x64); /* Command write */
96 empty_8042();
97
98 outb(0xdf, 0x60); /* A20 on */
99 empty_8042();
100}
101
102static void enable_a20_fast(void)
103{
104 u8 port_a;
105
106 port_a = inb(0x92); /* Configuration port A */
107 port_a |= 0x02; /* Enable A20 */
108 port_a &= ~0x01; /* Do not reset machine */
109 outb(port_a, 0x92);
110}
111
112/*
113 * Actual routine to enable A20; return 0 on ok, -1 on failure
114 */
115
116#define A20_ENABLE_LOOPS 255 /* Number of times to try */
117
118int enable_a20(void)
119{
120 int loops = A20_ENABLE_LOOPS;
121
122#if defined(CONFIG_X86_ELAN)
123 /* Elan croaks if we try to touch the KBC */
124 enable_a20_fast();
125 while (!a20_test_long())
126 ;
127 return 0;
128#elif defined(CONFIG_X86_VOYAGER)
129 /* On Voyager, a20_test() is unsafe? */
130 enable_a20_kbc();
131 return 0;
132#else
133 while (loops--) {
134 /* First, check to see if A20 is already enabled
135 (legacy free, etc.) */
136 if (a20_test_short())
137 return 0;
138
139 /* Next, try the BIOS (INT 0x15, AX=0x2401) */
140 enable_a20_bios();
141 if (a20_test_short())
142 return 0;
143
144 /* Try enabling A20 through the keyboard controller */
145 empty_8042();
146 if (a20_test_short())
147 return 0; /* BIOS worked, but with delayed reaction */
148
149 enable_a20_kbc();
150 if (a20_test_long())
151 return 0;
152
153 /* Finally, try enabling the "fast A20 gate" */
154 enable_a20_fast();
155 if (a20_test_long())
156 return 0;
157 }
158
159 return -1;
160#endif
161}
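The test above exploits real-mode address wraparound: with A20 masked, physical address bit 20 is forced to zero, so the byte read through %gs = 0xffff at offset A20_TEST_ADDR + 0x10 aliases the byte written through %fs = 0x0000 at A20_TEST_ADDR. A minimal stand-alone C sketch of that arithmetic (hosted code for illustration only, not part of the boot sources):

#include <stdint.h>
#include <stdio.h>

#define A20_TEST_ADDR (4 * 0x80)	/* int 0x80 vector, as in a20.c */

static uint32_t linear(uint16_t seg, uint16_t off)
{
	return ((uint32_t)seg << 4) + off;	/* real-mode segment:offset */
}

int main(void)
{
	uint32_t lo = linear(0x0000, A20_TEST_ADDR);		/* 0x000200 */
	uint32_t hi = linear(0xffff, A20_TEST_ADDR + 0x10);	/* 0x100200 */

	/* With A20 disabled, bit 20 is masked off and both hit the same byte. */
	printf("fs:%#x gs:%#x gs(A20 off):%#x\n", lo, hi, hi & ~(1u << 20));
	return 0;
}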
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
new file mode 100644
index 000000000000..eab50c55a3a5
--- /dev/null
+++ b/arch/x86/boot/apm.c
@@ -0,0 +1,98 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * Original APM BIOS checking by Stephen Rothwell, May 1994
7 * (sfr@canb.auug.org.au)
8 *
9 * This file is part of the Linux kernel, and is made available under
10 * the terms of the GNU General Public License version 2.
11 *
12 * ----------------------------------------------------------------------- */
13
14/*
15 * arch/i386/boot/apm.c
16 *
17 * Get APM BIOS information
18 */
19
20#include "boot.h"
21
22#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
23
24int query_apm_bios(void)
25{
26 u16 ax, bx, cx, dx, di;
27 u32 ebx, esi;
28 u8 err;
29
30 /* APM BIOS installation check */
31 ax = 0x5300;
32 bx = cx = 0;
33 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
34 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
35 : : "esi", "edi");
36
37 if (err)
38 return -1; /* No APM BIOS */
39
40 if (bx != 0x504d) /* "PM" signature */
41 return -1;
42
43 if (!(cx & 0x02)) /* 32 bits supported? */
44 return -1;
45
46 /* Disconnect first, just in case */
47 ax = 0x5304;
48 bx = 0;
49 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
50 : "+a" (ax), "+b" (bx)
51 : : "ecx", "edx", "esi", "edi");
52
53 /* Paranoia */
54 ebx = esi = 0;
55 cx = dx = di = 0;
56
57 /* 32-bit connect */
58 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6"
59 : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx),
60 "+S" (esi), "+D" (di), "=m" (err)
61 : "a" (0x5303));
62
63 boot_params.apm_bios_info.cseg = ax;
64 boot_params.apm_bios_info.offset = ebx;
65 boot_params.apm_bios_info.cseg_16 = cx;
66 boot_params.apm_bios_info.dseg = dx;
67 boot_params.apm_bios_info.cseg_len = (u16)esi;
68 boot_params.apm_bios_info.cseg_16_len = esi >> 16;
69 boot_params.apm_bios_info.dseg_len = di;
70
71 if (err)
72 return -1;
73
74 /* Redo the installation check as the 32-bit connect;
75 some BIOSes return different flags this way... */
76
77 ax = 0x5300;
78 bx = cx = 0;
79 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
80 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
81 : : "esi", "edi");
82
83 if (err || bx != 0x504d) {
84		/* Failure with 32-bit connect, try to disconnect and ignore */
85 ax = 0x5304;
86 bx = 0;
87 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
88 : "+a" (ax), "+b" (bx)
89 : : "ecx", "edx", "esi", "edi");
90 return -1;
91 }
92
93 boot_params.apm_bios_info.version = ax;
94 boot_params.apm_bios_info.flags = cx;
95 return 0;
96}
97
98#endif
diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h
new file mode 100644
index 000000000000..8dcc8dc7db88
--- /dev/null
+++ b/arch/x86/boot/bitops.h
@@ -0,0 +1,45 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/bitops.h
13 *
14 * Very simple bitops for the boot code.
15 */
16
17#ifndef BOOT_BITOPS_H
18#define BOOT_BITOPS_H
19#define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */
20
21static inline int constant_test_bit(int nr, const void *addr)
22{
23 const u32 *p = (const u32 *)addr;
24 return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
25}
26static inline int variable_test_bit(int nr, const void *addr)
27{
28 u8 v;
29 const u32 *p = (const u32 *)addr;
30
31 asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr));
32 return v;
33}
34
35#define test_bit(nr,addr) \
36(__builtin_constant_p(nr) ? \
37 constant_test_bit((nr),(addr)) : \
38 variable_test_bit((nr),(addr)))
39
40static inline void set_bit(int nr, void *addr)
41{
42 asm("btsl %1,%0" : "+m" (*(u32 *)addr) : "Ir" (nr));
43}
44
45#endif /* BOOT_BITOPS_H */
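A short usage sketch for these helpers; the err_flags array below is illustrative, while the real callers are the CPU-checking code operating on feature bitmaps:

/* Illustrative only: exercising test_bit()/set_bit() from bitops.h. */
static u32 err_flags[2];			/* 64 flag bits */

static void bitops_example(void)
{
	set_bit(37, err_flags);			/* sets bit 5 of err_flags[1] */

	if (test_bit(37, err_flags))		/* constant nr -> constant_test_bit() */
		puts("bit 37 is set\n");
}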
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
new file mode 100644
index 000000000000..20bab9431acb
--- /dev/null
+++ b/arch/x86/boot/boot.h
@@ -0,0 +1,296 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/boot.h
13 *
14 * Header file for the real-mode kernel code
15 */
16
17#ifndef BOOT_BOOT_H
18#define BOOT_BOOT_H
19
20#ifndef __ASSEMBLY__
21
22#include <stdarg.h>
23#include <linux/types.h>
24#include <linux/edd.h>
25#include <asm/boot.h>
26#include <asm/bootparam.h>
27
28/* Useful macros */
29#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
30
31extern struct setup_header hdr;
32extern struct boot_params boot_params;
33
34/* Basic port I/O */
35static inline void outb(u8 v, u16 port)
36{
37 asm volatile("outb %0,%1" : : "a" (v), "dN" (port));
38}
39static inline u8 inb(u16 port)
40{
41 u8 v;
42 asm volatile("inb %1,%0" : "=a" (v) : "dN" (port));
43 return v;
44}
45
46static inline void outw(u16 v, u16 port)
47{
48 asm volatile("outw %0,%1" : : "a" (v), "dN" (port));
49}
50static inline u16 inw(u16 port)
51{
52 u16 v;
53 asm volatile("inw %1,%0" : "=a" (v) : "dN" (port));
54 return v;
55}
56
57static inline void outl(u32 v, u16 port)
58{
59 asm volatile("outl %0,%1" : : "a" (v), "dN" (port));
60}
61static inline u32 inl(u32 port)
62{
63 u32 v;
64 asm volatile("inl %1,%0" : "=a" (v) : "dN" (port));
65 return v;
66}
67
68static inline void io_delay(void)
69{
70 const u16 DELAY_PORT = 0x80;
71 asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT));
72}
73
74/* These functions are used to reference data in other segments. */
75
76static inline u16 ds(void)
77{
78 u16 seg;
79 asm("movw %%ds,%0" : "=rm" (seg));
80 return seg;
81}
82
83static inline void set_fs(u16 seg)
84{
85 asm volatile("movw %0,%%fs" : : "rm" (seg));
86}
87static inline u16 fs(void)
88{
89 u16 seg;
90 asm volatile("movw %%fs,%0" : "=rm" (seg));
91 return seg;
92}
93
94static inline void set_gs(u16 seg)
95{
96 asm volatile("movw %0,%%gs" : : "rm" (seg));
97}
98static inline u16 gs(void)
99{
100 u16 seg;
101 asm volatile("movw %%gs,%0" : "=rm" (seg));
102 return seg;
103}
104
105typedef unsigned int addr_t;
106
107static inline u8 rdfs8(addr_t addr)
108{
109 u8 v;
110 asm volatile("movb %%fs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr));
111 return v;
112}
113static inline u16 rdfs16(addr_t addr)
114{
115 u16 v;
116 asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
117 return v;
118}
119static inline u32 rdfs32(addr_t addr)
120{
121 u32 v;
122 asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
123 return v;
124}
125
126static inline void wrfs8(u8 v, addr_t addr)
127{
128 asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "r" (v));
129}
130static inline void wrfs16(u16 v, addr_t addr)
131{
132 asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "r" (v));
133}
134static inline void wrfs32(u32 v, addr_t addr)
135{
136 asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "r" (v));
137}
138
139static inline u8 rdgs8(addr_t addr)
140{
141 u8 v;
142 asm volatile("movb %%gs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr));
143 return v;
144}
145static inline u16 rdgs16(addr_t addr)
146{
147 u16 v;
148 asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
149 return v;
150}
151static inline u32 rdgs32(addr_t addr)
152{
153 u32 v;
154 asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
155 return v;
156}
157
158static inline void wrgs8(u8 v, addr_t addr)
159{
160 asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "r" (v));
161}
162static inline void wrgs16(u16 v, addr_t addr)
163{
164 asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "r" (v));
165}
166static inline void wrgs32(u32 v, addr_t addr)
167{
168 asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "r" (v));
169}
170
171/* Note: these only return true/false, not a signed return value! */
172static inline int memcmp(const void *s1, const void *s2, size_t len)
173{
174 u8 diff;
175 asm("repe; cmpsb; setnz %0"
176 : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
177 return diff;
178}
179
180static inline int memcmp_fs(const void *s1, addr_t s2, size_t len)
181{
182 u8 diff;
183 asm volatile("fs; repe; cmpsb; setnz %0"
184 : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
185 return diff;
186}
187static inline int memcmp_gs(const void *s1, addr_t s2, size_t len)
188{
189 u8 diff;
190 asm volatile("gs; repe; cmpsb; setnz %0"
191 : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
192 return diff;
193}
194
195static inline int isdigit(int ch)
196{
197 return (ch >= '0') && (ch <= '9');
198}
199
200/* Heap -- available for dynamic lists. */
201#define STACK_SIZE 512 /* Minimum number of bytes for stack */
202
203extern char _end[];
204extern char *HEAP;
205extern char *heap_end;
206#define RESET_HEAP() ((void *)( HEAP = _end ))
207static inline char *__get_heap(size_t s, size_t a, size_t n)
208{
209 char *tmp;
210
211 HEAP = (char *)(((size_t)HEAP+(a-1)) & ~(a-1));
212 tmp = HEAP;
213 HEAP += s*n;
214 return tmp;
215}
216#define GET_HEAP(type, n) \
217 ((type *)__get_heap(sizeof(type),__alignof__(type),(n)))
218
219static inline int heap_free(void)
220{
221 return heap_end-HEAP;
222}
223
224/* copy.S */
225
226void copy_to_fs(addr_t dst, void *src, size_t len);
227void *copy_from_fs(void *dst, addr_t src, size_t len);
228void copy_to_gs(addr_t dst, void *src, size_t len);
229void *copy_from_gs(void *dst, addr_t src, size_t len);
230void *memcpy(void *dst, void *src, size_t len);
231void *memset(void *dst, int c, size_t len);
232
233#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
234#define memset(d,c,l) __builtin_memset(d,c,l)
235
236/* a20.c */
237int enable_a20(void);
238
239/* apm.c */
240int query_apm_bios(void);
241
242/* cmdline.c */
243int cmdline_find_option(const char *option, char *buffer, int bufsize);
244
245/* cpu.c, cpucheck.c */
246int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
247int validate_cpu(void);
248
249/* edd.c */
250void query_edd(void);
251
252/* header.S */
253void __attribute__((noreturn)) die(void);
254
255/* mca.c */
256int query_mca(void);
257
258/* memory.c */
259int detect_memory(void);
260
261/* pm.c */
262void __attribute__((noreturn)) go_to_protected_mode(void);
263
264/* pmjump.S */
265void __attribute__((noreturn))
266 protected_mode_jump(u32 entrypoint, u32 bootparams);
267
268/* printf.c */
269int sprintf(char *buf, const char *fmt, ...);
270int vsprintf(char *buf, const char *fmt, va_list args);
271int printf(const char *fmt, ...);
272
273/* string.c */
274int strcmp(const char *str1, const char *str2);
275size_t strnlen(const char *s, size_t maxlen);
276unsigned int atou(const char *s);
277
278/* tty.c */
279void puts(const char *);
280void putchar(int);
281int getchar(void);
282void kbd_flush(void);
283int getchar_timeout(void);
284
285/* video.c */
286void set_video(void);
287
288/* video-vesa.c */
289void vesa_store_edid(void);
290
291/* voyager.c */
292int query_voyager(void);
293
294#endif /* __ASSEMBLY__ */
295
296#endif /* BOOT_BOOT_H */
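The HEAP/GET_HEAP helpers above form a tiny bump allocator over the memory between _end and heap_end. A hedged usage sketch follows; the structure and count are made up for illustration, while the real users are the video mode code:

/* Illustrative use of the boot-time bump allocator from boot.h. */
struct mode_entry {
	u16 mode;
	u16 x, y;
};

static void heap_example(void)
{
	struct mode_entry *modes;

	RESET_HEAP();					/* HEAP = _end */
	if (heap_free() < (int)(8 * sizeof(struct mode_entry)))
		die();					/* not enough heap: give up */

	modes = GET_HEAP(struct mode_entry, 8);		/* 8 aligned entries */
	modes[0].mode = 3;				/* scratch storage */
}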
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
new file mode 100644
index 000000000000..34bb778c4357
--- /dev/null
+++ b/arch/x86/boot/cmdline.c
@@ -0,0 +1,97 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/cmdline.c
13 *
14 * Simple command-line parser for early boot.
15 */
16
17#include "boot.h"
18
19static inline int myisspace(u8 c)
20{
21 return c <= ' '; /* Close enough approximation */
22}
23
24/*
25 * Find a non-boolean option, that is, "option=argument". In accordance
26 * with standard Linux practice, if this option is repeated, this returns
27 * the last instance on the command line.
28 *
29 * Returns the length of the argument (regardless of whether it was
30 * truncated to fit in the buffer), or -1 if not found.
31 */
32int cmdline_find_option(const char *option, char *buffer, int bufsize)
33{
34 u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
35 addr_t cptr;
36 char c;
37 int len = -1;
38 const char *opptr = NULL;
39 char *bufptr = buffer;
40 enum {
41 st_wordstart, /* Start of word/after whitespace */
42 st_wordcmp, /* Comparing this word */
43 st_wordskip, /* Miscompare, skip */
44 st_bufcpy /* Copying this to buffer */
45 } state = st_wordstart;
46
47 if (!cmdline_ptr || cmdline_ptr >= 0x100000)
48 return -1; /* No command line, or inaccessible */
49
50 cptr = cmdline_ptr & 0xf;
51 set_fs(cmdline_ptr >> 4);
52
53 while (cptr < 0x10000 && (c = rdfs8(cptr++))) {
54 switch (state) {
55 case st_wordstart:
56 if (myisspace(c))
57 break;
58
59 /* else */
60 state = st_wordcmp;
61 opptr = option;
62 /* fall through */
63
64 case st_wordcmp:
65 if (c == '=' && !*opptr) {
66 len = 0;
67 bufptr = buffer;
68 state = st_bufcpy;
69 } else if (myisspace(c)) {
70 state = st_wordstart;
71 } else if (c != *opptr++) {
72 state = st_wordskip;
73 }
74 break;
75
76 case st_wordskip:
77 if (myisspace(c))
78 state = st_wordstart;
79 break;
80
81 case st_bufcpy:
82 if (myisspace(c)) {
83 state = st_wordstart;
84 } else {
85 if (len < bufsize-1)
86 *bufptr++ = c;
87 len++;
88 }
89 break;
90 }
91 }
92
93 if (bufsize)
94 *bufptr = '\0';
95
96 return len;
97}
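Callers see a simple interface on top of this: the 32-bit cmd_line_ptr is split into a real-mode segment (ptr >> 4) and offset (ptr & 0xf) so the line can be scanned through %fs one byte at a time. A usage sketch (the option name is only an example, though the video code does parse vga= this way):

/* Illustrative caller: look for "vga=<arg>" on the kernel command line. */
static void vga_option_example(void)
{
	char arg[32];
	int len = cmdline_find_option("vga", arg, sizeof arg);

	if (len > 0)			/* "vga=<arg>" found, arg copied */
		printf("vga argument: %s (len %d)\n", arg, len);
	else if (len == 0)		/* "vga=" present but empty */
		puts("empty vga= option\n");
	/* len < 0: option not on the command line */
}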
diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h
new file mode 100644
index 000000000000..d93e48010b61
--- /dev/null
+++ b/arch/x86/boot/code16gcc.h
@@ -0,0 +1,15 @@
1/*
2 * code16gcc.h
3 *
4 * This file is -include'd when compiling 16-bit C code.
5 * Note: this asm() needs to be emitted before gcc emits any code.
6 * Depending on gcc version, this requires -fno-unit-at-a-time or
7 * -fno-toplevel-reorder.
8 *
9 * Hopefully gcc will eventually have a real -m16 option so we can
10 * drop this hack long term.
11 */
12
13#ifndef __ASSEMBLY__
14asm(".code16gcc");
15#endif
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
new file mode 100644
index 000000000000..be0ed065249b
--- /dev/null
+++ b/arch/x86/boot/compressed/.gitignore
@@ -0,0 +1 @@
relocs
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
new file mode 100644
index 000000000000..52c1db854520
--- /dev/null
+++ b/arch/x86/boot/compressed/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/boot/compressed/Makefile_32
3else
4include ${srctree}/arch/x86/boot/compressed/Makefile_64
5endif
diff --git a/arch/x86/boot/compressed/Makefile_32 b/arch/x86/boot/compressed/Makefile_32
new file mode 100644
index 000000000000..22613c652d22
--- /dev/null
+++ b/arch/x86/boot/compressed/Makefile_32
@@ -0,0 +1,50 @@
1#
2# linux/arch/x86/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_32.o misc_32.o piggy.o \
8 vmlinux.bin.all vmlinux.relocs
9EXTRA_AFLAGS := -traditional
10
11LDFLAGS_vmlinux := -T
12hostprogs-y := relocs
13
14CFLAGS := -m32 -D__KERNEL__ $(LINUXINCLUDE) -O2 \
15 -fno-strict-aliasing -fPIC \
16 $(call cc-option,-ffreestanding) \
17 $(call cc-option,-fno-stack-protector)
18LDFLAGS := -m elf_i386
19
20$(obj)/vmlinux: $(src)/vmlinux_32.lds $(obj)/head_32.o $(obj)/misc_32.o $(obj)/piggy.o FORCE
21 $(call if_changed,ld)
22 @:
23
24$(obj)/vmlinux.bin: vmlinux FORCE
25 $(call if_changed,objcopy)
26
27quiet_cmd_relocs = RELOCS $@
28 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
29$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
30 $(call if_changed,relocs)
31
32vmlinux.bin.all-y := $(obj)/vmlinux.bin
33vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
34quiet_cmd_relocbin = BUILD $@
35 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
36$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
37 $(call if_changed,relocbin)
38
39ifdef CONFIG_RELOCATABLE
40$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
41 $(call if_changed,gzip)
42else
43$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
44 $(call if_changed,gzip)
45endif
46
47LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
48
49$(obj)/piggy.o: $(src)/vmlinux_32.scr $(obj)/vmlinux.bin.gz FORCE
50 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/Makefile_64 b/arch/x86/boot/compressed/Makefile_64
new file mode 100644
index 000000000000..dc6b3380cc45
--- /dev/null
+++ b/arch/x86/boot/compressed/Makefile_64
@@ -0,0 +1,30 @@
1#
2# linux/arch/x86/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_64.o misc_64.o piggy.o
8
9CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \
10 -fno-strict-aliasing -fPIC -mcmodel=small \
11 $(call cc-option, -ffreestanding) \
12 $(call cc-option, -fno-stack-protector)
13AFLAGS := $(CFLAGS) -D__ASSEMBLY__
14LDFLAGS := -m elf_x86_64
15
16LDFLAGS_vmlinux := -T
17$(obj)/vmlinux: $(src)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o $(obj)/piggy.o FORCE
18 $(call if_changed,ld)
19 @:
20
21$(obj)/vmlinux.bin: vmlinux FORCE
22 $(call if_changed,objcopy)
23
24$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
25 $(call if_changed,gzip)
26
27LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
28
29$(obj)/piggy.o: $(obj)/vmlinux_64.scr $(obj)/vmlinux.bin.gz FORCE
30 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
new file mode 100644
index 000000000000..f35ea2237522
--- /dev/null
+++ b/arch/x86/boot/compressed/head_32.S
@@ -0,0 +1,180 @@
1/*
2 * linux/boot/head.S
3 *
4 * Copyright (C) 1991, 1992, 1993 Linus Torvalds
5 */
6
7/*
8 * head.S contains the 32-bit startup code.
9 *
10 * NOTE!!! Startup happens at absolute address 0x00001000, which is also where
11 * the page directory will exist. The startup code will be overwritten by
12 * the page directory. [According to comments etc elsewhere on a compressed
13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
14 *
15 * Page 0 is deliberately kept safe, since System Management Mode code in
16 * laptops may need to access the BIOS data stored there. This is also
17 * useful for future device drivers that either access the BIOS via VM86
18 * mode.
19 */
20
21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */
24.text
25
26#include <linux/linkage.h>
27#include <asm/segment.h>
28#include <asm/page.h>
29#include <asm/boot.h>
30
31.section ".text.head","ax",@progbits
32 .globl startup_32
33
34startup_32:
35 cld
36 cli
37 movl $(__BOOT_DS),%eax
38 movl %eax,%ds
39 movl %eax,%es
40 movl %eax,%fs
41 movl %eax,%gs
42 movl %eax,%ss
43
44/* Calculate the delta between where we were compiled to run
45 * at and where we were actually loaded at. This can only be done
46 * with a short local call on x86. Nothing else will tell us what
47 * address we are running at. The reserved chunk of the real-mode
48 * data at 0x1e4 (defined as a scratch field) are used as the stack
49 * for this calculation. Only 4 bytes are needed.
50 */
51 leal (0x1e4+4)(%esi), %esp
52 call 1f
531: popl %ebp
54 subl $1b, %ebp
55
56/* %ebp contains the address we are loaded at by the boot loader and %ebx
57 * contains the address where we should move the kernel image temporarily
58 * for safe in-place decompression.
59 */
60
61#ifdef CONFIG_RELOCATABLE
62 movl %ebp, %ebx
63 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
64 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
65#else
66 movl $LOAD_PHYSICAL_ADDR, %ebx
67#endif
68
69 /* Replace the compressed data size with the uncompressed size */
70 subl input_len(%ebp), %ebx
71 movl output_len(%ebp), %eax
72 addl %eax, %ebx
73 /* Add 8 bytes for every 32K input block */
74 shrl $12, %eax
75 addl %eax, %ebx
76 /* Add 32K + 18 bytes of extra slack */
77 addl $(32768 + 18), %ebx
78 /* Align on a 4K boundary */
79 addl $4095, %ebx
80 andl $~4095, %ebx
81
82/* Copy the compressed kernel to the end of our buffer
83 * where decompression in place becomes safe.
84 */
85 pushl %esi
86 leal _end(%ebp), %esi
87 leal _end(%ebx), %edi
88 movl $(_end - startup_32), %ecx
89 std
90 rep
91 movsb
92 cld
93 popl %esi
94
95/* Compute the kernel start address.
96 */
97#ifdef CONFIG_RELOCATABLE
98 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
99 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
100#else
101 movl $LOAD_PHYSICAL_ADDR, %ebp
102#endif
103
104/*
105 * Jump to the relocated address.
106 */
107 leal relocated(%ebx), %eax
108 jmp *%eax
109.section ".text"
110relocated:
111
112/*
113 * Clear BSS
114 */
115 xorl %eax,%eax
116 leal _edata(%ebx),%edi
117 leal _end(%ebx), %ecx
118 subl %edi,%ecx
119 cld
120 rep
121 stosb
122
123/*
124 * Setup the stack for the decompressor
125 */
126 leal stack_end(%ebx), %esp
127
128/*
129 * Do the decompression, and jump to the new kernel..
130 */
131 movl output_len(%ebx), %eax
132 pushl %eax
133 pushl %ebp # output address
134 movl input_len(%ebx), %eax
135 pushl %eax # input_len
136 leal input_data(%ebx), %eax
137 pushl %eax # input_data
138 leal _end(%ebx), %eax
139 pushl %eax # end of the image as third argument
140 pushl %esi # real mode pointer as second arg
141 call decompress_kernel
142 addl $20, %esp
143 popl %ecx
144
145#ifdef CONFIG_RELOCATABLE
146/* Find the address of the relocations.
147 */
148 movl %ebp, %edi
149 addl %ecx, %edi
150
151/* Calculate the delta between where vmlinux was compiled to run
152 * and where it was actually loaded.
153 */
154 movl %ebp, %ebx
155 subl $LOAD_PHYSICAL_ADDR, %ebx
156 jz 2f /* Nothing to be done if loaded at compiled addr. */
157/*
158 * Process relocations.
159 */
160
1611: subl $4, %edi
162 movl 0(%edi), %ecx
163 testl %ecx, %ecx
164 jz 2f
165 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
166 jmp 1b
1672:
168#endif
169
170/*
171 * Jump to the decompressed kernel.
172 */
173 xorl %ebx,%ebx
174 jmp *%ebp
175
176.bss
177.balign 4
178stack:
179 .fill 4096, 1, 0
180stack_end:
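The buffer sizing near the top of startup_32 can be summarized in C. This is only a model of the arithmetic above (function and parameter names are descriptive, not from the source): the compressed image is copied up to an address high enough that the decompressor's worst-case output, bounded as the output size plus 8 bytes per 32 KiB block plus 32 KiB + 18 bytes of slack, cannot overrun data it has not yet read.

#include <stdint.h>

/* Model of the head_32.S buffer-size calculation. */
static uint32_t relocation_base(uint32_t load_target,	/* aligned kernel target */
				uint32_t input_len,	/* compressed size   */
				uint32_t output_len)	/* uncompressed size */
{
	uint32_t base = load_target - input_len;	/* subl input_len(%ebp), %ebx */

	base += output_len;				/* decompressed size        */
	base += output_len >> 12;			/* 8 bytes per 32 KiB block */
	base += 32768 + 18;				/* extra slack              */
	base = (base + 4095) & ~4095u;			/* align to 4 KiB           */
	return base;			/* compressed image is copied relative to this */
}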
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
new file mode 100644
index 000000000000..49467640751f
--- /dev/null
+++ b/arch/x86/boot/compressed/head_64.S
@@ -0,0 +1,311 @@
1/*
2 * linux/boot/head.S
3 *
4 * Copyright (C) 1991, 1992, 1993 Linus Torvalds
5 */
6
7/*
8 * head.S contains the 32-bit startup code.
9 *
10 * NOTE!!! Startup happens at absolute address 0x00001000, which is also where
11 * the page directory will exist. The startup code will be overwritten by
12 * the page directory. [According to comments etc elsewhere on a compressed
13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
14 *
15 * Page 0 is deliberately kept safe, since System Management Mode code in
16 * laptops may need to access the BIOS data stored there. This is also
17 * useful for future device drivers that access the BIOS via VM86
18 * mode.
19 */
20
21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */
24.code32
25.text
26
27#include <linux/linkage.h>
28#include <asm/segment.h>
29#include <asm/pgtable.h>
30#include <asm/page.h>
31#include <asm/msr.h>
32
33.section ".text.head"
34 .code32
35 .globl startup_32
36
37startup_32:
38 cld
39 cli
40 movl $(__KERNEL_DS), %eax
41 movl %eax, %ds
42 movl %eax, %es
43 movl %eax, %ss
44
45/* Calculate the delta between where we were compiled to run
46 * at and where we were actually loaded at. This can only be done
47 * with a short local call on x86. Nothing else will tell us what
48 * address we are running at. The reserved chunk of the real-mode
49 * data at 0x1e4 (defined as a scratch field) are used as the stack
50 * for this calculation. Only 4 bytes are needed.
51 */
52 leal (0x1e4+4)(%esi), %esp
53 call 1f
541: popl %ebp
55 subl $1b, %ebp
56
57/* Set up a stack and make sure the CPU supports long mode. */
58 movl $user_stack_end, %eax
59 addl %ebp, %eax
60 movl %eax, %esp
61
62 call verify_cpu
63 testl %eax, %eax
64 jnz no_longmode
65
66/* Compute the delta between where we were compiled to run at
67 * and where the code will actually run at.
68 */
69/* %ebp contains the address we are loaded at by the boot loader and %ebx
70 * contains the address where we should move the kernel image temporarily
71 * for safe in-place decompression.
72 */
73
74#ifdef CONFIG_RELOCATABLE
75 movl %ebp, %ebx
76 addl $(LARGE_PAGE_SIZE -1), %ebx
77 andl $LARGE_PAGE_MASK, %ebx
78#else
79 movl $CONFIG_PHYSICAL_START, %ebx
80#endif
81
82 /* Replace the compressed data size with the uncompressed size */
83 subl input_len(%ebp), %ebx
84 movl output_len(%ebp), %eax
85 addl %eax, %ebx
86 /* Add 8 bytes for every 32K input block */
87 shrl $12, %eax
88 addl %eax, %ebx
89 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
90 addl $(32768 + 18 + 4095), %ebx
91 andl $~4095, %ebx
92
93/*
94 * Prepare for entering 64 bit mode
95 */
96
97 /* Load new GDT with the 64bit segments using 32bit descriptor */
98 leal gdt(%ebp), %eax
99 movl %eax, gdt+2(%ebp)
100 lgdt gdt(%ebp)
101
102 /* Enable PAE mode */
103 xorl %eax, %eax
104 orl $(1 << 5), %eax
105 movl %eax, %cr4
106
107 /*
108 * Build early 4G boot pagetable
109 */
110 /* Initialize Page tables to 0*/
111 leal pgtable(%ebx), %edi
112 xorl %eax, %eax
113 movl $((4096*6)/4), %ecx
114 rep stosl
115
116 /* Build Level 4 */
117 leal pgtable + 0(%ebx), %edi
118 leal 0x1007 (%edi), %eax
119 movl %eax, 0(%edi)
120
121 /* Build Level 3 */
122 leal pgtable + 0x1000(%ebx), %edi
123 leal 0x1007(%edi), %eax
124 movl $4, %ecx
1251: movl %eax, 0x00(%edi)
126 addl $0x00001000, %eax
127 addl $8, %edi
128 decl %ecx
129 jnz 1b
130
131 /* Build Level 2 */
132 leal pgtable + 0x2000(%ebx), %edi
133 movl $0x00000183, %eax
134 movl $2048, %ecx
1351: movl %eax, 0(%edi)
136 addl $0x00200000, %eax
137 addl $8, %edi
138 decl %ecx
139 jnz 1b
140
141 /* Enable the boot page tables */
142 leal pgtable(%ebx), %eax
143 movl %eax, %cr3
144
145 /* Enable Long mode in EFER (Extended Feature Enable Register) */
146 movl $MSR_EFER, %ecx
147 rdmsr
148 btsl $_EFER_LME, %eax
149 wrmsr
150
151 /* Setup for the jump to 64bit mode
152 *
153 * When the jump is performed we will be in long mode but
154 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
155 * (and in turn EFER.LMA = 1). To jump into 64bit mode we use
156 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
157 * We place all of the values on our mini stack so lret can
158 * be used to perform that far jump.
159 */
160 pushl $__KERNEL_CS
161 leal startup_64(%ebp), %eax
162 pushl %eax
163
164 /* Enter paged protected Mode, activating Long Mode */
165 movl $0x80000001, %eax /* Enable Paging and Protected mode */
166 movl %eax, %cr0
167
168 /* Jump from 32bit compatibility mode into 64bit mode. */
169 lret
170
171no_longmode:
172 /* This isn't an x86-64 CPU so hang */
1731:
174 hlt
175 jmp 1b
176
177#include "../../kernel/verify_cpu_64.S"
178
179 /* Be careful here: startup_64 needs to be at a predictable
180 * address so I can export it in an ELF header. Bootloaders
181 * should look at the ELF header to find this address, as
182 * it may change in the future.
183 */
184 .code64
185 .org 0x200
186ENTRY(startup_64)
187 /* We come here either from startup_32 or directly from a
188 * 64bit bootloader. If we come here from a bootloader we depend on
189 * an identity mapped page table being provided that maps our
190 * entire text+data+bss and hopefully all of memory.
191 */
192
193 /* Setup data segments. */
194 xorl %eax, %eax
195 movl %eax, %ds
196 movl %eax, %es
197 movl %eax, %ss
198 movl %eax, %fs
199 movl %eax, %gs
200 lldt %ax
201 movl $0x20, %eax
202 ltr %ax
203
204 /* Compute the decompressed kernel start address. It is the
205 * address we were loaded at, aligned to a 2M boundary. %rbp contains the
206 * decompressed kernel start address.
207 *
208 * If it is a relocatable kernel then decompress and run the kernel
209 * from load address aligned to 2MB addr, otherwise decompress and
210 * run the kernel from CONFIG_PHYSICAL_START
211 */
212
213 /* Start with the delta to where the kernel will run at. */
214#ifdef CONFIG_RELOCATABLE
215 leaq startup_32(%rip) /* - $startup_32 */, %rbp
216 addq $(LARGE_PAGE_SIZE - 1), %rbp
217 andq $LARGE_PAGE_MASK, %rbp
218 movq %rbp, %rbx
219#else
220 movq $CONFIG_PHYSICAL_START, %rbp
221 movq %rbp, %rbx
222#endif
223
224 /* Replace the compressed data size with the uncompressed size */
225 movl input_len(%rip), %eax
226 subq %rax, %rbx
227 movl output_len(%rip), %eax
228 addq %rax, %rbx
229 /* Add 8 bytes for every 32K input block */
230 shrq $12, %rax
231 addq %rax, %rbx
232 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
233 addq $(32768 + 18 + 4095), %rbx
234 andq $~4095, %rbx
235
236/* Copy the compressed kernel to the end of our buffer
237 * where decompression in place becomes safe.
238 */
239 leaq _end(%rip), %r8
240 leaq _end(%rbx), %r9
241 movq $_end /* - $startup_32 */, %rcx
2421: subq $8, %r8
243 subq $8, %r9
244 movq 0(%r8), %rax
245 movq %rax, 0(%r9)
246 subq $8, %rcx
247 jnz 1b
248
249/*
250 * Jump to the relocated address.
251 */
252 leaq relocated(%rbx), %rax
253 jmp *%rax
254
255.section ".text"
256relocated:
257
258/*
259 * Clear BSS
260 */
261 xorq %rax, %rax
262 leaq _edata(%rbx), %rdi
263 leaq _end(%rbx), %rcx
264 subq %rdi, %rcx
265 cld
266 rep
267 stosb
268
269 /* Setup the stack */
270 leaq user_stack_end(%rip), %rsp
271
272 /* zero EFLAGS after setting rsp */
273 pushq $0
274 popfq
275
276/*
277 * Do the decompression, and jump to the new kernel..
278 */
279 pushq %rsi # Save the real mode argument
280 movq %rsi, %rdi # real mode address
281 leaq _heap(%rip), %rsi # _heap
282 leaq input_data(%rip), %rdx # input_data
283 movl input_len(%rip), %eax
284 movq %rax, %rcx # input_len
285 movq %rbp, %r8 # output
286 call decompress_kernel
287 popq %rsi
288
289
290/*
291 * Jump to the decompressed kernel.
292 */
293 jmp *%rbp
294
295 .data
296gdt:
297 .word gdt_end - gdt
298 .long gdt
299 .word 0
300 .quad 0x0000000000000000 /* NULL descriptor */
301 .quad 0x00af9a000000ffff /* __KERNEL_CS */
302 .quad 0x00cf92000000ffff /* __KERNEL_DS */
303 .quad 0x0080890000000000 /* TS descriptor */
304 .quad 0x0000000000000000 /* TS continued */
305gdt_end:
306 .bss
307/* Stack for uncompression */
308 .balign 4
309user_stack:
310 .fill 4096,4,0
311user_stack_end:
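The %ebx/%rbx arithmetic above mirrors the extra_bytes formula spelled out in misc_32.c/misc_64.c: room for the uncompressed output, plus 8 bytes of slack per 32K block, plus 32K + 18 bytes of gzip overhead, rounded to a 4K boundary. Below is a minimal host-side C sketch of the same calculation; the function name and the input_len/output_len values are hypothetical stand-ins for the symbols the real build patches in through the vmlinux_*.scr scripts.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-alone version of the slack calculation performed in
 * startup_32/startup_64: given the address the kernel will run at, compute
 * the relocation base for the compressed image so that decompressing in
 * place can never overrun data that has not been read yet. */
static uint32_t compressed_copy_base(uint32_t run_addr,
                                     uint32_t input_len,
                                     uint32_t output_len)
{
        uint32_t base = run_addr;

        base -= input_len;              /* replace the compressed size ...  */
        base += output_len;             /* ... with the uncompressed size   */
        base += output_len >> 12;       /* 8 bytes of slack per 32K block   */
        base += 32768 + 18 + 4095;      /* worst-case block + gzip overhead */
        base &= ~4095u;                 /* align down to a 4K boundary      */
        return base;
}

int main(void)
{
        /* hypothetical sizes: 2 MiB compressed, 5 MiB uncompressed */
        printf("copy base: %#x\n",
               compressed_copy_base(0x100000, 2u << 20, 5u << 20));
        return 0;
}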
diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc_32.c
new file mode 100644
index 000000000000..b28505c544c9
--- /dev/null
+++ b/arch/x86/boot/compressed/misc_32.c
@@ -0,0 +1,379 @@
1/*
2 * misc.c
3 *
4 * This is a collection of several routines from gzip-1.0.3
5 * adapted for Linux.
6 *
7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
8 * puts by Nick Holloway 1993, better puts by Martin Mares 1995
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */
11
12#undef CONFIG_PARAVIRT
13#include <linux/linkage.h>
14#include <linux/vmalloc.h>
15#include <linux/screen_info.h>
16#include <asm/io.h>
17#include <asm/page.h>
18#include <asm/boot.h>
19
20/* WARNING!!
21 * This code is compiled with -fPIC and it is relocated dynamically
22 * at run time, but no relocation processing is performed.
23 * This means that it is not safe to place pointers in static structures.
24 */
25
26/*
 27 * Getting to provably safe in-place decompression is hard.
 28 * Worst case behaviours need to be analyzed.
29 * Background information:
30 *
31 * The file layout is:
32 * magic[2]
33 * method[1]
34 * flags[1]
35 * timestamp[4]
36 * extraflags[1]
37 * os[1]
38 * compressed data blocks[N]
39 * crc[4] orig_len[4]
40 *
41 * resulting in 18 bytes of non compressed data overhead.
42 *
 43 * Files are divided into blocks
44 * 1 bit (last block flag)
45 * 2 bits (block type)
46 *
 47 * 1 block occurs every 32K-1 bytes or when 50% compression has been achieved.
48 * The smallest block type encoding is always used.
49 *
50 * stored:
51 * 32 bits length in bytes.
52 *
53 * fixed:
54 * magic fixed tree.
55 * symbols.
56 *
57 * dynamic:
58 * dynamic tree encoding.
59 * symbols.
60 *
61 *
62 * The buffer for decompression in place is the length of the
63 * uncompressed data, plus a small amount extra to keep the algorithm safe.
64 * The compressed data is placed at the end of the buffer. The output
65 * pointer is placed at the start of the buffer and the input pointer
66 * is placed where the compressed data starts. Problems will occur
67 * when the output pointer overruns the input pointer.
68 *
69 * The output pointer can only overrun the input pointer if the input
70 * pointer is moving faster than the output pointer. A condition only
71 * triggered by data whose compressed form is larger than the uncompressed
72 * form.
73 *
74 * The worst case at the block level is a growth of the compressed data
75 * of 5 bytes per 32767 bytes.
76 *
77 * The worst case internal to a compressed block is very hard to figure.
 78 * The worst case can at least be bounded by having one bit that represents
79 * 32764 bytes and then all of the rest of the bytes representing the very
80 * very last byte.
81 *
82 * All of which is enough to compute an amount of extra data that is required
83 * to be safe. To avoid problems at the block level allocating 5 extra bytes
 84 * per 32767 bytes of data is sufficient. To avoid problems internal to a block,
 85 * adding an extra 32767 bytes (the worst case uncompressed block size) is
 86 * sufficient, to ensure that in the worst case the decompressed data for a
 87 * block will stop the byte before the compressed data for a block begins.
88 * To avoid problems with the compressed data's meta information an extra 18
89 * bytes are needed. Leading to the formula:
90 *
91 * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
92 *
93 * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
94 * Adding 32768 instead of 32767 just makes for round numbers.
 95 * Adding the decompressor_size is necessary as it must live after all
 96 * of the data as well. Last I measured the decompressor is about 14K:
 97 * 10K of actual data and 4K of bss.
98 *
99 */
100
101/*
102 * gzip declarations
103 */
104
105#define OF(args) args
106#define STATIC static
107
108#undef memset
109#undef memcpy
110#define memzero(s, n) memset ((s), 0, (n))
111
112typedef unsigned char uch;
113typedef unsigned short ush;
114typedef unsigned long ulg;
115
116#define WSIZE 0x80000000 /* Window size must be at least 32k,
117 * and a power of two.
118 * We don't actually have a window, just
119 * a huge output buffer, so I report
120 * a 2G window size, as that should
121 * always be larger than our output buffer.
122 */
123
124static uch *inbuf; /* input buffer */
125static uch *window; /* Sliding window buffer, (and final output buffer) */
126
127static unsigned insize; /* valid bytes in inbuf */
128static unsigned inptr; /* index of next byte to be processed in inbuf */
129static unsigned outcnt; /* bytes in output buffer */
130
131/* gzip flag byte */
132#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
133#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
134#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
135#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
136#define COMMENT 0x10 /* bit 4 set: file comment present */
137#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
138#define RESERVED 0xC0 /* bit 6,7: reserved */
139
140#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
141
142/* Diagnostic functions */
143#ifdef DEBUG
144# define Assert(cond,msg) {if(!(cond)) error(msg);}
145# define Trace(x) fprintf x
146# define Tracev(x) {if (verbose) fprintf x ;}
147# define Tracevv(x) {if (verbose>1) fprintf x ;}
148# define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
149# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
150#else
151# define Assert(cond,msg)
152# define Trace(x)
153# define Tracev(x)
154# define Tracevv(x)
155# define Tracec(c,x)
156# define Tracecv(c,x)
157#endif
158
159static int fill_inbuf(void);
160static void flush_window(void);
161static void error(char *m);
162static void gzip_mark(void **);
163static void gzip_release(void **);
164
165/*
166 * This is set up by the setup-routine at boot-time
167 */
168static unsigned char *real_mode; /* Pointer to real-mode data */
169
170#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
171#ifndef STANDARD_MEMORY_BIOS_CALL
172#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
173#endif
174#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))
175
176extern unsigned char input_data[];
177extern int input_len;
178
179static long bytes_out = 0;
180
181static void *malloc(int size);
182static void free(void *where);
183
184static void *memset(void *s, int c, unsigned n);
185static void *memcpy(void *dest, const void *src, unsigned n);
186
187static void putstr(const char *);
188
189static unsigned long free_mem_ptr;
190static unsigned long free_mem_end_ptr;
191
192#define HEAP_SIZE 0x4000
193
194static char *vidmem = (char *)0xb8000;
195static int vidport;
196static int lines, cols;
197
198#ifdef CONFIG_X86_NUMAQ
199void *xquad_portio;
200#endif
201
202#include "../../../../lib/inflate.c"
203
204static void *malloc(int size)
205{
206 void *p;
207
208 if (size <0) error("Malloc error");
209 if (free_mem_ptr <= 0) error("Memory error");
210
211 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
212
213 p = (void *)free_mem_ptr;
214 free_mem_ptr += size;
215
216 if (free_mem_ptr >= free_mem_end_ptr)
217 error("Out of memory");
218
219 return p;
220}
221
222static void free(void *where)
223{ /* Don't care */
224}
225
226static void gzip_mark(void **ptr)
227{
228 *ptr = (void *) free_mem_ptr;
229}
230
231static void gzip_release(void **ptr)
232{
233 free_mem_ptr = (unsigned long) *ptr;
234}
235
236static void scroll(void)
237{
238 int i;
239
240 memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
241 for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
242 vidmem[i] = ' ';
243}
244
245static void putstr(const char *s)
246{
247 int x,y,pos;
248 char c;
249
250 x = RM_SCREEN_INFO.orig_x;
251 y = RM_SCREEN_INFO.orig_y;
252
253 while ( ( c = *s++ ) != '\0' ) {
254 if ( c == '\n' ) {
255 x = 0;
256 if ( ++y >= lines ) {
257 scroll();
258 y--;
259 }
260 } else {
261 vidmem [ ( x + cols * y ) * 2 ] = c;
262 if ( ++x >= cols ) {
263 x = 0;
264 if ( ++y >= lines ) {
265 scroll();
266 y--;
267 }
268 }
269 }
270 }
271
272 RM_SCREEN_INFO.orig_x = x;
273 RM_SCREEN_INFO.orig_y = y;
274
275 pos = (x + cols * y) * 2; /* Update cursor position */
276 outb_p(14, vidport);
277 outb_p(0xff & (pos >> 9), vidport+1);
278 outb_p(15, vidport);
279 outb_p(0xff & (pos >> 1), vidport+1);
280}
281
282static void* memset(void* s, int c, unsigned n)
283{
284 int i;
285 char *ss = (char*)s;
286
287 for (i=0;i<n;i++) ss[i] = c;
288 return s;
289}
290
291static void* memcpy(void* dest, const void* src, unsigned n)
292{
293 int i;
294 char *d = (char *)dest, *s = (char *)src;
295
296 for (i=0;i<n;i++) d[i] = s[i];
297 return dest;
298}
299
300/* ===========================================================================
301 * Fill the input buffer. This is called only when the buffer is empty
302 * and at least one byte is really needed.
303 */
304static int fill_inbuf(void)
305{
306 error("ran out of input data");
307 return 0;
308}
309
310/* ===========================================================================
311 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
312 * (Used for the decompressed data only.)
313 */
314static void flush_window(void)
315{
316 /* With my window equal to my output buffer
317 * I only need to compute the crc here.
318 */
319 ulg c = crc; /* temporary variable */
320 unsigned n;
321 uch *in, ch;
322
323 in = window;
324 for (n = 0; n < outcnt; n++) {
325 ch = *in++;
326 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
327 }
328 crc = c;
329 bytes_out += (ulg)outcnt;
330 outcnt = 0;
331}
332
333static void error(char *x)
334{
335 putstr("\n\n");
336 putstr(x);
337 putstr("\n\n -- System halted");
338
339 while(1); /* Halt */
340}
341
342asmlinkage void decompress_kernel(void *rmode, unsigned long end,
343 uch *input_data, unsigned long input_len, uch *output)
344{
345 real_mode = rmode;
346
347 if (RM_SCREEN_INFO.orig_video_mode == 7) {
348 vidmem = (char *) 0xb0000;
349 vidport = 0x3b4;
350 } else {
351 vidmem = (char *) 0xb8000;
352 vidport = 0x3d4;
353 }
354
355 lines = RM_SCREEN_INFO.orig_video_lines;
356 cols = RM_SCREEN_INFO.orig_video_cols;
357
358 window = output; /* Output buffer (Normally at 1M) */
359 free_mem_ptr = end; /* Heap */
360 free_mem_end_ptr = end + HEAP_SIZE;
361 inbuf = input_data; /* Input buffer */
362 insize = input_len;
363 inptr = 0;
364
365 if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1))
366 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
367 if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff))
368 error("Destination address too large");
369#ifndef CONFIG_RELOCATABLE
370 if ((u32)output != LOAD_PHYSICAL_ADDR)
371 error("Wrong destination address");
372#endif
373
374 makecrc();
375 putstr("Uncompressing Linux... ");
376 gunzip();
377 putstr("Ok, booting the kernel.\n");
378 return;
379}
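Since the "window" here is the whole output buffer, flush_window() above only folds each output byte into the running CRC. The table it indexes is built by makecrc() in lib/inflate.c; the stand-alone sketch below is not that code, just the standard reflected CRC-32 (polynomial 0xEDB88320) used by gzip, shown for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Standard table-driven CRC-32, equivalent in spirit to makecrc() and the
 * update step in flush_window(), but independent of the kernel sources. */
static uint32_t crc_tab[256];

static void make_crc_table(void)
{
        for (uint32_t n = 0; n < 256; n++) {
                uint32_t c = n;
                for (int k = 0; k < 8; k++)
                        c = (c & 1) ? 0xedb88320u ^ (c >> 1) : c >> 1;
                crc_tab[n] = c;
        }
}

/* Same update as flush_window(): one table lookup per output byte. */
static uint32_t crc_update(uint32_t crc, const unsigned char *buf, size_t len)
{
        for (size_t i = 0; i < len; i++)
                crc = crc_tab[(crc ^ buf[i]) & 0xff] ^ (crc >> 8);
        return crc;
}

int main(void)
{
        const char msg[] = "Uncompressing Linux... ";
        uint32_t crc;

        make_crc_table();
        /* gzip keeps the CRC pre- and post-inverted around the data. */
        crc = crc_update(0xffffffffu, (const unsigned char *)msg,
                         strlen(msg)) ^ 0xffffffffu;
        printf("crc32 = %08x\n", (unsigned)crc);
        return 0;
}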
diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c
new file mode 100644
index 000000000000..f932b0e89096
--- /dev/null
+++ b/arch/x86/boot/compressed/misc_64.c
@@ -0,0 +1,371 @@
1/*
2 * misc.c
3 *
4 * This is a collection of several routines from gzip-1.0.3
5 * adapted for Linux.
6 *
7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
8 * puts by Nick Holloway 1993, better puts by Martin Mares 1995
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */
11
12#define _LINUX_STRING_H_ 1
13#define __LINUX_BITMAP_H 1
14
15#include <linux/linkage.h>
16#include <linux/screen_info.h>
17#include <asm/io.h>
18#include <asm/page.h>
19
20/* WARNING!!
21 * This code is compiled with -fPIC and it is relocated dynamically
22 * at run time, but no relocation processing is performed.
23 * This means that it is not safe to place pointers in static structures.
24 */
25
26/*
 27 * Getting to provably safe in-place decompression is hard.
 28 * Worst case behaviours need to be analyzed.
29 * Background information:
30 *
31 * The file layout is:
32 * magic[2]
33 * method[1]
34 * flags[1]
35 * timestamp[4]
36 * extraflags[1]
37 * os[1]
38 * compressed data blocks[N]
39 * crc[4] orig_len[4]
40 *
41 * resulting in 18 bytes of non compressed data overhead.
42 *
 43 * Files are divided into blocks
44 * 1 bit (last block flag)
45 * 2 bits (block type)
46 *
 47 * 1 block occurs every 32K-1 bytes or when 50% compression has been achieved.
48 * The smallest block type encoding is always used.
49 *
50 * stored:
51 * 32 bits length in bytes.
52 *
53 * fixed:
54 * magic fixed tree.
55 * symbols.
56 *
57 * dynamic:
58 * dynamic tree encoding.
59 * symbols.
60 *
61 *
62 * The buffer for decompression in place is the length of the
63 * uncompressed data, plus a small amount extra to keep the algorithm safe.
64 * The compressed data is placed at the end of the buffer. The output
65 * pointer is placed at the start of the buffer and the input pointer
66 * is placed where the compressed data starts. Problems will occur
67 * when the output pointer overruns the input pointer.
68 *
69 * The output pointer can only overrun the input pointer if the input
70 * pointer is moving faster than the output pointer. A condition only
71 * triggered by data whose compressed form is larger than the uncompressed
72 * form.
73 *
74 * The worst case at the block level is a growth of the compressed data
75 * of 5 bytes per 32767 bytes.
76 *
77 * The worst case internal to a compressed block is very hard to figure.
 78 * The worst case can at least be bounded by having one bit that represents
79 * 32764 bytes and then all of the rest of the bytes representing the very
80 * very last byte.
81 *
82 * All of which is enough to compute an amount of extra data that is required
83 * to be safe. To avoid problems at the block level allocating 5 extra bytes
 84 * per 32767 bytes of data is sufficient. To avoid problems internal to a block,
 85 * adding an extra 32767 bytes (the worst case uncompressed block size) is
 86 * sufficient, to ensure that in the worst case the decompressed data for a
 87 * block will stop the byte before the compressed data for a block begins.
88 * To avoid problems with the compressed data's meta information an extra 18
89 * bytes are needed. Leading to the formula:
90 *
91 * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
92 *
93 * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
94 * Adding 32768 instead of 32767 just makes for round numbers.
 95 * Adding the decompressor_size is necessary as it must live after all
 96 * of the data as well. Last I measured the decompressor is about 14K:
 97 * 10K of actual data and 4K of bss.
98 *
99 */
100
101/*
102 * gzip declarations
103 */
104
105#define OF(args) args
106#define STATIC static
107
108#undef memset
109#undef memcpy
110#define memzero(s, n) memset ((s), 0, (n))
111
112typedef unsigned char uch;
113typedef unsigned short ush;
114typedef unsigned long ulg;
115
116#define WSIZE 0x80000000 /* Window size must be at least 32k,
117 * and a power of two.
118 * We don't actually have a window, just
119 * a huge output buffer, so I report
120 * a 2G window size, as that should
121 * always be larger than our output buffer.
122 */
123
124static uch *inbuf; /* input buffer */
125static uch *window; /* Sliding window buffer, (and final output buffer) */
126
127static unsigned insize; /* valid bytes in inbuf */
128static unsigned inptr; /* index of next byte to be processed in inbuf */
129static unsigned outcnt; /* bytes in output buffer */
130
131/* gzip flag byte */
132#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
133#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
134#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
135#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
136#define COMMENT 0x10 /* bit 4 set: file comment present */
137#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
138#define RESERVED 0xC0 /* bit 6,7: reserved */
139
140#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
141
142/* Diagnostic functions */
143#ifdef DEBUG
144# define Assert(cond,msg) {if(!(cond)) error(msg);}
145# define Trace(x) fprintf x
146# define Tracev(x) {if (verbose) fprintf x ;}
147# define Tracevv(x) {if (verbose>1) fprintf x ;}
148# define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
149# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
150#else
151# define Assert(cond,msg)
152# define Trace(x)
153# define Tracev(x)
154# define Tracevv(x)
155# define Tracec(c,x)
156# define Tracecv(c,x)
157#endif
158
159static int fill_inbuf(void);
160static void flush_window(void);
161static void error(char *m);
162static void gzip_mark(void **);
163static void gzip_release(void **);
164
165/*
166 * This is set up by the setup-routine at boot-time
167 */
168static unsigned char *real_mode; /* Pointer to real-mode data */
169
170#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
171#ifndef STANDARD_MEMORY_BIOS_CALL
172#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
173#endif
174#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))
175
176extern unsigned char input_data[];
177extern int input_len;
178
179static long bytes_out = 0;
180
181static void *malloc(int size);
182static void free(void *where);
183
184static void *memset(void *s, int c, unsigned n);
185static void *memcpy(void *dest, const void *src, unsigned n);
186
187static void putstr(const char *);
188
189static long free_mem_ptr;
190static long free_mem_end_ptr;
191
192#define HEAP_SIZE 0x7000
193
194static char *vidmem = (char *)0xb8000;
195static int vidport;
196static int lines, cols;
197
198#include "../../../../lib/inflate.c"
199
200static void *malloc(int size)
201{
202 void *p;
203
204 if (size <0) error("Malloc error");
205 if (free_mem_ptr <= 0) error("Memory error");
206
207 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
208
209 p = (void *)free_mem_ptr;
210 free_mem_ptr += size;
211
212 if (free_mem_ptr >= free_mem_end_ptr)
213 error("Out of memory");
214
215 return p;
216}
217
218static void free(void *where)
219{ /* Don't care */
220}
221
222static void gzip_mark(void **ptr)
223{
224 *ptr = (void *) free_mem_ptr;
225}
226
227static void gzip_release(void **ptr)
228{
229 free_mem_ptr = (long) *ptr;
230}
231
232static void scroll(void)
233{
234 int i;
235
236 memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
237 for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
238 vidmem[i] = ' ';
239}
240
241static void putstr(const char *s)
242{
243 int x,y,pos;
244 char c;
245
246 x = RM_SCREEN_INFO.orig_x;
247 y = RM_SCREEN_INFO.orig_y;
248
249 while ( ( c = *s++ ) != '\0' ) {
250 if ( c == '\n' ) {
251 x = 0;
252 if ( ++y >= lines ) {
253 scroll();
254 y--;
255 }
256 } else {
257 vidmem [ ( x + cols * y ) * 2 ] = c;
258 if ( ++x >= cols ) {
259 x = 0;
260 if ( ++y >= lines ) {
261 scroll();
262 y--;
263 }
264 }
265 }
266 }
267
268 RM_SCREEN_INFO.orig_x = x;
269 RM_SCREEN_INFO.orig_y = y;
270
271 pos = (x + cols * y) * 2; /* Update cursor position */
272 outb_p(14, vidport);
273 outb_p(0xff & (pos >> 9), vidport+1);
274 outb_p(15, vidport);
275 outb_p(0xff & (pos >> 1), vidport+1);
276}
277
278static void* memset(void* s, int c, unsigned n)
279{
280 int i;
281 char *ss = (char*)s;
282
283 for (i=0;i<n;i++) ss[i] = c;
284 return s;
285}
286
287static void* memcpy(void* dest, const void* src, unsigned n)
288{
289 int i;
290 char *d = (char *)dest, *s = (char *)src;
291
292 for (i=0;i<n;i++) d[i] = s[i];
293 return dest;
294}
295
296/* ===========================================================================
297 * Fill the input buffer. This is called only when the buffer is empty
298 * and at least one byte is really needed.
299 */
300static int fill_inbuf(void)
301{
302 error("ran out of input data");
303 return 0;
304}
305
306/* ===========================================================================
307 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
308 * (Used for the decompressed data only.)
309 */
310static void flush_window(void)
311{
312 /* With my window equal to my output buffer
313 * I only need to compute the crc here.
314 */
315 ulg c = crc; /* temporary variable */
316 unsigned n;
317 uch *in, ch;
318
319 in = window;
320 for (n = 0; n < outcnt; n++) {
321 ch = *in++;
322 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
323 }
324 crc = c;
325 bytes_out += (ulg)outcnt;
326 outcnt = 0;
327}
328
329static void error(char *x)
330{
331 putstr("\n\n");
332 putstr(x);
333 putstr("\n\n -- System halted");
334
335 while(1); /* Halt */
336}
337
338asmlinkage void decompress_kernel(void *rmode, unsigned long heap,
339 uch *input_data, unsigned long input_len, uch *output)
340{
341 real_mode = rmode;
342
343 if (RM_SCREEN_INFO.orig_video_mode == 7) {
344 vidmem = (char *) 0xb0000;
345 vidport = 0x3b4;
346 } else {
347 vidmem = (char *) 0xb8000;
348 vidport = 0x3d4;
349 }
350
351 lines = RM_SCREEN_INFO.orig_video_lines;
352 cols = RM_SCREEN_INFO.orig_video_cols;
353
354 window = output; /* Output buffer (Normally at 1M) */
355 free_mem_ptr = heap; /* Heap */
356 free_mem_end_ptr = heap + HEAP_SIZE;
357 inbuf = input_data; /* Input buffer */
358 insize = input_len;
359 inptr = 0;
360
361 if ((ulg)output & (__KERNEL_ALIGN - 1))
362 error("Destination address not 2M aligned");
363 if ((ulg)output >= 0xffffffffffUL)
364 error("Destination address too large");
365
366 makecrc();
367 putstr(".\nDecompressing Linux...");
368 gunzip();
369 putstr("done.\nBooting the kernel.\n");
370 return;
371}
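The boot-time malloc()/free()/gzip_mark()/gzip_release() above amount to a bump allocator over the heap range handed in from head_64.S: allocation advances a pointer, free() is a no-op, and mark/release rewind the pointer wholesale. A self-contained sketch of the same idea follows; the heap size and names are illustrative, not the kernel's.

#include <stdint.h>
#include <stdio.h>

/* Minimal bump allocator in the style of the boot-time malloc() above. */
static unsigned char heap[0x7000];                    /* HEAP_SIZE in misc_64.c */
static unsigned char *free_ptr = heap;
static unsigned char *const free_end = heap + sizeof(heap);

static void *bump_malloc(size_t size)
{
        /* 4-byte align the allocation pointer, as the boot malloc() does */
        uintptr_t p = ((uintptr_t)free_ptr + 3) & ~(uintptr_t)3;

        if (p + size > (uintptr_t)free_end)
                return NULL;                  /* the boot code calls error() */
        free_ptr = (unsigned char *)(p + size);
        return (void *)p;
}

static void mark(unsigned char **m)   { *m = free_ptr; }   /* gzip_mark()    */
static void release(unsigned char *m) { free_ptr = m; }    /* gzip_release() */

int main(void)
{
        unsigned char *m;
        void *a, *b;

        mark(&m);
        a = bump_malloc(100);
        b = bump_malloc(200);
        printf("a=%p b=%p\n", a, b);
        release(m);             /* everything allocated since mark() is gone */
        return 0;
}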
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
new file mode 100644
index 000000000000..2d77ee728f92
--- /dev/null
+++ b/arch/x86/boot/compressed/relocs.c
@@ -0,0 +1,631 @@
1#include <stdio.h>
2#include <stdarg.h>
3#include <stdlib.h>
4#include <stdint.h>
5#include <string.h>
6#include <errno.h>
7#include <unistd.h>
8#include <elf.h>
9#include <byteswap.h>
10#define USE_BSD
11#include <endian.h>
12
13#define MAX_SHDRS 100
14#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
15static Elf32_Ehdr ehdr;
16static Elf32_Shdr shdr[MAX_SHDRS];
17static Elf32_Sym *symtab[MAX_SHDRS];
18static Elf32_Rel *reltab[MAX_SHDRS];
19static char *strtab[MAX_SHDRS];
20static unsigned long reloc_count, reloc_idx;
21static unsigned long *relocs;
22
23/*
 24 * The following symbols have been audited. Their values are constant and do
 25 * not change if bzImage is loaded at a different physical address than
 26 * the address for which it has been compiled. Don't warn the user about
 27 * absolute relocations present w.r.t. these symbols.
28 */
29static const char* safe_abs_relocs[] = {
30 "__kernel_vsyscall",
31 "__kernel_rt_sigreturn",
32 "__kernel_sigreturn",
33 "SYSENTER_RETURN",
34 "VDSO_NOTE_MASK",
35 "xen_irq_disable_direct_reloc",
36 "xen_save_fl_direct_reloc",
37};
38
39static int is_safe_abs_reloc(const char* sym_name)
40{
41 int i, array_size;
42
43 array_size = sizeof(safe_abs_relocs)/sizeof(char*);
44
45 for(i = 0; i < array_size; i++) {
46 if (!strcmp(sym_name, safe_abs_relocs[i]))
47 /* Match found */
48 return 1;
49 }
50 if (strncmp(sym_name, "__crc_", 6) == 0)
51 return 1;
52 return 0;
53}
54
55static void die(char *fmt, ...)
56{
57 va_list ap;
58 va_start(ap, fmt);
59 vfprintf(stderr, fmt, ap);
60 va_end(ap);
61 exit(1);
62}
63
64static const char *sym_type(unsigned type)
65{
66 static const char *type_name[] = {
67#define SYM_TYPE(X) [X] = #X
68 SYM_TYPE(STT_NOTYPE),
69 SYM_TYPE(STT_OBJECT),
70 SYM_TYPE(STT_FUNC),
71 SYM_TYPE(STT_SECTION),
72 SYM_TYPE(STT_FILE),
73 SYM_TYPE(STT_COMMON),
74 SYM_TYPE(STT_TLS),
75#undef SYM_TYPE
76 };
77 const char *name = "unknown sym type name";
78 if (type < ARRAY_SIZE(type_name)) {
79 name = type_name[type];
80 }
81 return name;
82}
83
84static const char *sym_bind(unsigned bind)
85{
86 static const char *bind_name[] = {
87#define SYM_BIND(X) [X] = #X
88 SYM_BIND(STB_LOCAL),
89 SYM_BIND(STB_GLOBAL),
90 SYM_BIND(STB_WEAK),
91#undef SYM_BIND
92 };
93 const char *name = "unknown sym bind name";
94 if (bind < ARRAY_SIZE(bind_name)) {
95 name = bind_name[bind];
96 }
97 return name;
98}
99
100static const char *sym_visibility(unsigned visibility)
101{
102 static const char *visibility_name[] = {
103#define SYM_VISIBILITY(X) [X] = #X
104 SYM_VISIBILITY(STV_DEFAULT),
105 SYM_VISIBILITY(STV_INTERNAL),
106 SYM_VISIBILITY(STV_HIDDEN),
107 SYM_VISIBILITY(STV_PROTECTED),
108#undef SYM_VISIBILITY
109 };
110 const char *name = "unknown sym visibility name";
111 if (visibility < ARRAY_SIZE(visibility_name)) {
112 name = visibility_name[visibility];
113 }
114 return name;
115}
116
117static const char *rel_type(unsigned type)
118{
119 static const char *type_name[] = {
120#define REL_TYPE(X) [X] = #X
121 REL_TYPE(R_386_NONE),
122 REL_TYPE(R_386_32),
123 REL_TYPE(R_386_PC32),
124 REL_TYPE(R_386_GOT32),
125 REL_TYPE(R_386_PLT32),
126 REL_TYPE(R_386_COPY),
127 REL_TYPE(R_386_GLOB_DAT),
128 REL_TYPE(R_386_JMP_SLOT),
129 REL_TYPE(R_386_RELATIVE),
130 REL_TYPE(R_386_GOTOFF),
131 REL_TYPE(R_386_GOTPC),
132#undef REL_TYPE
133 };
134	const char *name = "unknown rel type name";
135 if (type < ARRAY_SIZE(type_name)) {
136 name = type_name[type];
137 }
138 return name;
139}
140
141static const char *sec_name(unsigned shndx)
142{
143 const char *sec_strtab;
144 const char *name;
145 sec_strtab = strtab[ehdr.e_shstrndx];
146 name = "<noname>";
147 if (shndx < ehdr.e_shnum) {
148 name = sec_strtab + shdr[shndx].sh_name;
149 }
150 else if (shndx == SHN_ABS) {
151 name = "ABSOLUTE";
152 }
153 else if (shndx == SHN_COMMON) {
154 name = "COMMON";
155 }
156 return name;
157}
158
159static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
160{
161 const char *name;
162 name = "<noname>";
163 if (sym->st_name) {
164 name = sym_strtab + sym->st_name;
165 }
166 else {
167		name = sec_name(sym->st_shndx);
168 }
169 return name;
170}
171
172
173
174#if BYTE_ORDER == LITTLE_ENDIAN
175#define le16_to_cpu(val) (val)
176#define le32_to_cpu(val) (val)
177#endif
178#if BYTE_ORDER == BIG_ENDIAN
179#define le16_to_cpu(val) bswap_16(val)
180#define le32_to_cpu(val) bswap_32(val)
181#endif
182
183static uint16_t elf16_to_cpu(uint16_t val)
184{
185 return le16_to_cpu(val);
186}
187
188static uint32_t elf32_to_cpu(uint32_t val)
189{
190 return le32_to_cpu(val);
191}
192
193static void read_ehdr(FILE *fp)
194{
195 if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
196 die("Cannot read ELF header: %s\n",
197 strerror(errno));
198 }
199 if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) {
200 die("No ELF magic\n");
201 }
202 if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
203 die("Not a 32 bit executable\n");
204 }
205 if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
206 die("Not a LSB ELF executable\n");
207 }
208 if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
209 die("Unknown ELF version\n");
210 }
211 /* Convert the fields to native endian */
212 ehdr.e_type = elf16_to_cpu(ehdr.e_type);
213 ehdr.e_machine = elf16_to_cpu(ehdr.e_machine);
214 ehdr.e_version = elf32_to_cpu(ehdr.e_version);
215 ehdr.e_entry = elf32_to_cpu(ehdr.e_entry);
216 ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff);
217 ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff);
218 ehdr.e_flags = elf32_to_cpu(ehdr.e_flags);
219 ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize);
220 ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
221 ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum);
222 ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
223 ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum);
224 ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx);
225
226 if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
227 die("Unsupported ELF header type\n");
228 }
229 if (ehdr.e_machine != EM_386) {
230 die("Not for x86\n");
231 }
232 if (ehdr.e_version != EV_CURRENT) {
233 die("Unknown ELF version\n");
234 }
235 if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
236 die("Bad Elf header size\n");
237 }
238 if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
239 die("Bad program header entry\n");
240 }
241 if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
242 die("Bad section header entry\n");
243 }
244 if (ehdr.e_shstrndx >= ehdr.e_shnum) {
245 die("String table index out of bounds\n");
246 }
247}
248
249static void read_shdrs(FILE *fp)
250{
251 int i;
252 if (ehdr.e_shnum > MAX_SHDRS) {
253		die("Too many section headers: %d (only %d supported)\n",
254			ehdr.e_shnum, MAX_SHDRS);
255 }
256 if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
257 die("Seek to %d failed: %s\n",
258 ehdr.e_shoff, strerror(errno));
259 }
260 if (fread(&shdr, sizeof(shdr[0]), ehdr.e_shnum, fp) != ehdr.e_shnum) {
261 die("Cannot read ELF section headers: %s\n",
262 strerror(errno));
263 }
264 for(i = 0; i < ehdr.e_shnum; i++) {
265 shdr[i].sh_name = elf32_to_cpu(shdr[i].sh_name);
266 shdr[i].sh_type = elf32_to_cpu(shdr[i].sh_type);
267 shdr[i].sh_flags = elf32_to_cpu(shdr[i].sh_flags);
268 shdr[i].sh_addr = elf32_to_cpu(shdr[i].sh_addr);
269 shdr[i].sh_offset = elf32_to_cpu(shdr[i].sh_offset);
270 shdr[i].sh_size = elf32_to_cpu(shdr[i].sh_size);
271 shdr[i].sh_link = elf32_to_cpu(shdr[i].sh_link);
272 shdr[i].sh_info = elf32_to_cpu(shdr[i].sh_info);
273 shdr[i].sh_addralign = elf32_to_cpu(shdr[i].sh_addralign);
274 shdr[i].sh_entsize = elf32_to_cpu(shdr[i].sh_entsize);
275 }
276
277}
278
279static void read_strtabs(FILE *fp)
280{
281 int i;
282 for(i = 0; i < ehdr.e_shnum; i++) {
283 if (shdr[i].sh_type != SHT_STRTAB) {
284 continue;
285 }
286 strtab[i] = malloc(shdr[i].sh_size);
287 if (!strtab[i]) {
288 die("malloc of %d bytes for strtab failed\n",
289 shdr[i].sh_size);
290 }
291 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
292 die("Seek to %d failed: %s\n",
293 shdr[i].sh_offset, strerror(errno));
294 }
295 if (fread(strtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
296 die("Cannot read symbol table: %s\n",
297 strerror(errno));
298 }
299 }
300}
301
302static void read_symtabs(FILE *fp)
303{
304 int i,j;
305 for(i = 0; i < ehdr.e_shnum; i++) {
306 if (shdr[i].sh_type != SHT_SYMTAB) {
307 continue;
308 }
309 symtab[i] = malloc(shdr[i].sh_size);
310 if (!symtab[i]) {
311 die("malloc of %d bytes for symtab failed\n",
312 shdr[i].sh_size);
313 }
314 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
315 die("Seek to %d failed: %s\n",
316 shdr[i].sh_offset, strerror(errno));
317 }
318 if (fread(symtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
319 die("Cannot read symbol table: %s\n",
320 strerror(errno));
321 }
322 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[i][0]); j++) {
323 symtab[i][j].st_name = elf32_to_cpu(symtab[i][j].st_name);
324 symtab[i][j].st_value = elf32_to_cpu(symtab[i][j].st_value);
325 symtab[i][j].st_size = elf32_to_cpu(symtab[i][j].st_size);
326 symtab[i][j].st_shndx = elf16_to_cpu(symtab[i][j].st_shndx);
327 }
328 }
329}
330
331
332static void read_relocs(FILE *fp)
333{
334 int i,j;
335 for(i = 0; i < ehdr.e_shnum; i++) {
336 if (shdr[i].sh_type != SHT_REL) {
337 continue;
338 }
339 reltab[i] = malloc(shdr[i].sh_size);
340 if (!reltab[i]) {
341 die("malloc of %d bytes for relocs failed\n",
342 shdr[i].sh_size);
343 }
344 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
345 die("Seek to %d failed: %s\n",
346 shdr[i].sh_offset, strerror(errno));
347 }
348 if (fread(reltab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
349 die("Cannot read symbol table: %s\n",
350 strerror(errno));
351 }
352 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
353 reltab[i][j].r_offset = elf32_to_cpu(reltab[i][j].r_offset);
354 reltab[i][j].r_info = elf32_to_cpu(reltab[i][j].r_info);
355 }
356 }
357}
358
359
360static void print_absolute_symbols(void)
361{
362 int i;
363 printf("Absolute symbols\n");
364 printf(" Num: Value Size Type Bind Visibility Name\n");
365 for(i = 0; i < ehdr.e_shnum; i++) {
366 char *sym_strtab;
367 Elf32_Sym *sh_symtab;
368 int j;
369 if (shdr[i].sh_type != SHT_SYMTAB) {
370 continue;
371 }
372 sh_symtab = symtab[i];
373 sym_strtab = strtab[shdr[i].sh_link];
374 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[0][0]); j++) {
375 Elf32_Sym *sym;
376 const char *name;
377 sym = &symtab[i][j];
378 name = sym_name(sym_strtab, sym);
379 if (sym->st_shndx != SHN_ABS) {
380 continue;
381 }
382 printf("%5d %08x %5d %10s %10s %12s %s\n",
383 j, sym->st_value, sym->st_size,
384 sym_type(ELF32_ST_TYPE(sym->st_info)),
385 sym_bind(ELF32_ST_BIND(sym->st_info)),
386 sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
387 name);
388 }
389 }
390 printf("\n");
391}
392
393static void print_absolute_relocs(void)
394{
395 int i, printed = 0;
396
397 for(i = 0; i < ehdr.e_shnum; i++) {
398 char *sym_strtab;
399 Elf32_Sym *sh_symtab;
400 unsigned sec_applies, sec_symtab;
401 int j;
402 if (shdr[i].sh_type != SHT_REL) {
403 continue;
404 }
405 sec_symtab = shdr[i].sh_link;
406 sec_applies = shdr[i].sh_info;
407 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
408 continue;
409 }
410 sh_symtab = symtab[sec_symtab];
411 sym_strtab = strtab[shdr[sec_symtab].sh_link];
412 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
413 Elf32_Rel *rel;
414 Elf32_Sym *sym;
415 const char *name;
416 rel = &reltab[i][j];
417 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
418 name = sym_name(sym_strtab, sym);
419 if (sym->st_shndx != SHN_ABS) {
420 continue;
421 }
422
423			/* Absolute symbols are not relocated if bzImage is
424			 * loaded at an address other than the one it was
425			 * compiled for. Display a warning to the user at
426			 * compile time about the absolute relocations present.
427			 *
428			 * The user needs to audit the code to make sure
429			 * symbols which should have been section relative
430			 * have not become absolute because of some linker
431			 * optimization or incorrect programming usage.
432			 *
433			 * Before warning, check whether this absolute symbol
434			 * relocation is harmless.
435			 */
436 if (is_safe_abs_reloc(name))
437 continue;
438
439 if (!printed) {
440 printf("WARNING: Absolute relocations"
441 " present\n");
442 printf("Offset Info Type Sym.Value "
443 "Sym.Name\n");
444 printed = 1;
445 }
446
447 printf("%08x %08x %10s %08x %s\n",
448 rel->r_offset,
449 rel->r_info,
450 rel_type(ELF32_R_TYPE(rel->r_info)),
451 sym->st_value,
452 name);
453 }
454 }
455
456 if (printed)
457 printf("\n");
458}
459
460static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
461{
462 int i;
463 /* Walk through the relocations */
464 for(i = 0; i < ehdr.e_shnum; i++) {
465 char *sym_strtab;
466 Elf32_Sym *sh_symtab;
467 unsigned sec_applies, sec_symtab;
468 int j;
469 if (shdr[i].sh_type != SHT_REL) {
470 continue;
471 }
472 sec_symtab = shdr[i].sh_link;
473 sec_applies = shdr[i].sh_info;
474 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
475 continue;
476 }
477 sh_symtab = symtab[sec_symtab];
478 sym_strtab = strtab[shdr[sec_symtab].sh_link];
479 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
480 Elf32_Rel *rel;
481 Elf32_Sym *sym;
482 unsigned r_type;
483 rel = &reltab[i][j];
484 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
485 r_type = ELF32_R_TYPE(rel->r_info);
486 /* Don't visit relocations to absolute symbols */
487 if (sym->st_shndx == SHN_ABS) {
488 continue;
489 }
490 if (r_type == R_386_PC32) {
491 /* PC relative relocations don't need to be adjusted */
492 }
493 else if (r_type == R_386_32) {
494 /* Visit relocations that need to be adjusted */
495 visit(rel, sym);
496 }
497 else {
498 die("Unsupported relocation type: %d\n", r_type);
499 }
500 }
501 }
502}
503
504static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
505{
506 reloc_count += 1;
507}
508
509static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
510{
511 /* Remember the address that needs to be adjusted. */
512 relocs[reloc_idx++] = rel->r_offset;
513}
514
515static int cmp_relocs(const void *va, const void *vb)
516{
517 const unsigned long *a, *b;
518 a = va; b = vb;
519 return (*a == *b)? 0 : (*a > *b)? 1 : -1;
520}
521
522static void emit_relocs(int as_text)
523{
524 int i;
525 /* Count how many relocations I have and allocate space for them. */
526 reloc_count = 0;
527 walk_relocs(count_reloc);
528 relocs = malloc(reloc_count * sizeof(relocs[0]));
529 if (!relocs) {
530 die("malloc of %d entries for relocs failed\n",
531 reloc_count);
532 }
533 /* Collect up the relocations */
534 reloc_idx = 0;
535 walk_relocs(collect_reloc);
536
537 /* Order the relocations for more efficient processing */
538 qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
539
540 /* Print the relocations */
541 if (as_text) {
542		/* Print the relocations in a form suitable for
543		 * consumption by gas.
544 */
545 printf(".section \".data.reloc\",\"a\"\n");
546 printf(".balign 4\n");
547 for(i = 0; i < reloc_count; i++) {
548 printf("\t .long 0x%08lx\n", relocs[i]);
549 }
550 printf("\n");
551 }
552 else {
553 unsigned char buf[4];
554 buf[0] = buf[1] = buf[2] = buf[3] = 0;
555 /* Print a stop */
556 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
557 /* Now print each relocation */
558 for(i = 0; i < reloc_count; i++) {
559 buf[0] = (relocs[i] >> 0) & 0xff;
560 buf[1] = (relocs[i] >> 8) & 0xff;
561 buf[2] = (relocs[i] >> 16) & 0xff;
562 buf[3] = (relocs[i] >> 24) & 0xff;
563 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
564 }
565 }
566}
567
568static void usage(void)
569{
570 die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n");
571}
572
573int main(int argc, char **argv)
574{
575 int show_absolute_syms, show_absolute_relocs;
576 int as_text;
577 const char *fname;
578 FILE *fp;
579 int i;
580
581 show_absolute_syms = 0;
582 show_absolute_relocs = 0;
583 as_text = 0;
584 fname = NULL;
585 for(i = 1; i < argc; i++) {
586 char *arg = argv[i];
587 if (*arg == '-') {
588 if (strcmp(argv[1], "--abs-syms") == 0) {
589 show_absolute_syms = 1;
590 continue;
591 }
592
593 if (strcmp(argv[1], "--abs-relocs") == 0) {
594 show_absolute_relocs = 1;
595 continue;
596 }
597 else if (strcmp(argv[1], "--text") == 0) {
598 as_text = 1;
599 continue;
600 }
601 }
602 else if (!fname) {
603 fname = arg;
604 continue;
605 }
606 usage();
607 }
608 if (!fname) {
609 usage();
610 }
611 fp = fopen(fname, "r");
612 if (!fp) {
613 die("Cannot open %s: %s\n",
614 fname, strerror(errno));
615 }
616 read_ehdr(fp);
617 read_shdrs(fp);
618 read_strtabs(fp);
619 read_symtabs(fp);
620 read_relocs(fp);
621 if (show_absolute_syms) {
622 print_absolute_symbols();
623 return 0;
624 }
625 if (show_absolute_relocs) {
626 print_absolute_relocs();
627 return 0;
628 }
629 emit_relocs(as_text);
630 return 0;
631}
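Without --text, emit_relocs() writes a plain binary stream to stdout: a four-zero-byte stop record followed by one little-endian 32-bit offset per relocation, sorted ascending. The reader below is a hypothetical host-side consumer of that stream, shown only to illustrate the format; it is not part of the build.

#include <stdint.h>
#include <stdio.h>

static uint32_t read_le32(FILE *fp, int *ok)
{
        unsigned char b[4];

        *ok = (fread(b, 1, 4, fp) == 4);
        if (!*ok)
                return 0;
        return b[0] | (b[1] << 8) | ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}

int main(int argc, char **argv)
{
        FILE *fp;
        int ok;

        if (argc != 2)
                return 1;
        fp = fopen(argv[1], "rb");
        if (!fp)
                return 1;

        (void)read_le32(fp, &ok);               /* leading four-zero-byte stop */
        while (ok) {
                uint32_t off = read_le32(fp, &ok);
                if (ok)
                        printf("reloc at 0x%08x\n", (unsigned)off);
        }
        fclose(fp);
        return 0;
}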
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
new file mode 100644
index 000000000000..cc4854f6c6c1
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux_32.lds
@@ -0,0 +1,43 @@
1OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
2OUTPUT_ARCH(i386)
3ENTRY(startup_32)
4SECTIONS
5{
6 /* Be careful parts of head.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0 ;
10 .text.head : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 }
15 .data.compressed : {
16 *(.data.compressed)
17 }
18 .text : {
19 _text = .; /* Text */
20 *(.text)
21 *(.text.*)
22 _etext = . ;
23 }
24 .rodata : {
25 _rodata = . ;
26 *(.rodata) /* read-only data */
27 *(.rodata.*)
28 _erodata = . ;
29 }
30 .data : {
31 _data = . ;
32 *(.data)
33 *(.data.*)
34 _edata = . ;
35 }
36 .bss : {
37 _bss = . ;
38 *(.bss)
39 *(.bss.*)
40 *(COMMON)
41 _end = . ;
42 }
43}
diff --git a/arch/x86/boot/compressed/vmlinux_32.scr b/arch/x86/boot/compressed/vmlinux_32.scr
new file mode 100644
index 000000000000..707a88f7f29e
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux_32.scr
@@ -0,0 +1,10 @@
1SECTIONS
2{
3 .data.compressed : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 output_len = . - 4;
8 input_data_end = .;
9 }
10}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux_64.lds
new file mode 100644
index 000000000000..94c13e557fb4
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux_64.lds
@@ -0,0 +1,44 @@
1OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
2OUTPUT_ARCH(i386:x86-64)
3ENTRY(startup_64)
4SECTIONS
5{
6 /* Be careful parts of head.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0;
10 .text : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 *(.text.compressed)
15 _text = .; /* Text */
16 *(.text)
17 *(.text.*)
18 _etext = . ;
19 }
20 .rodata : {
21 _rodata = . ;
22 *(.rodata) /* read-only data */
23 *(.rodata.*)
24 _erodata = . ;
25 }
26 .data : {
27 _data = . ;
28 *(.data)
29 *(.data.*)
30 _edata = . ;
31 }
32 .bss : {
33 _bss = . ;
34 *(.bss)
35 *(.bss.*)
36 *(COMMON)
37 . = ALIGN(8);
38 _end = . ;
39 . = ALIGN(4096);
40 pgtable = . ;
41 . = . + 4096 * 6;
42 _heap = .;
43 }
44}
diff --git a/arch/x86/boot/compressed/vmlinux_64.scr b/arch/x86/boot/compressed/vmlinux_64.scr
new file mode 100644
index 000000000000..bd1429ce193e
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux_64.scr
@@ -0,0 +1,10 @@
1SECTIONS
2{
3 .text.compressed : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 output_len = . - 4;
8 input_data_end = .;
9 }
10}
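Both .scr scripts wrap the piggybacked gzip image and define input_len, input_data, input_data_end and output_len around it; output_len = . - 4 lands on the gzip ISIZE trailer, the little-endian uncompressed length (mod 2^32) that head_*.S uses for its buffer sizing. A small host-side sketch that reads the same trailer field from any .gz file:

#include <stdint.h>
#include <stdio.h>

int main(int argc, char **argv)
{
        unsigned char b[4];
        uint32_t isize;
        FILE *fp;

        if (argc != 2)
                return 1;
        fp = fopen(argv[1], "rb");
        if (!fp)
                return 1;
        /* The last four bytes of a gzip stream are ISIZE, little-endian. */
        if (fseek(fp, -4, SEEK_END) != 0 || fread(b, 1, 4, fp) != 4) {
                fclose(fp);
                return 1;
        }
        isize = b[0] | (b[1] << 8) | ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
        printf("uncompressed size: %u bytes\n", (unsigned)isize);
        fclose(fp);
        return 0;
}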
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
new file mode 100644
index 000000000000..ef127e56a3cf
--- /dev/null
+++ b/arch/x86/boot/copy.S
@@ -0,0 +1,101 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/copy.S
13 *
14 * Memory copy routines
15 */
16
17 .code16gcc
18 .text
19
20 .globl memcpy
21 .type memcpy, @function
22memcpy:
23 pushw %si
24 pushw %di
25 movw %ax, %di
26 movw %dx, %si
27 pushw %cx
28 shrw $2, %cx
29 rep; movsl
30 popw %cx
31 andw $3, %cx
32 rep; movsb
33 popw %di
34 popw %si
35 ret
36 .size memcpy, .-memcpy
37
38 .globl memset
39 .type memset, @function
40memset:
41 pushw %di
42 movw %ax, %di
43 movzbl %dl, %eax
44 imull $0x01010101,%eax
45 pushw %cx
46 shrw $2, %cx
47 rep; stosl
48 popw %cx
49 andw $3, %cx
50 rep; stosb
51 popw %di
52 ret
53 .size memset, .-memset
54
55 .globl copy_from_fs
56 .type copy_from_fs, @function
57copy_from_fs:
58 pushw %ds
59 pushw %fs
60 popw %ds
61 call memcpy
62 popw %ds
63 ret
64 .size copy_from_fs, .-copy_from_fs
65
66 .globl copy_to_fs
67 .type copy_to_fs, @function
68copy_to_fs:
69 pushw %es
70 pushw %fs
71 popw %es
72 call memcpy
73 popw %es
74 ret
75 .size copy_to_fs, .-copy_to_fs
76
77#if 0 /* Not currently used, but can be enabled as needed */
78
79 .globl copy_from_gs
80 .type copy_from_gs, @function
81copy_from_gs:
82 pushw %ds
83 pushw %gs
84 popw %ds
85 call memcpy
86 popw %ds
87 ret
88 .size copy_from_gs, .-copy_from_gs
89 .globl copy_to_gs
90
91 .type copy_to_gs, @function
92copy_to_gs:
93 pushw %es
94 pushw %gs
95 popw %es
96 call memcpy
97 popw %es
98 ret
99 .size copy_to_gs, .-copy_to_gs
100
101#endif
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
new file mode 100644
index 000000000000..2a5c32da5852
--- /dev/null
+++ b/arch/x86/boot/cpu.c
@@ -0,0 +1,69 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/cpu.c
13 *
14 * Check for obligatory CPU features and abort if the features are not
15 * present.
16 */
17
18#include "boot.h"
19#include "bitops.h"
20#include <asm/cpufeature.h>
21
22static char *cpu_name(int level)
23{
24 static char buf[6];
25
26 if (level == 64) {
27 return "x86-64";
28 } else {
29 sprintf(buf, "i%d86", level);
30 return buf;
31 }
32}
33
34int validate_cpu(void)
35{
36 u32 *err_flags;
37 int cpu_level, req_level;
38
39 check_cpu(&cpu_level, &req_level, &err_flags);
40
41 if (cpu_level < req_level) {
42 printf("This kernel requires an %s CPU, ",
43 cpu_name(req_level));
44 printf("but only detected an %s CPU.\n",
45 cpu_name(cpu_level));
46 return -1;
47 }
48
49 if (err_flags) {
50 int i, j;
51 puts("This kernel requires the following features "
52 "not present on the CPU:\n");
53
54 for (i = 0; i < NCAPINTS; i++) {
55 u32 e = err_flags[i];
56
57 for (j = 0; j < 32; j++) {
58 if (e & 1)
59 printf("%d:%d ", i, j);
60
61 e >>= 1;
62 }
63 }
64 putchar('\n');
65 return -1;
66 } else {
67 return 0;
68 }
69}
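The "%d:%d" pairs printed by validate_cpu() name missing features by cpufeature word and bit; word 0 corresponds to CPUID.1:EDX, so 0:25 and 0:26 would be SSE and SSE2. The toy decoder below uses a deliberately partial, illustrative name table; the mapping is an assumption for illustration, not lifted from cpufeature.h in this patch.

#include <stdio.h>

/* Illustrative (and intentionally incomplete) word:bit -> name mapping. */
static const char *feature_name(int word, int bit)
{
        if (word == 0 && bit == 0)  return "fpu";
        if (word == 0 && bit == 6)  return "pae";
        if (word == 0 && bit == 25) return "sse";
        if (word == 0 && bit == 26) return "sse2";
        return "?";
}

int main(void)
{
        /* e.g. the kernel printed "0:25 0:26 " */
        int missing[][2] = { {0, 25}, {0, 26} };

        for (unsigned i = 0; i < sizeof(missing) / sizeof(missing[0]); i++)
                printf("%d:%d -> %s\n", missing[i][0], missing[i][1],
                       feature_name(missing[i][0], missing[i][1]));
        return 0;
}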
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
new file mode 100644
index 000000000000..e655a89c5510
--- /dev/null
+++ b/arch/x86/boot/cpucheck.c
@@ -0,0 +1,268 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/cpucheck.c
13 *
14 * Check for obligatory CPU features and abort if the features are not
15 * present. This code should be compilable as 16-, 32- or 64-bit
16 * code, so be very careful with types and inline assembly.
17 *
18 * This code should not contain any messages; that requires an
19 * additional wrapper.
20 *
21 * As written, this code is not safe for inclusion into the kernel
22 * proper (after FPU initialization, in particular).
23 */
24
25#ifdef _SETUP
26# include "boot.h"
27# include "bitops.h"
28#endif
29#include <linux/types.h>
30#include <asm/cpufeature.h>
31#include <asm/processor-flags.h>
32#include <asm/required-features.h>
33#include <asm/msr-index.h>
34
35struct cpu_features {
36 int level; /* Family, or 64 for x86-64 */
37 int model;
38 u32 flags[NCAPINTS];
39};
40
41static struct cpu_features cpu;
42static u32 cpu_vendor[3];
43static u32 err_flags[NCAPINTS];
44
45#ifdef CONFIG_X86_64
46static const int req_level = 64;
47#elif defined(CONFIG_X86_MINIMUM_CPU_FAMILY)
48static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY;
49#else
50static const int req_level = 3;
51#endif
52
53static const u32 req_flags[NCAPINTS] =
54{
55 REQUIRED_MASK0,
56 REQUIRED_MASK1,
57 REQUIRED_MASK2,
58 REQUIRED_MASK3,
59 REQUIRED_MASK4,
60 REQUIRED_MASK5,
61 REQUIRED_MASK6,
62 REQUIRED_MASK7,
63};
64
65#define A32(a,b,c,d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
66
67static int is_amd(void)
68{
69 return cpu_vendor[0] == A32('A','u','t','h') &&
70 cpu_vendor[1] == A32('e','n','t','i') &&
71 cpu_vendor[2] == A32('c','A','M','D');
72}
73
74static int is_centaur(void)
75{
76 return cpu_vendor[0] == A32('C','e','n','t') &&
77 cpu_vendor[1] == A32('a','u','r','H') &&
78 cpu_vendor[2] == A32('a','u','l','s');
79}
80
81static int is_transmeta(void)
82{
83 return cpu_vendor[0] == A32('G','e','n','u') &&
84 cpu_vendor[1] == A32('i','n','e','T') &&
85 cpu_vendor[2] == A32('M','x','8','6');
86}
87
88static int has_fpu(void)
89{
90 u16 fcw = -1, fsw = -1;
91 u32 cr0;
92
93 asm("movl %%cr0,%0" : "=r" (cr0));
94 if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
95 cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
96 asm volatile("movl %0,%%cr0" : : "r" (cr0));
97 }
98
99 asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
100 : "+m" (fsw), "+m" (fcw));
101
102 return fsw == 0 && (fcw & 0x103f) == 0x003f;
103}
104
105static int has_eflag(u32 mask)
106{
107 u32 f0, f1;
108
109 asm("pushfl ; "
110 "pushfl ; "
111 "popl %0 ; "
112 "movl %0,%1 ; "
113 "xorl %2,%1 ; "
114 "pushl %1 ; "
115 "popfl ; "
116 "pushfl ; "
117 "popl %1 ; "
118 "popfl"
119 : "=&r" (f0), "=&r" (f1)
120 : "ri" (mask));
121
122 return !!((f0^f1) & mask);
123}
124
125static void get_flags(void)
126{
127 u32 max_intel_level, max_amd_level;
128 u32 tfms;
129
130 if (has_fpu())
131 set_bit(X86_FEATURE_FPU, cpu.flags);
132
133 if (has_eflag(X86_EFLAGS_ID)) {
134 asm("cpuid"
135 : "=a" (max_intel_level),
136 "=b" (cpu_vendor[0]),
137 "=d" (cpu_vendor[1]),
138 "=c" (cpu_vendor[2])
139 : "a" (0));
140
141 if (max_intel_level >= 0x00000001 &&
142 max_intel_level <= 0x0000ffff) {
143 asm("cpuid"
144 : "=a" (tfms),
145 "=c" (cpu.flags[4]),
146 "=d" (cpu.flags[0])
147 : "a" (0x00000001)
148 : "ebx");
149 cpu.level = (tfms >> 8) & 15;
150 cpu.model = (tfms >> 4) & 15;
151 if (cpu.level >= 6)
152 cpu.model += ((tfms >> 16) & 0xf) << 4;
153 }
154
155 asm("cpuid"
156 : "=a" (max_amd_level)
157 : "a" (0x80000000)
158 : "ebx", "ecx", "edx");
159
160 if (max_amd_level >= 0x80000001 &&
161 max_amd_level <= 0x8000ffff) {
162 u32 eax = 0x80000001;
163 asm("cpuid"
164 : "+a" (eax),
165 "=c" (cpu.flags[6]),
166 "=d" (cpu.flags[1])
167 : : "ebx");
168 }
169 }
170}
171
172/* Returns a bitmask of which words we have error bits in */
173static int check_flags(void)
174{
175 u32 err;
176 int i;
177
178 err = 0;
179 for (i = 0; i < NCAPINTS; i++) {
180 err_flags[i] = req_flags[i] & ~cpu.flags[i];
181 if (err_flags[i])
182 err |= 1 << i;
183 }
184
185 return err;
186}
187
188/*
189 * Returns -1 on error.
190 *
191 * *cpu_level is set to the current CPU level; *req_level to the required
192 * level. x86-64 is considered level 64 for this purpose.
193 *
194 * *err_flags_ptr is set to the flags error array if there are flags missing.
195 */
196int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
197{
198 int err;
199
200 memset(&cpu.flags, 0, sizeof cpu.flags);
201 cpu.level = 3;
202
203 if (has_eflag(X86_EFLAGS_AC))
204 cpu.level = 4;
205
206 get_flags();
207 err = check_flags();
208
209 if (test_bit(X86_FEATURE_LM, cpu.flags))
210 cpu.level = 64;
211
212 if (err == 0x01 &&
213 !(err_flags[0] &
214 ~((1 << X86_FEATURE_XMM)|(1 << X86_FEATURE_XMM2))) &&
215 is_amd()) {
216 /* If this is an AMD and we're only missing SSE+SSE2, try to
217 turn them on */
218
219 u32 ecx = MSR_K7_HWCR;
220 u32 eax, edx;
221
222 asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
223 eax &= ~(1 << 15);
224 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
225
226 get_flags(); /* Make sure it really did something */
227 err = check_flags();
228 } else if (err == 0x01 &&
229 !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) &&
230 is_centaur() && cpu.model >= 6) {
231 /* If this is a VIA C3, we might have to enable CX8
232 explicitly */
233
234 u32 ecx = MSR_VIA_FCR;
235 u32 eax, edx;
236
237 asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
238 eax |= (1<<1)|(1<<7);
239 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
240
241 set_bit(X86_FEATURE_CX8, cpu.flags);
242 err = check_flags();
243 } else if (err == 0x01 && is_transmeta()) {
244 /* Transmeta might have masked feature bits in word 0 */
245
246 u32 ecx = 0x80860004;
247 u32 eax, edx;
248 u32 level = 1;
249
250 asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
251 asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx));
252 asm("cpuid"
253 : "+a" (level), "=d" (cpu.flags[0])
254 : : "ecx", "ebx");
255 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
256
257 err = check_flags();
258 }
259
260 if (err_flags_ptr)
261 *err_flags_ptr = err ? err_flags : NULL;
262 if (cpu_level_ptr)
263 *cpu_level_ptr = cpu.level;
264 if (req_level_ptr)
265 *req_level_ptr = req_level;
266
267 return (cpu.level < req_level || err) ? -1 : 0;
268}
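get_flags() decodes the family and model from CPUID leaf 1's EAX (tfms): base family in bits 11:8, base model in bits 7:4, and for family >= 6 the extended model from bits 19:16 shifted in on top. A worked example using a hypothetical signature value instead of a real CPUID instruction:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t tfms = 0x000106a5;           /* hypothetical CPUID.1:EAX  */
        unsigned level = (tfms >> 8) & 15;    /* base family -> 6          */
        unsigned model = (tfms >> 4) & 15;    /* base model  -> 0xa        */

        if (level >= 6)
                model += ((tfms >> 16) & 0xf) << 4;   /* extended model -> 0x1a */
        printf("family %u, model 0x%02x\n", level, model);
        return 0;
}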
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
new file mode 100644
index 000000000000..bd138e442ec2
--- /dev/null
+++ b/arch/x86/boot/edd.c
@@ -0,0 +1,167 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/edd.c
13 *
14 * Get EDD BIOS disk information
15 */
16
17#include "boot.h"
18#include <linux/edd.h>
19
20#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
21
22/*
23 * Read the MBR (first sector) from a specific device.
24 */
25static int read_mbr(u8 devno, void *buf)
26{
27 u16 ax, bx, cx, dx;
28
29 ax = 0x0201; /* Legacy Read, one sector */
30 cx = 0x0001; /* Sector 0-0-1 */
31 dx = devno;
32 bx = (size_t)buf;
33 asm volatile("pushfl; stc; int $0x13; setc %%al; popfl"
34 : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
35 : : "esi", "edi", "memory");
36
37 return -(u8)ax; /* 0 or -1 */
38}
39
40static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
41{
42 int sector_size;
43 char *mbrbuf_ptr, *mbrbuf_end;
44 u32 buf_base, mbr_base;
45 extern char _end[];
46
47 sector_size = ei->params.bytes_per_sector;
48 if (!sector_size)
49 sector_size = 512; /* Best available guess */
50
51 /* Produce a naturally aligned buffer on the heap */
52 buf_base = (ds() << 4) + (u32)&_end;
53 mbr_base = (buf_base+sector_size-1) & ~(sector_size-1);
54 mbrbuf_ptr = _end + (mbr_base-buf_base);
55 mbrbuf_end = mbrbuf_ptr + sector_size;
56
57 /* Make sure we actually have space on the heap... */
58 if (!(boot_params.hdr.loadflags & CAN_USE_HEAP))
59 return -1;
60 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
61 return -1;
62
63 if (read_mbr(devno, mbrbuf_ptr))
64 return -1;
65
66 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
67 return 0;
68}
69
70static int get_edd_info(u8 devno, struct edd_info *ei)
71{
72 u16 ax, bx, cx, dx, di;
73
74 memset(ei, 0, sizeof *ei);
75
76 /* Check Extensions Present */
77
78 ax = 0x4100;
79 bx = EDDMAGIC1;
80 dx = devno;
81 asm("pushfl; stc; int $0x13; setc %%al; popfl"
82 : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx)
83 : : "esi", "edi");
84
85 if ((u8)ax)
86 return -1; /* No extended information */
87
88 if (bx != EDDMAGIC2)
89 return -1;
90
91 ei->device = devno;
92 ei->version = ax >> 8; /* EDD version number */
93 ei->interface_support = cx; /* EDD functionality subsets */
94
95 /* Extended Get Device Parameters */
96
97 ei->params.length = sizeof(ei->params);
98 ax = 0x4800;
99 dx = devno;
100 asm("pushfl; int $0x13; popfl"
101 : "+a" (ax), "+d" (dx), "=m" (ei->params)
102 : "S" (&ei->params)
103 : "ebx", "ecx", "edi");
104
105 /* Get legacy CHS parameters */
106
107 /* Ralf Brown recommends setting ES:DI to 0:0 */
108 ax = 0x0800;
109 dx = devno;
110 di = 0;
111 asm("pushw %%es; "
112 "movw %%di,%%es; "
113 "pushfl; stc; int $0x13; setc %%al; popfl; "
114 "popw %%es"
115 : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di)
116 : : "esi");
117
118 if ((u8)ax == 0) {
119 ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
120 ei->legacy_max_head = dx >> 8;
121 ei->legacy_sectors_per_track = cx & 0x3f;
122 }
123
124 return 0;
125}
126
127void query_edd(void)
128{
129 char eddarg[8];
130 int do_mbr = 1;
131 int do_edd = 1;
132 int devno;
133 struct edd_info ei, *edp;
134 u32 *mbrptr;
135
136 if (cmdline_find_option("edd", eddarg, sizeof eddarg) > 0) {
137 if (!strcmp(eddarg, "skipmbr") || !strcmp(eddarg, "skip"))
138 do_mbr = 0;
139 else if (!strcmp(eddarg, "off"))
140 do_edd = 0;
141 }
142
143 edp = boot_params.eddbuf;
144 mbrptr = boot_params.edd_mbr_sig_buffer;
145
146 if (!do_edd)
147 return;
148
149 for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) {
150 /*
151 * Scan the BIOS-supported hard disks and query EDD
152 * information...
153 */
154 get_edd_info(devno, &ei);
155
156 if (boot_params.eddbuf_entries < EDDMAXNR) {
157 memcpy(edp, &ei, sizeof ei);
158 edp++;
159 boot_params.eddbuf_entries++;
160 }
161
162 if (do_mbr && !read_mbr_sig(devno, &ei, mbrptr++))
163 boot_params.edd_mbr_sig_buf_entries = devno-0x80+1;
164 }
165}
166
167#endif
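The buffer setup in read_mbr_sig() depends on sector_size being a power of two: buf_base is rounded up to the next sector boundary with the usual (x + size - 1) & ~(size - 1) expression. A minimal stand-alone sketch of that rounding, with hypothetical values rather than the real heap layout:

    #include <stdio.h>

    int main(void)
    {
            unsigned int sector_size = 512;    /* power of two, as the code assumes */
            unsigned int buf_base = 0x12345;   /* hypothetical (ds() << 4) + _end */
            unsigned int mbr_base;

            /* Round up to the next sector boundary, as in read_mbr_sig() */
            mbr_base = (buf_base + sector_size - 1) & ~(sector_size - 1);

            printf("buf_base=%#x -> mbr_base=%#x\n", buf_base, mbr_base); /* 0x12400 */
            return 0;
    }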
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
new file mode 100644
index 000000000000..f3140e596d40
--- /dev/null
+++ b/arch/x86/boot/header.S
@@ -0,0 +1,283 @@
1/*
2 * header.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Based on bootsect.S and setup.S
7 * modified by more people than can be counted
8 *
9 * Rewritten as a common file by H. Peter Anvin (Apr 2007)
10 *
11 * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment
12 * addresses must be multiplied by 16 to obtain their respective linear
13 * addresses. To avoid confusion, linear addresses are written using leading
14 * hex while segment addresses are written as segment:offset.
15 *
16 */
17
18#include <asm/segment.h>
19#include <linux/utsrelease.h>
20#include <asm/boot.h>
21#include <asm/e820.h>
22#include <asm/page.h>
23#include <asm/setup.h>
24#include "boot.h"
25
26SETUPSECTS = 4 /* default nr of setup-sectors */
27BOOTSEG = 0x07C0 /* original address of boot-sector */
28SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
29SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
30 /* to be loaded */
31ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
32SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
33
34#ifndef SVGA_MODE
35#define SVGA_MODE ASK_VGA
36#endif
37
38#ifndef RAMDISK
39#define RAMDISK 0
40#endif
41
42#ifndef ROOT_RDONLY
43#define ROOT_RDONLY 1
44#endif
45
46 .code16
47 .section ".bstext", "ax"
48
49 .global bootsect_start
50bootsect_start:
51
52 # Normalize the start address
53 ljmp $BOOTSEG, $start2
54
55start2:
56 movw %cs, %ax
57 movw %ax, %ds
58 movw %ax, %es
59 movw %ax, %ss
60 xorw %sp, %sp
61 sti
62 cld
63
64 movw $bugger_off_msg, %si
65
66msg_loop:
67 lodsb
68 andb %al, %al
69 jz bs_die
70 movb $0xe, %ah
71 movw $7, %bx
72 int $0x10
73 jmp msg_loop
74
75bs_die:
76 # Allow the user to press a key, then reboot
77 xorw %ax, %ax
78 int $0x16
79 int $0x19
80
81 # int 0x19 should never return. In case it does anyway,
82 # invoke the BIOS reset code...
83 ljmp $0xf000,$0xfff0
84
85 .section ".bsdata", "a"
86bugger_off_msg:
87 .ascii "Direct booting from floppy is no longer supported.\r\n"
88 .ascii "Please use a boot loader program instead.\r\n"
89 .ascii "\n"
90 .ascii "Remove disk and press any key to reboot . . .\r\n"
91 .byte 0
92
93
94 # Kernel attributes; used by setup. This is part 1 of the
95 # header, from the old boot sector.
96
97 .section ".header", "a"
98 .globl hdr
99hdr:
100setup_sects: .byte SETUPSECTS
101root_flags: .word ROOT_RDONLY
102syssize: .long SYSSIZE
103ram_size: .word RAMDISK
104vid_mode: .word SVGA_MODE
105root_dev: .word ROOT_DEV
106boot_flag: .word 0xAA55
107
108 # offset 512, entry point
109
110 .globl _start
111_start:
112 # Explicitly enter this as bytes, or the assembler
113 # tries to generate a 3-byte jump here, which causes
114 # everything else to push off to the wrong offset.
115 .byte 0xeb # short (2-byte) jump
116 .byte start_of_setup-1f
1171:
118
119 # Part 2 of the header, from the old setup.S
120
121 .ascii "HdrS" # header signature
122		.word	0x0206		# header version number (>= 0x0105,
123					# or else old loadlin-1.5 will fail)
124 .globl realmode_swtch
125realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
126start_sys_seg: .word SYSSEG
127 .word kernel_version-512 # pointing to kernel version string
128 # above section of header is compatible
129 # with loadlin-1.5 (header v1.5). Don't
130 # change it.
131
132type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin,
133 # Bootlin, SYSLX, bootsect...)
134 # See Documentation/i386/boot.txt for
135 # assigned ids
136
137# flags, unused bits must be zero (RFU) bit within loadflags
138loadflags:
139LOADED_HIGH = 1 # If set, the kernel is loaded high
140CAN_USE_HEAP = 0x80 # If set, the loader also has set
141 # heap_end_ptr to tell how much
142 # space behind setup.S can be used for
143 # heap purposes.
144 # Only the loader knows what is free
145#ifndef __BIG_KERNEL__
146 .byte 0
147#else
148 .byte LOADED_HIGH
149#endif
150
151setup_move_size: .word 0x8000 # size to move, when setup is not
152 # loaded at 0x90000. We will move setup
153 # to 0x90000 then just before jumping
154 # into the kernel. However, only the
155 # loader knows how much data behind
156 # us also needs to be loaded.
157
158code32_start: # here loaders can put a different
159 # start address for 32-bit code.
160#ifndef __BIG_KERNEL__
161 .long 0x1000 # 0x1000 = default for zImage
162#else
163 .long 0x100000 # 0x100000 = default for big kernel
164#endif
165
166ramdisk_image: .long 0 # address of loaded ramdisk image
167 # Here the loader puts the 32-bit
168 # address where it loaded the image.
169 # This only will be read by the kernel.
170
171ramdisk_size: .long 0 # its size in bytes
172
173bootsect_kludge:
174 .long 0 # obsolete
175
176heap_end_ptr: .word _end+1024 # (Header version 0x0201 or later)
177 # space from here (exclusive) down to
178 # end of setup code can be used by setup
179 # for local heap purposes.
180
181pad1: .word 0
182cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
183 # If nonzero, a 32-bit pointer
184 # to the kernel command line.
185 # The command line should be
186 # located between the start of
187 # setup and the end of low
188 # memory (0xa0000), or it may
189 # get overwritten before it
190 # gets read. If this field is
191 # used, there is no longer
192 # anything magical about the
193 # 0x90000 segment; the setup
194 # can be located anywhere in
195 # low memory 0x10000 or higher.
196
197ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff
198 # (Header version 0x0203 or later)
199 # The highest safe address for
200 # the contents of an initrd
201
202kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
203 #required for protected mode
204 #kernel
205#ifdef CONFIG_RELOCATABLE
206relocatable_kernel: .byte 1
207#else
208relocatable_kernel: .byte 0
209#endif
210pad2: .byte 0
211pad3: .word 0
212
213cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
214 #added with boot protocol
215 #version 2.06
216
217# End of setup header #####################################################
218
219 .section ".inittext", "ax"
220start_of_setup:
221#ifdef SAFE_RESET_DISK_CONTROLLER
222# Reset the disk controller.
223 movw $0x0000, %ax # Reset disk controller
224 movb $0x80, %dl # All disks
225 int $0x13
226#endif
227
228# We will have entered with %cs = %ds+0x20, normalize %cs so
229# it is on par with the other segments.
230 pushw %ds
231 pushw $setup2
232 lretw
233
234setup2:
235# Force %es = %ds
236 movw %ds, %ax
237 movw %ax, %es
238 cld
239
240# Stack paranoia: align the stack and make sure it is good
241# for both 16- and 32-bit references. In particular, if we
242# were meant to have been using the full 16-bit segment, the
243# caller might have set %sp to zero, which breaks %esp-based
244# references.
245 andw $~3, %sp # dword align (might as well...)
246 jnz 1f
247 movw $0xfffc, %sp # Make sure we're not zero
2481: movzwl %sp, %esp # Clear upper half of %esp
249 sti
250
251# Check signature at end of setup
252 cmpl $0x5a5aaa55, setup_sig
253 jne setup_bad
254
255# Zero the bss
256 movw $__bss_start, %di
257 movw $_end+3, %cx
258 xorl %eax, %eax
259 subw %di, %cx
260 shrw $2, %cx
261 rep; stosl
262
263# Jump to C code (should not return)
264 calll main
265
266# Setup corrupt somehow...
267setup_bad:
268 movl $setup_corrupt, %eax
269 calll puts
270 # Fall through...
271
272 .globl die
273 .type die, @function
274die:
275 hlt
276 jmp die
277
278 .size die, .-die
279
280 .section ".initdata", "a"
281setup_corrupt:
282 .byte 7
283 .string "No setup signature found...\n"
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
new file mode 100644
index 000000000000..88d77761d01b
--- /dev/null
+++ b/arch/x86/boot/install.sh
@@ -0,0 +1,61 @@
1#!/bin/sh
2#
3# arch/i386/boot/install.sh
4#
5# This file is subject to the terms and conditions of the GNU General Public
6# License. See the file "COPYING" in the main directory of this archive
7# for more details.
8#
9# Copyright (C) 1995 by Linus Torvalds
10#
11# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin
12#
13# "make install" script for i386 architecture
14#
15# Arguments:
16# $1 - kernel version
17# $2 - kernel image file
18# $3 - kernel map file
19# $4 - default install path (blank if root directory)
20#
21
22verify () {
23 if [ ! -f "$1" ]; then
24 echo "" 1>&2
25 echo " *** Missing file: $1" 1>&2
26 echo ' *** You need to run "make" before "make install".' 1>&2
27 echo "" 1>&2
28 exit 1
29 fi
30}
31
32# Make sure the files actually exist
33verify "$2"
34verify "$3"
35
36# User may have a custom install script
37
38if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
39if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
40
41# Default install - same as make zlilo
42
43if [ -f $4/vmlinuz ]; then
44 mv $4/vmlinuz $4/vmlinuz.old
45fi
46
47if [ -f $4/System.map ]; then
48 mv $4/System.map $4/System.old
49fi
50
51cat $2 > $4/vmlinuz
52cp $3 $4/System.map
53
54if [ -x /sbin/lilo ]; then
55 /sbin/lilo
56elif [ -x /etc/lilo/install ]; then
57 /etc/lilo/install
58else
59 sync
60 echo "Cannot find LILO."
61fi
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
new file mode 100644
index 000000000000..0eeef3989a17
--- /dev/null
+++ b/arch/x86/boot/main.c
@@ -0,0 +1,161 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/main.c
13 *
14 * Main module for the real-mode kernel code
15 */
16
17#include "boot.h"
18
19struct boot_params boot_params __attribute__((aligned(16)));
20
21char *HEAP = _end;
22char *heap_end = _end; /* Default end of heap = no heap */
23
24/*
25 * Copy the header into the boot parameter block. Since this
26 * screws up the old-style command line protocol, adjust by
27 * filling in the new-style command line pointer instead.
28 */
29#define OLD_CL_MAGIC 0xA33F
30#define OLD_CL_ADDRESS 0x20
31
32static void copy_boot_params(void)
33{
34 struct old_cmdline {
35 u16 cl_magic;
36 u16 cl_offset;
37 };
38 const struct old_cmdline * const oldcmd =
39 (const struct old_cmdline *)OLD_CL_ADDRESS;
40
41 BUILD_BUG_ON(sizeof boot_params != 4096);
42 memcpy(&boot_params.hdr, &hdr, sizeof hdr);
43
44 if (!boot_params.hdr.cmd_line_ptr &&
45 oldcmd->cl_magic == OLD_CL_MAGIC) {
46 /* Old-style command line protocol. */
47 u16 cmdline_seg;
48
49 /* Figure out if the command line falls in the region
50 of memory that an old kernel would have copied up
51 to 0x90000... */
52 if (oldcmd->cl_offset < boot_params.hdr.setup_move_size)
53 cmdline_seg = ds();
54 else
55 cmdline_seg = 0x9000;
56
57 boot_params.hdr.cmd_line_ptr =
58 (cmdline_seg << 4) + oldcmd->cl_offset;
59 }
60}
61
62/*
63 * Set the keyboard repeat rate to maximum. Unclear why this
64 * is done here; this might be possible to kill off as stale code.
65 */
66static void keyboard_set_repeat(void)
67{
68 u16 ax = 0x0305;
69 u16 bx = 0;
70 asm volatile("int $0x16"
71 : "+a" (ax), "+b" (bx)
72 : : "ecx", "edx", "esi", "edi");
73}
74
75/*
76 * Get Intel SpeedStep (IST) information.
77 */
78static void query_ist(void)
79{
80 asm("int $0x15"
81 : "=a" (boot_params.ist_info.signature),
82 "=b" (boot_params.ist_info.command),
83 "=c" (boot_params.ist_info.event),
84 "=d" (boot_params.ist_info.perf_level)
85 : "a" (0x0000e980), /* IST Support */
86 "d" (0x47534943)); /* Request value */
87}
88
89/*
90 * Tell the BIOS what CPU mode we intend to run in.
91 */
92static void set_bios_mode(void)
93{
94#ifdef CONFIG_X86_64
95 u32 eax, ebx;
96
97 eax = 0xec00;
98 ebx = 2;
99 asm volatile("int $0x15"
100 : "+a" (eax), "+b" (ebx)
101 : : "ecx", "edx", "esi", "edi");
102#endif
103}
104
105void main(void)
106{
107 /* First, copy the boot header into the "zeropage" */
108 copy_boot_params();
109
110 /* End of heap check */
111 if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
112 heap_end = (char *)(boot_params.hdr.heap_end_ptr
113 +0x200-STACK_SIZE);
114 } else {
115 /* Boot protocol 2.00 only, no heap available */
116 puts("WARNING: Ancient bootloader, some functionality "
117 "may be limited!\n");
118 }
119
120 /* Make sure we have all the proper CPU support */
121 if (validate_cpu()) {
122 puts("Unable to boot - please use a kernel appropriate "
123 "for your CPU.\n");
124 die();
125 }
126
127 /* Tell the BIOS what CPU mode we intend to run in. */
128 set_bios_mode();
129
130 /* Detect memory layout */
131 detect_memory();
132
133 /* Set keyboard repeat rate (why?) */
134 keyboard_set_repeat();
135
136 /* Set the video mode */
137 set_video();
138
139 /* Query MCA information */
140 query_mca();
141
142 /* Voyager */
143#ifdef CONFIG_X86_VOYAGER
144 query_voyager();
145#endif
146
147 /* Query Intel SpeedStep (IST) information */
148 query_ist();
149
150 /* Query APM information */
151#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
152 query_apm_bios();
153#endif
154
155 /* Query EDD information */
156#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
157 query_edd();
158#endif
159 /* Do the last things and invoke protected mode */
160 go_to_protected_mode();
161}
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
new file mode 100644
index 000000000000..68222f2d4b67
--- /dev/null
+++ b/arch/x86/boot/mca.c
@@ -0,0 +1,43 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/mca.c
13 *
14 * Get the MCA system description table
15 */
16
17#include "boot.h"
18
19int query_mca(void)
20{
21 u8 err;
22 u16 es, bx, len;
23
24 asm("pushw %%es ; "
25 "int $0x15 ; "
26 "setc %0 ; "
27 "movw %%es, %1 ; "
28 "popw %%es"
29 : "=acd" (err), "=acdSD" (es), "=b" (bx)
30 : "a" (0xc000));
31
32 if (err)
33 return -1; /* No MCA present */
34
35 set_fs(es);
36 len = rdfs16(bx);
37
38 if (len > sizeof(boot_params.sys_desc_table))
39 len = sizeof(boot_params.sys_desc_table);
40
41 copy_from_fs(&boot_params.sys_desc_table, bx, len);
42 return 0;
43}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
new file mode 100644
index 000000000000..378353956b5d
--- /dev/null
+++ b/arch/x86/boot/memory.c
@@ -0,0 +1,118 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/memory.c
13 *
14 * Memory detection code
15 */
16
17#include "boot.h"
18
19#define SMAP 0x534d4150 /* ASCII "SMAP" */
20
21static int detect_memory_e820(void)
22{
23 int count = 0;
24 u32 next = 0;
25 u32 size, id;
26 u8 err;
27 struct e820entry *desc = boot_params.e820_map;
28
29 do {
30 size = sizeof(struct e820entry);
31
32 /* Important: %edx is clobbered by some BIOSes,
33 so it must be either used for the error output
34 or explicitly marked clobbered. */
35 asm("int $0x15; setc %0"
36 : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
37 "=m" (*desc)
38 : "D" (desc), "d" (SMAP), "a" (0xe820));
39
40 /* Some BIOSes stop returning SMAP in the middle of
41 the search loop. We don't know exactly how the BIOS
42 screwed up the map at that point, we might have a
43 partial map, the full map, or complete garbage, so
44 just return failure. */
45 if (id != SMAP) {
46 count = 0;
47 break;
48 }
49
50 if (err)
51 break;
52
53 count++;
54 desc++;
55 } while (next && count < E820MAX);
56
57 return boot_params.e820_entries = count;
58}
59
60static int detect_memory_e801(void)
61{
62 u16 ax, bx, cx, dx;
63 u8 err;
64
65 bx = cx = dx = 0;
66 ax = 0xe801;
67 asm("stc; int $0x15; setc %0"
68 : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
69
70 if (err)
71 return -1;
72
73 /* Do we really need to do this? */
74 if (cx || dx) {
75 ax = cx;
76 bx = dx;
77 }
78
79 if (ax > 15*1024)
80 return -1; /* Bogus! */
81
82 /* This ignores memory above 16MB if we have a memory hole
83 there. If someone actually finds a machine with a memory
84 hole at 16MB and no support for 0E820h they should probably
85 generate a fake e820 map. */
86 boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax;
87
88 return 0;
89}
90
91static int detect_memory_88(void)
92{
93 u16 ax;
94 u8 err;
95
96 ax = 0x8800;
97 asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax));
98
99 boot_params.screen_info.ext_mem_k = ax;
100
101 return -err;
102}
103
104int detect_memory(void)
105{
106 int err = -1;
107
108 if (detect_memory_e820() > 0)
109 err = 0;
110
111 if (!detect_memory_e801())
112 err = 0;
113
114 if (!detect_memory_88())
115 err = 0;
116
117 return err;
118}
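In detect_memory_e801(), the BIOS reports memory in two pieces: KB between 1 MB and 16 MB (capped at 15*1024) and 64 KB blocks above 16 MB, so the second figure is shifted left by 6 to convert blocks to KB. A rough worked example of the alt_mem_k computation, assuming a 64 MB machine with no memory hole (hypothetical values):

    #include <stdio.h>

    int main(void)
    {
            unsigned int ax = 15 * 1024; /* KB between 1 MB and 16 MB (capped) */
            unsigned int bx = 768;       /* 64 KB blocks above 16 MB: 768 * 64 KB = 48 MB */
            unsigned int alt_mem_k;

            /* Same conversion as detect_memory_e801() */
            alt_mem_k = (ax == 15 * 1024) ? (bx << 6) + ax : ax;

            printf("alt_mem_k = %u KB\n", alt_mem_k); /* 64512 KB, i.e. 63 MB above 1 MB */
            return 0;
    }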
diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in
new file mode 100644
index 000000000000..efd6d2490c1d
--- /dev/null
+++ b/arch/x86/boot/mtools.conf.in
@@ -0,0 +1,17 @@
1#
2# mtools configuration file for "make (b)zdisk"
3#
4
5# Actual floppy drive
6drive a:
7 file="/dev/fd0"
8
9# 1.44 MB floppy disk image
10drive v:
11 file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter
12
13# 2.88 MB floppy disk image (mostly for virtual uses)
14drive w:
15 file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
16
17
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
new file mode 100644
index 000000000000..09fb342cc62e
--- /dev/null
+++ b/arch/x86/boot/pm.c
@@ -0,0 +1,174 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/pm.c
13 *
14 * Prepare the machine for transition to protected mode.
15 */
16
17#include "boot.h"
18#include <asm/segment.h>
19
20/*
21 * Invoke the realmode switch hook if present; otherwise
22 * disable all interrupts.
23 */
24static void realmode_switch_hook(void)
25{
26 if (boot_params.hdr.realmode_swtch) {
27 asm volatile("lcallw *%0"
28 : : "m" (boot_params.hdr.realmode_swtch)
29 : "eax", "ebx", "ecx", "edx");
30 } else {
31 asm volatile("cli");
32 outb(0x80, 0x70); /* Disable NMI */
33 io_delay();
34 }
35}
36
37/*
38 * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
39 * A bzImage kernel is loaded and runs at 0x100000.
40 */
41static void move_kernel_around(void)
42{
43 /* Note: rely on the compile-time option here rather than
44 the LOADED_HIGH flag. The Qemu kernel loader unconditionally
45 sets the loadflags to zero. */
46#ifndef __BIG_KERNEL__
47 u16 dst_seg, src_seg;
48 u32 syssize;
49
50 dst_seg = 0x1000 >> 4;
51 src_seg = 0x10000 >> 4;
52 syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
53
54 while (syssize) {
55 int paras = (syssize >= 0x1000) ? 0x1000 : syssize;
56 int dwords = paras << 2;
57
58 asm volatile("pushw %%es ; "
59 "pushw %%ds ; "
60 "movw %1,%%es ; "
61 "movw %2,%%ds ; "
62 "xorw %%di,%%di ; "
63 "xorw %%si,%%si ; "
64 "rep;movsl ; "
65 "popw %%ds ; "
66 "popw %%es"
67 : "+c" (dwords)
68 : "r" (dst_seg), "r" (src_seg)
69 : "esi", "edi");
70
71 syssize -= paras;
72 dst_seg += paras;
73 src_seg += paras;
74 }
75#endif
76}
77
78/*
79 * Disable all interrupts at the legacy PIC.
80 */
81static void mask_all_interrupts(void)
82{
83 outb(0xff, 0xa1); /* Mask all interrupts on the secondary PIC */
84 io_delay();
85 outb(0xfb, 0x21); /* Mask all but cascade on the primary PIC */
86 io_delay();
87}
88
89/*
90 * Reset IGNNE# if asserted in the FPU.
91 */
92static void reset_coprocessor(void)
93{
94 outb(0, 0xf0);
95 io_delay();
96 outb(0, 0xf1);
97 io_delay();
98}
99
100/*
101 * Set up the GDT
102 */
103#define GDT_ENTRY(flags,base,limit) \
104 (((u64)(base & 0xff000000) << 32) | \
105 ((u64)flags << 40) | \
106 ((u64)(limit & 0x00ff0000) << 32) | \
107 ((u64)(base & 0x00ffff00) << 16) | \
108 ((u64)(limit & 0x0000ffff)))
109
110struct gdt_ptr {
111 u16 len;
112 u32 ptr;
113} __attribute__((packed));
114
115static void setup_gdt(void)
116{
117 /* There are machines which are known to not boot with the GDT
118 being 8-byte unaligned. Intel recommends 16 byte alignment. */
119 static const u64 boot_gdt[] __attribute__((aligned(16))) = {
120 /* CS: code, read/execute, 4 GB, base 0 */
121 [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
122 /* DS: data, read/write, 4 GB, base 0 */
123 [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
124 };
125 /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
126 of the gdt_ptr contents. Thus, make it static so it will
127 stay in memory, at least long enough that we switch to the
128 proper kernel GDT. */
129 static struct gdt_ptr gdt;
130
131 gdt.len = sizeof(boot_gdt)-1;
132 gdt.ptr = (u32)&boot_gdt + (ds() << 4);
133
134 asm volatile("lgdtl %0" : : "m" (gdt));
135}
136
137/*
138 * Set up the IDT
139 */
140static void setup_idt(void)
141{
142 static const struct gdt_ptr null_idt = {0, 0};
143 asm volatile("lidtl %0" : : "m" (null_idt));
144}
145
146/*
147 * Actual invocation sequence
148 */
149void go_to_protected_mode(void)
150{
151 /* Hook before leaving real mode, also disables interrupts */
152 realmode_switch_hook();
153
154 /* Move the kernel/setup to their final resting places */
155 move_kernel_around();
156
157 /* Enable the A20 gate */
158 if (enable_a20()) {
159 puts("A20 gate not responding, unable to boot...\n");
160 die();
161 }
162
163 /* Reset coprocessor (IGNNE#) */
164 reset_coprocessor();
165
166 /* Mask all interrupts in the PIC */
167 mask_all_interrupts();
168
169 /* Actual transition to protected mode... */
170 setup_idt();
171 setup_gdt();
172 protected_mode_jump(boot_params.hdr.code32_start,
173 (u32)&boot_params + (ds() << 4));
174}
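The GDT_ENTRY() macro in pm.c packs the base and limit into the interleaved bit layout of an x86 segment descriptor. A small host-side sketch using the same packing (hypothetical main(), not part of the boot code) shows the boot code segment coming out as the familiar flat 4 GB descriptor 0x00cf9b000000ffff:

    #include <stdio.h>

    typedef unsigned long long u64;

    #define GDT_ENTRY(flags, base, limit)          \
            (((u64)((base)  & 0xff000000) << 32) | \
             ((u64)(flags) << 40)                | \
             ((u64)((limit) & 0x00ff0000) << 32) | \
             ((u64)((base)  & 0x00ffff00) << 16) | \
             ((u64)((limit) & 0x0000ffff)))

    int main(void)
    {
            /* CS: code, read/execute, 4 GB, base 0 */
            printf("boot CS = %#018llx\n", GDT_ENTRY(0xc09b, 0, 0xfffff));
            /* DS: data, read/write, 4 GB, base 0 */
            printf("boot DS = %#018llx\n", GDT_ENTRY(0xc093, 0, 0xfffff));
            return 0;
    }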
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
new file mode 100644
index 000000000000..2e559233725a
--- /dev/null
+++ b/arch/x86/boot/pmjump.S
@@ -0,0 +1,54 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/pmjump.S
13 *
14 * The actual transition into protected mode
15 */
16
17#include <asm/boot.h>
18#include <asm/segment.h>
19
20 .text
21
22 .globl protected_mode_jump
23 .type protected_mode_jump, @function
24
25 .code16
26
27/*
28 * void protected_mode_jump(u32 entrypoint, u32 bootparams);
29 */
30protected_mode_jump:
31 xorl %ebx, %ebx # Flag to indicate this is a boot
32 movl %edx, %esi # Pointer to boot_params table
33 movl %eax, 2f # Patch ljmpl instruction
34 jmp 1f # Short jump to flush instruction q.
35
361:
37 movw $__BOOT_DS, %cx
38
39 movl %cr0, %edx
40 orb $1, %dl # Protected mode (PE) bit
41 movl %edx, %cr0
42
43 movw %cx, %ds
44 movw %cx, %es
45 movw %cx, %fs
46 movw %cx, %gs
47 movw %cx, %ss
48
49 # Jump to the 32-bit entrypoint
50 .byte 0x66, 0xea # ljmpl opcode
512: .long 0 # offset
52 .word __BOOT_CS # segment
53
54 .size protected_mode_jump, .-protected_mode_jump
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
new file mode 100644
index 000000000000..1a09f9309d3c
--- /dev/null
+++ b/arch/x86/boot/printf.c
@@ -0,0 +1,307 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/printf.c
13 *
14 * Oh, it's a waste of space, but oh-so-yummy for debugging. This
15 * version of printf() does not include 64-bit support. "Live with
16 * it."
17 *
18 */
19
20#include "boot.h"
21
22static int skip_atoi(const char **s)
23{
24 int i = 0;
25
26 while (isdigit(**s))
27 i = i * 10 + *((*s)++) - '0';
28 return i;
29}
30
31#define ZEROPAD 1 /* pad with zero */
32#define SIGN 2 /* unsigned/signed long */
33#define PLUS 4 /* show plus */
34#define SPACE 8 /* space if plus */
35#define LEFT 16 /* left justified */
36#define SPECIAL 32 /* 0x */
37#define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */
38
39#define do_div(n,base) ({ \
40int __res; \
41__res = ((unsigned long) n) % (unsigned) base; \
42n = ((unsigned long) n) / (unsigned) base; \
43__res; })
44
45static char *number(char *str, long num, int base, int size, int precision,
46 int type)
47{
48 char c, sign, tmp[66];
49 const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz";
50 int i;
51
52 if (type & LARGE)
53 digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
54 if (type & LEFT)
55 type &= ~ZEROPAD;
56 if (base < 2 || base > 36)
57 return 0;
58 c = (type & ZEROPAD) ? '0' : ' ';
59 sign = 0;
60 if (type & SIGN) {
61 if (num < 0) {
62 sign = '-';
63 num = -num;
64 size--;
65 } else if (type & PLUS) {
66 sign = '+';
67 size--;
68 } else if (type & SPACE) {
69 sign = ' ';
70 size--;
71 }
72 }
73 if (type & SPECIAL) {
74 if (base == 16)
75 size -= 2;
76 else if (base == 8)
77 size--;
78 }
79 i = 0;
80 if (num == 0)
81 tmp[i++] = '0';
82 else
83 while (num != 0)
84 tmp[i++] = digits[do_div(num, base)];
85 if (i > precision)
86 precision = i;
87 size -= precision;
88 if (!(type & (ZEROPAD + LEFT)))
89 while (size-- > 0)
90 *str++ = ' ';
91 if (sign)
92 *str++ = sign;
93 if (type & SPECIAL) {
94 if (base == 8)
95 *str++ = '0';
96 else if (base == 16) {
97 *str++ = '0';
98 *str++ = digits[33];
99 }
100 }
101 if (!(type & LEFT))
102 while (size-- > 0)
103 *str++ = c;
104 while (i < precision--)
105 *str++ = '0';
106 while (i-- > 0)
107 *str++ = tmp[i];
108 while (size-- > 0)
109 *str++ = ' ';
110 return str;
111}
112
113int vsprintf(char *buf, const char *fmt, va_list args)
114{
115 int len;
116 unsigned long num;
117 int i, base;
118 char *str;
119 const char *s;
120
121 int flags; /* flags to number() */
122
123 int field_width; /* width of output field */
124 int precision; /* min. # of digits for integers; max
125				   number of chars from string */
126 int qualifier; /* 'h', 'l', or 'L' for integer fields */
127
128 for (str = buf; *fmt; ++fmt) {
129 if (*fmt != '%') {
130 *str++ = *fmt;
131 continue;
132 }
133
134 /* process flags */
135 flags = 0;
136 repeat:
137 ++fmt; /* this also skips first '%' */
138 switch (*fmt) {
139 case '-':
140 flags |= LEFT;
141 goto repeat;
142 case '+':
143 flags |= PLUS;
144 goto repeat;
145 case ' ':
146 flags |= SPACE;
147 goto repeat;
148 case '#':
149 flags |= SPECIAL;
150 goto repeat;
151 case '0':
152 flags |= ZEROPAD;
153 goto repeat;
154 }
155
156 /* get field width */
157 field_width = -1;
158 if (isdigit(*fmt))
159 field_width = skip_atoi(&fmt);
160 else if (*fmt == '*') {
161 ++fmt;
162 /* it's the next argument */
163 field_width = va_arg(args, int);
164 if (field_width < 0) {
165 field_width = -field_width;
166 flags |= LEFT;
167 }
168 }
169
170 /* get the precision */
171 precision = -1;
172 if (*fmt == '.') {
173 ++fmt;
174 if (isdigit(*fmt))
175 precision = skip_atoi(&fmt);
176 else if (*fmt == '*') {
177 ++fmt;
178 /* it's the next argument */
179 precision = va_arg(args, int);
180 }
181 if (precision < 0)
182 precision = 0;
183 }
184
185 /* get the conversion qualifier */
186 qualifier = -1;
187 if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L') {
188 qualifier = *fmt;
189 ++fmt;
190 }
191
192 /* default base */
193 base = 10;
194
195 switch (*fmt) {
196 case 'c':
197 if (!(flags & LEFT))
198 while (--field_width > 0)
199 *str++ = ' ';
200 *str++ = (unsigned char)va_arg(args, int);
201 while (--field_width > 0)
202 *str++ = ' ';
203 continue;
204
205 case 's':
206 s = va_arg(args, char *);
207 len = strnlen(s, precision);
208
209 if (!(flags & LEFT))
210 while (len < field_width--)
211 *str++ = ' ';
212 for (i = 0; i < len; ++i)
213 *str++ = *s++;
214 while (len < field_width--)
215 *str++ = ' ';
216 continue;
217
218 case 'p':
219 if (field_width == -1) {
220 field_width = 2 * sizeof(void *);
221 flags |= ZEROPAD;
222 }
223 str = number(str,
224 (unsigned long)va_arg(args, void *), 16,
225 field_width, precision, flags);
226 continue;
227
228 case 'n':
229 if (qualifier == 'l') {
230 long *ip = va_arg(args, long *);
231 *ip = (str - buf);
232 } else {
233 int *ip = va_arg(args, int *);
234 *ip = (str - buf);
235 }
236 continue;
237
238 case '%':
239 *str++ = '%';
240 continue;
241
242 /* integer number formats - set up the flags and "break" */
243 case 'o':
244 base = 8;
245 break;
246
247 case 'X':
248 flags |= LARGE;
249 case 'x':
250 base = 16;
251 break;
252
253 case 'd':
254 case 'i':
255 flags |= SIGN;
256 case 'u':
257 break;
258
259 default:
260 *str++ = '%';
261 if (*fmt)
262 *str++ = *fmt;
263 else
264 --fmt;
265 continue;
266 }
267 if (qualifier == 'l')
268 num = va_arg(args, unsigned long);
269 else if (qualifier == 'h') {
270 num = (unsigned short)va_arg(args, int);
271 if (flags & SIGN)
272 num = (short)num;
273 } else if (flags & SIGN)
274 num = va_arg(args, int);
275 else
276 num = va_arg(args, unsigned int);
277 str = number(str, num, base, field_width, precision, flags);
278 }
279 *str = '\0';
280 return str - buf;
281}
282
283int sprintf(char *buf, const char *fmt, ...)
284{
285 va_list args;
286 int i;
287
288 va_start(args, fmt);
289 i = vsprintf(buf, fmt, args);
290 va_end(args);
291 return i;
292}
293
294int printf(const char *fmt, ...)
295{
296 char printf_buf[1024];
297 va_list args;
298 int printed;
299
300 va_start(args, fmt);
301 printed = vsprintf(printf_buf, fmt, args);
302 va_end(args);
303
304 puts(printf_buf);
305
306 return printed;
307}
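do_div() in printf.c is a GNU statement expression that leaves the quotient in n and yields the remainder, which is why number() collects digits least-significant first and then emits them in reverse. A minimal user-space illustration of that digit loop (gcc statement expressions assumed; not part of the boot code):

    #include <stdio.h>

    #define do_div(n,base) ({ \
            int __res; \
            __res = ((unsigned long) n) % (unsigned) base; \
            n = ((unsigned long) n) / (unsigned) base; \
            __res; })

    int main(void)
    {
            unsigned long n = 0x1a5;   /* 421 decimal */
            char tmp[16];
            int i = 0;

            while (n)
                    tmp[i++] = "0123456789abcdef"[do_div(n, 16)];
            /* digits arrive least-significant first: "5a1" */
            while (i--)
                    putchar(tmp[i]);   /* prints "1a5" */
            putchar('\n');
            return 0;
    }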
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
new file mode 100644
index 000000000000..df9234b3a5e0
--- /dev/null
+++ b/arch/x86/boot/setup.ld
@@ -0,0 +1,54 @@
1/*
2 * setup.ld
3 *
4 * Linker script for the i386 setup code
5 */
6OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
7OUTPUT_ARCH(i386)
8ENTRY(_start)
9
10SECTIONS
11{
12 . = 0;
13 .bstext : { *(.bstext) }
14 .bsdata : { *(.bsdata) }
15
16 . = 497;
17 .header : { *(.header) }
18 .inittext : { *(.inittext) }
19 .initdata : { *(.initdata) }
20 .text : { *(.text*) }
21
22 . = ALIGN(16);
23 .rodata : { *(.rodata*) }
24
25 .videocards : {
26 video_cards = .;
27 *(.videocards)
28 video_cards_end = .;
29 }
30
31 . = ALIGN(16);
32 .data : { *(.data*) }
33
34 .signature : {
35 setup_sig = .;
36 LONG(0x5a5aaa55)
37 }
38
39
40 . = ALIGN(16);
41 .bss :
42 {
43 __bss_start = .;
44 *(.bss)
45 __bss_end = .;
46 }
47 . = ALIGN(16);
48 _end = .;
49
50 /DISCARD/ : { *(.note*) }
51
52 . = ASSERT(_end <= 0x8000, "Setup too big!");
53 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
54}
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
new file mode 100644
index 000000000000..481a22097781
--- /dev/null
+++ b/arch/x86/boot/string.c
@@ -0,0 +1,52 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/string.c
13 *
14 * Very basic string functions
15 */
16
17#include "boot.h"
18
19int strcmp(const char *str1, const char *str2)
20{
21 const unsigned char *s1 = (const unsigned char *)str1;
22 const unsigned char *s2 = (const unsigned char *)str2;
23 int delta = 0;
24
25 while (*s1 || *s2) {
26 delta = *s2 - *s1;
27 if (delta)
28 return delta;
29 s1++;
30 s2++;
31 }
32 return 0;
33}
34
35size_t strnlen(const char *s, size_t maxlen)
36{
37 const char *es = s;
38 while (*es && maxlen) {
39 es++;
40 maxlen--;
41 }
42
43 return (es - s);
44}
45
46unsigned int atou(const char *s)
47{
48 unsigned int i = 0;
49 while (isdigit(*s))
50 i = i * 10 + (*s++ - '0');
51 return i;
52}
diff --git a/arch/x86/boot/tools/.gitignore b/arch/x86/boot/tools/.gitignore
new file mode 100644
index 000000000000..378eac25d311
--- /dev/null
+++ b/arch/x86/boot/tools/.gitignore
@@ -0,0 +1 @@
build
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
new file mode 100644
index 000000000000..b4248740ff0d
--- /dev/null
+++ b/arch/x86/boot/tools/build.c
@@ -0,0 +1,168 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1997 Martin Mares
4 * Copyright (C) 2007 H. Peter Anvin
5 */
6
7/*
8 * This file builds a disk-image from two different files:
9 *
10 * - setup: 8086 machine code, sets up system parameters
11 * - system: 80386 code for actual system
12 *
13 * It does some checking that all files are of the correct type, and
14 * just writes the result to stdout, removing headers and padding to
15 * the right amount. It also writes some system data to stderr.
16 */
17
18/*
19 * Changes by tytso to allow root device specification
20 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
21 * Cross compiling fixes by Gertjan van Wingerde, July 1996
22 * Rewritten by Martin Mares, April 1997
23 * Substantially overhauled by H. Peter Anvin, April 2007
24 */
25
26#include <stdio.h>
27#include <string.h>
28#include <stdlib.h>
29#include <stdarg.h>
30#include <sys/types.h>
31#include <sys/stat.h>
32#include <sys/sysmacros.h>
33#include <unistd.h>
34#include <fcntl.h>
35#include <sys/mman.h>
36#include <asm/boot.h>
37
38typedef unsigned char u8;
39typedef unsigned short u16;
40typedef unsigned long u32;
41
42#define DEFAULT_MAJOR_ROOT 0
43#define DEFAULT_MINOR_ROOT 0
44
45/* Minimal number of setup sectors */
46#define SETUP_SECT_MIN 5
47#define SETUP_SECT_MAX 64
48
49/* This must be large enough to hold the entire setup */
50u8 buf[SETUP_SECT_MAX*512];
51int is_big_kernel;
52
53static void die(const char * str, ...)
54{
55 va_list args;
56 va_start(args, str);
57 vfprintf(stderr, str, args);
58 fputc('\n', stderr);
59 exit(1);
60}
61
62static void usage(void)
63{
64 die("Usage: build [-b] setup system [rootdev] [> image]");
65}
66
67int main(int argc, char ** argv)
68{
69 unsigned int i, sz, setup_sectors;
70 int c;
71 u32 sys_size;
72 u8 major_root, minor_root;
73 struct stat sb;
74 FILE *file;
75 int fd;
76 void *kernel;
77
78 if (argc > 2 && !strcmp(argv[1], "-b"))
79 {
80 is_big_kernel = 1;
81 argc--, argv++;
82 }
83 if ((argc < 3) || (argc > 4))
84 usage();
85 if (argc > 3) {
86 if (!strcmp(argv[3], "CURRENT")) {
87 if (stat("/", &sb)) {
88 perror("/");
89 die("Couldn't stat /");
90 }
91 major_root = major(sb.st_dev);
92 minor_root = minor(sb.st_dev);
93 } else if (strcmp(argv[3], "FLOPPY")) {
94 if (stat(argv[3], &sb)) {
95 perror(argv[3]);
96 die("Couldn't stat root device.");
97 }
98 major_root = major(sb.st_rdev);
99 minor_root = minor(sb.st_rdev);
100 } else {
101 major_root = 0;
102 minor_root = 0;
103 }
104 } else {
105 major_root = DEFAULT_MAJOR_ROOT;
106 minor_root = DEFAULT_MINOR_ROOT;
107 }
108 fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
109
110 /* Copy the setup code */
111 file = fopen(argv[1], "r");
112 if (!file)
113 die("Unable to open `%s': %m", argv[1]);
114 c = fread(buf, 1, sizeof(buf), file);
115 if (ferror(file))
116 die("read-error on `setup'");
117 if (c < 1024)
118 die("The setup must be at least 1024 bytes");
119 if (buf[510] != 0x55 || buf[511] != 0xaa)
120 die("Boot block hasn't got boot flag (0xAA55)");
121 fclose(file);
122
123 /* Pad unused space with zeros */
124 setup_sectors = (c + 511) / 512;
125 if (setup_sectors < SETUP_SECT_MIN)
126 setup_sectors = SETUP_SECT_MIN;
127 i = setup_sectors*512;
128 memset(buf+c, 0, i-c);
129
130 /* Set the default root device */
131 buf[508] = minor_root;
132 buf[509] = major_root;
133
134 fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i);
135
136 /* Open and stat the kernel file */
137 fd = open(argv[2], O_RDONLY);
138 if (fd < 0)
139 die("Unable to open `%s': %m", argv[2]);
140 if (fstat(fd, &sb))
141 die("Unable to stat `%s': %m", argv[2]);
142 sz = sb.st_size;
143 fprintf (stderr, "System is %d kB\n", (sz+1023)/1024);
144 kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
145 if (kernel == MAP_FAILED)
146 die("Unable to mmap '%s': %m", argv[2]);
147 sys_size = (sz + 15) / 16;
148 if (!is_big_kernel && sys_size > DEF_SYSSIZE)
149 die("System is too big. Try using bzImage or modules.");
150
151 /* Patch the setup code with the appropriate size parameters */
152 buf[0x1f1] = setup_sectors-1;
153 buf[0x1f4] = sys_size;
154 buf[0x1f5] = sys_size >> 8;
155 buf[0x1f6] = sys_size >> 16;
156 buf[0x1f7] = sys_size >> 24;
157
158 if (fwrite(buf, 1, i, stdout) != i)
159 die("Writing setup failed");
160
161 /* Copy the kernel code */
162 if (fwrite(kernel, 1, sz, stdout) != sz)
163 die("Writing kernel failed");
164 close(fd);
165
166 /* Everything is OK */
167 return 0;
168}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
new file mode 100644
index 000000000000..f3f14bd26371
--- /dev/null
+++ b/arch/x86/boot/tty.c
@@ -0,0 +1,112 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/tty.c
13 *
14 * Very simple screen I/O
15 * XXX: Probably should add very simple serial I/O?
16 */
17
18#include "boot.h"
19
20/*
21 * These functions are in .inittext so they can be used to signal
22 * error during initialization.
23 */
24
25void __attribute__((section(".inittext"))) putchar(int ch)
26{
27 unsigned char c = ch;
28
29 if (c == '\n')
30 putchar('\r'); /* \n -> \r\n */
31
32 /* int $0x10 is known to have bugs involving touching registers
33 it shouldn't. Be extra conservative... */
34 asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal"
35 : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch));
36}
37
38void __attribute__((section(".inittext"))) puts(const char *str)
39{
40 int n = 0;
41 while (*str) {
42 putchar(*str++);
43 n++;
44 }
45}
46
47/*
48 * Read the CMOS clock through the BIOS, and return the
49 * seconds in BCD.
50 */
51
52static u8 gettime(void)
53{
54 u16 ax = 0x0200;
55 u16 cx, dx;
56
57 asm volatile("int $0x1a"
58 : "+a" (ax), "=c" (cx), "=d" (dx)
59 : : "ebx", "esi", "edi");
60
61 return dx >> 8;
62}
63
64/*
65 * Read from the keyboard
66 */
67int getchar(void)
68{
69 u16 ax = 0;
70 asm volatile("int $0x16" : "+a" (ax));
71
72 return ax & 0xff;
73}
74
75static int kbd_pending(void)
76{
77 u8 pending;
78 asm volatile("int $0x16; setnz %0"
79 : "=rm" (pending)
80 : "a" (0x0100));
81 return pending;
82}
83
84void kbd_flush(void)
85{
86 for (;;) {
87 if (!kbd_pending())
88 break;
89 getchar();
90 }
91}
92
93int getchar_timeout(void)
94{
95 int cnt = 30;
96 int t0, t1;
97
98 t0 = gettime();
99
100 while (cnt) {
101 if (kbd_pending())
102 return getchar();
103
104 t1 = gettime();
105 if (t0 != t1) {
106 cnt--;
107 t0 = t1;
108 }
109 }
110
111 return 0; /* Timeout! */
112}
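gettime() returns the RTC seconds field as BCD (int 0x1a, ah=0x02 reports BCD), and getchar_timeout() only compares successive readings for inequality, so it never decodes the value. For reference, a hypothetical helper (not part of the boot code) that converts such a BCD byte to binary:

    #include <stdio.h>

    /* Convert a two-digit BCD byte (e.g. 0x59) to binary (59). */
    static unsigned int bcd2bin(unsigned char bcd)
    {
            return (bcd >> 4) * 10 + (bcd & 0x0f);
    }

    int main(void)
    {
            printf("%u\n", bcd2bin(0x59)); /* prints 59 */
            return 0;
    }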
diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c
new file mode 100644
index 000000000000..c61462f7d9a7
--- /dev/null
+++ b/arch/x86/boot/version.c
@@ -0,0 +1,23 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/version.c
13 *
14 * Kernel version string
15 */
16
17#include "boot.h"
18#include <linux/utsrelease.h>
19#include <linux/compile.h>
20
21const char kernel_version[] =
22 UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") "
23 UTS_VERSION;
diff --git a/arch/x86/boot/vesa.h b/arch/x86/boot/vesa.h
new file mode 100644
index 000000000000..ff5b73cd406f
--- /dev/null
+++ b/arch/x86/boot/vesa.h
@@ -0,0 +1,79 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 1999-2007 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13#ifndef BOOT_VESA_H
14#define BOOT_VESA_H
15
16typedef struct {
17 u16 off, seg;
18} far_ptr;
19
20/* VESA General Information table */
21struct vesa_general_info {
22 u32 signature; /* 0 Magic number = "VESA" */
23 u16 version; /* 4 */
24 far_ptr vendor_string; /* 6 */
25 u32 capabilities; /* 10 */
26 far_ptr video_mode_ptr; /* 14 */
27 u16 total_memory; /* 18 */
28
29 u16 oem_software_rev; /* 20 */
30 far_ptr oem_vendor_name_ptr; /* 22 */
31 far_ptr oem_product_name_ptr; /* 26 */
32 far_ptr oem_product_rev_ptr; /* 30 */
33
34 u8 reserved[222]; /* 34 */
35 u8 oem_data[256]; /* 256 */
36} __attribute__ ((packed));
37
38#define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24))
39#define VBE2_MAGIC ('V' + ('B' << 8) + ('E' << 16) + ('2' << 24))
40
41struct vesa_mode_info {
42 u16 mode_attr; /* 0 */
43 u8 win_attr[2]; /* 2 */
44 u16 win_grain; /* 4 */
45 u16 win_size; /* 6 */
46 u16 win_seg[2]; /* 8 */
47 far_ptr win_scheme; /* 12 */
48 u16 logical_scan; /* 16 */
49
50 u16 h_res; /* 18 */
51 u16 v_res; /* 20 */
52 u8 char_width; /* 22 */
53 u8 char_height; /* 23 */
54 u8 memory_planes; /* 24 */
55 u8 bpp; /* 25 */
56 u8 banks; /* 26 */
57 u8 memory_layout; /* 27 */
58 u8 bank_size; /* 28 */
59 u8 image_planes; /* 29 */
60 u8 page_function; /* 30 */
61
62 u8 rmask; /* 31 */
63 u8 rpos; /* 32 */
64 u8 gmask; /* 33 */
65 u8 gpos; /* 34 */
66 u8 bmask; /* 35 */
67 u8 bpos; /* 36 */
68 u8 resv_mask; /* 37 */
69 u8 resv_pos; /* 38 */
70 u8 dcm_info; /* 39 */
71
72 u32 lfb_ptr; /* 40 Linear frame buffer address */
73 u32 offscreen_ptr; /* 44 Offscreen memory address */
74 u16 offscreen_size; /* 48 */
75
76 u8 reserved[206]; /* 50 */
77} __attribute__ ((packed));
78
79#endif /* LIB_SYS_VESA_H */
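A far_ptr is a real-mode seg:off pair; the linear address it designates is (seg << 4) + off, which is how video-vesa.c follows video_mode_ptr with set_fs()/rdfs16(). A tiny sketch of the conversion with made-up values:

    #include <stdio.h>

    typedef unsigned short u16;
    typedef unsigned int u32;

    typedef struct {
            u16 off, seg;
    } far_ptr;

    int main(void)
    {
            far_ptr p = { .off = 0x0022, .seg = 0xc000 }; /* hypothetical BIOS pointer */
            u32 linear = ((u32)p.seg << 4) + p.off;

            printf("%04x:%04x -> linear %#x\n", p.seg, p.off, linear); /* 0xc0022 */
            return 0;
    }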
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
new file mode 100644
index 000000000000..68e65d95cdfd
--- /dev/null
+++ b/arch/x86/boot/video-bios.c
@@ -0,0 +1,125 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/video-bios.c
13 *
14 * Standard video BIOS modes
15 *
16 * We have two options for this: silent and scanned.
17 */
18
19#include "boot.h"
20#include "video.h"
21
22__videocard video_bios;
23
24/* Set a conventional BIOS mode */
25static int set_bios_mode(u8 mode);
26
27static int bios_set_mode(struct mode_info *mi)
28{
29 return set_bios_mode(mi->mode - VIDEO_FIRST_BIOS);
30}
31
32static int set_bios_mode(u8 mode)
33{
34 u16 ax;
35 u8 new_mode;
36
37 ax = mode; /* AH=0x00 Set Video Mode */
38 asm volatile(INT10
39 : "+a" (ax)
40 : : "ebx", "ecx", "edx", "esi", "edi");
41
42 ax = 0x0f00; /* Get Current Video Mode */
43 asm volatile(INT10
44 : "+a" (ax)
45 : : "ebx", "ecx", "edx", "esi", "edi");
46
47 do_restore = 1; /* Assume video contents were lost */
48 new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */
49
50 if (new_mode == mode)
51 return 0; /* Mode change OK */
52
53 if (new_mode != boot_params.screen_info.orig_video_mode) {
54 /* Mode setting failed, but we didn't end up where we
55 started. That's bad. Try to revert to the original
56 video mode. */
57 ax = boot_params.screen_info.orig_video_mode;
58 asm volatile(INT10
59 : "+a" (ax)
60 : : "ebx", "ecx", "edx", "esi", "edi");
61 }
62 return -1;
63}
64
65static int bios_probe(void)
66{
67 u8 mode;
68 u8 saved_mode = boot_params.screen_info.orig_video_mode;
69 u16 crtc;
70 struct mode_info *mi;
71 int nmodes = 0;
72
73 if (adapter != ADAPTER_EGA && adapter != ADAPTER_VGA)
74 return 0;
75
76 set_fs(0);
77 crtc = vga_crtc();
78
79 video_bios.modes = GET_HEAP(struct mode_info, 0);
80
81 for (mode = 0x14; mode <= 0x7f; mode++) {
82 if (heap_free() < sizeof(struct mode_info))
83 break;
84
85 if (mode_defined(VIDEO_FIRST_BIOS+mode))
86 continue;
87
88 if (set_bios_mode(mode))
89 continue;
90
91 /* Try to verify that it's a text mode. */
92
93		/* Attribute Controller: graphics mode must be disabled */
94 if (in_idx(0x3c0, 0x10) & 0x01)
95 continue;
96
97 /* Graphics Controller: verify Alpha addressing enabled */
98 if (in_idx(0x3ce, 0x06) & 0x01)
99 continue;
100
101 /* CRTC cursor location low should be zero(?) */
102 if (in_idx(crtc, 0x0f))
103 continue;
104
105 mi = GET_HEAP(struct mode_info, 1);
106 mi->mode = VIDEO_FIRST_BIOS+mode;
107 mi->x = rdfs16(0x44a);
108 mi->y = rdfs8(0x484)+1;
109 nmodes++;
110 }
111
112 set_bios_mode(saved_mode);
113
114 return nmodes;
115}
116
117__videocard video_bios =
118{
119 .card_name = "BIOS (scanned)",
120 .probe = bios_probe,
121 .set_mode = bios_set_mode,
122 .unsafe = 1,
123 .xmode_first = VIDEO_FIRST_BIOS,
124 .xmode_n = 0x80,
125};
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
new file mode 100644
index 000000000000..192190710710
--- /dev/null
+++ b/arch/x86/boot/video-vesa.c
@@ -0,0 +1,292 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/video-vesa.c
13 *
14 * VESA text modes
15 */
16
17#include "boot.h"
18#include "video.h"
19#include "vesa.h"
20
21/* VESA information */
22static struct vesa_general_info vginfo;
23static struct vesa_mode_info vminfo;
24
25__videocard video_vesa;
26
27static void vesa_store_mode_params_graphics(void);
28
29static int vesa_probe(void)
30{
31#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
32 u16 ax, cx, di;
33 u16 mode;
34 addr_t mode_ptr;
35 struct mode_info *mi;
36 int nmodes = 0;
37
38 video_vesa.modes = GET_HEAP(struct mode_info, 0);
39
40 vginfo.signature = VBE2_MAGIC;
41
42 ax = 0x4f00;
43 di = (size_t)&vginfo;
44 asm(INT10
45 : "+a" (ax), "+D" (di), "=m" (vginfo)
46 : : "ebx", "ecx", "edx", "esi");
47
48 if (ax != 0x004f ||
49 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102)
51 return 0; /* Not present */
52#endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */
53#ifdef CONFIG_VIDEO_VESA
54 set_fs(vginfo.video_mode_ptr.seg);
55 mode_ptr = vginfo.video_mode_ptr.off;
56
57 while ((mode = rdfs16(mode_ptr)) != 0xffff) {
58 mode_ptr += 2;
59
60 if (heap_free() < sizeof(struct mode_info))
61 break; /* Heap full, can't save mode info */
62
63 if (mode & ~0x1ff)
64 continue;
65
66 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
67
68 ax = 0x4f01;
69 cx = mode;
70 di = (size_t)&vminfo;
71 asm(INT10
72 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
73 : : "ebx", "edx", "esi");
74
75 if (ax != 0x004f)
76 continue;
77
78 if ((vminfo.mode_attr & 0x15) == 0x05) {
79 /* Text Mode, TTY BIOS supported,
80 supported by hardware */
81 mi = GET_HEAP(struct mode_info, 1);
82 mi->mode = mode + VIDEO_FIRST_VESA;
83 mi->x = vminfo.h_res;
84 mi->y = vminfo.v_res;
85 nmodes++;
86 } else if ((vminfo.mode_attr & 0x99) == 0x99) {
87#ifdef CONFIG_FB
88 /* Graphics mode, color, linear frame buffer
89 supported -- register the mode but hide from
90 the menu. Only do this if framebuffer is
91 configured, however, otherwise the user will
92 be left without a screen. */
93 mi = GET_HEAP(struct mode_info, 1);
94 mi->mode = mode + VIDEO_FIRST_VESA;
95 mi->x = mi->y = 0;
96 nmodes++;
97#endif
98 }
99 }
100
101 return nmodes;
102#else
103 return 0;
104#endif /* CONFIG_VIDEO_VESA */
105}
106
107static int vesa_set_mode(struct mode_info *mode)
108{
109 u16 ax, bx, cx, di;
110 int is_graphic;
111 u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
112
113 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
114
115 ax = 0x4f01;
116 cx = vesa_mode;
117 di = (size_t)&vminfo;
118 asm(INT10
119 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
120 : : "ebx", "edx", "esi");
121
122 if (ax != 0x004f)
123 return -1;
124
125 if ((vminfo.mode_attr & 0x15) == 0x05) {
126 /* It's a supported text mode */
127 is_graphic = 0;
128 } else if ((vminfo.mode_attr & 0x99) == 0x99) {
129 /* It's a graphics mode with linear frame buffer */
130 is_graphic = 1;
131 vesa_mode |= 0x4000; /* Request linear frame buffer */
132 } else {
133 return -1; /* Invalid mode */
134 }
135
136
137 ax = 0x4f02;
138 bx = vesa_mode;
139 di = 0;
140 asm volatile(INT10
141 : "+a" (ax), "+b" (bx), "+D" (di)
142 : : "ecx", "edx", "esi");
143
144 if (ax != 0x004f)
145 return -1;
146
147 graphic_mode = is_graphic;
148 if (!is_graphic) {
149 /* Text mode */
150 force_x = mode->x;
151 force_y = mode->y;
152 do_restore = 1;
153 } else {
154 /* Graphics mode */
155 vesa_store_mode_params_graphics();
156 }
157
158 return 0;
159}
160
161
162/* Switch DAC to 8-bit mode */
163static void vesa_dac_set_8bits(void)
164{
165 u8 dac_size = 6;
166
167 /* If possible, switch the DAC to 8-bit mode */
168 if (vginfo.capabilities & 1) {
169 u16 ax, bx;
170
171 ax = 0x4f08;
172 bx = 0x0800;
173 asm volatile(INT10
174 : "+a" (ax), "+b" (bx)
175 : : "ecx", "edx", "esi", "edi");
176
177 if (ax == 0x004f)
178 dac_size = bx >> 8;
179 }
180
181 /* Set the color sizes to the DAC size, and offsets to 0 */
182 boot_params.screen_info.red_size = dac_size;
183 boot_params.screen_info.green_size = dac_size;
184 boot_params.screen_info.blue_size = dac_size;
185 boot_params.screen_info.rsvd_size = dac_size;
186
187 boot_params.screen_info.red_pos = 0;
188 boot_params.screen_info.green_pos = 0;
189 boot_params.screen_info.blue_pos = 0;
190 boot_params.screen_info.rsvd_pos = 0;
191}
192
193/* Save the VESA protected mode info */
194static void vesa_store_pm_info(void)
195{
196 u16 ax, bx, di, es;
197
198 ax = 0x4f0a;
199 bx = di = 0;
200 asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es"
201 : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
202 : : "ecx", "esi");
203
204 if (ax != 0x004f)
205 return;
206
207 boot_params.screen_info.vesapm_seg = es;
208 boot_params.screen_info.vesapm_off = di;
209}
210
211/*
212 * Save video mode parameters for graphics mode
213 */
214static void vesa_store_mode_params_graphics(void)
215{
216 /* Tell the kernel we're in VESA graphics mode */
217 boot_params.screen_info.orig_video_isVGA = 0x23;
218
219 /* Mode parameters */
220 boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
221 boot_params.screen_info.lfb_linelength = vminfo.logical_scan;
222 boot_params.screen_info.lfb_width = vminfo.h_res;
223 boot_params.screen_info.lfb_height = vminfo.v_res;
224 boot_params.screen_info.lfb_depth = vminfo.bpp;
225 boot_params.screen_info.pages = vminfo.image_planes;
226 boot_params.screen_info.lfb_base = vminfo.lfb_ptr;
227 memcpy(&boot_params.screen_info.red_size,
228 &vminfo.rmask, 8);
229
230 /* General parameters */
231 boot_params.screen_info.lfb_size = vginfo.total_memory;
232
233 if (vminfo.bpp <= 8)
234 vesa_dac_set_8bits();
235
236 vesa_store_pm_info();
237}
238
239/*
240 * Save EDID information for the kernel; this is invoked, separately,
241 * after mode-setting.
242 */
243void vesa_store_edid(void)
244{
245#ifdef CONFIG_FIRMWARE_EDID
246 u16 ax, bx, cx, dx, di;
247
248 /* Apparently used as a nonsense token... */
249 memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
250
251 if (vginfo.version < 0x0200)
252 return; /* EDID requires VBE 2.0+ */
253
254 ax = 0x4f15; /* VBE DDC */
255 bx = 0x0000; /* Report DDC capabilities */
256 cx = 0; /* Controller 0 */
257 di = 0; /* ES:DI must be 0 by spec */
258
259 /* Note: The VBE DDC spec is different from the main VESA spec;
260 we genuinely have to assume all registers are destroyed here. */
261
262 asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
263 : "+a" (ax), "+b" (bx)
264 : "c" (cx), "D" (di)
265 : "esi");
266
267 if (ax != 0x004f)
268 return; /* No EDID */
269
270 /* BH = time in seconds to transfer EDID information */
271 /* BL = DDC level supported */
272
273 ax = 0x4f15; /* VBE DDC */
274 bx = 0x0001; /* Read EDID */
275 cx = 0; /* Controller 0 */
276 dx = 0; /* EDID block number */
277 di = (size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
278 asm(INT10
279 : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info)
280 : "c" (cx), "D" (di)
281 : "esi");
282#endif /* CONFIG_FIRMWARE_EDID */
283}
284
285__videocard video_vesa =
286{
287 .card_name = "VESA",
288 .probe = vesa_probe,
289 .set_mode = vesa_set_mode,
290 .xmode_first = VIDEO_FIRST_VESA,
291 .xmode_n = 0x200,
292};
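Note on the two magic tests on vminfo.mode_attr in vesa_set_mode() above: they follow the VBE mode-attribute bit layout (roughly, bit 0 = mode supported, bit 2 = BIOS TTY output usable, bit 3 = color, bit 4 = graphics rather than text, bit 7 = linear frame buffer available). A minimal sketch with the masks spelled out, assuming those bit positions; the VBE_ATTR_* names and helpers are illustrative only, not part of the file:

#define VBE_ATTR_SUPPORTED	(1 << 0)	/* mode supported by hardware */
#define VBE_ATTR_BIOS_TTY	(1 << 2)	/* BIOS text output works here */
#define VBE_ATTR_COLOR		(1 << 3)	/* color mode */
#define VBE_ATTR_GRAPHICS	(1 << 4)	/* graphics, not text */
#define VBE_ATTR_LFB		(1 << 7)	/* linear frame buffer available */

/* (mode_attr & 0x15) == 0x05: supported text mode the BIOS can print to */
static inline int vesa_attr_is_text(u16 attr)
{
	return (attr & (VBE_ATTR_SUPPORTED | VBE_ATTR_BIOS_TTY | VBE_ATTR_GRAPHICS))
		== (VBE_ATTR_SUPPORTED | VBE_ATTR_BIOS_TTY);
}

/* (mode_attr & 0x99) == 0x99: supported color graphics mode with an LFB */
static inline int vesa_attr_is_lfb_graphics(u16 attr)
{
	return (attr & (VBE_ATTR_SUPPORTED | VBE_ATTR_COLOR |
			VBE_ATTR_GRAPHICS | VBE_ATTR_LFB))
		== (VBE_ATTR_SUPPORTED | VBE_ATTR_COLOR |
		    VBE_ATTR_GRAPHICS | VBE_ATTR_LFB);
}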
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
new file mode 100644
index 000000000000..aef02f9ec0c1
--- /dev/null
+++ b/arch/x86/boot/video-vga.c
@@ -0,0 +1,261 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/video-vga.c
13 *
14 * Common all-VGA modes
15 */
16
17#include "boot.h"
18#include "video.h"
19
20static struct mode_info vga_modes[] = {
21 { VIDEO_80x25, 80, 25 },
22 { VIDEO_8POINT, 80, 50 },
23 { VIDEO_80x43, 80, 43 },
24 { VIDEO_80x28, 80, 28 },
25 { VIDEO_80x30, 80, 30 },
26 { VIDEO_80x34, 80, 34 },
27 { VIDEO_80x60, 80, 60 },
28};
29
30static struct mode_info ega_modes[] = {
31 { VIDEO_80x25, 80, 25 },
32 { VIDEO_8POINT, 80, 43 },
33};
34
35static struct mode_info cga_modes[] = {
36 { VIDEO_80x25, 80, 25 },
37};
38
39__videocard video_vga;
40
41/* Set basic 80x25 mode */
42static u8 vga_set_basic_mode(void)
43{
44 u16 ax;
45 u8 rows;
46 u8 mode;
47
48#ifdef CONFIG_VIDEO_400_HACK
49 if (adapter >= ADAPTER_VGA) {
50 asm volatile(INT10
51 : : "a" (0x1202), "b" (0x0030)
52 : "ecx", "edx", "esi", "edi");
53 }
54#endif
55
56 ax = 0x0f00;
57 asm volatile(INT10
58 : "+a" (ax)
59 : : "ebx", "ecx", "edx", "esi", "edi");
60
61 mode = (u8)ax;
62
63 set_fs(0);
64 rows = rdfs8(0x484); /* rows minus one */
65
66#ifndef CONFIG_VIDEO_400_HACK
67 if ((ax == 0x5003 || ax == 0x5007) &&
68 (rows == 0 || rows == 24))
69 return mode;
70#endif
71
72 if (mode != 3 && mode != 7)
73 mode = 3;
74
75 /* Set the mode */
76 ax = mode;
77 asm volatile(INT10
78 : "+a" (ax)
79 : : "ebx", "ecx", "edx", "esi", "edi");
80 do_restore = 1;
81 return mode;
82}
83
84static void vga_set_8font(void)
85{
86 /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
87
88 /* Set 8x8 font */
89 asm volatile(INT10 : : "a" (0x1112), "b" (0));
90
91 /* Use alternate print screen */
92 asm volatile(INT10 : : "a" (0x1200), "b" (0x20));
93
94 /* Turn off cursor emulation */
95 asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
96
97 /* Cursor is scan lines 6-7 */
98 asm volatile(INT10 : : "a" (0x0100), "c" (0x0607));
99}
100
101static void vga_set_14font(void)
102{
103 /* Set 9x14 font - 80x28 on VGA */
104
105 /* Set 9x14 font */
106 asm volatile(INT10 : : "a" (0x1111), "b" (0));
107
108 /* Turn off cursor emulation */
109 asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
110
111 /* Cursor is scan lines 11-12 */
112 asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c));
113}
114
115static void vga_set_80x43(void)
116{
117 /* Set 80x43 mode on VGA (not EGA) */
118
119 /* Set 350 scans */
120 asm volatile(INT10 : : "a" (0x1201), "b" (0x30));
121
122 /* Reset video mode */
123 asm volatile(INT10 : : "a" (0x0003));
124
125 vga_set_8font();
126}
127
128/* I/O address of the VGA CRTC */
129u16 vga_crtc(void)
130{
131 return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4;
132}
133
134static void vga_set_480_scanlines(int end)
135{
136 u16 crtc;
137 u8 csel;
138
139 crtc = vga_crtc();
140
141 out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */
142 out_idx(0x0b, crtc, 0x06); /* Vertical total */
143 out_idx(0x3e, crtc, 0x07); /* Vertical overflow */
144 out_idx(0xea, crtc, 0x10); /* Vertical sync start */
145 out_idx(end, crtc, 0x12); /* Vertical display end */
146 out_idx(0xe7, crtc, 0x15); /* Vertical blank start */
147 out_idx(0x04, crtc, 0x16); /* Vertical blank end */
148 csel = inb(0x3cc);
149 csel &= 0x0d;
150 csel |= 0xe2;
151 outb(csel, 0x3cc);
152}
153
154static void vga_set_80x30(void)
155{
156 vga_set_480_scanlines(0xdf);
157}
158
159static void vga_set_80x34(void)
160{
161 vga_set_14font();
162 vga_set_480_scanlines(0xdb);
163}
164
165static void vga_set_80x60(void)
166{
167 vga_set_8font();
168 vga_set_480_scanlines(0xdf);
169}
170
171static int vga_set_mode(struct mode_info *mode)
172{
173 /* Set the basic mode */
174 vga_set_basic_mode();
175
176 /* Override a possibly broken BIOS */
177 force_x = mode->x;
178 force_y = mode->y;
179
180 switch (mode->mode) {
181 case VIDEO_80x25:
182 break;
183 case VIDEO_8POINT:
184 vga_set_8font();
185 break;
186 case VIDEO_80x43:
187 vga_set_80x43();
188 break;
189 case VIDEO_80x28:
190 vga_set_14font();
191 break;
192 case VIDEO_80x30:
193 vga_set_80x30();
194 break;
195 case VIDEO_80x34:
196 vga_set_80x34();
197 break;
198 case VIDEO_80x60:
199 vga_set_80x60();
200 break;
201 }
202
203 return 0;
204}
205
206/*
207 * Note: this probe includes basic information required by all
208 * systems. It should be executed first, by making sure
209 * video-vga.c is listed first in the Makefile.
210 */
211static int vga_probe(void)
212{
213 static const char *card_name[] = {
214 "CGA/MDA/HGC", "EGA", "VGA"
215 };
216 static struct mode_info *mode_lists[] = {
217 cga_modes,
218 ega_modes,
219 vga_modes,
220 };
221 static int mode_count[] = {
222 sizeof(cga_modes)/sizeof(struct mode_info),
223 sizeof(ega_modes)/sizeof(struct mode_info),
224 sizeof(vga_modes)/sizeof(struct mode_info),
225 };
226 u8 vga_flag;
227
228 asm(INT10
229 : "=b" (boot_params.screen_info.orig_video_ega_bx)
230 : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */
231 : "ecx", "edx", "esi", "edi");
232
233 /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
234 if ((u8)boot_params.screen_info.orig_video_ega_bx != 0x10) {
235 /* EGA/VGA */
236 asm(INT10
237 : "=a" (vga_flag)
238 : "a" (0x1a00)
239 : "ebx", "ecx", "edx", "esi", "edi");
240
241 if (vga_flag == 0x1a) {
242 adapter = ADAPTER_VGA;
243 boot_params.screen_info.orig_video_isVGA = 1;
244 } else {
245 adapter = ADAPTER_EGA;
246 }
247 } else {
248 adapter = ADAPTER_CGA;
249 }
250
251 video_vga.modes = mode_lists[adapter];
252 video_vga.card_name = card_name[adapter];
253 return mode_count[adapter];
254}
255
256__videocard video_vga =
257{
258 .card_name = "VGA",
259 .probe = vga_probe,
260 .set_mode = vga_set_mode,
261};
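Note on the values fed to vga_set_480_scanlines() above: the argument is simply the low byte of the vertical-display-end register, i.e. visible scan lines minus one, with the upper bits handled by the overflow register the function also programs. A worked check, assuming the usual font heights (16 for the default text font, 14 for the 9x14 font, 8 for the 8x8 font); the helper below is illustrative only:

/*
 *  80x30: 30 * 16 - 1 = 479 = 0x1df -> low byte 0xdf
 *  80x34: 34 * 14 - 1 = 475 = 0x1db -> low byte 0xdb
 *  80x60: 60 *  8 - 1 = 479 = 0x1df -> low byte 0xdf
 */
static inline u8 vdisplay_end_low(int rows, int font_height)
{
	return (u8)(rows * font_height - 1);	/* written to CRTC index 0x12 */
}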
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
new file mode 100644
index 000000000000..e4ba897bf9a3
--- /dev/null
+++ b/arch/x86/boot/video.c
@@ -0,0 +1,467 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/video.c
13 *
14 * Select video mode
15 */
16
17#include "boot.h"
18#include "video.h"
19#include "vesa.h"
20
21/*
22 * Mode list variables
23 */
24static struct card_info cards[]; /* List of cards to probe for */
25
26/*
27 * Common variables
28 */
29int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
30u16 video_segment;
31int force_x, force_y; /* Don't query the BIOS for cols/rows */
32
33int do_restore = 0; /* Screen contents changed during mode flip */
34int graphic_mode; /* Graphics mode with linear frame buffer */
35
36static void store_cursor_position(void)
37{
38 u16 curpos;
39 u16 ax, bx;
40
41 ax = 0x0300;
42 bx = 0;
43 asm(INT10
44 : "=d" (curpos), "+a" (ax), "+b" (bx)
45 : : "ecx", "esi", "edi");
46
47 boot_params.screen_info.orig_x = curpos;
48 boot_params.screen_info.orig_y = curpos >> 8;
49}
50
51static void store_video_mode(void)
52{
53 u16 ax, page;
54
55 /* N.B.: the saving of the video page here is a bit silly,
56 since we pretty much assume page 0 everywhere. */
57 ax = 0x0f00;
58 asm(INT10
59 : "+a" (ax), "=b" (page)
60 : : "ecx", "edx", "esi", "edi");
61
62 /* Not all BIOSes are clean with respect to the top bit */
63 boot_params.screen_info.orig_video_mode = ax & 0x7f;
64 boot_params.screen_info.orig_video_page = page >> 8;
65}
66
67/*
68 * Store the video mode parameters for later usage by the kernel.
69 * This is done by asking the BIOS except for the rows/columns
70 * parameters in the default 80x25 mode -- these are set directly,
71 * because some very obscure BIOSes supply insane values.
72 */
73static void store_mode_params(void)
74{
75 u16 font_size;
76 int x, y;
77
78 /* For graphics mode, it is up to the mode-setting driver
79 (currently only video-vesa.c) to store the parameters */
80 if (graphic_mode)
81 return;
82
83 store_cursor_position();
84 store_video_mode();
85
86 if (boot_params.screen_info.orig_video_mode == 0x07) {
87 /* MDA, HGC, or VGA in monochrome mode */
88 video_segment = 0xb000;
89 } else {
90 /* CGA, EGA, VGA and so forth */
91 video_segment = 0xb800;
92 }
93
94 set_fs(0);
95 font_size = rdfs16(0x485); /* Font size, BIOS area */
96 boot_params.screen_info.orig_video_points = font_size;
97
98 x = rdfs16(0x44a);
99 y = (adapter == ADAPTER_CGA) ? 25 : rdfs8(0x484)+1;
100
101 if (force_x)
102 x = force_x;
103 if (force_y)
104 y = force_y;
105
106 boot_params.screen_info.orig_video_cols = x;
107 boot_params.screen_info.orig_video_lines = y;
108}
109
110/* Probe the video drivers and have them generate their mode lists. */
111static void probe_cards(int unsafe)
112{
113 struct card_info *card;
114 static u8 probed[2];
115
116 if (probed[unsafe])
117 return;
118
119 probed[unsafe] = 1;
120
121 for (card = video_cards; card < video_cards_end; card++) {
122 if (card->unsafe == unsafe) {
123 if (card->probe)
124 card->nmodes = card->probe();
125 else
126 card->nmodes = 0;
127 }
128 }
129}
130
131/* Test if a mode is defined */
132int mode_defined(u16 mode)
133{
134 struct card_info *card;
135 struct mode_info *mi;
136 int i;
137
138 for (card = video_cards; card < video_cards_end; card++) {
139 mi = card->modes;
140 for (i = 0; i < card->nmodes; i++, mi++) {
141 if (mi->mode == mode)
142 return 1;
143 }
144 }
145
146 return 0;
147}
148
149/* Set mode (without recalc) */
150static int raw_set_mode(u16 mode, u16 *real_mode)
151{
152 int nmode, i;
153 struct card_info *card;
154 struct mode_info *mi;
155
156 /* Drop the recalc bit if set */
157 mode &= ~VIDEO_RECALC;
158
159 /* Scan for mode based on fixed ID, position, or resolution */
160 nmode = 0;
161 for (card = video_cards; card < video_cards_end; card++) {
162 mi = card->modes;
163 for (i = 0; i < card->nmodes; i++, mi++) {
164 int visible = mi->x || mi->y;
165
166 if ((mode == nmode && visible) ||
167 mode == mi->mode ||
168 mode == (mi->y << 8)+mi->x) {
169 *real_mode = mi->mode;
170 return card->set_mode(mi);
171 }
172
173 if (visible)
174 nmode++;
175 }
176 }
177
178 /* Nothing found? Is it an "exceptional" (unprobed) mode? */
179 for (card = video_cards; card < video_cards_end; card++) {
180 if (mode >= card->xmode_first &&
181 mode < card->xmode_first+card->xmode_n) {
182 struct mode_info mix;
183 *real_mode = mix.mode = mode;
184 mix.x = mix.y = 0;
185 return card->set_mode(&mix);
186 }
187 }
188
189 /* Otherwise, failure... */
190 return -1;
191}
192
193/*
194 * Recalculate the vertical video cutoff (hack!)
195 */
196static void vga_recalc_vertical(void)
197{
198 unsigned int font_size, rows;
199 u16 crtc;
200 u8 pt, ov;
201
202 set_fs(0);
203 font_size = rdfs8(0x485); /* BIOS: font size (pixels) */
204 rows = force_y ? force_y : rdfs8(0x484)+1; /* Text rows */
205
206 rows *= font_size; /* Visible scan lines */
207 rows--; /* ... minus one */
208
209 crtc = vga_crtc();
210
211 pt = in_idx(crtc, 0x11);
212 pt &= ~0x80; /* Unlock CR0-7 */
213 out_idx(pt, crtc, 0x11);
214
215 out_idx((u8)rows, crtc, 0x12); /* Lower height register */
216
217 ov = in_idx(crtc, 0x07); /* Overflow register */
218 ov &= 0xbd;
219 ov |= (rows >> (8-1)) & 0x02;
220 ov |= (rows >> (9-6)) & 0x40;
221 out_idx(ov, crtc, 0x07);
222}
223
224/* Set mode (with recalc if specified) */
225static int set_mode(u16 mode)
226{
227 int rv;
228 u16 real_mode;
229
230 /* Very special mode numbers... */
231 if (mode == VIDEO_CURRENT_MODE)
232 return 0; /* Nothing to do... */
233 else if (mode == NORMAL_VGA)
234 mode = VIDEO_80x25;
235 else if (mode == EXTENDED_VGA)
236 mode = VIDEO_8POINT;
237
238 rv = raw_set_mode(mode, &real_mode);
239 if (rv)
240 return rv;
241
242 if (mode & VIDEO_RECALC)
243 vga_recalc_vertical();
244
245 /* Save the canonical mode number for the kernel, not
246 an alias, size specification or menu position */
247 boot_params.hdr.vid_mode = real_mode;
248 return 0;
249}
250
251static unsigned int get_entry(void)
252{
253 char entry_buf[4];
254 int i, len = 0;
255 int key;
256 unsigned int v;
257
258 do {
259 key = getchar();
260
261 if (key == '\b') {
262 if (len > 0) {
263 puts("\b \b");
264 len--;
265 }
266 } else if ((key >= '0' && key <= '9') ||
267 (key >= 'A' && key <= 'Z') ||
268 (key >= 'a' && key <= 'z')) {
269 if (len < sizeof entry_buf) {
270 entry_buf[len++] = key;
271 putchar(key);
272 }
273 }
274 } while (key != '\r');
275 putchar('\n');
276
277 if (len == 0)
278 return VIDEO_CURRENT_MODE; /* Default */
279
280 v = 0;
281 for (i = 0; i < len; i++) {
282 v <<= 4;
283 key = entry_buf[i] | 0x20;
284 v += (key > '9') ? key-'a'+10 : key-'0';
285 }
286
287 return v;
288}
289
290static void display_menu(void)
291{
292 struct card_info *card;
293 struct mode_info *mi;
294 char ch;
295 int i;
296
297 puts("Mode: COLSxROWS:\n");
298
299 ch = '0';
300 for (card = video_cards; card < video_cards_end; card++) {
301 mi = card->modes;
302 for (i = 0; i < card->nmodes; i++, mi++) {
303 int visible = mi->x && mi->y;
304 u16 mode_id = mi->mode ? mi->mode :
305 (mi->y << 8)+mi->x;
306
307 if (!visible)
308 continue; /* Hidden mode */
309
310 printf("%c %04X %3dx%-3d %s\n",
311 ch, mode_id, mi->x, mi->y, card->card_name);
312
313 if (ch == '9')
314 ch = 'a';
315 else if (ch == 'z' || ch == ' ')
316 ch = ' '; /* Out of keys... */
317 else
318 ch++;
319 }
320 }
321}
322
323#define H(x) ((x)-'a'+10)
324#define SCAN ((H('s')<<12)+(H('c')<<8)+(H('a')<<4)+H('n'))
325
326static unsigned int mode_menu(void)
327{
328 int key;
329 unsigned int sel;
330
331 puts("Press <ENTER> to see video modes available, "
332 "<SPACE> to continue, or wait 30 sec\n");
333
334 kbd_flush();
335 while (1) {
336 key = getchar_timeout();
337 if (key == ' ' || key == 0)
338 return VIDEO_CURRENT_MODE; /* Default */
339 if (key == '\r')
340 break;
341 putchar('\a'); /* Beep! */
342 }
343
344
345 for (;;) {
346 display_menu();
347
348 puts("Enter a video mode or \"scan\" to scan for "
349 "additional modes: ");
350 sel = get_entry();
351 if (sel != SCAN)
352 return sel;
353
354 probe_cards(1);
355 }
356}
357
358#ifdef CONFIG_VIDEO_RETAIN
359/* Save screen content to the heap */
360struct saved_screen {
361 int x, y;
362 int curx, cury;
363 u16 *data;
364} saved;
365
366static void save_screen(void)
367{
368 /* Should be called after store_mode_params() */
369 saved.x = boot_params.screen_info.orig_video_cols;
370 saved.y = boot_params.screen_info.orig_video_lines;
371 saved.curx = boot_params.screen_info.orig_x;
372 saved.cury = boot_params.screen_info.orig_y;
373
374 if (heap_free() < saved.x*saved.y*sizeof(u16)+512)
375 return; /* Not enough heap to save the screen */
376
377 saved.data = GET_HEAP(u16, saved.x*saved.y);
378
379 set_fs(video_segment);
380 copy_from_fs(saved.data, 0, saved.x*saved.y*sizeof(u16));
381}
382
383static void restore_screen(void)
384{
385 /* Should be called after store_mode_params() */
386 int xs = boot_params.screen_info.orig_video_cols;
387 int ys = boot_params.screen_info.orig_video_lines;
388 int y;
389 addr_t dst = 0;
390 u16 *src = saved.data;
391 u16 ax, bx, dx;
392
393 if (graphic_mode)
394 return; /* Can't restore onto a graphic mode */
395
396 if (!src)
397 return; /* No saved screen contents */
398
399 /* Restore screen contents */
400
401 set_fs(video_segment);
402 for (y = 0; y < ys; y++) {
403 int npad;
404
405 if (y < saved.y) {
406 int copy = (xs < saved.x) ? xs : saved.x;
407 copy_to_fs(dst, src, copy*sizeof(u16));
408 dst += copy*sizeof(u16);
409 src += saved.x;
410 npad = (xs < saved.x) ? 0 : xs-saved.x;
411 } else {
412 npad = xs;
413 }
414
415 /* Writes "npad" blank characters to
416 video_segment:dst and advances dst */
417 asm volatile("pushw %%es ; "
418 "movw %2,%%es ; "
419 "shrw %%cx ; "
420 "jnc 1f ; "
421 "stosw \n\t"
422 "1: rep;stosl ; "
423 "popw %%es"
424 : "+D" (dst), "+c" (npad)
425 : "bdS" (video_segment),
426 "a" (0x07200720));
427 }
428
429 /* Restore cursor position */
430 ax = 0x0200; /* Set cursor position */
431 bx = 0; /* Page number (<< 8) */
432 dx = (saved.cury << 8)+saved.curx;
433 asm volatile(INT10
434 : "+a" (ax), "+b" (bx), "+d" (dx)
435 : : "ecx", "esi", "edi");
436}
437#else
438#define save_screen() ((void)0)
439#define restore_screen() ((void)0)
440#endif
441
442void set_video(void)
443{
444 u16 mode = boot_params.hdr.vid_mode;
445
446 RESET_HEAP();
447
448 store_mode_params();
449 save_screen();
450 probe_cards(0);
451
452 for (;;) {
453 if (mode == ASK_VGA)
454 mode = mode_menu();
455
456 if (!set_mode(mode))
457 break;
458
459 printf("Undefined video mode number: %x\n", mode);
460 mode = ASK_VGA;
461 }
462 vesa_store_edid();
463 store_mode_params();
464
465 if (do_restore)
466 restore_screen();
467}
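Note on raw_set_mode() above: it accepts a requested mode in three forms -- a menu position (counting only visible modes), a driver mode ID such as VIDEO_80x25, or a resolution packed as (rows << 8) + cols. A small sketch of the packed form; mode_from_resolution is an illustrative helper, not part of the file:

/* e.g. 80x30 text -> (30 << 8) + 80 = 0x1e50, matched against (mi->y << 8) + mi->x */
static inline u16 mode_from_resolution(u8 cols, u8 rows)
{
	return ((u16)rows << 8) + cols;
}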
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
new file mode 100644
index 000000000000..b92447d51213
--- /dev/null
+++ b/arch/x86/boot/video.h
@@ -0,0 +1,152 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/x86/boot/video.h
13 *
14 * Header file for the real-mode video probing code
15 */
16
17#ifndef BOOT_VIDEO_H
18#define BOOT_VIDEO_H
19
20#include <linux/types.h>
21
22/* Enable autodetection of SVGA adapters and modes. */
23#undef CONFIG_VIDEO_SVGA
24
25/* Enable autodetection of VESA modes */
26#define CONFIG_VIDEO_VESA
27
28/* Retain screen contents when switching modes */
29#define CONFIG_VIDEO_RETAIN
30
31/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour) */
32#undef CONFIG_VIDEO_400_HACK
33
34/* This code uses an extended set of video mode numbers. These include:
35 * Aliases for standard modes
36 * NORMAL_VGA (-1)
37 * EXTENDED_VGA (-2)
38 * ASK_VGA (-3)
39 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
40 * of compatibility when extending the table. These are between 0x00 and 0xff.
41 */
42#define VIDEO_FIRST_MENU 0x0000
43
44/* Standard BIOS video modes (BIOS number + 0x0100) */
45#define VIDEO_FIRST_BIOS 0x0100
46
47/* VESA BIOS video modes (VESA number + 0x0200) */
48#define VIDEO_FIRST_VESA 0x0200
49
50/* Video7 special modes (BIOS number + 0x0900) */
51#define VIDEO_FIRST_V7 0x0900
52
53/* Special video modes */
54#define VIDEO_FIRST_SPECIAL 0x0f00
55#define VIDEO_80x25 0x0f00
56#define VIDEO_8POINT 0x0f01
57#define VIDEO_80x43 0x0f02
58#define VIDEO_80x28 0x0f03
59#define VIDEO_CURRENT_MODE 0x0f04
60#define VIDEO_80x30 0x0f05
61#define VIDEO_80x34 0x0f06
62#define VIDEO_80x60 0x0f07
63#define VIDEO_GFX_HACK 0x0f08
64#define VIDEO_LAST_SPECIAL 0x0f09
65
66/* Video modes given by resolution */
67#define VIDEO_FIRST_RESOLUTION 0x1000
68
69/* The "recalculate timings" flag */
70#define VIDEO_RECALC 0x8000
71
72/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
73#ifdef CONFIG_VIDEO_RETAIN
74void store_screen(void);
75#define DO_STORE() store_screen()
76#else
77#define DO_STORE() ((void)0)
78#endif /* CONFIG_VIDEO_RETAIN */
79
80/*
81 * Mode table structures
82 */
83
84struct mode_info {
85 u16 mode; /* Mode number (vga= style) */
86 u8 x, y; /* Width, height */
87};
88
89struct card_info {
90 const char *card_name;
91 int (*set_mode)(struct mode_info *mode);
92 int (*probe)(void);
93 struct mode_info *modes;
94 int nmodes; /* Number of probed modes so far */
95 int unsafe; /* Probing is unsafe, only do after "scan" */
96 u16 xmode_first; /* Unprobed modes to try to call anyway */
97 u16 xmode_n; /* Size of unprobed mode range */
98};
99
100#define __videocard struct card_info __attribute__((section(".videocards")))
101extern struct card_info video_cards[], video_cards_end[];
102
103int mode_defined(u16 mode); /* video.c */
104
105/* Basic video information */
106#define ADAPTER_CGA 0 /* CGA/MDA/HGC */
107#define ADAPTER_EGA 1
108#define ADAPTER_VGA 2
109
110extern int adapter;
111extern u16 video_segment;
112extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
113extern int do_restore; /* Restore screen contents */
114extern int graphic_mode; /* Graphics mode with linear frame buffer */
115
116/*
117 * int $0x10 is notorious for touching registers it shouldn't.
118 * gcc doesn't like %ebp being clobbered, so define it as a push/pop
119 * sequence here.
120 *
121 * A number of systems, including the original PC, can clobber %bp in
122 * certain circumstances, like when scrolling. There exists at least
123 * one Trident video card which could clobber DS under a set of
124 * circumstances that we are unlikely to encounter (scrolling when
125 * using an extended graphics mode of more than 800x600 pixels), but
126 * it's cheap insurance to deal with that here.
127 */
128#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
129
130/* Accessing VGA indexed registers */
131static inline u8 in_idx(u16 port, u8 index)
132{
133 outb(index, port);
134 return inb(port+1);
135}
136
137static inline void out_idx(u8 v, u16 port, u8 index)
138{
139 outw(index+(v << 8), port);
140}
141
142/* Writes a value to an indexed port and then reads the port again */
143static inline u8 tst_idx(u8 v, u16 port, u8 index)
144{
145 out_idx(v, port, index);
146 return in_idx(port, index);
147}
148
149/* Get the I/O port of the VGA CRTC */
150u16 vga_crtc(void); /* video-vga.c */
151
152#endif /* BOOT_VIDEO_H */
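Note on __videocard above: each tagged struct card_info is placed in the .videocards section, and the boot linker script is expected to bracket that section with the video_cards[] and video_cards_end[] symbols that probe_cards() in video.c walks, so no central driver list is needed. A hypothetical driver registers itself along these lines; foo_probe, foo_set_mode and video_foo are illustrative names only:

static int foo_probe(void)
{
	/* detect hardware, fill in video_foo.modes, return number of modes */
	return 0;
}

static int foo_set_mode(struct mode_info *mode)
{
	return -1;	/* nothing to program in this sketch */
}

__videocard video_foo = {
	.card_name	= "FOO",
	.probe		= foo_probe,
	.set_mode	= foo_set_mode,
};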
diff --git a/arch/x86/boot/voyager.c b/arch/x86/boot/voyager.c
new file mode 100644
index 000000000000..61c8fe0453be
--- /dev/null
+++ b/arch/x86/boot/voyager.c
@@ -0,0 +1,46 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/x86/boot/voyager.c
13 *
14 * Get the Voyager config information
15 */
16
17#include "boot.h"
18
19#ifdef CONFIG_X86_VOYAGER
20
21int query_voyager(void)
22{
23 u8 err;
24 u16 es, di;
25 /* Abuse the apm_bios_info area for this */
26 u8 *data_ptr = (u8 *)&boot_params.apm_bios_info;
27
28 data_ptr[0] = 0xff; /* Flag on config not found(?) */
29
30 asm("pushw %%es ; "
31 "int $0x15 ; "
32 "setc %0 ; "
33 "movw %%es, %1 ; "
34 "popw %%es"
35 : "=q" (err), "=r" (es), "=D" (di)
36 : "a" (0xffc0));
37
38 if (err)
39 return -1; /* Not Voyager */
40
41 set_fs(es);
42 copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */
43 return 0;
44}
45
46#endif /* CONFIG_X86_VOYAGER */
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
new file mode 100644
index 000000000000..18dcdc6fb7aa
--- /dev/null
+++ b/arch/x86/crypto/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/crypto/Makefile_32
3else
4include ${srctree}/arch/x86/crypto/Makefile_64
5endif
diff --git a/arch/x86/crypto/Makefile_32 b/arch/x86/crypto/Makefile_32
new file mode 100644
index 000000000000..2d873a2388ed
--- /dev/null
+++ b/arch/x86/crypto/Makefile_32
@@ -0,0 +1,12 @@
1#
2# x86/crypto/Makefile
3#
4# Arch-specific CryptoAPI modules.
5#
6
7obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
8obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
9
10aes-i586-y := aes-i586-asm_32.o aes_32.o
11twofish-i586-y := twofish-i586-asm_32.o twofish_32.o
12
diff --git a/arch/x86/crypto/Makefile_64 b/arch/x86/crypto/Makefile_64
new file mode 100644
index 000000000000..b40896276e93
--- /dev/null
+++ b/arch/x86/crypto/Makefile_64
@@ -0,0 +1,12 @@
1#
2# x86/crypto/Makefile
3#
4# Arch-specific CryptoAPI modules.
5#
6
7obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
8obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
9
10aes-x86_64-y := aes-x86_64-asm_64.o aes_64.o
11twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o
12
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
new file mode 100644
index 000000000000..f942f0c8f630
--- /dev/null
+++ b/arch/x86/crypto/aes-i586-asm_32.S
@@ -0,0 +1,373 @@
1// -------------------------------------------------------------------------
2// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK.
3// All rights reserved.
4//
5// LICENSE TERMS
6//
7// The free distribution and use of this software in both source and binary
8// form is allowed (with or without changes) provided that:
9//
10// 1. distributions of this source code include the above copyright
11// notice, this list of conditions and the following disclaimer//
12//
13// 2. distributions in binary form include the above copyright
14// notice, this list of conditions and the following disclaimer
15// in the documentation and/or other associated materials//
16//
17// 3. the copyright holder's name is not used to endorse products
18// built using this software without specific written permission.
19//
20//
21// ALTERNATIVELY, provided that this notice is retained in full, this product
22// may be distributed under the terms of the GNU General Public License (GPL),
23// in which case the provisions of the GPL apply INSTEAD OF those given above.
24//
25// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
26// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
27
28// DISCLAIMER
29//
30// This software is provided 'as is' with no explicit or implied warranties
31// in respect of its properties including, but not limited to, correctness
32// and fitness for purpose.
33// -------------------------------------------------------------------------
34// Issue Date: 29/07/2002
35
36.file "aes-i586-asm.S"
37.text
38
39#include <asm/asm-offsets.h>
40
41#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words)
42
43/* offsets to parameters with one register pushed onto stack */
44#define tfm 8
45#define out_blk 12
46#define in_blk 16
47
48/* offsets in crypto_tfm structure */
49#define ekey (crypto_tfm_ctx_offset + 0)
50#define nrnd (crypto_tfm_ctx_offset + 256)
51#define dkey (crypto_tfm_ctx_offset + 260)
52
53// register mapping for encrypt and decrypt subroutines
54
55#define r0 eax
56#define r1 ebx
57#define r2 ecx
58#define r3 edx
59#define r4 esi
60#define r5 edi
61
62#define eaxl al
63#define eaxh ah
64#define ebxl bl
65#define ebxh bh
66#define ecxl cl
67#define ecxh ch
68#define edxl dl
69#define edxh dh
70
71#define _h(reg) reg##h
72#define h(reg) _h(reg)
73
74#define _l(reg) reg##l
75#define l(reg) _l(reg)
76
77// This macro takes a 32-bit word representing a column and uses
78// each of its four bytes to index into four tables of 256 32-bit
79// words to obtain values that are then xored into the appropriate
80// output registers r0, r1, r4 or r5.
81
82// Parameters:
83// table table base address
84// %1 out_state[0]
85// %2 out_state[1]
86// %3 out_state[2]
87// %4 out_state[3]
88// idx input register for the round (destroyed)
89// tmp scratch register for the round
90// sched key schedule
91
92#define do_col(table, a1,a2,a3,a4, idx, tmp) \
93 movzx %l(idx),%tmp; \
94 xor table(,%tmp,4),%a1; \
95 movzx %h(idx),%tmp; \
96 shr $16,%idx; \
97 xor table+tlen(,%tmp,4),%a2; \
98 movzx %l(idx),%tmp; \
99 movzx %h(idx),%idx; \
100 xor table+2*tlen(,%tmp,4),%a3; \
101 xor table+3*tlen(,%idx,4),%a4;
102
103// initialise output registers from the key schedule
104// NB1: original value of a3 is in idx on exit
105// NB2: original values of a1,a2,a4 aren't used
106#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
107 mov 0 sched,%a1; \
108 movzx %l(idx),%tmp; \
109 mov 12 sched,%a2; \
110 xor table(,%tmp,4),%a1; \
111 mov 4 sched,%a4; \
112 movzx %h(idx),%tmp; \
113 shr $16,%idx; \
114 xor table+tlen(,%tmp,4),%a2; \
115 movzx %l(idx),%tmp; \
116 movzx %h(idx),%idx; \
117 xor table+3*tlen(,%idx,4),%a4; \
118 mov %a3,%idx; \
119 mov 8 sched,%a3; \
120 xor table+2*tlen(,%tmp,4),%a3;
121
122// initialise output registers from the key schedule
123// NB1: original value of a3 is in idx on exit
124// NB2: original values of a1,a2,a4 aren't used
125#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
126 mov 0 sched,%a1; \
127 movzx %l(idx),%tmp; \
128 mov 4 sched,%a2; \
129 xor table(,%tmp,4),%a1; \
130 mov 12 sched,%a4; \
131 movzx %h(idx),%tmp; \
132 shr $16,%idx; \
133 xor table+tlen(,%tmp,4),%a2; \
134 movzx %l(idx),%tmp; \
135 movzx %h(idx),%idx; \
136 xor table+3*tlen(,%idx,4),%a4; \
137 mov %a3,%idx; \
138 mov 8 sched,%a3; \
139 xor table+2*tlen(,%tmp,4),%a3;
140
141
142// original Gladman had conditional saves to MMX regs.
143#define save(a1, a2) \
144 mov %a2,4*a1(%esp)
145
146#define restore(a1, a2) \
147 mov 4*a2(%esp),%a1
148
149// These macros perform a forward encryption cycle. They are entered with
150// the first previous round column values in r0,r1,r4,r5 and
151// exit with the final values in the same registers, using stack
152// for temporary storage.
153
154// round column values
155// on entry: r0,r1,r4,r5
156// on exit: r2,r1,r4,r5
157#define fwd_rnd1(arg, table) \
158 save (0,r1); \
159 save (1,r5); \
160 \
161 /* compute new column values */ \
162 do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \
163 do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \
164 restore(r0,0); \
165 do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \
166 restore(r0,1); \
167 do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */
168
169// round column values
170// on entry: r2,r1,r4,r5
171// on exit: r0,r1,r4,r5
172#define fwd_rnd2(arg, table) \
173 save (0,r1); \
174 save (1,r5); \
175 \
176 /* compute new column values */ \
177 do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \
178 do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \
179 restore(r2,0); \
180 do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \
181 restore(r2,1); \
182 do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */
183
184// These macros perform an inverse encryption cycle. They are entered with
185// the first previous round column values in r0,r1,r4,r5 and
186// exit with the final values in the same registers, using stack
187// for temporary storage
188
189// round column values
190// on entry: r0,r1,r4,r5
191// on exit: r2,r1,r4,r5
192#define inv_rnd1(arg, table) \
193 save (0,r1); \
194 save (1,r5); \
195 \
196 /* compute new column values */ \
197 do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \
198 do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \
199 restore(r0,0); \
200 do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \
201 restore(r0,1); \
202 do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */
203
204// round column values
205// on entry: r2,r1,r4,r5
206// on exit: r0,r1,r4,r5
207#define inv_rnd2(arg, table) \
208 save (0,r1); \
209 save (1,r5); \
210 \
211 /* compute new column values */ \
212 do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \
213 do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \
214 restore(r2,0); \
215 do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \
216 restore(r2,1); \
217 do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */
218
219// AES (Rijndael) Encryption Subroutine
220/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
221
222.global aes_enc_blk
223
224.extern ft_tab
225.extern fl_tab
226
227.align 4
228
229aes_enc_blk:
230 push %ebp
231 mov tfm(%esp),%ebp
232
233// CAUTION: the order and the values used in these assigns
234// rely on the register mappings
235
2361: push %ebx
237 mov in_blk+4(%esp),%r2
238 push %esi
239 mov nrnd(%ebp),%r3 // number of rounds
240 push %edi
241#if ekey != 0
242 lea ekey(%ebp),%ebp // key pointer
243#endif
244
245// input four columns and xor in first round key
246
247 mov (%r2),%r0
248 mov 4(%r2),%r1
249 mov 8(%r2),%r4
250 mov 12(%r2),%r5
251 xor (%ebp),%r0
252 xor 4(%ebp),%r1
253 xor 8(%ebp),%r4
254 xor 12(%ebp),%r5
255
256 sub $8,%esp // space for register saves on stack
257 add $16,%ebp // increment to next round key
258 cmp $12,%r3
259 jb 4f // 10 rounds for 128-bit key
260 lea 32(%ebp),%ebp
261 je 3f // 12 rounds for 192-bit key
262 lea 32(%ebp),%ebp
263
2642: fwd_rnd1( -64(%ebp) ,ft_tab) // 14 rounds for 256-bit key
265 fwd_rnd2( -48(%ebp) ,ft_tab)
2663: fwd_rnd1( -32(%ebp) ,ft_tab) // 12 rounds for 192-bit key
267 fwd_rnd2( -16(%ebp) ,ft_tab)
2684: fwd_rnd1( (%ebp) ,ft_tab) // 10 rounds for 128-bit key
269 fwd_rnd2( +16(%ebp) ,ft_tab)
270 fwd_rnd1( +32(%ebp) ,ft_tab)
271 fwd_rnd2( +48(%ebp) ,ft_tab)
272 fwd_rnd1( +64(%ebp) ,ft_tab)
273 fwd_rnd2( +80(%ebp) ,ft_tab)
274 fwd_rnd1( +96(%ebp) ,ft_tab)
275 fwd_rnd2(+112(%ebp) ,ft_tab)
276 fwd_rnd1(+128(%ebp) ,ft_tab)
277 fwd_rnd2(+144(%ebp) ,fl_tab) // last round uses a different table
278
279// move final values to the output array. CAUTION: the
280// order of these assigns rely on the register mappings
281
282 add $8,%esp
283 mov out_blk+12(%esp),%ebp
284 mov %r5,12(%ebp)
285 pop %edi
286 mov %r4,8(%ebp)
287 pop %esi
288 mov %r1,4(%ebp)
289 pop %ebx
290 mov %r0,(%ebp)
291 pop %ebp
292 mov $1,%eax
293 ret
294
295// AES (Rijndael) Decryption Subroutine
296/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
297
298.global aes_dec_blk
299
300.extern it_tab
301.extern il_tab
302
303.align 4
304
305aes_dec_blk:
306 push %ebp
307 mov tfm(%esp),%ebp
308
309// CAUTION: the order and the values used in these assigns
310// rely on the register mappings
311
3121: push %ebx
313 mov in_blk+4(%esp),%r2
314 push %esi
315 mov nrnd(%ebp),%r3 // number of rounds
316 push %edi
317#if dkey != 0
318 lea dkey(%ebp),%ebp // key pointer
319#endif
320 mov %r3,%r0
321 shl $4,%r0
322 add %r0,%ebp
323
324// input four columns and xor in first round key
325
326 mov (%r2),%r0
327 mov 4(%r2),%r1
328 mov 8(%r2),%r4
329 mov 12(%r2),%r5
330 xor (%ebp),%r0
331 xor 4(%ebp),%r1
332 xor 8(%ebp),%r4
333 xor 12(%ebp),%r5
334
335 sub $8,%esp // space for register saves on stack
336 sub $16,%ebp // increment to next round key
337 cmp $12,%r3
338 jb 4f // 10 rounds for 128-bit key
339 lea -32(%ebp),%ebp
340 je 3f // 12 rounds for 192-bit key
341 lea -32(%ebp),%ebp
342
3432: inv_rnd1( +64(%ebp), it_tab) // 14 rounds for 256-bit key
344 inv_rnd2( +48(%ebp), it_tab)
3453: inv_rnd1( +32(%ebp), it_tab) // 12 rounds for 192-bit key
346 inv_rnd2( +16(%ebp), it_tab)
3474: inv_rnd1( (%ebp), it_tab) // 10 rounds for 128-bit key
348 inv_rnd2( -16(%ebp), it_tab)
349 inv_rnd1( -32(%ebp), it_tab)
350 inv_rnd2( -48(%ebp), it_tab)
351 inv_rnd1( -64(%ebp), it_tab)
352 inv_rnd2( -80(%ebp), it_tab)
353 inv_rnd1( -96(%ebp), it_tab)
354 inv_rnd2(-112(%ebp), it_tab)
355 inv_rnd1(-128(%ebp), it_tab)
356 inv_rnd2(-144(%ebp), il_tab) // last round uses a different table
357
358// move final values to the output array. CAUTION: the
359// order of these assigns rely on the register mappings
360
361 add $8,%esp
362 mov out_blk+12(%esp),%ebp
363 mov %r5,12(%ebp)
364 pop %edi
365 mov %r4,8(%ebp)
366 pop %esi
367 mov %r1,4(%ebp)
368 pop %ebx
369 mov %r0,(%ebp)
370 pop %ebp
371 mov $1,%eax
372 ret
373
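Note on the cmp $12 / jb / je sequences in both routines above: they pick 10, 12 or 14 rounds from the nrnd field and, together with the lea adjustments of %ebp, let the fully unrolled round sequence be entered at label 4:, 3: or 2: respectively. The round count itself follows the usual AES rule, sketched here as an illustrative helper (key_len in bytes):

/* 16-byte key -> 10 rounds, 24 -> 12, 32 -> 14 */
static inline unsigned int aes_rounds(unsigned int key_len)
{
	return key_len / 4 + 6;
}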
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
new file mode 100644
index 000000000000..26b40de4d0b0
--- /dev/null
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -0,0 +1,190 @@
1/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64
2 *
3 * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de>
4 *
5 * License:
6 * This code can be distributed under the terms of the GNU General Public
7 * License (GPL) Version 2 provided that the above header down to and
8 * including this sentence is retained in full.
9 */
10
11.extern aes_ft_tab
12.extern aes_it_tab
13.extern aes_fl_tab
14.extern aes_il_tab
15
16.text
17
18#include <asm/asm-offsets.h>
19
20#define BASE crypto_tfm_ctx_offset
21
22#define R1 %rax
23#define R1E %eax
24#define R1X %ax
25#define R1H %ah
26#define R1L %al
27#define R2 %rbx
28#define R2E %ebx
29#define R2X %bx
30#define R2H %bh
31#define R2L %bl
32#define R3 %rcx
33#define R3E %ecx
34#define R3X %cx
35#define R3H %ch
36#define R3L %cl
37#define R4 %rdx
38#define R4E %edx
39#define R4X %dx
40#define R4H %dh
41#define R4L %dl
42#define R5 %rsi
43#define R5E %esi
44#define R6 %rdi
45#define R6E %edi
46#define R7 %rbp
47#define R7E %ebp
48#define R8 %r8
49#define R9 %r9
50#define R10 %r10
51#define R11 %r11
52
53#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
54 .global FUNC; \
55 .type FUNC,@function; \
56 .align 8; \
57FUNC: movq r1,r2; \
58 movq r3,r4; \
59 leaq BASE+KEY+52(r8),r9; \
60 movq r10,r11; \
61 movl (r7),r5 ## E; \
62 movl 4(r7),r1 ## E; \
63 movl 8(r7),r6 ## E; \
64 movl 12(r7),r7 ## E; \
65 movl BASE(r8),r10 ## E; \
66 xorl -48(r9),r5 ## E; \
67 xorl -44(r9),r1 ## E; \
68 xorl -40(r9),r6 ## E; \
69 xorl -36(r9),r7 ## E; \
70 cmpl $24,r10 ## E; \
71 jb B128; \
72 leaq 32(r9),r9; \
73 je B192; \
74 leaq 32(r9),r9;
75
76#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
77 movq r1,r2; \
78 movq r3,r4; \
79 movl r5 ## E,(r9); \
80 movl r6 ## E,4(r9); \
81 movl r7 ## E,8(r9); \
82 movl r8 ## E,12(r9); \
83 ret;
84
85#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
86 movzbl r2 ## H,r5 ## E; \
87 movzbl r2 ## L,r6 ## E; \
88 movl TAB+1024(,r5,4),r5 ## E;\
89 movw r4 ## X,r2 ## X; \
90 movl TAB(,r6,4),r6 ## E; \
91 roll $16,r2 ## E; \
92 shrl $16,r4 ## E; \
93 movzbl r4 ## H,r7 ## E; \
94 movzbl r4 ## L,r4 ## E; \
95 xorl OFFSET(r8),ra ## E; \
96 xorl OFFSET+4(r8),rb ## E; \
97 xorl TAB+3072(,r7,4),r5 ## E;\
98 xorl TAB+2048(,r4,4),r6 ## E;\
99 movzbl r1 ## L,r7 ## E; \
100 movzbl r1 ## H,r4 ## E; \
101 movl TAB+1024(,r4,4),r4 ## E;\
102 movw r3 ## X,r1 ## X; \
103 roll $16,r1 ## E; \
104 shrl $16,r3 ## E; \
105 xorl TAB(,r7,4),r5 ## E; \
106 movzbl r3 ## H,r7 ## E; \
107 movzbl r3 ## L,r3 ## E; \
108 xorl TAB+3072(,r7,4),r4 ## E;\
109 xorl TAB+2048(,r3,4),r5 ## E;\
110 movzbl r1 ## H,r7 ## E; \
111 movzbl r1 ## L,r3 ## E; \
112 shrl $16,r1 ## E; \
113 xorl TAB+3072(,r7,4),r6 ## E;\
114 movl TAB+2048(,r3,4),r3 ## E;\
115 movzbl r1 ## H,r7 ## E; \
116 movzbl r1 ## L,r1 ## E; \
117 xorl TAB+1024(,r7,4),r6 ## E;\
118 xorl TAB(,r1,4),r3 ## E; \
119 movzbl r2 ## H,r1 ## E; \
120 movzbl r2 ## L,r7 ## E; \
121 shrl $16,r2 ## E; \
122 xorl TAB+3072(,r1,4),r3 ## E;\
123 xorl TAB+2048(,r7,4),r4 ## E;\
124 movzbl r2 ## H,r1 ## E; \
125 movzbl r2 ## L,r2 ## E; \
126 xorl OFFSET+8(r8),rc ## E; \
127 xorl OFFSET+12(r8),rd ## E; \
128 xorl TAB+1024(,r1,4),r3 ## E;\
129 xorl TAB(,r2,4),r4 ## E;
130
131#define move_regs(r1,r2,r3,r4) \
132 movl r3 ## E,r1 ## E; \
133 movl r4 ## E,r2 ## E;
134
135#define entry(FUNC,KEY,B128,B192) \
136 prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
137
138#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
139
140#define encrypt_round(TAB,OFFSET) \
141 round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
142 move_regs(R1,R2,R5,R6)
143
144#define encrypt_final(TAB,OFFSET) \
145 round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
146
147#define decrypt_round(TAB,OFFSET) \
148 round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
149 move_regs(R1,R2,R5,R6)
150
151#define decrypt_final(TAB,OFFSET) \
152 round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
153
154/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
155
156 entry(aes_enc_blk,0,enc128,enc192)
157 encrypt_round(aes_ft_tab,-96)
158 encrypt_round(aes_ft_tab,-80)
159enc192: encrypt_round(aes_ft_tab,-64)
160 encrypt_round(aes_ft_tab,-48)
161enc128: encrypt_round(aes_ft_tab,-32)
162 encrypt_round(aes_ft_tab,-16)
163 encrypt_round(aes_ft_tab, 0)
164 encrypt_round(aes_ft_tab, 16)
165 encrypt_round(aes_ft_tab, 32)
166 encrypt_round(aes_ft_tab, 48)
167 encrypt_round(aes_ft_tab, 64)
168 encrypt_round(aes_ft_tab, 80)
169 encrypt_round(aes_ft_tab, 96)
170 encrypt_final(aes_fl_tab,112)
171 return
172
173/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
174
175 entry(aes_dec_blk,240,dec128,dec192)
176 decrypt_round(aes_it_tab,-96)
177 decrypt_round(aes_it_tab,-80)
178dec192: decrypt_round(aes_it_tab,-64)
179 decrypt_round(aes_it_tab,-48)
180dec128: decrypt_round(aes_it_tab,-32)
181 decrypt_round(aes_it_tab,-16)
182 decrypt_round(aes_it_tab, 0)
183 decrypt_round(aes_it_tab, 16)
184 decrypt_round(aes_it_tab, 32)
185 decrypt_round(aes_it_tab, 48)
186 decrypt_round(aes_it_tab, 64)
187 decrypt_round(aes_it_tab, 80)
188 decrypt_round(aes_it_tab, 96)
189 decrypt_final(aes_il_tab,112)
190 return
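Note on the round() macro above: each output word is built from four lookups in the rotated copies of one table (at byte offsets 0, 1024, 2048 and 3072) xored with a round-key word, which is the standard T-table formulation of an AES round. Roughly, for one forward-round column, with s0..s3 the appropriately rotated input columns and k the round-key word (a sketch of the data flow, not the exact register choreography):

static inline u32 aes_fwd_column(const u32 T[4][256],
				 u32 s0, u32 s1, u32 s2, u32 s3, u32 k)
{
	return T[0][(u8)s0] ^
	       T[1][(u8)(s1 >> 8)] ^
	       T[2][(u8)(s2 >> 16)] ^
	       T[3][(u8)(s3 >> 24)] ^ k;
}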
diff --git a/arch/x86/crypto/aes_32.c b/arch/x86/crypto/aes_32.c
new file mode 100644
index 000000000000..49aad9397f10
--- /dev/null
+++ b/arch/x86/crypto/aes_32.c
@@ -0,0 +1,515 @@
1/*
2 *
3 * Glue Code for optimized 586 assembler version of AES
4 *
5 * Copyright (c) 2002, Dr Brian Gladman <>, Worcester, UK.
6 * All rights reserved.
7 *
8 * LICENSE TERMS
9 *
10 * The free distribution and use of this software in both source and binary
11 * form is allowed (with or without changes) provided that:
12 *
13 * 1. distributions of this source code include the above copyright
14 * notice, this list of conditions and the following disclaimer;
15 *
16 * 2. distributions in binary form include the above copyright
17 * notice, this list of conditions and the following disclaimer
18 * in the documentation and/or other associated materials;
19 *
20 * 3. the copyright holder's name is not used to endorse products
21 * built using this software without specific written permission.
22 *
23 * ALTERNATIVELY, provided that this notice is retained in full, this product
24 * may be distributed under the terms of the GNU General Public License (GPL),
25 * in which case the provisions of the GPL apply INSTEAD OF those given above.
26 *
27 * DISCLAIMER
28 *
29 * This software is provided 'as is' with no explicit or implied warranties
30 * in respect of its properties, including, but not limited to, correctness
31 * and/or fitness for purpose.
32 *
33 * Copyright (c) 2003, Adam J. Richter <adam@yggdrasil.com> (conversion to
34 * 2.5 API).
35 * Copyright (c) 2003, 2004 Fruhwirth Clemens <clemens@endorphin.org>
36 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
37 *
38 */
39
40#include <asm/byteorder.h>
41#include <linux/kernel.h>
42#include <linux/module.h>
43#include <linux/init.h>
44#include <linux/types.h>
45#include <linux/crypto.h>
46#include <linux/linkage.h>
47
48asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
49asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
50
51#define AES_MIN_KEY_SIZE 16
52#define AES_MAX_KEY_SIZE 32
53#define AES_BLOCK_SIZE 16
54#define AES_KS_LENGTH (4 * AES_BLOCK_SIZE)
55#define RC_LENGTH 29
56
57struct aes_ctx {
58 u32 ekey[AES_KS_LENGTH];
59 u32 rounds;
60 u32 dkey[AES_KS_LENGTH];
61};
62
63#define WPOLY 0x011b
64#define bytes2word(b0, b1, b2, b3) \
65 (((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0))
66
67/* define the finite field multiplies required for Rijndael */
68#define f2(x) ((x) ? pow[log[x] + 0x19] : 0)
69#define f3(x) ((x) ? pow[log[x] + 0x01] : 0)
70#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0)
71#define fb(x) ((x) ? pow[log[x] + 0x68] : 0)
72#define fd(x) ((x) ? pow[log[x] + 0xee] : 0)
73#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0)
74#define fi(x) ((x) ? pow[255 - log[x]]: 0)
75
76static inline u32 upr(u32 x, int n)
77{
78 return (x << 8 * n) | (x >> (32 - 8 * n));
79}
80
81static inline u8 bval(u32 x, int n)
82{
83 return x >> 8 * n;
84}
85
86/* The forward and inverse affine transformations used in the S-box */
87#define fwd_affine(x) \
88 (w = (u32)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(u8)(w^(w>>8)))
89
90#define inv_affine(x) \
91 (w = (u32)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(u8)(w^(w>>8)))
92
93static u32 rcon_tab[RC_LENGTH];
94
95u32 ft_tab[4][256];
96u32 fl_tab[4][256];
97static u32 im_tab[4][256];
98u32 il_tab[4][256];
99u32 it_tab[4][256];
100
101static void gen_tabs(void)
102{
103 u32 i, w;
104 u8 pow[512], log[256];
105
106 /*
107 * log and power tables for GF(2^8) finite field with
108 * WPOLY as modular polynomial - the simplest primitive
109 * root is 0x03, used here to generate the tables.
110 */
111 i = 0; w = 1;
112
113 do {
114 pow[i] = (u8)w;
115 pow[i + 255] = (u8)w;
116 log[w] = (u8)i++;
117 w ^= (w << 1) ^ (w & 0x80 ? WPOLY : 0);
118 } while (w != 1);
119
120 for(i = 0, w = 1; i < RC_LENGTH; ++i) {
121 rcon_tab[i] = bytes2word(w, 0, 0, 0);
122 w = f2(w);
123 }
124
125 for(i = 0; i < 256; ++i) {
126 u8 b;
127
128 b = fwd_affine(fi((u8)i));
129 w = bytes2word(f2(b), b, b, f3(b));
130
131 /* tables for a normal encryption round */
132 ft_tab[0][i] = w;
133 ft_tab[1][i] = upr(w, 1);
134 ft_tab[2][i] = upr(w, 2);
135 ft_tab[3][i] = upr(w, 3);
136 w = bytes2word(b, 0, 0, 0);
137
138 /*
139 * tables for last encryption round
140 * (may also be used in the key schedule)
141 */
142 fl_tab[0][i] = w;
143 fl_tab[1][i] = upr(w, 1);
144 fl_tab[2][i] = upr(w, 2);
145 fl_tab[3][i] = upr(w, 3);
146
147 b = fi(inv_affine((u8)i));
148 w = bytes2word(fe(b), f9(b), fd(b), fb(b));
149
150 /* tables for the inverse mix column operation */
151 im_tab[0][b] = w;
152 im_tab[1][b] = upr(w, 1);
153 im_tab[2][b] = upr(w, 2);
154 im_tab[3][b] = upr(w, 3);
155
156 /* tables for a normal decryption round */
157 it_tab[0][i] = w;
158 it_tab[1][i] = upr(w,1);
159 it_tab[2][i] = upr(w,2);
160 it_tab[3][i] = upr(w,3);
161
162 w = bytes2word(b, 0, 0, 0);
163
164 /* tables for last decryption round */
165 il_tab[0][i] = w;
166 il_tab[1][i] = upr(w,1);
167 il_tab[2][i] = upr(w,2);
168 il_tab[3][i] = upr(w,3);
169 }
170}
171
172#define four_tables(x,tab,vf,rf,c) \
173( tab[0][bval(vf(x,0,c),rf(0,c))] ^ \
174 tab[1][bval(vf(x,1,c),rf(1,c))] ^ \
175 tab[2][bval(vf(x,2,c),rf(2,c))] ^ \
176 tab[3][bval(vf(x,3,c),rf(3,c))] \
177)
178
179#define vf1(x,r,c) (x)
180#define rf1(r,c) (r)
181#define rf2(r,c) ((r-c)&3)
182
183#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0)
184#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c)
185
186#define ff(x) inv_mcol(x)
187
188#define ke4(k,i) \
189{ \
190 k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \
191 k[4*(i)+5] = ss[1] ^= ss[0]; \
192 k[4*(i)+6] = ss[2] ^= ss[1]; \
193 k[4*(i)+7] = ss[3] ^= ss[2]; \
194}
195
196#define kel4(k,i) \
197{ \
198 k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \
199 k[4*(i)+5] = ss[1] ^= ss[0]; \
200 k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \
201}
202
203#define ke6(k,i) \
204{ \
205 k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
206 k[6*(i)+ 7] = ss[1] ^= ss[0]; \
207 k[6*(i)+ 8] = ss[2] ^= ss[1]; \
208 k[6*(i)+ 9] = ss[3] ^= ss[2]; \
209 k[6*(i)+10] = ss[4] ^= ss[3]; \
210 k[6*(i)+11] = ss[5] ^= ss[4]; \
211}
212
213#define kel6(k,i) \
214{ \
215 k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
216 k[6*(i)+ 7] = ss[1] ^= ss[0]; \
217 k[6*(i)+ 8] = ss[2] ^= ss[1]; \
218 k[6*(i)+ 9] = ss[3] ^= ss[2]; \
219}
220
221#define ke8(k,i) \
222{ \
223 k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
224 k[8*(i)+ 9] = ss[1] ^= ss[0]; \
225 k[8*(i)+10] = ss[2] ^= ss[1]; \
226 k[8*(i)+11] = ss[3] ^= ss[2]; \
227 k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); \
228 k[8*(i)+13] = ss[5] ^= ss[4]; \
229 k[8*(i)+14] = ss[6] ^= ss[5]; \
230 k[8*(i)+15] = ss[7] ^= ss[6]; \
231}
232
233#define kel8(k,i) \
234{ \
235 k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
236 k[8*(i)+ 9] = ss[1] ^= ss[0]; \
237 k[8*(i)+10] = ss[2] ^= ss[1]; \
238 k[8*(i)+11] = ss[3] ^= ss[2]; \
239}
240
241#define kdf4(k,i) \
242{ \
243 ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \
244 ss[1] = ss[1] ^ ss[3]; \
245 ss[2] = ss[2] ^ ss[3]; \
246 ss[3] = ss[3]; \
247 ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \
248 ss[i % 4] ^= ss[4]; \
249 ss[4] ^= k[4*(i)]; \
250 k[4*(i)+4] = ff(ss[4]); \
251 ss[4] ^= k[4*(i)+1]; \
252 k[4*(i)+5] = ff(ss[4]); \
253 ss[4] ^= k[4*(i)+2]; \
254 k[4*(i)+6] = ff(ss[4]); \
255 ss[4] ^= k[4*(i)+3]; \
256 k[4*(i)+7] = ff(ss[4]); \
257}
258
259#define kd4(k,i) \
260{ \
261 ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \
262 ss[i % 4] ^= ss[4]; \
263 ss[4] = ff(ss[4]); \
264 k[4*(i)+4] = ss[4] ^= k[4*(i)]; \
265 k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \
266 k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; \
267 k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \
268}
269
270#define kdl4(k,i) \
271{ \
272 ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \
273 ss[i % 4] ^= ss[4]; \
274 k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \
275 k[4*(i)+5] = ss[1] ^ ss[3]; \
276 k[4*(i)+6] = ss[0]; \
277 k[4*(i)+7] = ss[1]; \
278}
279
280#define kdf6(k,i) \
281{ \
282 ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
283 k[6*(i)+ 6] = ff(ss[0]); \
284 ss[1] ^= ss[0]; \
285 k[6*(i)+ 7] = ff(ss[1]); \
286 ss[2] ^= ss[1]; \
287 k[6*(i)+ 8] = ff(ss[2]); \
288 ss[3] ^= ss[2]; \
289 k[6*(i)+ 9] = ff(ss[3]); \
290 ss[4] ^= ss[3]; \
291 k[6*(i)+10] = ff(ss[4]); \
292 ss[5] ^= ss[4]; \
293 k[6*(i)+11] = ff(ss[5]); \
294}
295
296#define kd6(k,i) \
297{ \
298 ss[6] = ls_box(ss[5],3) ^ rcon_tab[i]; \
299 ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \
300 k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \
301 ss[1] ^= ss[0]; \
302 k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \
303 ss[2] ^= ss[1]; \
304 k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \
305 ss[3] ^= ss[2]; \
306 k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \
307 ss[4] ^= ss[3]; \
308 k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \
309 ss[5] ^= ss[4]; \
310 k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \
311}
312
313#define kdl6(k,i) \
314{ \
315 ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
316 k[6*(i)+ 6] = ss[0]; \
317 ss[1] ^= ss[0]; \
318 k[6*(i)+ 7] = ss[1]; \
319 ss[2] ^= ss[1]; \
320 k[6*(i)+ 8] = ss[2]; \
321 ss[3] ^= ss[2]; \
322 k[6*(i)+ 9] = ss[3]; \
323}
324
325#define kdf8(k,i) \
326{ \
327 ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
328 k[8*(i)+ 8] = ff(ss[0]); \
329 ss[1] ^= ss[0]; \
330 k[8*(i)+ 9] = ff(ss[1]); \
331 ss[2] ^= ss[1]; \
332 k[8*(i)+10] = ff(ss[2]); \
333 ss[3] ^= ss[2]; \
334 k[8*(i)+11] = ff(ss[3]); \
335 ss[4] ^= ls_box(ss[3],0); \
336 k[8*(i)+12] = ff(ss[4]); \
337 ss[5] ^= ss[4]; \
338 k[8*(i)+13] = ff(ss[5]); \
339 ss[6] ^= ss[5]; \
340 k[8*(i)+14] = ff(ss[6]); \
341 ss[7] ^= ss[6]; \
342 k[8*(i)+15] = ff(ss[7]); \
343}
344
345#define kd8(k,i) \
346{ \
347 u32 __g = ls_box(ss[7],3) ^ rcon_tab[i]; \
348 ss[0] ^= __g; \
349 __g = ff(__g); \
350 k[8*(i)+ 8] = __g ^= k[8*(i)]; \
351 ss[1] ^= ss[0]; \
352 k[8*(i)+ 9] = __g ^= k[8*(i)+ 1]; \
353 ss[2] ^= ss[1]; \
354 k[8*(i)+10] = __g ^= k[8*(i)+ 2]; \
355 ss[3] ^= ss[2]; \
356 k[8*(i)+11] = __g ^= k[8*(i)+ 3]; \
357 __g = ls_box(ss[3],0); \
358 ss[4] ^= __g; \
359 __g = ff(__g); \
360 k[8*(i)+12] = __g ^= k[8*(i)+ 4]; \
361 ss[5] ^= ss[4]; \
362 k[8*(i)+13] = __g ^= k[8*(i)+ 5]; \
363 ss[6] ^= ss[5]; \
364 k[8*(i)+14] = __g ^= k[8*(i)+ 6]; \
365 ss[7] ^= ss[6]; \
366 k[8*(i)+15] = __g ^= k[8*(i)+ 7]; \
367}
368
369#define kdl8(k,i) \
370{ \
371 ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
372 k[8*(i)+ 8] = ss[0]; \
373 ss[1] ^= ss[0]; \
374 k[8*(i)+ 9] = ss[1]; \
375 ss[2] ^= ss[1]; \
376 k[8*(i)+10] = ss[2]; \
377 ss[3] ^= ss[2]; \
378 k[8*(i)+11] = ss[3]; \
379}
380
381static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
382 unsigned int key_len)
383{
384 int i;
385 u32 ss[8];
386 struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
387 const __le32 *key = (const __le32 *)in_key;
388 u32 *flags = &tfm->crt_flags;
389
390 /* encryption schedule */
391
392 ctx->ekey[0] = ss[0] = le32_to_cpu(key[0]);
393 ctx->ekey[1] = ss[1] = le32_to_cpu(key[1]);
394 ctx->ekey[2] = ss[2] = le32_to_cpu(key[2]);
395 ctx->ekey[3] = ss[3] = le32_to_cpu(key[3]);
396
397 switch(key_len) {
398 case 16:
399 for (i = 0; i < 9; i++)
400 ke4(ctx->ekey, i);
401 kel4(ctx->ekey, 9);
402 ctx->rounds = 10;
403 break;
404
405 case 24:
406 ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
407 ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
408 for (i = 0; i < 7; i++)
409 ke6(ctx->ekey, i);
410 kel6(ctx->ekey, 7);
411 ctx->rounds = 12;
412 break;
413
414 case 32:
415 ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
416 ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
417 ctx->ekey[6] = ss[6] = le32_to_cpu(key[6]);
418 ctx->ekey[7] = ss[7] = le32_to_cpu(key[7]);
419 for (i = 0; i < 6; i++)
420 ke8(ctx->ekey, i);
421 kel8(ctx->ekey, 6);
422 ctx->rounds = 14;
423 break;
424
425 default:
426 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
427 return -EINVAL;
428 }
429
430 /* decryption schedule */
431
432 ctx->dkey[0] = ss[0] = le32_to_cpu(key[0]);
433 ctx->dkey[1] = ss[1] = le32_to_cpu(key[1]);
434 ctx->dkey[2] = ss[2] = le32_to_cpu(key[2]);
435 ctx->dkey[3] = ss[3] = le32_to_cpu(key[3]);
436
437 switch (key_len) {
438 case 16:
439 kdf4(ctx->dkey, 0);
440 for (i = 1; i < 9; i++)
441 kd4(ctx->dkey, i);
442 kdl4(ctx->dkey, 9);
443 break;
444
445 case 24:
446 ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
447 ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
448 kdf6(ctx->dkey, 0);
449 for (i = 1; i < 7; i++)
450 kd6(ctx->dkey, i);
451 kdl6(ctx->dkey, 7);
452 break;
453
454 case 32:
455 ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
456 ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
457 ctx->dkey[6] = ff(ss[6] = le32_to_cpu(key[6]));
458 ctx->dkey[7] = ff(ss[7] = le32_to_cpu(key[7]));
459 kdf8(ctx->dkey, 0);
460 for (i = 1; i < 6; i++)
461 kd8(ctx->dkey, i);
462 kdl8(ctx->dkey, 6);
463 break;
464 }
465 return 0;
466}
467
468static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
469{
470 aes_enc_blk(tfm, dst, src);
471}
472
473static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
474{
475 aes_dec_blk(tfm, dst, src);
476}
477
478static struct crypto_alg aes_alg = {
479 .cra_name = "aes",
480 .cra_driver_name = "aes-i586",
481 .cra_priority = 200,
482 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
483 .cra_blocksize = AES_BLOCK_SIZE,
484 .cra_ctxsize = sizeof(struct aes_ctx),
485 .cra_module = THIS_MODULE,
486 .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
487 .cra_u = {
488 .cipher = {
489 .cia_min_keysize = AES_MIN_KEY_SIZE,
490 .cia_max_keysize = AES_MAX_KEY_SIZE,
491 .cia_setkey = aes_set_key,
492 .cia_encrypt = aes_encrypt,
493 .cia_decrypt = aes_decrypt
494 }
495 }
496};
497
498static int __init aes_init(void)
499{
500 gen_tabs();
501 return crypto_register_alg(&aes_alg);
502}
503
504static void __exit aes_fini(void)
505{
506 crypto_unregister_alg(&aes_alg);
507}
508
509module_init(aes_init);
510module_exit(aes_fini);
511
512MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, i586 asm optimized");
513MODULE_LICENSE("Dual BSD/GPL");
514MODULE_AUTHOR("Fruhwirth Clemens, James Morris, Brian Gladman, Adam Richter");
515MODULE_ALIAS("aes");
diff --git a/arch/x86/crypto/aes_64.c b/arch/x86/crypto/aes_64.c
new file mode 100644
index 000000000000..5cdb13ea5cc2
--- /dev/null
+++ b/arch/x86/crypto/aes_64.c
@@ -0,0 +1,336 @@
1/*
2 * Cryptographic API.
3 *
4 * AES Cipher Algorithm.
5 *
6 * Based on Brian Gladman's code.
7 *
8 * Linux developers:
9 * Alexander Kjeldaas <astor@fast.no>
10 * Herbert Valerio Riedel <hvr@hvrlab.org>
11 * Kyle McMartin <kyle@debian.org>
12 * Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API).
13 * Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler)
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * ---------------------------------------------------------------------------
21 * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK.
22 * All rights reserved.
23 *
24 * LICENSE TERMS
25 *
26 * The free distribution and use of this software in both source and binary
27 * form is allowed (with or without changes) provided that:
28 *
29 * 1. distributions of this source code include the above copyright
30 * notice, this list of conditions and the following disclaimer;
31 *
32 * 2. distributions in binary form include the above copyright
33 * notice, this list of conditions and the following disclaimer
34 * in the documentation and/or other associated materials;
35 *
36 * 3. the copyright holder's name is not used to endorse products
37 * built using this software without specific written permission.
38 *
39 * ALTERNATIVELY, provided that this notice is retained in full, this product
40 * may be distributed under the terms of the GNU General Public License (GPL),
41 * in which case the provisions of the GPL apply INSTEAD OF those given above.
42 *
43 * DISCLAIMER
44 *
45 * This software is provided 'as is' with no explicit or implied warranties
46 * in respect of its properties, including, but not limited to, correctness
47 * and/or fitness for purpose.
48 * ---------------------------------------------------------------------------
49 */
50
51/* Some changes from the Gladman version:
52 s/RIJNDAEL(e_key)/E_KEY/g
53 s/RIJNDAEL(d_key)/D_KEY/g
54*/
55
56#include <asm/byteorder.h>
57#include <linux/bitops.h>
58#include <linux/crypto.h>
59#include <linux/errno.h>
60#include <linux/init.h>
61#include <linux/module.h>
62#include <linux/types.h>
63
64#define AES_MIN_KEY_SIZE 16
65#define AES_MAX_KEY_SIZE 32
66
67#define AES_BLOCK_SIZE 16
68
69/*
70 * #define byte(x, nr) ((unsigned char)((x) >> (nr*8)))
71 */
72static inline u8 byte(const u32 x, const unsigned n)
73{
74 return x >> (n << 3);
75}
76
77struct aes_ctx
78{
79 u32 key_length;
80 u32 buf[120];
81};
82
83#define E_KEY (&ctx->buf[0])
84#define D_KEY (&ctx->buf[60])
85
86static u8 pow_tab[256] __initdata;
87static u8 log_tab[256] __initdata;
88static u8 sbx_tab[256] __initdata;
89static u8 isb_tab[256] __initdata;
90static u32 rco_tab[10];
91u32 aes_ft_tab[4][256];
92u32 aes_it_tab[4][256];
93
94u32 aes_fl_tab[4][256];
95u32 aes_il_tab[4][256];
96
97static inline u8 f_mult(u8 a, u8 b)
98{
99 u8 aa = log_tab[a], cc = aa + log_tab[b];
100
101 return pow_tab[cc + (cc < aa ? 1 : 0)];
102}
103
104#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0)
105
106#define ls_box(x) \
107 (aes_fl_tab[0][byte(x, 0)] ^ \
108 aes_fl_tab[1][byte(x, 1)] ^ \
109 aes_fl_tab[2][byte(x, 2)] ^ \
110 aes_fl_tab[3][byte(x, 3)])
111
112static void __init gen_tabs(void)
113{
114 u32 i, t;
115 u8 p, q;
116
117 /* log and power tables for GF(2**8) finite field with
118 0x011b as modular polynomial - the simplest primitive
119 root is 0x03, used here to generate the tables */
120
121 for (i = 0, p = 1; i < 256; ++i) {
122 pow_tab[i] = (u8)p;
123 log_tab[p] = (u8)i;
124
125 p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0);
126 }
127
128 log_tab[1] = 0;
129
130 for (i = 0, p = 1; i < 10; ++i) {
131 rco_tab[i] = p;
132
133 p = (p << 1) ^ (p & 0x80 ? 0x01b : 0);
134 }
135
136 for (i = 0; i < 256; ++i) {
137 p = (i ? pow_tab[255 - log_tab[i]] : 0);
138 q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2));
139 p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2));
140 sbx_tab[i] = p;
141 isb_tab[p] = (u8)i;
142 }
143
144 for (i = 0; i < 256; ++i) {
145 p = sbx_tab[i];
146
147 t = p;
148 aes_fl_tab[0][i] = t;
149 aes_fl_tab[1][i] = rol32(t, 8);
150 aes_fl_tab[2][i] = rol32(t, 16);
151 aes_fl_tab[3][i] = rol32(t, 24);
152
153 t = ((u32)ff_mult(2, p)) |
154 ((u32)p << 8) |
155 ((u32)p << 16) | ((u32)ff_mult(3, p) << 24);
156
157 aes_ft_tab[0][i] = t;
158 aes_ft_tab[1][i] = rol32(t, 8);
159 aes_ft_tab[2][i] = rol32(t, 16);
160 aes_ft_tab[3][i] = rol32(t, 24);
161
162 p = isb_tab[i];
163
164 t = p;
165 aes_il_tab[0][i] = t;
166 aes_il_tab[1][i] = rol32(t, 8);
167 aes_il_tab[2][i] = rol32(t, 16);
168 aes_il_tab[3][i] = rol32(t, 24);
169
170 t = ((u32)ff_mult(14, p)) |
171 ((u32)ff_mult(9, p) << 8) |
172 ((u32)ff_mult(13, p) << 16) |
173 ((u32)ff_mult(11, p) << 24);
174
175 aes_it_tab[0][i] = t;
176 aes_it_tab[1][i] = rol32(t, 8);
177 aes_it_tab[2][i] = rol32(t, 16);
178 aes_it_tab[3][i] = rol32(t, 24);
179 }
180}
181
182#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b)
183
184#define imix_col(y, x) \
185 u = star_x(x); \
186 v = star_x(u); \
187 w = star_x(v); \
188 t = w ^ (x); \
189 (y) = u ^ v ^ w; \
190 (y) ^= ror32(u ^ t, 8) ^ \
191 ror32(v ^ t, 16) ^ \
192 ror32(t, 24)
193
194/* initialise the key schedule from the user supplied key */
195
196#define loop4(i) \
197{ \
198 t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
199 t ^= E_KEY[4 * i]; E_KEY[4 * i + 4] = t; \
200 t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t; \
201 t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t; \
202 t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t; \
203}
204
205#define loop6(i) \
206{ \
207 t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
208 t ^= E_KEY[6 * i]; E_KEY[6 * i + 6] = t; \
209 t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t; \
210 t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t; \
211 t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t; \
212 t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t; \
213 t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t; \
214}
215
216#define loop8(i) \
217{ \
218	t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
219 t ^= E_KEY[8 * i]; E_KEY[8 * i + 8] = t; \
220 t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t; \
221 t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t; \
222 t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t; \
223 t = E_KEY[8 * i + 4] ^ ls_box(t); \
224 E_KEY[8 * i + 12] = t; \
225 t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t; \
226 t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t; \
227 t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \
228}
229
230static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
231 unsigned int key_len)
232{
233 struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
234 const __le32 *key = (const __le32 *)in_key;
235 u32 *flags = &tfm->crt_flags;
236 u32 i, j, t, u, v, w;
237
238 if (key_len % 8) {
239 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
240 return -EINVAL;
241 }
242
243 ctx->key_length = key_len;
244
245 D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]);
246 D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]);
247 D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]);
248 D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]);
249
250 switch (key_len) {
251 case 16:
252 t = E_KEY[3];
253 for (i = 0; i < 10; ++i)
254 loop4(i);
255 break;
256
257 case 24:
258 E_KEY[4] = le32_to_cpu(key[4]);
259 t = E_KEY[5] = le32_to_cpu(key[5]);
260 for (i = 0; i < 8; ++i)
261			loop6(i);
262 break;
263
264 case 32:
265 E_KEY[4] = le32_to_cpu(key[4]);
266 E_KEY[5] = le32_to_cpu(key[5]);
267 E_KEY[6] = le32_to_cpu(key[6]);
268 t = E_KEY[7] = le32_to_cpu(key[7]);
269 for (i = 0; i < 7; ++i)
270 loop8(i);
271 break;
272 }
273
274 D_KEY[0] = E_KEY[key_len + 24];
275 D_KEY[1] = E_KEY[key_len + 25];
276 D_KEY[2] = E_KEY[key_len + 26];
277 D_KEY[3] = E_KEY[key_len + 27];
278
279 for (i = 4; i < key_len + 24; ++i) {
280 j = key_len + 24 - (i & ~3) + (i & 3);
281 imix_col(D_KEY[j], E_KEY[i]);
282 }
283
284 return 0;
285}
286
287asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
288asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
289
290static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
291{
292 aes_enc_blk(tfm, dst, src);
293}
294
295static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
296{
297 aes_dec_blk(tfm, dst, src);
298}
299
300static struct crypto_alg aes_alg = {
301 .cra_name = "aes",
302 .cra_driver_name = "aes-x86_64",
303 .cra_priority = 200,
304 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
305 .cra_blocksize = AES_BLOCK_SIZE,
306 .cra_ctxsize = sizeof(struct aes_ctx),
307 .cra_module = THIS_MODULE,
308 .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
309 .cra_u = {
310 .cipher = {
311 .cia_min_keysize = AES_MIN_KEY_SIZE,
312 .cia_max_keysize = AES_MAX_KEY_SIZE,
313 .cia_setkey = aes_set_key,
314 .cia_encrypt = aes_encrypt,
315 .cia_decrypt = aes_decrypt
316 }
317 }
318};
319
320static int __init aes_init(void)
321{
322 gen_tabs();
323 return crypto_register_alg(&aes_alg);
324}
325
326static void __exit aes_fini(void)
327{
328 crypto_unregister_alg(&aes_alg);
329}
330
331module_init(aes_init);
332module_exit(aes_fini);
333
334MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
335MODULE_LICENSE("GPL");
336MODULE_ALIAS("aes");
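
gen_tabs() above builds the log and power tables for GF(2^8) with the AES polynomial 0x11b and generator 0x03, and f_mult()/ff_mult() multiply through those tables. The standalone userspace sketch below, not part of the patch, shows the same arithmetic and checks it against a plain shift-and-xor multiply; the test vector {57}x{83} = {c1} is the worked example from FIPS-197.

/*
 * Standalone sketch (not part of the patch) of the GF(2^8) arithmetic
 * implemented by gen_tabs()/f_mult() above: log/pow tables over the
 * field defined by x^8 + x^4 + x^3 + x + 1 (0x11b), generator 0x03.
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t pow_tab[256], log_tab[256];

static void build_tabs(void)
{
	unsigned int i, p = 1;

	for (i = 0; i < 256; i++) {
		pow_tab[i] = p;
		log_tab[p] = i;
		/* p *= 0x03 (i.e. p*x + p), reduced mod 0x11b */
		p = (p ^ (p << 1) ^ ((p & 0x80) ? 0x11b : 0)) & 0xff;
	}
	log_tab[1] = 0;		/* 0x03^255 == 1, keep log(1) == 0 */
}

static uint8_t gf_mul_tabs(uint8_t a, uint8_t b)
{
	unsigned int s;

	if (!a || !b)
		return 0;
	s = log_tab[a] + log_tab[b];
	if (s >= 255)		/* the multiplicative group has order 255 */
		s -= 255;
	return pow_tab[s];
}

static uint8_t gf_mul_bitwise(uint8_t a, uint8_t b)
{
	uint8_t r = 0;

	while (b) {
		if (b & 1)
			r ^= a;
		/* a *= x, reduced mod x^8 + x^4 + x^3 + x + 1 */
		a = (a << 1) ^ ((a & 0x80) ? 0x1b : 0);
		b >>= 1;
	}
	return r;
}

int main(void)
{
	build_tabs();
	/* FIPS-197 worked example: {57} x {83} = {c1} */
	printf("%02x %02x\n", gf_mul_tabs(0x57, 0x83), gf_mul_bitwise(0x57, 0x83));
	return 0;
}
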
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
new file mode 100644
index 000000000000..39b98ed2c1b9
--- /dev/null
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -0,0 +1,335 @@
1/***************************************************************************
2* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
3* *
4* This program is free software; you can redistribute it and/or modify *
5* it under the terms of the GNU General Public License as published by *
6* the Free Software Foundation; either version 2 of the License, or *
7* (at your option) any later version. *
8* *
9* This program is distributed in the hope that it will be useful, *
10* but WITHOUT ANY WARRANTY; without even the implied warranty of *
11* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
12* GNU General Public License for more details. *
13* *
14* You should have received a copy of the GNU General Public License *
15* along with this program; if not, write to the *
16* Free Software Foundation, Inc., *
17* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
18***************************************************************************/
19
20.file "twofish-i586-asm.S"
21.text
22
23#include <asm/asm-offsets.h>
24
25/* return address at 0 */
26
27#define in_blk 12 /* input byte array address parameter*/
28#define out_blk 8 /* output byte array address parameter*/
29#define tfm 4 /* Twofish context structure */
30
31#define a_offset 0
32#define b_offset 4
33#define c_offset 8
34#define d_offset 12
35
36/* Structure of the crypto context struct*/
37
38#define s0 0 /* S0 Array 256 Words each */
39#define s1 1024 /* S1 Array */
40#define s2 2048 /* S2 Array */
41#define s3 3072 /* S3 Array */
42#define w 4096 /* 8 whitening keys (word) */
43#define k 4128 /* key 1-32 ( word ) */
44
45/* define a few register aliases to allow macro substitution */
46
47#define R0D %eax
48#define R0B %al
49#define R0H %ah
50
51#define R1D %ebx
52#define R1B %bl
53#define R1H %bh
54
55#define R2D %ecx
56#define R2B %cl
57#define R2H %ch
58
59#define R3D %edx
60#define R3B %dl
61#define R3H %dh
62
63
64/* performs input whitening */
65#define input_whitening(src,context,offset)\
66 xor w+offset(context), src;
67
68/* performs output whitening */
69#define output_whitening(src,context,offset)\
70 xor w+16+offset(context), src;
71
72/*
73 * a input register containing a (rotated 16)
74 * b input register containing b
75 * c input register containing c
76 * d input register containing d (already rol $1)
77 * operations on a and b are interleaved to increase performance
78 */
79#define encrypt_round(a,b,c,d,round)\
80 push d ## D;\
81 movzx b ## B, %edi;\
82 mov s1(%ebp,%edi,4),d ## D;\
83 movzx a ## B, %edi;\
84 mov s2(%ebp,%edi,4),%esi;\
85 movzx b ## H, %edi;\
86 ror $16, b ## D;\
87 xor s2(%ebp,%edi,4),d ## D;\
88 movzx a ## H, %edi;\
89 ror $16, a ## D;\
90 xor s3(%ebp,%edi,4),%esi;\
91 movzx b ## B, %edi;\
92 xor s3(%ebp,%edi,4),d ## D;\
93 movzx a ## B, %edi;\
94 xor (%ebp,%edi,4), %esi;\
95 movzx b ## H, %edi;\
96 ror $15, b ## D;\
97 xor (%ebp,%edi,4), d ## D;\
98 movzx a ## H, %edi;\
99 xor s1(%ebp,%edi,4),%esi;\
100 pop %edi;\
101 add d ## D, %esi;\
102 add %esi, d ## D;\
103 add k+round(%ebp), %esi;\
104 xor %esi, c ## D;\
105 rol $15, c ## D;\
106 add k+4+round(%ebp),d ## D;\
107 xor %edi, d ## D;
108
109/*
110 * a input register containing a (rotated 16)
111 * b input register containing b
112 * c input register containing c
113 * d input register containing d (already rol $1)
114 * operations on a and b are interleaved to increase performance
115 * last round has different rotations for the output preparation
116 */
117#define encrypt_last_round(a,b,c,d,round)\
118 push d ## D;\
119 movzx b ## B, %edi;\
120 mov s1(%ebp,%edi,4),d ## D;\
121 movzx a ## B, %edi;\
122 mov s2(%ebp,%edi,4),%esi;\
123 movzx b ## H, %edi;\
124 ror $16, b ## D;\
125 xor s2(%ebp,%edi,4),d ## D;\
126 movzx a ## H, %edi;\
127 ror $16, a ## D;\
128 xor s3(%ebp,%edi,4),%esi;\
129 movzx b ## B, %edi;\
130 xor s3(%ebp,%edi,4),d ## D;\
131 movzx a ## B, %edi;\
132 xor (%ebp,%edi,4), %esi;\
133 movzx b ## H, %edi;\
134 ror $16, b ## D;\
135 xor (%ebp,%edi,4), d ## D;\
136 movzx a ## H, %edi;\
137 xor s1(%ebp,%edi,4),%esi;\
138 pop %edi;\
139 add d ## D, %esi;\
140 add %esi, d ## D;\
141 add k+round(%ebp), %esi;\
142 xor %esi, c ## D;\
143 ror $1, c ## D;\
144 add k+4+round(%ebp),d ## D;\
145 xor %edi, d ## D;
146
147/*
148 * a input register containing a
149 * b input register containing b (rotated 16)
150 * c input register containing c
151 * d input register containing d (already rol $1)
152 * operations on a and b are interleaved to increase performance
153 */
154#define decrypt_round(a,b,c,d,round)\
155 push c ## D;\
156 movzx a ## B, %edi;\
157 mov (%ebp,%edi,4), c ## D;\
158 movzx b ## B, %edi;\
159 mov s3(%ebp,%edi,4),%esi;\
160 movzx a ## H, %edi;\
161 ror $16, a ## D;\
162 xor s1(%ebp,%edi,4),c ## D;\
163 movzx b ## H, %edi;\
164 ror $16, b ## D;\
165 xor (%ebp,%edi,4), %esi;\
166 movzx a ## B, %edi;\
167 xor s2(%ebp,%edi,4),c ## D;\
168 movzx b ## B, %edi;\
169 xor s1(%ebp,%edi,4),%esi;\
170 movzx a ## H, %edi;\
171 ror $15, a ## D;\
172 xor s3(%ebp,%edi,4),c ## D;\
173 movzx b ## H, %edi;\
174 xor s2(%ebp,%edi,4),%esi;\
175 pop %edi;\
176 add %esi, c ## D;\
177 add c ## D, %esi;\
178 add k+round(%ebp), c ## D;\
179 xor %edi, c ## D;\
180 add k+4+round(%ebp),%esi;\
181 xor %esi, d ## D;\
182 rol $15, d ## D;
183
184/*
185 * a input register containing a
186 * b input register containing b (rotated 16)
187 * c input register containing c
188 * d input register containing d (already rol $1)
189 * operations on a and b are interleaved to increase performance
190 * last round has different rotations for the output preparation
191 */
192#define decrypt_last_round(a,b,c,d,round)\
193 push c ## D;\
194 movzx a ## B, %edi;\
195 mov (%ebp,%edi,4), c ## D;\
196 movzx b ## B, %edi;\
197 mov s3(%ebp,%edi,4),%esi;\
198 movzx a ## H, %edi;\
199 ror $16, a ## D;\
200 xor s1(%ebp,%edi,4),c ## D;\
201 movzx b ## H, %edi;\
202 ror $16, b ## D;\
203 xor (%ebp,%edi,4), %esi;\
204 movzx a ## B, %edi;\
205 xor s2(%ebp,%edi,4),c ## D;\
206 movzx b ## B, %edi;\
207 xor s1(%ebp,%edi,4),%esi;\
208 movzx a ## H, %edi;\
209 ror $16, a ## D;\
210 xor s3(%ebp,%edi,4),c ## D;\
211 movzx b ## H, %edi;\
212 xor s2(%ebp,%edi,4),%esi;\
213 pop %edi;\
214 add %esi, c ## D;\
215 add c ## D, %esi;\
216 add k+round(%ebp), c ## D;\
217 xor %edi, c ## D;\
218 add k+4+round(%ebp),%esi;\
219 xor %esi, d ## D;\
220 ror $1, d ## D;
221
222.align 4
223.global twofish_enc_blk
224.global twofish_dec_blk
225
226twofish_enc_blk:
227 push %ebp /* save registers according to calling convention*/
228 push %ebx
229 push %esi
230 push %edi
231
232	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
233	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
234	mov	in_blk+16(%esp),%edi	/* input address in edi */
235
236 mov (%edi), %eax
237 mov b_offset(%edi), %ebx
238 mov c_offset(%edi), %ecx
239 mov d_offset(%edi), %edx
240 input_whitening(%eax,%ebp,a_offset)
241 ror $16, %eax
242 input_whitening(%ebx,%ebp,b_offset)
243 input_whitening(%ecx,%ebp,c_offset)
244 input_whitening(%edx,%ebp,d_offset)
245 rol $1, %edx
246
247 encrypt_round(R0,R1,R2,R3,0);
248 encrypt_round(R2,R3,R0,R1,8);
249 encrypt_round(R0,R1,R2,R3,2*8);
250 encrypt_round(R2,R3,R0,R1,3*8);
251 encrypt_round(R0,R1,R2,R3,4*8);
252 encrypt_round(R2,R3,R0,R1,5*8);
253 encrypt_round(R0,R1,R2,R3,6*8);
254 encrypt_round(R2,R3,R0,R1,7*8);
255 encrypt_round(R0,R1,R2,R3,8*8);
256 encrypt_round(R2,R3,R0,R1,9*8);
257 encrypt_round(R0,R1,R2,R3,10*8);
258 encrypt_round(R2,R3,R0,R1,11*8);
259 encrypt_round(R0,R1,R2,R3,12*8);
260 encrypt_round(R2,R3,R0,R1,13*8);
261 encrypt_round(R0,R1,R2,R3,14*8);
262 encrypt_last_round(R2,R3,R0,R1,15*8);
263
264 output_whitening(%eax,%ebp,c_offset)
265 output_whitening(%ebx,%ebp,d_offset)
266 output_whitening(%ecx,%ebp,a_offset)
267 output_whitening(%edx,%ebp,b_offset)
268 mov out_blk+16(%esp),%edi;
269 mov %eax, c_offset(%edi)
270 mov %ebx, d_offset(%edi)
271 mov %ecx, (%edi)
272 mov %edx, b_offset(%edi)
273
274 pop %edi
275 pop %esi
276 pop %ebx
277 pop %ebp
278 mov $1, %eax
279 ret
280
281twofish_dec_blk:
282 push %ebp /* save registers according to calling convention*/
283 push %ebx
284 push %esi
285 push %edi
286
287
288	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
289	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
290	mov	in_blk+16(%esp),%edi	/* input address in edi */
291
292 mov (%edi), %eax
293 mov b_offset(%edi), %ebx
294 mov c_offset(%edi), %ecx
295 mov d_offset(%edi), %edx
296 output_whitening(%eax,%ebp,a_offset)
297 output_whitening(%ebx,%ebp,b_offset)
298 ror $16, %ebx
299 output_whitening(%ecx,%ebp,c_offset)
300 output_whitening(%edx,%ebp,d_offset)
301 rol $1, %ecx
302
303 decrypt_round(R0,R1,R2,R3,15*8);
304 decrypt_round(R2,R3,R0,R1,14*8);
305 decrypt_round(R0,R1,R2,R3,13*8);
306 decrypt_round(R2,R3,R0,R1,12*8);
307 decrypt_round(R0,R1,R2,R3,11*8);
308 decrypt_round(R2,R3,R0,R1,10*8);
309 decrypt_round(R0,R1,R2,R3,9*8);
310 decrypt_round(R2,R3,R0,R1,8*8);
311 decrypt_round(R0,R1,R2,R3,7*8);
312 decrypt_round(R2,R3,R0,R1,6*8);
313 decrypt_round(R0,R1,R2,R3,5*8);
314 decrypt_round(R2,R3,R0,R1,4*8);
315 decrypt_round(R0,R1,R2,R3,3*8);
316 decrypt_round(R2,R3,R0,R1,2*8);
317 decrypt_round(R0,R1,R2,R3,1*8);
318 decrypt_last_round(R2,R3,R0,R1,0);
319
320 input_whitening(%eax,%ebp,c_offset)
321 input_whitening(%ebx,%ebp,d_offset)
322 input_whitening(%ecx,%ebp,a_offset)
323 input_whitening(%edx,%ebp,b_offset)
324 mov out_blk+16(%esp),%edi;
325 mov %eax, c_offset(%edi)
326 mov %ebx, d_offset(%edi)
327 mov %ecx, (%edi)
328 mov %edx, b_offset(%edi)
329
330 pop %edi
331 pop %esi
332 pop %ebx
333 pop %ebp
334 mov $1, %eax
335 ret
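
The round macros above are a hand-scheduled form of the standard Twofish round: two g-function lookups through the key-dependent tables at s0..s3, a pseudo-Hadamard transform (the two additions), round-key addition from k[], and one-bit rotations of the other half of the block. The C sketch below, not part of the patch, states that round in its textbook form; the assembler computes the same values but keeps some registers pre-rotated between rounds (hence the ror $16 / rol $15 counts) and interleaves the two lookups. Field names follow struct twofish_ctx from <crypto/twofish.h>, which matches the s0/s1/s2/s3/w/k offsets above.

/*
 * Textbook form (not part of the patch) of what one encrypt_round
 * invocation computes.  ctx->s[][] are the key-dependent S-box tables
 * at offsets s0..s3 above, ctx->k[] is the round-key array at offset k.
 */
#include <crypto/twofish.h>
#include <linux/bitops.h>	/* rol32()/ror32() */

static inline u32 tf_g(const struct twofish_ctx *ctx, u32 x)
{
	return ctx->s[0][x & 0xff] ^
	       ctx->s[1][(x >> 8) & 0xff] ^
	       ctx->s[2][(x >> 16) & 0xff] ^
	       ctx->s[3][x >> 24];
}

/* round r (0..15): a and b feed the F function, c and d are mixed */
static inline void tf_encrypt_round(const struct twofish_ctx *ctx,
				    unsigned int r, u32 a, u32 b,
				    u32 *c, u32 *d)
{
	u32 t0 = tf_g(ctx, a);
	u32 t1 = tf_g(ctx, rol32(b, 8));

	*c = ror32(*c ^ (t0 + t1 + ctx->k[2 * r]), 1);		/* k+round   */
	*d = rol32(*d, 1) ^ (t0 + 2 * t1 + ctx->k[2 * r + 1]);	/* k+4+round */
}
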
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
new file mode 100644
index 000000000000..35974a586615
--- /dev/null
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -0,0 +1,324 @@
1/***************************************************************************
2* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
3* *
4* This program is free software; you can redistribute it and/or modify *
5* it under the terms of the GNU General Public License as published by *
6* the Free Software Foundation; either version 2 of the License, or *
7* (at your option) any later version. *
8* *
9* This program is distributed in the hope that it will be useful, *
10* but WITHOUT ANY WARRANTY; without even the implied warranty of *
11* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
12* GNU General Public License for more details. *
13* *
14* You should have received a copy of the GNU General Public License *
15* along with this program; if not, write to the *
16* Free Software Foundation, Inc., *
17* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
18***************************************************************************/
19
20.file "twofish-x86_64-asm.S"
21.text
22
23#include <asm/asm-offsets.h>
24
25#define a_offset 0
26#define b_offset 4
27#define c_offset 8
28#define d_offset 12
29
30/* Structure of the crypto context struct*/
31
32#define s0 0 /* S0 Array 256 Words each */
33#define s1 1024 /* S1 Array */
34#define s2 2048 /* S2 Array */
35#define s3 3072 /* S3 Array */
36#define w 4096 /* 8 whitening keys (word) */
37#define k 4128 /* key 1-32 ( word ) */
38
39/* define a few register aliases to allow macro substitution */
40
41#define R0 %rax
42#define R0D %eax
43#define R0B %al
44#define R0H %ah
45
46#define R1 %rbx
47#define R1D %ebx
48#define R1B %bl
49#define R1H %bh
50
51#define R2 %rcx
52#define R2D %ecx
53#define R2B %cl
54#define R2H %ch
55
56#define R3 %rdx
57#define R3D %edx
58#define R3B %dl
59#define R3H %dh
60
61
62/* performs input whitening */
63#define input_whitening(src,context,offset)\
64 xor w+offset(context), src;
65
66/* performs output whitening */
67#define output_whitening(src,context,offset)\
68 xor w+16+offset(context), src;
69
70
71/*
72 * a input register containing a (rotated 16)
73 * b input register containing b
74 * c input register containing c
75 * d input register containing d (already rol $1)
76 * operations on a and b are interleaved to increase performance
77 */
78#define encrypt_round(a,b,c,d,round)\
79 movzx b ## B, %edi;\
80 mov s1(%r11,%rdi,4),%r8d;\
81 movzx a ## B, %edi;\
82 mov s2(%r11,%rdi,4),%r9d;\
83 movzx b ## H, %edi;\
84 ror $16, b ## D;\
85 xor s2(%r11,%rdi,4),%r8d;\
86 movzx a ## H, %edi;\
87 ror $16, a ## D;\
88 xor s3(%r11,%rdi,4),%r9d;\
89 movzx b ## B, %edi;\
90 xor s3(%r11,%rdi,4),%r8d;\
91 movzx a ## B, %edi;\
92 xor (%r11,%rdi,4), %r9d;\
93 movzx b ## H, %edi;\
94 ror $15, b ## D;\
95 xor (%r11,%rdi,4), %r8d;\
96 movzx a ## H, %edi;\
97 xor s1(%r11,%rdi,4),%r9d;\
98 add %r8d, %r9d;\
99 add %r9d, %r8d;\
100 add k+round(%r11), %r9d;\
101 xor %r9d, c ## D;\
102 rol $15, c ## D;\
103 add k+4+round(%r11),%r8d;\
104 xor %r8d, d ## D;
105
106/*
107 * a input register containing a(rotated 16)
108 * b input register containing b
109 * c input register containing c
110 * d input register containing d (already rol $1)
111 * operations on a and b are interleaved to increase performance
112 * during the round a and b are prepared for the output whitening
113 */
114#define encrypt_last_round(a,b,c,d,round)\
115 mov b ## D, %r10d;\
116 shl $32, %r10;\
117 movzx b ## B, %edi;\
118 mov s1(%r11,%rdi,4),%r8d;\
119 movzx a ## B, %edi;\
120 mov s2(%r11,%rdi,4),%r9d;\
121 movzx b ## H, %edi;\
122 ror $16, b ## D;\
123 xor s2(%r11,%rdi,4),%r8d;\
124 movzx a ## H, %edi;\
125 ror $16, a ## D;\
126 xor s3(%r11,%rdi,4),%r9d;\
127 movzx b ## B, %edi;\
128 xor s3(%r11,%rdi,4),%r8d;\
129 movzx a ## B, %edi;\
130 xor (%r11,%rdi,4), %r9d;\
131 xor a, %r10;\
132 movzx b ## H, %edi;\
133 xor (%r11,%rdi,4), %r8d;\
134 movzx a ## H, %edi;\
135 xor s1(%r11,%rdi,4),%r9d;\
136 add %r8d, %r9d;\
137 add %r9d, %r8d;\
138 add k+round(%r11), %r9d;\
139 xor %r9d, c ## D;\
140 ror $1, c ## D;\
141 add k+4+round(%r11),%r8d;\
142 xor %r8d, d ## D
143
144/*
145 * a input register containing a
146 * b input register containing b (rotated 16)
147 * c input register containing c (already rol $1)
148 * d input register containing d
149 * operations on a and b are interleaved to increase performance
150 */
151#define decrypt_round(a,b,c,d,round)\
152 movzx a ## B, %edi;\
153 mov (%r11,%rdi,4), %r9d;\
154 movzx b ## B, %edi;\
155 mov s3(%r11,%rdi,4),%r8d;\
156 movzx a ## H, %edi;\
157 ror $16, a ## D;\
158 xor s1(%r11,%rdi,4),%r9d;\
159 movzx b ## H, %edi;\
160 ror $16, b ## D;\
161 xor (%r11,%rdi,4), %r8d;\
162 movzx a ## B, %edi;\
163 xor s2(%r11,%rdi,4),%r9d;\
164 movzx b ## B, %edi;\
165 xor s1(%r11,%rdi,4),%r8d;\
166 movzx a ## H, %edi;\
167 ror $15, a ## D;\
168 xor s3(%r11,%rdi,4),%r9d;\
169 movzx b ## H, %edi;\
170 xor s2(%r11,%rdi,4),%r8d;\
171 add %r8d, %r9d;\
172 add %r9d, %r8d;\
173 add k+round(%r11), %r9d;\
174 xor %r9d, c ## D;\
175 add k+4+round(%r11),%r8d;\
176 xor %r8d, d ## D;\
177 rol $15, d ## D;
178
179/*
180 * a input register containing a
181 * b input register containing b
182 * c input register containing c (already rol $1)
183 * d input register containing d
184 * operations on a and b are interleaved to increase performance
185 * during the round a and b are prepared for the output whitening
186 */
187#define decrypt_last_round(a,b,c,d,round)\
188 movzx a ## B, %edi;\
189 mov (%r11,%rdi,4), %r9d;\
190 movzx b ## B, %edi;\
191 mov s3(%r11,%rdi,4),%r8d;\
192 movzx b ## H, %edi;\
193 ror $16, b ## D;\
194 xor (%r11,%rdi,4), %r8d;\
195 movzx a ## H, %edi;\
196 mov b ## D, %r10d;\
197 shl $32, %r10;\
198 xor a, %r10;\
199 ror $16, a ## D;\
200 xor s1(%r11,%rdi,4),%r9d;\
201 movzx b ## B, %edi;\
202 xor s1(%r11,%rdi,4),%r8d;\
203 movzx a ## B, %edi;\
204 xor s2(%r11,%rdi,4),%r9d;\
205 movzx b ## H, %edi;\
206 xor s2(%r11,%rdi,4),%r8d;\
207 movzx a ## H, %edi;\
208 xor s3(%r11,%rdi,4),%r9d;\
209 add %r8d, %r9d;\
210 add %r9d, %r8d;\
211 add k+round(%r11), %r9d;\
212 xor %r9d, c ## D;\
213 add k+4+round(%r11),%r8d;\
214 xor %r8d, d ## D;\
215 ror $1, d ## D;
216
217.align 8
218.global twofish_enc_blk
219.global twofish_dec_blk
220
221twofish_enc_blk:
222 pushq R1
223
224	/* %rdi contains the crypto tfm address */
225	/* %rsi contains the output address */
226	/* %rdx contains the input address */
227	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx address */
228	/* ctx address is moved to free one non-rex register
229 as target for the 8bit high operations */
230 mov %rdi, %r11
231
232 movq (R3), R1
233 movq 8(R3), R3
234 input_whitening(R1,%r11,a_offset)
235 input_whitening(R3,%r11,c_offset)
236 mov R1D, R0D
237 rol $16, R0D
238 shr $32, R1
239 mov R3D, R2D
240 shr $32, R3
241 rol $1, R3D
242
243 encrypt_round(R0,R1,R2,R3,0);
244 encrypt_round(R2,R3,R0,R1,8);
245 encrypt_round(R0,R1,R2,R3,2*8);
246 encrypt_round(R2,R3,R0,R1,3*8);
247 encrypt_round(R0,R1,R2,R3,4*8);
248 encrypt_round(R2,R3,R0,R1,5*8);
249 encrypt_round(R0,R1,R2,R3,6*8);
250 encrypt_round(R2,R3,R0,R1,7*8);
251 encrypt_round(R0,R1,R2,R3,8*8);
252 encrypt_round(R2,R3,R0,R1,9*8);
253 encrypt_round(R0,R1,R2,R3,10*8);
254 encrypt_round(R2,R3,R0,R1,11*8);
255 encrypt_round(R0,R1,R2,R3,12*8);
256 encrypt_round(R2,R3,R0,R1,13*8);
257 encrypt_round(R0,R1,R2,R3,14*8);
258 encrypt_last_round(R2,R3,R0,R1,15*8);
259
260
261 output_whitening(%r10,%r11,a_offset)
262 movq %r10, (%rsi)
263
264 shl $32, R1
265 xor R0, R1
266
267 output_whitening(R1,%r11,c_offset)
268 movq R1, 8(%rsi)
269
270 popq R1
271 movq $1,%rax
272 ret
273
274twofish_dec_blk:
275 pushq R1
276
277	/* %rdi contains the crypto tfm address */
278	/* %rsi contains the output address */
279	/* %rdx contains the input address */
280	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx address */
281	/* ctx address is moved to free one non-rex register
282 as target for the 8bit high operations */
283 mov %rdi, %r11
284
285 movq (R3), R1
286 movq 8(R3), R3
287 output_whitening(R1,%r11,a_offset)
288 output_whitening(R3,%r11,c_offset)
289 mov R1D, R0D
290 shr $32, R1
291 rol $16, R1D
292 mov R3D, R2D
293 shr $32, R3
294 rol $1, R2D
295
296 decrypt_round(R0,R1,R2,R3,15*8);
297 decrypt_round(R2,R3,R0,R1,14*8);
298 decrypt_round(R0,R1,R2,R3,13*8);
299 decrypt_round(R2,R3,R0,R1,12*8);
300 decrypt_round(R0,R1,R2,R3,11*8);
301 decrypt_round(R2,R3,R0,R1,10*8);
302 decrypt_round(R0,R1,R2,R3,9*8);
303 decrypt_round(R2,R3,R0,R1,8*8);
304 decrypt_round(R0,R1,R2,R3,7*8);
305 decrypt_round(R2,R3,R0,R1,6*8);
306 decrypt_round(R0,R1,R2,R3,5*8);
307 decrypt_round(R2,R3,R0,R1,4*8);
308 decrypt_round(R0,R1,R2,R3,3*8);
309 decrypt_round(R2,R3,R0,R1,2*8);
310 decrypt_round(R0,R1,R2,R3,1*8);
311 decrypt_last_round(R2,R3,R0,R1,0);
312
313 input_whitening(%r10,%r11,a_offset)
314 movq %r10, (%rsi)
315
316 shl $32, R1
317 xor R0, R1
318
319 input_whitening(R1,%r11,c_offset)
320 movq R1, 8(%rsi)
321
322 popq R1
323 movq $1,%rax
324 ret
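
Unlike the i586 version, the 64-bit entry points load the block as two quadwords, whiten them, and split each into two 32-bit round words; encrypt_last_round/decrypt_last_round then glue pairs of words back together through %r10 (shl $32 / xor) before the output whitening. A small standalone sketch of that split and merge, not part of the patch:

/*
 * Standalone sketch (not part of the patch) of the word packing the
 * 64-bit entry code above performs around the rounds.
 */
#include <stdint.h>

/* movq (in), reg; then split into two 32-bit round words (mov RxD / shr $32) */
static void split_half(uint64_t half, uint32_t *lo, uint32_t *hi)
{
	*lo = (uint32_t)half;
	*hi = (uint32_t)(half >> 32);
}

/* "mov b##D, %r10d; shl $32, %r10; xor a, %r10" in the last round */
static uint64_t merge_half(uint32_t lo, uint32_t hi)
{
	return (uint64_t)lo | ((uint64_t)hi << 32);
}
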
diff --git a/arch/x86/crypto/twofish_32.c b/arch/x86/crypto/twofish_32.c
new file mode 100644
index 000000000000..e3004dfe9c7a
--- /dev/null
+++ b/arch/x86/crypto/twofish_32.c
@@ -0,0 +1,97 @@
1/*
2 * Glue Code for optimized 586 assembler version of TWOFISH
3 *
4 * Originally Twofish for GPG
5 * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
6 * 256-bit key length added March 20, 1999
7 * Some modifications to reduce the text size by Werner Koch, April, 1998
8 * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
9 * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
10 *
11 * The original author has disclaimed all copyright interest in this
12 * code and thus put it in the public domain. The subsequent authors
13 * have put this under the GNU General Public License.
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
28 * USA
29 *
30 * This code is a "clean room" implementation, written from the paper
31 * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
32 * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
33 * through http://www.counterpane.com/twofish.html
34 *
35 * For background information on multiplication in finite fields, used for
36 * the matrix operations in the key schedule, see the book _Contemporary
37 * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
38 * Third Edition.
39 */
40
41#include <crypto/twofish.h>
42#include <linux/crypto.h>
43#include <linux/init.h>
44#include <linux/module.h>
45#include <linux/types.h>
46
47
48asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
49asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
50
51static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
52{
53 twofish_enc_blk(tfm, dst, src);
54}
55
56static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
57{
58 twofish_dec_blk(tfm, dst, src);
59}
60
61static struct crypto_alg alg = {
62 .cra_name = "twofish",
63 .cra_driver_name = "twofish-i586",
64 .cra_priority = 200,
65 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
66 .cra_blocksize = TF_BLOCK_SIZE,
67 .cra_ctxsize = sizeof(struct twofish_ctx),
68 .cra_alignmask = 3,
69 .cra_module = THIS_MODULE,
70 .cra_list = LIST_HEAD_INIT(alg.cra_list),
71 .cra_u = {
72 .cipher = {
73 .cia_min_keysize = TF_MIN_KEY_SIZE,
74 .cia_max_keysize = TF_MAX_KEY_SIZE,
75 .cia_setkey = twofish_setkey,
76 .cia_encrypt = twofish_encrypt,
77 .cia_decrypt = twofish_decrypt
78 }
79 }
80};
81
82static int __init init(void)
83{
84 return crypto_register_alg(&alg);
85}
86
87static void __exit fini(void)
88{
89 crypto_unregister_alg(&alg);
90}
91
92module_init(init);
93module_exit(fini);
94
95MODULE_LICENSE("GPL");
96MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized");
97MODULE_ALIAS("twofish");
diff --git a/arch/x86/crypto/twofish_64.c b/arch/x86/crypto/twofish_64.c
new file mode 100644
index 000000000000..182d91d5cfb9
--- /dev/null
+++ b/arch/x86/crypto/twofish_64.c
@@ -0,0 +1,97 @@
1/*
2 * Glue Code for optimized x86_64 assembler version of TWOFISH
3 *
4 * Originally Twofish for GPG
5 * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
6 * 256-bit key length added March 20, 1999
7 * Some modifications to reduce the text size by Werner Koch, April, 1998
8 * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
9 * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
10 *
11 * The original author has disclaimed all copyright interest in this
12 * code and thus put it in the public domain. The subsequent authors
13 * have put this under the GNU General Public License.
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
28 * USA
29 *
30 * This code is a "clean room" implementation, written from the paper
31 * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
32 * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
33 * through http://www.counterpane.com/twofish.html
34 *
35 * For background information on multiplication in finite fields, used for
36 * the matrix operations in the key schedule, see the book _Contemporary
37 * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
38 * Third Edition.
39 */
40
41#include <crypto/twofish.h>
42#include <linux/crypto.h>
43#include <linux/init.h>
44#include <linux/kernel.h>
45#include <linux/module.h>
46#include <linux/types.h>
47
48asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
49asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
50
51static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
52{
53 twofish_enc_blk(tfm, dst, src);
54}
55
56static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
57{
58 twofish_dec_blk(tfm, dst, src);
59}
60
61static struct crypto_alg alg = {
62 .cra_name = "twofish",
63 .cra_driver_name = "twofish-x86_64",
64 .cra_priority = 200,
65 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
66 .cra_blocksize = TF_BLOCK_SIZE,
67 .cra_ctxsize = sizeof(struct twofish_ctx),
68 .cra_alignmask = 3,
69 .cra_module = THIS_MODULE,
70 .cra_list = LIST_HEAD_INIT(alg.cra_list),
71 .cra_u = {
72 .cipher = {
73 .cia_min_keysize = TF_MIN_KEY_SIZE,
74 .cia_max_keysize = TF_MAX_KEY_SIZE,
75 .cia_setkey = twofish_setkey,
76 .cia_encrypt = twofish_encrypt,
77 .cia_decrypt = twofish_decrypt
78 }
79 }
80};
81
82static int __init init(void)
83{
84 return crypto_register_alg(&alg);
85}
86
87static void __exit fini(void)
88{
89 crypto_unregister_alg(&alg);
90}
91
92module_init(init);
93module_exit(fini);
94
95MODULE_LICENSE("GPL");
96MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
97MODULE_ALIAS("twofish");
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
new file mode 100644
index 000000000000..cdae36435e21
--- /dev/null
+++ b/arch/x86/ia32/Makefile
@@ -0,0 +1,35 @@
1#
2# Makefile for the ia32 kernel emulation subsystem.
3#
4
5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \
6 ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o \
7 mmap32.o
8
9sysv-$(CONFIG_SYSVIPC) := ipc32.o
10obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
11
12obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
13
14audit-class-$(CONFIG_AUDIT) := audit.o
15obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
16
17$(obj)/syscall32_syscall.o: \
18 $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so)
19
20# Teach kbuild about targets
21targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
22
23# The DSO images are built using a special linker script
24quiet_cmd_syscall = SYSCALL $@
25 cmd_syscall = $(CC) -m32 -nostdlib -shared -s \
26 $(call ld-option, -Wl$(comma)--hash-style=sysv) \
27 -Wl,-soname=linux-gate.so.1 -o $@ \
28 -Wl,-T,$(filter-out FORCE,$^)
29
30$(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
31$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
32 $(call if_changed,syscall)
33
34AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
35AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
new file mode 100644
index 000000000000..91b7b5922dfa
--- /dev/null
+++ b/arch/x86/ia32/audit.c
@@ -0,0 +1,42 @@
1#include <asm/unistd_32.h>
2
3unsigned ia32_dir_class[] = {
4#include <asm-generic/audit_dir_write.h>
5~0U
6};
7
8unsigned ia32_chattr_class[] = {
9#include <asm-generic/audit_change_attr.h>
10~0U
11};
12
13unsigned ia32_write_class[] = {
14#include <asm-generic/audit_write.h>
15~0U
16};
17
18unsigned ia32_read_class[] = {
19#include <asm-generic/audit_read.h>
20~0U
21};
22
23unsigned ia32_signal_class[] = {
24#include <asm-generic/audit_signal.h>
25~0U
26};
27
28int ia32_classify_syscall(unsigned syscall)
29{
30 switch(syscall) {
31 case __NR_open:
32 return 2;
33 case __NR_openat:
34 return 3;
35 case __NR_socketcall:
36 return 4;
37 case __NR_execve:
38 return 5;
39 default:
40 return 1;
41 }
42}
diff --git a/arch/x86/ia32/fpu32.c b/arch/x86/ia32/fpu32.c
new file mode 100644
index 000000000000..2c8209a3605a
--- /dev/null
+++ b/arch/x86/ia32/fpu32.c
@@ -0,0 +1,183 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.
4 * This is used for ptrace, signals and coredumps in 32bit emulation.
5 */
6
7#include <linux/sched.h>
8#include <asm/sigcontext32.h>
9#include <asm/processor.h>
10#include <asm/uaccess.h>
11#include <asm/i387.h>
12
13static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
14{
15 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
16
17 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
18 tmp = ~twd;
19 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
20 /* and move the valid bits to the lower byte. */
21 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
22 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
23 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
24 return tmp;
25}
26
27static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
28{
29 struct _fpxreg *st = NULL;
30 unsigned long tos = (fxsave->swd >> 11) & 7;
31 unsigned long twd = (unsigned long) fxsave->twd;
32 unsigned long tag;
33 unsigned long ret = 0xffff0000;
34 int i;
35
36#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16)
37
38 for (i = 0 ; i < 8 ; i++) {
39 if (twd & 0x1) {
40 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
41
42 switch (st->exponent & 0x7fff) {
43 case 0x7fff:
44 tag = 2; /* Special */
45 break;
46 case 0x0000:
47 if ( !st->significand[0] &&
48 !st->significand[1] &&
49 !st->significand[2] &&
50 !st->significand[3] ) {
51 tag = 1; /* Zero */
52 } else {
53 tag = 2; /* Special */
54 }
55 break;
56 default:
57 if (st->significand[3] & 0x8000) {
58 tag = 0; /* Valid */
59 } else {
60 tag = 2; /* Special */
61 }
62 break;
63 }
64 } else {
65 tag = 3; /* Empty */
66 }
67 ret |= (tag << (2 * i));
68 twd = twd >> 1;
69 }
70 return ret;
71}
72
73
74static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave,
75 struct _fpstate_ia32 __user *buf)
76{
77 struct _fpxreg *to;
78 struct _fpreg __user *from;
79 int i;
80 u32 v;
81 int err = 0;
82
83#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf)
84 G(0, fxsave->cwd);
85 G(1, fxsave->swd);
86 G(2, fxsave->twd);
87 fxsave->twd = twd_i387_to_fxsr(fxsave->twd);
88 G(3, fxsave->rip);
89 G(4, v);
90 fxsave->fop = v>>16; /* cs ignored */
91 G(5, fxsave->rdp);
92 /* 6: ds ignored */
93#undef G
94 if (err)
95 return -1;
96
97 to = (struct _fpxreg *)&fxsave->st_space[0];
98 from = &buf->_st[0];
99 for (i = 0 ; i < 8 ; i++, to++, from++) {
100 if (__copy_from_user(to, from, sizeof(*from)))
101 return -1;
102 }
103 return 0;
104}
105
106
107static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf,
108 struct i387_fxsave_struct *fxsave,
109 struct pt_regs *regs,
110 struct task_struct *tsk)
111{
112 struct _fpreg __user *to;
113 struct _fpxreg *from;
114 int i;
115 u16 cs,ds;
116 int err = 0;
117
118 if (tsk == current) {
119		/* should actually be ds/cs at fpu exception time,
120 but that information is not available in 64bit mode. */
121 asm("movw %%ds,%0 " : "=r" (ds));
122 asm("movw %%cs,%0 " : "=r" (cs));
123 } else { /* ptrace. task has stopped. */
124 ds = tsk->thread.ds;
125 cs = regs->cs;
126 }
127
128#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf)
129 P(0, (u32)fxsave->cwd | 0xffff0000);
130 P(1, (u32)fxsave->swd | 0xffff0000);
131 P(2, twd_fxsr_to_i387(fxsave));
132 P(3, (u32)fxsave->rip);
133 P(4, cs | ((u32)fxsave->fop) << 16);
134 P(5, fxsave->rdp);
135 P(6, 0xffff0000 | ds);
136#undef P
137
138 if (err)
139 return -1;
140
141 to = &buf->_st[0];
142 from = (struct _fpxreg *) &fxsave->st_space[0];
143 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
144 if (__copy_to_user(to, from, sizeof(*to)))
145 return -1;
146 }
147 return 0;
148}
149
150int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave)
151{
152 clear_fpu(tsk);
153 if (!fsave) {
154 if (__copy_from_user(&tsk->thread.i387.fxsave,
155 &buf->_fxsr_env[0],
156 sizeof(struct i387_fxsave_struct)))
157 return -1;
158 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
159 set_stopped_child_used_math(tsk);
160 }
161 return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
162}
163
164int save_i387_ia32(struct task_struct *tsk,
165 struct _fpstate_ia32 __user *buf,
166 struct pt_regs *regs,
167 int fsave)
168{
169 int err = 0;
170
171 init_fpu(tsk);
172 if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk))
173 return -1;
174 if (fsave)
175 return 0;
176 err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
177 if (fsave)
178 return err ? -1 : 1;
179 err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
180 err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
181 sizeof(struct i387_fxsave_struct));
182 return err ? -1 : 1;
183}
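
twd_i387_to_fxsr() above compresses the legacy i387 tag word, two bits per register (00 valid, 01 zero, 10 special, 11 empty), into the FXSR form of one bit per register (1 = not empty): inverting the word turns empty pairs into 00 and everything else into a non-zero pair, and the or/shift/mask cascade then folds each pair down to a single bit. A standalone worked example, not part of the patch:

/*
 * Standalone sketch (not part of the patch) of the tag-word folding in
 * twd_i387_to_fxsr() above, with one worked input: registers 0 (valid,
 * tag 00) and 4 (zero, tag 01) in use, all others empty (tag 11).
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t i387_to_fxsr_twd(uint16_t twd)
{
	unsigned int tmp = (uint16_t)~twd;	/* empty pairs become 00 */

	tmp = (tmp | (tmp >> 1)) & 0x5555;	/* each pair -> 0V        */
	tmp = (tmp | (tmp >> 1)) & 0x3333;	/* pack the pairs         */
	tmp = (tmp | (tmp >> 2)) & 0x0f0f;	/* pack the nibbles       */
	tmp = (tmp | (tmp >> 4)) & 0x00ff;	/* one bit per register   */
	return tmp;
}

int main(void)
{
	/* pairs (reg7..reg0): 11 11 11 01 11 11 11 00 -> 0xfdfc */
	printf("%#x\n", i387_to_fxsr_twd(0xfdfc));	/* prints 0x11: bits 0 and 4 set */
	return 0;
}
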
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
new file mode 100644
index 000000000000..08781370256d
--- /dev/null
+++ b/arch/x86/ia32/ia32_aout.c
@@ -0,0 +1,528 @@
1/*
2 * a.out loader for x86-64
3 *
4 * Copyright (C) 1991, 1992, 1996 Linus Torvalds
5 * Hacked together by Andi Kleen
6 */
7
8#include <linux/module.h>
9
10#include <linux/time.h>
11#include <linux/kernel.h>
12#include <linux/mm.h>
13#include <linux/mman.h>
14#include <linux/a.out.h>
15#include <linux/errno.h>
16#include <linux/signal.h>
17#include <linux/string.h>
18#include <linux/fs.h>
19#include <linux/file.h>
20#include <linux/stat.h>
21#include <linux/fcntl.h>
22#include <linux/ptrace.h>
23#include <linux/user.h>
24#include <linux/slab.h>
25#include <linux/binfmts.h>
26#include <linux/personality.h>
27#include <linux/init.h>
28
29#include <asm/system.h>
30#include <asm/uaccess.h>
31#include <asm/pgalloc.h>
32#include <asm/cacheflush.h>
33#include <asm/user32.h>
34#include <asm/ia32.h>
35
36#undef WARN_OLD
37#undef CORE_DUMP /* probably broken */
38
39static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
40static int load_aout_library(struct file*);
41
42#ifdef CORE_DUMP
43static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file);
44
45/*
46 * fill in the user structure for a core dump..
47 */
48static void dump_thread32(struct pt_regs * regs, struct user32 * dump)
49{
50 u32 fs,gs;
51
52/* changed the size calculations - should hopefully work better. lbt */
53 dump->magic = CMAGIC;
54 dump->start_code = 0;
55 dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1);
56 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
57 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
58 dump->u_dsize -= dump->u_tsize;
59 dump->u_ssize = 0;
60 dump->u_debugreg[0] = current->thread.debugreg0;
61 dump->u_debugreg[1] = current->thread.debugreg1;
62 dump->u_debugreg[2] = current->thread.debugreg2;
63 dump->u_debugreg[3] = current->thread.debugreg3;
64 dump->u_debugreg[4] = 0;
65 dump->u_debugreg[5] = 0;
66 dump->u_debugreg[6] = current->thread.debugreg6;
67 dump->u_debugreg[7] = current->thread.debugreg7;
68
69 if (dump->start_stack < 0xc0000000)
70 dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT;
71
72 dump->regs.ebx = regs->rbx;
73 dump->regs.ecx = regs->rcx;
74 dump->regs.edx = regs->rdx;
75 dump->regs.esi = regs->rsi;
76 dump->regs.edi = regs->rdi;
77 dump->regs.ebp = regs->rbp;
78 dump->regs.eax = regs->rax;
79 dump->regs.ds = current->thread.ds;
80 dump->regs.es = current->thread.es;
81 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
82 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
83 dump->regs.orig_eax = regs->orig_rax;
84 dump->regs.eip = regs->rip;
85 dump->regs.cs = regs->cs;
86 dump->regs.eflags = regs->eflags;
87 dump->regs.esp = regs->rsp;
88 dump->regs.ss = regs->ss;
89
90#if 1 /* FIXME */
91 dump->u_fpvalid = 0;
92#else
93 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
94#endif
95}
96
97#endif
98
99static struct linux_binfmt aout_format = {
100 .module = THIS_MODULE,
101 .load_binary = load_aout_binary,
102 .load_shlib = load_aout_library,
103#ifdef CORE_DUMP
104 .core_dump = aout_core_dump,
105#endif
106 .min_coredump = PAGE_SIZE
107};
108
109static void set_brk(unsigned long start, unsigned long end)
110{
111 start = PAGE_ALIGN(start);
112 end = PAGE_ALIGN(end);
113 if (end <= start)
114 return;
115 down_write(&current->mm->mmap_sem);
116 do_brk(start, end - start);
117 up_write(&current->mm->mmap_sem);
118}
119
120#ifdef CORE_DUMP
121/*
122 * These are the only things you should do on a core-file: use only these
123 * macros to write out all the necessary info.
124 */
125
126static int dump_write(struct file *file, const void *addr, int nr)
127{
128 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
129}
130
131#define DUMP_WRITE(addr, nr) \
132 if (!dump_write(file, (void *)(addr), (nr))) \
133 goto end_coredump;
134
135#define DUMP_SEEK(offset) \
136if (file->f_op->llseek) { \
137 if (file->f_op->llseek(file,(offset),0) != (offset)) \
138 goto end_coredump; \
139} else file->f_pos = (offset)
140
141/*
142 * Routine writes a core dump image in the current directory.
143 * Currently only a stub-function.
144 *
145 * Note that setuid/setgid files won't make a core-dump if the uid/gid
146 * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
147 * field, which also makes sure the core-dumps won't be recursive if the
148 * dumping of the process results in another error..
149 */
150
151static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file)
152{
153 mm_segment_t fs;
154 int has_dumped = 0;
155 unsigned long dump_start, dump_size;
156 struct user32 dump;
157# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
158# define START_STACK(u) (u.start_stack)
159
160 fs = get_fs();
161 set_fs(KERNEL_DS);
162 has_dumped = 1;
163 current->flags |= PF_DUMPCORE;
164 strncpy(dump.u_comm, current->comm, sizeof(current->comm));
165 dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump)));
166 dump.signal = signr;
167 dump_thread32(regs, &dump);
168
169/* If the size of the dump file exceeds the rlimit, then see what would happen
170 if we wrote the stack, but not the data area. */
171 if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE >
172 current->signal->rlim[RLIMIT_CORE].rlim_cur)
173 dump.u_dsize = 0;
174
175/* Make sure we have enough room to write the stack and data areas. */
176 if ((dump.u_ssize+1) * PAGE_SIZE >
177 current->signal->rlim[RLIMIT_CORE].rlim_cur)
178 dump.u_ssize = 0;
179
180/* make sure we actually have a data and stack area to dump */
181 set_fs(USER_DS);
182 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
183 dump.u_dsize = 0;
184 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
185 dump.u_ssize = 0;
186
187 set_fs(KERNEL_DS);
188/* struct user */
189 DUMP_WRITE(&dump,sizeof(dump));
190/* Now dump all of the user data. Include malloced stuff as well */
191 DUMP_SEEK(PAGE_SIZE);
192/* now we start writing out the user space info */
193 set_fs(USER_DS);
194/* Dump the data area */
195 if (dump.u_dsize != 0) {
196 dump_start = START_DATA(dump);
197 dump_size = dump.u_dsize << PAGE_SHIFT;
198 DUMP_WRITE(dump_start,dump_size);
199 }
200/* Now prepare to dump the stack area */
201 if (dump.u_ssize != 0) {
202 dump_start = START_STACK(dump);
203 dump_size = dump.u_ssize << PAGE_SHIFT;
204 DUMP_WRITE(dump_start,dump_size);
205 }
206/* Finally dump the task struct. Not used by gdb, but could be useful */
207 set_fs(KERNEL_DS);
208 DUMP_WRITE(current,sizeof(*current));
209end_coredump:
210 set_fs(fs);
211 return has_dumped;
212}
213#endif
214
215/*
216 * create_aout_tables() parses the env- and arg-strings in new user
217 * memory and creates the pointer tables from them, and puts their
218 * addresses on the "stack", returning the new stack pointer value.
219 */
220static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
221{
222 u32 __user *argv;
223 u32 __user *envp;
224 u32 __user *sp;
225 int argc = bprm->argc;
226 int envc = bprm->envc;
227
228 sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
229 sp -= envc+1;
230 envp = sp;
231 sp -= argc+1;
232 argv = sp;
233 put_user((unsigned long) envp,--sp);
234 put_user((unsigned long) argv,--sp);
235 put_user(argc,--sp);
236 current->mm->arg_start = (unsigned long) p;
237 while (argc-->0) {
238 char c;
239 put_user((u32)(unsigned long)p,argv++);
240 do {
241 get_user(c,p++);
242 } while (c);
243 }
244 put_user(0, argv);
245 current->mm->arg_end = current->mm->env_start = (unsigned long) p;
246 while (envc-->0) {
247 char c;
248 put_user((u32)(unsigned long)p,envp++);
249 do {
250 get_user(c,p++);
251 } while (c);
252 }
253 put_user(0, envp);
254 current->mm->env_end = (unsigned long) p;
255 return sp;
256}
257
258/*
259 * These are the functions used to load a.out style executables and shared
260 * libraries. There is no binary dependent code anywhere else.
261 */
262
263static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
264{
265 struct exec ex;
266 unsigned long error;
267 unsigned long fd_offset;
268 unsigned long rlim;
269 int retval;
270
271 ex = *((struct exec *) bprm->buf); /* exec-header */
272 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
273 N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
274 N_TRSIZE(ex) || N_DRSIZE(ex) ||
275 i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
276 return -ENOEXEC;
277 }
278
279 fd_offset = N_TXTOFF(ex);
280
281 /* Check initial limits. This avoids letting people circumvent
282 * size limits imposed on them by creating programs with large
283 * arrays in the data or bss.
284 */
285 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
286 if (rlim >= RLIM_INFINITY)
287 rlim = ~0;
288 if (ex.a_data + ex.a_bss > rlim)
289 return -ENOMEM;
290
291 /* Flush all traces of the currently running executable */
292 retval = flush_old_exec(bprm);
293 if (retval)
294 return retval;
295
296 regs->cs = __USER32_CS;
297 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
298 regs->r13 = regs->r14 = regs->r15 = 0;
299
300 /* OK, This is the point of no return */
301 set_personality(PER_LINUX);
302 set_thread_flag(TIF_IA32);
303 clear_thread_flag(TIF_ABI_PENDING);
304
305 current->mm->end_code = ex.a_text +
306 (current->mm->start_code = N_TXTADDR(ex));
307 current->mm->end_data = ex.a_data +
308 (current->mm->start_data = N_DATADDR(ex));
309 current->mm->brk = ex.a_bss +
310 (current->mm->start_brk = N_BSSADDR(ex));
311 current->mm->free_area_cache = TASK_UNMAPPED_BASE;
312 current->mm->cached_hole_size = 0;
313
314 current->mm->mmap = NULL;
315 compute_creds(bprm);
316 current->flags &= ~PF_FORKNOEXEC;
317
318 if (N_MAGIC(ex) == OMAGIC) {
319 unsigned long text_addr, map_size;
320 loff_t pos;
321
322 text_addr = N_TXTADDR(ex);
323
324 pos = 32;
325 map_size = ex.a_text+ex.a_data;
326
327 down_write(&current->mm->mmap_sem);
328 error = do_brk(text_addr & PAGE_MASK, map_size);
329 up_write(&current->mm->mmap_sem);
330
331 if (error != (text_addr & PAGE_MASK)) {
332 send_sig(SIGKILL, current, 0);
333 return error;
334 }
335
336 error = bprm->file->f_op->read(bprm->file,
337 (char __user *)text_addr,
338 ex.a_text+ex.a_data, &pos);
339 if ((signed long)error < 0) {
340 send_sig(SIGKILL, current, 0);
341 return error;
342 }
343
344 flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
345 } else {
346#ifdef WARN_OLD
347 static unsigned long error_time, error_time2;
348 if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
349 (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ)
350 {
351 printk(KERN_NOTICE "executable not page aligned\n");
352 error_time2 = jiffies;
353 }
354
355 if ((fd_offset & ~PAGE_MASK) != 0 &&
356 (jiffies-error_time) > 5*HZ)
357 {
358 printk(KERN_WARNING
359 "fd_offset is not page aligned. Please convert program: %s\n",
360 bprm->file->f_path.dentry->d_name.name);
361 error_time = jiffies;
362 }
363#endif
364
365 if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
366 loff_t pos = fd_offset;
367 down_write(&current->mm->mmap_sem);
368 do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
369 up_write(&current->mm->mmap_sem);
370 bprm->file->f_op->read(bprm->file,
371 (char __user *)N_TXTADDR(ex),
372 ex.a_text+ex.a_data, &pos);
373 flush_icache_range((unsigned long) N_TXTADDR(ex),
374 (unsigned long) N_TXTADDR(ex) +
375 ex.a_text+ex.a_data);
376 goto beyond_if;
377 }
378
379 down_write(&current->mm->mmap_sem);
380 error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
381 PROT_READ | PROT_EXEC,
382 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
383 fd_offset);
384 up_write(&current->mm->mmap_sem);
385
386 if (error != N_TXTADDR(ex)) {
387 send_sig(SIGKILL, current, 0);
388 return error;
389 }
390
391 down_write(&current->mm->mmap_sem);
392 error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
393 PROT_READ | PROT_WRITE | PROT_EXEC,
394 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
395 fd_offset + ex.a_text);
396 up_write(&current->mm->mmap_sem);
397 if (error != N_DATADDR(ex)) {
398 send_sig(SIGKILL, current, 0);
399 return error;
400 }
401 }
402beyond_if:
403 set_binfmt(&aout_format);
404
405 set_brk(current->mm->start_brk, current->mm->brk);
406
407 retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
408 if (retval < 0) {
409 /* Someone check-me: is this error path enough? */
410 send_sig(SIGKILL, current, 0);
411 return retval;
412 }
413
414 current->mm->start_stack =
415 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
416 /* start thread */
417	asm volatile("movl %0,%%fs" :: "r" (0));
418 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
419 load_gs_index(0);
420 (regs)->rip = ex.a_entry;
421 (regs)->rsp = current->mm->start_stack;
422 (regs)->eflags = 0x200;
423 (regs)->cs = __USER32_CS;
424 (regs)->ss = __USER32_DS;
425 set_fs(USER_DS);
426 if (unlikely(current->ptrace & PT_PTRACED)) {
427 if (current->ptrace & PT_TRACE_EXEC)
428 ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
429 else
430 send_sig(SIGTRAP, current, 0);
431 }
432 return 0;
433}
434
435static int load_aout_library(struct file *file)
436{
437 struct inode * inode;
438 unsigned long bss, start_addr, len;
439 unsigned long error;
440 int retval;
441 struct exec ex;
442
443 inode = file->f_path.dentry->d_inode;
444
445 retval = -ENOEXEC;
446 error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
447 if (error != sizeof(ex))
448 goto out;
449
450 /* We come in here for the regular a.out style of shared libraries */
451 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
452 N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
453 i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
454 goto out;
455 }
456
457 if (N_FLAGS(ex))
458 goto out;
459
460 /* For QMAGIC, the starting address is 0x20 into the page. We mask
461 this off to get the starting address for the page */
462
463 start_addr = ex.a_entry & 0xfffff000;
464
465 if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
466 loff_t pos = N_TXTOFF(ex);
467
468#ifdef WARN_OLD
469 static unsigned long error_time;
470 if ((jiffies-error_time) > 5*HZ)
471 {
472 printk(KERN_WARNING
473 "N_TXTOFF is not page aligned. Please convert library: %s\n",
474 file->f_path.dentry->d_name.name);
475 error_time = jiffies;
476 }
477#endif
478 down_write(&current->mm->mmap_sem);
479 do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
480 up_write(&current->mm->mmap_sem);
481
482 file->f_op->read(file, (char __user *)start_addr,
483 ex.a_text + ex.a_data, &pos);
484 flush_icache_range((unsigned long) start_addr,
485 (unsigned long) start_addr + ex.a_text + ex.a_data);
486
487 retval = 0;
488 goto out;
489 }
490 /* Now use mmap to map the library into memory. */
491 down_write(&current->mm->mmap_sem);
492 error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
493 PROT_READ | PROT_WRITE | PROT_EXEC,
494 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT,
495 N_TXTOFF(ex));
496 up_write(&current->mm->mmap_sem);
497 retval = error;
498 if (error != start_addr)
499 goto out;
500
501 len = PAGE_ALIGN(ex.a_text + ex.a_data);
502 bss = ex.a_text + ex.a_data + ex.a_bss;
503 if (bss > len) {
504 down_write(&current->mm->mmap_sem);
505 error = do_brk(start_addr + len, bss - len);
506 up_write(&current->mm->mmap_sem);
507 retval = error;
508 if (error != start_addr + len)
509 goto out;
510 }
511 retval = 0;
512out:
513 return retval;
514}
515
516static int __init init_aout_binfmt(void)
517{
518 return register_binfmt(&aout_format);
519}
520
521static void __exit exit_aout_binfmt(void)
522{
523 unregister_binfmt(&aout_format);
524}
525
526module_init(init_aout_binfmt);
527module_exit(exit_aout_binfmt);
528MODULE_LICENSE("GPL");
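The WARN_OLD messages in the loader above fire when an old-style a.out binary keeps its text at a non-page-aligned file offset; such binaries take the slow do_brk()+read() path instead of being mmap()ed. A minimal userspace sketch of the same check, assuming the historical <a.out.h> header (struct exec, N_TXTOFF()) is still installed:

#include <a.out.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct exec ex;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || read(fd, &ex, sizeof(ex)) != sizeof(ex))
		return 1;
	/* Same test the loader performs: low 12 bits of the text offset. */
	printf("%s: text offset %#lx is %spage aligned\n", argv[1],
	       (unsigned long)N_TXTOFF(ex),
	       (N_TXTOFF(ex) & 0xfff) ? "not " : "");
	close(fd);
	return 0;
}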
diff --git a/arch/x86/ia32/ia32_binfmt.c b/arch/x86/ia32/ia32_binfmt.c
new file mode 100644
index 000000000000..dffd2ac72747
--- /dev/null
+++ b/arch/x86/ia32/ia32_binfmt.c
@@ -0,0 +1,320 @@
1/*
2 * Written 2000,2002 by Andi Kleen.
3 *
4 * Loosely based on the sparc64 and IA64 32bit emulation loaders.
5 * This tricks binfmt_elf.c into loading 32bit binaries using lots
6 * of ugly preprocessor tricks. Talk about very very poor man's inheritance.
7 */
8#define __ASM_X86_64_ELF_H 1
9
10#undef ELF_CLASS
11#define ELF_CLASS ELFCLASS32
12
13#include <linux/types.h>
14#include <linux/stddef.h>
15#include <linux/rwsem.h>
16#include <linux/sched.h>
17#include <linux/compat.h>
18#include <linux/string.h>
19#include <linux/binfmts.h>
20#include <linux/mm.h>
21#include <linux/security.h>
22
23#include <asm/segment.h>
24#include <asm/ptrace.h>
25#include <asm/processor.h>
26#include <asm/user32.h>
27#include <asm/sigcontext32.h>
28#include <asm/fpu32.h>
29#include <asm/i387.h>
30#include <asm/uaccess.h>
31#include <asm/ia32.h>
32#include <asm/vsyscall32.h>
33
34#define ELF_NAME "elf/i386"
35
36#define AT_SYSINFO 32
37#define AT_SYSINFO_EHDR 33
38
39int sysctl_vsyscall32 = 1;
40
41#undef ARCH_DLINFO
42#define ARCH_DLINFO do { \
43 if (sysctl_vsyscall32) { \
44 current->mm->context.vdso = (void *)VSYSCALL32_BASE; \
45 NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
46 NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \
47 } \
48} while(0)
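ARCH_DLINFO above publishes the 32-bit vsyscall entry point and vDSO base to a new compat process as AT_SYSINFO (32) and AT_SYSINFO_EHDR (33) auxiliary-vector entries whenever sysctl_vsyscall32 is set. An illustrative sketch of a 32-bit program reading those entries back out of /proc/self/auxv (32-bit build assumed, so the entries are Elf32-sized):

#include <elf.h>
#include <stdio.h>

int main(void)
{
	Elf32_auxv_t aux;
	FILE *f = fopen("/proc/self/auxv", "rb");

	if (!f)
		return 1;
	while (fread(&aux, sizeof(aux), 1, f) == 1 && aux.a_type != AT_NULL) {
		if (aux.a_type == 32)	/* AT_SYSINFO */
			printf("AT_SYSINFO      = %#x\n", aux.a_un.a_val);
		if (aux.a_type == 33)	/* AT_SYSINFO_EHDR */
			printf("AT_SYSINFO_EHDR = %#x\n", aux.a_un.a_val);
	}
	fclose(f);
	return 0;
}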
49
50struct file;
51struct elf_phdr;
52
53#define IA32_EMULATOR 1
54
55#define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
56
57#undef ELF_ARCH
58#define ELF_ARCH EM_386
59
60#define ELF_DATA ELFDATA2LSB
61
62#define USE_ELF_CORE_DUMP 1
63
64/* Override elfcore.h */
65#define _LINUX_ELFCORE_H 1
66typedef unsigned int elf_greg_t;
67
68#define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t))
69typedef elf_greg_t elf_gregset_t[ELF_NGREG];
70
71struct elf_siginfo
72{
73 int si_signo; /* signal number */
74 int si_code; /* extra code */
75 int si_errno; /* errno */
76};
77
78#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0)
79
80struct elf_prstatus
81{
82 struct elf_siginfo pr_info; /* Info associated with signal */
83 short pr_cursig; /* Current signal */
84 unsigned int pr_sigpend; /* Set of pending signals */
85 unsigned int pr_sighold; /* Set of held signals */
86 pid_t pr_pid;
87 pid_t pr_ppid;
88 pid_t pr_pgrp;
89 pid_t pr_sid;
90 struct compat_timeval pr_utime; /* User time */
91 struct compat_timeval pr_stime; /* System time */
92 struct compat_timeval pr_cutime; /* Cumulative user time */
93 struct compat_timeval pr_cstime; /* Cumulative system time */
94 elf_gregset_t pr_reg; /* GP registers */
95 int pr_fpvalid; /* True if math co-processor being used. */
96};
97
98#define ELF_PRARGSZ (80) /* Number of chars for args */
99
100struct elf_prpsinfo
101{
102 char pr_state; /* numeric process state */
103 char pr_sname; /* char for pr_state */
104 char pr_zomb; /* zombie */
105 char pr_nice; /* nice val */
106 unsigned int pr_flag; /* flags */
107 __u16 pr_uid;
108 __u16 pr_gid;
109 pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid;
110 /* Lots missing */
111 char pr_fname[16]; /* filename of executable */
112 char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */
113};
114
115#define __STR(x) #x
116#define STR(x) __STR(x)
117
118#define _GET_SEG(x) \
119 ({ __u32 seg; asm("movl %%" STR(x) ",%0" : "=r"(seg)); seg; })
120
121/* Assumes current==process to be dumped */
122#define ELF_CORE_COPY_REGS(pr_reg, regs) \
123 pr_reg[0] = regs->rbx; \
124 pr_reg[1] = regs->rcx; \
125 pr_reg[2] = regs->rdx; \
126 pr_reg[3] = regs->rsi; \
127 pr_reg[4] = regs->rdi; \
128 pr_reg[5] = regs->rbp; \
129 pr_reg[6] = regs->rax; \
130 pr_reg[7] = _GET_SEG(ds); \
131 pr_reg[8] = _GET_SEG(es); \
132 pr_reg[9] = _GET_SEG(fs); \
133 pr_reg[10] = _GET_SEG(gs); \
134 pr_reg[11] = regs->orig_rax; \
135 pr_reg[12] = regs->rip; \
136 pr_reg[13] = regs->cs; \
137 pr_reg[14] = regs->eflags; \
138 pr_reg[15] = regs->rsp; \
139 pr_reg[16] = regs->ss;
140
141#define user user32
142
143#undef elf_read_implies_exec
144#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X)
145//#include <asm/ia32.h>
146#include <linux/elf.h>
147
148typedef struct user_i387_ia32_struct elf_fpregset_t;
149typedef struct user32_fxsr_struct elf_fpxregset_t;
150
151
152static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs)
153{
154 ELF_CORE_COPY_REGS((*elfregs), regs)
155}
156
157static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs)
158{
159 struct pt_regs *pp = task_pt_regs(t);
160 ELF_CORE_COPY_REGS((*elfregs), pp);
161 /* fix wrong segments */
162 (*elfregs)[7] = t->thread.ds;
163 (*elfregs)[9] = t->thread.fsindex;
164 (*elfregs)[10] = t->thread.gsindex;
165 (*elfregs)[8] = t->thread.es;
166 return 1;
167}
168
169static inline int
170elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu)
171{
172 struct _fpstate_ia32 *fpstate = (void*)fpu;
173 mm_segment_t oldfs = get_fs();
174
175 if (!tsk_used_math(tsk))
176 return 0;
177 if (!regs)
178 regs = task_pt_regs(tsk);
179 if (tsk == current)
180 unlazy_fpu(tsk);
181 set_fs(KERNEL_DS);
182 save_i387_ia32(tsk, fpstate, regs, 1);
183 /* Correct for i386 bug. It puts the fop into the upper 16bits of
184 the tag word (like FXSAVE), not into the fcs*/
185 fpstate->cssel |= fpstate->tag & 0xffff0000;
186 set_fs(oldfs);
187 return 1;
188}
189
190#define ELF_CORE_COPY_XFPREGS 1
191static inline int
192elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
193{
194 struct pt_regs *regs = task_pt_regs(t);
195 if (!tsk_used_math(t))
196 return 0;
197 if (t == current)
198 unlazy_fpu(t);
199 memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t));
200 xfpu->fcs = regs->cs;
201 xfpu->fos = t->thread.ds; /* right? */
202 return 1;
203}
204
205#undef elf_check_arch
206#define elf_check_arch(x) \
207 ((x)->e_machine == EM_386)
208
209extern int force_personality32;
210
211#define ELF_EXEC_PAGESIZE PAGE_SIZE
212#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
213#define ELF_PLATFORM ("i686")
214#define SET_PERSONALITY(ex, ibcs2) \
215do { \
216 unsigned long new_flags = 0; \
217 if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \
218 new_flags = _TIF_IA32; \
219 if ((current_thread_info()->flags & _TIF_IA32) \
220 != new_flags) \
221 set_thread_flag(TIF_ABI_PENDING); \
222 else \
223 clear_thread_flag(TIF_ABI_PENDING); \
224 /* XXX This overwrites the user set personality */ \
225 current->personality |= force_personality32; \
226} while (0)
227
228/* Override some function names */
229#define elf_format elf32_format
230
231#define init_elf_binfmt init_elf32_binfmt
232#define exit_elf_binfmt exit_elf32_binfmt
233
234#define load_elf_binary load_elf32_binary
235
236#define ELF_PLAT_INIT(r, load_addr) elf32_init(r)
237
238#undef start_thread
239#define start_thread(regs,new_rip,new_rsp) do { \
240 asm volatile("movl %0,%%fs" :: "r" (0)); \
241 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \
242 load_gs_index(0); \
243 (regs)->rip = (new_rip); \
244 (regs)->rsp = (new_rsp); \
245 (regs)->eflags = 0x200; \
246 (regs)->cs = __USER32_CS; \
247 (regs)->ss = __USER32_DS; \
248 set_fs(USER_DS); \
249} while(0)
250
251
252#include <linux/module.h>
253
254MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries.");
255MODULE_AUTHOR("Eric Youngdale, Andi Kleen");
256
257#undef MODULE_DESCRIPTION
258#undef MODULE_AUTHOR
259
260static void elf32_init(struct pt_regs *);
261
262#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
263#define arch_setup_additional_pages syscall32_setup_pages
264extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
265
266#include "../../../fs/binfmt_elf.c"
267
268static void elf32_init(struct pt_regs *regs)
269{
270 struct task_struct *me = current;
271 regs->rdi = 0;
272 regs->rsi = 0;
273 regs->rdx = 0;
274 regs->rcx = 0;
275 regs->rax = 0;
276 regs->rbx = 0;
277 regs->rbp = 0;
278 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
279 regs->r13 = regs->r14 = regs->r15 = 0;
280 me->thread.fs = 0;
281 me->thread.gs = 0;
282 me->thread.fsindex = 0;
283 me->thread.gsindex = 0;
284 me->thread.ds = __USER_DS;
285 me->thread.es = __USER_DS;
286}
287
288#ifdef CONFIG_SYSCTL
289/* Register vsyscall32 into the ABI table */
290#include <linux/sysctl.h>
291
292static ctl_table abi_table2[] = {
293 {
294 .ctl_name = 99,
295 .procname = "vsyscall32",
296 .data = &sysctl_vsyscall32,
297 .maxlen = sizeof(int),
298 .mode = 0644,
299 .proc_handler = proc_dointvec
300 },
301 {}
302};
303
304static ctl_table abi_root_table2[] = {
305 {
306 .ctl_name = CTL_ABI,
307 .procname = "abi",
308 .mode = 0555,
309 .child = abi_table2
310 },
311 {}
312};
313
314static __init int ia32_binfmt_init(void)
315{
316 register_sysctl_table(abi_root_table2);
317 return 0;
318}
319__initcall(ia32_binfmt_init);
320#endif
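The sysctl table registered above appears as /proc/sys/abi/vsyscall32 and toggles the sysctl_vsyscall32 flag consulted by ARCH_DLINFO for subsequently exec'ed 32-bit processes. A small sketch of flipping it (path assumed from the table above; requires root):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* "0" disables the 32-bit vsyscall page for new processes, "1" enables it. */
	const char *val = (argc > 1) ? argv[1] : "1";
	int fd = open("/proc/sys/abi/vsyscall32", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, 1) != 1)
		perror("write");
	close(fd);
	return 0;
}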
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
new file mode 100644
index 000000000000..6ea19c25f90d
--- /dev/null
+++ b/arch/x86/ia32/ia32_signal.c
@@ -0,0 +1,617 @@
1/*
2 * linux/arch/x86_64/ia32/ia32_signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
7 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
8 * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen
9 */
10
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/kernel.h>
15#include <linux/signal.h>
16#include <linux/errno.h>
17#include <linux/wait.h>
18#include <linux/ptrace.h>
19#include <linux/unistd.h>
20#include <linux/stddef.h>
21#include <linux/personality.h>
22#include <linux/compat.h>
23#include <linux/binfmts.h>
24#include <asm/ucontext.h>
25#include <asm/uaccess.h>
26#include <asm/i387.h>
27#include <asm/ia32.h>
28#include <asm/ptrace.h>
29#include <asm/ia32_unistd.h>
30#include <asm/user32.h>
31#include <asm/sigcontext32.h>
32#include <asm/fpu32.h>
33#include <asm/proto.h>
34#include <asm/vsyscall32.h>
35
36#define DEBUG_SIG 0
37
38#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
39
40asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
41void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
42
43int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
44{
45 int err;
46 if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
47 return -EFAULT;
48
49 /* If you change siginfo_t structure, please make sure that
50 this code is fixed accordingly.
51 It should never copy any pad contained in the structure
52 to avoid security leaks, but must copy the generic
53 3 ints plus the relevant union member. */
54 err = __put_user(from->si_signo, &to->si_signo);
55 err |= __put_user(from->si_errno, &to->si_errno);
56 err |= __put_user((short)from->si_code, &to->si_code);
57
58 if (from->si_code < 0) {
59 err |= __put_user(from->si_pid, &to->si_pid);
60 err |= __put_user(from->si_uid, &to->si_uid);
61 err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr);
62 } else {
63 /* First 32bits of unions are always present:
64 * si_pid === si_band === si_tid === si_addr(LS half) */
65 err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]);
66 switch (from->si_code >> 16) {
67 case __SI_FAULT >> 16:
68 break;
69 case __SI_CHLD >> 16:
70 err |= __put_user(from->si_utime, &to->si_utime);
71 err |= __put_user(from->si_stime, &to->si_stime);
72 err |= __put_user(from->si_status, &to->si_status);
73 /* FALL THROUGH */
74 default:
75 case __SI_KILL >> 16:
76 err |= __put_user(from->si_uid, &to->si_uid);
77 break;
78 case __SI_POLL >> 16:
79 err |= __put_user(from->si_fd, &to->si_fd);
80 break;
81 case __SI_TIMER >> 16:
82 err |= __put_user(from->si_overrun, &to->si_overrun);
83 err |= __put_user(ptr_to_compat(from->si_ptr),
84 &to->si_ptr);
85 break;
86 case __SI_RT >> 16: /* This is not generated by the kernel as of now. */
87 case __SI_MESGQ >> 16:
88 err |= __put_user(from->si_uid, &to->si_uid);
89 err |= __put_user(from->si_int, &to->si_int);
90 break;
91 }
92 }
93 return err;
94}
95
96int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
97{
98 int err;
99 u32 ptr32;
100 if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t)))
101 return -EFAULT;
102
103 err = __get_user(to->si_signo, &from->si_signo);
104 err |= __get_user(to->si_errno, &from->si_errno);
105 err |= __get_user(to->si_code, &from->si_code);
106
107 err |= __get_user(to->si_pid, &from->si_pid);
108 err |= __get_user(to->si_uid, &from->si_uid);
109 err |= __get_user(ptr32, &from->si_ptr);
110 to->si_ptr = compat_ptr(ptr32);
111
112 return err;
113}
114
115asmlinkage long
116sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
117{
118 mask &= _BLOCKABLE;
119 spin_lock_irq(&current->sighand->siglock);
120 current->saved_sigmask = current->blocked;
121 siginitset(&current->blocked, mask);
122 recalc_sigpending();
123 spin_unlock_irq(&current->sighand->siglock);
124
125 current->state = TASK_INTERRUPTIBLE;
126 schedule();
127 set_thread_flag(TIF_RESTORE_SIGMASK);
128 return -ERESTARTNOHAND;
129}
130
131asmlinkage long
132sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
133 stack_ia32_t __user *uoss_ptr,
134 struct pt_regs *regs)
135{
136 stack_t uss,uoss;
137 int ret;
138 mm_segment_t seg;
139 if (uss_ptr) {
140 u32 ptr;
141 memset(&uss,0,sizeof(stack_t));
142 if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) ||
143 __get_user(ptr, &uss_ptr->ss_sp) ||
144 __get_user(uss.ss_flags, &uss_ptr->ss_flags) ||
145 __get_user(uss.ss_size, &uss_ptr->ss_size))
146 return -EFAULT;
147 uss.ss_sp = compat_ptr(ptr);
148 }
149 seg = get_fs();
150 set_fs(KERNEL_DS);
151 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp);
152 set_fs(seg);
153 if (ret >= 0 && uoss_ptr) {
154 if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) ||
155 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
156 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
157 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
158 ret = -EFAULT;
159 }
160 return ret;
161}
162
163/*
164 * Do a signal return; undo the signal stack.
165 */
166
167struct sigframe
168{
169 u32 pretcode;
170 int sig;
171 struct sigcontext_ia32 sc;
172 struct _fpstate_ia32 fpstate;
173 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
174 char retcode[8];
175};
176
177struct rt_sigframe
178{
179 u32 pretcode;
180 int sig;
181 u32 pinfo;
182 u32 puc;
183 compat_siginfo_t info;
184 struct ucontext_ia32 uc;
185 struct _fpstate_ia32 fpstate;
186 char retcode[8];
187};
188
189static int
190ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax)
191{
192 unsigned int err = 0;
193
194 /* Always make any pending restarted system calls return -EINTR */
195 current_thread_info()->restart_block.fn = do_no_restart_syscall;
196
197#if DEBUG_SIG
198 printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n",
199 sc, sc->err, sc->eip, sc->cs, sc->eflags);
200#endif
201#define COPY(x) { \
202 unsigned int reg; \
203 err |= __get_user(reg, &sc->e ##x); \
204 regs->r ## x = reg; \
205}
206
207#define RELOAD_SEG(seg,mask) \
208 { unsigned int cur; \
209 unsigned short pre; \
210 err |= __get_user(pre, &sc->seg); \
211 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \
212 pre |= mask; \
213 if (pre != cur) loadsegment(seg,pre); }
214
215 /* Reload fs and gs if they have changed in the signal handler.
216	   This does not handle long fs/gs base changes in the handler, but
217	   at least does not clobber them in the normal case. */
218
219 {
220 unsigned gs, oldgs;
221 err |= __get_user(gs, &sc->gs);
222 gs |= 3;
223 asm("movl %%gs,%0" : "=r" (oldgs));
224 if (gs != oldgs)
225 load_gs_index(gs);
226 }
227 RELOAD_SEG(fs,3);
228 RELOAD_SEG(ds,3);
229 RELOAD_SEG(es,3);
230
231 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
232 COPY(dx); COPY(cx); COPY(ip);
233 /* Don't touch extended registers */
234
235 err |= __get_user(regs->cs, &sc->cs);
236 regs->cs |= 3;
237 err |= __get_user(regs->ss, &sc->ss);
238 regs->ss |= 3;
239
240 {
241 unsigned int tmpflags;
242 err |= __get_user(tmpflags, &sc->eflags);
243 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
244 regs->orig_rax = -1; /* disable syscall checks */
245 }
246
247 {
248 u32 tmp;
249 struct _fpstate_ia32 __user * buf;
250 err |= __get_user(tmp, &sc->fpstate);
251 buf = compat_ptr(tmp);
252 if (buf) {
253 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
254 goto badframe;
255 err |= restore_i387_ia32(current, buf, 0);
256 } else {
257 struct task_struct *me = current;
258 if (used_math()) {
259 clear_fpu(me);
260 clear_used_math();
261 }
262 }
263 }
264
265 {
266 u32 tmp;
267 err |= __get_user(tmp, &sc->eax);
268 *peax = tmp;
269 }
270 return err;
271
272badframe:
273 return 1;
274}
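For reference, the 0x40DD5 literal used when restoring eflags in ia32_restore_sigcontext() above is the set of arithmetic and control flags a signal handler is allowed to modify. A small sketch reconstructing it from the standard EFLAGS bit positions (the X86_EFLAGS_* names are used here only for illustration):

#include <stdio.h>

/* Standard x86 EFLAGS bit positions. */
#define X86_EFLAGS_CF (1u << 0)
#define X86_EFLAGS_PF (1u << 2)
#define X86_EFLAGS_AF (1u << 4)
#define X86_EFLAGS_ZF (1u << 6)
#define X86_EFLAGS_SF (1u << 7)
#define X86_EFLAGS_TF (1u << 8)
#define X86_EFLAGS_DF (1u << 10)
#define X86_EFLAGS_OF (1u << 11)
#define X86_EFLAGS_AC (1u << 18)

int main(void)
{
	unsigned int mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
			    X86_EFLAGS_DF | X86_EFLAGS_OF | X86_EFLAGS_AC;

	/* Prints 0x40dd5, matching the literal in ia32_restore_sigcontext(). */
	printf("%#x\n", mask);
	return 0;
}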
275
276asmlinkage long sys32_sigreturn(struct pt_regs *regs)
277{
278 struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8);
279 sigset_t set;
280 unsigned int eax;
281
282 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
283 goto badframe;
284 if (__get_user(set.sig[0], &frame->sc.oldmask)
285 || (_COMPAT_NSIG_WORDS > 1
286 && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask,
287 sizeof(frame->extramask))))
288 goto badframe;
289
290 sigdelsetmask(&set, ~_BLOCKABLE);
291 spin_lock_irq(&current->sighand->siglock);
292 current->blocked = set;
293 recalc_sigpending();
294 spin_unlock_irq(&current->sighand->siglock);
295
296 if (ia32_restore_sigcontext(regs, &frame->sc, &eax))
297 goto badframe;
298 return eax;
299
300badframe:
301 signal_fault(regs, frame, "32bit sigreturn");
302 return 0;
303}
304
305asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
306{
307 struct rt_sigframe __user *frame;
308 sigset_t set;
309 unsigned int eax;
310 struct pt_regs tregs;
311
312 frame = (struct rt_sigframe __user *)(regs->rsp - 4);
313
314 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
315 goto badframe;
316 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
317 goto badframe;
318
319 sigdelsetmask(&set, ~_BLOCKABLE);
320 spin_lock_irq(&current->sighand->siglock);
321 current->blocked = set;
322 recalc_sigpending();
323 spin_unlock_irq(&current->sighand->siglock);
324
325 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
326 goto badframe;
327
328 tregs = *regs;
329 if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
330 goto badframe;
331
332 return eax;
333
334badframe:
335 signal_fault(regs,frame,"32bit rt sigreturn");
336 return 0;
337}
338
339/*
340 * Set up a signal frame.
341 */
342
343static int
344ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate,
345 struct pt_regs *regs, unsigned int mask)
346{
347 int tmp, err = 0;
348
349 tmp = 0;
350 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
351 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
352 __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
353 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
354 __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp));
355 err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
356 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
357 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
358
359 err |= __put_user((u32)regs->rdi, &sc->edi);
360 err |= __put_user((u32)regs->rsi, &sc->esi);
361 err |= __put_user((u32)regs->rbp, &sc->ebp);
362 err |= __put_user((u32)regs->rsp, &sc->esp);
363 err |= __put_user((u32)regs->rbx, &sc->ebx);
364 err |= __put_user((u32)regs->rdx, &sc->edx);
365 err |= __put_user((u32)regs->rcx, &sc->ecx);
366 err |= __put_user((u32)regs->rax, &sc->eax);
367 err |= __put_user((u32)regs->cs, &sc->cs);
368 err |= __put_user((u32)regs->ss, &sc->ss);
369 err |= __put_user(current->thread.trap_no, &sc->trapno);
370 err |= __put_user(current->thread.error_code, &sc->err);
371 err |= __put_user((u32)regs->rip, &sc->eip);
372 err |= __put_user((u32)regs->eflags, &sc->eflags);
373 err |= __put_user((u32)regs->rsp, &sc->esp_at_signal);
374
375 tmp = save_i387_ia32(current, fpstate, regs, 0);
376 if (tmp < 0)
377 err = -EFAULT;
378 else {
379 clear_used_math();
380 stts();
381 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
382 &sc->fpstate);
383 }
384
385 /* non-iBCS2 extensions.. */
386 err |= __put_user(mask, &sc->oldmask);
387 err |= __put_user(current->thread.cr2, &sc->cr2);
388
389 return err;
390}
391
392/*
393 * Determine which stack to use..
394 */
395static void __user *
396get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
397{
398 unsigned long rsp;
399
400 /* Default to using normal stack */
401 rsp = regs->rsp;
402
403 /* This is the X/Open sanctioned signal stack switching. */
404 if (ka->sa.sa_flags & SA_ONSTACK) {
405 if (sas_ss_flags(rsp) == 0)
406 rsp = current->sas_ss_sp + current->sas_ss_size;
407 }
408
409 /* This is the legacy signal stack switching. */
410 else if ((regs->ss & 0xffff) != __USER_DS &&
411 !(ka->sa.sa_flags & SA_RESTORER) &&
412 ka->sa.sa_restorer) {
413 rsp = (unsigned long) ka->sa.sa_restorer;
414 }
415
416 rsp -= frame_size;
417 /* Align the stack pointer according to the i386 ABI,
418 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
419 rsp = ((rsp + 4) & -16ul) - 4;
420 return (void __user *) rsp;
421}
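A quick worked check of the alignment arithmetic at the end of get_sigframe() above: after rsp = ((rsp + 4) & -16ul) - 4, the handler always observes ((esp + 4) & 15) == 0, as the i386 ABI comment states. A small sketch with an arbitrary input value:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* Arbitrary example value for the stack pointer after the frame
	 * size has been subtracted; any input gives the same post-condition. */
	unsigned long rsp = 0xffffd123UL;

	rsp = ((rsp + 4) & -16UL) - 4;
	assert(((rsp + 4) & 15) == 0);
	printf("aligned sp = %#lx\n", rsp);	/* 0xffffd11c for this input */
	return 0;
}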
422
423int ia32_setup_frame(int sig, struct k_sigaction *ka,
424 compat_sigset_t *set, struct pt_regs * regs)
425{
426 struct sigframe __user *frame;
427 int err = 0;
428
429 frame = get_sigframe(ka, regs, sizeof(*frame));
430
431 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
432 goto give_sigsegv;
433
434 err |= __put_user(sig, &frame->sig);
435 if (err)
436 goto give_sigsegv;
437
438 err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs,
439 set->sig[0]);
440 if (err)
441 goto give_sigsegv;
442
443 if (_COMPAT_NSIG_WORDS > 1) {
444 err |= __copy_to_user(frame->extramask, &set->sig[1],
445 sizeof(frame->extramask));
446 }
447 if (err)
448 goto give_sigsegv;
449
450 /* Return stub is in 32bit vsyscall page */
451 {
452 void __user *restorer;
453 if (current->binfmt->hasvdso)
454 restorer = VSYSCALL32_SIGRETURN;
455 else
456 restorer = (void *)&frame->retcode;
457 if (ka->sa.sa_flags & SA_RESTORER)
458 restorer = ka->sa.sa_restorer;
459 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
460 }
461 /* These are actually not used anymore, but left because some
462 gdb versions depend on them as a marker. */
463 {
464 /* copy_to_user optimizes that into a single 8 byte store */
465 static const struct {
466 u16 poplmovl;
467 u32 val;
468 u16 int80;
469 u16 pad;
470 } __attribute__((packed)) code = {
471 0xb858, /* popl %eax ; movl $...,%eax */
472 __NR_ia32_sigreturn,
473 0x80cd, /* int $0x80 */
474 0,
475 };
476 err |= __copy_to_user(frame->retcode, &code, 8);
477 }
478 if (err)
479 goto give_sigsegv;
480
481 /* Set up registers for signal handler */
482 regs->rsp = (unsigned long) frame;
483 regs->rip = (unsigned long) ka->sa.sa_handler;
484
485 /* Make -mregparm=3 work */
486 regs->rax = sig;
487 regs->rdx = 0;
488 regs->rcx = 0;
489
490 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
491 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
492
493 regs->cs = __USER32_CS;
494 regs->ss = __USER32_DS;
495
496 set_fs(USER_DS);
497 regs->eflags &= ~TF_MASK;
498 if (test_thread_flag(TIF_SINGLESTEP))
499 ptrace_notify(SIGTRAP);
500
501#if DEBUG_SIG
502 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
503 current->comm, current->pid, frame, regs->rip, frame->pretcode);
504#endif
505
506 return 0;
507
508give_sigsegv:
509 force_sigsegv(sig, current);
510 return -EFAULT;
511}
512
513int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
514 compat_sigset_t *set, struct pt_regs * regs)
515{
516 struct rt_sigframe __user *frame;
517 int err = 0;
518
519 frame = get_sigframe(ka, regs, sizeof(*frame));
520
521 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
522 goto give_sigsegv;
523
524 {
525 struct exec_domain *ed = current_thread_info()->exec_domain;
526 err |= __put_user((ed
527 && ed->signal_invmap
528 && sig < 32
529 ? ed->signal_invmap[sig]
530 : sig),
531 &frame->sig);
532 }
533 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
534 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
535 err |= copy_siginfo_to_user32(&frame->info, info);
536 if (err)
537 goto give_sigsegv;
538
539 /* Create the ucontext. */
540 err |= __put_user(0, &frame->uc.uc_flags);
541 err |= __put_user(0, &frame->uc.uc_link);
542 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
543 err |= __put_user(sas_ss_flags(regs->rsp),
544 &frame->uc.uc_stack.ss_flags);
545 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
546 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
547 regs, set->sig[0]);
548 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
549 if (err)
550 goto give_sigsegv;
551
552
553 {
554 void __user *restorer = VSYSCALL32_RTSIGRETURN;
555 if (ka->sa.sa_flags & SA_RESTORER)
556 restorer = ka->sa.sa_restorer;
557 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
558 }
559
560 /* This is movl $,%eax ; int $0x80 */
561 /* Not actually used anymore, but left because some gdb versions
562 need it. */
563 {
564 /* __copy_to_user optimizes that into a single 8 byte store */
565 static const struct {
566 u8 movl;
567 u32 val;
568 u16 int80;
569 u16 pad;
570 u8 pad2;
571 } __attribute__((packed)) code = {
572 0xb8,
573 __NR_ia32_rt_sigreturn,
574 0x80cd,
575 0,
576 };
577 err |= __copy_to_user(frame->retcode, &code, 8);
578 }
579 if (err)
580 goto give_sigsegv;
581
582 /* Set up registers for signal handler */
583 regs->rsp = (unsigned long) frame;
584 regs->rip = (unsigned long) ka->sa.sa_handler;
585
586 /* Make -mregparm=3 work */
587 regs->rax = sig;
588 regs->rdx = (unsigned long) &frame->info;
589 regs->rcx = (unsigned long) &frame->uc;
595
596 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
597 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
598
599 regs->cs = __USER32_CS;
600 regs->ss = __USER32_DS;
601
602 set_fs(USER_DS);
603 regs->eflags &= ~TF_MASK;
604 if (test_thread_flag(TIF_SINGLESTEP))
605 ptrace_notify(SIGTRAP);
606
607#if DEBUG_SIG
608 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
609 current->comm, current->pid, frame, regs->rip, frame->pretcode);
610#endif
611
612 return 0;
613
614give_sigsegv:
615 force_sigsegv(sig, current);
616 return -EFAULT;
617}
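The rt frame built above is what a 32-bit SA_SIGINFO handler consumes: frame->sig, &frame->info and &frame->uc become its three arguments (and are also preloaded into %eax/%edx/%ecx for -mregparm=3 builds, as the comment notes). An illustrative 32-bit userspace counterpart (printf in a handler is not async-signal-safe; it is used here only for the sketch):

#include <signal.h>
#include <stdio.h>
#include <string.h>

static void handler(int sig, siginfo_t *info, void *ucontext)
{
	/* sig, info and ucontext correspond to frame->sig, &frame->info
	 * and &frame->uc from ia32_setup_rt_frame(). */
	printf("sig=%d si_code=%d uc=%p\n", sig, info->si_code, ucontext);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);
	return 0;
}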
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
new file mode 100644
index 000000000000..18b231810908
--- /dev/null
+++ b/arch/x86/ia32/ia32entry.S
@@ -0,0 +1,736 @@
1/*
2 * Compatibility mode system call entry point for x86-64.
3 *
4 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
5 */
6
7#include <asm/dwarf2.h>
8#include <asm/calling.h>
9#include <asm/asm-offsets.h>
10#include <asm/current.h>
11#include <asm/errno.h>
12#include <asm/ia32_unistd.h>
13#include <asm/thread_info.h>
14#include <asm/segment.h>
15#include <asm/vsyscall32.h>
16#include <asm/irqflags.h>
17#include <linux/linkage.h>
18
19#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
20
21 .macro IA32_ARG_FIXUP noebp=0
22 movl %edi,%r8d
23 .if \noebp
24 .else
25 movl %ebp,%r9d
26 .endif
27 xchg %ecx,%esi
28 movl %ebx,%edi
29 movl %edx,%edx /* zero extension */
30 .endm
31
32 /* clobbers %eax */
33 .macro CLEAR_RREGS
34 xorl %eax,%eax
35 movq %rax,R11(%rsp)
36 movq %rax,R10(%rsp)
37 movq %rax,R9(%rsp)
38 movq %rax,R8(%rsp)
39 .endm
40
41 .macro LOAD_ARGS32 offset
42 movl \offset(%rsp),%r11d
43 movl \offset+8(%rsp),%r10d
44 movl \offset+16(%rsp),%r9d
45 movl \offset+24(%rsp),%r8d
46 movl \offset+40(%rsp),%ecx
47 movl \offset+48(%rsp),%edx
48 movl \offset+56(%rsp),%esi
49 movl \offset+64(%rsp),%edi
50 movl \offset+72(%rsp),%eax
51 .endm
52
53 .macro CFI_STARTPROC32 simple
54 CFI_STARTPROC \simple
55 CFI_UNDEFINED r8
56 CFI_UNDEFINED r9
57 CFI_UNDEFINED r10
58 CFI_UNDEFINED r11
59 CFI_UNDEFINED r12
60 CFI_UNDEFINED r13
61 CFI_UNDEFINED r14
62 CFI_UNDEFINED r15
63 .endm
64
65/*
66 * 32bit SYSENTER instruction entry.
67 *
68 * Arguments:
69 * %eax System call number.
70 * %ebx Arg1
71 * %ecx Arg2
72 * %edx Arg3
73 * %esi Arg4
74 * %edi Arg5
75 * %ebp user stack
76 * 0(%ebp) Arg6
77 *
78 * Interrupts off.
79 *
80 * This is purely a fast path. For anything complicated we use the int 0x80
81 * path below. Set up a complete hardware stack frame to share code
82 * with the int 0x80 path.
83 */
84ENTRY(ia32_sysenter_target)
85 CFI_STARTPROC32 simple
86 CFI_SIGNAL_FRAME
87 CFI_DEF_CFA rsp,0
88 CFI_REGISTER rsp,rbp
89 swapgs
90 movq %gs:pda_kernelstack, %rsp
91 addq $(PDA_STACKOFFSET),%rsp
92 /*
93 * No need to follow this irqs on/off section: the syscall
 94	 * disabled irqs; here we enable them straight after entry:
95 */
96 sti
97 movl %ebp,%ebp /* zero extension */
98 pushq $__USER32_DS
99 CFI_ADJUST_CFA_OFFSET 8
100 /*CFI_REL_OFFSET ss,0*/
101 pushq %rbp
102 CFI_ADJUST_CFA_OFFSET 8
103 CFI_REL_OFFSET rsp,0
104 pushfq
105 CFI_ADJUST_CFA_OFFSET 8
106 /*CFI_REL_OFFSET rflags,0*/
107 movl $VSYSCALL32_SYSEXIT, %r10d
108 CFI_REGISTER rip,r10
109 pushq $__USER32_CS
110 CFI_ADJUST_CFA_OFFSET 8
111 /*CFI_REL_OFFSET cs,0*/
112 movl %eax, %eax
113 pushq %r10
114 CFI_ADJUST_CFA_OFFSET 8
115 CFI_REL_OFFSET rip,0
116 pushq %rax
117 CFI_ADJUST_CFA_OFFSET 8
118 cld
119 SAVE_ARGS 0,0,1
120 /* no need to do an access_ok check here because rbp has been
121 32bit zero extended */
1221: movl (%rbp),%r9d
123 .section __ex_table,"a"
124 .quad 1b,ia32_badarg
125 .previous
126 GET_THREAD_INFO(%r10)
127 orl $TS_COMPAT,threadinfo_status(%r10)
128 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
129 CFI_REMEMBER_STATE
130 jnz sysenter_tracesys
131sysenter_do_call:
132 cmpl $(IA32_NR_syscalls-1),%eax
133 ja ia32_badsys
134 IA32_ARG_FIXUP 1
135 call *ia32_sys_call_table(,%rax,8)
136 movq %rax,RAX-ARGOFFSET(%rsp)
137 GET_THREAD_INFO(%r10)
138 cli
139 TRACE_IRQS_OFF
140 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
141 jnz int_ret_from_sys_call
142 andl $~TS_COMPAT,threadinfo_status(%r10)
143 /* clear IF, that popfq doesn't enable interrupts early */
144 andl $~0x200,EFLAGS-R11(%rsp)
145 RESTORE_ARGS 1,24,1,1,1,1
146 popfq
147 CFI_ADJUST_CFA_OFFSET -8
148 /*CFI_RESTORE rflags*/
149 popq %rcx /* User %esp */
150 CFI_ADJUST_CFA_OFFSET -8
151 CFI_REGISTER rsp,rcx
152 movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */
153 CFI_REGISTER rip,rdx
154 TRACE_IRQS_ON
155 swapgs
156 sti /* sti only takes effect after the next instruction */
157 /* sysexit */
158 .byte 0xf, 0x35
159
160sysenter_tracesys:
161 CFI_RESTORE_STATE
162 SAVE_REST
163 CLEAR_RREGS
164 movq $-ENOSYS,RAX(%rsp) /* really needed? */
165 movq %rsp,%rdi /* &pt_regs -> arg1 */
166 call syscall_trace_enter
167 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
168 RESTORE_REST
169 movl %ebp, %ebp
170 /* no need to do an access_ok check here because rbp has been
171 32bit zero extended */
1721: movl (%rbp),%r9d
173 .section __ex_table,"a"
174 .quad 1b,ia32_badarg
175 .previous
176 jmp sysenter_do_call
177 CFI_ENDPROC
178ENDPROC(ia32_sysenter_target)
179
180/*
181 * 32bit SYSCALL instruction entry.
182 *
183 * Arguments:
184 * %eax System call number.
185 * %ebx Arg1
186 * %ecx return EIP
187 * %edx Arg3
188 * %esi Arg4
189 * %edi Arg5
190 * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
191 * %esp user stack
192 * 0(%esp) Arg6
193 *
194 * Interrupts off.
195 *
196 * This is purely a fast path. For anything complicated we use the int 0x80
197 * path below. Set up a complete hardware stack frame to share code
198 * with the int 0x80 path.
199 */
200ENTRY(ia32_cstar_target)
201 CFI_STARTPROC32 simple
202 CFI_SIGNAL_FRAME
203 CFI_DEF_CFA rsp,PDA_STACKOFFSET
204 CFI_REGISTER rip,rcx
205 /*CFI_REGISTER rflags,r11*/
206 swapgs
207 movl %esp,%r8d
208 CFI_REGISTER rsp,r8
209 movq %gs:pda_kernelstack,%rsp
210 /*
211 * No need to follow this irqs on/off section: the syscall
212	 * disabled irqs and here we enable them straight after entry:
213 */
214 sti
215 SAVE_ARGS 8,1,1
216 movl %eax,%eax /* zero extension */
217 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
218 movq %rcx,RIP-ARGOFFSET(%rsp)
219 CFI_REL_OFFSET rip,RIP-ARGOFFSET
220 movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
221 movl %ebp,%ecx
222 movq $__USER32_CS,CS-ARGOFFSET(%rsp)
223 movq $__USER32_DS,SS-ARGOFFSET(%rsp)
224 movq %r11,EFLAGS-ARGOFFSET(%rsp)
225 /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
226 movq %r8,RSP-ARGOFFSET(%rsp)
227 CFI_REL_OFFSET rsp,RSP-ARGOFFSET
228 /* no need to do an access_ok check here because r8 has been
229 32bit zero extended */
230 /* hardware stack frame is complete now */
2311: movl (%r8),%r9d
232 .section __ex_table,"a"
233 .quad 1b,ia32_badarg
234 .previous
235 GET_THREAD_INFO(%r10)
236 orl $TS_COMPAT,threadinfo_status(%r10)
237 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
238 CFI_REMEMBER_STATE
239 jnz cstar_tracesys
240cstar_do_call:
241 cmpl $IA32_NR_syscalls-1,%eax
242 ja ia32_badsys
243 IA32_ARG_FIXUP 1
244 call *ia32_sys_call_table(,%rax,8)
245 movq %rax,RAX-ARGOFFSET(%rsp)
246 GET_THREAD_INFO(%r10)
247 cli
248 TRACE_IRQS_OFF
249 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
250 jnz int_ret_from_sys_call
251 andl $~TS_COMPAT,threadinfo_status(%r10)
252 RESTORE_ARGS 1,-ARG_SKIP,1,1,1
253 movl RIP-ARGOFFSET(%rsp),%ecx
254 CFI_REGISTER rip,rcx
255 movl EFLAGS-ARGOFFSET(%rsp),%r11d
256 /*CFI_REGISTER rflags,r11*/
257 TRACE_IRQS_ON
258 movl RSP-ARGOFFSET(%rsp),%esp
259 CFI_RESTORE rsp
260 swapgs
261 sysretl
262
263cstar_tracesys:
264 CFI_RESTORE_STATE
265 SAVE_REST
266 CLEAR_RREGS
267 movq $-ENOSYS,RAX(%rsp) /* really needed? */
268 movq %rsp,%rdi /* &pt_regs -> arg1 */
269 call syscall_trace_enter
270 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
271 RESTORE_REST
272 movl RSP-ARGOFFSET(%rsp), %r8d
273 /* no need to do an access_ok check here because r8 has been
274 32bit zero extended */
2751: movl (%r8),%r9d
276 .section __ex_table,"a"
277 .quad 1b,ia32_badarg
278 .previous
279 jmp cstar_do_call
280END(ia32_cstar_target)
281
282ia32_badarg:
283 movq $-EFAULT,%rax
284 jmp ia32_sysret
285 CFI_ENDPROC
286
287/*
288 * Emulated IA32 system calls via int 0x80.
289 *
290 * Arguments:
291 * %eax System call number.
292 * %ebx Arg1
293 * %ecx Arg2
294 * %edx Arg3
295 * %esi Arg4
296 * %edi Arg5
297 * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
298 *
299 * Notes:
300 * Uses the same stack frame as the x86-64 version.
301 * All registers except %eax must be saved (but ptrace may violate that)
302 * Arguments are zero extended. For system calls that want sign extension and
303 * take long arguments a wrapper is needed. Most calls can just be called
304 * directly.
305 * Assumes it is only called from user space and entered with interrupts off.
306 */
307
308ENTRY(ia32_syscall)
309 CFI_STARTPROC32 simple
310 CFI_SIGNAL_FRAME
311 CFI_DEF_CFA rsp,SS+8-RIP
312 /*CFI_REL_OFFSET ss,SS-RIP*/
313 CFI_REL_OFFSET rsp,RSP-RIP
314 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
315 /*CFI_REL_OFFSET cs,CS-RIP*/
316 CFI_REL_OFFSET rip,RIP-RIP
317 swapgs
318 /*
319 * No need to follow this irqs on/off section: the syscall
320	 * disabled irqs and here we enable them straight after entry:
321 */
322 sti
323 movl %eax,%eax
324 pushq %rax
325 CFI_ADJUST_CFA_OFFSET 8
326 cld
327	/* note the registers are not zero extended to the stack frame;
328	   this could be a problem. */
329 SAVE_ARGS 0,0,1
330 GET_THREAD_INFO(%r10)
331 orl $TS_COMPAT,threadinfo_status(%r10)
332 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
333 jnz ia32_tracesys
334ia32_do_syscall:
335 cmpl $(IA32_NR_syscalls-1),%eax
336 ja ia32_badsys
337 IA32_ARG_FIXUP
338 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
339ia32_sysret:
340 movq %rax,RAX-ARGOFFSET(%rsp)
341 jmp int_ret_from_sys_call
342
343ia32_tracesys:
344 SAVE_REST
345 CLEAR_RREGS
346 movq $-ENOSYS,RAX(%rsp) /* really needed? */
347 movq %rsp,%rdi /* &pt_regs -> arg1 */
348 call syscall_trace_enter
349 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
350 RESTORE_REST
351 jmp ia32_do_syscall
352END(ia32_syscall)
353
354ia32_badsys:
355 movq $0,ORIG_RAX-ARGOFFSET(%rsp)
356 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
357 jmp int_ret_from_sys_call
358
359quiet_ni_syscall:
360 movq $-ENOSYS,%rax
361 ret
362 CFI_ENDPROC
363
364 .macro PTREGSCALL label, func, arg
365 .globl \label
366\label:
367 leaq \func(%rip),%rax
368 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
369 jmp ia32_ptregs_common
370 .endm
371
372 CFI_STARTPROC32
373
374 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
375 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
376 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
377 PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
378 PTREGSCALL stub32_execve, sys32_execve, %rcx
379 PTREGSCALL stub32_fork, sys_fork, %rdi
380 PTREGSCALL stub32_clone, sys32_clone, %rdx
381 PTREGSCALL stub32_vfork, sys_vfork, %rdi
382 PTREGSCALL stub32_iopl, sys_iopl, %rsi
383 PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
384
385ENTRY(ia32_ptregs_common)
386 popq %r11
387 CFI_ENDPROC
388 CFI_STARTPROC32 simple
389 CFI_SIGNAL_FRAME
390 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
391 CFI_REL_OFFSET rax,RAX-ARGOFFSET
392 CFI_REL_OFFSET rcx,RCX-ARGOFFSET
393 CFI_REL_OFFSET rdx,RDX-ARGOFFSET
394 CFI_REL_OFFSET rsi,RSI-ARGOFFSET
395 CFI_REL_OFFSET rdi,RDI-ARGOFFSET
396 CFI_REL_OFFSET rip,RIP-ARGOFFSET
397/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
398/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
399 CFI_REL_OFFSET rsp,RSP-ARGOFFSET
400/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
401 SAVE_REST
402 call *%rax
403 RESTORE_REST
404 jmp ia32_sysret /* misbalances the return cache */
405 CFI_ENDPROC
406END(ia32_ptregs_common)
407
408 .section .rodata,"a"
409 .align 8
410ia32_sys_call_table:
411 .quad sys_restart_syscall
412 .quad sys_exit
413 .quad stub32_fork
414 .quad sys_read
415 .quad sys_write
416 .quad compat_sys_open /* 5 */
417 .quad sys_close
418 .quad sys32_waitpid
419 .quad sys_creat
420 .quad sys_link
421 .quad sys_unlink /* 10 */
422 .quad stub32_execve
423 .quad sys_chdir
424 .quad compat_sys_time
425 .quad sys_mknod
426 .quad sys_chmod /* 15 */
427 .quad sys_lchown16
428 .quad quiet_ni_syscall /* old break syscall holder */
429 .quad sys_stat
430 .quad sys32_lseek
431 .quad sys_getpid /* 20 */
432 .quad compat_sys_mount /* mount */
433 .quad sys_oldumount /* old_umount */
434 .quad sys_setuid16
435 .quad sys_getuid16
436 .quad compat_sys_stime /* stime */ /* 25 */
437 .quad sys32_ptrace /* ptrace */
438 .quad sys_alarm
439 .quad sys_fstat /* (old)fstat */
440 .quad sys_pause
441 .quad compat_sys_utime /* 30 */
442 .quad quiet_ni_syscall /* old stty syscall holder */
443 .quad quiet_ni_syscall /* old gtty syscall holder */
444 .quad sys_access
445 .quad sys_nice
446 .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
447 .quad sys_sync
448 .quad sys32_kill
449 .quad sys_rename
450 .quad sys_mkdir
451 .quad sys_rmdir /* 40 */
452 .quad sys_dup
453 .quad sys32_pipe
454 .quad compat_sys_times
455 .quad quiet_ni_syscall /* old prof syscall holder */
456 .quad sys_brk /* 45 */
457 .quad sys_setgid16
458 .quad sys_getgid16
459 .quad sys_signal
460 .quad sys_geteuid16
461 .quad sys_getegid16 /* 50 */
462 .quad sys_acct
463 .quad sys_umount /* new_umount */
464 .quad quiet_ni_syscall /* old lock syscall holder */
465 .quad compat_sys_ioctl
466 .quad compat_sys_fcntl64 /* 55 */
467 .quad quiet_ni_syscall /* old mpx syscall holder */
468 .quad sys_setpgid
469 .quad quiet_ni_syscall /* old ulimit syscall holder */
470 .quad sys32_olduname
471 .quad sys_umask /* 60 */
472 .quad sys_chroot
473 .quad sys32_ustat
474 .quad sys_dup2
475 .quad sys_getppid
476 .quad sys_getpgrp /* 65 */
477 .quad sys_setsid
478 .quad sys32_sigaction
479 .quad sys_sgetmask
480 .quad sys_ssetmask
481 .quad sys_setreuid16 /* 70 */
482 .quad sys_setregid16
483 .quad stub32_sigsuspend
484 .quad compat_sys_sigpending
485 .quad sys_sethostname
486 .quad compat_sys_setrlimit /* 75 */
487 .quad compat_sys_old_getrlimit /* old_getrlimit */
488 .quad compat_sys_getrusage
489 .quad sys32_gettimeofday
490 .quad sys32_settimeofday
491 .quad sys_getgroups16 /* 80 */
492 .quad sys_setgroups16
493 .quad sys32_old_select
494 .quad sys_symlink
495 .quad sys_lstat
496 .quad sys_readlink /* 85 */
497 .quad sys_uselib
498 .quad sys_swapon
499 .quad sys_reboot
500 .quad compat_sys_old_readdir
501 .quad sys32_mmap /* 90 */
502 .quad sys_munmap
503 .quad sys_truncate
504 .quad sys_ftruncate
505 .quad sys_fchmod
506 .quad sys_fchown16 /* 95 */
507 .quad sys_getpriority
508 .quad sys_setpriority
509 .quad quiet_ni_syscall /* old profil syscall holder */
510 .quad compat_sys_statfs
511 .quad compat_sys_fstatfs /* 100 */
512 .quad sys_ioperm
513 .quad compat_sys_socketcall
514 .quad sys_syslog
515 .quad compat_sys_setitimer
516 .quad compat_sys_getitimer /* 105 */
517 .quad compat_sys_newstat
518 .quad compat_sys_newlstat
519 .quad compat_sys_newfstat
520 .quad sys32_uname
521 .quad stub32_iopl /* 110 */
522 .quad sys_vhangup
523 .quad quiet_ni_syscall /* old "idle" system call */
524 .quad sys32_vm86_warning /* vm86old */
525 .quad compat_sys_wait4
526 .quad sys_swapoff /* 115 */
527 .quad compat_sys_sysinfo
528 .quad sys32_ipc
529 .quad sys_fsync
530 .quad stub32_sigreturn
531 .quad stub32_clone /* 120 */
532 .quad sys_setdomainname
533 .quad sys_uname
534 .quad sys_modify_ldt
535 .quad compat_sys_adjtimex
536 .quad sys32_mprotect /* 125 */
537 .quad compat_sys_sigprocmask
538 .quad quiet_ni_syscall /* create_module */
539 .quad sys_init_module
540 .quad sys_delete_module
541 .quad quiet_ni_syscall /* 130 get_kernel_syms */
542 .quad sys32_quotactl
543 .quad sys_getpgid
544 .quad sys_fchdir
545 .quad quiet_ni_syscall /* bdflush */
546 .quad sys_sysfs /* 135 */
547 .quad sys_personality
548 .quad quiet_ni_syscall /* for afs_syscall */
549 .quad sys_setfsuid16
550 .quad sys_setfsgid16
551 .quad sys_llseek /* 140 */
552 .quad compat_sys_getdents
553 .quad compat_sys_select
554 .quad sys_flock
555 .quad sys_msync
556 .quad compat_sys_readv /* 145 */
557 .quad compat_sys_writev
558 .quad sys_getsid
559 .quad sys_fdatasync
560 .quad sys32_sysctl /* sysctl */
561 .quad sys_mlock /* 150 */
562 .quad sys_munlock
563 .quad sys_mlockall
564 .quad sys_munlockall
565 .quad sys_sched_setparam
566 .quad sys_sched_getparam /* 155 */
567 .quad sys_sched_setscheduler
568 .quad sys_sched_getscheduler
569 .quad sys_sched_yield
570 .quad sys_sched_get_priority_max
571 .quad sys_sched_get_priority_min /* 160 */
572 .quad sys32_sched_rr_get_interval
573 .quad compat_sys_nanosleep
574 .quad sys_mremap
575 .quad sys_setresuid16
576 .quad sys_getresuid16 /* 165 */
577 .quad sys32_vm86_warning /* vm86 */
578 .quad quiet_ni_syscall /* query_module */
579 .quad sys_poll
580 .quad compat_sys_nfsservctl
581 .quad sys_setresgid16 /* 170 */
582 .quad sys_getresgid16
583 .quad sys_prctl
584 .quad stub32_rt_sigreturn
585 .quad sys32_rt_sigaction
586 .quad sys32_rt_sigprocmask /* 175 */
587 .quad sys32_rt_sigpending
588 .quad compat_sys_rt_sigtimedwait
589 .quad sys32_rt_sigqueueinfo
590 .quad stub32_rt_sigsuspend
591 .quad sys32_pread /* 180 */
592 .quad sys32_pwrite
593 .quad sys_chown16
594 .quad sys_getcwd
595 .quad sys_capget
596 .quad sys_capset
597 .quad stub32_sigaltstack
598 .quad sys32_sendfile
599 .quad quiet_ni_syscall /* streams1 */
600 .quad quiet_ni_syscall /* streams2 */
601 .quad stub32_vfork /* 190 */
602 .quad compat_sys_getrlimit
603 .quad sys32_mmap2
604 .quad sys32_truncate64
605 .quad sys32_ftruncate64
606 .quad sys32_stat64 /* 195 */
607 .quad sys32_lstat64
608 .quad sys32_fstat64
609 .quad sys_lchown
610 .quad sys_getuid
611 .quad sys_getgid /* 200 */
612 .quad sys_geteuid
613 .quad sys_getegid
614 .quad sys_setreuid
615 .quad sys_setregid
616 .quad sys_getgroups /* 205 */
617 .quad sys_setgroups
618 .quad sys_fchown
619 .quad sys_setresuid
620 .quad sys_getresuid
621 .quad sys_setresgid /* 210 */
622 .quad sys_getresgid
623 .quad sys_chown
624 .quad sys_setuid
625 .quad sys_setgid
626 .quad sys_setfsuid /* 215 */
627 .quad sys_setfsgid
628 .quad sys_pivot_root
629 .quad sys_mincore
630 .quad sys_madvise
631 .quad compat_sys_getdents64 /* 220 getdents64 */
632 .quad compat_sys_fcntl64
633 .quad quiet_ni_syscall /* tux */
634 .quad quiet_ni_syscall /* security */
635 .quad sys_gettid
636 .quad sys32_readahead /* 225 */
637 .quad sys_setxattr
638 .quad sys_lsetxattr
639 .quad sys_fsetxattr
640 .quad sys_getxattr
641 .quad sys_lgetxattr /* 230 */
642 .quad sys_fgetxattr
643 .quad sys_listxattr
644 .quad sys_llistxattr
645 .quad sys_flistxattr
646 .quad sys_removexattr /* 235 */
647 .quad sys_lremovexattr
648 .quad sys_fremovexattr
649 .quad sys_tkill
650 .quad sys_sendfile64
651 .quad compat_sys_futex /* 240 */
652 .quad compat_sys_sched_setaffinity
653 .quad compat_sys_sched_getaffinity
654 .quad sys32_set_thread_area
655 .quad sys32_get_thread_area
656 .quad compat_sys_io_setup /* 245 */
657 .quad sys_io_destroy
658 .quad compat_sys_io_getevents
659 .quad compat_sys_io_submit
660 .quad sys_io_cancel
661 .quad sys32_fadvise64 /* 250 */
662 .quad quiet_ni_syscall /* free_huge_pages */
663 .quad sys_exit_group
664 .quad sys32_lookup_dcookie
665 .quad sys_epoll_create
666 .quad sys_epoll_ctl /* 255 */
667 .quad sys_epoll_wait
668 .quad sys_remap_file_pages
669 .quad sys_set_tid_address
670 .quad compat_sys_timer_create
671 .quad compat_sys_timer_settime /* 260 */
672 .quad compat_sys_timer_gettime
673 .quad sys_timer_getoverrun
674 .quad sys_timer_delete
675 .quad compat_sys_clock_settime
676 .quad compat_sys_clock_gettime /* 265 */
677 .quad compat_sys_clock_getres
678 .quad compat_sys_clock_nanosleep
679 .quad compat_sys_statfs64
680 .quad compat_sys_fstatfs64
681 .quad sys_tgkill /* 270 */
682 .quad compat_sys_utimes
683 .quad sys32_fadvise64_64
684 .quad quiet_ni_syscall /* sys_vserver */
685 .quad sys_mbind
686 .quad compat_sys_get_mempolicy /* 275 */
687 .quad sys_set_mempolicy
688 .quad compat_sys_mq_open
689 .quad sys_mq_unlink
690 .quad compat_sys_mq_timedsend
691 .quad compat_sys_mq_timedreceive /* 280 */
692 .quad compat_sys_mq_notify
693 .quad compat_sys_mq_getsetattr
694 .quad compat_sys_kexec_load /* reserved for kexec */
695 .quad compat_sys_waitid
696 .quad quiet_ni_syscall /* 285: sys_altroot */
697 .quad sys_add_key
698 .quad sys_request_key
699 .quad sys_keyctl
700 .quad sys_ioprio_set
701 .quad sys_ioprio_get /* 290 */
702 .quad sys_inotify_init
703 .quad sys_inotify_add_watch
704 .quad sys_inotify_rm_watch
705 .quad sys_migrate_pages
706 .quad compat_sys_openat /* 295 */
707 .quad sys_mkdirat
708 .quad sys_mknodat
709 .quad sys_fchownat
710 .quad compat_sys_futimesat
711 .quad sys32_fstatat /* 300 */
712 .quad sys_unlinkat
713 .quad sys_renameat
714 .quad sys_linkat
715 .quad sys_symlinkat
716 .quad sys_readlinkat /* 305 */
717 .quad sys_fchmodat
718 .quad sys_faccessat
719 .quad compat_sys_pselect6
720 .quad compat_sys_ppoll
721 .quad sys_unshare /* 310 */
722 .quad compat_sys_set_robust_list
723 .quad compat_sys_get_robust_list
724 .quad sys_splice
725 .quad sys32_sync_file_range
726 .quad sys_tee /* 315 */
727 .quad compat_sys_vmsplice
728 .quad compat_sys_move_pages
729 .quad sys_getcpu
730 .quad sys_epoll_pwait
731 .quad compat_sys_utimensat /* 320 */
732 .quad compat_sys_signalfd
733 .quad compat_sys_timerfd
734 .quad sys_eventfd
735 .quad sys32_fallocate
736ia32_syscall_end:
diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c
new file mode 100644
index 000000000000..2e1869ec4db4
--- /dev/null
+++ b/arch/x86/ia32/ipc32.c
@@ -0,0 +1,57 @@
1#include <linux/kernel.h>
2#include <linux/spinlock.h>
3#include <linux/list.h>
4#include <linux/syscalls.h>
5#include <linux/time.h>
6#include <linux/sem.h>
7#include <linux/msg.h>
8#include <linux/shm.h>
9#include <linux/ipc.h>
10#include <linux/compat.h>
11
12#include <asm/ipc.h>
13
14asmlinkage long
15sys32_ipc(u32 call, int first, int second, int third,
16 compat_uptr_t ptr, u32 fifth)
17{
18 int version;
19
20 version = call >> 16; /* hack for backward compatibility */
21 call &= 0xffff;
22
23 switch (call) {
24 case SEMOP:
25 /* struct sembuf is the same on 32 and 64bit :)) */
26 return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
27 case SEMTIMEDOP:
28 return compat_sys_semtimedop(first, compat_ptr(ptr), second,
29 compat_ptr(fifth));
30 case SEMGET:
31 return sys_semget(first, second, third);
32 case SEMCTL:
33 return compat_sys_semctl(first, second, third, compat_ptr(ptr));
34
35 case MSGSND:
36 return compat_sys_msgsnd(first, second, third, compat_ptr(ptr));
37 case MSGRCV:
38 return compat_sys_msgrcv(first, second, fifth, third,
39 version, compat_ptr(ptr));
40 case MSGGET:
41 return sys_msgget((key_t) first, second);
42 case MSGCTL:
43 return compat_sys_msgctl(first, second, compat_ptr(ptr));
44
45 case SHMAT:
46 return compat_sys_shmat(first, second, third, version,
47 compat_ptr(ptr));
49 case SHMDT:
50 return sys_shmdt(compat_ptr(ptr));
51 case SHMGET:
52 return sys_shmget(first, (unsigned)second, third);
53 case SHMCTL:
54 return compat_sys_shmctl(first, second, compat_ptr(ptr));
55 }
56 return -ENOSYS;
57}
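sys32_ipc() above demultiplexes the single i386 ipc(2) system call: the operation is in the low 16 bits of the first argument and the IPC version in the high 16 bits. A hedged 32-bit userspace sketch driving the SEMGET path (SYS_ipc exists only in the 32-bit ABI; the SEMGET value of 2 mirrors include/linux/ipc.h):

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SEMGET
#define SEMGET 2	/* matches include/linux/ipc.h */
#endif

int main(void)
{
	/* ipc(SEMGET, key, nsems, semflg, NULL): create a private
	 * semaphore set with one semaphore. */
	long id = syscall(SYS_ipc, SEMGET, IPC_PRIVATE, 1, 0600 | IPC_CREAT, NULL);

	if (id < 0) {
		perror("ipc(SEMGET)");
		return 1;
	}
	printf("semaphore set id = %ld\n", id);
	return 0;
}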
diff --git a/arch/x86/ia32/mmap32.c b/arch/x86/ia32/mmap32.c
new file mode 100644
index 000000000000..e4b84b4a417a
--- /dev/null
+++ b/arch/x86/ia32/mmap32.c
@@ -0,0 +1,79 @@
1/*
2 * linux/arch/x86_64/ia32/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Based on the i386 version which was
7 *
8 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
9 * All Rights Reserved.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 *
25 *
26 * Started by Ingo Molnar <mingo@elte.hu>
27 */
28
29#include <linux/personality.h>
30#include <linux/mm.h>
31#include <linux/random.h>
32#include <linux/sched.h>
33
34/*
35 * Top of mmap area (just below the process stack).
36 *
 37 * Leave at least a ~128 MB hole.
38 */
39#define MIN_GAP (128*1024*1024)
40#define MAX_GAP (TASK_SIZE/6*5)
41
42static inline unsigned long mmap_base(struct mm_struct *mm)
43{
44 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
45 unsigned long random_factor = 0;
46
47 if (current->flags & PF_RANDOMIZE)
48 random_factor = get_random_int() % (1024*1024);
49
50 if (gap < MIN_GAP)
51 gap = MIN_GAP;
52 else if (gap > MAX_GAP)
53 gap = MAX_GAP;
54
55 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
56}
57
58/*
59 * This function, called very early during the creation of a new
60 * process VM image, sets up which VM layout function to use:
61 */
62void ia32_pick_mmap_layout(struct mm_struct *mm)
63{
64 /*
65 * Fall back to the standard layout if the personality
66 * bit is set, or if the expected stack growth is unlimited:
67 */
68 if (sysctl_legacy_va_layout ||
69 (current->personality & ADDR_COMPAT_LAYOUT) ||
70 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
71 mm->mmap_base = TASK_UNMAPPED_BASE;
72 mm->get_unmapped_area = arch_get_unmapped_area;
73 mm->unmap_area = arch_unmap_area;
74 } else {
75 mm->mmap_base = mmap_base(mm);
76 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
77 mm->unmap_area = arch_unmap_area_topdown;
78 }
79}
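A worked example of mmap_base() above, assuming for illustration a 3 GB compat TASK_SIZE of 0xc0000000, an 8 MB RLIMIT_STACK and randomization disabled: the 8 MB gap is clamped up to MIN_GAP (128 MB), so the flexible mmap area tops out at 0xb8000000. The same arithmetic as a standalone sketch:

#include <stdio.h>

#define MIN_GAP		(128UL * 1024 * 1024)
#define TASK_SIZE_3GB	0xc0000000UL		/* illustrative compat TASK_SIZE */
#define PAGE_ALIGN(x)	(((x) + 4095UL) & ~4095UL)

int main(void)
{
	unsigned long gap = 8UL * 1024 * 1024;	/* typical 8 MB stack rlimit */
	unsigned long max_gap = TASK_SIZE_3GB / 6 * 5;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > max_gap)
		gap = max_gap;

	/* Prints 0xb8000000: top of the flexible mmap area for this setup. */
	printf("mmap_base = %#lx\n", PAGE_ALIGN(TASK_SIZE_3GB - gap - 0));
	return 0;
}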
diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c
new file mode 100644
index 000000000000..4a233ad6269c
--- /dev/null
+++ b/arch/x86/ia32/ptrace32.c
@@ -0,0 +1,404 @@
1/*
2 * 32bit ptrace for x86-64.
3 *
4 * Copyright 2001,2002 Andi Kleen, SuSE Labs.
5 * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier
6 * copyright.
7 *
  8 * This allows access to 64bit processes too; but there is no way to see the extended
9 * register contents.
10 */
11
12#include <linux/kernel.h>
13#include <linux/stddef.h>
14#include <linux/sched.h>
15#include <linux/syscalls.h>
16#include <linux/unistd.h>
17#include <linux/mm.h>
18#include <linux/err.h>
19#include <linux/ptrace.h>
20#include <asm/ptrace.h>
21#include <asm/compat.h>
22#include <asm/uaccess.h>
23#include <asm/user32.h>
24#include <asm/user.h>
25#include <asm/errno.h>
26#include <asm/debugreg.h>
27#include <asm/i387.h>
28#include <asm/fpu32.h>
29#include <asm/ia32.h>
30
31/*
32 * Determines which flags the user has access to [1 = access, 0 = no access].
33 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
34 * Also masks reserved bits (31-22, 15, 5, 3, 1).
35 */
36#define FLAG_MASK 0x54dd5UL
37
38#define R32(l,q) \
39 case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break
40
41static int putreg32(struct task_struct *child, unsigned regno, u32 val)
42{
43 int i;
44 __u64 *stack = (__u64 *)task_pt_regs(child);
45
46 switch (regno) {
47 case offsetof(struct user32, regs.fs):
48 if (val && (val & 3) != 3) return -EIO;
49 child->thread.fsindex = val & 0xffff;
50 break;
51 case offsetof(struct user32, regs.gs):
52 if (val && (val & 3) != 3) return -EIO;
53 child->thread.gsindex = val & 0xffff;
54 break;
55 case offsetof(struct user32, regs.ds):
56 if (val && (val & 3) != 3) return -EIO;
57 child->thread.ds = val & 0xffff;
58 break;
59 case offsetof(struct user32, regs.es):
60 child->thread.es = val & 0xffff;
61 break;
62 case offsetof(struct user32, regs.ss):
63 if ((val & 3) != 3) return -EIO;
64 stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff;
65 break;
66 case offsetof(struct user32, regs.cs):
67 if ((val & 3) != 3) return -EIO;
68 stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff;
69 break;
70
71 R32(ebx, rbx);
72 R32(ecx, rcx);
73 R32(edx, rdx);
74 R32(edi, rdi);
75 R32(esi, rsi);
76 R32(ebp, rbp);
77 R32(eax, rax);
78 R32(orig_eax, orig_rax);
79 R32(eip, rip);
80 R32(esp, rsp);
81
82 case offsetof(struct user32, regs.eflags): {
83 __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8];
84 val &= FLAG_MASK;
85 *flags = val | (*flags & ~FLAG_MASK);
86 break;
87 }
88
89 case offsetof(struct user32, u_debugreg[4]):
90 case offsetof(struct user32, u_debugreg[5]):
91 return -EIO;
92
93 case offsetof(struct user32, u_debugreg[0]):
94 child->thread.debugreg0 = val;
95 break;
96
97 case offsetof(struct user32, u_debugreg[1]):
98 child->thread.debugreg1 = val;
99 break;
100
101 case offsetof(struct user32, u_debugreg[2]):
102 child->thread.debugreg2 = val;
103 break;
104
105 case offsetof(struct user32, u_debugreg[3]):
106 child->thread.debugreg3 = val;
107 break;
108
109 case offsetof(struct user32, u_debugreg[6]):
110 child->thread.debugreg6 = val;
111 break;
112
113 case offsetof(struct user32, u_debugreg[7]):
114 val &= ~DR_CONTROL_RESERVED;
115 /* See arch/i386/kernel/ptrace.c for an explanation of
116 * this awkward check.*/
117 for(i=0; i<4; i++)
118 if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
119 return -EIO;
120 child->thread.debugreg7 = val;
121 if (val)
122 set_tsk_thread_flag(child, TIF_DEBUG);
123 else
124 clear_tsk_thread_flag(child, TIF_DEBUG);
125 break;
126
127 default:
128 if (regno > sizeof(struct user32) || (regno & 3))
129 return -EIO;
130
131 /* Other dummy fields in the virtual user structure are ignored */
132 break;
133 }
134 return 0;
135}
136
137#undef R32
138
139#define R32(l,q) \
140 case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break
141
142static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
143{
144 __u64 *stack = (__u64 *)task_pt_regs(child);
145
146 switch (regno) {
147 case offsetof(struct user32, regs.fs):
148 *val = child->thread.fsindex;
149 break;
150 case offsetof(struct user32, regs.gs):
151 *val = child->thread.gsindex;
152 break;
153 case offsetof(struct user32, regs.ds):
154 *val = child->thread.ds;
155 break;
156 case offsetof(struct user32, regs.es):
157 *val = child->thread.es;
158 break;
159
160 R32(cs, cs);
161 R32(ss, ss);
162 R32(ebx, rbx);
163 R32(ecx, rcx);
164 R32(edx, rdx);
165 R32(edi, rdi);
166 R32(esi, rsi);
167 R32(ebp, rbp);
168 R32(eax, rax);
169 R32(orig_eax, orig_rax);
170 R32(eip, rip);
171 R32(eflags, eflags);
172 R32(esp, rsp);
173
174 case offsetof(struct user32, u_debugreg[0]):
175 *val = child->thread.debugreg0;
176 break;
177 case offsetof(struct user32, u_debugreg[1]):
178 *val = child->thread.debugreg1;
179 break;
180 case offsetof(struct user32, u_debugreg[2]):
181 *val = child->thread.debugreg2;
182 break;
183 case offsetof(struct user32, u_debugreg[3]):
184 *val = child->thread.debugreg3;
185 break;
186 case offsetof(struct user32, u_debugreg[6]):
187 *val = child->thread.debugreg6;
188 break;
189 case offsetof(struct user32, u_debugreg[7]):
190 *val = child->thread.debugreg7;
191 break;
192
193 default:
194 if (regno > sizeof(struct user32) || (regno & 3))
195 return -EIO;
196
197 /* Other dummy fields in the virtual user structure are ignored */
198 *val = 0;
199 break;
200 }
201 return 0;
202}
203
204#undef R32
205
206static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
207{
208 int ret;
209 compat_siginfo_t __user *si32 = compat_ptr(data);
210 siginfo_t ssi;
211 siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
212 if (request == PTRACE_SETSIGINFO) {
213 memset(&ssi, 0, sizeof(siginfo_t));
214 ret = copy_siginfo_from_user32(&ssi, si32);
215 if (ret)
216 return ret;
217 if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
218 return -EFAULT;
219 }
220 ret = sys_ptrace(request, pid, addr, (unsigned long)si);
221 if (ret)
222 return ret;
223 if (request == PTRACE_GETSIGINFO) {
224 if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
225 return -EFAULT;
226 ret = copy_siginfo_to_user32(si32, &ssi);
227 }
228 return ret;
229}
230
231asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
232{
233 struct task_struct *child;
234 struct pt_regs *childregs;
235 void __user *datap = compat_ptr(data);
236 int ret;
237 __u32 val;
238
239 switch (request) {
240 case PTRACE_TRACEME:
241 case PTRACE_ATTACH:
242 case PTRACE_KILL:
243 case PTRACE_CONT:
244 case PTRACE_SINGLESTEP:
245 case PTRACE_DETACH:
246 case PTRACE_SYSCALL:
247 case PTRACE_OLDSETOPTIONS:
248 case PTRACE_SETOPTIONS:
249 case PTRACE_SET_THREAD_AREA:
250 case PTRACE_GET_THREAD_AREA:
251 return sys_ptrace(request, pid, addr, data);
252
253 default:
254 return -EINVAL;
255
256 case PTRACE_PEEKTEXT:
257 case PTRACE_PEEKDATA:
258 case PTRACE_POKEDATA:
259 case PTRACE_POKETEXT:
260 case PTRACE_POKEUSR:
261 case PTRACE_PEEKUSR:
262 case PTRACE_GETREGS:
263 case PTRACE_SETREGS:
264 case PTRACE_SETFPREGS:
265 case PTRACE_GETFPREGS:
266 case PTRACE_SETFPXREGS:
267 case PTRACE_GETFPXREGS:
268 case PTRACE_GETEVENTMSG:
269 break;
270
271 case PTRACE_SETSIGINFO:
272 case PTRACE_GETSIGINFO:
273 return ptrace32_siginfo(request, pid, addr, data);
274 }
275
276 child = ptrace_get_task_struct(pid);
277 if (IS_ERR(child))
278 return PTR_ERR(child);
279
280 ret = ptrace_check_attach(child, request == PTRACE_KILL);
281 if (ret < 0)
282 goto out;
283
284 childregs = task_pt_regs(child);
285
286 switch (request) {
287 case PTRACE_PEEKDATA:
288 case PTRACE_PEEKTEXT:
289 ret = 0;
290 if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32))
291 ret = -EIO;
292 else
293 ret = put_user(val, (unsigned int __user *)datap);
294 break;
295
296 case PTRACE_POKEDATA:
297 case PTRACE_POKETEXT:
298 ret = 0;
299 if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32))
300 ret = -EIO;
301 break;
302
303 case PTRACE_PEEKUSR:
304 ret = getreg32(child, addr, &val);
305 if (ret == 0)
306 ret = put_user(val, (__u32 __user *)datap);
307 break;
308
309 case PTRACE_POKEUSR:
310 ret = putreg32(child, addr, data);
311 break;
312
313 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
314 int i;
315 if (!access_ok(VERIFY_WRITE, datap, 16*4)) {
316 ret = -EIO;
317 break;
318 }
319 ret = 0;
320 for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) {
321 getreg32(child, i, &val);
322 ret |= __put_user(val,(u32 __user *)datap);
323 datap += sizeof(u32);
324 }
325 break;
326 }
327
328 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
329 unsigned long tmp;
330 int i;
331 if (!access_ok(VERIFY_READ, datap, 16*4)) {
332 ret = -EIO;
333 break;
334 }
335 ret = 0;
336 for ( i = 0; i <= 16*4; i += sizeof(u32) ) {
337 ret |= __get_user(tmp, (u32 __user *)datap);
338 putreg32(child, i, tmp);
339 datap += sizeof(u32);
340 }
341 break;
342 }
343
344 case PTRACE_GETFPREGS:
345 ret = -EIO;
346 if (!access_ok(VERIFY_READ, compat_ptr(data),
347 sizeof(struct user_i387_struct)))
348 break;
349 save_i387_ia32(child, datap, childregs, 1);
350 ret = 0;
351 break;
352
353 case PTRACE_SETFPREGS:
354 ret = -EIO;
355 if (!access_ok(VERIFY_WRITE, datap,
356 sizeof(struct user_i387_struct)))
357 break;
358 ret = 0;
359 /* don't check EFAULT to be bug-to-bug compatible to i386 */
360 restore_i387_ia32(child, datap, 1);
361 break;
362
363 case PTRACE_GETFPXREGS: {
364 struct user32_fxsr_struct __user *u = datap;
365 init_fpu(child);
366 ret = -EIO;
367 if (!access_ok(VERIFY_WRITE, u, sizeof(*u)))
368 break;
369 ret = -EFAULT;
370 if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u)))
371 break;
372 ret = __put_user(childregs->cs, &u->fcs);
373 ret |= __put_user(child->thread.ds, &u->fos);
374 break;
375 }
376 case PTRACE_SETFPXREGS: {
377 struct user32_fxsr_struct __user *u = datap;
378 unlazy_fpu(child);
379 ret = -EIO;
380 if (!access_ok(VERIFY_READ, u, sizeof(*u)))
381 break;
382 /* no checking to be bug-to-bug compatible with i386. */
383 /* but silence warning */
384 if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)))
385 ;
386 set_stopped_child_used_math(child);
387 child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
388 ret = 0;
389 break;
390 }
391
392 case PTRACE_GETEVENTMSG:
393 ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data));
394 break;
395
396 default:
397 BUG();
398 }
399
400 out:
401 put_task_struct(child);
402 return ret;
403}
404
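For context, a minimal 32-bit userspace sketch of the request path these handlers serve; this is illustrative only (it assumes an already-attached, stopped child and must be built as a 32-bit binary so the user_regs_struct offsets line up with struct user32):

#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

/* A PTRACE_PEEKUSER request from a 32-bit tracer ends up in getreg32() above. */
long read_child_eip(pid_t pid)
{
	return ptrace(PTRACE_PEEKUSER, pid,
		      (void *)offsetof(struct user_regs_struct, eip), NULL);
}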
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
new file mode 100644
index 000000000000..bee96d614432
--- /dev/null
+++ b/arch/x86/ia32/sys_ia32.c
@@ -0,0 +1,889 @@
1/*
2 * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
3 * sys_sparc32
4 *
5 * Copyright (C) 2000 VA Linux Co
6 * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
7 * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
8 * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
9 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
10 * Copyright (C) 2000 Hewlett-Packard Co.
11 * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
12 * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port)
13 *
14 * These routines maintain argument size conversion between 32bit and 64bit
15 * environment. In 2.5 most of this should be moved to a generic directory.
16 *
17 * This file assumes that there is a hole at the end of user address space.
18 *
19 * Some of the functions are LE specific currently. These are hopefully all marked.
20 * This should be fixed.
21 */
22
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/fs.h>
26#include <linux/file.h>
27#include <linux/signal.h>
28#include <linux/syscalls.h>
29#include <linux/resource.h>
30#include <linux/times.h>
31#include <linux/utsname.h>
32#include <linux/smp.h>
33#include <linux/smp_lock.h>
34#include <linux/sem.h>
35#include <linux/msg.h>
36#include <linux/mm.h>
37#include <linux/shm.h>
38#include <linux/slab.h>
39#include <linux/uio.h>
40#include <linux/nfs_fs.h>
41#include <linux/quota.h>
42#include <linux/module.h>
43#include <linux/sunrpc/svc.h>
44#include <linux/nfsd/nfsd.h>
45#include <linux/nfsd/cache.h>
46#include <linux/nfsd/xdr.h>
47#include <linux/nfsd/syscall.h>
48#include <linux/poll.h>
49#include <linux/personality.h>
50#include <linux/stat.h>
51#include <linux/ipc.h>
52#include <linux/rwsem.h>
53#include <linux/binfmts.h>
54#include <linux/init.h>
55#include <linux/aio_abi.h>
56#include <linux/aio.h>
57#include <linux/compat.h>
58#include <linux/vfs.h>
59#include <linux/ptrace.h>
60#include <linux/highuid.h>
61#include <linux/vmalloc.h>
62#include <linux/fsnotify.h>
63#include <linux/sysctl.h>
64#include <asm/mman.h>
65#include <asm/types.h>
66#include <asm/uaccess.h>
67#include <asm/semaphore.h>
68#include <asm/atomic.h>
69#include <asm/ldt.h>
70
71#include <net/scm.h>
72#include <net/sock.h>
73#include <asm/ia32.h>
74
75#define AA(__x) ((unsigned long)(__x))
76
77int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
78{
79 compat_ino_t ino;
80
81 typeof(ubuf->st_uid) uid = 0;
82 typeof(ubuf->st_gid) gid = 0;
83 SET_UID(uid, kbuf->uid);
84 SET_GID(gid, kbuf->gid);
85 if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev))
86 return -EOVERFLOW;
87 if (kbuf->size >= 0x7fffffff)
88 return -EOVERFLOW;
89 ino = kbuf->ino;
90 if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
91 return -EOVERFLOW;
92 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
93 __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
94 __put_user (ino, &ubuf->st_ino) ||
95 __put_user (kbuf->mode, &ubuf->st_mode) ||
96 __put_user (kbuf->nlink, &ubuf->st_nlink) ||
97 __put_user (uid, &ubuf->st_uid) ||
98 __put_user (gid, &ubuf->st_gid) ||
99 __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
100 __put_user (kbuf->size, &ubuf->st_size) ||
101 __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) ||
102 __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
103 __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
104 __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
105 __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
106 __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
107 __put_user (kbuf->blksize, &ubuf->st_blksize) ||
108 __put_user (kbuf->blocks, &ubuf->st_blocks))
109 return -EFAULT;
110 return 0;
111}
112
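The size check in cp_compat_stat() is what a non-LFS 32-bit program observes as EOVERFLOW; a hedged userspace sketch of that failure mode (path and message are invented):

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>

/* A 32-bit binary using the old stat ABI cannot describe files >= 2 GB. */
int stat_or_complain(const char *path)
{
	struct stat st;

	if (stat(path, &st) < 0 && errno == EOVERFLOW) {
		/* retry via stat64()/LFS, e.g. build with -D_FILE_OFFSET_BITS=64 */
		fprintf(stderr, "%s: size does not fit in 32 bits\n", path);
		return -1;
	}
	return 0;
}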
113asmlinkage long
114sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high)
115{
116 return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low);
117}
118
119asmlinkage long
120sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high)
121{
122 return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
123}
124
125/* Another set for IA32/LFS -- x86_64 struct stat is different due to
126 support for 64bit inode numbers. */
127
128static int
129cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
130{
131 typeof(ubuf->st_uid) uid = 0;
132 typeof(ubuf->st_gid) gid = 0;
133 SET_UID(uid, stat->uid);
134 SET_GID(gid, stat->gid);
135 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
136 __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
137 __put_user (stat->ino, &ubuf->__st_ino) ||
138 __put_user (stat->ino, &ubuf->st_ino) ||
139 __put_user (stat->mode, &ubuf->st_mode) ||
140 __put_user (stat->nlink, &ubuf->st_nlink) ||
141 __put_user (uid, &ubuf->st_uid) ||
142 __put_user (gid, &ubuf->st_gid) ||
143 __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
144 __put_user (stat->size, &ubuf->st_size) ||
145 __put_user (stat->atime.tv_sec, &ubuf->st_atime) ||
146 __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
147 __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) ||
148 __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
149 __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) ||
150 __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
151 __put_user (stat->blksize, &ubuf->st_blksize) ||
152 __put_user (stat->blocks, &ubuf->st_blocks))
153 return -EFAULT;
154 return 0;
155}
156
157asmlinkage long
158sys32_stat64(char __user * filename, struct stat64 __user *statbuf)
159{
160 struct kstat stat;
161 int ret = vfs_stat(filename, &stat);
162 if (!ret)
163 ret = cp_stat64(statbuf, &stat);
164 return ret;
165}
166
167asmlinkage long
168sys32_lstat64(char __user * filename, struct stat64 __user *statbuf)
169{
170 struct kstat stat;
171 int ret = vfs_lstat(filename, &stat);
172 if (!ret)
173 ret = cp_stat64(statbuf, &stat);
174 return ret;
175}
176
177asmlinkage long
178sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
179{
180 struct kstat stat;
181 int ret = vfs_fstat(fd, &stat);
182 if (!ret)
183 ret = cp_stat64(statbuf, &stat);
184 return ret;
185}
186
187asmlinkage long
188sys32_fstatat(unsigned int dfd, char __user *filename,
189 struct stat64 __user* statbuf, int flag)
190{
191 struct kstat stat;
192 int error = -EINVAL;
193
194 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
195 goto out;
196
197 if (flag & AT_SYMLINK_NOFOLLOW)
198 error = vfs_lstat_fd(dfd, filename, &stat);
199 else
200 error = vfs_stat_fd(dfd, filename, &stat);
201
202 if (!error)
203 error = cp_stat64(statbuf, &stat);
204
205out:
206 return error;
207}
208
209/*
210 * Linux/i386 originally could not handle more than 4 system call
211 * parameters, so these system calls used a memory block for
212 * parameter passing.
213 */
214
215struct mmap_arg_struct {
216 unsigned int addr;
217 unsigned int len;
218 unsigned int prot;
219 unsigned int flags;
220 unsigned int fd;
221 unsigned int offset;
222};
223
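Caller's side of this packed-argument convention, as a hedged sketch (syscall number 90 is the historical i386 old_mmap slot; the values are made up and the program must be a 32-bit binary):

#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

struct mmap_arg_struct32 {		/* mirrors struct mmap_arg_struct above */
	unsigned int addr, len, prot, flags, fd, offset;
};

void *old_mmap_anon_page(void)
{
	struct mmap_arg_struct32 a = {
		.len	= 4096,
		.prot	= PROT_READ | PROT_WRITE,
		.flags	= MAP_PRIVATE | MAP_ANONYMOUS,
		.fd	= -1,
	};

	/* one pointer argument instead of six scalars */
	return (void *)syscall(90 /* i386 __NR_mmap (old_mmap) */, &a);
}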
224asmlinkage long
225sys32_mmap(struct mmap_arg_struct __user *arg)
226{
227 struct mmap_arg_struct a;
228 struct file *file = NULL;
229 unsigned long retval;
230 struct mm_struct *mm;
231
232 if (copy_from_user(&a, arg, sizeof(a)))
233 return -EFAULT;
234
235 if (a.offset & ~PAGE_MASK)
236 return -EINVAL;
237
238 if (!(a.flags & MAP_ANONYMOUS)) {
239 file = fget(a.fd);
240 if (!file)
241 return -EBADF;
242 }
243
244 mm = current->mm;
245 down_write(&mm->mmap_sem);
246 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT);
247 if (file)
248 fput(file);
249
250 up_write(&mm->mmap_sem);
251
252 return retval;
253}
254
255asmlinkage long
256sys32_mprotect(unsigned long start, size_t len, unsigned long prot)
257{
258 return sys_mprotect(start,len,prot);
259}
260
261asmlinkage long
262sys32_pipe(int __user *fd)
263{
264 int retval;
265 int fds[2];
266
267 retval = do_pipe(fds);
268 if (retval)
269 goto out;
270 if (copy_to_user(fd, fds, sizeof(fds)))
271 retval = -EFAULT;
272 out:
273 return retval;
274}
275
276asmlinkage long
277sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
278 struct sigaction32 __user *oact, unsigned int sigsetsize)
279{
280 struct k_sigaction new_ka, old_ka;
281 int ret;
282 compat_sigset_t set32;
283
284 /* XXX: Don't preclude handling different sized sigset_t's. */
285 if (sigsetsize != sizeof(compat_sigset_t))
286 return -EINVAL;
287
288 if (act) {
289 compat_uptr_t handler, restorer;
290
291 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
292 __get_user(handler, &act->sa_handler) ||
293 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
294 __get_user(restorer, &act->sa_restorer)||
295 __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)))
296 return -EFAULT;
297 new_ka.sa.sa_handler = compat_ptr(handler);
298 new_ka.sa.sa_restorer = compat_ptr(restorer);
299 /* FIXME: here we rely on _COMPAT_NSIG_WORDS to be >= _NSIG_WORDS << 1 */
300 switch (_NSIG_WORDS) {
301 case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6]
302 | (((long)set32.sig[7]) << 32);
303 case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4]
304 | (((long)set32.sig[5]) << 32);
305 case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2]
306 | (((long)set32.sig[3]) << 32);
307 case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0]
308 | (((long)set32.sig[1]) << 32);
309 }
310 }
311
312 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
313
314 if (!ret && oact) {
315 /* FIXME: here we rely on _COMPAT_NSIG_WORDS to be >= _NSIG_WORDS << 1 */
316 switch (_NSIG_WORDS) {
317 case 4:
318 set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32);
319 set32.sig[6] = old_ka.sa.sa_mask.sig[3];
320 case 3:
321 set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32);
322 set32.sig[4] = old_ka.sa.sa_mask.sig[2];
323 case 2:
324 set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32);
325 set32.sig[2] = old_ka.sa.sa_mask.sig[1];
326 case 1:
327 set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32);
328 set32.sig[0] = old_ka.sa.sa_mask.sig[0];
329 }
330 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
331 __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
332 __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
333 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
334 __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)))
335 return -EFAULT;
336 }
337
338 return ret;
339}
340
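The fall-through switches above repack signal masks between the two ABIs; the same pattern recurs in sys32_rt_sigprocmask() and sys32_rt_sigpending() below. A minimal sketch of both directions for the x86-64 case, where _NSIG_WORDS is 1 and the compat set carries two 32-bit words (helper names are invented):

static inline void sigset_from_compat32(sigset_t *dst, const compat_sigset_t *src)
{
	/* one 64-bit word assembled from two 32-bit compat words */
	dst->sig[0] = src->sig[0] | ((unsigned long)src->sig[1] << 32);
}

static inline void sigset_to_compat32(compat_sigset_t *dst, const sigset_t *src)
{
	dst->sig[0] = (u32)src->sig[0];
	dst->sig[1] = (u32)(src->sig[0] >> 32);
}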
341asmlinkage long
342sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact)
343{
344 struct k_sigaction new_ka, old_ka;
345 int ret;
346
347 if (act) {
348 compat_old_sigset_t mask;
349 compat_uptr_t handler, restorer;
350
351 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
352 __get_user(handler, &act->sa_handler) ||
353 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
354 __get_user(restorer, &act->sa_restorer) ||
355 __get_user(mask, &act->sa_mask))
356 return -EFAULT;
357
358 new_ka.sa.sa_handler = compat_ptr(handler);
359 new_ka.sa.sa_restorer = compat_ptr(restorer);
360
361 siginitset(&new_ka.sa.sa_mask, mask);
362 }
363
364 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
365
366 if (!ret && oact) {
367 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
368 __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
369 __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
370 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
371 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
372 return -EFAULT;
373 }
374
375 return ret;
376}
377
378asmlinkage long
379sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
380 compat_sigset_t __user *oset, unsigned int sigsetsize)
381{
382 sigset_t s;
383 compat_sigset_t s32;
384 int ret;
385 mm_segment_t old_fs = get_fs();
386
387 if (set) {
388 if (copy_from_user (&s32, set, sizeof(compat_sigset_t)))
389 return -EFAULT;
390 switch (_NSIG_WORDS) {
391 case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
392 case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32);
393 case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32);
394 case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
395 }
396 }
397 set_fs (KERNEL_DS);
398 ret = sys_rt_sigprocmask(how,
399 set ? (sigset_t __user *)&s : NULL,
400 oset ? (sigset_t __user *)&s : NULL,
401 sigsetsize);
402 set_fs (old_fs);
403 if (ret) return ret;
404 if (oset) {
405 switch (_NSIG_WORDS) {
406 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
407 case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
408 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
409 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
410 }
411 if (copy_to_user (oset, &s32, sizeof(compat_sigset_t)))
412 return -EFAULT;
413 }
414 return 0;
415}
416
417static inline long
418get_tv32(struct timeval *o, struct compat_timeval __user *i)
419{
420 int err = -EFAULT;
421 if (access_ok(VERIFY_READ, i, sizeof(*i))) {
422 err = __get_user(o->tv_sec, &i->tv_sec);
423 err |= __get_user(o->tv_usec, &i->tv_usec);
424 }
425 return err;
426}
427
428static inline long
429put_tv32(struct compat_timeval __user *o, struct timeval *i)
430{
431 int err = -EFAULT;
432 if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
433 err = __put_user(i->tv_sec, &o->tv_sec);
434 err |= __put_user(i->tv_usec, &o->tv_usec);
435 }
436 return err;
437}
438
439extern unsigned int alarm_setitimer(unsigned int seconds);
440
441asmlinkage long
442sys32_alarm(unsigned int seconds)
443{
444 return alarm_setitimer(seconds);
445}
446
447/* Translations due to time_t size differences, which affect all
448 sorts of things, like timeval and itimerval. */
449
450extern struct timezone sys_tz;
451
452asmlinkage long
453sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
454{
455 if (tv) {
456 struct timeval ktv;
457 do_gettimeofday(&ktv);
458 if (put_tv32(tv, &ktv))
459 return -EFAULT;
460 }
461 if (tz) {
462 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
463 return -EFAULT;
464 }
465 return 0;
466}
467
468asmlinkage long
469sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
470{
471 struct timeval ktv;
472 struct timespec kts;
473 struct timezone ktz;
474
475 if (tv) {
476 if (get_tv32(&ktv, tv))
477 return -EFAULT;
478 kts.tv_sec = ktv.tv_sec;
479 kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC;
480 }
481 if (tz) {
482 if (copy_from_user(&ktz, tz, sizeof(ktz)))
483 return -EFAULT;
484 }
485
486 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
487}
488
489struct sel_arg_struct {
490 unsigned int n;
491 unsigned int inp;
492 unsigned int outp;
493 unsigned int exp;
494 unsigned int tvp;
495};
496
497asmlinkage long
498sys32_old_select(struct sel_arg_struct __user *arg)
499{
500 struct sel_arg_struct a;
501
502 if (copy_from_user(&a, arg, sizeof(a)))
503 return -EFAULT;
504 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
505 compat_ptr(a.exp), compat_ptr(a.tvp));
506}
507
508extern asmlinkage long
509compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options,
510 struct compat_rusage *ru);
511
512asmlinkage long
513sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)
514{
515 return compat_sys_wait4(pid, stat_addr, options, NULL);
516}
517
518/* 32-bit timeval and related flotsam. */
519
520asmlinkage long
521sys32_sysfs(int option, u32 arg1, u32 arg2)
522{
523 return sys_sysfs(option, arg1, arg2);
524}
525
526asmlinkage long
527sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval)
528{
529 struct timespec t;
530 int ret;
531 mm_segment_t old_fs = get_fs ();
532
533 set_fs (KERNEL_DS);
534 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
535 set_fs (old_fs);
536 if (put_compat_timespec(&t, interval))
537 return -EFAULT;
538 return ret;
539}
540
541asmlinkage long
542sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
543{
544 sigset_t s;
545 compat_sigset_t s32;
546 int ret;
547 mm_segment_t old_fs = get_fs();
548
549 set_fs (KERNEL_DS);
550 ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize);
551 set_fs (old_fs);
552 if (!ret) {
553 switch (_NSIG_WORDS) {
554 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
555 case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
556 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
557 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
558 }
559 if (copy_to_user (set, &s32, sizeof(compat_sigset_t)))
560 return -EFAULT;
561 }
562 return ret;
563}
564
565asmlinkage long
566sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
567{
568 siginfo_t info;
569 int ret;
570 mm_segment_t old_fs = get_fs();
571
572 if (copy_siginfo_from_user32(&info, uinfo))
573 return -EFAULT;
574 set_fs (KERNEL_DS);
575 ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info);
576 set_fs (old_fs);
577 return ret;
578}
579
580/* This is here just in case some old ia32 binary calls it. */
581asmlinkage long
582sys32_pause(void)
583{
584 current->state = TASK_INTERRUPTIBLE;
585 schedule();
586 return -ERESTARTNOHAND;
587}
588
589
590#ifdef CONFIG_SYSCTL_SYSCALL
591struct sysctl_ia32 {
592 unsigned int name;
593 int nlen;
594 unsigned int oldval;
595 unsigned int oldlenp;
596 unsigned int newval;
597 unsigned int newlen;
598 unsigned int __unused[4];
599};
600
601
602asmlinkage long
603sys32_sysctl(struct sysctl_ia32 __user *args32)
604{
605 struct sysctl_ia32 a32;
606 mm_segment_t old_fs = get_fs ();
607 void __user *oldvalp, *newvalp;
608 size_t oldlen;
609 int __user *namep;
610 long ret;
611
612 if (copy_from_user(&a32, args32, sizeof (a32)))
613 return -EFAULT;
614
615 /*
616 * We need to pre-validate these because we have to disable address
617 * checking before calling do_sysctl() (because of OLDLEN), but we must
618 * not run the risk of the user specifying bad addresses here. Since
619 * we are dealing with 32 bit addresses, we know that access_ok() will
620 * always succeed, so this is an expensive NOP, but so what...
621 */
622 namep = compat_ptr(a32.name);
623 oldvalp = compat_ptr(a32.oldval);
624 newvalp = compat_ptr(a32.newval);
625
626 if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
627 || !access_ok(VERIFY_WRITE, namep, 0)
628 || !access_ok(VERIFY_WRITE, oldvalp, 0)
629 || !access_ok(VERIFY_WRITE, newvalp, 0))
630 return -EFAULT;
631
632 set_fs(KERNEL_DS);
633 lock_kernel();
634 ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
635 newvalp, (size_t) a32.newlen);
636 unlock_kernel();
637 set_fs(old_fs);
638
639 if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp)))
640 return -EFAULT;
641
642 return ret;
643}
644#endif
645
646/* warning: next two assume little endian */
647asmlinkage long
648sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
649{
650 return sys_pread64(fd, ubuf, count,
651 ((loff_t)AA(poshi) << 32) | AA(poslo));
652}
653
654asmlinkage long
655sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
656{
657 return sys_pwrite64(fd, ubuf, count,
658 ((loff_t)AA(poshi) << 32) | AA(poslo));
659}
660
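sys32_pread()/sys32_pwrite() rebuild a 64-bit offset from the two 32-bit halves the ia32 ABI passes as separate arguments; the caller-side split is just (illustrative values):

/* libc side of the convention (illustrative). */
loff_t pos   = 0x123456789aULL;
u32    poslo = (u32)pos;		/* 0x3456789a, low half  */
u32    poshi = (u32)(pos >> 32);	/* 0x00000012, high half */
/* the kernel wrappers recombine: ((loff_t)AA(poshi) << 32) | AA(poslo) == pos */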
661
662asmlinkage long
663sys32_personality(unsigned long personality)
664{
665 int ret;
666 if (personality(current->personality) == PER_LINUX32 &&
667 personality == PER_LINUX)
668 personality = PER_LINUX32;
669 ret = sys_personality(personality);
670 if (ret == PER_LINUX32)
671 ret = PER_LINUX;
672 return ret;
673}
674
675asmlinkage long
676sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
677{
678 mm_segment_t old_fs = get_fs();
679 int ret;
680 off_t of;
681
682 if (offset && get_user(of, offset))
683 return -EFAULT;
684
685 set_fs(KERNEL_DS);
686 ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL,
687 count);
688 set_fs(old_fs);
689
690 if (offset && put_user(of, offset))
691 return -EFAULT;
692
693 return ret;
694}
695
696asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
697 unsigned long prot, unsigned long flags,
698 unsigned long fd, unsigned long pgoff)
699{
700 struct mm_struct *mm = current->mm;
701 unsigned long error;
702 struct file * file = NULL;
703
704 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
705 if (!(flags & MAP_ANONYMOUS)) {
706 file = fget(fd);
707 if (!file)
708 return -EBADF;
709 }
710
711 down_write(&mm->mmap_sem);
712 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
713 up_write(&mm->mmap_sem);
714
715 if (file)
716 fput(file);
717 return error;
718}
719
720asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
721{
722 int err;
723
724 if (!name)
725 return -EFAULT;
726 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
727 return -EFAULT;
728
729 down_read(&uts_sem);
730
731 err = __copy_to_user(&name->sysname,&utsname()->sysname,
732 __OLD_UTS_LEN);
733 err |= __put_user(0,name->sysname+__OLD_UTS_LEN);
734 err |= __copy_to_user(&name->nodename,&utsname()->nodename,
735 __OLD_UTS_LEN);
736 err |= __put_user(0,name->nodename+__OLD_UTS_LEN);
737 err |= __copy_to_user(&name->release,&utsname()->release,
738 __OLD_UTS_LEN);
739 err |= __put_user(0,name->release+__OLD_UTS_LEN);
740 err |= __copy_to_user(&name->version,&utsname()->version,
741 __OLD_UTS_LEN);
742 err |= __put_user(0,name->version+__OLD_UTS_LEN);
743 {
744 char *arch = "x86_64";
745 if (personality(current->personality) == PER_LINUX32)
746 arch = "i686";
747
748 err |= __copy_to_user(&name->machine, arch, strlen(arch)+1);
749 }
750
751 up_read(&uts_sem);
752
753 err = err ? -EFAULT : 0;
754
755 return err;
756}
757
758long sys32_uname(struct old_utsname __user * name)
759{
760 int err;
761 if (!name)
762 return -EFAULT;
763 down_read(&uts_sem);
764 err = copy_to_user(name, utsname(), sizeof (*name));
765 up_read(&uts_sem);
766 if (personality(current->personality) == PER_LINUX32)
767 err |= copy_to_user(&name->machine, "i686", 5);
768 return err?-EFAULT:0;
769}
770
771long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
772{
773 struct ustat u;
774 mm_segment_t seg;
775 int ret;
776
777 seg = get_fs();
778 set_fs(KERNEL_DS);
779 ret = sys_ustat(dev, (struct ustat __user *)&u);
780 set_fs(seg);
781 if (ret >= 0) {
782 if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) ||
783 __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
784 __put_user((__u32) u.f_tinode, &u32p->f_tinode) ||
785 __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
786 __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
787 ret = -EFAULT;
788 }
789 return ret;
790}
791
792asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
793 compat_uptr_t __user *envp, struct pt_regs *regs)
794{
795 long error;
796 char * filename;
797
798 filename = getname(name);
799 error = PTR_ERR(filename);
800 if (IS_ERR(filename))
801 return error;
802 error = compat_do_execve(filename, argv, envp, regs);
803 if (error == 0) {
804 task_lock(current);
805 current->ptrace &= ~PT_DTRACE;
806 task_unlock(current);
807 }
808 putname(filename);
809 return error;
810}
811
812asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp,
813 struct pt_regs *regs)
814{
815 void __user *parent_tid = (void __user *)regs->rdx;
816 void __user *child_tid = (void __user *)regs->rdi;
817 if (!newsp)
818 newsp = regs->rsp;
819 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
820}
821
822/*
823 * Some system calls need sign-extended arguments. This could be done by a generic wrapper.
824 */
825
826long sys32_lseek (unsigned int fd, int offset, unsigned int whence)
827{
828 return sys_lseek(fd, offset, whence);
829}
830
831long sys32_kill(int pid, int sig)
832{
833 return sys_kill(pid, sig);
834}
835
836long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
837 __u32 len_low, __u32 len_high, int advice)
838{
839 return sys_fadvise64_64(fd,
840 (((u64)offset_high)<<32) | offset_low,
841 (((u64)len_high)<<32) | len_low,
842 advice);
843}
844
845long sys32_vm86_warning(void)
846{
847 struct task_struct *me = current;
848 static char lastcomm[sizeof(me->comm)];
849 if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
850 compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
851 me->comm);
852 strncpy(lastcomm, me->comm, sizeof(lastcomm));
853 }
854 return -ENOSYS;
855}
856
857long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
858 char __user * buf, size_t len)
859{
860 return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
861}
862
863asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count)
864{
865 return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
866}
867
868asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi,
869 unsigned n_low, unsigned n_hi, int flags)
870{
871 return sys_sync_file_range(fd,
872 ((u64)off_hi << 32) | off_low,
873 ((u64)n_hi << 32) | n_low, flags);
874}
875
876asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, size_t len,
877 int advice)
878{
879 return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
880 len, advice);
881}
882
883asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
884 unsigned offset_hi, unsigned len_lo,
885 unsigned len_hi)
886{
887 return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
888 ((u64)len_hi << 32) | len_lo);
889}
diff --git a/arch/x86/ia32/syscall32.c b/arch/x86/ia32/syscall32.c
new file mode 100644
index 000000000000..15013bac181c
--- /dev/null
+++ b/arch/x86/ia32/syscall32.c
@@ -0,0 +1,83 @@
1/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
2
3/* vsyscall handling for 32bit processes. Map a stub page into the process
4 on demand because 32bit code cannot reach the kernel's fixmaps */
5
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/kernel.h>
9#include <linux/gfp.h>
10#include <linux/init.h>
11#include <linux/stringify.h>
12#include <linux/security.h>
13#include <asm/proto.h>
14#include <asm/tlbflush.h>
15#include <asm/ia32_unistd.h>
16#include <asm/vsyscall32.h>
17
18extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
19extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
20extern int sysctl_vsyscall32;
21
22static struct page *syscall32_pages[1];
23static int use_sysenter = -1;
24
25struct linux_binprm;
26
27/* Setup a VMA at program startup for the vsyscall page */
28int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
29{
30 struct mm_struct *mm = current->mm;
31 int ret;
32
33 down_write(&mm->mmap_sem);
34 /*
35 * MAYWRITE to allow gdb to COW and set breakpoints
36 *
37 * Make sure the vDSO gets into every core dump.
38 * Dumping its contents makes post-mortem debugging fully interpretable later
39 * without matching up the same kernel and hardware config to see
40 * what PC values meant.
41 */
42 /* Could randomize here */
43 ret = install_special_mapping(mm, VSYSCALL32_BASE, PAGE_SIZE,
44 VM_READ|VM_EXEC|
45 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
46 VM_ALWAYSDUMP,
47 syscall32_pages);
48 up_write(&mm->mmap_sem);
49 return ret;
50}
51
52static int __init init_syscall32(void)
53{
54 char *syscall32_page = (void *)get_zeroed_page(GFP_KERNEL);
55 if (!syscall32_page)
56 panic("Cannot allocate syscall32 page");
57 syscall32_pages[0] = virt_to_page(syscall32_page);
58 if (use_sysenter > 0) {
59 memcpy(syscall32_page, syscall32_sysenter,
60 syscall32_sysenter_end - syscall32_sysenter);
61 } else {
62 memcpy(syscall32_page, syscall32_syscall,
63 syscall32_syscall_end - syscall32_syscall);
64 }
65 return 0;
66}
67
68__initcall(init_syscall32);
69
70/* May not be __init: called during resume */
71void syscall32_cpu_init(void)
72{
73 if (use_sysenter < 0)
74 use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
75
76 /* Load these always in case some future AMD CPU supports
77 SYSENTER from compat mode too. */
78 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
79 checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
80 checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
81
82 wrmsrl(MSR_CSTAR, ia32_cstar_target);
83}
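On the userspace side, a 32-bit process locates the stub that init_syscall32() installs through the ELF auxiliary vector; a hedged sketch using glibc's getauxval (assumed available):

#include <elf.h>
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	/* entry of __kernel_vsyscall inside the page mapped at VSYSCALL32_BASE */
	unsigned long entry = getauxval(AT_SYSINFO);

	printf("vsyscall entry: %#lx\n", entry);
	return 0;
}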
diff --git a/arch/x86/ia32/syscall32_syscall.S b/arch/x86/ia32/syscall32_syscall.S
new file mode 100644
index 000000000000..933f0f08b1cf
--- /dev/null
+++ b/arch/x86/ia32/syscall32_syscall.S
@@ -0,0 +1,17 @@
1/* 32bit VDSOs mapped into user space. */
2
3 .section ".init.data","aw"
4
5 .globl syscall32_syscall
6 .globl syscall32_syscall_end
7
8syscall32_syscall:
9 .incbin "arch/x86/ia32/vsyscall-syscall.so"
10syscall32_syscall_end:
11
12 .globl syscall32_sysenter
13 .globl syscall32_sysenter_end
14
15syscall32_sysenter:
16 .incbin "arch/x86/ia32/vsyscall-sysenter.so"
17syscall32_sysenter_end:
diff --git a/arch/x86/ia32/tls32.c b/arch/x86/ia32/tls32.c
new file mode 100644
index 000000000000..1cc4340de3ca
--- /dev/null
+++ b/arch/x86/ia32/tls32.c
@@ -0,0 +1,163 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/sched.h>
4#include <linux/user.h>
5
6#include <asm/uaccess.h>
7#include <asm/desc.h>
8#include <asm/system.h>
9#include <asm/ldt.h>
10#include <asm/processor.h>
11#include <asm/proto.h>
12
13/*
14 * get_free_idx: find a yet-unused TLS descriptor index.
15 */
16static int get_free_idx(void)
17{
18 struct thread_struct *t = &current->thread;
19 int idx;
20
21 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
22 if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx))
23 return idx + GDT_ENTRY_TLS_MIN;
24 return -ESRCH;
25}
26
27/*
28 * Set a given TLS descriptor:
29 * When you want addresses > 32bit use arch_prctl()
30 */
31int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
32{
33 struct user_desc info;
34 struct n_desc_struct *desc;
35 int cpu, idx;
36
37 if (copy_from_user(&info, u_info, sizeof(info)))
38 return -EFAULT;
39
40 idx = info.entry_number;
41
42 /*
43 * index -1 means the kernel should try to find and
44 * allocate an empty descriptor:
45 */
46 if (idx == -1) {
47 idx = get_free_idx();
48 if (idx < 0)
49 return idx;
50 if (put_user(idx, &u_info->entry_number))
51 return -EFAULT;
52 }
53
54 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
55 return -EINVAL;
56
57 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
58
59 /*
60 * We must not get preempted while modifying the TLS.
61 */
62 cpu = get_cpu();
63
64 if (LDT_empty(&info)) {
65 desc->a = 0;
66 desc->b = 0;
67 } else {
68 desc->a = LDT_entry_a(&info);
69 desc->b = LDT_entry_b(&info);
70 }
71 if (t == &current->thread)
72 load_TLS(t, cpu);
73
74 put_cpu();
75 return 0;
76}
77
78asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info)
79{
80 return do_set_thread_area(&current->thread, u_info);
81}
82
83
84/*
85 * Get the current Thread-Local Storage area:
86 */
87
88#define GET_BASE(desc) ( \
89 (((desc)->a >> 16) & 0x0000ffff) | \
90 (((desc)->b << 16) & 0x00ff0000) | \
91 ( (desc)->b & 0xff000000) )
92
93#define GET_LIMIT(desc) ( \
94 ((desc)->a & 0x0ffff) | \
95 ((desc)->b & 0xf0000) )
96
97#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
98#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
99#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
100#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
101#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
102#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
103#define GET_LONGMODE(desc) (((desc)->b >> 21) & 1)
104
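To make the bit-slicing above concrete, a worked example with invented values: a descriptor whose base is 0x12345678 is stored as a = 0x56780000, b = 0x12000034, and GET_BASE() reassembles it:

/*
 *   ((a >> 16) & 0x0000ffff) = 0x00005678   base 15..0
 *   ((b << 16) & 0x00ff0000) = 0x00340000   base 23..16
 *   ( b        & 0xff000000) = 0x12000000   base 31..24
 *   OR'd together            = 0x12345678
 */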
105int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
106{
107 struct user_desc info;
108 struct n_desc_struct *desc;
109 int idx;
110
111 if (get_user(idx, &u_info->entry_number))
112 return -EFAULT;
113 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
114 return -EINVAL;
115
116 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
117
118 memset(&info, 0, sizeof(struct user_desc));
119 info.entry_number = idx;
120 info.base_addr = GET_BASE(desc);
121 info.limit = GET_LIMIT(desc);
122 info.seg_32bit = GET_32BIT(desc);
123 info.contents = GET_CONTENTS(desc);
124 info.read_exec_only = !GET_WRITABLE(desc);
125 info.limit_in_pages = GET_LIMIT_PAGES(desc);
126 info.seg_not_present = !GET_PRESENT(desc);
127 info.useable = GET_USEABLE(desc);
128 info.lm = GET_LONGMODE(desc);
129
130 if (copy_to_user(u_info, &info, sizeof(info)))
131 return -EFAULT;
132 return 0;
133}
134
135asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info)
136{
137 return do_get_thread_area(&current->thread, u_info);
138}
139
140
141int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs)
142{
143 struct n_desc_struct *desc;
144 struct user_desc info;
145 struct user_desc __user *cp;
146 int idx;
147
148 cp = (void __user *)childregs->rsi;
149 if (copy_from_user(&info, cp, sizeof(info)))
150 return -EFAULT;
151 if (LDT_empty(&info))
152 return -EINVAL;
153
154 idx = info.entry_number;
155 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
156 return -EINVAL;
157
158 desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN;
159 desc->a = LDT_entry_a(&info);
160 desc->b = LDT_entry_b(&info);
161
162 return 0;
163}
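Userspace side of the entry_number = -1 convention that get_free_idx() serves, as a hedged sketch (243 is the i386 __NR_set_thread_area number; tls_block is an assumed buffer and the program must be built 32-bit):

#include <asm/ldt.h>		/* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>

static char tls_block[4096];

int install_tls(void)
{
	struct user_desc d = {
		.entry_number	= -1,	/* let the kernel pick a free GDT slot */
		.base_addr	= (unsigned long)tls_block,
		.limit		= sizeof(tls_block) - 1,
		.seg_32bit	= 1,
		.useable	= 1,
	};

	/* on success d.entry_number holds the index the kernel chose */
	return syscall(243 /* i386 __NR_set_thread_area */, &d);
}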
diff --git a/arch/x86/ia32/vsyscall-sigreturn.S b/arch/x86/ia32/vsyscall-sigreturn.S
new file mode 100644
index 000000000000..b383be00baec
--- /dev/null
+++ b/arch/x86/ia32/vsyscall-sigreturn.S
@@ -0,0 +1,143 @@
1/*
2 * Common code for the sigreturn entry points on the vsyscall page.
3 * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80)
4 * to enter the kernel.
5 * This file is #include'd by vsyscall-*.S to define them after the
6 * vsyscall entry point. The addresses we get for these entry points
7 * by doing ".balign 32" must match in both versions of the page.
8 */
9
10 .code32
11 .section .text.sigreturn,"ax"
12 .balign 32
13 .globl __kernel_sigreturn
14 .type __kernel_sigreturn,@function
15__kernel_sigreturn:
16.LSTART_sigreturn:
17 popl %eax
18 movl $__NR_ia32_sigreturn, %eax
19 SYSCALL_ENTER_KERNEL
20.LEND_sigreturn:
21 .size __kernel_sigreturn,.-.LSTART_sigreturn
22
23 .section .text.rtsigreturn,"ax"
24 .balign 32
25 .globl __kernel_rt_sigreturn
26 .type __kernel_rt_sigreturn,@function
27__kernel_rt_sigreturn:
28.LSTART_rt_sigreturn:
29 movl $__NR_ia32_rt_sigreturn, %eax
30 SYSCALL_ENTER_KERNEL
31.LEND_rt_sigreturn:
32 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
33
34 .section .eh_frame,"a",@progbits
35.LSTARTFRAMES:
36 .long .LENDCIES-.LSTARTCIES
37.LSTARTCIES:
38 .long 0 /* CIE ID */
39 .byte 1 /* Version number */
40 .string "zRS" /* NUL-terminated augmentation string */
41 .uleb128 1 /* Code alignment factor */
42 .sleb128 -4 /* Data alignment factor */
43 .byte 8 /* Return address register column */
44 .uleb128 1 /* Augmentation value length */
45 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
46 .byte 0x0c /* DW_CFA_def_cfa */
47 .uleb128 4
48 .uleb128 4
49 .byte 0x88 /* DW_CFA_offset, column 0x8 */
50 .uleb128 1
51 .align 4
52.LENDCIES:
53
54 .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */
55.LSTARTFDE2:
56 .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */
57 /* HACK: The dwarf2 unwind routines will subtract 1 from the
58 return address to get an address in the middle of the
59 presumed call instruction. Since we didn't get here via
60 a call, we need to include the nop before the real start
61 to make up for it. */
62 .long .LSTART_sigreturn-1-. /* PC-relative start address */
63 .long .LEND_sigreturn-.LSTART_sigreturn+1
64 .uleb128 0 /* Augmentation length */
65 /* What follows are the instructions for the table generation.
66 We record the locations of each register saved. This is
67 complicated by the fact that the "CFA" is always assumed to
68 be the value of the stack pointer in the caller. This means
69 that we must define the CFA of this body of code to be the
70 saved value of the stack pointer in the sigcontext. Which
71 also means that there is no fixed relation to the other
72 saved registers, which means that we must use DW_CFA_expression
73 to compute their addresses. It also means that when we
74 adjust the stack with the popl, we have to do it all over again. */
75
76#define do_cfa_expr(offset) \
77 .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
78 .uleb128 1f-0f; /* length */ \
790: .byte 0x74; /* DW_OP_breg4 */ \
80 .sleb128 offset; /* offset */ \
81 .byte 0x06; /* DW_OP_deref */ \
821:
83
84#define do_expr(regno, offset) \
85 .byte 0x10; /* DW_CFA_expression */ \
86 .uleb128 regno; /* regno */ \
87 .uleb128 1f-0f; /* length */ \
880: .byte 0x74; /* DW_OP_breg4 */ \
89 .sleb128 offset; /* offset */ \
901:
91
92 do_cfa_expr(IA32_SIGCONTEXT_esp+4)
93 do_expr(0, IA32_SIGCONTEXT_eax+4)
94 do_expr(1, IA32_SIGCONTEXT_ecx+4)
95 do_expr(2, IA32_SIGCONTEXT_edx+4)
96 do_expr(3, IA32_SIGCONTEXT_ebx+4)
97 do_expr(5, IA32_SIGCONTEXT_ebp+4)
98 do_expr(6, IA32_SIGCONTEXT_esi+4)
99 do_expr(7, IA32_SIGCONTEXT_edi+4)
100 do_expr(8, IA32_SIGCONTEXT_eip+4)
101
102 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
103
104 do_cfa_expr(IA32_SIGCONTEXT_esp)
105 do_expr(0, IA32_SIGCONTEXT_eax)
106 do_expr(1, IA32_SIGCONTEXT_ecx)
107 do_expr(2, IA32_SIGCONTEXT_edx)
108 do_expr(3, IA32_SIGCONTEXT_ebx)
109 do_expr(5, IA32_SIGCONTEXT_ebp)
110 do_expr(6, IA32_SIGCONTEXT_esi)
111 do_expr(7, IA32_SIGCONTEXT_edi)
112 do_expr(8, IA32_SIGCONTEXT_eip)
113
114 .align 4
115.LENDFDE2:
116
117 .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */
118.LSTARTFDE3:
119 .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */
120 /* HACK: See above wrt unwind library assumptions. */
121 .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
122 .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
123 .uleb128 0 /* Augmentation */
124 /* What follows are the instructions for the table generation.
125 We record the locations of each register saved. This is
126 slightly less complicated than the above, since we don't
127 modify the stack pointer in the process. */
128
129 do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp)
130 do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax)
131 do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx)
132 do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx)
133 do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx)
134 do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp)
135 do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi)
136 do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi)
137 do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip)
138
139 .align 4
140.LENDFDE3:
141
142#include "../../x86/kernel/vsyscall-note_32.S"
143
diff --git a/arch/x86/ia32/vsyscall-syscall.S b/arch/x86/ia32/vsyscall-syscall.S
new file mode 100644
index 000000000000..cf9ef678de3e
--- /dev/null
+++ b/arch/x86/ia32/vsyscall-syscall.S
@@ -0,0 +1,69 @@
1/*
2 * Code for the vsyscall page. This version uses the syscall instruction.
3 */
4
5#include <asm/ia32_unistd.h>
6#include <asm/asm-offsets.h>
7#include <asm/segment.h>
8
9 .code32
10 .text
11 .section .text.vsyscall,"ax"
12 .globl __kernel_vsyscall
13 .type __kernel_vsyscall,@function
14__kernel_vsyscall:
15.LSTART_vsyscall:
16 push %ebp
17.Lpush_ebp:
18 movl %ecx, %ebp
19 syscall
20 movl $__USER32_DS, %ecx
21 movl %ecx, %ss
22 movl %ebp, %ecx
23 popl %ebp
24.Lpop_ebp:
25 ret
26.LEND_vsyscall:
27 .size __kernel_vsyscall,.-.LSTART_vsyscall
28
29 .section .eh_frame,"a",@progbits
30.LSTARTFRAME:
31 .long .LENDCIE-.LSTARTCIE
32.LSTARTCIE:
33 .long 0 /* CIE ID */
34 .byte 1 /* Version number */
35 .string "zR" /* NUL-terminated augmentation string */
36 .uleb128 1 /* Code alignment factor */
37 .sleb128 -4 /* Data alignment factor */
38 .byte 8 /* Return address register column */
39 .uleb128 1 /* Augmentation value length */
40 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
41 .byte 0x0c /* DW_CFA_def_cfa */
42 .uleb128 4
43 .uleb128 4
44 .byte 0x88 /* DW_CFA_offset, column 0x8 */
45 .uleb128 1
46 .align 4
47.LENDCIE:
48
49 .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
50.LSTARTFDE1:
51 .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
52 .long .LSTART_vsyscall-. /* PC-relative start address */
53 .long .LEND_vsyscall-.LSTART_vsyscall
54 .uleb128 0 /* Augmentation length */
55 /* What follows are the instructions for the table generation.
56 We have to record all changes of the stack pointer. */
57 .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */
58 .byte 0x0e /* DW_CFA_def_cfa_offset */
59 .uleb128 8
60 .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */
61 .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */
62 .byte 0xc5 /* DW_CFA_restore %ebp */
63 .byte 0x0e /* DW_CFA_def_cfa_offset */
64 .uleb128 4
65 .align 4
66.LENDFDE1:
67
68#define SYSCALL_ENTER_KERNEL syscall
69#include "vsyscall-sigreturn.S"
diff --git a/arch/x86/ia32/vsyscall-sysenter.S b/arch/x86/ia32/vsyscall-sysenter.S
new file mode 100644
index 000000000000..ae056e553d13
--- /dev/null
+++ b/arch/x86/ia32/vsyscall-sysenter.S
@@ -0,0 +1,95 @@
1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction.
3 */
4
5#include <asm/ia32_unistd.h>
6#include <asm/asm-offsets.h>
7
8 .code32
9 .text
10 .section .text.vsyscall,"ax"
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 push %ecx
16.Lpush_ecx:
17 push %edx
18.Lpush_edx:
19 push %ebp
20.Lenter_kernel:
21 movl %esp,%ebp
22 sysenter
23 .space 7,0x90
24 jmp .Lenter_kernel
25 /* 16: System call normal return point is here! */
26 pop %ebp
27.Lpop_ebp:
28 pop %edx
29.Lpop_edx:
30 pop %ecx
31.Lpop_ecx:
32 ret
33.LEND_vsyscall:
34 .size __kernel_vsyscall,.-.LSTART_vsyscall
35
36 .section .eh_frame,"a",@progbits
37.LSTARTFRAME:
38 .long .LENDCIE-.LSTARTCIE
39.LSTARTCIE:
40 .long 0 /* CIE ID */
41 .byte 1 /* Version number */
42 .string "zR" /* NUL-terminated augmentation string */
43 .uleb128 1 /* Code alignment factor */
44 .sleb128 -4 /* Data alignment factor */
45 .byte 8 /* Return address register column */
46 .uleb128 1 /* Augmentation value length */
47 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
48 .byte 0x0c /* DW_CFA_def_cfa */
49 .uleb128 4
50 .uleb128 4
51 .byte 0x88 /* DW_CFA_offset, column 0x8 */
52 .uleb128 1
53 .align 4
54.LENDCIE:
55
56 .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
57.LSTARTFDE1:
58 .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
59 .long .LSTART_vsyscall-. /* PC-relative start address */
60 .long .LEND_vsyscall-.LSTART_vsyscall
61 .uleb128 0 /* Augmentation length */
62 /* What follows are the instructions for the table generation.
63 We have to record all changes of the stack pointer. */
64 .byte 0x04 /* DW_CFA_advance_loc4 */
65 .long .Lpush_ecx-.LSTART_vsyscall
66 .byte 0x0e /* DW_CFA_def_cfa_offset */
67 .byte 0x08 /* RA at offset 8 now */
68 .byte 0x04 /* DW_CFA_advance_loc4 */
69 .long .Lpush_edx-.Lpush_ecx
70 .byte 0x0e /* DW_CFA_def_cfa_offset */
71 .byte 0x0c /* RA at offset 12 now */
72 .byte 0x04 /* DW_CFA_advance_loc4 */
73 .long .Lenter_kernel-.Lpush_edx
74 .byte 0x0e /* DW_CFA_def_cfa_offset */
75 .byte 0x10 /* RA at offset 16 now */
76 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
77 /* Finally the epilogue. */
78 .byte 0x04 /* DW_CFA_advance_loc4 */
79 .long .Lpop_ebp-.Lenter_kernel
80 .byte 0x0e /* DW_CFA_def_cfa_offset */
81 .byte 0x12 /* RA at offset 12 now */
82 .byte 0xc5 /* DW_CFA_restore %ebp */
83 .byte 0x04 /* DW_CFA_advance_loc4 */
84 .long .Lpop_edx-.Lpop_ebp
85 .byte 0x0e /* DW_CFA_def_cfa_offset */
86 .byte 0x08 /* RA at offset 8 now */
87 .byte 0x04 /* DW_CFA_advance_loc4 */
88 .long .Lpop_ecx-.Lpop_edx
89 .byte 0x0e /* DW_CFA_def_cfa_offset */
90 .byte 0x04 /* RA at offset 4 now */
91 .align 4
92.LENDFDE1:
93
94#define SYSCALL_ENTER_KERNEL int $0x80
95#include "vsyscall-sigreturn.S"
diff --git a/arch/x86/ia32/vsyscall.lds b/arch/x86/ia32/vsyscall.lds
new file mode 100644
index 000000000000..1dc86ff5bcb9
--- /dev/null
+++ b/arch/x86/ia32/vsyscall.lds
@@ -0,0 +1,80 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address. This script controls its layout.
4 */
5
6/* This must match <asm/fixmap.h>. */
7VSYSCALL_BASE = 0xffffe000;
8
9SECTIONS
10{
11 . = VSYSCALL_BASE + SIZEOF_HEADERS;
12
13 .hash : { *(.hash) } :text
14 .gnu.hash : { *(.gnu.hash) }
15 .dynsym : { *(.dynsym) }
16 .dynstr : { *(.dynstr) }
17 .gnu.version : { *(.gnu.version) }
18 .gnu.version_d : { *(.gnu.version_d) }
19 .gnu.version_r : { *(.gnu.version_r) }
20
21 /* This linker script is used both with -r and with -shared.
22 For the layouts to match, we need to skip more than enough
23 space for the dynamic symbol table et al. If this amount
24 is insufficient, ld -shared will barf. Just increase it here. */
25 . = VSYSCALL_BASE + 0x400;
26
27 .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090
28
29 /* This is an 32bit object and we cannot easily get the offsets
30 into the 64bit kernel. Just hardcode them here. This assumes
31 that all the stubs don't need more than 0x100 bytes. */
32 . = VSYSCALL_BASE + 0x500;
33
34 .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090
35
36 . = VSYSCALL_BASE + 0x600;
37
38 .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090
39
40 .note : { *(.note.*) } :text :note
41 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
42 .eh_frame : { KEEP (*(.eh_frame)) } :text
43 .dynamic : { *(.dynamic) } :text :dynamic
44 .useless : {
45 *(.got.plt) *(.got)
46 *(.data .data.* .gnu.linkonce.d.*)
47 *(.dynbss)
48 *(.bss .bss.* .gnu.linkonce.b.*)
49 } :text
50}
51
52/*
53 * We must supply the ELF program headers explicitly to get just one
54 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
55 */
56PHDRS
57{
58 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
59 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
60 note PT_NOTE FLAGS(4); /* PF_R */
61 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
62}
63
64/*
65 * This controls what symbols we export from the DSO.
66 */
67VERSION
68{
69 LINUX_2.5 {
70 global:
71 __kernel_vsyscall;
72 __kernel_sigreturn;
73 __kernel_rt_sigreturn;
74
75 local: *;
76 };
77}
78
79/* The ELF entry point can be used to set the AT_SYSINFO value. */
80ENTRY(__kernel_vsyscall);
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
new file mode 100644
index 000000000000..40836ad9079c
--- /dev/null
+++ b/arch/x86/kernel/.gitignore
@@ -0,0 +1 @@
vsyscall.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
new file mode 100644
index 000000000000..45855c97923e
--- /dev/null
+++ b/arch/x86/kernel/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/kernel/Makefile_32
3else
4include ${srctree}/arch/x86/kernel/Makefile_64
5endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
new file mode 100644
index 000000000000..c624193740fd
--- /dev/null
+++ b/arch/x86/kernel/Makefile_32
@@ -0,0 +1,86 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_32.o init_task_32.o vmlinux.lds
6
7obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
8 ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
9 pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
10 quirks.o i8237.o topology.o alternative.o i8253_32.o tsc_32.o
11
12obj-$(CONFIG_STACKTRACE) += stacktrace.o
13obj-y += cpu/
14obj-y += acpi/
15obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o
16obj-$(CONFIG_MCA) += mca_32.o
17obj-$(CONFIG_X86_MSR) += msr.o
18obj-$(CONFIG_X86_CPUID) += cpuid.o
19obj-$(CONFIG_MICROCODE) += microcode.o
20obj-$(CONFIG_APM) += apm_32.o
21obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
22obj-$(CONFIG_SMP) += smpcommon_32.o
23obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
24obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
25obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
26obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
27obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
28obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash_32.o
29obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o
30obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
31obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
32obj-$(CONFIG_KPROBES) += kprobes_32.o
33obj-$(CONFIG_MODULES) += module_32.o
34obj-y += sysenter_32.o vsyscall_32.o
35obj-$(CONFIG_ACPI_SRAT) += srat_32.o
36obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o
37obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
38obj-$(CONFIG_VM86) += vm86_32.o
39obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
40obj-$(CONFIG_HPET_TIMER) += hpet_32.o
41obj-$(CONFIG_K8_NB) += k8.o
42obj-$(CONFIG_MGEODE_LX) += geode_32.o
43
44obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
45obj-$(CONFIG_PARAVIRT) += paravirt_32.o
46obj-y += pcspeaker.o
47
48obj-$(CONFIG_SCx200) += scx200_32.o
49
50# vsyscall_32.o contains the vsyscall DSO images as __initdata.
51# We must build both images before we can assemble it.
52# Note: kbuild does not track this dependency due to usage of .incbin
53$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
54targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
55targets += vsyscall-note_32.o vsyscall_32.lds
56
57# The DSO images are built using a special linker script.
58quiet_cmd_syscall = SYSCALL $@
59 cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
60 -Wl,-T,$(filter-out FORCE,$^) -o $@
61
62export CPPFLAGS_vsyscall_32.lds += -P -C -U$(ARCH)
63
64vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
65 $(call ld-option, -Wl$(comma)--hash-style=sysv)
66SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags)
67SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)
68
69$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
70$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
71 $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
72 $(call if_changed,syscall)
73
74# We also create a special relocatable object that should mirror the symbol
75# table and layout of the linked DSO. With ld -R we can then refer to
76# these symbols in the kernel code rather than hand-coded addresses.
77extra-y += vsyscall-syms.o
78$(obj)/built-in.o: $(obj)/vsyscall-syms.o
79$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
80
81SYSCFLAGS_vsyscall-syms.o = -r
82$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
83 $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
84 $(call if_changed,syscall)
85
86
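
The last two stanzas above embed the vsyscall DSO images into the kernel: vsyscall_32.o pulls both .so files in as __initdata (via .incbin, which is why the dependency has to be spelled out by hand), and vsyscall-syms.o is linked into built-in.o with ld -R so kernel C code can refer to the DSO's symbols instead of hand-coded addresses. A minimal, self-contained sketch of the embed-and-copy pattern this sets up; the byte array stands in for the .incbin'd image, and the names are illustrative rather than the kernel's exact symbols:

/*
 * Sketch: an embedded DSO image is copied into the (future) vsyscall
 * page at boot.  Here a byte array plays the role of the .incbin'd
 * image that the real build brackets with start/end labels in a .S file.
 */
#include <stdio.h>
#include <string.h>

static const unsigned char vsyscall_image[] = { 0x7f, 'E', 'L', 'F' };
static unsigned char vsyscall_page[4096];

int main(void)
{
	memcpy(vsyscall_page, vsyscall_image, sizeof(vsyscall_image));
	printf("installed %zu bytes of vsyscall DSO image\n",
	       sizeof(vsyscall_image));
	return 0;
}
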
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
new file mode 100644
index 000000000000..3ab017a0a3b9
--- /dev/null
+++ b/arch/x86/kernel/Makefile_64
@@ -0,0 +1,54 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_64.o head64.o init_task_64.o vmlinux.lds
6EXTRA_AFLAGS := -traditional
7obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
8 ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \
9 x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
10 setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
11 pci-dma_64.o pci-nommu_64.o alternative.o hpet_64.o tsc_64.o bugs_64.o \
12 perfctr-watchdog.o
13
14obj-$(CONFIG_STACKTRACE) += stacktrace.o
15obj-$(CONFIG_X86_MCE) += mce_64.o therm_throt.o
16obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
17obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
18obj-$(CONFIG_MTRR) += cpu/mtrr/
19obj-$(CONFIG_ACPI) += acpi/
20obj-$(CONFIG_X86_MSR) += msr.o
21obj-$(CONFIG_MICROCODE) += microcode.o
22obj-$(CONFIG_X86_CPUID) += cpuid.o
23obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o
24obj-y += apic_64.o nmi_64.o
25obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o
26obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash_64.o
27obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o
28obj-$(CONFIG_PM) += suspend_64.o
29obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
30obj-$(CONFIG_CPU_FREQ) += cpu/cpufreq/
31obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
32obj-$(CONFIG_IOMMU) += pci-gart_64.o aperture_64.o
33obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
34obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
35obj-$(CONFIG_KPROBES) += kprobes_64.o
36obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
37obj-$(CONFIG_X86_VSMP) += vsmp_64.o
38obj-$(CONFIG_K8_NB) += k8.o
39obj-$(CONFIG_AUDIT) += audit_64.o
40
41obj-$(CONFIG_MODULES) += module_64.o
42obj-$(CONFIG_PCI) += early-quirks_64.o
43
44obj-y += topology.o
45obj-y += intel_cacheinfo.o
46obj-y += addon_cpuid_features.o
47obj-y += pcspeaker.o
48
49CFLAGS_vsyscall_64.o := $(PROFILING) -g0
50
51therm_throt-y += cpu/mcheck/therm_throt.o
52intel_cacheinfo-y += cpu/intel_cacheinfo.o
53addon_cpuid_features-y += cpu/addon_cpuid_features.o
54perfctr-watchdog-y += cpu/perfctr-watchdog.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
new file mode 100644
index 000000000000..3d5671939542
--- /dev/null
+++ b/arch/x86/kernel/acpi/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/kernel/acpi/Makefile_32
3else
4include ${srctree}/arch/x86/kernel/acpi/Makefile_64
5endif
diff --git a/arch/x86/kernel/acpi/Makefile_32 b/arch/x86/kernel/acpi/Makefile_32
new file mode 100644
index 000000000000..a4852a2e9190
--- /dev/null
+++ b/arch/x86/kernel/acpi/Makefile_32
@@ -0,0 +1,10 @@
1obj-$(CONFIG_ACPI) += boot.o
2ifneq ($(CONFIG_PCI),)
3obj-$(CONFIG_X86_IO_APIC) += earlyquirk_32.o
4endif
5obj-$(CONFIG_ACPI_SLEEP) += sleep_32.o wakeup_32.o
6
7ifneq ($(CONFIG_ACPI_PROCESSOR),)
8obj-y += cstate.o processor.o
9endif
10
diff --git a/arch/x86/kernel/acpi/Makefile_64 b/arch/x86/kernel/acpi/Makefile_64
new file mode 100644
index 000000000000..629425bc002d
--- /dev/null
+++ b/arch/x86/kernel/acpi/Makefile_64
@@ -0,0 +1,7 @@
1obj-y := boot.o
2obj-$(CONFIG_ACPI_SLEEP) += sleep_64.o wakeup_64.o
3
4ifneq ($(CONFIG_ACPI_PROCESSOR),)
5obj-y += processor.o cstate.o
6endif
7
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
new file mode 100644
index 000000000000..afd2afe9102d
--- /dev/null
+++ b/arch/x86/kernel/acpi/boot.c
@@ -0,0 +1,1326 @@
1/*
2 * boot.c - Architecture-Specific Low-Level ACPI Boot Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 *
7 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/init.h>
27#include <linux/acpi.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/efi.h>
30#include <linux/cpumask.h>
31#include <linux/module.h>
32#include <linux/dmi.h>
33#include <linux/irq.h>
34#include <linux/bootmem.h>
35#include <linux/ioport.h>
36
37#include <asm/pgtable.h>
38#include <asm/io_apic.h>
39#include <asm/apic.h>
40#include <asm/io.h>
41#include <asm/mpspec.h>
42
43static int __initdata acpi_force = 0;
44
45#ifdef CONFIG_ACPI
46int acpi_disabled = 0;
47#else
48int acpi_disabled = 1;
49#endif
50EXPORT_SYMBOL(acpi_disabled);
51
52#ifdef CONFIG_X86_64
53
54#include <asm/proto.h>
55
56static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
57
58
59#else /* X86 */
60
61#ifdef CONFIG_X86_LOCAL_APIC
62#include <mach_apic.h>
63#include <mach_mpparse.h>
64#endif /* CONFIG_X86_LOCAL_APIC */
65
66#endif /* X86 */
67
68#define BAD_MADT_ENTRY(entry, end) ( \
69 (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
70 ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
71
72#define PREFIX "ACPI: "
73
74int acpi_noirq; /* skip ACPI IRQ initialization */
75int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */
76int acpi_ht __initdata = 1; /* enable HT */
77
78int acpi_lapic;
79int acpi_ioapic;
80int acpi_strict;
81EXPORT_SYMBOL(acpi_strict);
82
83u8 acpi_sci_flags __initdata;
84int acpi_sci_override_gsi __initdata;
85int acpi_skip_timer_override __initdata;
86int acpi_use_timer_override __initdata;
87
88#ifdef CONFIG_X86_LOCAL_APIC
89static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
90#endif
91
92#ifndef __HAVE_ARCH_CMPXCHG
93#warning ACPI uses CMPXCHG, i486 and later hardware
94#endif
95
96/* --------------------------------------------------------------------------
97 Boot-time Configuration
98 -------------------------------------------------------------------------- */
99
100/*
101 * The default interrupt routing model is PIC (8259). This gets
102 * overridden if IOAPICs are enumerated (below).
103 */
104enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
105
106#ifdef CONFIG_X86_64
107
108/* rely on all ACPI tables being in the direct mapping */
109char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
110{
111 if (!phys_addr || !size)
112 return NULL;
113
114 if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE)
115 return __va(phys_addr);
116
117 return NULL;
118}
119
120#else
121
122/*
123 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
124 * to map the target physical address. The problem is that set_fixmap()
125 * provides a single page, and it is possible that the page is not
126 * sufficient.
127 * By using this area, we can map up to MAX_IO_APICS pages temporarily,
128 * i.e. until the next __va_range() call.
129 *
130 * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
131 * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
132 * count idx down while incrementing the phys address.
133 */
134char *__acpi_map_table(unsigned long phys, unsigned long size)
135{
136 unsigned long base, offset, mapped_size;
137 int idx;
138
139 if (phys + size < 8 * 1024 * 1024)
140 return __va(phys);
141
142 offset = phys & (PAGE_SIZE - 1);
143 mapped_size = PAGE_SIZE - offset;
144 set_fixmap(FIX_ACPI_END, phys);
145 base = fix_to_virt(FIX_ACPI_END);
146
147 /*
148 * Most cases can be covered by the below.
149 */
150 idx = FIX_ACPI_END;
151 while (mapped_size < size) {
152 if (--idx < FIX_ACPI_BEGIN)
153 return NULL; /* cannot handle this */
154 phys += PAGE_SIZE;
155 set_fixmap(idx, phys);
156 mapped_size += PAGE_SIZE;
157 }
158
159 return ((unsigned char *)base + offset);
160}
161#endif
162
163#ifdef CONFIG_PCI_MMCONFIG
164/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
165struct acpi_mcfg_allocation *pci_mmcfg_config;
166int pci_mmcfg_config_num;
167
168int __init acpi_parse_mcfg(struct acpi_table_header *header)
169{
170 struct acpi_table_mcfg *mcfg;
171 unsigned long i;
172 int config_size;
173
174 if (!header)
175 return -EINVAL;
176
177 mcfg = (struct acpi_table_mcfg *)header;
178
179 /* how many config structures do we have */
180 pci_mmcfg_config_num = 0;
181 i = header->length - sizeof(struct acpi_table_mcfg);
182 while (i >= sizeof(struct acpi_mcfg_allocation)) {
183 ++pci_mmcfg_config_num;
184 i -= sizeof(struct acpi_mcfg_allocation);
185 };
186 if (pci_mmcfg_config_num == 0) {
187 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
188 return -ENODEV;
189 }
190
191 config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
192 pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
193 if (!pci_mmcfg_config) {
194 printk(KERN_WARNING PREFIX
195 "No memory for MCFG config tables\n");
196 return -ENOMEM;
197 }
198
199 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
200 for (i = 0; i < pci_mmcfg_config_num; ++i) {
201 if (pci_mmcfg_config[i].address > 0xFFFFFFFF) {
202 printk(KERN_ERR PREFIX
203 "MMCONFIG not in low 4GB of memory\n");
204 kfree(pci_mmcfg_config);
205 pci_mmcfg_config_num = 0;
206 return -ENODEV;
207 }
208 }
209
210 return 0;
211}
212#endif /* CONFIG_PCI_MMCONFIG */
213
214#ifdef CONFIG_X86_LOCAL_APIC
215static int __init acpi_parse_madt(struct acpi_table_header *table)
216{
217 struct acpi_table_madt *madt = NULL;
218
219 if (!cpu_has_apic)
220 return -EINVAL;
221
222 madt = (struct acpi_table_madt *)table;
223 if (!madt) {
224 printk(KERN_WARNING PREFIX "Unable to map MADT\n");
225 return -ENODEV;
226 }
227
228 if (madt->address) {
229 acpi_lapic_addr = (u64) madt->address;
230
231 printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
232 madt->address);
233 }
234
235 acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
236
237 return 0;
238}
239
240static int __init
241acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
242{
243 struct acpi_madt_local_apic *processor = NULL;
244
245 processor = (struct acpi_madt_local_apic *)header;
246
247 if (BAD_MADT_ENTRY(processor, end))
248 return -EINVAL;
249
250 acpi_table_print_madt_entry(header);
251
252 /*
253 * We need to register disabled CPUs as well, to permit
254 * counting them. This allows us to size
255 * cpus_possible_map more accurately, so that we do not
256 * have to preallocate memory for all NR_CPUS
257 * when CPU hotplug is used.
258 */
259 mp_register_lapic(processor->id, /* APIC ID */
260 processor->lapic_flags & ACPI_MADT_ENABLED); /* Enabled? */
261
262 return 0;
263}
264
265static int __init
266acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
267 const unsigned long end)
268{
269 struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
270
271 lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header;
272
273 if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
274 return -EINVAL;
275
276 acpi_lapic_addr = lapic_addr_ovr->address;
277
278 return 0;
279}
280
281static int __init
282acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
283{
284 struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
285
286 lapic_nmi = (struct acpi_madt_local_apic_nmi *)header;
287
288 if (BAD_MADT_ENTRY(lapic_nmi, end))
289 return -EINVAL;
290
291 acpi_table_print_madt_entry(header);
292
293 if (lapic_nmi->lint != 1)
294 printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
295
296 return 0;
297}
298
299#endif /*CONFIG_X86_LOCAL_APIC */
300
301#ifdef CONFIG_X86_IO_APIC
302
303static int __init
304acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
305{
306 struct acpi_madt_io_apic *ioapic = NULL;
307
308 ioapic = (struct acpi_madt_io_apic *)header;
309
310 if (BAD_MADT_ENTRY(ioapic, end))
311 return -EINVAL;
312
313 acpi_table_print_madt_entry(header);
314
315 mp_register_ioapic(ioapic->id,
316 ioapic->address, ioapic->global_irq_base);
317
318 return 0;
319}
320
321/*
322 * Parse Interrupt Source Override for the ACPI SCI
323 */
324static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
325{
326 if (trigger == 0) /* compatible SCI trigger is level */
327 trigger = 3;
328
329 if (polarity == 0) /* compatible SCI polarity is low */
330 polarity = 3;
331
332 /* Command-line over-ride via acpi_sci= */
333 if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)
334 trigger = (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2;
335
336 if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
337 polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
338
339 /*
340 * mp_config_acpi_legacy_irqs() already set up IRQs < 16.
341 * If GSI is < 16, this will update its flags,
342 * else it will create a new mp_irqs[] entry.
343 */
344 mp_override_legacy_irq(gsi, polarity, trigger, gsi);
345
346 /*
347 * stash over-ride to indicate we've been here
348 * and for later update of acpi_gbl_FADT
349 */
350 acpi_sci_override_gsi = gsi;
351 return;
352}
353
354static int __init
355acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
356 const unsigned long end)
357{
358 struct acpi_madt_interrupt_override *intsrc = NULL;
359
360 intsrc = (struct acpi_madt_interrupt_override *)header;
361
362 if (BAD_MADT_ENTRY(intsrc, end))
363 return -EINVAL;
364
365 acpi_table_print_madt_entry(header);
366
367 if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
368 acpi_sci_ioapic_setup(intsrc->global_irq,
369 intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
370 (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
371 return 0;
372 }
373
374 if (acpi_skip_timer_override &&
375 intsrc->source_irq == 0 && intsrc->global_irq == 2) {
376 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
377 return 0;
378 }
379
380 mp_override_legacy_irq(intsrc->source_irq,
381 intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
382 (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
383 intsrc->global_irq);
384
385 return 0;
386}
387
388static int __init
389acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end)
390{
391 struct acpi_madt_nmi_source *nmi_src = NULL;
392
393 nmi_src = (struct acpi_madt_nmi_source *)header;
394
395 if (BAD_MADT_ENTRY(nmi_src, end))
396 return -EINVAL;
397
398 acpi_table_print_madt_entry(header);
399
400 /* TBD: Support nmi_src entries? */
401
402 return 0;
403}
404
405#endif /* CONFIG_X86_IO_APIC */
406
407/*
408 * acpi_pic_sci_set_trigger()
409 *
410 * use ELCR to set PIC-mode trigger type for SCI
411 *
412 * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
413 * it may require Edge Trigger -- use "acpi_sci=edge"
414 *
415 * Ports 0x4d0-0x4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
416 * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
417 * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
418 * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
419 */
420
421void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
422{
423 unsigned int mask = 1 << irq;
424 unsigned int old, new;
425
426 /* Real old ELCR mask */
427 old = inb(0x4d0) | (inb(0x4d1) << 8);
428
429 /*
430 * If we use ACPI to set PCI irq's, then we should clear ELCR
431 * since we will set it correctly as we enable the PCI irq
432 * routing.
433 */
434 new = acpi_noirq ? old : 0;
435
436 /*
437 * Update SCI information in the ELCR, it isn't in the PCI
438 * routing tables..
439 */
440 switch (trigger) {
441 case 1: /* Edge - clear */
442 new &= ~mask;
443 break;
444 case 3: /* Level - set */
445 new |= mask;
446 break;
447 }
448
449 if (old == new)
450 return;
451
452 printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
453 outb(new, 0x4d0);
454 outb(new >> 8, 0x4d1);
455}
456
457int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
458{
459 *irq = gsi;
460 return 0;
461}
462
463/*
464 * success: return IRQ number (>=0)
465 * failure: return < 0
466 */
467int acpi_register_gsi(u32 gsi, int triggering, int polarity)
468{
469 unsigned int irq;
470 unsigned int plat_gsi = gsi;
471
472#ifdef CONFIG_PCI
473 /*
474 * Make sure all (legacy) PCI IRQs are set as level-triggered.
475 */
476 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
477 extern void eisa_set_level_irq(unsigned int irq);
478
479 if (triggering == ACPI_LEVEL_SENSITIVE)
480 eisa_set_level_irq(gsi);
481 }
482#endif
483
484#ifdef CONFIG_X86_IO_APIC
485 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
486 plat_gsi = mp_register_gsi(gsi, triggering, polarity);
487 }
488#endif
489 acpi_gsi_to_irq(plat_gsi, &irq);
490 return irq;
491}
492
493EXPORT_SYMBOL(acpi_register_gsi);
494
495/*
496 * ACPI based hotplug support for CPU
497 */
498#ifdef CONFIG_ACPI_HOTPLUG_CPU
499int acpi_map_lsapic(acpi_handle handle, int *pcpu)
500{
501 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
502 union acpi_object *obj;
503 struct acpi_madt_local_apic *lapic;
504 cpumask_t tmp_map, new_map;
505 u8 physid;
506 int cpu;
507
508 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
509 return -EINVAL;
510
511 if (!buffer.length || !buffer.pointer)
512 return -EINVAL;
513
514 obj = buffer.pointer;
515 if (obj->type != ACPI_TYPE_BUFFER ||
516 obj->buffer.length < sizeof(*lapic)) {
517 kfree(buffer.pointer);
518 return -EINVAL;
519 }
520
521 lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
522
523 if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
524 !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
525 kfree(buffer.pointer);
526 return -EINVAL;
527 }
528
529 physid = lapic->id;
530
531 kfree(buffer.pointer);
532 buffer.length = ACPI_ALLOCATE_BUFFER;
533 buffer.pointer = NULL;
534
535 tmp_map = cpu_present_map;
536 mp_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
537
538 /*
539 * If mp_register_lapic successfully generates a new logical cpu
540 * number, then the following will get us exactly what was mapped
541 */
542 cpus_andnot(new_map, cpu_present_map, tmp_map);
543 if (cpus_empty(new_map)) {
544 printk ("Unable to map lapic to logical cpu number\n");
545 return -EINVAL;
546 }
547
548 cpu = first_cpu(new_map);
549
550 *pcpu = cpu;
551 return 0;
552}
553
554EXPORT_SYMBOL(acpi_map_lsapic);
555
556int acpi_unmap_lsapic(int cpu)
557{
558 x86_cpu_to_apicid[cpu] = -1;
559 cpu_clear(cpu, cpu_present_map);
560 num_processors--;
561
562 return (0);
563}
564
565EXPORT_SYMBOL(acpi_unmap_lsapic);
566#endif /* CONFIG_ACPI_HOTPLUG_CPU */
567
568int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
569{
570 /* TBD */
571 return -EINVAL;
572}
573
574EXPORT_SYMBOL(acpi_register_ioapic);
575
576int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
577{
578 /* TBD */
579 return -EINVAL;
580}
581
582EXPORT_SYMBOL(acpi_unregister_ioapic);
583
584static unsigned long __init
585acpi_scan_rsdp(unsigned long start, unsigned long length)
586{
587 unsigned long offset = 0;
588 unsigned long sig_len = sizeof("RSD PTR ") - 1;
589
590 /*
591 * Scan all 16-byte boundaries of the physical memory region for the
592 * RSDP signature.
593 */
594 for (offset = 0; offset < length; offset += 16) {
595 if (strncmp((char *)(phys_to_virt(start) + offset), "RSD PTR ", sig_len))
596 continue;
597 return (start + offset);
598 }
599
600 return 0;
601}
602
603static int __init acpi_parse_sbf(struct acpi_table_header *table)
604{
605 struct acpi_table_boot *sb;
606
607 sb = (struct acpi_table_boot *)table;
608 if (!sb) {
609 printk(KERN_WARNING PREFIX "Unable to map SBF\n");
610 return -ENODEV;
611 }
612
613 sbf_port = sb->cmos_index; /* Save CMOS port */
614
615 return 0;
616}
617
618#ifdef CONFIG_HPET_TIMER
619#include <asm/hpet.h>
620
621static struct __initdata resource *hpet_res;
622
623static int __init acpi_parse_hpet(struct acpi_table_header *table)
624{
625 struct acpi_table_hpet *hpet_tbl;
626
627 hpet_tbl = (struct acpi_table_hpet *)table;
628 if (!hpet_tbl) {
629 printk(KERN_WARNING PREFIX "Unable to map HPET\n");
630 return -ENODEV;
631 }
632
633 if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
634 printk(KERN_WARNING PREFIX "HPET timers must be located in "
635 "memory.\n");
636 return -1;
637 }
638
639 hpet_address = hpet_tbl->address.address;
640 printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
641 hpet_tbl->id, hpet_address);
642
643 /*
644 * Allocate and initialize the HPET firmware resource for adding into
645 * the resource tree during the lateinit timeframe.
646 */
647#define HPET_RESOURCE_NAME_SIZE 9
648 hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
649
650 if (!hpet_res)
651 return 0;
652
653 memset(hpet_res, 0, sizeof(*hpet_res));
654 hpet_res->name = (void *)&hpet_res[1];
655 hpet_res->flags = IORESOURCE_MEM;
656 snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u",
657 hpet_tbl->sequence);
658
659 hpet_res->start = hpet_address;
660 hpet_res->end = hpet_address + (1 * 1024) - 1;
661
662 return 0;
663}
664
665/*
666 * hpet_insert_resource inserts the HPET resources used into the resource
667 * tree.
668 */
669static __init int hpet_insert_resource(void)
670{
671 if (!hpet_res)
672 return 1;
673
674 return insert_resource(&iomem_resource, hpet_res);
675}
676
677late_initcall(hpet_insert_resource);
678
679#else
680#define acpi_parse_hpet NULL
681#endif
682
683static int __init acpi_parse_fadt(struct acpi_table_header *table)
684{
685
686#ifdef CONFIG_X86_PM_TIMER
687 /* detect the location of the ACPI PM Timer */
688 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) {
689 /* FADT rev. 2 */
690 if (acpi_gbl_FADT.xpm_timer_block.space_id !=
691 ACPI_ADR_SPACE_SYSTEM_IO)
692 return 0;
693
694 pmtmr_ioport = acpi_gbl_FADT.xpm_timer_block.address;
695 /*
696 * "X" fields are optional extensions to the original V1.0
697 * fields, so we must selectively expand V1.0 fields if the
698 * corresponding X field is zero.
699 */
700 if (!pmtmr_ioport)
701 pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
702 } else {
703 /* FADT rev. 1 */
704 pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
705 }
706 if (pmtmr_ioport)
707 printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
708 pmtmr_ioport);
709#endif
710 return 0;
711}
712
713unsigned long __init acpi_find_rsdp(void)
714{
715 unsigned long rsdp_phys = 0;
716
717 if (efi_enabled) {
718 if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
719 return efi.acpi20;
720 else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
721 return efi.acpi;
722 }
723 /*
724 * Scan memory looking for the RSDP signature. First search EBDA (low
725 * memory) paragraphs and then search upper memory (E0000-FFFFF).
726 */
727 rsdp_phys = acpi_scan_rsdp(0, 0x400);
728 if (!rsdp_phys)
729 rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
730
731 return rsdp_phys;
732}
733
734#ifdef CONFIG_X86_LOCAL_APIC
735/*
736 * Parse LAPIC entries in MADT
737 * returns 0 on success, < 0 on error
738 */
739static int __init acpi_parse_madt_lapic_entries(void)
740{
741 int count;
742
743 if (!cpu_has_apic)
744 return -ENODEV;
745
746 /*
747 * Note that the LAPIC address is obtained from the MADT (32-bit value)
748 * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
749 */
750
751 count =
752 acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
753 acpi_parse_lapic_addr_ovr, 0);
754 if (count < 0) {
755 printk(KERN_ERR PREFIX
756 "Error parsing LAPIC address override entry\n");
757 return count;
758 }
759
760 mp_register_lapic_address(acpi_lapic_addr);
761
762 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_parse_lapic,
763 MAX_APICS);
764 if (!count) {
765 printk(KERN_ERR PREFIX "No LAPIC entries present\n");
766 /* TBD: Cleanup to allow fallback to MPS */
767 return -ENODEV;
768 } else if (count < 0) {
769 printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
770 /* TBD: Cleanup to allow fallback to MPS */
771 return count;
772 }
773
774 count =
775 acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
776 if (count < 0) {
777 printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
778 /* TBD: Cleanup to allow fallback to MPS */
779 return count;
780 }
781 return 0;
782}
783#endif /* CONFIG_X86_LOCAL_APIC */
784
785#ifdef CONFIG_X86_IO_APIC
786/*
787 * Parse IOAPIC related entries in MADT
788 * returns 0 on success, < 0 on error
789 */
790static int __init acpi_parse_madt_ioapic_entries(void)
791{
792 int count;
793
794 /*
795 * ACPI interpreter is required to complete interrupt setup,
796 * so if it is off, don't enumerate the io-apics with ACPI.
797 * If MPS is present, it will handle them,
798 * otherwise the system will stay in PIC mode
799 */
800 if (acpi_disabled || acpi_noirq) {
801 return -ENODEV;
802 }
803
804 if (!cpu_has_apic)
805 return -ENODEV;
806
807 /*
808 * if "noapic" boot option, don't look for IO-APICs
809 */
810 if (skip_ioapic_setup) {
811 printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
812 "due to 'noapic' option.\n");
813 return -ENODEV;
814 }
815
816 count =
817 acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
818 MAX_IO_APICS);
819 if (!count) {
820 printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
821 return -ENODEV;
822 } else if (count < 0) {
823 printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
824 return count;
825 }
826
827 count =
828 acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
829 NR_IRQ_VECTORS);
830 if (count < 0) {
831 printk(KERN_ERR PREFIX
832 "Error parsing interrupt source overrides entry\n");
833 /* TBD: Cleanup to allow fallback to MPS */
834 return count;
835 }
836
837 /*
838 * If the BIOS did not supply an INT_SRC_OVR for the SCI,
839 * pretend we got one so we can set the SCI flags.
840 */
841 if (!acpi_sci_override_gsi)
842 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
843
844 /* Fill in identity legacy mappings where there is no override */
845 mp_config_acpi_legacy_irqs();
846
847 count =
848 acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
849 NR_IRQ_VECTORS);
850 if (count < 0) {
851 printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
852 /* TBD: Cleanup to allow fallback to MPS */
853 return count;
854 }
855
856 return 0;
857}
858#else
859static inline int acpi_parse_madt_ioapic_entries(void)
860{
861 return -1;
862}
863#endif /* !CONFIG_X86_IO_APIC */
864
865static void __init acpi_process_madt(void)
866{
867#ifdef CONFIG_X86_LOCAL_APIC
868 int error;
869
870 if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
871
872 /*
873 * Parse MADT LAPIC entries
874 */
875 error = acpi_parse_madt_lapic_entries();
876 if (!error) {
877 acpi_lapic = 1;
878
879#ifdef CONFIG_X86_GENERICARCH
880 generic_bigsmp_probe();
881#endif
882 /*
883 * Parse MADT IO-APIC entries
884 */
885 error = acpi_parse_madt_ioapic_entries();
886 if (!error) {
887 acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
888 acpi_irq_balance_set(NULL);
889 acpi_ioapic = 1;
890
891 smp_found_config = 1;
892 setup_apic_routing();
893 }
894 }
895 if (error == -EINVAL) {
896 /*
897 * Dell Precision Workstation 410, 610 come here.
898 */
899 printk(KERN_ERR PREFIX
900 "Invalid BIOS MADT, disabling ACPI\n");
901 disable_acpi();
902 }
903 }
904#endif
905 return;
906}
907
908#ifdef __i386__
909
910static int __init disable_acpi_irq(const struct dmi_system_id *d)
911{
912 if (!acpi_force) {
913 printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
914 d->ident);
915 acpi_noirq_set();
916 }
917 return 0;
918}
919
920static int __init disable_acpi_pci(const struct dmi_system_id *d)
921{
922 if (!acpi_force) {
923 printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
924 d->ident);
925 acpi_disable_pci();
926 }
927 return 0;
928}
929
930static int __init dmi_disable_acpi(const struct dmi_system_id *d)
931{
932 if (!acpi_force) {
933 printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
934 disable_acpi();
935 } else {
936 printk(KERN_NOTICE
937 "Warning: DMI blacklist says broken, but acpi forced\n");
938 }
939 return 0;
940}
941
942/*
943 * Limit ACPI to CPU enumeration for HT
944 */
945static int __init force_acpi_ht(const struct dmi_system_id *d)
946{
947 if (!acpi_force) {
948 printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
949 d->ident);
950 disable_acpi();
951 acpi_ht = 1;
952 } else {
953 printk(KERN_NOTICE
954 "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
955 }
956 return 0;
957}
958
959/*
960 * If your system is blacklisted here, but you find that acpi=force
961 * works for you, please contact acpi-devel@sourceforge.net
962 */
963static struct dmi_system_id __initdata acpi_dmi_table[] = {
964 /*
965 * Boxes that need ACPI disabled
966 */
967 {
968 .callback = dmi_disable_acpi,
969 .ident = "IBM Thinkpad",
970 .matches = {
971 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
972 DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
973 },
974 },
975
976 /*
977 * Boxes that need acpi=ht
978 */
979 {
980 .callback = force_acpi_ht,
981 .ident = "FSC Primergy T850",
982 .matches = {
983 DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
984 DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
985 },
986 },
987 {
988 .callback = force_acpi_ht,
989 .ident = "HP VISUALIZE NT Workstation",
990 .matches = {
991 DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
992 DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
993 },
994 },
995 {
996 .callback = force_acpi_ht,
997 .ident = "Compaq Workstation W8000",
998 .matches = {
999 DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1000 DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1001 },
1002 },
1003 {
1004 .callback = force_acpi_ht,
1005 .ident = "ASUS P4B266",
1006 .matches = {
1007 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1008 DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
1009 },
1010 },
1011 {
1012 .callback = force_acpi_ht,
1013 .ident = "ASUS P2B-DS",
1014 .matches = {
1015 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1016 DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1017 },
1018 },
1019 {
1020 .callback = force_acpi_ht,
1021 .ident = "ASUS CUR-DLS",
1022 .matches = {
1023 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1024 DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1025 },
1026 },
1027 {
1028 .callback = force_acpi_ht,
1029 .ident = "ABIT i440BX-W83977",
1030 .matches = {
1031 DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1032 DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1033 },
1034 },
1035 {
1036 .callback = force_acpi_ht,
1037 .ident = "IBM Bladecenter",
1038 .matches = {
1039 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1040 DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1041 },
1042 },
1043 {
1044 .callback = force_acpi_ht,
1045 .ident = "IBM eServer xSeries 360",
1046 .matches = {
1047 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1048 DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1049 },
1050 },
1051 {
1052 .callback = force_acpi_ht,
1053 .ident = "IBM eserver xSeries 330",
1054 .matches = {
1055 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1056 DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1057 },
1058 },
1059 {
1060 .callback = force_acpi_ht,
1061 .ident = "IBM eserver xSeries 440",
1062 .matches = {
1063 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1064 DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1065 },
1066 },
1067
1068 /*
1069 * Boxes that need ACPI PCI IRQ routing disabled
1070 */
1071 {
1072 .callback = disable_acpi_irq,
1073 .ident = "ASUS A7V",
1074 .matches = {
1075 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
1076 DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
1077 /* newer BIOS, Revision 1011, does work */
1078 DMI_MATCH(DMI_BIOS_VERSION,
1079 "ASUS A7V ACPI BIOS Revision 1007"),
1080 },
1081 },
1082 {
1083 /*
1084 * Latest BIOS for IBM 600E (1.16) has bad pcinum
1085 * for LPC bridge, which is needed for the PCI
1086 * interrupt links to work. DSDT fix is in bug 5966.
1087 * 2645, 2646 model numbers are shared with 600/600E/600X
1088 */
1089 .callback = disable_acpi_irq,
1090 .ident = "IBM Thinkpad 600 Series 2645",
1091 .matches = {
1092 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1093 DMI_MATCH(DMI_BOARD_NAME, "2645"),
1094 },
1095 },
1096 {
1097 .callback = disable_acpi_irq,
1098 .ident = "IBM Thinkpad 600 Series 2646",
1099 .matches = {
1100 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1101 DMI_MATCH(DMI_BOARD_NAME, "2646"),
1102 },
1103 },
1104 /*
1105 * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
1106 */
1107 { /* _BBN 0 bug */
1108 .callback = disable_acpi_pci,
1109 .ident = "ASUS PR-DLS",
1110 .matches = {
1111 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1112 DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
1113 DMI_MATCH(DMI_BIOS_VERSION,
1114 "ASUS PR-DLS ACPI BIOS Revision 1010"),
1115 DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
1116 },
1117 },
1118 {
1119 .callback = disable_acpi_pci,
1120 .ident = "Acer TravelMate 36x Laptop",
1121 .matches = {
1122 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
1123 DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1124 },
1125 },
1126 {}
1127};
1128
1129#endif /* __i386__ */
1130
1131/*
1132 * acpi_boot_table_init() and acpi_boot_init()
1133 * called from setup_arch(), always.
1134 * 1. checksums all tables
1135 * 2. enumerates lapics
1136 * 3. enumerates io-apics
1137 *
1138 * acpi_table_init() is separate to allow reading SRAT without
1139 * other side effects.
1140 *
1141 * side effects of acpi_boot_init:
1142 * acpi_lapic = 1 if LAPIC found
1143 * acpi_ioapic = 1 if IOAPIC found
1144 * if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
1145 * if acpi_blacklisted() acpi_disabled = 1;
1146 * acpi_irq_model=...
1147 * ...
1148 *
1149 * return value: (currently ignored)
1150 * 0: success
1151 * !0: failure
1152 */
1153
1154int __init acpi_boot_table_init(void)
1155{
1156 int error;
1157
1158#ifdef __i386__
1159 dmi_check_system(acpi_dmi_table);
1160#endif
1161
1162 /*
1163 * If acpi_disabled, bail out
1164 * One exception: acpi=ht continues far enough to enumerate LAPICs
1165 */
1166 if (acpi_disabled && !acpi_ht)
1167 return 1;
1168
1169 /*
1170 * Initialize the ACPI boot-time table parser.
1171 */
1172 error = acpi_table_init();
1173 if (error) {
1174 disable_acpi();
1175 return error;
1176 }
1177
1178 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
1179
1180 /*
1181 * blacklist may disable ACPI entirely
1182 */
1183 error = acpi_blacklisted();
1184 if (error) {
1185 if (acpi_force) {
1186 printk(KERN_WARNING PREFIX "acpi=force override\n");
1187 } else {
1188 printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1189 disable_acpi();
1190 return error;
1191 }
1192 }
1193
1194 return 0;
1195}
1196
1197int __init acpi_boot_init(void)
1198{
1199 /*
1200 * If acpi_disabled, bail out
1201 * One exception: acpi=ht continues far enough to enumerate LAPICs
1202 */
1203 if (acpi_disabled && !acpi_ht)
1204 return 1;
1205
1206 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
1207
1208 /*
1209 * set sci_int and PM timer address
1210 */
1211 acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt);
1212
1213 /*
1214 * Process the Multiple APIC Description Table (MADT), if present
1215 */
1216 acpi_process_madt();
1217
1218 acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
1219
1220 return 0;
1221}
1222
1223static int __init parse_acpi(char *arg)
1224{
1225 if (!arg)
1226 return -EINVAL;
1227
1228 /* "acpi=off" disables both ACPI table parsing and interpreter */
1229 if (strcmp(arg, "off") == 0) {
1230 disable_acpi();
1231 }
1232 /* acpi=force to over-ride black-list */
1233 else if (strcmp(arg, "force") == 0) {
1234 acpi_force = 1;
1235 acpi_ht = 1;
1236 acpi_disabled = 0;
1237 }
1238 /* acpi=strict disables out-of-spec workarounds */
1239 else if (strcmp(arg, "strict") == 0) {
1240 acpi_strict = 1;
1241 }
1242 /* Limit ACPI just to boot-time to enable HT */
1243 else if (strcmp(arg, "ht") == 0) {
1244 if (!acpi_force)
1245 disable_acpi();
1246 acpi_ht = 1;
1247 }
1248 /* "acpi=noirq" disables ACPI interrupt routing */
1249 else if (strcmp(arg, "noirq") == 0) {
1250 acpi_noirq_set();
1251 } else {
1252 /* Core will printk when we return error. */
1253 return -EINVAL;
1254 }
1255 return 0;
1256}
1257early_param("acpi", parse_acpi);
1258
1259/* FIXME: Using pci= for an ACPI parameter is a travesty. */
1260static int __init parse_pci(char *arg)
1261{
1262 if (arg && strcmp(arg, "noacpi") == 0)
1263 acpi_disable_pci();
1264 return 0;
1265}
1266early_param("pci", parse_pci);
1267
1268#ifdef CONFIG_X86_IO_APIC
1269static int __init parse_acpi_skip_timer_override(char *arg)
1270{
1271 acpi_skip_timer_override = 1;
1272 return 0;
1273}
1274early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
1275
1276static int __init parse_acpi_use_timer_override(char *arg)
1277{
1278 acpi_use_timer_override = 1;
1279 return 0;
1280}
1281early_param("acpi_use_timer_override", parse_acpi_use_timer_override);
1282#endif /* CONFIG_X86_IO_APIC */
1283
1284static int __init setup_acpi_sci(char *s)
1285{
1286 if (!s)
1287 return -EINVAL;
1288 if (!strcmp(s, "edge"))
1289 acpi_sci_flags = ACPI_MADT_TRIGGER_EDGE |
1290 (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK);
1291 else if (!strcmp(s, "level"))
1292 acpi_sci_flags = ACPI_MADT_TRIGGER_LEVEL |
1293 (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK);
1294 else if (!strcmp(s, "high"))
1295 acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_HIGH |
1296 (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK);
1297 else if (!strcmp(s, "low"))
1298 acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_LOW |
1299 (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK);
1300 else
1301 return -EINVAL;
1302 return 0;
1303}
1304early_param("acpi_sci", setup_acpi_sci);
1305
1306int __acpi_acquire_global_lock(unsigned int *lock)
1307{
1308 unsigned int old, new, val;
1309 do {
1310 old = *lock;
1311 new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
1312 val = cmpxchg(lock, old, new);
1313 } while (unlikely (val != old));
1314 return (new < 3) ? -1 : 0;
1315}
1316
1317int __acpi_release_global_lock(unsigned int *lock)
1318{
1319 unsigned int old, new, val;
1320 do {
1321 old = *lock;
1322 new = old & ~0x3;
1323 val = cmpxchg(lock, old, new);
1324 } while (unlikely (val != old));
1325 return old & 0x1;
1326}
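
The two helpers that close out boot.c implement the FACS global-lock handshake shared with the firmware: bit 0 of the lock dword is the pending flag, bit 1 the owned flag. Acquiring sets the owned bit, or, if the lock is already owned, sets pending instead (the return value tells the caller whether it actually got the lock or must wait for the release interrupt); releasing clears both bits and reports whether pending was set, so the caller knows to signal the firmware via GBL_RLS. A small user-space sketch of the same bit protocol, using a GCC builtin in place of the kernel's cmpxchg() and returning 1/0 for acquired/contended instead of the kernel's -1/0 convention:

/*
 * Standalone sketch of the FACS global-lock bit protocol used by
 * __acpi_acquire_global_lock()/__acpi_release_global_lock() above.
 * Bit 0 of the lock dword is "pending", bit 1 is "owned".
 */
#include <stdio.h>

static int acquire_global_lock(unsigned int *lock)
{
	unsigned int old, new;

	do {
		old = *lock;
		/* Set the owned bit; if already owned, set pending instead. */
		new = ((old & ~0x3u) + 2) + ((old >> 1) & 0x1);
	} while (__sync_val_compare_and_swap(lock, old, new) != old);

	/* Low bits == 2: we own it; == 3: owner exists, we only set pending. */
	return (new & 0x3) == 2;
}

static int release_global_lock(unsigned int *lock)
{
	unsigned int old, new;

	do {
		old = *lock;
		new = old & ~0x3u;	/* clear owned and pending */
	} while (__sync_val_compare_and_swap(lock, old, new) != old);

	/* Tell the caller whether the firmware must be signalled (GBL_RLS). */
	return old & 0x1;
}

int main(void)
{
	unsigned int lock = 0;

	printf("acquired: %d\n", acquire_global_lock(&lock));   /* 1 */
	printf("acquired: %d\n", acquire_global_lock(&lock));   /* 0, pending set */
	printf("pending was set: %d\n", release_global_lock(&lock));
	return 0;
}
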
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
new file mode 100644
index 000000000000..2d39f55d29a8
--- /dev/null
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -0,0 +1,164 @@
1/*
2 * arch/i386/kernel/acpi/cstate.c
3 *
4 * Copyright (C) 2005 Intel Corporation
5 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
6 * - Added _PDC for SMP C-states on Intel CPUs
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/acpi.h>
13#include <linux/cpu.h>
14#include <linux/sched.h>
15
16#include <acpi/processor.h>
17#include <asm/acpi.h>
18
19/*
20 * Initialize bm_flags based on the CPU cache properties
21 * On SMP it depends on cache configuration
22 * - When cache is not shared among all CPUs, we flush cache
23 * before entering C3.
24 * - When cache is shared among all CPUs, we use bm_check
25 * mechanism as in UP case
26 *
27 * This routine is called only after all the CPUs are online
28 */
29void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
30 unsigned int cpu)
31{
32 struct cpuinfo_x86 *c = cpu_data + cpu;
33
34 flags->bm_check = 0;
35 if (num_online_cpus() == 1)
36 flags->bm_check = 1;
37 else if (c->x86_vendor == X86_VENDOR_INTEL) {
38 /*
39 * Today all CPUs that support C3 share cache.
40 * TBD: This needs to look at cache shared map, once
41 * multi-core detection patch makes to the base.
42 */
43 flags->bm_check = 1;
44 }
45}
46EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
47
48/* The code below handles C-state entry with the MONITOR/MWAIT pair on Intel CPUs */
49
50struct cstate_entry {
51 struct {
52 unsigned int eax;
53 unsigned int ecx;
54 } states[ACPI_PROCESSOR_MAX_POWER];
55};
56static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */
57
58static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
59
60#define MWAIT_SUBSTATE_MASK (0xf)
61#define MWAIT_SUBSTATE_SIZE (4)
62
63#define CPUID_MWAIT_LEAF (5)
64#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
65#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
66
67#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
68
69#define NATIVE_CSTATE_BEYOND_HALT (2)
70
71int acpi_processor_ffh_cstate_probe(unsigned int cpu,
72 struct acpi_processor_cx *cx, struct acpi_power_register *reg)
73{
74 struct cstate_entry *percpu_entry;
75 struct cpuinfo_x86 *c = cpu_data + cpu;
76
77 cpumask_t saved_mask;
78 int retval;
79 unsigned int eax, ebx, ecx, edx;
80 unsigned int edx_part;
81 unsigned int cstate_type; /* C-state type and not ACPI C-state type */
82 unsigned int num_cstate_subtype;
83
84 if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF )
85 return -1;
86
87 if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
88 return -1;
89
90 percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
91 percpu_entry->states[cx->index].eax = 0;
92 percpu_entry->states[cx->index].ecx = 0;
93
94 /* Make sure we are running on right CPU */
95 saved_mask = current->cpus_allowed;
96 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
97 if (retval)
98 return -1;
99
100 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
101
102 /* Check whether this particular cx_type (in CST) is supported or not */
103 cstate_type = (cx->address >> MWAIT_SUBSTATE_SIZE) + 1;
104 edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
105 num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
106
107 retval = 0;
108 if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) {
109 retval = -1;
110 goto out;
111 }
112
113 /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
114 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
115 !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) {
116 retval = -1;
117 goto out;
118 }
119 percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
120
121 /* Use the hint in CST */
122 percpu_entry->states[cx->index].eax = cx->address;
123
124 if (!mwait_supported[cstate_type]) {
125 mwait_supported[cstate_type] = 1;
126 printk(KERN_DEBUG "Monitor-Mwait will be used to enter C-%d "
127 "state\n", cx->type);
128 }
129
130out:
131 set_cpus_allowed(current, saved_mask);
132 return retval;
133}
134EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
135
136void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
137{
138 unsigned int cpu = smp_processor_id();
139 struct cstate_entry *percpu_entry;
140
141 percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
142 mwait_idle_with_hints(percpu_entry->states[cx->index].eax,
143 percpu_entry->states[cx->index].ecx);
144}
145EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter);
146
147static int __init ffh_cstate_init(void)
148{
149 struct cpuinfo_x86 *c = &boot_cpu_data;
150 if (c->x86_vendor != X86_VENDOR_INTEL)
151 return -1;
152
153 cpu_cstate_entry = alloc_percpu(struct cstate_entry);
154 return 0;
155}
156
157static void __exit ffh_cstate_exit(void)
158{
159 free_percpu(cpu_cstate_entry);
160 cpu_cstate_entry = NULL;
161}
162
163arch_initcall(ffh_cstate_init);
164__exitcall(ffh_cstate_exit);
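
acpi_processor_ffh_cstate_probe() above validates a _CST FFH (MWAIT) hint against CPUID leaf 5: ECX advertises the MWAIT extensions (bit 0 = enumeration supported, bit 1 = break-on-interrupt), and EDX packs one 4-bit count of supported sub-C-states per MWAIT C-state, starting with C0 in bits 3:0. The hint itself carries the target C-state in its upper nibble and the sub-state in its lower nibble. A user-space sketch of the same decoding, with an illustrative hint value (assumes GCC/clang and <cpuid.h> on x86):

/*
 * Decode CPUID leaf 5 (MONITOR/MWAIT) the way
 * acpi_processor_ffh_cstate_probe() does for one FFH hint.
 */
#include <stdio.h>
#include <cpuid.h>

#define MWAIT_SUBSTATE_MASK	0xf
#define MWAIT_SUBSTATE_SIZE	4

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int hint = 0x10;	/* illustrative hint: target C2, sub-state 0 */
	unsigned int cstate_type, substates;

	if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx))
		return 1;		/* CPUID leaf 5 not available */

	/* Upper nibble of the hint selects the MWAIT C-state (plus one). */
	cstate_type = (hint >> MWAIT_SUBSTATE_SIZE) + 1;
	/* EDX holds a 4-bit sub-state count per C-state, C0 in bits 3:0. */
	substates = (edx >> (cstate_type * MWAIT_SUBSTATE_SIZE))
		    & MWAIT_SUBSTATE_MASK;

	printf("break-on-interrupt extension: %s\n", (ecx & 0x2) ? "yes" : "no");
	printf("sub-states for this C-state:  %u (hint asks for sub-state %u)\n",
	       substates, hint & MWAIT_SUBSTATE_MASK);
	return 0;
}
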
diff --git a/arch/x86/kernel/acpi/earlyquirk_32.c b/arch/x86/kernel/acpi/earlyquirk_32.c
new file mode 100644
index 000000000000..23f78efc577d
--- /dev/null
+++ b/arch/x86/kernel/acpi/earlyquirk_32.c
@@ -0,0 +1,84 @@
1/*
2 * Do early PCI probing for bug detection when the main PCI subsystem is
3 * not up yet.
4 */
5#include <linux/init.h>
6#include <linux/kernel.h>
7#include <linux/pci.h>
8#include <linux/acpi.h>
9
10#include <asm/pci-direct.h>
11#include <asm/acpi.h>
12#include <asm/apic.h>
13
14#ifdef CONFIG_ACPI
15
16static int __init nvidia_hpet_check(struct acpi_table_header *header)
17{
18 return 0;
19}
20#endif
21
22static int __init check_bridge(int vendor, int device)
23{
24#ifdef CONFIG_ACPI
25 static int warned;
26 /* According to Nvidia all timer overrides are bogus unless HPET
27 is enabled. */
28 if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) {
29 if (!warned && acpi_table_parse(ACPI_SIG_HPET,
30 nvidia_hpet_check)) {
31 warned = 1;
32 acpi_skip_timer_override = 1;
33 printk(KERN_INFO "Nvidia board "
34 "detected. Ignoring ACPI "
35 "timer override.\n");
36 printk(KERN_INFO "If you got timer trouble "
37 "try acpi_use_timer_override\n");
38
39 }
40 }
41#endif
42 if (vendor == PCI_VENDOR_ID_ATI && timer_over_8254 == 1) {
43 timer_over_8254 = 0;
44 printk(KERN_INFO "ATI board detected. Disabling timer routing "
45 "over 8254.\n");
46 }
47 return 0;
48}
49
50void __init check_acpi_pci(void)
51{
52 int num, slot, func;
53
54 /* Assume the machine supports type 1. If not it will
55 always read ffffffff and should not have any side effect.
56 Actually a few buggy systems can machine check. Allow the user
57 to disable it by command line option at least -AK */
58 if (!early_pci_allowed())
59 return;
60
61 /* Poor man's PCI discovery */
62 for (num = 0; num < 32; num++) {
63 for (slot = 0; slot < 32; slot++) {
64 for (func = 0; func < 8; func++) {
65 u32 class;
66 u32 vendor;
67 class = read_pci_config(num, slot, func,
68 PCI_CLASS_REVISION);
69 if (class == 0xffffffff)
70 break;
71
72 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
73 continue;
74
75 vendor = read_pci_config(num, slot, func,
76 PCI_VENDOR_ID);
77
78 if (check_bridge(vendor & 0xffff, vendor >> 16))
79 return;
80 }
81
82 }
83 }
84}
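
check_acpi_pci() above walks bus 0..31 and every device/function by hand, using the type 1 configuration mechanism behind read_pci_config() from <asm/pci-direct.h>: an address word written to port 0xCF8 selects bus, device, function and register, and the data comes back from port 0xCFC. A hedged user-space sketch of that mechanism (needs root and iopl(3); on a real system prefer the kernel's or libpci's helpers):

/*
 * Sketch of a type 1 PCI configuration read, the mechanism behind
 * read_pci_config() used in the scan loop above.
 */
#include <stdio.h>
#include <sys/io.h>

static unsigned int pci_conf1_read(unsigned int bus, unsigned int dev,
				   unsigned int func, unsigned int off)
{
	unsigned int addr = 0x80000000u | (bus << 16) | (dev << 11) |
			    (func << 8) | (off & 0xfc);

	outl(addr, 0xcf8);	/* enable bit + bus/dev/func/dword offset */
	return inl(0xcfc);	/* data window */
}

int main(void)
{
	if (iopl(3))
		return 1;	/* need I/O privilege */
	/* Vendor/device ID of bus 0, device 0, function 0. */
	printf("%08x\n", pci_conf1_read(0, 0, 0, 0));
	return 0;
}
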
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
new file mode 100644
index 000000000000..b54fded49834
--- /dev/null
+++ b/arch/x86/kernel/acpi/processor.c
@@ -0,0 +1,75 @@
1/*
2 * arch/i386/kernel/acpi/processor.c
3 *
4 * Copyright (C) 2005 Intel Corporation
5 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
6 * - Added _PDC for platforms with Intel CPUs
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/acpi.h>
13
14#include <acpi/processor.h>
15#include <asm/acpi.h>
16
17static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
18{
19 struct acpi_object_list *obj_list;
20 union acpi_object *obj;
21 u32 *buf;
22
23 /* allocate and initialize pdc. It will be used later. */
24 obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL);
25 if (!obj_list) {
26 printk(KERN_ERR "Memory allocation error\n");
27 return;
28 }
29
30 obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL);
31 if (!obj) {
32 printk(KERN_ERR "Memory allocation error\n");
33 kfree(obj_list);
34 return;
35 }
36
37 buf = kmalloc(12, GFP_KERNEL);
38 if (!buf) {
39 printk(KERN_ERR "Memory allocation error\n");
40 kfree(obj);
41 kfree(obj_list);
42 return;
43 }
44
45 buf[0] = ACPI_PDC_REVISION_ID;
46 buf[1] = 1;
47 buf[2] = ACPI_PDC_C_CAPABILITY_SMP;
48
49 if (cpu_has(c, X86_FEATURE_EST))
50 buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
51
52 obj->type = ACPI_TYPE_BUFFER;
53 obj->buffer.length = 12;
54 obj->buffer.pointer = (u8 *) buf;
55 obj_list->count = 1;
56 obj_list->pointer = obj;
57 pr->pdc = obj_list;
58
59 return;
60}
61
62/* Initialize _PDC data based on the CPU vendor */
63void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
64{
65 unsigned int cpu = pr->id;
66 struct cpuinfo_x86 *c = cpu_data + cpu;
67
68 pr->pdc = NULL;
69 if (c->x86_vendor == X86_VENDOR_INTEL)
70 init_intel_pdc(pr, c);
71
72 return;
73}
74
75EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
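
init_intel_pdc() above hands the ACPI processor driver a pre-built _PDC argument: a single ACPI buffer of three 32-bit words holding the _PDC revision, the number of capability dwords that follow, and one capability dword (SMP C-state capability, plus EST coordination when the CPU has Enhanced SpeedStep). A tiny sketch of that 12-byte layout; the field values here are illustrative:

/* The 12-byte _PDC buffer: revision, dword count, capability bits. */
#include <stdio.h>
#include <stdint.h>

struct pdc_buffer {
	uint32_t revision;	/* _PDC revision id */
	uint32_t count;		/* number of capability dwords that follow */
	uint32_t caps;		/* capability bits, e.g. SMP C-states, EST */
};

int main(void)
{
	struct pdc_buffer pdc = { .revision = 1, .count = 1, .caps = 0 };

	printf("_PDC buffer length: %zu bytes\n", sizeof(pdc));	/* 12 */
	return 0;
}
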
diff --git a/arch/x86/kernel/acpi/sleep_32.c b/arch/x86/kernel/acpi/sleep_32.c
new file mode 100644
index 000000000000..10699489cfe7
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep_32.c
@@ -0,0 +1,110 @@
1/*
2 * sleep.c - x86-specific ACPI sleep support.
3 *
4 * Copyright (C) 2001-2003 Patrick Mochel
5 * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
6 */
7
8#include <linux/acpi.h>
9#include <linux/bootmem.h>
10#include <linux/dmi.h>
11#include <linux/cpumask.h>
12
13#include <asm/smp.h>
14
15/* address in low memory of the wakeup routine. */
16unsigned long acpi_wakeup_address = 0;
17unsigned long acpi_realmode_flags;
18extern char wakeup_start, wakeup_end;
19
20extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
21
22/**
23 * acpi_save_state_mem - save kernel state
24 *
25 * Create an identity mapped page table and copy the wakeup routine to
26 * low memory.
27 */
28int acpi_save_state_mem(void)
29{
30 if (!acpi_wakeup_address)
31 return 1;
32 memcpy((void *)acpi_wakeup_address, &wakeup_start,
33 &wakeup_end - &wakeup_start);
34 acpi_copy_wakeup_routine(acpi_wakeup_address);
35
36 return 0;
37}
38
39/*
40 * acpi_restore_state - undo effects of acpi_save_state_mem
41 */
42void acpi_restore_state_mem(void)
43{
44}
45
46/**
47 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
48 *
49 * We allocate a page from the first 1MB of memory for the wakeup
50 * routine for when we come back from a sleep state. The
51 * runtime allocator allows specification of <16MB pages, but not
52 * <1MB pages.
53 */
54void __init acpi_reserve_bootmem(void)
55{
56 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
57 printk(KERN_ERR
58 "ACPI: Wakeup code way too big, S3 disabled.\n");
59 return;
60 }
61
62 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
63 if (!acpi_wakeup_address)
64 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
65}
66
67static int __init acpi_sleep_setup(char *str)
68{
69 while ((str != NULL) && (*str != '\0')) {
70 if (strncmp(str, "s3_bios", 7) == 0)
71 acpi_realmode_flags |= 1;
72 if (strncmp(str, "s3_mode", 7) == 0)
73 acpi_realmode_flags |= 2;
74 if (strncmp(str, "s3_beep", 7) == 0)
75 acpi_realmode_flags |= 4;
76 str = strchr(str, ',');
77 if (str != NULL)
78 str += strspn(str, ", \t");
79 }
80 return 1;
81}
82
83__setup("acpi_sleep=", acpi_sleep_setup);
84
85/* Ouch, we want to delete this. We already have a better version in userspace,
86 in s2ram from the suspend.sf.net project */
87static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
88{
89 acpi_realmode_flags |= 2;
90 return 0;
91}
92
93static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
94 { /* Reset video mode after returning from ACPI S3 sleep */
95 .callback = reset_videomode_after_s3,
96 .ident = "Toshiba Satellite 4030cdt",
97 .matches = {
98 DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
99 },
100 },
101 {}
102};
103
104static int __init acpisleep_dmi_init(void)
105{
106 dmi_check_system(acpisleep_dmi_table);
107 return 0;
108}
109
110core_initcall(acpisleep_dmi_init);
diff --git a/arch/x86/kernel/acpi/sleep_64.c b/arch/x86/kernel/acpi/sleep_64.c
new file mode 100644
index 000000000000..79475d237071
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep_64.c
@@ -0,0 +1,120 @@
1/*
2 * acpi.c - Architecture-Specific Low-Level ACPI Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7 * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
8 * Copyright (C) 2003 Pavel Machek, SuSE Labs
9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 */
28
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/types.h>
32#include <linux/stddef.h>
33#include <linux/slab.h>
34#include <linux/pci.h>
35#include <linux/bootmem.h>
36#include <linux/acpi.h>
37#include <linux/cpumask.h>
38
39#include <asm/mpspec.h>
40#include <asm/io.h>
41#include <asm/apic.h>
42#include <asm/apicdef.h>
43#include <asm/page.h>
44#include <asm/pgtable.h>
45#include <asm/pgalloc.h>
46#include <asm/io_apic.h>
47#include <asm/proto.h>
48#include <asm/tlbflush.h>
49
50/* --------------------------------------------------------------------------
51 Low-Level Sleep Support
52 -------------------------------------------------------------------------- */
53
54/* address in low memory of the wakeup routine. */
55unsigned long acpi_wakeup_address = 0;
56unsigned long acpi_realmode_flags;
57extern char wakeup_start, wakeup_end;
58
59extern unsigned long acpi_copy_wakeup_routine(unsigned long);
60
61/**
62 * acpi_save_state_mem - save kernel state
63 *
64 * Create an identity mapped page table and copy the wakeup routine to
65 * low memory.
66 */
67int acpi_save_state_mem(void)
68{
69 memcpy((void *)acpi_wakeup_address, &wakeup_start,
70 &wakeup_end - &wakeup_start);
71 acpi_copy_wakeup_routine(acpi_wakeup_address);
72
73 return 0;
74}
75
76/*
77 * acpi_restore_state
78 */
79void acpi_restore_state_mem(void)
80{
81}
82
83/**
84 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
85 *
86 * We allocate a page in low memory for the wakeup
87 * routine for when we come back from a sleep state. The
88 * runtime allocator allows specification of <16M pages, but not
89 * <1M pages.
90 */
91void __init acpi_reserve_bootmem(void)
92{
93 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
94 if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
95 printk(KERN_CRIT
96 "ACPI: Wakeup code way too big, will crash on attempt"
97 " to suspend\n");
98}
99
100static int __init acpi_sleep_setup(char *str)
101{
102 while ((str != NULL) && (*str != '\0')) {
103 if (strncmp(str, "s3_bios", 7) == 0)
104 acpi_realmode_flags |= 1;
105 if (strncmp(str, "s3_mode", 7) == 0)
106 acpi_realmode_flags |= 2;
107 if (strncmp(str, "s3_beep", 7) == 0)
108 acpi_realmode_flags |= 4;
109 str = strchr(str, ',');
110 if (str != NULL)
111 str += strspn(str, ", \t");
112 }
113 return 1;
114}
115
116__setup("acpi_sleep=", acpi_sleep_setup);
117
118void acpi_pci_link_exit(void)
119{
120}
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
new file mode 100644
index 000000000000..f22ba8534d26
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -0,0 +1,321 @@
1.text
2#include <linux/linkage.h>
3#include <asm/segment.h>
4#include <asm/page.h>
5
6#
7# wakeup_code runs in real mode, at an unknown address (determined at run-time).
8# Therefore it must only use relative jumps/calls.
9#
10# Do we need to deal with A20? It is okay: the ACPI spec says A20 must be enabled.
11#
12# If physical address of wakeup_code is 0x12345, BIOS should call us with
13# cs = 0x1234, eip = 0x05
14#
15
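#
# A note on the BEEP macro below (a sketch of what the raw port I/O does):
# port 0x61 (97) gates the PC speaker, port 0x43 (67) is the PIT command
# register and port 0x42 (66) is PIT channel 2.  0xb6 (-74) selects channel 2,
# lobyte/hibyte access, square-wave mode; the divisor bytes 0x89 (-119) and
# 0x0f give roughly a 300 Hz tone.  The writes to port 0x80 are the
# traditional short I/O delay.
#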
16#define BEEP \
17 inb $97, %al; \
18 outb %al, $0x80; \
19 movb $3, %al; \
20 outb %al, $97; \
21 outb %al, $0x80; \
22 movb $-74, %al; \
23 outb %al, $67; \
24 outb %al, $0x80; \
25 movb $-119, %al; \
26 outb %al, $66; \
27 outb %al, $0x80; \
28 movb $15, %al; \
29 outb %al, $66;
30
31ALIGN
32 .align 4096
33ENTRY(wakeup_start)
34wakeup_code:
35 wakeup_code_start = .
36 .code16
37
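	# Single characters are poked straight into the VGA text buffer
	# (segment 0xb800) as a crude progress trace that works before any
	# console exists; 0x0e00 is the yellow-on-black attribute byte.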
38 movw $0xb800, %ax
39 movw %ax,%fs
40 movw $0x0e00 + 'L', %fs:(0x10)
41
42 cli
43 cld
44
45 # setup data segment
46 movw %cs, %ax
47 movw %ax, %ds # Make ds:0 point to wakeup_start
48 movw %ax, %ss
49
50 testl $4, realmode_flags - wakeup_code
51 jz 1f
52 BEEP
531:
54 mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board
55 movw $0x0e00 + 'S', %fs:(0x12)
56
57 pushl $0 # Kill any dangerous flags
58 popfl
59
60 movl real_magic - wakeup_code, %eax
61 cmpl $0x12345678, %eax
62 jne bogus_real_magic
63
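	# acpi_sleep=s3_bios (bit 0): call the video BIOS init entry point at
	# c000:0003 to reinitialize the graphics card; the BIOS call may
	# clobber %ds/%ss, so they are reloaded below.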
64 testl $1, realmode_flags - wakeup_code
65 jz 1f
66 lcall $0xc000,$3
67 movw %cs, %ax
68 movw %ax, %ds # Bios might have played with that
69 movw %ax, %ss
701:
71
72 testl $2, realmode_flags - wakeup_code
73 jz 1f
74 mov video_mode - wakeup_code, %ax
75 call mode_set
761:
77
78 # set up page table
79 movl $swsusp_pg_dir-__PAGE_OFFSET, %eax
80 movl %eax, %cr3
81
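	# 0xc0000080 is the EFER MSR: if NX was in use, restore the saved EFER
	# value (it carries the NX enable bit) before paging is turned back on.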
82 testl $1, real_efer_save_restore - wakeup_code
83 jz 4f
84 # restore efer setting
85 movl real_save_efer_edx - wakeup_code, %edx
86 movl real_save_efer_eax - wakeup_code, %eax
87 mov $0xc0000080, %ecx
88 wrmsr
894:
90 # make sure %cr4 is set correctly (features, etc)
91 movl real_save_cr4 - wakeup_code, %eax
92 movl %eax, %cr4
93 movw $0xb800, %ax
94 movw %ax,%fs
95 movw $0x0e00 + 'i', %fs:(0x12)
96
97 # need a gdt -- use lgdtl to force 32-bit operands, in case
98 # the GDT is located past 16 megabytes.
99 lgdtl real_save_gdt - wakeup_code
100
101 movl real_save_cr0 - wakeup_code, %eax
102 movl %eax, %cr0
103 jmp 1f
1041:
105 movw $0x0e00 + 'n', %fs:(0x14)
106
107 movl real_magic - wakeup_code, %eax
108 cmpl $0x12345678, %eax
109 jne bogus_real_magic
110
111 testl $8, realmode_flags - wakeup_code
112 jz 1f
113 BEEP
1141:
115 ljmpl $__KERNEL_CS, $wakeup_pmode_return
116
117real_save_gdt: .word 0
118 .long 0
119real_save_cr0: .long 0
120real_save_cr3: .long 0
121real_save_cr4: .long 0
122real_magic: .long 0
123video_mode: .long 0
124realmode_flags: .long 0
125beep_flags: .long 0
126real_efer_save_restore: .long 0
127real_save_efer_edx: .long 0
128real_save_efer_eax: .long 0
129
130bogus_real_magic:
131 movw $0x0e00 + 'B', %fs:(0x12)
132 jmp bogus_real_magic
133
134/* This code uses an extended set of video mode numbers. These include:
135 * Aliases for standard modes
136 * NORMAL_VGA (-1)
137 * EXTENDED_VGA (-2)
138 * ASK_VGA (-3)
139 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
140 * of compatibility when extending the table. These are between 0x00 and 0xff.
141 */
142#define VIDEO_FIRST_MENU 0x0000
143
144/* Standard BIOS video modes (BIOS number + 0x0100) */
145#define VIDEO_FIRST_BIOS 0x0100
146
147/* VESA BIOS video modes (VESA number + 0x0200) */
148#define VIDEO_FIRST_VESA 0x0200
149
150/* Video7 special modes (BIOS number + 0x0900) */
151#define VIDEO_FIRST_V7 0x0900
152
153# Set the user-requested video mode (AX=mode ID) => CF=success
154
155# For now, we only handle VESA modes (0x0200..0x03ff). To handle other
156# modes, we should probably compile in the video code from the boot
157# directory.
158mode_set:
159 movw %ax, %bx
160 subb $VIDEO_FIRST_VESA>>8, %bh
161 cmpb $2, %bh
162 jb check_vesa
163
164setbad:
165 clc
166 ret
167
168check_vesa:
169 orw $0x4000, %bx # Use linear frame buffer
170 movw $0x4f02, %ax # VESA BIOS mode set call
171 int $0x10
172 cmpw $0x004f, %ax # AL=4f if implemented
173 jnz setbad # AH=0 if OK
174
175 stc
176 ret
177
178 .code32
179 ALIGN
180
181.org 0x800
182wakeup_stack_begin: # Stack grows down
183
184.org 0xff0 # Just below end of page
185wakeup_stack:
186ENTRY(wakeup_end)
187
188.org 0x1000
189
190wakeup_pmode_return:
191 movw $__KERNEL_DS, %ax
192 movw %ax, %ss
193 movw %ax, %ds
194 movw %ax, %es
195 movw %ax, %fs
196 movw %ax, %gs
197 movw $0x0e00 + 'u', 0xb8016
198
199 # reload the gdt, as we need the full 32 bit address
200 lgdt saved_gdt
201 lidt saved_idt
202 lldt saved_ldt
203 ljmp $(__KERNEL_CS),$1f
2041:
205 movl %cr3, %eax
206 movl %eax, %cr3
207 wbinvd
208
209 # and restore the stack ... but you need gdt for this to work
210 movl saved_context_esp, %esp
211
212 movl %cs:saved_magic, %eax
213 cmpl $0x12345678, %eax
214 jne bogus_magic
215
216 # jump to place where we left off
217 movl saved_eip,%eax
218 jmp *%eax
219
220bogus_magic:
221 movw $0x0e00 + 'B', 0xb8018
222 jmp bogus_magic
223
224
225##
226# acpi_copy_wakeup_routine
227#
228# Copy the above routine to low memory.
229#
230# Parameters:
231# %eax: place to copy wakeup routine to
232#
233# Returned address is location of code in low memory (past data and stack)
234#
235ENTRY(acpi_copy_wakeup_routine)
236
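	# On entry %eax holds the low-memory address the wakeup image was
	# copied to (acpi_wakeup_address).  "symbol - wakeup_start(%eax)"
	# therefore addresses each variable inside that copy rather than in
	# the kernel's own image.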
237 pushl %ebx
238 sgdt saved_gdt
239 sidt saved_idt
240 sldt saved_ldt
241 str saved_tss
242
243 movl nx_enabled, %edx
244 movl %edx, real_efer_save_restore - wakeup_start (%eax)
245 testl $1, real_efer_save_restore - wakeup_start (%eax)
246 jz 2f
247 # save efer setting
248 pushl %eax
249 movl %eax, %ebx
250 mov $0xc0000080, %ecx
251 rdmsr
252 movl %edx, real_save_efer_edx - wakeup_start (%ebx)
253 movl %eax, real_save_efer_eax - wakeup_start (%ebx)
254 popl %eax
2552:
256
257 movl %cr3, %edx
258 movl %edx, real_save_cr3 - wakeup_start (%eax)
259 movl %cr4, %edx
260 movl %edx, real_save_cr4 - wakeup_start (%eax)
261 movl %cr0, %edx
262 movl %edx, real_save_cr0 - wakeup_start (%eax)
263 sgdt real_save_gdt - wakeup_start (%eax)
264
265 movl saved_videomode, %edx
266 movl %edx, video_mode - wakeup_start (%eax)
267 movl acpi_realmode_flags, %edx
268 movl %edx, realmode_flags - wakeup_start (%eax)
269 movl $0x12345678, real_magic - wakeup_start (%eax)
270 movl $0x12345678, saved_magic
271 popl %ebx
272 ret
273
274save_registers:
275 leal 4(%esp), %eax
276 movl %eax, saved_context_esp
277 movl %ebx, saved_context_ebx
278 movl %ebp, saved_context_ebp
279 movl %esi, saved_context_esi
280 movl %edi, saved_context_edi
281 pushfl ; popl saved_context_eflags
282
283 movl $ret_point, saved_eip
284 ret
285
286
287restore_registers:
288 movl saved_context_ebp, %ebp
289 movl saved_context_ebx, %ebx
290 movl saved_context_esi, %esi
291 movl saved_context_edi, %edi
292 pushl saved_context_eflags ; popfl
293 ret
294
295ENTRY(do_suspend_lowlevel)
296 call save_processor_state
297 call save_registers
298 pushl $3
299 call acpi_enter_sleep_state
300 addl $4, %esp
301
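# If acpi_enter_sleep_state(3) succeeds the machine powers down here; on
# wake-up the BIOS runs the copied wakeup code, which switches back to
# protected mode and jumps to saved_eip, i.e. to ret_point below.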
302# In case of S3 failure, we'll emerge here. Jump
303# to ret_point to recover
304 jmp ret_point
305 .p2align 4,,7
306ret_point:
307 call restore_registers
308 call restore_processor_state
309 ret
310
311.data
312ALIGN
313ENTRY(saved_magic) .long 0
314ENTRY(saved_eip) .long 0
315
316# saved registers
317saved_gdt: .long 0,0
318saved_idt: .long 0,0
319saved_ldt: .long 0
320saved_tss: .long 0
321
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
new file mode 100644
index 000000000000..8b4357e1efe0
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -0,0 +1,456 @@
1.text
2#include <linux/linkage.h>
3#include <asm/segment.h>
4#include <asm/pgtable.h>
5#include <asm/page.h>
6#include <asm/msr.h>
7
8# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
9#
10# wakeup_code runs in real mode, and at an unknown address (determined at run time).
11# Therefore it must only use relative jumps/calls.
12#
13# Do we need to deal with A20? It is okay: the ACPI spec says A20 must be enabled.
14#
15# If the physical address of wakeup_code is 0x12345, the BIOS should call us with
16# cs = 0x1234, eip = 0x05
17#
18
19#define BEEP \
20 inb $97, %al; \
21 outb %al, $0x80; \
22 movb $3, %al; \
23 outb %al, $97; \
24 outb %al, $0x80; \
25 movb $-74, %al; \
26 outb %al, $67; \
27 outb %al, $0x80; \
28 movb $-119, %al; \
29 outb %al, $66; \
30 outb %al, $0x80; \
31 movb $15, %al; \
32 outb %al, $66;
33
34
35ALIGN
36 .align 16
37ENTRY(wakeup_start)
38wakeup_code:
39 wakeup_code_start = .
40 .code16
41
42# Running in a *copy* of this code, somewhere in the low 1MB.
43
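	# Byte writes to port 0x80 leave POST-style progress codes that a
	# port-0x80 debug card can display; they double as a short I/O delay.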
44 movb $0xa1, %al ; outb %al, $0x80
45 cli
46 cld
47 # setup data segment
48 movw %cs, %ax
49 movw %ax, %ds # Make ds:0 point to wakeup_start
50 movw %ax, %ss
51
52 # Data segment must be set up before we can see whether to beep.
53 testl $4, realmode_flags - wakeup_code
54 jz 1f
55 BEEP
561:
57
58 # Private stack is needed for ASUS board
59 mov $(wakeup_stack - wakeup_code), %sp
60
61 pushl $0 # Kill any dangerous flags
62 popfl
63
64 movl real_magic - wakeup_code, %eax
65 cmpl $0x12345678, %eax
66 jne bogus_real_magic
67
68 call verify_cpu # Verify the cpu supports long
69 # mode
70 testl %eax, %eax
71 jnz no_longmode
72
73 testl $1, realmode_flags - wakeup_code
74 jz 1f
75 lcall $0xc000,$3
76 movw %cs, %ax
77 movw %ax, %ds # Bios might have played with that
78 movw %ax, %ss
791:
80
81 testl $2, realmode_flags - wakeup_code
82 jz 1f
83 mov video_mode - wakeup_code, %ax
84 call mode_set
851:
86
87 movw $0xb800, %ax
88 movw %ax,%fs
89 movw $0x0e00 + 'L', %fs:(0x10)
90
91 movb $0xa2, %al ; outb %al, $0x80
92
93	mov %ds, %ax			# Find 32bit wakeup_code addr
94	movzx %ax, %esi			# (Convert the %ds segment to a linear ptr)
95 shll $4, %esi
96 # Fix up the vectors
97 addl %esi, wakeup_32_vector - wakeup_code
98 addl %esi, wakeup_long64_vector - wakeup_code
99 addl %esi, gdt_48a + 2 - wakeup_code # Fixup the gdt pointer
100
101 lidtl %ds:idt_48a - wakeup_code
102 lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is
103 # appropriate
104
105 movl $1, %eax # protected mode (PE) bit
106 lmsw %ax # This is it!
107 jmp 1f
1081:
109
110 ljmpl *(wakeup_32_vector - wakeup_code)
111
112 .balign 4
113wakeup_32_vector:
114 .long wakeup_32 - wakeup_code
115 .word __KERNEL32_CS, 0
116
117 .code32
118wakeup_32:
119# Running in this code, but at low address; paging is not yet turned on.
120 movb $0xa5, %al ; outb %al, $0x80
121
122 movl $__KERNEL_DS, %eax
123 movl %eax, %ds
124
125 movw $0x0e00 + 'i', %ds:(0xb8012)
126 movb $0xa8, %al ; outb %al, $0x80;
127
128	/*
129	 * Prepare for entering 64-bit mode
130	 */
131
132 /* Enable PAE */
133 xorl %eax, %eax
134 btsl $5, %eax
135 movl %eax, %cr4
136
137 /* Setup early boot stage 4 level pagetables */
138 leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax
139 movl %eax, %cr3
140
141 /* Check if nx is implemented */
142 movl $0x80000001, %eax
143 cpuid
144 movl %edx,%edi
145
146 /* Enable Long Mode */
147 xorl %eax, %eax
148 btsl $_EFER_LME, %eax
149
150 /* No Execute supported? */
151 btl $20,%edi
152 jnc 1f
153 btsl $_EFER_NX, %eax
154
155 /* Make changes effective */
1561: movl $MSR_EFER, %ecx
157 xorl %edx, %edx
158 wrmsr
159
160 xorl %eax, %eax
161 btsl $31, %eax /* Enable paging and in turn activate Long Mode */
162 btsl $0, %eax /* Enable protected mode */
163
164 /* Make changes effective */
165 movl %eax, %cr0
166
167 /* At this point:
168 CR4.PAE must be 1
169 CS.L must be 0
170 CR3 must point to PML4
171 Next instruction must be a branch
172	   This must be on an identity-mapped page
173 */
174	/*
175	 * At this point we're in long mode, but in 32-bit compatibility mode
176	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
177	 * EFER.LMA = 1). Now we want to jump into 64-bit mode; to do that we
178	 * load the new gdt/idt that has __KERNEL_CS with CS.L = 1.
179	 */
180
181 /* Finally jump in 64bit mode */
182 ljmp *(wakeup_long64_vector - wakeup_code)(%esi)
183
184 .balign 4
185wakeup_long64_vector:
186 .long wakeup_long64 - wakeup_code
187 .word __KERNEL_CS, 0
188
189.code64
190
191 /* Hooray, we are in Long 64-bit mode (but still running in
192 * low memory)
193 */
194wakeup_long64:
195	/*
196	 * We must switch to a new descriptor in kernel space for the GDT,
197	 * because soon the kernel won't have access any more to the userspace
198	 * addresses where we're currently running. We have to do that here
199	 * because in 32-bit mode we couldn't load a 64-bit linear address.
200	 */
201 lgdt cpu_gdt_descr
202
203 movw $0x0e00 + 'n', %ds:(0xb8014)
204 movb $0xa9, %al ; outb %al, $0x80
205
206 movq saved_magic, %rax
207 movq $0x123456789abcdef0, %rdx
208 cmpq %rdx, %rax
209 jne bogus_64_magic
210
211 movw $0x0e00 + 'u', %ds:(0xb8016)
212
213 nop
214 nop
215 movw $__KERNEL_DS, %ax
216 movw %ax, %ss
217 movw %ax, %ds
218 movw %ax, %es
219 movw %ax, %fs
220 movw %ax, %gs
221 movq saved_rsp, %rsp
222
223 movw $0x0e00 + 'x', %ds:(0xb8018)
224 movq saved_rbx, %rbx
225 movq saved_rdi, %rdi
226 movq saved_rsi, %rsi
227 movq saved_rbp, %rbp
228
229 movw $0x0e00 + '!', %ds:(0xb801a)
230 movq saved_rip, %rax
231 jmp *%rax
232
233.code32
234
235 .align 64
236gdta:
237	/* It's good to keep this gdt in sync with the one in trampoline.S */
238	.word 0, 0, 0, 0			# dummy
239	/* ??? Why do I need the accessed bit set in order for this to work? */
240 .quad 0x00cf9b000000ffff # __KERNEL32_CS
241 .quad 0x00af9b000000ffff # __KERNEL_CS
242 .quad 0x00cf93000000ffff # __KERNEL_DS
243
244idt_48a:
245 .word 0 # idt limit = 0
246 .word 0, 0 # idt base = 0L
247
248gdt_48a:
249 .word 0x800 # gdt limit=2048,
250 # 256 GDT entries
251 .long gdta - wakeup_code # gdt base (relocated in later)
252
253real_magic: .quad 0
254video_mode: .quad 0
255realmode_flags: .quad 0
256
257.code16
258bogus_real_magic:
259 movb $0xba,%al ; outb %al,$0x80
260 jmp bogus_real_magic
261
262.code64
263bogus_64_magic:
264 movb $0xb3,%al ; outb %al,$0x80
265 jmp bogus_64_magic
266
267.code16
268no_longmode:
269 movb $0xbc,%al ; outb %al,$0x80
270 jmp no_longmode
271
272#include "../verify_cpu_64.S"
273
274/* This code uses an extended set of video mode numbers. These include:
275 * Aliases for standard modes
276 * NORMAL_VGA (-1)
277 * EXTENDED_VGA (-2)
278 * ASK_VGA (-3)
279 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
280 * of compatibility when extending the table. These are between 0x00 and 0xff.
281 */
282#define VIDEO_FIRST_MENU 0x0000
283
284/* Standard BIOS video modes (BIOS number + 0x0100) */
285#define VIDEO_FIRST_BIOS 0x0100
286
287/* VESA BIOS video modes (VESA number + 0x0200) */
288#define VIDEO_FIRST_VESA 0x0200
289
290/* Video7 special modes (BIOS number + 0x0900) */
291#define VIDEO_FIRST_V7 0x0900
292
293# Set the user-requested video mode (AX=mode ID) => CF=success
294
295# For now, we only handle VESA modes (0x0200..0x03ff). To handle other
296# modes, we should probably compile in the video code from the boot
297# directory.
298.code16
299mode_set:
300 movw %ax, %bx
301 subb $VIDEO_FIRST_VESA>>8, %bh
302 cmpb $2, %bh
303 jb check_vesa
304
305setbad:
306 clc
307 ret
308
309check_vesa:
310 orw $0x4000, %bx # Use linear frame buffer
311 movw $0x4f02, %ax # VESA BIOS mode set call
312 int $0x10
313 cmpw $0x004f, %ax # AL=4f if implemented
314 jnz setbad # AH=0 if OK
315
316 stc
317 ret
318
319wakeup_stack_begin: # Stack grows down
320
321.org 0xff0
322wakeup_stack: # Just below end of page
323
324.org 0x1000
325ENTRY(wakeup_level4_pgt)
326 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
327 .fill 510,8,0
328 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
329 .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
330
331ENTRY(wakeup_end)
332
333##
334# acpi_copy_wakeup_routine
335#
336# Copy the above routine to low memory.
337#
338# Parameters:
339# %rdi: place to copy wakeup routine to
340#
341# Returned address is location of code in low memory (past data and stack)
342#
343 .code64
344ENTRY(acpi_copy_wakeup_routine)
345 pushq %rax
346 pushq %rdx
347
348 movl saved_video_mode, %edx
349 movl %edx, video_mode - wakeup_start (,%rdi)
350 movl acpi_realmode_flags, %edx
351 movl %edx, realmode_flags - wakeup_start (,%rdi)
352 movq $0x12345678, real_magic - wakeup_start (,%rdi)
353 movq $0x123456789abcdef0, %rdx
354 movq %rdx, saved_magic
355
356 movq saved_magic, %rax
357 movq $0x123456789abcdef0, %rdx
358 cmpq %rdx, %rax
359 jne bogus_64_magic
360
361 # restore the regs we used
362 popq %rdx
363 popq %rax
364ENTRY(do_suspend_lowlevel_s4bios)
365 ret
366
367 .align 2
368 .p2align 4,,15
369.globl do_suspend_lowlevel
370 .type do_suspend_lowlevel,@function
371do_suspend_lowlevel:
372.LFB5:
373 subq $8, %rsp
374 xorl %eax, %eax
375 call save_processor_state
376
377 movq %rsp, saved_context_esp(%rip)
378 movq %rax, saved_context_eax(%rip)
379 movq %rbx, saved_context_ebx(%rip)
380 movq %rcx, saved_context_ecx(%rip)
381 movq %rdx, saved_context_edx(%rip)
382 movq %rbp, saved_context_ebp(%rip)
383 movq %rsi, saved_context_esi(%rip)
384 movq %rdi, saved_context_edi(%rip)
385 movq %r8, saved_context_r08(%rip)
386 movq %r9, saved_context_r09(%rip)
387 movq %r10, saved_context_r10(%rip)
388 movq %r11, saved_context_r11(%rip)
389 movq %r12, saved_context_r12(%rip)
390 movq %r13, saved_context_r13(%rip)
391 movq %r14, saved_context_r14(%rip)
392 movq %r15, saved_context_r15(%rip)
393 pushfq ; popq saved_context_eflags(%rip)
394
395 movq $.L97, saved_rip(%rip)
396
397 movq %rsp,saved_rsp
398 movq %rbp,saved_rbp
399 movq %rbx,saved_rbx
400 movq %rdi,saved_rdi
401 movq %rsi,saved_rsi
402
403 addq $8, %rsp
404 movl $3, %edi
405 xorl %eax, %eax
406 jmp acpi_enter_sleep_state
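	# acpi_enter_sleep_state(3) does not return if the sleep succeeds; on
	# wake-up the low-memory wakeup code restores long mode and jumps to
	# saved_rip, which was set above to point at .L97.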
407.L97:
408 .p2align 4,,7
409.L99:
410 .align 4
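	# $24 (0x18) is __KERNEL_DS; the constants added to saved_context below
	# are the byte offsets of its cr4/cr3/cr2/cr0 fields.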
411 movl $24, %eax
412 movw %ax, %ds
413 movq saved_context+58(%rip), %rax
414 movq %rax, %cr4
415 movq saved_context+50(%rip), %rax
416 movq %rax, %cr3
417 movq saved_context+42(%rip), %rax
418 movq %rax, %cr2
419 movq saved_context+34(%rip), %rax
420 movq %rax, %cr0
421 pushq saved_context_eflags(%rip) ; popfq
422 movq saved_context_esp(%rip), %rsp
423 movq saved_context_ebp(%rip), %rbp
424 movq saved_context_eax(%rip), %rax
425 movq saved_context_ebx(%rip), %rbx
426 movq saved_context_ecx(%rip), %rcx
427 movq saved_context_edx(%rip), %rdx
428 movq saved_context_esi(%rip), %rsi
429 movq saved_context_edi(%rip), %rdi
430 movq saved_context_r08(%rip), %r8
431 movq saved_context_r09(%rip), %r9
432 movq saved_context_r10(%rip), %r10
433 movq saved_context_r11(%rip), %r11
434 movq saved_context_r12(%rip), %r12
435 movq saved_context_r13(%rip), %r13
436 movq saved_context_r14(%rip), %r14
437 movq saved_context_r15(%rip), %r15
438
439 xorl %eax, %eax
440 addq $8, %rsp
441 jmp restore_processor_state
442.LFE5:
443.Lfe5:
444 .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel
445
446.data
447ALIGN
448ENTRY(saved_rbp) .quad 0
449ENTRY(saved_rsi) .quad 0
450ENTRY(saved_rdi) .quad 0
451ENTRY(saved_rbx) .quad 0
452
453ENTRY(saved_rip) .quad 0
454ENTRY(saved_rsp) .quad 0
455
456ENTRY(saved_magic) .quad 0
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
new file mode 100644
index 000000000000..bd72d94e713e
--- /dev/null
+++ b/arch/x86/kernel/alternative.c
@@ -0,0 +1,450 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/spinlock.h>
4#include <linux/list.h>
5#include <linux/kprobes.h>
6#include <linux/mm.h>
7#include <linux/vmalloc.h>
8#include <asm/alternative.h>
9#include <asm/sections.h>
10#include <asm/pgtable.h>
11#include <asm/mce.h>
12#include <asm/nmi.h>
13
14#define MAX_PATCH_LEN (255-1)
15
16#ifdef CONFIG_HOTPLUG_CPU
17static int smp_alt_once;
18
19static int __init bootonly(char *str)
20{
21 smp_alt_once = 1;
22 return 1;
23}
24__setup("smp-alt-boot", bootonly);
25#else
26#define smp_alt_once 1
27#endif
28
29static int debug_alternative;
30
31static int __init debug_alt(char *str)
32{
33 debug_alternative = 1;
34 return 1;
35}
36__setup("debug-alternative", debug_alt);
37
38static int noreplace_smp;
39
40static int __init setup_noreplace_smp(char *str)
41{
42 noreplace_smp = 1;
43 return 1;
44}
45__setup("noreplace-smp", setup_noreplace_smp);
46
47#ifdef CONFIG_PARAVIRT
48static int noreplace_paravirt = 0;
49
50static int __init setup_noreplace_paravirt(char *str)
51{
52 noreplace_paravirt = 1;
53 return 1;
54}
55__setup("noreplace-paravirt", setup_noreplace_paravirt);
56#endif
57
58#define DPRINTK(fmt, args...) if (debug_alternative) \
59 printk(KERN_DEBUG fmt, args)
60
61#ifdef GENERIC_NOP1
62/* Use inline assembly to define this because the nops are defined
63 as inline assembly strings in the include files and we cannot
64 get them easily into strings. */
65asm("\t.data\nintelnops: "
66 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
67 GENERIC_NOP7 GENERIC_NOP8);
68extern unsigned char intelnops[];
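/*
 * The asm blob above emits a 1-byte, 2-byte, ... 8-byte NOP back to back,
 * so intel_nops[n] points at an n-byte NOP sequence starting
 * 1 + 2 + ... + (n - 1) bytes into the blob.
 */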
69static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
70 NULL,
71 intelnops,
72 intelnops + 1,
73 intelnops + 1 + 2,
74 intelnops + 1 + 2 + 3,
75 intelnops + 1 + 2 + 3 + 4,
76 intelnops + 1 + 2 + 3 + 4 + 5,
77 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
78 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
79};
80#endif
81
82#ifdef K8_NOP1
83asm("\t.data\nk8nops: "
84 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
85 K8_NOP7 K8_NOP8);
86extern unsigned char k8nops[];
87static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
88 NULL,
89 k8nops,
90 k8nops + 1,
91 k8nops + 1 + 2,
92 k8nops + 1 + 2 + 3,
93 k8nops + 1 + 2 + 3 + 4,
94 k8nops + 1 + 2 + 3 + 4 + 5,
95 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
96 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
97};
98#endif
99
100#ifdef K7_NOP1
101asm("\t.data\nk7nops: "
102 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
103 K7_NOP7 K7_NOP8);
104extern unsigned char k7nops[];
105static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
106 NULL,
107 k7nops,
108 k7nops + 1,
109 k7nops + 1 + 2,
110 k7nops + 1 + 2 + 3,
111 k7nops + 1 + 2 + 3 + 4,
112 k7nops + 1 + 2 + 3 + 4 + 5,
113 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
114 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
115};
116#endif
117
118#ifdef CONFIG_X86_64
119
120extern char __vsyscall_0;
121static inline unsigned char** find_nop_table(void)
122{
123 return k8_nops;
124}
125
126#else /* CONFIG_X86_64 */
127
128static struct nop {
129 int cpuid;
130 unsigned char **noptable;
131} noptypes[] = {
132 { X86_FEATURE_K8, k8_nops },
133 { X86_FEATURE_K7, k7_nops },
134 { -1, NULL }
135};
136
137static unsigned char** find_nop_table(void)
138{
139 unsigned char **noptable = intel_nops;
140 int i;
141
142 for (i = 0; noptypes[i].cpuid >= 0; i++) {
143 if (boot_cpu_has(noptypes[i].cpuid)) {
144 noptable = noptypes[i].noptable;
145 break;
146 }
147 }
148 return noptable;
149}
150
151#endif /* CONFIG_X86_64 */
152
153/* Use this to add nops to a buffer, then text_poke the whole buffer. */
154static void add_nops(void *insns, unsigned int len)
155{
156 unsigned char **noptable = find_nop_table();
157
158 while (len > 0) {
159 unsigned int noplen = len;
160 if (noplen > ASM_NOP_MAX)
161 noplen = ASM_NOP_MAX;
162 memcpy(insns, noptable[noplen], noplen);
163 insns += noplen;
164 len -= noplen;
165 }
166}
167
168extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
169extern u8 *__smp_locks[], *__smp_locks_end[];
170
171/* Replace instructions with better alternatives for this CPU type.
172   This runs before SMP is initialized to avoid SMP problems with
173   self-modifying code. This implies that asymmetric systems where
174   APs have fewer capabilities than the boot processor are not handled.
175   Tough. Make sure you disable such features by hand. */
176
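/*
 * Each alt_instr record (generated elsewhere, typically by an alternative()
 * style macro) describes one patch site: ->instr is the original instruction
 * in .text, ->replacement the preferred sequence, ->cpuid the feature bit
 * that must be present, and ->replacementlen <= ->instrlen; the leftover
 * bytes are NOP-padded by the loop below.
 */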
177void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
178{
179 struct alt_instr *a;
180 char insnbuf[MAX_PATCH_LEN];
181
182 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
183 for (a = start; a < end; a++) {
184 u8 *instr = a->instr;
185 BUG_ON(a->replacementlen > a->instrlen);
186 BUG_ON(a->instrlen > sizeof(insnbuf));
187 if (!boot_cpu_has(a->cpuid))
188 continue;
189#ifdef CONFIG_X86_64
190 /* vsyscall code is not mapped yet. resolve it manually. */
191 if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
192 instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
193 DPRINTK("%s: vsyscall fixup: %p => %p\n",
194 __FUNCTION__, a->instr, instr);
195 }
196#endif
197 memcpy(insnbuf, a->replacement, a->replacementlen);
198 add_nops(insnbuf + a->replacementlen,
199 a->instrlen - a->replacementlen);
200 text_poke(instr, insnbuf, a->instrlen);
201 }
202}
203
204#ifdef CONFIG_SMP
205
206static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
207{
208 u8 **ptr;
209
210 for (ptr = start; ptr < end; ptr++) {
211 if (*ptr < text)
212 continue;
213 if (*ptr > text_end)
214 continue;
215 text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
216 };
217}
218
219static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
220{
221 u8 **ptr;
222 char insn[1];
223
224 if (noreplace_smp)
225 return;
226
227 add_nops(insn, 1);
228 for (ptr = start; ptr < end; ptr++) {
229 if (*ptr < text)
230 continue;
231 if (*ptr > text_end)
232 continue;
233 text_poke(*ptr, insn, 1);
234 };
235}
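/*
 * The __smp_locks section records the address of every LOCK prefix byte
 * (0xf0) emitted for SMP-safe instructions.  When running with a single CPU
 * the prefixes are replaced by a one-byte NOP (unlock above); when a second
 * CPU is brought up they are put back (lock above).
 */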
236
237struct smp_alt_module {
238	/* module that owns these lock prefixes (NULL for the core kernel) */
239 struct module *mod;
240 char *name;
241
242 /* ptrs to lock prefixes */
243 u8 **locks;
244 u8 **locks_end;
245
246 /* .text segment, needed to avoid patching init code ;) */
247 u8 *text;
248 u8 *text_end;
249
250 struct list_head next;
251};
252static LIST_HEAD(smp_alt_modules);
253static DEFINE_SPINLOCK(smp_alt);
254
255void alternatives_smp_module_add(struct module *mod, char *name,
256 void *locks, void *locks_end,
257 void *text, void *text_end)
258{
259 struct smp_alt_module *smp;
260 unsigned long flags;
261
262 if (noreplace_smp)
263 return;
264
265 if (smp_alt_once) {
266 if (boot_cpu_has(X86_FEATURE_UP))
267 alternatives_smp_unlock(locks, locks_end,
268 text, text_end);
269 return;
270 }
271
272 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
273 if (NULL == smp)
274 return; /* we'll run the (safe but slow) SMP code then ... */
275
276 smp->mod = mod;
277 smp->name = name;
278 smp->locks = locks;
279 smp->locks_end = locks_end;
280 smp->text = text;
281 smp->text_end = text_end;
282 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
283 __FUNCTION__, smp->locks, smp->locks_end,
284 smp->text, smp->text_end, smp->name);
285
286 spin_lock_irqsave(&smp_alt, flags);
287 list_add_tail(&smp->next, &smp_alt_modules);
288 if (boot_cpu_has(X86_FEATURE_UP))
289 alternatives_smp_unlock(smp->locks, smp->locks_end,
290 smp->text, smp->text_end);
291 spin_unlock_irqrestore(&smp_alt, flags);
292}
293
294void alternatives_smp_module_del(struct module *mod)
295{
296 struct smp_alt_module *item;
297 unsigned long flags;
298
299 if (smp_alt_once || noreplace_smp)
300 return;
301
302 spin_lock_irqsave(&smp_alt, flags);
303 list_for_each_entry(item, &smp_alt_modules, next) {
304 if (mod != item->mod)
305 continue;
306 list_del(&item->next);
307 spin_unlock_irqrestore(&smp_alt, flags);
308 DPRINTK("%s: %s\n", __FUNCTION__, item->name);
309 kfree(item);
310 return;
311 }
312 spin_unlock_irqrestore(&smp_alt, flags);
313}
314
315void alternatives_smp_switch(int smp)
316{
317 struct smp_alt_module *mod;
318 unsigned long flags;
319
320#ifdef CONFIG_LOCKDEP
321 /*
322 * A not yet fixed binutils section handling bug prevents
323 * alternatives-replacement from working reliably, so turn
324 * it off:
325 */
326 printk("lockdep: not fixing up alternatives.\n");
327 return;
328#endif
329
330 if (noreplace_smp || smp_alt_once)
331 return;
332 BUG_ON(!smp && (num_online_cpus() > 1));
333
334 spin_lock_irqsave(&smp_alt, flags);
335 if (smp) {
336 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
337 clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
338 clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
339 list_for_each_entry(mod, &smp_alt_modules, next)
340 alternatives_smp_lock(mod->locks, mod->locks_end,
341 mod->text, mod->text_end);
342 } else {
343 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
344 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
345 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
346 list_for_each_entry(mod, &smp_alt_modules, next)
347 alternatives_smp_unlock(mod->locks, mod->locks_end,
348 mod->text, mod->text_end);
349 }
350 spin_unlock_irqrestore(&smp_alt, flags);
351}
352
353#endif
354
355#ifdef CONFIG_PARAVIRT
356void apply_paravirt(struct paravirt_patch_site *start,
357 struct paravirt_patch_site *end)
358{
359 struct paravirt_patch_site *p;
360 char insnbuf[MAX_PATCH_LEN];
361
362 if (noreplace_paravirt)
363 return;
364
365 for (p = start; p < end; p++) {
366 unsigned int used;
367
368 BUG_ON(p->len > MAX_PATCH_LEN);
369 /* prep the buffer with the original instructions */
370 memcpy(insnbuf, p->instr, p->len);
371 used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
372 (unsigned long)p->instr, p->len);
373
374 BUG_ON(used > p->len);
375
376 /* Pad the rest with nops */
377 add_nops(insnbuf + used, p->len - used);
378 text_poke(p->instr, insnbuf, p->len);
379 }
380}
381extern struct paravirt_patch_site __start_parainstructions[],
382 __stop_parainstructions[];
383#endif /* CONFIG_PARAVIRT */
384
385void __init alternative_instructions(void)
386{
387 unsigned long flags;
388
389	/* The patching is not fully atomic, so try to avoid local interrupts
390	   that might execute the code being patched.
391	   Other CPUs are not running. */
392 stop_nmi();
393#ifdef CONFIG_X86_MCE
394 stop_mce();
395#endif
396
397 local_irq_save(flags);
398 apply_alternatives(__alt_instructions, __alt_instructions_end);
399
400 /* switch to patch-once-at-boottime-only mode and free the
401 * tables in case we know the number of CPUs will never ever
402 * change */
403#ifdef CONFIG_HOTPLUG_CPU
404 if (num_possible_cpus() < 2)
405 smp_alt_once = 1;
406#endif
407
408#ifdef CONFIG_SMP
409 if (smp_alt_once) {
410 if (1 == num_possible_cpus()) {
411 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
412 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
413 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
414 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
415 _text, _etext);
416 }
417 free_init_pages("SMP alternatives",
418 (unsigned long)__smp_locks,
419 (unsigned long)__smp_locks_end);
420 } else {
421 alternatives_smp_module_add(NULL, "core kernel",
422 __smp_locks, __smp_locks_end,
423 _text, _etext);
424 alternatives_smp_switch(0);
425 }
426#endif
427 apply_paravirt(__parainstructions, __parainstructions_end);
428 local_irq_restore(flags);
429
430 restart_nmi();
431#ifdef CONFIG_X86_MCE
432 restart_mce();
433#endif
434}
435
436/*
437 * Warning:
438 * When you use this code to patch more than one byte of an instruction
439 * you need to make sure that other CPUs cannot execute this code in parallel.
440 * Also, no thread must be preempted in the middle of these instructions.
441 * And on the local CPU you need to be protected against NMI or MCE handlers
442 * seeing an inconsistent instruction while you patch.
443 */
444void __kprobes text_poke(void *addr, unsigned char *opcode, int len)
445{
446 memcpy(addr, opcode, len);
447 sync_core();
448 /* Could also do a CLFLUSH here to speed up CPU recovery; but
449 that causes hangs on some VIA CPUs. */
450}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
new file mode 100644
index 000000000000..8f681cae7bf7
--- /dev/null
+++ b/arch/x86/kernel/aperture_64.c
@@ -0,0 +1,298 @@
1/*
2 * Firmware replacement code.
3 *
4 * Work around broken BIOSes that don't set an aperture or only set the
5 * aperture in the AGP bridge.
6 * If all else fails, map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */
12#include <linux/kernel.h>
13#include <linux/types.h>
14#include <linux/init.h>
15#include <linux/bootmem.h>
16#include <linux/mmzone.h>
17#include <linux/pci_ids.h>
18#include <linux/pci.h>
19#include <linux/bitops.h>
20#include <linux/ioport.h>
21#include <asm/e820.h>
22#include <asm/io.h>
23#include <asm/iommu.h>
24#include <asm/pci-direct.h>
25#include <asm/dma.h>
26#include <asm/k8.h>
27
28int iommu_aperture;
29int iommu_aperture_disabled __initdata = 0;
30int iommu_aperture_allowed __initdata = 0;
31
32int fallback_aper_order __initdata = 1; /* 64MB */
33int fallback_aper_force __initdata = 0;
34
35int fix_aperture __initdata = 1;
36
37static struct resource gart_resource = {
38 .name = "GART",
39 .flags = IORESOURCE_MEM,
40};
41
42static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
43{
44 gart_resource.start = aper_base;
45 gart_resource.end = aper_base + aper_size - 1;
46 insert_resource(&iomem_resource, &gart_resource);
47}
48
49/* This code runs before the PCI subsystem is initialized, so just
50 access the northbridge directly. */
51
52static u32 __init allocate_aperture(void)
53{
54 u32 aper_size;
55 void *p;
56
57 if (fallback_aper_order > 7)
58 fallback_aper_order = 7;
59 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
60
61 /*
62	 * Aperture has to be naturally aligned. This means a 2GB aperture won't
63 * have much chance of finding a place in the lower 4GB of memory.
64 * Unfortunately we cannot move it up because that would make the
65 * IOMMU useless.
66 */
67 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
68 if (!p || __pa(p)+aper_size > 0xffffffff) {
69 printk("Cannot allocate aperture memory hole (%p,%uK)\n",
70 p, aper_size>>10);
71 if (p)
72 free_bootmem(__pa(p), aper_size);
73 return 0;
74 }
75 printk("Mapping aperture over %d KB of RAM @ %lx\n",
76 aper_size >> 10, __pa(p));
77 insert_aperture_resource((u32)__pa(p), aper_size);
78 return (u32)__pa(p);
79}
80
81static int __init aperture_valid(u64 aper_base, u32 aper_size)
82{
83 if (!aper_base)
84 return 0;
85 if (aper_size < 64*1024*1024) {
86 printk("Aperture too small (%d MB)\n", aper_size>>20);
87 return 0;
88 }
89 if (aper_base + aper_size > 0x100000000UL) {
90 printk("Aperture beyond 4GB. Ignoring.\n");
91 return 0;
92 }
93 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
94 printk("Aperture pointing to e820 RAM. Ignoring.\n");
95 return 0;
96 }
97 return 1;
98}
99
100/* Find a PCI capability */
101static __u32 __init find_cap(int num, int slot, int func, int cap)
102{
103 u8 pos;
104 int bytes;
105 if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
106 return 0;
107 pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
108 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
109 u8 id;
110 pos &= ~3;
111 id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
112 if (id == 0xff)
113 break;
114 if (id == cap)
115 return pos;
116 pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT);
117 }
118 return 0;
119}
120
121/* Read a standard AGPv3 bridge header */
122static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
123{
124 u32 apsize;
125 u32 apsizereg;
126 int nbits;
127 u32 aper_low, aper_hi;
128 u64 aper;
129
130 printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
131 apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
132 if (apsizereg == 0xffffffff) {
133 printk("APSIZE in AGP bridge unreadable\n");
134 return 0;
135 }
136
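	/*
	 * The decode below works by population count: every additional APSIZE
	 * bit that is set halves the aperture, so order = 7 - hweight(apsize),
	 * with 32MB (order 0) as the floor.
	 */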
137 apsize = apsizereg & 0xfff;
138	/* Some BIOSes use weird encodings not in the AGPv3 table. */
139 if (apsize & 0xff)
140 apsize |= 0xf00;
141 nbits = hweight16(apsize);
142 *order = 7 - nbits;
143 if ((int)*order < 0) /* < 32MB */
144 *order = 0;
145
146 aper_low = read_pci_config(num,slot,func, 0x10);
147 aper_hi = read_pci_config(num,slot,func,0x14);
148 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
149
150 printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
151 aper, 32 << *order, apsizereg);
152
153 if (!aperture_valid(aper, (32*1024*1024) << *order))
154 return 0;
155 return (u32)aper;
156}
157
158/* Look for an AGP bridge. Windows only expects the aperture in the
159   AGP bridge and some BIOSes forget to initialize the Northbridge too.
160   Work around this here.
161
162   Do a PCI bus scan by hand because we're running before the PCI
163   subsystem.
164
165   All K8 AGP bridges are AGPv3 compliant, so we can do this scan
166   generically. It's probably overkill to always scan all slots because
167   the AGP bridges should always be on their own bus in the HT hierarchy,
168   but do it here for future safety. */
169static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
170{
171 int num, slot, func;
172
173 /* Poor man's PCI discovery */
174 for (num = 0; num < 256; num++) {
175 for (slot = 0; slot < 32; slot++) {
176 for (func = 0; func < 8; func++) {
177 u32 class, cap;
178 u8 type;
179 class = read_pci_config(num,slot,func,
180 PCI_CLASS_REVISION);
181 if (class == 0xffffffff)
182 break;
183
184 switch (class >> 16) {
185 case PCI_CLASS_BRIDGE_HOST:
186 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
187 /* AGP bridge? */
188 cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
189 if (!cap)
190 break;
191 *valid_agp = 1;
192 return read_agp(num,slot,func,cap,order);
193 }
194
195 /* No multi-function device? */
196 type = read_pci_config_byte(num,slot,func,
197 PCI_HEADER_TYPE);
198 if (!(type & 0x80))
199 break;
200 }
201 }
202 }
203 printk("No AGP bridge found\n");
204 return 0;
205}
206
207void __init iommu_hole_init(void)
208{
209 int fix, num;
210 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
211 u64 aper_base, last_aper_base = 0;
212 int valid_agp = 0;
213
214 if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
215 return;
216
217 printk(KERN_INFO "Checking aperture...\n");
218
219 fix = 0;
220 for (num = 24; num < 32; num++) {
221 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
222 continue;
223
224 iommu_detected = 1;
225 iommu_aperture = 1;
226
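		/*
		 * Devices 00:18..00:1f, function 3 are the K8 northbridge
		 * miscellaneous control functions: offset 0x90 holds the GART
		 * aperture size order in bits 3:1, offset 0x94 the aperture
		 * base in 32MB units (hence the shift by 25).
		 */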
227 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
228 aper_size = (32 * 1024 * 1024) << aper_order;
229 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
230 aper_base <<= 25;
231
232 printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
233 aper_base, aper_size>>20);
234
235 if (!aperture_valid(aper_base, aper_size)) {
236 fix = 1;
237 break;
238 }
239
240 if ((last_aper_order && aper_order != last_aper_order) ||
241 (last_aper_base && aper_base != last_aper_base)) {
242 fix = 1;
243 break;
244 }
245 last_aper_order = aper_order;
246 last_aper_base = aper_base;
247 }
248
249 if (!fix && !fallback_aper_force) {
250 if (last_aper_base) {
251 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
252 insert_aperture_resource((u32)last_aper_base, n);
253 }
254 return;
255 }
256
257 if (!fallback_aper_force)
258 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
259
260 if (aper_alloc) {
261 /* Got the aperture from the AGP bridge */
262 } else if (swiotlb && !valid_agp) {
263 /* Do nothing */
264 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
265 force_iommu ||
266 valid_agp ||
267 fallback_aper_force) {
268		printk("Your BIOS doesn't leave an aperture memory hole\n");
269 printk("Please enable the IOMMU option in the BIOS setup\n");
270 printk("This costs you %d MB of RAM\n",
271 32 << fallback_aper_order);
272
273 aper_order = fallback_aper_order;
274 aper_alloc = allocate_aperture();
275 if (!aper_alloc) {
276 /* Could disable AGP and IOMMU here, but it's probably
277 not worth it. But the later users cannot deal with
278 bad apertures and turning on the aperture over memory
279 causes very strange problems, so it's better to
280 panic early. */
281 panic("Not enough memory for aperture");
282 }
283 } else {
284 return;
285 }
286
287 /* Fix up the north bridges */
288 for (num = 24; num < 32; num++) {
289 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
290 continue;
291
292 /* Don't enable translation yet. That is done later.
293 Assume this BIOS didn't initialise the GART so
294 just overwrite all previous bits */
295 write_pci_config(0, num, 3, 0x90, aper_order<<1);
296 write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
297 }
298}
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
new file mode 100644
index 000000000000..3d67ae18d762
--- /dev/null
+++ b/arch/x86/kernel/apic_32.c
@@ -0,0 +1,1566 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h>
26#include <linux/cpu.h>
27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h>
30#include <linux/dmi.h>
31
32#include <asm/atomic.h>
33#include <asm/smp.h>
34#include <asm/mtrr.h>
35#include <asm/mpspec.h>
36#include <asm/desc.h>
37#include <asm/arch_hooks.h>
38#include <asm/hpet.h>
39#include <asm/i8253.h>
40#include <asm/nmi.h>
41
42#include <mach_apic.h>
43#include <mach_apicdef.h>
44#include <mach_ipi.h>
45
46#include "io_ports.h"
47
48/*
49 * Sanity check
50 */
51#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
52# error SPURIOUS_APIC_VECTOR definition error
53#endif
54
55/*
56 * Knob to control our willingness to enable the local APIC.
57 *
58 * -1=force-disable, +1=force-enable
59 */
60static int enable_local_apic __initdata = 0;
61
62/* Local APIC timer verification ok */
63static int local_apic_timer_verify_ok;
64/* Local APIC timer disabled from the kernel command line, by a DMI quirk,
65   or by a CPU MSR check */
66int local_apic_timer_disabled;
67/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
70
71/*
72 * Debug level, exported for io_apic.c
73 */
74int apic_verbosity;
75
76static unsigned int calibration_result;
77
78static int lapic_next_event(unsigned long delta,
79 struct clock_event_device *evt);
80static void lapic_timer_setup(enum clock_event_mode mode,
81 struct clock_event_device *evt);
82static void lapic_timer_broadcast(cpumask_t mask);
83static void apic_pm_activate(void);
84
85/*
86 * The local apic timer can be used for any function which is CPU local.
87 */
88static struct clock_event_device lapic_clockevent = {
89 .name = "lapic",
90 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
91 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
92 .shift = 32,
93 .set_mode = lapic_timer_setup,
94 .set_next_event = lapic_next_event,
95 .broadcast = lapic_timer_broadcast,
96 .rating = 100,
97 .irq = -1,
98};
99static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
100
101/* Local APIC was disabled by the BIOS and enabled by the kernel */
102static int enabled_via_apicbase;
103
104/*
105 * Get the LAPIC version
106 */
107static inline int lapic_get_version(void)
108{
109 return GET_APIC_VERSION(apic_read(APIC_LVR));
110}
111
112/*
113 * Check if the APIC is integrated or a separate chip
114 */
115static inline int lapic_is_integrated(void)
116{
117 return APIC_INTEGRATED(lapic_get_version());
118}
119
120/*
121 * Check whether this is a modern or a first-generation APIC
122 */
123static int modern_apic(void)
124{
125 /* AMD systems use old APIC versions, so check the CPU */
126 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
127 boot_cpu_data.x86 >= 0xf)
128 return 1;
129 return lapic_get_version() >= 0x14;
130}
131
132void apic_wait_icr_idle(void)
133{
134 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
135 cpu_relax();
136}
137
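/*
 * Like apic_wait_icr_idle(), but bounded: poll for at most 1000 * 100us
 * (~100ms) and return the final busy status instead of spinning forever.
 */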
138unsigned long safe_apic_wait_icr_idle(void)
139{
140 unsigned long send_status;
141 int timeout;
142
143 timeout = 0;
144 do {
145 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
146 if (!send_status)
147 break;
148 udelay(100);
149 } while (timeout++ < 1000);
150
151 return send_status;
152}
153
154/**
155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
156 */
157void enable_NMI_through_LVT0 (void * dummy)
158{
159 unsigned int v = APIC_DM_NMI;
160
161 /* Level triggered for 82489DX */
162 if (!lapic_is_integrated())
163 v |= APIC_LVT_LEVEL_TRIGGER;
164 apic_write_around(APIC_LVT0, v);
165}
166
167/**
168 * get_physical_broadcast - Get number of physical broadcast IDs
169 */
170int get_physical_broadcast(void)
171{
172 return modern_apic() ? 0xff : 0xf;
173}
174
175/**
176 * lapic_get_maxlvt - get the maximum number of local vector table entries
177 */
178int lapic_get_maxlvt(void)
179{
180 unsigned int v = apic_read(APIC_LVR);
181
182 /* 82489DXs do not report # of LVT entries. */
183 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
184}
185
186/*
187 * Local APIC timer
188 */
189
190/* Clock divisor is set to 16 */
191#define APIC_DIVISOR 16
192
193/*
194 * This function sets up the local APIC timer, with a timeout of
195 * 'clocks' APIC bus clock. During calibration we actually call
196 * this function twice on the boot CPU, once with a bogus timeout
197 * value, second time for real. The other (noncalibrating) CPUs
198 * call this function only once, with the real, calibrated value.
199 *
200 * We do reads before writes even if unnecessary, to get around the
201 * P5 APIC double write bug.
202 */
203static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
204{
205 unsigned int lvtt_value, tmp_value;
206
207 lvtt_value = LOCAL_TIMER_VECTOR;
208 if (!oneshot)
209 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
210 if (!lapic_is_integrated())
211 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
212
213 if (!irqen)
214 lvtt_value |= APIC_LVT_MASKED;
215
216 apic_write_around(APIC_LVTT, lvtt_value);
217
218 /*
219 * Divide PICLK by 16
220 */
221 tmp_value = apic_read(APIC_TDCR);
222 apic_write_around(APIC_TDCR, (tmp_value
223 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
224 | APIC_TDR_DIV_16);
225
226 if (!oneshot)
227 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
228}
229
230/*
231 * Program the next event, relative to now
232 */
233static int lapic_next_event(unsigned long delta,
234 struct clock_event_device *evt)
235{
236 apic_write_around(APIC_TMICT, delta);
237 return 0;
238}
239
240/*
241 * Setup the lapic timer in periodic or oneshot mode
242 */
243static void lapic_timer_setup(enum clock_event_mode mode,
244 struct clock_event_device *evt)
245{
246 unsigned long flags;
247 unsigned int v;
248
249 /* Lapic used for broadcast ? */
250 if (!local_apic_timer_verify_ok)
251 return;
252
253 local_irq_save(flags);
254
255 switch (mode) {
256 case CLOCK_EVT_MODE_PERIODIC:
257 case CLOCK_EVT_MODE_ONESHOT:
258 __setup_APIC_LVTT(calibration_result,
259 mode != CLOCK_EVT_MODE_PERIODIC, 1);
260 break;
261 case CLOCK_EVT_MODE_UNUSED:
262 case CLOCK_EVT_MODE_SHUTDOWN:
263 v = apic_read(APIC_LVTT);
264 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
265 apic_write_around(APIC_LVTT, v);
266 break;
267 case CLOCK_EVT_MODE_RESUME:
268 /* Nothing to do here */
269 break;
270 }
271
272 local_irq_restore(flags);
273}
274
275/*
276 * Local APIC timer broadcast function
277 */
278static void lapic_timer_broadcast(cpumask_t mask)
279{
280#ifdef CONFIG_SMP
281 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
282#endif
283}
284
285/*
286 * Set up the local APIC timer for this CPU. Copy the initialized values
287 * from the boot CPU and register the clock event in the framework.
288 */
289static void __devinit setup_APIC_timer(void)
290{
291 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
292
293 memcpy(levt, &lapic_clockevent, sizeof(*levt));
294 levt->cpumask = cpumask_of_cpu(smp_processor_id());
295
296 clockevents_register_device(levt);
297}
298
299/*
300 * In this function we calibrate the APIC bus clock against the external timer.
301 *
302 * We want to do the calibration only once, since we want the local timer
303 * irqs to stay in sync. CPUs connected to the same APIC bus have the very
304 * same bus frequency.
305 *
306 * This was previously done by reading the PIT/HPET and waiting for a wrap
307 * around to find out that a tick has elapsed. I have a box where the PIT
308 * readout is broken, so it never gets out of the wait loop again. This was
309 * also reported by others.
310 *
311 * Monitoring the jiffies value is inaccurate and the clockevents
312 * infrastructure allows us to do a simple substitution of the interrupt
313 * handler.
314 *
315 * The calibration routine also uses the pm_timer when possible, as the PIT
316 * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
317 * back to normal later in the boot process).
318 */
319
320#define LAPIC_CAL_LOOPS (HZ/10)
321
322static __initdata int lapic_cal_loops = -1;
323static __initdata long lapic_cal_t1, lapic_cal_t2;
324static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
325static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
326static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
327
328/*
329 * Temporary interrupt handler.
330 */
331static void __init lapic_cal_handler(struct clock_event_device *dev)
332{
333 unsigned long long tsc = 0;
334 long tapic = apic_read(APIC_TMCCT);
335 unsigned long pm = acpi_pm_read_early();
336
337 if (cpu_has_tsc)
338 rdtscll(tsc);
339
340 switch (lapic_cal_loops++) {
341 case 0:
342 lapic_cal_t1 = tapic;
343 lapic_cal_tsc1 = tsc;
344 lapic_cal_pm1 = pm;
345 lapic_cal_j1 = jiffies;
346 break;
347
348 case LAPIC_CAL_LOOPS:
349 lapic_cal_t2 = tapic;
350 lapic_cal_tsc2 = tsc;
351 if (pm < lapic_cal_pm1)
352 pm += ACPI_PM_OVRRUN;
353 lapic_cal_pm2 = pm;
354 lapic_cal_j2 = jiffies;
355 break;
356 }
357}
358
359/*
360 * Setup the boot APIC
361 *
362 * Calibrate and verify the result.
363 */
364void __init setup_boot_APIC_clock(void)
365{
366 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
367 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
368 const long pm_thresh = pm_100ms/100;
369 void (*real_handler)(struct clock_event_device *dev);
370 unsigned long deltaj;
371 long delta, deltapm;
372 int pm_referenced = 0;
373
374 /*
375 * The local apic timer can be disabled via the kernel
376 * commandline or from the CPU detection code. Register the lapic
377 * timer as a dummy clock event source on SMP systems, so the
378 * broadcast mechanism is used. On UP systems simply ignore it.
379 */
380 if (local_apic_timer_disabled) {
381 /* No broadcast on UP ! */
382 if (num_possible_cpus() > 1)
383 setup_APIC_timer();
384 return;
385 }
386
387 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
388 "calibrating APIC timer ...\n");
389
390 local_irq_disable();
391
392 /* Replace the global interrupt handler */
393 real_handler = global_clock_event->event_handler;
394 global_clock_event->event_handler = lapic_cal_handler;
395
396 /*
397 * Setup the APIC counter to 1e9. There is no way the lapic
398 * can underflow in the 100ms detection time frame
399 */
400 __setup_APIC_LVTT(1000000000, 0, 0);
401
402 /* Let the interrupts run */
403 local_irq_enable();
404
405 while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
406 cpu_relax();
407
408 local_irq_disable();
409
410 /* Restore the real event handler */
411 global_clock_event->event_handler = real_handler;
412
413 /* Build delta t1-t2 as apic timer counts down */
414 delta = lapic_cal_t1 - lapic_cal_t2;
415 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
416
417 /* Check, if the PM timer is available */
418 deltapm = lapic_cal_pm2 - lapic_cal_pm1;
419 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
420
421 if (deltapm) {
422 unsigned long mult;
423 u64 res;
424
425 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
426
427 if (deltapm > (pm_100ms - pm_thresh) &&
428 deltapm < (pm_100ms + pm_thresh)) {
429 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
430 } else {
431 res = (((u64) deltapm) * mult) >> 22;
432 do_div(res, 1000000);
433 printk(KERN_WARNING "APIC calibration not consistent "
434 "with PM Timer: %ldms instead of 100ms\n",
435 (long)res);
436 /* Correct the lapic counter value */
437 res = (((u64) delta ) * pm_100ms);
438 do_div(res, deltapm);
439 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
440 "%lu (%ld)\n", (unsigned long) res, delta);
441 delta = (long) res;
442 }
443 pm_referenced = 1;
444 }
445
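	/*
	 * delta is the number of APIC timer ticks (behind the divide-by-16
	 * prescaler) counted over LAPIC_CAL_LOOPS (= HZ/10, i.e. 100ms worth
	 * of) global clock events; from it we derive the clockevent mult and,
	 * below, the bus clocks per jiffy in calibration_result.
	 */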
446 /* Calculate the scaled math multiplication factor */
447 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
448 lapic_clockevent.max_delta_ns =
449 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
450 lapic_clockevent.min_delta_ns =
451 clockevent_delta2ns(0xF, &lapic_clockevent);
452
453 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
454
455 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
456 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
457 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
458 calibration_result);
459
460 if (cpu_has_tsc) {
461 delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
462 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
463 "%ld.%04ld MHz.\n",
464 (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
465 (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
466 }
467
468 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
469 "%u.%04u MHz.\n",
470 calibration_result / (1000000 / HZ),
471 calibration_result % (1000000 / HZ));
472
473 local_apic_timer_verify_ok = 1;
474
475 /* We trust the pm timer based calibration */
476 if (!pm_referenced) {
477 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
478
479 /*
480 * Setup the apic timer manually
481 */
482 levt->event_handler = lapic_cal_handler;
483 lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
484 lapic_cal_loops = -1;
485
486 /* Let the interrupts run */
487 local_irq_enable();
488
489 while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
490 cpu_relax();
491
492 local_irq_disable();
493
494 /* Stop the lapic timer */
495 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
496
497 local_irq_enable();
498
499 /* Jiffies delta */
500 deltaj = lapic_cal_j2 - lapic_cal_j1;
501 apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
502
503 /* Check, if the jiffies result is consistent */
504 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
505 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
506 else
507 local_apic_timer_verify_ok = 0;
508 } else
509 local_irq_enable();
510
511 if (!local_apic_timer_verify_ok) {
512 printk(KERN_WARNING
513 "APIC timer disabled due to verification failure.\n");
514 /* No broadcast on UP ! */
515 if (num_possible_cpus() == 1)
516 return;
517 } else {
518 /*
519 * If nmi_watchdog is set to IO_APIC, we need the
520 * PIT/HPET going. Otherwise register lapic as a dummy
521 * device.
522 */
523 if (nmi_watchdog != NMI_IO_APIC)
524 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
525 else
526 printk(KERN_WARNING "APIC timer registered as dummy,"
527 " due to nmi_watchdog=1!\n");
528 }
529
530 /* Setup the lapic or request the broadcast */
531 setup_APIC_timer();
532}
533
534void __devinit setup_secondary_APIC_clock(void)
535{
536 setup_APIC_timer();
537}
538
539/*
540 * The guts of the apic timer interrupt
541 */
542static void local_apic_timer_interrupt(void)
543{
544 int cpu = smp_processor_id();
545 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
546
547 /*
548	 * Normally we should not be here until the LAPIC has been initialized, but
549	 * in some cases (such as kdump) it's possible that a LAPIC timer interrupt
550	 * pending from the previous kernel's context is delivered in the new
551	 * kernel the moment interrupts are enabled.
552	 *
553	 * Interrupts are enabled early and the LAPIC is set up much later, hence
554	 * it's possible that evt->event_handler is still NULL when we get here.
555	 * Check for event_handler being NULL and discard the interrupt as
556	 * spurious.
557 */
558 if (!evt->event_handler) {
559 printk(KERN_WARNING
560 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
561 /* Switch it off */
562 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
563 return;
564 }
565
566 per_cpu(irq_stat, cpu).apic_timer_irqs++;
567
568 evt->event_handler(evt);
569}
570
571/*
572 * Local APIC timer interrupt. This is the most natural way for doing
573 * local interrupts, but local timer interrupts can be emulated by
574 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
575 *
576 * [ if a single-CPU system runs an SMP kernel then we call the local
577 * interrupt as well. Thus we cannot inline the local irq ... ]
578 */
579
580void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
581{
582 struct pt_regs *old_regs = set_irq_regs(regs);
583
584 /*
585 * NOTE! We'd better ACK the irq immediately,
586 * because timer handling can be slow.
587 */
588 ack_APIC_irq();
589 /*
590 * update_process_times() expects us to have done irq_enter().
591	 * Besides, if we don't, timer interrupts ignore the global
592 * interrupt lock, which is the WrongThing (tm) to do.
593 */
594 irq_enter();
595 local_apic_timer_interrupt();
596 irq_exit();
597
598 set_irq_regs(old_regs);
599}
600
601int setup_profiling_timer(unsigned int multiplier)
602{
603 return -EINVAL;
604}
605
606/*
607 * Local APIC start and shutdown
608 */
609
610/**
611 * clear_local_APIC - shutdown the local APIC
612 *
613 * This is called when a CPU is disabled and before rebooting, so the state of
614 * the local APIC has no dangling leftovers. Also used to clean out any BIOS
615 * leftovers during boot.
616 */
617void clear_local_APIC(void)
618{
619 int maxlvt = lapic_get_maxlvt();
620 unsigned long v;
621
622 /*
623 * Masking an LVT entry can trigger a local APIC error
624 * if the vector is zero. Mask LVTERR first to prevent this.
625 */
626 if (maxlvt >= 3) {
627 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
628 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
629 }
630 /*
631 * Careful: we have to set masks only first to deassert
632 * any level-triggered sources.
633 */
634 v = apic_read(APIC_LVTT);
635 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
636 v = apic_read(APIC_LVT0);
637 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
638 v = apic_read(APIC_LVT1);
639 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
640 if (maxlvt >= 4) {
641 v = apic_read(APIC_LVTPC);
642 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
643 }
644
645 /* let's not touch this if we didn't frob it */
646#ifdef CONFIG_X86_MCE_P4THERMAL
647 if (maxlvt >= 5) {
648 v = apic_read(APIC_LVTTHMR);
649 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
650 }
651#endif
652 /*
653 * Clean APIC state for other OSs:
654 */
655 apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
656 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
657 apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
658 if (maxlvt >= 3)
659 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
660 if (maxlvt >= 4)
661 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
662
663#ifdef CONFIG_X86_MCE_P4THERMAL
664 if (maxlvt >= 5)
665 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
666#endif
667 /* Integrated APIC (!82489DX) ? */
668 if (lapic_is_integrated()) {
669 if (maxlvt > 3)
670 /* Clear ESR due to Pentium errata 3AP and 11AP */
671 apic_write(APIC_ESR, 0);
672 apic_read(APIC_ESR);
673 }
674}
675
676/**
677 * disable_local_APIC - clear and disable the local APIC
678 */
679void disable_local_APIC(void)
680{
681 unsigned long value;
682
683 clear_local_APIC();
684
685 /*
686 * Disable APIC (implies clearing of registers
687 * for 82489DX!).
688 */
689 value = apic_read(APIC_SPIV);
690 value &= ~APIC_SPIV_APIC_ENABLED;
691 apic_write_around(APIC_SPIV, value);
692
693 /*
694 * When LAPIC was disabled by the BIOS and enabled by the kernel,
695 * restore the disabled state.
696 */
697 if (enabled_via_apicbase) {
698 unsigned int l, h;
699
700 rdmsr(MSR_IA32_APICBASE, l, h);
701 l &= ~MSR_IA32_APICBASE_ENABLE;
702 wrmsr(MSR_IA32_APICBASE, l, h);
703 }
704}
705
706/*
707 * If Linux enabled the LAPIC against the BIOS default, disable it again before
708 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
709 * not power off. Additionally, clear all LVT entries before disable_local_APIC
710 * for the case where Linux didn't enable the LAPIC.
711 */
712void lapic_shutdown(void)
713{
714 unsigned long flags;
715
716 if (!cpu_has_apic)
717 return;
718
719 local_irq_save(flags);
720 clear_local_APIC();
721
722 if (enabled_via_apicbase)
723 disable_local_APIC();
724
725 local_irq_restore(flags);
726}
727
728/*
729 * This is to verify that we're looking at a real local APIC.
730 * Check these against your board if the CPUs aren't getting
731 * started for no apparent reason.
732 */
733int __init verify_local_APIC(void)
734{
735 unsigned int reg0, reg1;
736
737 /*
738 * The version register is read-only in a real APIC.
739 */
740 reg0 = apic_read(APIC_LVR);
741 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
742 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
743 reg1 = apic_read(APIC_LVR);
744 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
745
746 /*
747 * The two version reads above should print the same
748 * numbers. If the second one is different, then we
749 * poke at a non-APIC.
750 */
751 if (reg1 != reg0)
752 return 0;
753
754 /*
755 * Check if the version looks reasonable.
756 */
757 reg1 = GET_APIC_VERSION(reg0);
758 if (reg1 == 0x00 || reg1 == 0xff)
759 return 0;
760 reg1 = lapic_get_maxlvt();
761 if (reg1 < 0x02 || reg1 == 0xff)
762 return 0;
763
764 /*
765 * The ID register is read/write in a real APIC.
766 */
767 reg0 = apic_read(APIC_ID);
768 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
769
770 /*
771 * The next two are just to see if we have sane values.
772 * They're only really relevant if we're in Virtual Wire
773 * compatibility mode, but most boxes aren't anymore.
774 */
775 reg0 = apic_read(APIC_LVT0);
776 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
777 reg1 = apic_read(APIC_LVT1);
778 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
779
780 return 1;
781}
782
783/**
784 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
785 */
786void __init sync_Arb_IDs(void)
787{
788 /*
789 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
790 * needed on AMD.
791 */
792 if (modern_apic())
793 return;
794 /*
795 * Wait for idle.
796 */
797 apic_wait_icr_idle();
798
799 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
800 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
801 | APIC_DM_INIT);
802}
803
804/*
805 * An initial setup of the virtual wire mode.
806 */
807void __init init_bsp_APIC(void)
808{
809 unsigned long value;
810
811 /*
812 * Don't do the setup now if we have an SMP BIOS as the
813 * through-I/O-APIC virtual wire mode might be active.
814 */
815 if (smp_found_config || !cpu_has_apic)
816 return;
817
818 /*
819 * Do not trust the local APIC being empty at bootup.
820 */
821 clear_local_APIC();
822
823 /*
824 * Enable APIC.
825 */
826 value = apic_read(APIC_SPIV);
827 value &= ~APIC_VECTOR_MASK;
828 value |= APIC_SPIV_APIC_ENABLED;
829
830 /* This bit is reserved on P4/Xeon and should be cleared */
831 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
832 (boot_cpu_data.x86 == 15))
833 value &= ~APIC_SPIV_FOCUS_DISABLED;
834 else
835 value |= APIC_SPIV_FOCUS_DISABLED;
836 value |= SPURIOUS_APIC_VECTOR;
837 apic_write_around(APIC_SPIV, value);
838
839 /*
840 * Set up the virtual wire mode.
841 */
842 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
843 value = APIC_DM_NMI;
844 if (!lapic_is_integrated()) /* 82489DX */
845 value |= APIC_LVT_LEVEL_TRIGGER;
846 apic_write_around(APIC_LVT1, value);
847}
848
849/**
850 * setup_local_APIC - setup the local APIC
851 */
852void __devinit setup_local_APIC(void)
853{
854 unsigned long oldvalue, value, maxlvt, integrated;
855 int i, j;
856
857 /* Pound the ESR really hard over the head with a big hammer - mbligh */
858 if (esr_disable) {
859 apic_write(APIC_ESR, 0);
860 apic_write(APIC_ESR, 0);
861 apic_write(APIC_ESR, 0);
862 apic_write(APIC_ESR, 0);
863 }
864
865 integrated = lapic_is_integrated();
866
867 /*
868 * Double-check whether this APIC is really registered.
869 */
870 if (!apic_id_registered())
871 BUG();
872
873 /*
874 * Intel recommends to set DFR, LDR and TPR before enabling
875 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
876 * document number 292116). So here it goes...
877 */
878 init_apic_ldr();
879
880 /*
881 * Set Task Priority to 'accept all'. We never change this
882 * later on.
883 */
884 value = apic_read(APIC_TASKPRI);
885 value &= ~APIC_TPRI_MASK;
886 apic_write_around(APIC_TASKPRI, value);
887
888 /*
889 * After a crash, we no longer service the interrupts and a pending
890 * interrupt from previous kernel might still have ISR bit set.
891 *
892 * Most probably by now the CPU has serviced that pending interrupt and
893 * it might not have done the ack_APIC_irq() because it thought the
894 * interrupt came from the i8259 as ExtInt. The LAPIC did not get an EOI, so
895 * it does not clear the ISR bit and the CPU thinks it has already serviced
896 * the interrupt. Hence a vector might get locked. This was noticed for the
897 * timer irq (vector 0x31). Issue an extra EOI to clear the ISR.
898 */
899 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
900 value = apic_read(APIC_ISR + i*0x10);
901 for (j = 31; j >= 0; j--) {
902 if (value & (1<<j))
903 ack_APIC_irq();
904 }
905 }
906
907 /*
908 * Now that we are all set up, enable the APIC
909 */
910 value = apic_read(APIC_SPIV);
911 value &= ~APIC_VECTOR_MASK;
912 /*
913 * Enable APIC
914 */
915 value |= APIC_SPIV_APIC_ENABLED;
916
917 /*
918 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
919 * certain networking cards. If high frequency interrupts are
920 * happening on a particular IOAPIC pin, plus the IOAPIC routing
921 * entry is masked/unmasked at a high rate as well then sooner or
922 * later IOAPIC line gets 'stuck', no more interrupts are received
923 * from the device. If focus CPU is disabled then the hang goes
924 * away, oh well :-(
925 *
926 * [ This bug can be reproduced easily with a level-triggered
927 * PCI Ne2000 networking cards and PII/PIII processors, dual
928 * BX chipset. ]
929 */
930 /*
931 * Actually disabling the focus CPU check just makes the hang less
932 * frequent as it makes the interrupt distribution model more
933 * like LRU than MRU (the short-term load is more even across CPUs).
934 * See also the comment in end_level_ioapic_irq(). --macro
935 */
936
937 /* Enable focus processor (bit==0) */
938 value &= ~APIC_SPIV_FOCUS_DISABLED;
939
940 /*
941 * Set spurious IRQ vector
942 */
943 value |= SPURIOUS_APIC_VECTOR;
944 apic_write_around(APIC_SPIV, value);
945
946 /*
947 * Set up LVT0, LVT1:
948 *
949 * set up through-local-APIC on the BP's LINT0. This is not
950 * strictly necessary in pure symmetric-IO mode, but sometimes
951 * we delegate interrupts to the 8259A.
952 */
953 /*
954 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
955 */
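	/*
	 * Keep only the mask bit of LVT0: on the BP, if PIC mode is in use
	 * or the BIOS left LVT0 unmasked, route ExtINT through LINT0;
	 * otherwise program it masked.
	 */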
956 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
957 if (!smp_processor_id() && (pic_mode || !value)) {
958 value = APIC_DM_EXTINT;
959 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
960 smp_processor_id());
961 } else {
962 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
963 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
964 smp_processor_id());
965 }
966 apic_write_around(APIC_LVT0, value);
967
968 /*
969 * only the BP should see the LINT1 NMI signal, obviously.
970 */
971 if (!smp_processor_id())
972 value = APIC_DM_NMI;
973 else
974 value = APIC_DM_NMI | APIC_LVT_MASKED;
975 if (!integrated) /* 82489DX */
976 value |= APIC_LVT_LEVEL_TRIGGER;
977 apic_write_around(APIC_LVT1, value);
978
979 if (integrated && !esr_disable) { /* !82489DX */
980 maxlvt = lapic_get_maxlvt();
981 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
982 apic_write(APIC_ESR, 0);
983 oldvalue = apic_read(APIC_ESR);
984
985 /* enables sending errors */
986 value = ERROR_APIC_VECTOR;
987 apic_write_around(APIC_LVTERR, value);
988 /*
989 * spec says clear errors after enabling vector.
990 */
991 if (maxlvt > 3)
992 apic_write(APIC_ESR, 0);
993 value = apic_read(APIC_ESR);
994 if (value != oldvalue)
995 apic_printk(APIC_VERBOSE, "ESR value before enabling "
996 "vector: 0x%08lx after: 0x%08lx\n",
997 oldvalue, value);
998 } else {
999 if (esr_disable)
1000 /*
1001 * Something untraceable is creating bad interrupts on
1002 * secondary quads ... for the moment, just leave the
1003 * ESR disabled - we can't do anything useful with the
1004 * errors anyway - mbligh
1005 */
1006 printk(KERN_INFO "Leaving ESR disabled.\n");
1007 else
1008 printk(KERN_INFO "No ESR for 82489DX.\n");
1009 }
1010
1011 /* Disable the local apic timer */
1012 value = apic_read(APIC_LVTT);
1013 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1014 apic_write_around(APIC_LVTT, value);
1015
1016 setup_apic_nmi_watchdog(NULL);
1017 apic_pm_activate();
1018}
1019
1020/*
1021 * Detect and initialize APIC
1022 */
1023static int __init detect_init_APIC (void)
1024{
1025 u32 h, l, features;
1026
1027 /* Disabled by kernel option? */
1028 if (enable_local_apic < 0)
1029 return -1;
1030
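	/*
	 * Only CPU families known to have a usable (or software-enableable)
	 * local APIC get past this switch: AMD K7 (family 6, model > 1) and
	 * family 15, Intel family 6 and 15, and family 5 parts that report
	 * an APIC via CPUID.
	 */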
1031 switch (boot_cpu_data.x86_vendor) {
1032 case X86_VENDOR_AMD:
1033 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
1034 (boot_cpu_data.x86 == 15))
1035 break;
1036 goto no_apic;
1037 case X86_VENDOR_INTEL:
1038 if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
1039 (boot_cpu_data.x86 == 5 && cpu_has_apic))
1040 break;
1041 goto no_apic;
1042 default:
1043 goto no_apic;
1044 }
1045
1046 if (!cpu_has_apic) {
1047 /*
1048 * Over-ride BIOS and try to enable the local APIC only if
1049 * "lapic" specified.
1050 */
1051 if (enable_local_apic <= 0) {
1052 printk(KERN_INFO "Local APIC disabled by BIOS -- "
1053 "you can enable it with \"lapic\"\n");
1054 return -1;
1055 }
1056 /*
1057 * Some BIOSes disable the local APIC in the APIC_BASE
1058 * MSR. This can only be done in software for Intel P6 or later
1059 * and AMD K7 (Model > 1) or later.
1060 */
1061 rdmsr(MSR_IA32_APICBASE, l, h);
1062 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1063 printk(KERN_INFO
1064 "Local APIC disabled by BIOS -- reenabling.\n");
1065 l &= ~MSR_IA32_APICBASE_BASE;
1066 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
1067 wrmsr(MSR_IA32_APICBASE, l, h);
1068 enabled_via_apicbase = 1;
1069 }
1070 }
1071 /*
1072 * The APIC feature bit should now be enabled
1073 * in `cpuid'
1074 */
1075 features = cpuid_edx(1);
1076 if (!(features & (1 << X86_FEATURE_APIC))) {
1077 printk(KERN_WARNING "Could not enable APIC!\n");
1078 return -1;
1079 }
1080 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1081 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1082
1083 /* The BIOS may have set up the APIC at some other address */
1084 rdmsr(MSR_IA32_APICBASE, l, h);
1085 if (l & MSR_IA32_APICBASE_ENABLE)
1086 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1087
1088 if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
1089 nmi_watchdog = NMI_LOCAL_APIC;
1090
1091 printk(KERN_INFO "Found and enabled local APIC!\n");
1092
1093 apic_pm_activate();
1094
1095 return 0;
1096
1097no_apic:
1098 printk(KERN_INFO "No local APIC present or hardware disabled\n");
1099 return -1;
1100}
1101
1102/**
1103 * init_apic_mappings - initialize APIC mappings
1104 */
1105void __init init_apic_mappings(void)
1106{
1107 unsigned long apic_phys;
1108
1109 /*
1110 * If no local APIC can be found then set up a fake all
1111 * zeroes page to simulate the local APIC and another
1112 * one for the IO-APIC.
1113 */
1114 if (!smp_found_config && detect_init_APIC()) {
1115 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
1116 apic_phys = __pa(apic_phys);
1117 } else
1118 apic_phys = mp_lapic_addr;
1119
1120 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1121 printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
1122 apic_phys);
1123
1124 /*
1125 * Fetch the APIC ID of the BSP in case we have a
1126 * default configuration (or the MP table is broken).
1127 */
1128 if (boot_cpu_physical_apicid == -1U)
1129 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
1130
1131#ifdef CONFIG_X86_IO_APIC
1132 {
1133 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
1134 int i;
1135
1136 for (i = 0; i < nr_ioapics; i++) {
1137 if (smp_found_config) {
1138 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
1139 if (!ioapic_phys) {
1140 printk(KERN_ERR
1141 "WARNING: bogus zero IO-APIC "
1142 "address found in MPTABLE, "
1143 "disabling IO/APIC support!\n");
1144 smp_found_config = 0;
1145 skip_ioapic_setup = 1;
1146 goto fake_ioapic_page;
1147 }
1148 } else {
1149fake_ioapic_page:
1150 ioapic_phys = (unsigned long)
1151 alloc_bootmem_pages(PAGE_SIZE);
1152 ioapic_phys = __pa(ioapic_phys);
1153 }
1154 set_fixmap_nocache(idx, ioapic_phys);
1155 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
1156 __fix_to_virt(idx), ioapic_phys);
1157 idx++;
1158 }
1159 }
1160#endif
1161}
1162
1163/*
1164 * This initializes the IO-APIC and APIC hardware if this is
1165 * a UP kernel.
1166 */
1167int __init APIC_init_uniprocessor (void)
1168{
1169 if (enable_local_apic < 0)
1170 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1171
1172 if (!smp_found_config && !cpu_has_apic)
1173 return -1;
1174
1175 /*
1176 * Complain if the BIOS pretends there is one.
1177 */
1178 if (!cpu_has_apic &&
1179 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1180 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1181 boot_cpu_physical_apicid);
1182 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1183 return -1;
1184 }
1185
1186 verify_local_APIC();
1187
1188 connect_bsp_APIC();
1189
1190 /*
1191 * Hack: In case of kdump, after a crash, kernel might be booting
1192 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
1193 * might be zero if read from MP tables. Get it from LAPIC.
1194 */
1195#ifdef CONFIG_CRASH_DUMP
1196 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
1197#endif
1198 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
1199
1200 setup_local_APIC();
1201
1202#ifdef CONFIG_X86_IO_APIC
1203 if (smp_found_config)
1204 if (!skip_ioapic_setup && nr_ioapics)
1205 setup_IO_APIC();
1206#endif
1207 setup_boot_clock();
1208
1209 return 0;
1210}
1211
1212/*
1213 * APIC command line parameters
1214 */
1215static int __init parse_lapic(char *arg)
1216{
1217 enable_local_apic = 1;
1218 return 0;
1219}
1220early_param("lapic", parse_lapic);
1221
1222static int __init parse_nolapic(char *arg)
1223{
1224 enable_local_apic = -1;
1225 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1226 return 0;
1227}
1228early_param("nolapic", parse_nolapic);
1229
1230static int __init parse_disable_lapic_timer(char *arg)
1231{
1232 local_apic_timer_disabled = 1;
1233 return 0;
1234}
1235early_param("nolapic_timer", parse_disable_lapic_timer);
1236
1237static int __init parse_lapic_timer_c2_ok(char *arg)
1238{
1239 local_apic_timer_c2_ok = 1;
1240 return 0;
1241}
1242early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1243
1244static int __init apic_set_verbosity(char *str)
1245{
1246 if (strcmp("debug", str) == 0)
1247 apic_verbosity = APIC_DEBUG;
1248 else if (strcmp("verbose", str) == 0)
1249 apic_verbosity = APIC_VERBOSE;
1250 return 1;
1251}
1252
1253__setup("apic=", apic_set_verbosity);
1254
1255
1256/*
1257 * Local APIC interrupts
1258 */
1259
1260/*
1261 * This interrupt should _never_ happen with our APIC/SMP architecture
1262 */
1263void smp_spurious_interrupt(struct pt_regs *regs)
1264{
1265 unsigned long v;
1266
1267 irq_enter();
1268 /*
1269 * Check if this really is a spurious interrupt and ACK it
1270 * if it is a vectored one. Just in case...
1271 * Spurious interrupts should not be ACKed.
1272 */
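	/*
	 * The ISR is an array of 32-bit registers spaced 0x10 apart, each
	 * covering 32 vectors, so (vector & ~0x1f) >> 1 yields the offset
	 * of the register that holds this vector's bit.
	 */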
1273 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1274 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1275 ack_APIC_irq();
1276
1277 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1278 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
1279 "should never happen.\n", smp_processor_id());
1280 irq_exit();
1281}
1282
1283/*
1284 * This interrupt should never happen with our APIC/SMP architecture
1285 */
1286void smp_error_interrupt(struct pt_regs *regs)
1287{
1288 unsigned long v, v1;
1289
1290 irq_enter();
1291 /* First tickle the hardware, only then report what went on. -- REW */
1292 v = apic_read(APIC_ESR);
1293 apic_write(APIC_ESR, 0);
1294 v1 = apic_read(APIC_ESR);
1295 ack_APIC_irq();
1296 atomic_inc(&irq_err_count);
1297
1298 /* Here is what the APIC error bits mean:
1299 0: Send CS error
1300 1: Receive CS error
1301 2: Send accept error
1302 3: Receive accept error
1303 4: Reserved
1304 5: Send illegal vector
1305 6: Received illegal vector
1306 7: Illegal register address
1307 */
1308 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
1309 smp_processor_id(), v , v1);
1310 irq_exit();
1311}
1312
1313/*
1314 * Initialize APIC interrupts
1315 */
1316void __init apic_intr_init(void)
1317{
1318#ifdef CONFIG_SMP
1319 smp_intr_init();
1320#endif
1321 /* self generated IPI for local APIC timer */
1322 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1323
1324 /* IPI vectors for APIC spurious and error interrupts */
1325 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1326 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1327
1328 /* thermal monitor LVT interrupt */
1329#ifdef CONFIG_X86_MCE_P4THERMAL
1330 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1331#endif
1332}
1333
1334/**
1335 * connect_bsp_APIC - attach the APIC to the interrupt system
1336 */
1337void __init connect_bsp_APIC(void)
1338{
1339 if (pic_mode) {
1340 /*
1341 * Do not trust the local APIC being empty at bootup.
1342 */
1343 clear_local_APIC();
1344 /*
1345 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1346 * local APIC to INT and NMI lines.
1347 */
1348 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1349 "enabling APIC mode.\n");
1350 outb(0x70, 0x22);
1351 outb(0x01, 0x23);
1352 }
1353 enable_apic_mode();
1354}
1355
1356/**
1357 * disconnect_bsp_APIC - detach the APIC from the interrupt system
1358 * @virt_wire_setup: indicates whether virtual wire mode is selected
1359 *
1360 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1361 * APIC is disabled.
1362 */
1363void disconnect_bsp_APIC(int virt_wire_setup)
1364{
1365 if (pic_mode) {
1366 /*
1367 * Put the board back into PIC mode (has an effect only on
1368 * certain older boards). Note that APIC interrupts, including
1369 * IPIs, won't work beyond this point! The only exception are
1370 * INIT IPIs.
1371 */
1372 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1373 "entering PIC mode.\n");
1374 outb(0x70, 0x22);
1375 outb(0x00, 0x23);
1376 } else {
1377 /* Go back to Virtual Wire compatibility mode */
1378 unsigned long value;
1379
1380 /* For the spurious interrupt use vector F, and enable it */
1381 value = apic_read(APIC_SPIV);
1382 value &= ~APIC_VECTOR_MASK;
1383 value |= APIC_SPIV_APIC_ENABLED;
1384 value |= 0xf;
1385 apic_write_around(APIC_SPIV, value);
1386
1387 if (!virt_wire_setup) {
1388 /*
1389 * For LVT0 make it edge triggered, active high,
1390 * external and enabled
1391 */
1392 value = apic_read(APIC_LVT0);
1393 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1394 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1395 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
1396 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1397 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1398 apic_write_around(APIC_LVT0, value);
1399 } else {
1400 /* Disable LVT0 */
1401 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
1402 }
1403
1404 /*
1405 * For LVT1 make it edge triggered, active high, nmi and
1406 * enabled
1407 */
1408 value = apic_read(APIC_LVT1);
1409 value &= ~(
1410 APIC_MODE_MASK | APIC_SEND_PENDING |
1411 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1412 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1413 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1414 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1415 apic_write_around(APIC_LVT1, value);
1416 }
1417}
1418
1419/*
1420 * Power management
1421 */
1422#ifdef CONFIG_PM
1423
1424static struct {
1425 int active;
1426 /* r/w apic fields */
1427 unsigned int apic_id;
1428 unsigned int apic_taskpri;
1429 unsigned int apic_ldr;
1430 unsigned int apic_dfr;
1431 unsigned int apic_spiv;
1432 unsigned int apic_lvtt;
1433 unsigned int apic_lvtpc;
1434 unsigned int apic_lvt0;
1435 unsigned int apic_lvt1;
1436 unsigned int apic_lvterr;
1437 unsigned int apic_tmict;
1438 unsigned int apic_tdcr;
1439 unsigned int apic_thmr;
1440} apic_pm_state;
1441
1442static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1443{
1444 unsigned long flags;
1445 int maxlvt;
1446
1447 if (!apic_pm_state.active)
1448 return 0;
1449
1450 maxlvt = lapic_get_maxlvt();
1451
1452 apic_pm_state.apic_id = apic_read(APIC_ID);
1453 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1454 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1455 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
1456 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
1457 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
1458 if (maxlvt >= 4)
1459 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
1460 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
1461 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
1462 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1463 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1464 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1465#ifdef CONFIG_X86_MCE_P4THERMAL
1466 if (maxlvt >= 5)
1467 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1468#endif
1469
1470 local_irq_save(flags);
1471 disable_local_APIC();
1472 local_irq_restore(flags);
1473 return 0;
1474}
1475
1476static int lapic_resume(struct sys_device *dev)
1477{
1478 unsigned int l, h;
1479 unsigned long flags;
1480 int maxlvt;
1481
1482 if (!apic_pm_state.active)
1483 return 0;
1484
1485 maxlvt = lapic_get_maxlvt();
1486
1487 local_irq_save(flags);
1488
1489 /*
1490 * Make sure the APICBASE points to the right address
1491 *
1492 * FIXME! This will be wrong if we ever support suspend on
1493 * SMP! We'll need to do this as part of the CPU restore!
1494 */
1495 rdmsr(MSR_IA32_APICBASE, l, h);
1496 l &= ~MSR_IA32_APICBASE_BASE;
1497 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1498 wrmsr(MSR_IA32_APICBASE, l, h);
1499
1500 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1501 apic_write(APIC_ID, apic_pm_state.apic_id);
1502 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
1503 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
1504 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
1505 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1506 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1507 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1508#ifdef CONFIG_X86_MCE_P4THERMAL
1509 if (maxlvt >= 5)
1510 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1511#endif
1512 if (maxlvt >= 4)
1513 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
1514 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
1515 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
1516 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
1517 apic_write(APIC_ESR, 0);
1518 apic_read(APIC_ESR);
1519 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1520 apic_write(APIC_ESR, 0);
1521 apic_read(APIC_ESR);
1522 local_irq_restore(flags);
1523 return 0;
1524}
1525
1526/*
1527 * This device has no shutdown method - fully functioning local APICs
1528 * are needed on every CPU up until machine_halt/restart/poweroff.
1529 */
1530
1531static struct sysdev_class lapic_sysclass = {
1532 set_kset_name("lapic"),
1533 .resume = lapic_resume,
1534 .suspend = lapic_suspend,
1535};
1536
1537static struct sys_device device_lapic = {
1538 .id = 0,
1539 .cls = &lapic_sysclass,
1540};
1541
1542static void __devinit apic_pm_activate(void)
1543{
1544 apic_pm_state.active = 1;
1545}
1546
1547static int __init init_lapic_sysfs(void)
1548{
1549 int error;
1550
1551 if (!cpu_has_apic)
1552 return 0;
1553 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1554
1555 error = sysdev_class_register(&lapic_sysclass);
1556 if (!error)
1557 error = sysdev_register(&device_lapic);
1558 return error;
1559}
1560device_initcall(init_lapic_sysfs);
1561
1562#else /* CONFIG_PM */
1563
1564static void apic_pm_activate(void) { }
1565
1566#endif /* CONFIG_PM */
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
new file mode 100644
index 000000000000..925758dbca0c
--- /dev/null
+++ b/arch/x86/kernel/apic_64.c
@@ -0,0 +1,1253 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h>
26#include <linux/module.h>
27#include <linux/ioport.h>
28
29#include <asm/atomic.h>
30#include <asm/smp.h>
31#include <asm/mtrr.h>
32#include <asm/mpspec.h>
33#include <asm/pgalloc.h>
34#include <asm/mach_apic.h>
35#include <asm/nmi.h>
36#include <asm/idle.h>
37#include <asm/proto.h>
38#include <asm/timex.h>
39#include <asm/hpet.h>
40#include <asm/apic.h>
41
42int apic_mapped;
43int apic_verbosity;
44int apic_runs_main_timer;
45int apic_calibrate_pmtmr __initdata;
46
47int disable_apic_timer __initdata;
48
49/* Local APIC timer works in C2? */
50int local_apic_timer_c2_ok;
51EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
52
53static struct resource *ioapic_resources;
54static struct resource lapic_resource = {
55 .name = "Local APIC",
56 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
57};
58
59/*
60 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
61 * IPIs in place of local APIC timers
62 */
63static cpumask_t timer_interrupt_broadcast_ipi_mask;
64
65/* Using APIC to generate smp_local_timer_interrupt? */
66int using_apic_timer __read_mostly = 0;
67
68static void apic_pm_activate(void);
69
70void apic_wait_icr_idle(void)
71{
72 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
73 cpu_relax();
74}
75
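/*
 * Like apic_wait_icr_idle(), but give up after roughly 100ms
 * (1000 polls, 100us apart) and return the last busy status
 * instead of spinning forever.
 */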
76unsigned int safe_apic_wait_icr_idle(void)
77{
78 unsigned int send_status;
79 int timeout;
80
81 timeout = 0;
82 do {
83 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
84 if (!send_status)
85 break;
86 udelay(100);
87 } while (timeout++ < 1000);
88
89 return send_status;
90}
91
92void enable_NMI_through_LVT0 (void * dummy)
93{
94 unsigned int v;
95
96 /* unmask and set to NMI */
97 v = APIC_DM_NMI;
98 apic_write(APIC_LVT0, v);
99}
100
101int get_maxlvt(void)
102{
103 unsigned int v, maxlvt;
104
105 v = apic_read(APIC_LVR);
106 maxlvt = GET_APIC_MAXLVT(v);
107 return maxlvt;
108}
109
110/*
111 * 'what should we do if we get a hw irq event on an illegal vector'.
112 * Each architecture has to answer this itself.
113 */
114void ack_bad_irq(unsigned int irq)
115{
116 printk("unexpected IRQ trap at vector %02x\n", irq);
117 /*
118 * Currently unexpected vectors happen only on SMP and APIC.
119 * We _must_ ack these because every local APIC has only N
120 * irq slots per priority level, and a 'hanging, unacked' IRQ
121 * holds up an irq slot - in excessive cases (when multiple
122 * unexpected vectors occur) that might lock up the APIC
123 * completely.
124 * But don't ack when the APIC is disabled. -AK
125 */
126 if (!disable_apic)
127 ack_APIC_irq();
128}
129
130void clear_local_APIC(void)
131{
132 int maxlvt;
133 unsigned int v;
134
135 maxlvt = get_maxlvt();
136
137 /*
138 * Masking an LVT entry can trigger a local APIC error
139 * if the vector is zero. Mask LVTERR first to prevent this.
140 */
141 if (maxlvt >= 3) {
142 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
143 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
144 }
145 /*
146 * Careful: we have to set masks only first to deassert
147 * any level-triggered sources.
148 */
149 v = apic_read(APIC_LVTT);
150 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
151 v = apic_read(APIC_LVT0);
152 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
153 v = apic_read(APIC_LVT1);
154 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
155 if (maxlvt >= 4) {
156 v = apic_read(APIC_LVTPC);
157 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
158 }
159
160 /*
161 * Clean APIC state for other OSs:
162 */
163 apic_write(APIC_LVTT, APIC_LVT_MASKED);
164 apic_write(APIC_LVT0, APIC_LVT_MASKED);
165 apic_write(APIC_LVT1, APIC_LVT_MASKED);
166 if (maxlvt >= 3)
167 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
168 if (maxlvt >= 4)
169 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
170 apic_write(APIC_ESR, 0);
171 apic_read(APIC_ESR);
172}
173
174void disconnect_bsp_APIC(int virt_wire_setup)
175{
176 /* Go back to Virtual Wire compatibility mode */
177 unsigned long value;
178
179 /* For the spurious interrupt use vector F, and enable it */
180 value = apic_read(APIC_SPIV);
181 value &= ~APIC_VECTOR_MASK;
182 value |= APIC_SPIV_APIC_ENABLED;
183 value |= 0xf;
184 apic_write(APIC_SPIV, value);
185
186 if (!virt_wire_setup) {
187 /* For LVT0 make it edge triggered, active high, external and enabled */
188 value = apic_read(APIC_LVT0);
189 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
190 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
191 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
192 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
193 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
194 apic_write(APIC_LVT0, value);
195 } else {
196 /* Disable LVT0 */
197 apic_write(APIC_LVT0, APIC_LVT_MASKED);
198 }
199
200 /* For LVT1 make it edge triggered, active high, nmi and enabled */
201 value = apic_read(APIC_LVT1);
202 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
203 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
204 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
205 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
206 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
207 apic_write(APIC_LVT1, value);
208}
209
210void disable_local_APIC(void)
211{
212 unsigned int value;
213
214 clear_local_APIC();
215
216 /*
217 * Disable APIC (implies clearing of registers
218 * for 82489DX!).
219 */
220 value = apic_read(APIC_SPIV);
221 value &= ~APIC_SPIV_APIC_ENABLED;
222 apic_write(APIC_SPIV, value);
223}
224
225/*
226 * This is to verify that we're looking at a real local APIC.
227 * Check these against your board if the CPUs aren't getting
228 * started for no apparent reason.
229 */
230int __init verify_local_APIC(void)
231{
232 unsigned int reg0, reg1;
233
234 /*
235 * The version register is read-only in a real APIC.
236 */
237 reg0 = apic_read(APIC_LVR);
238 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
239 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
240 reg1 = apic_read(APIC_LVR);
241 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
242
243 /*
244 * The two version reads above should print the same
245 * numbers. If the second one is different, then we
246 * poke at a non-APIC.
247 */
248 if (reg1 != reg0)
249 return 0;
250
251 /*
252 * Check if the version looks reasonable.
253 */
254 reg1 = GET_APIC_VERSION(reg0);
255 if (reg1 == 0x00 || reg1 == 0xff)
256 return 0;
257 reg1 = get_maxlvt();
258 if (reg1 < 0x02 || reg1 == 0xff)
259 return 0;
260
261 /*
262 * The ID register is read/write in a real APIC.
263 */
264 reg0 = apic_read(APIC_ID);
265 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
266 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
267 reg1 = apic_read(APIC_ID);
268 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
269 apic_write(APIC_ID, reg0);
270 if (reg1 != (reg0 ^ APIC_ID_MASK))
271 return 0;
272
273 /*
274 * The next two are just to see if we have sane values.
275 * They're only really relevant if we're in Virtual Wire
276 * compatibility mode, but most boxes aren't anymore.
277 */
278 reg0 = apic_read(APIC_LVT0);
279 apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
280 reg1 = apic_read(APIC_LVT1);
281 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
282
283 return 1;
284}
285
286void __init sync_Arb_IDs(void)
287{
288 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
289 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
290 if (ver >= 0x14) /* P4 or higher */
291 return;
292
293 /*
294 * Wait for idle.
295 */
296 apic_wait_icr_idle();
297
298 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
299 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
300 | APIC_DM_INIT);
301}
302
303/*
304 * An initial setup of the virtual wire mode.
305 */
306void __init init_bsp_APIC(void)
307{
308 unsigned int value;
309
310 /*
311 * Don't do the setup now if we have an SMP BIOS as the
312 * through-I/O-APIC virtual wire mode might be active.
313 */
314 if (smp_found_config || !cpu_has_apic)
315 return;
316
317 value = apic_read(APIC_LVR);
318
319 /*
320 * Do not trust the local APIC being empty at bootup.
321 */
322 clear_local_APIC();
323
324 /*
325 * Enable APIC.
326 */
327 value = apic_read(APIC_SPIV);
328 value &= ~APIC_VECTOR_MASK;
329 value |= APIC_SPIV_APIC_ENABLED;
330 value |= APIC_SPIV_FOCUS_DISABLED;
331 value |= SPURIOUS_APIC_VECTOR;
332 apic_write(APIC_SPIV, value);
333
334 /*
335 * Set up the virtual wire mode.
336 */
337 apic_write(APIC_LVT0, APIC_DM_EXTINT);
338 value = APIC_DM_NMI;
339 apic_write(APIC_LVT1, value);
340}
341
342void __cpuinit setup_local_APIC (void)
343{
344 unsigned int value, maxlvt;
345 int i, j;
346
347 value = apic_read(APIC_LVR);
348
349 BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
350
351 /*
352 * Double-check whether this APIC is really registered.
353 * This is meaningless in clustered apic mode, so we skip it.
354 */
355 if (!apic_id_registered())
356 BUG();
357
358 /*
359 * Intel recommends to set DFR, LDR and TPR before enabling
360 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
361 * document number 292116). So here it goes...
362 */
363 init_apic_ldr();
364
365 /*
366 * Set Task Priority to 'accept all'. We never change this
367 * later on.
368 */
369 value = apic_read(APIC_TASKPRI);
370 value &= ~APIC_TPRI_MASK;
371 apic_write(APIC_TASKPRI, value);
372
373 /*
374 * After a crash, we no longer service the interrupts and a pending
375 * interrupt from previous kernel might still have ISR bit set.
376 *
377 * Most probably by now the CPU has serviced that pending interrupt and
378 * it might not have done the ack_APIC_irq() because it thought the
379 * interrupt came from the i8259 as ExtInt. The LAPIC did not get an EOI, so
380 * it does not clear the ISR bit and the CPU thinks it has already serviced
381 * the interrupt. Hence a vector might get locked. This was noticed for the
382 * timer irq (vector 0x31). Issue an extra EOI to clear the ISR.
383 */
384 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
385 value = apic_read(APIC_ISR + i*0x10);
386 for (j = 31; j >= 0; j--) {
387 if (value & (1<<j))
388 ack_APIC_irq();
389 }
390 }
391
392 /*
393 * Now that we are all set up, enable the APIC
394 */
395 value = apic_read(APIC_SPIV);
396 value &= ~APIC_VECTOR_MASK;
397 /*
398 * Enable APIC
399 */
400 value |= APIC_SPIV_APIC_ENABLED;
401
402 /* We always use processor focus */
403
404 /*
405 * Set spurious IRQ vector
406 */
407 value |= SPURIOUS_APIC_VECTOR;
408 apic_write(APIC_SPIV, value);
409
410 /*
411 * Set up LVT0, LVT1:
412 *
413 * set up through-local-APIC on the BP's LINT0. This is not
414 * strictly necessary in pure symmetric-IO mode, but sometimes
415 * we delegate interrupts to the 8259A.
416 */
417 /*
418 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
419 */
420 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
421 if (!smp_processor_id() && !value) {
422 value = APIC_DM_EXTINT;
423 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
424 } else {
425 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
426 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
427 }
428 apic_write(APIC_LVT0, value);
429
430 /*
431 * only the BP should see the LINT1 NMI signal, obviously.
432 */
433 if (!smp_processor_id())
434 value = APIC_DM_NMI;
435 else
436 value = APIC_DM_NMI | APIC_LVT_MASKED;
437 apic_write(APIC_LVT1, value);
438
439 {
440 unsigned oldvalue;
441 maxlvt = get_maxlvt();
442 oldvalue = apic_read(APIC_ESR);
443 value = ERROR_APIC_VECTOR; // enables sending errors
444 apic_write(APIC_LVTERR, value);
445 /*
446 * spec says clear errors after enabling vector.
447 */
448 if (maxlvt > 3)
449 apic_write(APIC_ESR, 0);
450 value = apic_read(APIC_ESR);
451 if (value != oldvalue)
452 apic_printk(APIC_VERBOSE,
453 "ESR value after enabling vector: %08x, after %08x\n",
454 oldvalue, value);
455 }
456
457 nmi_watchdog_default();
458 setup_apic_nmi_watchdog(NULL);
459 apic_pm_activate();
460}
461
462#ifdef CONFIG_PM
463
464static struct {
465 /* 'active' is true if the local APIC was enabled by us and
466 not the BIOS; this signifies that we are also responsible
467 for disabling it before entering apm/acpi suspend */
468 int active;
469 /* r/w apic fields */
470 unsigned int apic_id;
471 unsigned int apic_taskpri;
472 unsigned int apic_ldr;
473 unsigned int apic_dfr;
474 unsigned int apic_spiv;
475 unsigned int apic_lvtt;
476 unsigned int apic_lvtpc;
477 unsigned int apic_lvt0;
478 unsigned int apic_lvt1;
479 unsigned int apic_lvterr;
480 unsigned int apic_tmict;
481 unsigned int apic_tdcr;
482 unsigned int apic_thmr;
483} apic_pm_state;
484
485static int lapic_suspend(struct sys_device *dev, pm_message_t state)
486{
487 unsigned long flags;
488 int maxlvt;
489
490 if (!apic_pm_state.active)
491 return 0;
492
493 maxlvt = get_maxlvt();
494
495 apic_pm_state.apic_id = apic_read(APIC_ID);
496 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
497 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
498 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
499 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
500 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
501 if (maxlvt >= 4)
502 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
503 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
504 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
505 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
506 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
507 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
508#ifdef CONFIG_X86_MCE_INTEL
509 if (maxlvt >= 5)
510 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
511#endif
512 local_irq_save(flags);
513 disable_local_APIC();
514 local_irq_restore(flags);
515 return 0;
516}
517
518static int lapic_resume(struct sys_device *dev)
519{
520 unsigned int l, h;
521 unsigned long flags;
522 int maxlvt;
523
524 if (!apic_pm_state.active)
525 return 0;
526
527 maxlvt = get_maxlvt();
528
529 local_irq_save(flags);
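	/*
	 * Point the APIC base MSR back at mp_lapic_addr and re-enable it
	 * before restoring the saved registers.
	 */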
530 rdmsr(MSR_IA32_APICBASE, l, h);
531 l &= ~MSR_IA32_APICBASE_BASE;
532 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
533 wrmsr(MSR_IA32_APICBASE, l, h);
534 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
535 apic_write(APIC_ID, apic_pm_state.apic_id);
536 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
537 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
538 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
539 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
540 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
541 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
542#ifdef CONFIG_X86_MCE_INTEL
543 if (maxlvt >= 5)
544 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
545#endif
546 if (maxlvt >= 4)
547 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
548 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
549 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
550 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
551 apic_write(APIC_ESR, 0);
552 apic_read(APIC_ESR);
553 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
554 apic_write(APIC_ESR, 0);
555 apic_read(APIC_ESR);
556 local_irq_restore(flags);
557 return 0;
558}
559
560static struct sysdev_class lapic_sysclass = {
561 set_kset_name("lapic"),
562 .resume = lapic_resume,
563 .suspend = lapic_suspend,
564};
565
566static struct sys_device device_lapic = {
567 .id = 0,
568 .cls = &lapic_sysclass,
569};
570
571static void __cpuinit apic_pm_activate(void)
572{
573 apic_pm_state.active = 1;
574}
575
576static int __init init_lapic_sysfs(void)
577{
578 int error;
579 if (!cpu_has_apic)
580 return 0;
581 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
582 error = sysdev_class_register(&lapic_sysclass);
583 if (!error)
584 error = sysdev_register(&device_lapic);
585 return error;
586}
587device_initcall(init_lapic_sysfs);
588
589#else /* CONFIG_PM */
590
591static void apic_pm_activate(void) { }
592
593#endif /* CONFIG_PM */
594
595static int __init apic_set_verbosity(char *str)
596{
597 if (str == NULL) {
598 skip_ioapic_setup = 0;
599 ioapic_force = 1;
600 return 0;
601 }
602 if (strcmp("debug", str) == 0)
603 apic_verbosity = APIC_DEBUG;
604 else if (strcmp("verbose", str) == 0)
605 apic_verbosity = APIC_VERBOSE;
606 else {
607 printk(KERN_WARNING "APIC Verbosity level %s not recognised,"
608 " use apic=verbose or apic=debug\n", str);
609 return -EINVAL;
610 }
611
612 return 0;
613}
614early_param("apic", apic_set_verbosity);
615
616/*
617 * Detect and enable local APICs on non-SMP boards.
618 * Original code written by Keir Fraser.
619 * On AMD64 we trust the BIOS - if it says no APIC it is likely
620 * not correctly set up (usually the APIC timer won't work etc.)
621 */
622
623static int __init detect_init_APIC (void)
624{
625 if (!cpu_has_apic) {
626 printk(KERN_INFO "No local APIC present\n");
627 return -1;
628 }
629
630 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
631 boot_cpu_id = 0;
632 return 0;
633}
634
635#ifdef CONFIG_X86_IO_APIC
636static struct resource * __init ioapic_setup_resources(void)
637{
638#define IOAPIC_RESOURCE_NAME_SIZE 11
639 unsigned long n;
640 struct resource *res;
641 char *mem;
642 int i;
643
644 if (nr_ioapics <= 0)
645 return NULL;
646
647 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
648 n *= nr_ioapics;
649
650 mem = alloc_bootmem(n);
651 res = (void *)mem;
652
653 if (mem != NULL) {
654 memset(mem, 0, n);
655 mem += sizeof(struct resource) * nr_ioapics;
656
657 for (i = 0; i < nr_ioapics; i++) {
658 res[i].name = mem;
659 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
660 sprintf(mem, "IOAPIC %u", i);
661 mem += IOAPIC_RESOURCE_NAME_SIZE;
662 }
663 }
664
665 ioapic_resources = res;
666
667 return res;
668}
669
670static int __init ioapic_insert_resources(void)
671{
672 int i;
673 struct resource *r = ioapic_resources;
674
675 if (!r) {
676 printk("IO APIC resources could be not be allocated.\n");
677 return -1;
678 }
679
680 for (i = 0; i < nr_ioapics; i++) {
681 insert_resource(&iomem_resource, r);
682 r++;
683 }
684
685 return 0;
686}
687
688/* Insert the IO APIC resources after PCI initialization has occurred to handle
689 * IO APICS that are mapped in on a BAR in PCI space. */
690late_initcall(ioapic_insert_resources);
691#endif
692
693void __init init_apic_mappings(void)
694{
695 unsigned long apic_phys;
696
697 /*
698 * If no local APIC can be found then set up a fake all
699 * zeroes page to simulate the local APIC and another
700 * one for the IO-APIC.
701 */
702 if (!smp_found_config && detect_init_APIC()) {
703 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
704 apic_phys = __pa(apic_phys);
705 } else
706 apic_phys = mp_lapic_addr;
707
708 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
709 apic_mapped = 1;
710 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
711
712 /* Put local APIC into the resource map. */
713 lapic_resource.start = apic_phys;
714 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
715 insert_resource(&iomem_resource, &lapic_resource);
716
717 /*
718 * Fetch the APIC ID of the BSP in case we have a
719 * default configuration (or the MP table is broken).
720 */
721 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
722
723 {
724 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
725 int i;
726 struct resource *ioapic_res;
727
728 ioapic_res = ioapic_setup_resources();
729 for (i = 0; i < nr_ioapics; i++) {
730 if (smp_found_config) {
731 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
732 } else {
733 ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
734 ioapic_phys = __pa(ioapic_phys);
735 }
736 set_fixmap_nocache(idx, ioapic_phys);
737 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
738 __fix_to_virt(idx), ioapic_phys);
739 idx++;
740
741 if (ioapic_res != NULL) {
742 ioapic_res->start = ioapic_phys;
743 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
744 ioapic_res++;
745 }
746 }
747 }
748}
749
750/*
751 * This function sets up the local APIC timer, with a timeout of
752 * 'clocks' APIC bus clock. During calibration we actually call
753 * this function twice on the boot CPU, once with a bogus timeout
754 * value, second time for real. The other (noncalibrating) CPUs
755 * call this function only once, with the real, calibrated value.
756 *
757 * We do reads before writes even if unnecessary, to get around the
758 * P5 APIC double write bug.
759 */
760
761#define APIC_DIVISOR 16
762
763static void __setup_APIC_LVTT(unsigned int clocks)
764{
765 unsigned int lvtt_value, tmp_value;
766 int cpu = smp_processor_id();
767
768 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
769
770 if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
771 lvtt_value |= APIC_LVT_MASKED;
772
773 apic_write(APIC_LVTT, lvtt_value);
774
775 /*
776 * Divide PICLK by 16
777 */
778 tmp_value = apic_read(APIC_TDCR);
779 apic_write(APIC_TDCR, (tmp_value
780 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
781 | APIC_TDR_DIV_16);
782
783 apic_write(APIC_TMICT, clocks/APIC_DIVISOR);
784}
785
786static void setup_APIC_timer(unsigned int clocks)
787{
788 unsigned long flags;
789
790 local_irq_save(flags);
791
792 /* wait for irq slice */
793 if (hpet_address && hpet_use_timer) {
794 u32 trigger = hpet_readl(HPET_T0_CMP);
795 while (hpet_readl(HPET_T0_CMP) == trigger)
796 /* do nothing */ ;
797 } else {
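		/*
		 * No usable HPET: latch and read PIT counter 0 (command port
		 * 0x43, data port 0x40) and spin until the countdown wraps
		 * back up, i.e. until a new PIT period begins, so the APIC
		 * timer is started on an irq-slice boundary.
		 */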
798 int c1, c2;
799 outb_p(0x00, 0x43);
800 c2 = inb_p(0x40);
801 c2 |= inb_p(0x40) << 8;
802 do {
803 c1 = c2;
804 outb_p(0x00, 0x43);
805 c2 = inb_p(0x40);
806 c2 |= inb_p(0x40) << 8;
807 } while (c2 - c1 < 300);
808 }
809 __setup_APIC_LVTT(clocks);
810 /* Turn off PIT interrupt if we use APIC timer as main timer.
811 Only works with the PM timer right now
812 TBD fix it for HPET too. */
813 if ((pmtmr_ioport != 0) &&
814 smp_processor_id() == boot_cpu_id &&
815 apic_runs_main_timer == 1 &&
816 !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) {
817 stop_timer_interrupt();
818 apic_runs_main_timer++;
819 }
820 local_irq_restore(flags);
821}
822
823/*
824 * In this function we calibrate APIC bus clocks to the external
825 * timer. Unfortunately we cannot use jiffies and the timer irq
826 * to calibrate, since some later bootup code depends on getting
827 * the first irq? Ugh.
828 *
829 * We want to do the calibration only once since we
830 * want to have local timer irqs in sync. CPUs connected
831 * by the same APIC bus have the very same bus frequency.
832 * And we want to have irqs off anyway, no accidental
833 * APIC irq that way.
834 */
835
836#define TICK_COUNT 100000000
837
838static int __init calibrate_APIC_clock(void)
839{
840 unsigned apic, apic_start;
841 unsigned long tsc, tsc_start;
842 int result;
843 /*
844 * Put whatever arbitrary (but long enough) timeout
845 * value into the APIC clock, we just want to get the
846 * counter running for calibration.
847 */
848 __setup_APIC_LVTT(4000000000);
849
850 apic_start = apic_read(APIC_TMCCT);
851#ifdef CONFIG_X86_PM_TIMER
852 if (apic_calibrate_pmtmr && pmtmr_ioport) {
853 pmtimer_wait(5000); /* 5ms wait */
854 apic = apic_read(APIC_TMCCT);
855 result = (apic_start - apic) * 1000L / 5;
856 } else
857#endif
858 {
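		/*
		 * No PM timer: sample the APIC countdown and the TSC together
		 * until one of them has advanced by TICK_COUNT, then scale the
		 * elapsed APIC ticks by tsc_khz to get the APIC timer rate
		 * in Hz.
		 */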
859 rdtscll(tsc_start);
860
861 do {
862 apic = apic_read(APIC_TMCCT);
863 rdtscll(tsc);
864 } while ((tsc - tsc_start) < TICK_COUNT &&
865 (apic_start - apic) < TICK_COUNT);
866
867 result = (apic_start - apic) * 1000L * tsc_khz /
868 (tsc - tsc_start);
869 }
870 printk("result %d\n", result);
871
872
873 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
874 result / 1000 / 1000, result / 1000 % 1000);
875
876 return result * APIC_DIVISOR / HZ;
877}
878
879static unsigned int calibration_result;
880
881void __init setup_boot_APIC_clock (void)
882{
883 if (disable_apic_timer) {
884 printk(KERN_INFO "Disabling APIC timer\n");
885 return;
886 }
887
888 printk(KERN_INFO "Using local APIC timer interrupts.\n");
889 using_apic_timer = 1;
890
891 local_irq_disable();
892
893 calibration_result = calibrate_APIC_clock();
894 /*
895 * Now set up the timer for real.
896 */
897 setup_APIC_timer(calibration_result);
898
899 local_irq_enable();
900}
901
902void __cpuinit setup_secondary_APIC_clock(void)
903{
904 local_irq_disable(); /* FIXME: Do we need this? --RR */
905 setup_APIC_timer(calibration_result);
906 local_irq_enable();
907}
908
909void disable_APIC_timer(void)
910{
911 if (using_apic_timer) {
912 unsigned long v;
913
914 v = apic_read(APIC_LVTT);
915 /*
916 * When an illegal vector value (0-15) is written to an LVT
917 * entry and delivery mode is Fixed, the APIC may signal an
918 * illegal vector error, without regard to whether the mask
919 * bit is set or whether an interrupt is actually seen on input.
920 *
921 * The boot sequence might call this function when the LVTT has a
922 * '0' vector value. So make sure the vector field is set to a
923 * valid value.
924 */
925 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
926 apic_write(APIC_LVTT, v);
927 }
928}
929
930void enable_APIC_timer(void)
931{
932 int cpu = smp_processor_id();
933
934 if (using_apic_timer &&
935 !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
936 unsigned long v;
937
938 v = apic_read(APIC_LVTT);
939 apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED);
940 }
941}
942
943void switch_APIC_timer_to_ipi(void *cpumask)
944{
945 cpumask_t mask = *(cpumask_t *)cpumask;
946 int cpu = smp_processor_id();
947
948 if (cpu_isset(cpu, mask) &&
949 !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
950 disable_APIC_timer();
951 cpu_set(cpu, timer_interrupt_broadcast_ipi_mask);
952 }
953}
954EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
955
956void smp_send_timer_broadcast_ipi(void)
957{
958 int cpu = smp_processor_id();
959 cpumask_t mask;
960
961 cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask);
962
963 if (cpu_isset(cpu, mask)) {
964 cpu_clear(cpu, mask);
965 add_pda(apic_timer_irqs, 1);
966 smp_local_timer_interrupt();
967 }
968
969 if (!cpus_empty(mask)) {
970 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
971 }
972}
973
974void switch_ipi_to_APIC_timer(void *cpumask)
975{
976 cpumask_t mask = *(cpumask_t *)cpumask;
977 int cpu = smp_processor_id();
978
979 if (cpu_isset(cpu, mask) &&
980 cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
981 cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask);
982 enable_APIC_timer();
983 }
984}
985EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
986
987int setup_profiling_timer(unsigned int multiplier)
988{
989 return -EINVAL;
990}
991
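/*
 * Program a K8 extended LVT entry: entries start at K8_APIC_EXT_LVT_BASE
 * and are 16 bytes apart; the mask bit lives at bit 16, the message type
 * in bits 15:8 and the vector in bits 7:0.
 */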
992void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
993 unsigned char msg_type, unsigned char mask)
994{
995 unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
996 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
997 apic_write(reg, v);
998}
999
1000#undef APIC_DIVISOR
1001
1002/*
1003 * Local timer interrupt handler. It does both profiling and
1004 * process statistics/rescheduling.
1005 *
1006 * We do profiling in every local tick, statistics/rescheduling
1007 * happen only every 'profiling multiplier' ticks. The default
1008 * multiplier is 1 and it can be changed by writing the new multiplier
1009 * value into /proc/profile.
1010 */
1011
1012void smp_local_timer_interrupt(void)
1013{
1014 profile_tick(CPU_PROFILING);
1015#ifdef CONFIG_SMP
1016 update_process_times(user_mode(get_irq_regs()));
1017#endif
1018 if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id)
1019 main_timer_handler();
1020 /*
1021 * We take the 'long' return path, and there every subsystem
1022 * grabs the appropriate locks (kernel lock/ irq lock).
1023 *
1024 * We might want to decouple profiling from the 'long path',
1025 * and do the profiling totally in assembly.
1026 *
1027 * Currently this isn't too much of an issue (performance wise),
1028 * we can take more than 100K local irqs per second on a 100 MHz P5.
1029 */
1030}
1031
1032/*
1033 * Local APIC timer interrupt. This is the most natural way for doing
1034 * local interrupts, but local timer interrupts can be emulated by
1035 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
1036 *
1037 * [ if a single-CPU system runs an SMP kernel then we call the local
1038 * interrupt as well. Thus we cannot inline the local irq ... ]
1039 */
1040void smp_apic_timer_interrupt(struct pt_regs *regs)
1041{
1042 struct pt_regs *old_regs = set_irq_regs(regs);
1043
1044 /*
1045 * the NMI deadlock-detector uses this.
1046 */
1047 add_pda(apic_timer_irqs, 1);
1048
1049 /*
1050 * NOTE! We'd better ACK the irq immediately,
1051 * because timer handling can be slow.
1052 */
1053 ack_APIC_irq();
1054 /*
1055 * update_process_times() expects us to have done irq_enter().
1056 * Besides, if we don't, timer interrupts ignore the global
1057 * interrupt lock, which is the WrongThing (tm) to do.
1058 */
1059 exit_idle();
1060 irq_enter();
1061 smp_local_timer_interrupt();
1062 irq_exit();
1063 set_irq_regs(old_regs);
1064}
1065
1066/*
1067 * apic_is_clustered_box() -- Check if we can expect good TSC
1068 *
1069 * Thus far, the major user of this is IBM's Summit2 series:
1070 *
1071 * Clustered boxes may have unsynced TSC problems if they are
1072 * multi-chassis. Use available data to take a good guess.
1073 * If in doubt, go HPET.
1074 */
1075__cpuinit int apic_is_clustered_box(void)
1076{
1077 int i, clusters, zeros;
1078 unsigned id;
1079 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
1080
1081 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1082
1083 for (i = 0; i < NR_CPUS; i++) {
1084 id = bios_cpu_apicid[i];
1085 if (id != BAD_APICID)
1086 __set_bit(APIC_CLUSTERID(id), clustermap);
1087 }
1088
1089 /* Problem: Partially populated chassis may not have CPUs in some of
1090 * the APIC clusters they have been allocated. Only present CPUs have
1091 * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since
1092 * clusters are allocated sequentially, count zeros only if they are
1093 * bounded by ones.
1094 */
1095 clusters = 0;
1096 zeros = 0;
1097 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1098 if (test_bit(i, clustermap)) {
1099 clusters += 1 + zeros;
1100 zeros = 0;
1101 } else
1102 ++zeros;
1103 }
1104
1105 /*
1106 * If clusters > 2, then it should be multi-chassis.
1107 * May have to revisit this when multi-core + hyperthreaded CPUs come
1108 * out, but AFAIK this will work even for them.
1109 */
1110 return (clusters > 2);
1111}
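
As a standalone illustration of the counting rule in the comment above (hypothetical cluster map, not kernel code): with clusters 0, 1 and 3 populated and cluster 2 an allocated-but-empty hole, the hole is counted because it is bounded by ones, while a trailing empty cluster is not.

#include <stdio.h>

/* Hypothetical illustration of apic_is_clustered_box()'s counting rule:
 * zeros are only counted when they are bounded by ones. */
int main(void)
{
	int clustermap[5] = { 1, 1, 0, 1, 0 };	/* assumed example data */
	int clusters = 0, zeros = 0, i;

	for (i = 0; i < 5; i++) {
		if (clustermap[i]) {
			clusters += 1 + zeros;
			zeros = 0;
		} else {
			++zeros;
		}
	}
	printf("clusters = %d\n", clusters);	/* prints "clusters = 4" */
	return 0;
}
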
1112
1113/*
1114 * This interrupt should _never_ happen with our APIC/SMP architecture
1115 */
1116asmlinkage void smp_spurious_interrupt(void)
1117{
1118 unsigned int v;
1119 exit_idle();
1120 irq_enter();
1121 /*
1122 * Check if this really is a spurious interrupt and ACK it
1123 * if it is a vectored one. Just in case...
1124 * Spurious interrupts should not be ACKed.
1125 */
1126 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1127 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1128 ack_APIC_irq();
1129
1130 irq_exit();
1131}
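
The APIC_ISR arithmetic above relies on the in-service bits for 256 vectors being spread across eight 32-bit registers spaced 16 bytes apart, so a vector maps to a register offset of (vector & ~0x1f) >> 1 and a bit position of vector & 0x1f. A small userspace sketch of that mapping, using 0xff purely as an assumed example vector:

#include <stdio.h>

/* Sketch of the ISR lookup used in smp_spurious_interrupt(): compute the
 * byte offset from APIC_ISR and the bit within that 32-bit register for
 * a given vector.  0xff below is only an assumed example value. */
int main(void)
{
	unsigned int vector = 0xff;

	printf("offset = 0x%x, bit = %u\n",
	       (vector & ~0x1fu) >> 1, vector & 0x1f);
	/* prints "offset = 0x70, bit = 31" */
	return 0;
}
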
1132
1133/*
1134 * This interrupt should never happen with our APIC/SMP architecture
1135 */
1136
1137asmlinkage void smp_error_interrupt(void)
1138{
1139 unsigned int v, v1;
1140
1141 exit_idle();
1142 irq_enter();
1143 /* First tickle the hardware, only then report what went on. -- REW */
1144 v = apic_read(APIC_ESR);
1145 apic_write(APIC_ESR, 0);
1146 v1 = apic_read(APIC_ESR);
1147 ack_APIC_irq();
1148 atomic_inc(&irq_err_count);
1149
1150 /* Here is what the APIC error bits mean:
1151 0: Send CS error
1152 1: Receive CS error
1153 2: Send accept error
1154 3: Receive accept error
1155 4: Reserved
1156 5: Send illegal vector
1157 6: Received illegal vector
1158 7: Illegal register address
1159 */
1160 printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1161 smp_processor_id(), v , v1);
1162 irq_exit();
1163}
1164
1165int disable_apic;
1166
1167/*
1168 * This initializes the IO-APIC and APIC hardware if this is
1169 * a UP kernel.
1170 */
1171int __init APIC_init_uniprocessor (void)
1172{
1173 if (disable_apic) {
1174 printk(KERN_INFO "Apic disabled\n");
1175 return -1;
1176 }
1177 if (!cpu_has_apic) {
1178 disable_apic = 1;
1179 printk(KERN_INFO "Apic disabled by BIOS\n");
1180 return -1;
1181 }
1182
1183 verify_local_APIC();
1184
1185 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
1186 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
1187
1188 setup_local_APIC();
1189
1190 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1191 setup_IO_APIC();
1192 else
1193 nr_ioapics = 0;
1194 setup_boot_APIC_clock();
1195 check_nmi_watchdog();
1196 return 0;
1197}
1198
1199static __init int setup_disableapic(char *str)
1200{
1201 disable_apic = 1;
1202 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1203 return 0;
1204}
1205early_param("disableapic", setup_disableapic);
1206
1207/* same as disableapic, for compatibility */
1208static __init int setup_nolapic(char *str)
1209{
1210 return setup_disableapic(str);
1211}
1212early_param("nolapic", setup_nolapic);
1213
1214static int __init parse_lapic_timer_c2_ok(char *arg)
1215{
1216 local_apic_timer_c2_ok = 1;
1217 return 0;
1218}
1219early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1220
1221static __init int setup_noapictimer(char *str)
1222{
1223 if (str[0] != ' ' && str[0] != 0)
1224 return 0;
1225 disable_apic_timer = 1;
1226 return 1;
1227}
1228
1229static __init int setup_apicmaintimer(char *str)
1230{
1231 apic_runs_main_timer = 1;
1232 nohpet = 1;
1233 return 1;
1234}
1235__setup("apicmaintimer", setup_apicmaintimer);
1236
1237static __init int setup_noapicmaintimer(char *str)
1238{
1239 apic_runs_main_timer = -1;
1240 return 1;
1241}
1242__setup("noapicmaintimer", setup_noapicmaintimer);
1243
1244static __init int setup_apicpmtimer(char *s)
1245{
1246 apic_calibrate_pmtmr = 1;
1247 notsc_setup(NULL);
1248 return setup_apicmaintimer(NULL);
1249}
1250__setup("apicpmtimer", setup_apicpmtimer);
1251
1252__setup("noapictimer", setup_noapictimer);
1253
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
new file mode 100644
index 000000000000..32f2365c26ed
--- /dev/null
+++ b/arch/x86/kernel/apm_32.c
@@ -0,0 +1,2403 @@
1/* -*- linux-c -*-
2 * APM BIOS driver for Linux
3 * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
4 *
5 * Initial development of this driver was funded by NEC Australia P/L
6 * and NEC Corporation
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the
10 * Free Software Foundation; either version 2, or (at your option) any
11 * later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * October 1995, Rik Faith (faith@cs.unc.edu):
19 * Minor enhancements and updates (to the patch set) for 1.3.x
20 * Documentation
21 * January 1996, Rik Faith (faith@cs.unc.edu):
22 * Make /proc/apm easy to format (bump driver version)
23 * March 1996, Rik Faith (faith@cs.unc.edu):
24 * Prohibit APM BIOS calls unless apm_enabled.
25 * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>)
26 * April 1996, Stephen Rothwell (sfr@canb.auug.org.au)
27 * Version 1.0 and 1.1
28 * May 1996, Version 1.2
29 * Feb 1998, Version 1.3
30 * Feb 1998, Version 1.4
31 * Aug 1998, Version 1.5
32 * Sep 1998, Version 1.6
33 * Nov 1998, Version 1.7
34 * Jan 1999, Version 1.8
35 * Jan 1999, Version 1.9
36 * Oct 1999, Version 1.10
37 * Nov 1999, Version 1.11
38 * Jan 2000, Version 1.12
39 * Feb 2000, Version 1.13
40 * Nov 2000, Version 1.14
41 * Oct 2001, Version 1.15
42 * Jan 2002, Version 1.16
43 * Oct 2002, Version 1.16ac
44 *
45 * History:
46 * 0.6b: first version in official kernel, Linux 1.3.46
47 * 0.7: changed /proc/apm format, Linux 1.3.58
48 * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59
49 * 0.9: only call bios if bios is present, Linux 1.3.72
50 * 1.0: use fixed device number, consolidate /proc/apm into this file,
51 * Linux 1.3.85
52 * 1.1: support user-space standby and suspend, power off after system
53 * halted, Linux 1.3.98
54 * 1.2: When resetting RTC after resume, take care so that the time
55 * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth
56 * <jtoth@princeton.edu>); improve interaction between
57 * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4
58 * 1.2a: Simple change to stop mysterious bug reports with SMP; also added
59 * levels to the printk calls. APM is not defined for SMP machines.
60 * The new replacement for it is, but Linux doesn't yet support this.
61 * Alan Cox Linux 2.1.55
62 * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's
63 * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by
64 * Dean Gaudet <dgaudet@arctic.org>.
65 * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87
66 * 1.5: Fix segment register reloading (in case of bad segments saved
67 * across BIOS call).
68 * Stephen Rothwell
69 * 1.6: Cope with compiler/assembler differences.
70 * Only try to turn off the first display device.
71 * Fix OOPS at power off with no APM BIOS by Jan Echternach
72 * <echter@informatik.uni-rostock.de>
73 * Stephen Rothwell
74 * 1.7: Modify driver's cached copy of the disabled/disengaged flags
75 * to reflect current state of APM BIOS.
76 * Chris Rankin <rankinc@bellsouth.net>
77 * Reset interrupt 0 timer to 100Hz after suspend
78 * Chad Miller <cmiller@surfsouth.com>
79 * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE
80 * Richard Gooch <rgooch@atnf.csiro.au>
81 * Allow boot time disabling of APM
82 * Make boot messages far less verbose by default
83 * Make asm safer
84 * Stephen Rothwell
85 * 1.8: Add CONFIG_APM_RTC_IS_GMT
86 * Richard Gooch <rgooch@atnf.csiro.au>
87 * change APM_NOINTS to CONFIG_APM_ALLOW_INTS
88 * remove dependency on CONFIG_PROC_FS
89 * Stephen Rothwell
90 * 1.9: Fix small typo. <laslo@wodip.opole.pl>
91 * Try to cope with BIOS's that need to have all display
92 * devices blanked and not just the first one.
93 * Ross Paterson <ross@soi.city.ac.uk>
94 * Fix segment limit setting; it has always been wrong as
95 * the segments needed to have byte granularity.
96 * Mark a few things __init.
97 * Add hack to allow power off of SMP systems by popular request.
98 * Use CONFIG_SMP instead of __SMP__
99 * Ignore BOUNCES for three seconds.
100 * Stephen Rothwell
101 * 1.10: Fix for Thinkpad return code.
102 * Merge 2.2 and 2.3 drivers.
103 * Remove APM dependencies in arch/i386/kernel/process.c
104 * Remove APM dependencies in drivers/char/sysrq.c
105 * Reset time across standby.
106 * Allow more initialisation on SMP.
107 * Remove CONFIG_APM_POWER_OFF and make it boot time
108 * configurable (default on).
109 * Make debug only a boot time parameter (remove APM_DEBUG).
110 * Try to blank all devices on any error.
111 * 1.11: Remove APM dependencies in drivers/char/console.c
112 * Check nr_running to detect if we are idle (from
113 * Borislav Deianov <borislav@lix.polytechnique.fr>)
114 * Fix for bioses that don't zero the top part of the
115 * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>)
116 * (reported by Panos Katsaloulis <teras@writeme.com>).
117 * Real mode power off patch (Walter Hofmann
118 * <Walter.Hofmann@physik.stud.uni-erlangen.de>).
119 * 1.12: Remove CONFIG_SMP as the compiler will optimize
120 * the code away anyway (smp_num_cpus == 1 in UP)
121 * noted by Artur Skawina <skawina@geocities.com>.
122 * Make power off under SMP work again.
123 * Fix thinko with initial engaging of BIOS.
124 * Make sure power off only happens on CPU 0
125 * (Paul "Rusty" Russell <rusty@rustcorp.com.au>).
126 * Do error notification to user mode if BIOS calls fail.
127 * Move entrypoint offset fix to ...boot/setup.S
128 * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>).
129 * Remove smp-power-off. SMP users must now specify
130 * "apm=power-off" on the kernel command line. Suggested
131 * by Jim Avera <jima@hal.com>, modified by Alan Cox
132 * <alan@lxorguk.ukuu.org.uk>.
133 * Register the /proc/apm entry even on SMP so that
134 * scripts that check for it before doing power off
135 * work (Jim Avera <jima@hal.com>).
136 * 1.13: Changes for new pm_ interfaces (Andy Henroid
137 * <andy_henroid@yahoo.com>).
138 * Modularize the code.
139 * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS
140 * is now the way life works).
141 * Fix thinko in suspend() (wrong return).
142 * Notify drivers on critical suspend.
143 * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
144 * modified by sfr).
145 * Disable interrupts while we are suspended (Andy Henroid
146 * <andy_henroid@yahoo.com> fixed by sfr).
147 * Make power off work on SMP again (Tony Hoyle
148 * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr.
149 * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore
150 * interval is now configurable.
151 * 1.14: Make connection version persist across module unload/load.
152 * Enable and engage power management earlier.
153 * Disengage power management on module unload.
154 * Changed to use the sysrq-register hack for registering the
155 * power off function called by magic sysrq based upon discussions
156 * in irc://irc.openprojects.net/#kernelnewbies
157 * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>).
158 * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable.
159 * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr.
160 * Work around byte swap bug in one of the Vaio's BIOS's
161 * (Marc Boucher <marc@mbsi.ca>).
162 * Exposed the disable flag to dmi so that we can handle known
163 * broken APM (Alan Cox <alan@redhat.com>).
164 * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
165 * calling it - instead idle. (Alan Cox <alan@redhat.com>)
166 * If an APM idle fails, log it and idle sensibly
167 * 1.15: Don't queue events to clients who open the device O_WRONLY.
168 * Don't expect replies from clients who open the device O_RDONLY.
169 * (Idea from Thomas Hood)
170 * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>)
171 * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.)
172 * Notify listeners of standby or suspend events before notifying
173 * drivers. Return EBUSY to ioctl() if suspend is rejected.
174 * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood)
175 * Ignore first resume after we generate our own resume event
176 * after a suspend (Thomas Hood)
177 * Daemonize now gets rid of our controlling terminal (sfr).
178 * CONFIG_APM_CPU_IDLE now just affects the default value of
179 * idle_threshold (sfr).
180 * Change name of kernel apm daemon (as it no longer idles) (sfr).
181 * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we
182 * make _all_ APM calls on CPU#0. Fix unsafe sign bug.
183 * TODO: determine if it's "boot CPU" or "CPU0" we want to lock to.
184 *
185 * APM 1.1 Reference:
186 *
187 * Intel Corporation, Microsoft Corporation. Advanced Power Management
188 * (APM) BIOS Interface Specification, Revision 1.1, September 1993.
189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
190 *
191 * [This document is available free from Intel by calling 800.628.8686 (fax
192 * 916.356.6100) or 800.548.4725; or via anonymous ftp from
193 * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also
194 * available from Microsoft by calling 206.882.8080.]
195 *
196 * APM 1.2 Reference:
197 * Intel Corporation, Microsoft Corporation. Advanced Power Management
198 * (APM) BIOS Interface Specification, Revision 1.2, February 1996.
199 *
200 * [This document is available from Microsoft at:
201 * http://www.microsoft.com/whdc/archive/amp_12.mspx]
202 */
203
204#include <linux/module.h>
205
206#include <linux/poll.h>
207#include <linux/types.h>
208#include <linux/stddef.h>
209#include <linux/timer.h>
210#include <linux/fcntl.h>
211#include <linux/slab.h>
212#include <linux/stat.h>
213#include <linux/proc_fs.h>
214#include <linux/seq_file.h>
215#include <linux/miscdevice.h>
216#include <linux/apm_bios.h>
217#include <linux/init.h>
218#include <linux/time.h>
219#include <linux/sched.h>
220#include <linux/pm.h>
221#include <linux/pm_legacy.h>
222#include <linux/capability.h>
223#include <linux/device.h>
224#include <linux/kernel.h>
225#include <linux/freezer.h>
226#include <linux/smp.h>
227#include <linux/dmi.h>
228#include <linux/suspend.h>
229#include <linux/kthread.h>
230
231#include <asm/system.h>
232#include <asm/uaccess.h>
233#include <asm/desc.h>
234#include <asm/i8253.h>
235#include <asm/paravirt.h>
236#include <asm/reboot.h>
237
238#include "io_ports.h"
239
240#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
241extern int (*console_blank_hook)(int);
242#endif
243
244/*
245 * The apm_bios device is one of the misc char devices.
246 * This is its minor number.
247 */
248#define APM_MINOR_DEV 134
249
250/*
251 * See Documentation/Config.help for the configuration options.
252 *
253 * Various options can be changed at boot time as follows:
254 * (We allow underscores for compatibility with the modules code)
255 * apm=on/off enable/disable APM
256 * [no-]allow[-_]ints allow interrupts during BIOS calls
257 * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call
258 * [no-]realmode[-_]power[-_]off switch to real mode before
259 * powering off
260 * [no-]debug log some debugging messages
261 * [no-]power[-_]off power off on shutdown
262 * [no-]smp Use apm even on an SMP box
263 * bounce[-_]interval=<n> number of ticks to ignore suspend
264 * bounces
265 * idle[-_]threshold=<n> System idle percentage above which to
266 * make APM BIOS idle calls. Set it to
267 * 100 to disable.
268 * idle[-_]period=<n> Period (in 1/100s of a second) over
269 * which the idle percentage is
270 * calculated.
271 */
272
273/* KNOWN PROBLEM MACHINES:
274 *
275 * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant
276 * [Confirmed by TI representative]
277 * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification
278 * [Confirmed by BIOS disassembly]
279 * [This may work now ...]
280 * P: Toshiba 1950S: battery life information only gets updated after resume
281 * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking
282 * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>]
283 * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP
284 * Neale Banks <neale@lowendale.com.au> December 2000
285 *
286 * Legend: U = unusable with APM patches
287 * P = partially usable with APM patches
288 */
289
290/*
291 * Define as 1 to make the driver always call the APM BIOS busy
292 * routine even if the clock was not reported as slowed by the
293 * idle routine. Otherwise, define as 0.
294 */
295#define ALWAYS_CALL_BUSY 1
296
297/*
298 * Define to make the APM BIOS calls zero all data segment registers (so
299 * that an incorrect BIOS implementation will cause a kernel panic if it
300 * tries to write to arbitrary memory).
301 */
302#define APM_ZERO_SEGS
303
304#include "apm.h"
305
306/*
307 * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
309 * This was patched by Chad Miller <cmiller@surfsouth.com>, original code by
309 * David Chen <chen@ctpa04.mit.edu>
310 */
311#undef INIT_TIMER_AFTER_SUSPEND
312
313#ifdef INIT_TIMER_AFTER_SUSPEND
314#include <linux/timex.h>
315#include <asm/io.h>
316#include <linux/delay.h>
317#endif
318
319/*
320 * Need to poll the APM BIOS every second
321 */
322#define APM_CHECK_TIMEOUT (HZ)
323
324/*
325 * Ignore suspend events for this amount of time after a resume
326 */
327#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
328
329/*
330 * Maximum number of events stored
331 */
332#define APM_MAX_EVENTS 20
333
334/*
335 * The per-file APM data
336 */
337struct apm_user {
338 int magic;
339 struct apm_user * next;
340 unsigned int suser: 1;
341 unsigned int writer: 1;
342 unsigned int reader: 1;
343 unsigned int suspend_wait: 1;
344 int suspend_result;
345 int suspends_pending;
346 int standbys_pending;
347 int suspends_read;
348 int standbys_read;
349 int event_head;
350 int event_tail;
351 apm_event_t events[APM_MAX_EVENTS];
352};
353
354/*
355 * The magic number in apm_user
356 */
357#define APM_BIOS_MAGIC 0x4101
358
359/*
360 * idle percentage above which bios idle calls are done
361 */
362#ifdef CONFIG_APM_CPU_IDLE
363#define DEFAULT_IDLE_THRESHOLD 95
364#else
365#define DEFAULT_IDLE_THRESHOLD 100
366#endif
367#define DEFAULT_IDLE_PERIOD (100 / 3)
368
369/*
370 * Local variables
371 */
372static struct {
373 unsigned long offset;
374 unsigned short segment;
375} apm_bios_entry;
376static int clock_slowed;
377static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
378static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
379static int set_pm_idle;
380static int suspends_pending;
381static int standbys_pending;
382static int ignore_sys_suspend;
383static int ignore_normal_resume;
384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
385
386static int debug __read_mostly;
387static int smp __read_mostly;
388static int apm_disabled = -1;
389#ifdef CONFIG_SMP
390static int power_off;
391#else
392static int power_off = 1;
393#endif
394#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
395static int realmode_power_off = 1;
396#else
397static int realmode_power_off;
398#endif
399#ifdef CONFIG_APM_ALLOW_INTS
400static int allow_ints = 1;
401#else
402static int allow_ints;
403#endif
404static int broken_psr;
405
406static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
407static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
408static struct apm_user * user_list;
409static DEFINE_SPINLOCK(user_list_lock);
410static const struct desc_struct bad_bios_desc = { 0, 0x00409200 };
411
412static const char driver_version[] = "1.16ac"; /* no spaces */
413
414static struct task_struct *kapmd_task;
415
416/*
417 * APM event names taken from the APM 1.2 specification. These are
418 * the message codes that the BIOS uses to tell us about events
419 */
420static const char * const apm_event_name[] = {
421 "system standby",
422 "system suspend",
423 "normal resume",
424 "critical resume",
425 "low battery",
426 "power status change",
427 "update time",
428 "critical suspend",
429 "user standby",
430 "user suspend",
431 "system standby resume",
432 "capabilities change"
433};
434#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name)
435
436typedef struct lookup_t {
437 int key;
438 char * msg;
439} lookup_t;
440
441/*
442 * The BIOS returns a set of standard error codes in AX when the
443 * carry flag is set.
444 */
445
446static const lookup_t error_table[] = {
447/* N/A { APM_SUCCESS, "Operation succeeded" }, */
448 { APM_DISABLED, "Power management disabled" },
449 { APM_CONNECTED, "Real mode interface already connected" },
450 { APM_NOT_CONNECTED, "Interface not connected" },
451 { APM_16_CONNECTED, "16 bit interface already connected" },
452/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */
453 { APM_32_CONNECTED, "32 bit interface already connected" },
454 { APM_32_UNSUPPORTED, "32 bit interface not supported" },
455 { APM_BAD_DEVICE, "Unrecognized device ID" },
456 { APM_BAD_PARAM, "Parameter out of range" },
457 { APM_NOT_ENGAGED, "Interface not engaged" },
458 { APM_BAD_FUNCTION, "Function not supported" },
459 { APM_RESUME_DISABLED, "Resume timer disabled" },
460 { APM_BAD_STATE, "Unable to enter requested state" },
461/* N/A { APM_NO_EVENTS, "No events pending" }, */
462 { APM_NO_ERROR, "BIOS did not set a return code" },
463 { APM_NOT_PRESENT, "No APM present" }
464};
465#define ERROR_COUNT ARRAY_SIZE(error_table)
466
467/**
468 * apm_error - display an APM error
469 * @str: information string
470 * @err: APM BIOS return code
471 *
472 * Write a meaningful log entry to the kernel log in the event of
473 * an APM error.
474 */
475
476static void apm_error(char *str, int err)
477{
478 int i;
479
480 for (i = 0; i < ERROR_COUNT; i++)
481 if (error_table[i].key == err) break;
482 if (i < ERROR_COUNT)
483 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
484 else
485 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
486 str, err);
487}
488
489/*
490 * Lock APM functionality to physical CPU 0
491 */
492
493#ifdef CONFIG_SMP
494
495static cpumask_t apm_save_cpus(void)
496{
497 cpumask_t x = current->cpus_allowed;
498 /* Some bioses don't like being called from CPU != 0 */
499 set_cpus_allowed(current, cpumask_of_cpu(0));
500 BUG_ON(smp_processor_id() != 0);
501 return x;
502}
503
504static inline void apm_restore_cpus(cpumask_t mask)
505{
506 set_cpus_allowed(current, mask);
507}
508
509#else
510
511/*
512 * No CPU lockdown needed on a uniprocessor
513 */
514
515#define apm_save_cpus() (current->cpus_allowed)
516#define apm_restore_cpus(x) (void)(x)
517
518#endif
519
520/*
521 * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and
522 * apm_info.allow_ints, we are being really paranoid here! Not only
523 * are interrupts disabled, but all the segment registers (except SS)
524 * are saved and zeroed; this means that if the BIOS tries to reference
525 * any data without explicitly loading the segment registers, the kernel
526 * will fault immediately rather than have some unforeseen circumstances
527 * for the rest of the kernel. And it will be very obvious! :-) Doing
528 * this depends on CS referring to the same physical memory as DS so that
529 * DS can be zeroed before the call. Unfortunately, we can't do anything
530 * about the stack segment/pointer. Also, we tell the compiler that
531 * everything could change.
532 *
533 * Also, we KNOW that for the non error case of apm_bios_call, there
534 * is no useful data returned in the low order 8 bits of eax.
535 */
536
537static inline unsigned long __apm_irq_save(void)
538{
539 unsigned long flags;
540 local_save_flags(flags);
541 if (apm_info.allow_ints) {
542 if (irqs_disabled_flags(flags))
543 local_irq_enable();
544 } else
545 local_irq_disable();
546
547 return flags;
548}
549
550#define apm_irq_save(flags) \
551 do { flags = __apm_irq_save(); } while (0)
552
553static inline void apm_irq_restore(unsigned long flags)
554{
555 if (irqs_disabled_flags(flags))
556 local_irq_disable();
557 else if (irqs_disabled())
558 local_irq_enable();
559}
560
561#ifdef APM_ZERO_SEGS
562# define APM_DECL_SEGS \
563 unsigned int saved_fs; unsigned int saved_gs;
564# define APM_DO_SAVE_SEGS \
565 savesegment(fs, saved_fs); savesegment(gs, saved_gs)
566# define APM_DO_RESTORE_SEGS \
567 loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
568#else
569# define APM_DECL_SEGS
570# define APM_DO_SAVE_SEGS
571# define APM_DO_RESTORE_SEGS
572#endif
573
574/**
575 * apm_bios_call - Make an APM BIOS 32bit call
576 * @func: APM function to execute
577 * @ebx_in: EBX register for call entry
578 * @ecx_in: ECX register for call entry
579 * @eax: EAX register return
580 * @ebx: EBX register return
581 * @ecx: ECX register return
582 * @edx: EDX register return
583 * @esi: ESI register return
584 *
585 * Make an APM call using the 32bit protected mode interface. The
586 * caller is responsible for knowing if APM BIOS is configured and
587 * enabled. This call can disable interrupts for a long period of
588 * time on some laptops. The return value is in AH and the carry
589 * flag is loaded into AL. If there is an error, then the error
590 * code is returned in AH (bits 8-15 of eax) and this function
591 * returns non-zero.
592 */
593
594static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
595 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
596{
597 APM_DECL_SEGS
598 unsigned long flags;
599 cpumask_t cpus;
600 int cpu;
601 struct desc_struct save_desc_40;
602 struct desc_struct *gdt;
603
604 cpus = apm_save_cpus();
605
606 cpu = get_cpu();
607 gdt = get_cpu_gdt_table(cpu);
608 save_desc_40 = gdt[0x40 / 8];
609 gdt[0x40 / 8] = bad_bios_desc;
610
611 apm_irq_save(flags);
612 APM_DO_SAVE_SEGS;
613 apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
614 APM_DO_RESTORE_SEGS;
615 apm_irq_restore(flags);
616 gdt[0x40 / 8] = save_desc_40;
617 put_cpu();
618 apm_restore_cpus(cpus);
619
620 return *eax & 0xff;
621}
622
623/**
624 * apm_bios_call_simple - make a simple APM BIOS 32bit call
625 * @func: APM function to invoke
626 * @ebx_in: EBX register value for BIOS call
627 * @ecx_in: ECX register value for BIOS call
628 * @eax: EAX register on return from the BIOS call
629 *
630 * Make a BIOS call that returns one value only, or just status.
631 * If there is an error, then the error code is returned in AH
632 * (bits 8-15 of eax) and this function returns non-zero. This is
633 * used for simpler BIOS operations. This call may hold interrupts
634 * off for a long time on some laptops.
635 */
636
637static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
638{
639 u8 error;
640 APM_DECL_SEGS
641 unsigned long flags;
642 cpumask_t cpus;
643 int cpu;
644 struct desc_struct save_desc_40;
645 struct desc_struct *gdt;
646
647 cpus = apm_save_cpus();
648
649 cpu = get_cpu();
650 gdt = get_cpu_gdt_table(cpu);
651 save_desc_40 = gdt[0x40 / 8];
652 gdt[0x40 / 8] = bad_bios_desc;
653
654 apm_irq_save(flags);
655 APM_DO_SAVE_SEGS;
656 error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
657 APM_DO_RESTORE_SEGS;
658 apm_irq_restore(flags);
659 gdt[0x40 / 8] = save_desc_40;
660 put_cpu();
661 apm_restore_cpus(cpus);
662 return error;
663}
664
665/**
666 * apm_driver_version - APM driver version
667 * @val: loaded with the APM version on return
668 *
669 * Retrieve the APM version supported by the BIOS. This is only
670 * supported for APM 1.1 or higher. An error indicates APM 1.0 is
671 * probably present.
672 *
673 * On entry val should point to a value indicating the APM driver
674 * version with the high byte being the major and the low byte the
675 * minor number, both in BCD.
676 *
677 * On return it will hold the BIOS revision supported in the
678 * same format.
679 */
680
681static int apm_driver_version(u_short *val)
682{
683 u32 eax;
684
685 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
686 return (eax >> 8) & 0xff;
687 *val = eax;
688 return APM_SUCCESS;
689}
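
A hedged illustration of the BCD encoding described in the comment above (standalone, not driver code): the high byte carries the major number and the low byte the minor number, each as two BCD digits, so 0x0102 decodes to 1.2.

#include <stdio.h>

/* Decode an APM version word whose bytes hold BCD major/minor numbers.
 * 0x0102 is used as an assumed example and decodes to "1.2". */
static void print_apm_version(unsigned short val)
{
	unsigned int major = ((val >> 12) & 0xf) * 10 + ((val >> 8) & 0xf);
	unsigned int minor = ((val >> 4) & 0xf) * 10 + (val & 0xf);

	printf("%u.%u\n", major, minor);
}

int main(void)
{
	print_apm_version(0x0102);	/* prints "1.2" */
	return 0;
}
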
690
691/**
692 * apm_get_event - get an APM event from the BIOS
693 * @event: pointer to the event
694 * @info: point to the event information
695 *
696 * The APM BIOS provides a polled interface for event
697 * reporting. The BIOS expects to be polled at least every second
698 * when events are pending. When a message is found the caller should
699 * poll until no more messages are present. However, this causes
700 * problems on some laptops where a suspend event notification is
701 * not cleared until it is acknowledged.
702 *
703 * Additional information is returned in the info pointer, provided
704 * that APM 1.2 is in use. If no messages are pending the value 0x80
705 * is returned (No power management events pending).
706 */
707
708static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
709{
710 u32 eax;
711 u32 ebx;
712 u32 ecx;
713 u32 dummy;
714
715 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
716 &dummy, &dummy))
717 return (eax >> 8) & 0xff;
718 *event = ebx;
719 if (apm_info.connection_version < 0x0102)
720 *info = ~0; /* indicate info not valid */
721 else
722 *info = ecx;
723 return APM_SUCCESS;
724}
725
726/**
727 * set_power_state - set the power management state
728 * @what: which items to transition
729 * @state: state to transition to
730 *
731 * Request an APM change of state for one or more system devices. The
732 * processor state must be transitioned last of all. what holds the
733 * class of device in the upper byte and the device number (0xFF for
734 * all) for the object to be transitioned.
735 *
736 * The state holds the state to transition to, which may in fact
737 * be an acceptance of a BIOS requested state change.
738 */
739
740static int set_power_state(u_short what, u_short state)
741{
742 u32 eax;
743
744 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
745 return (eax >> 8) & 0xff;
746 return APM_SUCCESS;
747}
748
749/**
750 * set_system_power_state - set system wide power state
751 * @state: which state to enter
752 *
753 * Transition the entire system into a new APM power state.
754 */
755
756static int set_system_power_state(u_short state)
757{
758 return set_power_state(APM_DEVICE_ALL, state);
759}
760
761/**
762 * apm_do_idle - perform power saving
763 *
764 * This function notifies the BIOS that the processor is (in the view
765 * of the OS) idle. It returns -1 in the event that the BIOS refuses
766 * to handle the idle request. On a success the function returns 1
767 * if the BIOS did clock slowing or 0 otherwise.
768 */
769
770static int apm_do_idle(void)
771{
772 u32 eax;
773 u8 ret = 0;
774 int idled = 0;
775 int polling;
776
777 polling = !!(current_thread_info()->status & TS_POLLING);
778 if (polling) {
779 current_thread_info()->status &= ~TS_POLLING;
780 /*
781 * TS_POLLING-cleared state must be visible before we
782 * test NEED_RESCHED:
783 */
784 smp_mb();
785 }
786 if (!need_resched()) {
787 idled = 1;
788 ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
789 }
790 if (polling)
791 current_thread_info()->status |= TS_POLLING;
792
793 if (!idled)
794 return 0;
795
796 if (ret) {
797 static unsigned long t;
798
799 /* This always fails on some SMP boards running UP kernels.
800 * Only report the failure the first 5 times.
801 */
802 if (++t < 5)
803 {
804 printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
805 (eax >> 8) & 0xff);
806 t = jiffies;
807 }
808 return -1;
809 }
810 clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0;
811 return clock_slowed;
812}
813
814/**
815 * apm_do_busy - inform the BIOS the CPU is busy
816 *
817 * Request that the BIOS bring the CPU back to full performance.
818 */
819
820static void apm_do_busy(void)
821{
822 u32 dummy;
823
824 if (clock_slowed || ALWAYS_CALL_BUSY) {
825 (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
826 clock_slowed = 0;
827 }
828}
829
830/*
831 * If no process has really been interested in
832 * the CPU for some time, we want to call BIOS
833 * power management - we probably want
834 * to conserve power.
835 */
836#define IDLE_CALC_LIMIT (HZ * 100)
837#define IDLE_LEAKY_MAX 16
838
839static void (*original_pm_idle)(void) __read_mostly;
840
841/**
842 * apm_cpu_idle - cpu idling for APM capable Linux
843 *
844 * This is the idling function the kernel executes when APM is available. It
845 * tries to do BIOS power management based on the average system idle time.
846 * Furthermore it calls the system default idle routine.
847 */
848
849static void apm_cpu_idle(void)
850{
851 static int use_apm_idle; /* = 0 */
852 static unsigned int last_jiffies; /* = 0 */
853 static unsigned int last_stime; /* = 0 */
854
855 int apm_idle_done = 0;
856 unsigned int jiffies_since_last_check = jiffies - last_jiffies;
857 unsigned int bucket;
858
859recalc:
860 if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
861 use_apm_idle = 0;
862 last_jiffies = jiffies;
863 last_stime = current->stime;
864 } else if (jiffies_since_last_check > idle_period) {
865 unsigned int idle_percentage;
866
867 idle_percentage = current->stime - last_stime;
868 idle_percentage *= 100;
869 idle_percentage /= jiffies_since_last_check;
870 use_apm_idle = (idle_percentage > idle_threshold);
871 if (apm_info.forbid_idle)
872 use_apm_idle = 0;
873 last_jiffies = jiffies;
874 last_stime = current->stime;
875 }
876
877 bucket = IDLE_LEAKY_MAX;
878
879 while (!need_resched()) {
880 if (use_apm_idle) {
881 unsigned int t;
882
883 t = jiffies;
884 switch (apm_do_idle()) {
885 case 0: apm_idle_done = 1;
886 if (t != jiffies) {
887 if (bucket) {
888 bucket = IDLE_LEAKY_MAX;
889 continue;
890 }
891 } else if (bucket) {
892 bucket--;
893 continue;
894 }
895 break;
896 case 1: apm_idle_done = 1;
897 break;
898 default: /* BIOS refused */
899 break;
900 }
901 }
902 if (original_pm_idle)
903 original_pm_idle();
904 else
905 default_idle();
906 jiffies_since_last_check = jiffies - last_jiffies;
907 if (jiffies_since_last_check > idle_period)
908 goto recalc;
909 }
910
911 if (apm_idle_done)
912 apm_do_busy();
913}
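
A worked example of the recalculation above, with assumed numbers rather than real measurements: 39 ticks of idle stime over 40 elapsed jiffies gives 97%, which exceeds the default CONFIG_APM_CPU_IDLE threshold of 95, so BIOS idle calls would be enabled for the next period.

#include <stdio.h>

/* Idle-percentage test from apm_cpu_idle(), with assumed sample values:
 * 39 ticks of idle stime accumulated over 40 elapsed jiffies, checked
 * against the default threshold of 95. */
int main(void)
{
	unsigned int jiffies_since_last_check = 40;	/* assumed */
	unsigned int stime_delta = 39;			/* assumed */
	unsigned int idle_threshold = 95;		/* DEFAULT_IDLE_THRESHOLD */
	unsigned int idle_percentage = stime_delta * 100 / jiffies_since_last_check;

	printf("idle = %u%%, use_apm_idle = %d\n",
	       idle_percentage, idle_percentage > idle_threshold);
	/* prints "idle = 97%, use_apm_idle = 1" */
	return 0;
}
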
914
915/**
916 * apm_power_off - ask the BIOS to power off
917 *
918 * Handle the power off sequence. This is the one piece of code we
919 * will execute even on SMP machines. In order to deal with BIOS
920 * bugs we support real mode APM BIOS power off calls. We also make
921 * the SMP call on CPU0 as some systems will only honour this call
922 * on their first cpu.
923 */
924
925static void apm_power_off(void)
926{
927 unsigned char po_bios_call[] = {
928 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
929 0x8e, 0xd0, /* movw ax,ss */
930 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
931 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
932 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
933 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
934 0xcd, 0x15 /* int $0x15 */
935 };
936
937 /* Some bioses don't like being called from CPU != 0 */
938 if (apm_info.realmode_power_off)
939 {
940 (void)apm_save_cpus();
941 machine_real_restart(po_bios_call, sizeof(po_bios_call));
942 }
943 else
944 (void) set_system_power_state(APM_STATE_OFF);
945}
946
947#ifdef CONFIG_APM_DO_ENABLE
948
949/**
950 * apm_enable_power_management - enable BIOS APM power management
951 * @enable: enable yes/no
952 *
953 * Enable or disable the APM BIOS power services.
954 */
955
956static int apm_enable_power_management(int enable)
957{
958 u32 eax;
959
960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
961 return APM_NOT_ENGAGED;
962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
963 enable, &eax))
964 return (eax >> 8) & 0xff;
965 if (enable)
966 apm_info.bios.flags &= ~APM_BIOS_DISABLED;
967 else
968 apm_info.bios.flags |= APM_BIOS_DISABLED;
969 return APM_SUCCESS;
970}
971#endif
972
973/**
974 * apm_get_power_status - get current power state
975 * @status: returned status
976 * @bat: battery info
977 * @life: estimated life
978 *
979 * Obtain the current power status from the APM BIOS. We return a
980 * status which gives the rough battery status, and current power
981 * source. The bat value returned gives an estimate as a percentage
982 * of life and a status value for the battery. The estimated life,
983 * if reported, is a lifetime in seconds/minutes at current power
984 * consumption.
985 */
986
987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
988{
989 u32 eax;
990 u32 ebx;
991 u32 ecx;
992 u32 edx;
993 u32 dummy;
994
995 if (apm_info.get_power_status_broken)
996 return APM_32_UNSUPPORTED;
997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
998 &eax, &ebx, &ecx, &edx, &dummy))
999 return (eax >> 8) & 0xff;
1000 *status = ebx;
1001 *bat = ecx;
1002 if (apm_info.get_power_status_swabinminutes) {
1003 *life = swab16((u16)edx);
1004 *life |= 0x8000;
1005 } else
1006 *life = edx;
1007 return APM_SUCCESS;
1008}
1009
1010#if 0
1011static int apm_get_battery_status(u_short which, u_short *status,
1012 u_short *bat, u_short *life, u_short *nbat)
1013{
1014 u32 eax;
1015 u32 ebx;
1016 u32 ecx;
1017 u32 edx;
1018 u32 esi;
1019
1020 if (apm_info.connection_version < 0x0102) {
1021 /* pretend we only have one battery. */
1022 if (which != 1)
1023 return APM_BAD_DEVICE;
1024 *nbat = 1;
1025 return apm_get_power_status(status, bat, life);
1026 }
1027
1028 if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
1029 &ebx, &ecx, &edx, &esi))
1030 return (eax >> 8) & 0xff;
1031 *status = ebx;
1032 *bat = ecx;
1033 *life = edx;
1034 *nbat = esi;
1035 return APM_SUCCESS;
1036}
1037#endif
1038
1039/**
1040 * apm_engage_power_management - enable PM on a device
1041 * @device: identity of device
1042 * @enable: on/off
1043 *
1044 * Activate or deactivate power management on either a specific device
1045 * or the entire system (%APM_DEVICE_ALL).
1046 */
1047
1048static int apm_engage_power_management(u_short device, int enable)
1049{
1050 u32 eax;
1051
1052 if ((enable == 0) && (device == APM_DEVICE_ALL)
1053 && (apm_info.bios.flags & APM_BIOS_DISABLED))
1054 return APM_DISABLED;
1055 if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax))
1056 return (eax >> 8) & 0xff;
1057 if (device == APM_DEVICE_ALL) {
1058 if (enable)
1059 apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
1060 else
1061 apm_info.bios.flags |= APM_BIOS_DISENGAGED;
1062 }
1063 return APM_SUCCESS;
1064}
1065
1066#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1067
1068/**
1069 * apm_console_blank - blank the display
1070 * @blank: on/off
1071 *
1072 * Attempt to blank the console, firstly by blanking just video device
1073 * zero, and if that fails (some BIOSes don't support it) then it blanks
1074 * all video devices. Typically the BIOS will do laptop backlight and
1075 * monitor powerdown for us.
1076 */
1077
1078static int apm_console_blank(int blank)
1079{
1080 int error = APM_NOT_ENGAGED; /* silence gcc */
1081 int i;
1082 u_short state;
1083 static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
1084
1085 state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
1086
1087 for (i = 0; i < ARRAY_SIZE(dev); i++) {
1088 error = set_power_state(dev[i], state);
1089
1090 if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
1091 return 1;
1092
1093 if (error == APM_NOT_ENGAGED)
1094 break;
1095 }
1096
1097 if (error == APM_NOT_ENGAGED) {
1098 static int tried;
1099 int eng_error;
1100 if (tried++ == 0) {
1101 eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
1102 if (eng_error) {
1103 apm_error("set display", error);
1104 apm_error("engage interface", eng_error);
1105 return 0;
1106 } else
1107 return apm_console_blank(blank);
1108 }
1109 }
1110 apm_error("set display", error);
1111 return 0;
1112}
1113#endif
1114
1115static int queue_empty(struct apm_user *as)
1116{
1117 return as->event_head == as->event_tail;
1118}
1119
1120static apm_event_t get_queued_event(struct apm_user *as)
1121{
1122 if (++as->event_tail >= APM_MAX_EVENTS)
1123 as->event_tail = 0;
1124 return as->events[as->event_tail];
1125}
1126
1127static void queue_event(apm_event_t event, struct apm_user *sender)
1128{
1129 struct apm_user * as;
1130
1131 spin_lock(&user_list_lock);
1132 if (user_list == NULL)
1133 goto out;
1134 for (as = user_list; as != NULL; as = as->next) {
1135 if ((as == sender) || (!as->reader))
1136 continue;
1137 if (++as->event_head >= APM_MAX_EVENTS)
1138 as->event_head = 0;
1139
1140 if (as->event_head == as->event_tail) {
1141 static int notified;
1142
1143 if (notified++ == 0)
1144 printk(KERN_ERR "apm: an event queue overflowed\n");
1145 if (++as->event_tail >= APM_MAX_EVENTS)
1146 as->event_tail = 0;
1147 }
1148 as->events[as->event_head] = event;
1149 if ((!as->suser) || (!as->writer))
1150 continue;
1151 switch (event) {
1152 case APM_SYS_SUSPEND:
1153 case APM_USER_SUSPEND:
1154 as->suspends_pending++;
1155 suspends_pending++;
1156 break;
1157
1158 case APM_SYS_STANDBY:
1159 case APM_USER_STANDBY:
1160 as->standbys_pending++;
1161 standbys_pending++;
1162 break;
1163 }
1164 }
1165 wake_up_interruptible(&apm_waitqueue);
1166out:
1167 spin_unlock(&user_list_lock);
1168}
1169
1170static void reinit_timer(void)
1171{
1172#ifdef INIT_TIMER_AFTER_SUSPEND
1173 unsigned long flags;
1174
1175 spin_lock_irqsave(&i8253_lock, flags);
1176 /* set the clock to HZ */
1177 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1178 udelay(10);
1179 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
1180 udelay(10);
1181 outb(LATCH >> 8, PIT_CH0); /* MSB */
1182 udelay(10);
1183 spin_unlock_irqrestore(&i8253_lock, flags);
1184#endif
1185}
1186
1187static int suspend(int vetoable)
1188{
1189 int err;
1190 struct apm_user *as;
1191
1192 if (pm_send_all(PM_SUSPEND, (void *)3)) {
1193 /* Vetoed */
1194 if (vetoable) {
1195 if (apm_info.connection_version > 0x100)
1196 set_system_power_state(APM_STATE_REJECT);
1197 err = -EBUSY;
1198 ignore_sys_suspend = 0;
1199 printk(KERN_WARNING "apm: suspend was vetoed.\n");
1200 goto out;
1201 }
1202 printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
1203 }
1204
1205 device_suspend(PMSG_SUSPEND);
1206 local_irq_disable();
1207 device_power_down(PMSG_SUSPEND);
1208
1209 local_irq_enable();
1210
1211 save_processor_state();
1212 err = set_system_power_state(APM_STATE_SUSPEND);
1213 ignore_normal_resume = 1;
1214 restore_processor_state();
1215
1216 local_irq_disable();
1217 reinit_timer();
1218
1219 if (err == APM_NO_ERROR)
1220 err = APM_SUCCESS;
1221 if (err != APM_SUCCESS)
1222 apm_error("suspend", err);
1223 err = (err == APM_SUCCESS) ? 0 : -EIO;
1224 device_power_up();
1225 local_irq_enable();
1226 device_resume();
1227 pm_send_all(PM_RESUME, (void *)0);
1228 queue_event(APM_NORMAL_RESUME, NULL);
1229 out:
1230 spin_lock(&user_list_lock);
1231 for (as = user_list; as != NULL; as = as->next) {
1232 as->suspend_wait = 0;
1233 as->suspend_result = err;
1234 }
1235 spin_unlock(&user_list_lock);
1236 wake_up_interruptible(&apm_suspend_waitqueue);
1237 return err;
1238}
1239
1240static void standby(void)
1241{
1242 int err;
1243
1244 local_irq_disable();
1245 device_power_down(PMSG_SUSPEND);
1246 local_irq_enable();
1247
1248 err = set_system_power_state(APM_STATE_STANDBY);
1249 if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
1250 apm_error("standby", err);
1251
1252 local_irq_disable();
1253 device_power_up();
1254 local_irq_enable();
1255}
1256
1257static apm_event_t get_event(void)
1258{
1259 int error;
1260 apm_event_t event = APM_NO_EVENTS; /* silence gcc */
1261 apm_eventinfo_t info;
1262
1263 static int notified;
1264
1265 /* we don't use the eventinfo */
1266 error = apm_get_event(&event, &info);
1267 if (error == APM_SUCCESS)
1268 return event;
1269
1270 if ((error != APM_NO_EVENTS) && (notified++ == 0))
1271 apm_error("get_event", error);
1272
1273 return 0;
1274}
1275
1276static void check_events(void)
1277{
1278 apm_event_t event;
1279 static unsigned long last_resume;
1280 static int ignore_bounce;
1281
1282 while ((event = get_event()) != 0) {
1283 if (debug) {
1284 if (event <= NR_APM_EVENT_NAME)
1285 printk(KERN_DEBUG "apm: received %s notify\n",
1286 apm_event_name[event - 1]);
1287 else
1288 printk(KERN_DEBUG "apm: received unknown "
1289 "event 0x%02x\n", event);
1290 }
1291 if (ignore_bounce
1292 && ((jiffies - last_resume) > bounce_interval))
1293 ignore_bounce = 0;
1294
1295 switch (event) {
1296 case APM_SYS_STANDBY:
1297 case APM_USER_STANDBY:
1298 queue_event(event, NULL);
1299 if (standbys_pending <= 0)
1300 standby();
1301 break;
1302
1303 case APM_USER_SUSPEND:
1304#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
1305 if (apm_info.connection_version > 0x100)
1306 set_system_power_state(APM_STATE_REJECT);
1307 break;
1308#endif
1309 case APM_SYS_SUSPEND:
1310 if (ignore_bounce) {
1311 if (apm_info.connection_version > 0x100)
1312 set_system_power_state(APM_STATE_REJECT);
1313 break;
1314 }
1315 /*
1316 * If we are already processing a SUSPEND,
1317 * then further SUSPEND events from the BIOS
1318 * will be ignored. We also return here to
1319 * cope with the fact that the Thinkpads keep
1320 * sending a SUSPEND event until something else
1321 * happens!
1322 */
1323 if (ignore_sys_suspend)
1324 return;
1325 ignore_sys_suspend = 1;
1326 queue_event(event, NULL);
1327 if (suspends_pending <= 0)
1328 (void) suspend(1);
1329 break;
1330
1331 case APM_NORMAL_RESUME:
1332 case APM_CRITICAL_RESUME:
1333 case APM_STANDBY_RESUME:
1334 ignore_sys_suspend = 0;
1335 last_resume = jiffies;
1336 ignore_bounce = 1;
1337 if ((event != APM_NORMAL_RESUME)
1338 || (ignore_normal_resume == 0)) {
1339 device_resume();
1340 pm_send_all(PM_RESUME, (void *)0);
1341 queue_event(event, NULL);
1342 }
1343 ignore_normal_resume = 0;
1344 break;
1345
1346 case APM_CAPABILITY_CHANGE:
1347 case APM_LOW_BATTERY:
1348 case APM_POWER_STATUS_CHANGE:
1349 queue_event(event, NULL);
1350 /* If needed, notify drivers here */
1351 break;
1352
1353 case APM_UPDATE_TIME:
1354 break;
1355
1356 case APM_CRITICAL_SUSPEND:
1357 /*
1358 * We are not allowed to reject a critical suspend.
1359 */
1360 (void) suspend(0);
1361 break;
1362 }
1363 }
1364}
1365
1366static void apm_event_handler(void)
1367{
1368 static int pending_count = 4;
1369 int err;
1370
1371 if ((standbys_pending > 0) || (suspends_pending > 0)) {
1372 if ((apm_info.connection_version > 0x100) &&
1373 (pending_count-- <= 0)) {
1374 pending_count = 4;
1375 if (debug)
1376 printk(KERN_DEBUG "apm: setting state busy\n");
1377 err = set_system_power_state(APM_STATE_BUSY);
1378 if (err)
1379 apm_error("busy", err);
1380 }
1381 } else
1382 pending_count = 4;
1383 check_events();
1384}
1385
1386/*
1387 * This is the APM thread main loop.
1388 */
1389
1390static void apm_mainloop(void)
1391{
1392 DECLARE_WAITQUEUE(wait, current);
1393
1394 add_wait_queue(&apm_waitqueue, &wait);
1395 set_current_state(TASK_INTERRUPTIBLE);
1396 for (;;) {
1397 schedule_timeout(APM_CHECK_TIMEOUT);
1398 if (kthread_should_stop())
1399 break;
1400 /*
1401 * Ok, check all events, check for idle (and mark us sleeping
1402 * so as not to count towards the load average)..
1403 */
1404 set_current_state(TASK_INTERRUPTIBLE);
1405 apm_event_handler();
1406 }
1407 remove_wait_queue(&apm_waitqueue, &wait);
1408}
1409
1410static int check_apm_user(struct apm_user *as, const char *func)
1411{
1412 if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
1413 printk(KERN_ERR "apm: %s passed bad filp\n", func);
1414 return 1;
1415 }
1416 return 0;
1417}
1418
1419static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
1420{
1421 struct apm_user * as;
1422 int i;
1423 apm_event_t event;
1424
1425 as = fp->private_data;
1426 if (check_apm_user(as, "read"))
1427 return -EIO;
1428 if ((int)count < sizeof(apm_event_t))
1429 return -EINVAL;
1430 if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK))
1431 return -EAGAIN;
1432 wait_event_interruptible(apm_waitqueue, !queue_empty(as));
1433 i = count;
1434 while ((i >= sizeof(event)) && !queue_empty(as)) {
1435 event = get_queued_event(as);
1436 if (copy_to_user(buf, &event, sizeof(event))) {
1437 if (i < count)
1438 break;
1439 return -EFAULT;
1440 }
1441 switch (event) {
1442 case APM_SYS_SUSPEND:
1443 case APM_USER_SUSPEND:
1444 as->suspends_read++;
1445 break;
1446
1447 case APM_SYS_STANDBY:
1448 case APM_USER_STANDBY:
1449 as->standbys_read++;
1450 break;
1451 }
1452 buf += sizeof(event);
1453 i -= sizeof(event);
1454 }
1455 if (i < count)
1456 return count - i;
1457 if (signal_pending(current))
1458 return -ERESTARTSYS;
1459 return 0;
1460}
1461
1462static unsigned int do_poll(struct file *fp, poll_table * wait)
1463{
1464 struct apm_user * as;
1465
1466 as = fp->private_data;
1467 if (check_apm_user(as, "poll"))
1468 return 0;
1469 poll_wait(fp, &apm_waitqueue, wait);
1470 if (!queue_empty(as))
1471 return POLLIN | POLLRDNORM;
1472 return 0;
1473}
1474
1475static int do_ioctl(struct inode * inode, struct file *filp,
1476 u_int cmd, u_long arg)
1477{
1478 struct apm_user * as;
1479
1480 as = filp->private_data;
1481 if (check_apm_user(as, "ioctl"))
1482 return -EIO;
1483 if ((!as->suser) || (!as->writer))
1484 return -EPERM;
1485 switch (cmd) {
1486 case APM_IOC_STANDBY:
1487 if (as->standbys_read > 0) {
1488 as->standbys_read--;
1489 as->standbys_pending--;
1490 standbys_pending--;
1491 } else
1492 queue_event(APM_USER_STANDBY, as);
1493 if (standbys_pending <= 0)
1494 standby();
1495 break;
1496 case APM_IOC_SUSPEND:
1497 if (as->suspends_read > 0) {
1498 as->suspends_read--;
1499 as->suspends_pending--;
1500 suspends_pending--;
1501 } else
1502 queue_event(APM_USER_SUSPEND, as);
1503 if (suspends_pending <= 0) {
1504 return suspend(1);
1505 } else {
1506 as->suspend_wait = 1;
1507 wait_event_interruptible(apm_suspend_waitqueue,
1508 as->suspend_wait == 0);
1509 return as->suspend_result;
1510 }
1511 break;
1512 default:
1513 return -EINVAL;
1514 }
1515 return 0;
1516}
1517
1518static int do_release(struct inode * inode, struct file * filp)
1519{
1520 struct apm_user * as;
1521
1522 as = filp->private_data;
1523 if (check_apm_user(as, "release"))
1524 return 0;
1525 filp->private_data = NULL;
1526 if (as->standbys_pending > 0) {
1527 standbys_pending -= as->standbys_pending;
1528 if (standbys_pending <= 0)
1529 standby();
1530 }
1531 if (as->suspends_pending > 0) {
1532 suspends_pending -= as->suspends_pending;
1533 if (suspends_pending <= 0)
1534 (void) suspend(1);
1535 }
1536 spin_lock(&user_list_lock);
1537 if (user_list == as)
1538 user_list = as->next;
1539 else {
1540 struct apm_user * as1;
1541
1542 for (as1 = user_list;
1543 (as1 != NULL) && (as1->next != as);
1544 as1 = as1->next)
1545 ;
1546 if (as1 == NULL)
1547 printk(KERN_ERR "apm: filp not in user list\n");
1548 else
1549 as1->next = as->next;
1550 }
1551 spin_unlock(&user_list_lock);
1552 kfree(as);
1553 return 0;
1554}
1555
1556static int do_open(struct inode * inode, struct file * filp)
1557{
1558 struct apm_user * as;
1559
1560 as = kmalloc(sizeof(*as), GFP_KERNEL);
1561 if (as == NULL) {
1562 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1563 sizeof(*as));
1564 return -ENOMEM;
1565 }
1566 as->magic = APM_BIOS_MAGIC;
1567 as->event_tail = as->event_head = 0;
1568 as->suspends_pending = as->standbys_pending = 0;
1569 as->suspends_read = as->standbys_read = 0;
1570 /*
1571 * XXX - this is a tiny bit broken, when we consider BSD
1572 * process accounting. If the device is opened by root, we
1573 * instantly flag that we used superuser privs. Who knows,
1574 * we might close the device immediately without doing a
1575 * privileged operation -- cevans
1576 */
1577 as->suser = capable(CAP_SYS_ADMIN);
1578 as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE;
1579 as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ;
1580 spin_lock(&user_list_lock);
1581 as->next = user_list;
1582 user_list = as;
1583 spin_unlock(&user_list_lock);
1584 filp->private_data = as;
1585 return 0;
1586}
1587
1588static int proc_apm_show(struct seq_file *m, void *v)
1589{
1590 unsigned short bx;
1591 unsigned short cx;
1592 unsigned short dx;
1593 int error;
1594 unsigned short ac_line_status = 0xff;
1595 unsigned short battery_status = 0xff;
1596 unsigned short battery_flag = 0xff;
1597 int percentage = -1;
1598 int time_units = -1;
1599 char *units = "?";
1600
1601 if ((num_online_cpus() == 1) &&
1602 !(error = apm_get_power_status(&bx, &cx, &dx))) {
1603 ac_line_status = (bx >> 8) & 0xff;
1604 battery_status = bx & 0xff;
1605 if ((cx & 0xff) != 0xff)
1606 percentage = cx & 0xff;
1607
1608 if (apm_info.connection_version > 0x100) {
1609 battery_flag = (cx >> 8) & 0xff;
1610 if (dx != 0xffff) {
1611 units = (dx & 0x8000) ? "min" : "sec";
1612 time_units = dx & 0x7fff;
1613 }
1614 }
1615 }
1616 /* Arguments, with symbols from linux/apm_bios.h. Information is
1617 from the Get Power Status (0x0a) call unless otherwise noted.
1618
1619 0) Linux driver version (this will change if format changes)
1620 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2.
1621 2) APM flags from APM Installation Check (0x00):
1622 bit 0: APM_16_BIT_SUPPORT
1623 bit 1: APM_32_BIT_SUPPORT
1624 bit 2: APM_IDLE_SLOWS_CLOCK
1625 bit 3: APM_BIOS_DISABLED
1626 bit 4: APM_BIOS_DISENGAGED
1627 3) AC line status
1628 0x00: Off-line
1629 0x01: On-line
1630 0x02: On backup power (BIOS >= 1.1 only)
1631 0xff: Unknown
1632 4) Battery status
1633 0x00: High
1634 0x01: Low
1635 0x02: Critical
1636 0x03: Charging
1637 0x04: Selected battery not present (BIOS >= 1.2 only)
1638 0xff: Unknown
1639 5) Battery flag
1640 bit 0: High
1641 bit 1: Low
1642 bit 2: Critical
1643 bit 3: Charging
1644 bit 7: No system battery
1645 0xff: Unknown
1646 6) Remaining battery life (percentage of charge):
1647 0-100: valid
1648 -1: Unknown
1649 7) Remaining battery life (time units):
1650 Number of remaining minutes or seconds
1651 -1: Unknown
1652 8) min = minutes; sec = seconds */
1653
1654 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
1655 driver_version,
1656 (apm_info.bios.version >> 8) & 0xff,
1657 apm_info.bios.version & 0xff,
1658 apm_info.bios.flags,
1659 ac_line_status,
1660 battery_status,
1661 battery_flag,
1662 percentage,
1663 time_units,
1664 units);
1665 return 0;
1666}
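
A minimal userspace sketch of consuming the line printed above, assuming only the field order documented in the comment (this reader is illustrative and not part of the driver):

#include <stdio.h>

/* Illustrative /proc/apm reader.  Field order follows the comment in
 * proc_apm_show(): driver version, BIOS version, flags, AC line status,
 * battery status, battery flag, percentage, time units and unit name. */
int main(void)
{
	char drv[16], units[8];
	unsigned int flags, ac, bat_status, bat_flag;
	int bios_major, bios_minor, percentage, time_units;
	FILE *f = fopen("/proc/apm", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%15s %d.%d 0x%x 0x%x 0x%x 0x%x %d%% %d %7s",
		   drv, &bios_major, &bios_minor, &flags, &ac, &bat_status,
		   &bat_flag, &percentage, &time_units, units) == 10)
		printf("battery: %d%%, %d %s remaining\n",
		       percentage, time_units, units);
	fclose(f);
	return 0;
}
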
1667
1668static int proc_apm_open(struct inode *inode, struct file *file)
1669{
1670 return single_open(file, proc_apm_show, NULL);
1671}
1672
1673static const struct file_operations apm_file_ops = {
1674 .owner = THIS_MODULE,
1675 .open = proc_apm_open,
1676 .read = seq_read,
1677 .llseek = seq_lseek,
1678 .release = single_release,
1679};
1680
1681static int apm(void *unused)
1682{
1683 unsigned short bx;
1684 unsigned short cx;
1685 unsigned short dx;
1686 int error;
1687 char * power_stat;
1688 char * bat_stat;
1689
1690#ifdef CONFIG_SMP
1691 /* 2002/08/01 - WT
1692 * This is to avoid random crashes at boot time during initialization
1693 * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
1694 * Some bioses don't like being called from CPU != 0.
1695 * Method suggested by Ingo Molnar.
1696 */
1697 set_cpus_allowed(current, cpumask_of_cpu(0));
1698 BUG_ON(smp_processor_id() != 0);
1699#endif
1700
1701 if (apm_info.connection_version == 0) {
1702 apm_info.connection_version = apm_info.bios.version;
1703 if (apm_info.connection_version > 0x100) {
1704 /*
1705 * We only support BIOSes up to version 1.2
1706 */
1707 if (apm_info.connection_version > 0x0102)
1708 apm_info.connection_version = 0x0102;
1709 error = apm_driver_version(&apm_info.connection_version);
1710 if (error != APM_SUCCESS) {
1711 apm_error("driver version", error);
1712 /* Fall back to an APM 1.0 connection. */
1713 apm_info.connection_version = 0x100;
1714 }
1715 }
1716 }
1717
1718 if (debug)
1719 printk(KERN_INFO "apm: Connection version %d.%d\n",
1720 (apm_info.connection_version >> 8) & 0xff,
1721 apm_info.connection_version & 0xff);
1722
1723#ifdef CONFIG_APM_DO_ENABLE
1724 if (apm_info.bios.flags & APM_BIOS_DISABLED) {
1725 /*
1726 * This call causes my NEC UltraLite Versa 33/C to hang if it
1727 * is booted with PM disabled but not in the docking station.
1728 * Unfortunate ...
1729 */
1730 error = apm_enable_power_management(1);
1731 if (error) {
1732 apm_error("enable power management", error);
1733 return -1;
1734 }
1735 }
1736#endif
1737
1738 if ((apm_info.bios.flags & APM_BIOS_DISENGAGED)
1739 && (apm_info.connection_version > 0x0100)) {
1740 error = apm_engage_power_management(APM_DEVICE_ALL, 1);
1741 if (error) {
1742 apm_error("engage power management", error);
1743 return -1;
1744 }
1745 }
1746
1747 if (debug && (num_online_cpus() == 1 || smp )) {
1748 error = apm_get_power_status(&bx, &cx, &dx);
1749 if (error)
1750 printk(KERN_INFO "apm: power status not available\n");
1751 else {
1752 switch ((bx >> 8) & 0xff) {
1753 case 0: power_stat = "off line"; break;
1754 case 1: power_stat = "on line"; break;
1755 case 2: power_stat = "on backup power"; break;
1756 default: power_stat = "unknown"; break;
1757 }
1758 switch (bx & 0xff) {
1759 case 0: bat_stat = "high"; break;
1760 case 1: bat_stat = "low"; break;
1761 case 2: bat_stat = "critical"; break;
1762 case 3: bat_stat = "charging"; break;
1763 default: bat_stat = "unknown"; break;
1764 }
1765 printk(KERN_INFO
1766 "apm: AC %s, battery status %s, battery life ",
1767 power_stat, bat_stat);
1768 if ((cx & 0xff) == 0xff)
1769 printk("unknown\n");
1770 else
1771 printk("%d%%\n", cx & 0xff);
1772 if (apm_info.connection_version > 0x100) {
1773 printk(KERN_INFO
1774 "apm: battery flag 0x%02x, battery life ",
1775 (cx >> 8) & 0xff);
1776 if (dx == 0xffff)
1777 printk("unknown\n");
1778 else
1779 printk("%d %s\n", dx & 0x7fff,
1780 (dx & 0x8000) ?
1781 "minutes" : "seconds");
1782 }
1783 }
1784 }
1785
1786 /* Install our power off handler.. */
1787 if (power_off)
1788 pm_power_off = apm_power_off;
1789
1790 if (num_online_cpus() == 1 || smp) {
1791#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1792 console_blank_hook = apm_console_blank;
1793#endif
1794 apm_mainloop();
1795#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1796 console_blank_hook = NULL;
1797#endif
1798 }
1799
1800 return 0;
1801}
1802
1803#ifndef MODULE
1804static int __init apm_setup(char *str)
1805{
1806 int invert;
1807
1808 while ((str != NULL) && (*str != '\0')) {
1809 if (strncmp(str, "off", 3) == 0)
1810 apm_disabled = 1;
1811 if (strncmp(str, "on", 2) == 0)
1812 apm_disabled = 0;
1813 if ((strncmp(str, "bounce-interval=", 16) == 0) ||
1814 (strncmp(str, "bounce_interval=", 16) == 0))
1815 bounce_interval = simple_strtol(str + 16, NULL, 0);
1816 if ((strncmp(str, "idle-threshold=", 15) == 0) ||
1817 (strncmp(str, "idle_threshold=", 15) == 0))
1818 idle_threshold = simple_strtol(str + 15, NULL, 0);
1819 if ((strncmp(str, "idle-period=", 12) == 0) ||
1820 (strncmp(str, "idle_period=", 12) == 0))
1821 idle_period = simple_strtol(str + 12, NULL, 0);
1822 invert = (strncmp(str, "no-", 3) == 0) ||
1823 (strncmp(str, "no_", 3) == 0);
1824 if (invert)
1825 str += 3;
1826 if (strncmp(str, "debug", 5) == 0)
1827 debug = !invert;
1828 if ((strncmp(str, "power-off", 9) == 0) ||
1829 (strncmp(str, "power_off", 9) == 0))
1830 power_off = !invert;
1831 if (strncmp(str, "smp", 3) == 0)
1832 {
1833 smp = !invert;
1834 idle_threshold = 100;
1835 }
1836 if ((strncmp(str, "allow-ints", 10) == 0) ||
1837 (strncmp(str, "allow_ints", 10) == 0))
1838 apm_info.allow_ints = !invert;
1839 if ((strncmp(str, "broken-psr", 10) == 0) ||
1840 (strncmp(str, "broken_psr", 10) == 0))
1841 apm_info.get_power_status_broken = !invert;
1842 if ((strncmp(str, "realmode-power-off", 18) == 0) ||
1843 (strncmp(str, "realmode_power_off", 18) == 0))
1844 apm_info.realmode_power_off = !invert;
1845 str = strchr(str, ',');
1846 if (str != NULL)
1847 str += strspn(str, ", \t");
1848 }
1849 return 1;
1850}
1851
1852__setup("apm=", apm_setup);
1853#endif
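/*
 * A sketch of how the options above combine on the kernel command line
 * (hypothetical values; parsed by apm_setup() above):
 *
 *     apm=on,debug,power-off,idle-threshold=90,idle-period=600
 *
 * Options are comma separated, and a "no-" or "no_" prefix inverts the
 * boolean options (e.g. "no-debug").
 */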
1854
1855static const struct file_operations apm_bios_fops = {
1856 .owner = THIS_MODULE,
1857 .read = do_read,
1858 .poll = do_poll,
1859 .ioctl = do_ioctl,
1860 .open = do_open,
1861 .release = do_release,
1862};
1863
1864static struct miscdevice apm_device = {
1865 APM_MINOR_DEV,
1866 "apm_bios",
1867 &apm_bios_fops
1868};
1869
1870
1871/* Simple "print if true" callback */
1872static int __init print_if_true(const struct dmi_system_id *d)
1873{
1874 printk("%s\n", d->ident);
1875 return 0;
1876}
1877
1878/*
1879 * Some BIOSes enable the PS/2 mouse (touchpad) at resume, even if it was
1880 * disabled before the suspend. Linux used to get terribly confused by that.
1881 */
1882static int __init broken_ps2_resume(const struct dmi_system_id *d)
1883{
1884 printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident);
1885 return 0;
1886}
1887
1888/* Some BIOSes have a broken protected mode poweroff and need to use real mode */
1889static int __init set_realmode_power_off(const struct dmi_system_id *d)
1890{
1891 if (apm_info.realmode_power_off == 0) {
1892 apm_info.realmode_power_off = 1;
1893 printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident);
1894 }
1895 return 0;
1896}
1897
1898/* Some laptops require interrupts to be enabled during APM calls */
1899static int __init set_apm_ints(const struct dmi_system_id *d)
1900{
1901 if (apm_info.allow_ints == 0) {
1902 apm_info.allow_ints = 1;
1903 printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident);
1904 }
1905 return 0;
1906}
1907
1908/* Some APM BIOSes corrupt memory or just plain do not work */
1909static int __init apm_is_horked(const struct dmi_system_id *d)
1910{
1911 if (apm_info.disabled == 0) {
1912 apm_info.disabled = 1;
1913 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
1914 }
1915 return 0;
1916}
1917
1918static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1919{
1920 if (apm_info.disabled == 0) {
1921 apm_info.disabled = 1;
1922 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
1923 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
1924 printk(KERN_INFO "download from support.intel.com \n");
1925 }
1926 return 0;
1927}
1928
1929/* Some APM BIOSes hang on APM idle calls */
1930static int __init apm_likes_to_melt(const struct dmi_system_id *d)
1931{
1932 if (apm_info.forbid_idle == 0) {
1933 apm_info.forbid_idle = 1;
1934 printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident);
1935 }
1936 return 0;
1937}
1938
1939/*
1940 * Check for clue-free BIOS implementations that use
1941 * the following QA technique:
1942 *
1943 * [ Write BIOS Code ]<------
1944 * | ^
1945 * < Does it Compile >----N--
1946 * |Y ^
1947 * < Does it Boot Win98 >-N--
1948 * |Y
1949 * [Ship It]
1950 *
1951 * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e)
1952 * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000)
1953 */
1954static int __init broken_apm_power(const struct dmi_system_id *d)
1955{
1956 apm_info.get_power_status_broken = 1;
1957 printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n");
1958 return 0;
1959}
1960
1961/*
1962 * This BIOS swaps the APM minute reporting bytes over (many Sony laptops
1963 * have this problem).
1964 */
1965static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
1966{
1967 apm_info.get_power_status_swabinminutes = 1;
1968 printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n");
1969 return 0;
1970}
1971
1972static struct dmi_system_id __initdata apm_dmi_table[] = {
1973 {
1974 print_if_true,
1975 KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
1976 { DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
1977 DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), },
1978 },
1979 { /* Handle problems with APM on the C600 */
1980 broken_ps2_resume, "Dell Latitude C600",
1981 { DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
1982 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), },
1983 },
1984 { /* Allow interrupts during suspend on Dell Latitude laptops*/
1985 set_apm_ints, "Dell Latitude",
1986 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1987 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), }
1988 },
1989 { /* APM crashes */
1990 apm_is_horked, "Dell Inspiron 2500",
1991 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1992 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
1993 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
1994 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
1995 },
1996 { /* Allow interrupts during suspend on Dell Inspiron laptops*/
1997 set_apm_ints, "Dell Inspiron", {
1998 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1999 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), },
2000 },
2001 { /* Handle problems with APM on Inspiron 5000e */
2002 broken_apm_power, "Dell Inspiron 5000e",
2003 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2004 DMI_MATCH(DMI_BIOS_VERSION, "A04"),
2005 DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), },
2006 },
2007 { /* Handle problems with APM on Inspiron 2500 */
2008 broken_apm_power, "Dell Inspiron 2500",
2009 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2010 DMI_MATCH(DMI_BIOS_VERSION, "A12"),
2011 DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), },
2012 },
2013 { /* APM crashes */
2014 apm_is_horked, "Dell Dimension 4100",
2015 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2016 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
2017 DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."),
2018 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
2019 },
2020 { /* Allow interrupts during suspend on Compaq Laptops*/
2021 set_apm_ints, "Compaq 12XL125",
2022 { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
2023 DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
2024 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2025 DMI_MATCH(DMI_BIOS_VERSION,"4.06"), },
2026 },
2027 { /* Allow interrupts during APM or the clock goes slow */
2028 set_apm_ints, "ASUSTeK",
2029 { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
2030 DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), },
2031 },
2032 { /* APM blows on shutdown */
2033 apm_is_horked, "ABIT KX7-333[R]",
2034 { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"),
2035 DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), },
2036 },
2037 { /* APM crashes */
2038 apm_is_horked, "Trigem Delhi3",
2039 { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"),
2040 DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), },
2041 },
2042 { /* APM crashes */
2043 apm_is_horked, "Fujitsu-Siemens",
2044 { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"),
2045 DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), },
2046 },
2047 { /* APM crashes */
2048 apm_is_horked_d850md, "Intel D850MD",
2049 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2050 DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), },
2051 },
2052 { /* APM crashes */
2053 apm_is_horked, "Intel D810EMO",
2054 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2055 DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), },
2056 },
2057 { /* APM crashes */
2058 apm_is_horked, "Dell XPS-Z",
2059 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2060 DMI_MATCH(DMI_BIOS_VERSION, "A11"),
2061 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), },
2062 },
2063 { /* APM crashes */
2064 apm_is_horked, "Sharp PC-PJ/AX",
2065 { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
2066 DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
2067 DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"),
2068 DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), },
2069 },
2070 { /* APM crashes */
2071 apm_is_horked, "Dell Inspiron 2500",
2072 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2073 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
2074 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
2075 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
2076 },
2077 { /* APM idle hangs */
2078 apm_likes_to_melt, "Jabil AMD",
2079 { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
2080 DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), },
2081 },
2082 { /* APM idle hangs */
2083 apm_likes_to_melt, "AMI Bios",
2084 { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
2085 DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), },
2086 },
2087 { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */
2088 swab_apm_power_in_minutes, "Sony VAIO",
2089 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2090 DMI_MATCH(DMI_BIOS_VERSION, "R0206H"),
2091 DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), },
2092 },
2093 { /* Handle problems with APM on Sony Vaio PCG-N505VX */
2094 swab_apm_power_in_minutes, "Sony VAIO",
2095 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2096 DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"),
2097 DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), },
2098 },
2099 { /* Handle problems with APM on Sony Vaio PCG-XG29 */
2100 swab_apm_power_in_minutes, "Sony VAIO",
2101 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2102 DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"),
2103 DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), },
2104 },
2105 { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
2106 swab_apm_power_in_minutes, "Sony VAIO",
2107 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2108 DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"),
2109 DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), },
2110 },
2111 { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
2112 swab_apm_power_in_minutes, "Sony VAIO",
2113 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2114 DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"),
2115 DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), },
2116 },
2117 { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */
2118 swab_apm_power_in_minutes, "Sony VAIO",
2119 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2120 DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"),
2121 DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), },
2122 },
2123 { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
2124 swab_apm_power_in_minutes, "Sony VAIO",
2125 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2126 DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"),
2127 DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), },
2128 },
2129 { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
2130 swab_apm_power_in_minutes, "Sony VAIO",
2131 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2132 DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"),
2133 DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), },
2134 },
2135 { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */
2136 swab_apm_power_in_minutes, "Sony VAIO",
2137 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2138 DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"),
2139 DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), },
2140 },
2141 { /* Handle problems with APM on Sony Vaio PCG-F104K */
2142 swab_apm_power_in_minutes, "Sony VAIO",
2143 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2144 DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"),
2145 DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), },
2146 },
2147
2148 { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */
2149 swab_apm_power_in_minutes, "Sony VAIO",
2150 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2151 DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"),
2152 DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), },
2153 },
2154 { /* Handle problems with APM on Sony Vaio PCG-C1VE */
2155 swab_apm_power_in_minutes, "Sony VAIO",
2156 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2157 DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"),
2158 DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), },
2159 },
2160 { /* Handle problems with APM on Sony Vaio PCG-C1VE */
2161 swab_apm_power_in_minutes, "Sony VAIO",
2162 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2163 DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"),
2164 DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), },
2165 },
2166 { /* broken PM poweroff bios */
2167 set_realmode_power_off, "Award Software v4.60 PGMA",
2168 { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."),
2169 DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
2170 DMI_MATCH(DMI_BIOS_DATE, "134526184"), },
2171 },
2172
2173 /* Generic per vendor APM settings */
2174
2175 { /* Allow interrupts during suspend on IBM laptops */
2176 set_apm_ints, "IBM",
2177 { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), },
2178 },
2179
2180 { }
2181};
2182
2183/*
2184 * Just start the APM thread. We do NOT want to do APM BIOS
2185 * calls from anything but the APM thread, if for no other reason
2186 * than the fact that we don't trust the APM BIOS. This way,
2187 * most common APM BIOS problems that lead to protection errors,
2188 * etc. will at least be somewhat contained...
2189 *
2190 * In short, if something bad happens, at least we have a choice
2191 * of just killing the apm thread..
2192 */
2193static int __init apm_init(void)
2194{
2195 struct proc_dir_entry *apm_proc;
2196 struct desc_struct *gdt;
2197 int err;
2198
2199 dmi_check_system(apm_dmi_table);
2200
2201 if (apm_info.bios.version == 0 || paravirt_enabled()) {
2202 printk(KERN_INFO "apm: BIOS not found.\n");
2203 return -ENODEV;
2204 }
2205 printk(KERN_INFO
2206 "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
2207 ((apm_info.bios.version >> 8) & 0xff),
2208 (apm_info.bios.version & 0xff),
2209 apm_info.bios.flags,
2210 driver_version);
2211 if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
2212 printk(KERN_INFO "apm: no 32 bit BIOS support\n");
2213 return -ENODEV;
2214 }
2215
2216 if (allow_ints)
2217 apm_info.allow_ints = 1;
2218 if (broken_psr)
2219 apm_info.get_power_status_broken = 1;
2220 if (realmode_power_off)
2221 apm_info.realmode_power_off = 1;
2222 /* User can override, but default is to trust DMI */
2223 if (apm_disabled != -1)
2224 apm_info.disabled = apm_disabled;
2225
2226 /*
2227 * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1
2228 * but is reportedly a 1.0 BIOS.
2229 */
2230 if (apm_info.bios.version == 0x001)
2231 apm_info.bios.version = 0x100;
2232
2233 /* BIOS < 1.2 doesn't set cseg_16_len */
2234 if (apm_info.bios.version < 0x102)
2235 apm_info.bios.cseg_16_len = 0; /* 64k */
2236
2237 if (debug) {
2238 printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x",
2239 apm_info.bios.cseg, apm_info.bios.offset,
2240 apm_info.bios.cseg_16, apm_info.bios.dseg);
2241 if (apm_info.bios.version > 0x100)
2242 printk(" cseg len %x, dseg len %x",
2243 apm_info.bios.cseg_len,
2244 apm_info.bios.dseg_len);
2245 if (apm_info.bios.version > 0x101)
2246 printk(" cseg16 len %x", apm_info.bios.cseg_16_len);
2247 printk("\n");
2248 }
2249
2250 if (apm_info.disabled) {
2251 printk(KERN_NOTICE "apm: disabled on user request.\n");
2252 return -ENODEV;
2253 }
2254 if ((num_online_cpus() > 1) && !power_off && !smp) {
2255 printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
2256 apm_info.disabled = 1;
2257 return -ENODEV;
2258 }
2259 if (PM_IS_ACTIVE()) {
2260 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2261 apm_info.disabled = 1;
2262 return -ENODEV;
2263 }
2264#ifdef CONFIG_PM_LEGACY
2265 pm_active = 1;
2266#endif
2267
2268 /*
2269 * Set up a segment that references the real mode segment 0x40
2270 * that extends up to the end of page zero (that we have reserved).
2271 * This is for buggy BIOS's that refer to (real mode) segment 0x40
2272 * even though they are called in protected mode.
2273 */
2274 set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
2275 _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
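/*
 * Arithmetic sketch (derived from the two calls above): 0x40 << 4 is
 * physical address 0x400, so the descriptor base becomes __va(0x400) and
 * the limit of 4095 - 0x400 = 0xbff keeps accesses within the reserved
 * page zero.
 */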
2276
2277 /*
2278 * Set up the long jump entry point to the APM BIOS, which is called
2279 * from inline assembly.
2280 */
2281 apm_bios_entry.offset = apm_info.bios.offset;
2282 apm_bios_entry.segment = APM_CS;
2283
2284 /*
2285 * The APM 1.1 BIOS is supposed to provide limit information that it
2286 * recognizes. Many machines do this correctly, but many others do
2287 * not restrict themselves to their claimed limit. When this happens,
2288 * they will cause a segmentation violation in the kernel at boot time.
2289 * Most BIOS's, however, will respect a 64k limit, so we use that.
2290 *
2291 * Note we only set APM segments on CPU zero, since we pin the APM
2292 * code to that CPU.
2293 */
2294 gdt = get_cpu_gdt_table(0);
2295 set_base(gdt[APM_CS >> 3],
2296 __va((unsigned long)apm_info.bios.cseg << 4));
2297 set_base(gdt[APM_CS_16 >> 3],
2298 __va((unsigned long)apm_info.bios.cseg_16 << 4));
2299 set_base(gdt[APM_DS >> 3],
2300 __va((unsigned long)apm_info.bios.dseg << 4));
2301
2302 apm_proc = create_proc_entry("apm", 0, NULL);
2303 if (apm_proc)
2304 apm_proc->proc_fops = &apm_file_ops;
2305
2306 kapmd_task = kthread_create(apm, NULL, "kapmd");
2307 if (IS_ERR(kapmd_task)) {
2308 printk(KERN_ERR "apm: disabled - Unable to start kernel "
2309 "thread.\n");
2310 err = PTR_ERR(kapmd_task);
2311 kapmd_task = NULL;
2312 remove_proc_entry("apm", NULL);
2313 return err;
2314 }
2315 wake_up_process(kapmd_task);
2316
2317 if (num_online_cpus() > 1 && !smp ) {
2318 printk(KERN_NOTICE
2319 "apm: disabled - APM is not SMP safe (power off active).\n");
2320 return 0;
2321 }
2322
2323 /*
2324 * Note that we don't actually care if the misc device cannot be
2325 * registered: this driver can do its job without it, even if userspace
2326 * can't control it. Just log the error.
2327 */
2328 if (misc_register(&apm_device))
2329 printk(KERN_WARNING "apm: Could not register misc device.\n");
2330
2331 if (HZ != 100)
2332 idle_period = (idle_period * HZ) / 100;
2333 if (idle_threshold < 100) {
2334 original_pm_idle = pm_idle;
2335 pm_idle = apm_cpu_idle;
2336 set_pm_idle = 1;
2337 }
2338
2339 return 0;
2340}
2341
2342static void __exit apm_exit(void)
2343{
2344 int error;
2345
2346 if (set_pm_idle) {
2347 pm_idle = original_pm_idle;
2348 /*
2349 * We are about to unload the current idle thread pm callback
2350 * (pm_idle), Wait for all processors to update cached/local
2351 * copies of pm_idle before proceeding.
2352 */
2353 cpu_idle_wait();
2354 }
2355 if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
2356 && (apm_info.connection_version > 0x0100)) {
2357 error = apm_engage_power_management(APM_DEVICE_ALL, 0);
2358 if (error)
2359 apm_error("disengage power management", error);
2360 }
2361 misc_deregister(&apm_device);
2362 remove_proc_entry("apm", NULL);
2363 if (power_off)
2364 pm_power_off = NULL;
2365 if (kapmd_task) {
2366 kthread_stop(kapmd_task);
2367 kapmd_task = NULL;
2368 }
2369#ifdef CONFIG_PM_LEGACY
2370 pm_active = 0;
2371#endif
2372}
2373
2374module_init(apm_init);
2375module_exit(apm_exit);
2376
2377MODULE_AUTHOR("Stephen Rothwell");
2378MODULE_DESCRIPTION("Advanced Power Management");
2379MODULE_LICENSE("GPL");
2380module_param(debug, bool, 0644);
2381MODULE_PARM_DESC(debug, "Enable debug mode");
2382module_param(power_off, bool, 0444);
2383MODULE_PARM_DESC(power_off, "Enable power off");
2384module_param(bounce_interval, int, 0444);
2385MODULE_PARM_DESC(bounce_interval,
2386 "Set the number of ticks to ignore suspend bounces");
2387module_param(allow_ints, bool, 0444);
2388MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls");
2389module_param(broken_psr, bool, 0444);
2390MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
2391module_param(realmode_power_off, bool, 0444);
2392MODULE_PARM_DESC(realmode_power_off,
2393 "Switch to real mode before powering off");
2394module_param(idle_threshold, int, 0444);
2395MODULE_PARM_DESC(idle_threshold,
2396 "System idle percentage above which to make APM BIOS idle calls");
2397module_param(idle_period, int, 0444);
2398MODULE_PARM_DESC(idle_period,
2399 "Period (in sec/100) over which to caculate the idle percentage");
2400module_param(smp, bool, 0444);
2401MODULE_PARM_DESC(smp,
2402 "Set this to enable APM use on an SMP platform. Use with caution on older systems");
2403MODULE_ALIAS_MISCDEV(APM_MINOR_DEV);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
new file mode 100644
index 000000000000..cfa82c899f47
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets.c
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "asm-offsets_32.c"
3#else
4# include "asm-offsets_64.c"
5#endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
new file mode 100644
index 000000000000..8029742c0fc1
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -0,0 +1,147 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/signal.h>
10#include <linux/personality.h>
11#include <linux/suspend.h>
12#include <asm/ucontext.h>
13#include "sigframe_32.h"
14#include <asm/pgtable.h>
15#include <asm/fixmap.h>
16#include <asm/processor.h>
17#include <asm/thread_info.h>
18#include <asm/elf.h>
19
20#include <xen/interface/xen.h>
21
22#ifdef CONFIG_LGUEST_GUEST
23#include <linux/lguest.h>
24#include "../../../drivers/lguest/lg.h"
25#endif
26
27#define DEFINE(sym, val) \
28 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
29
30#define BLANK() asm volatile("\n->" : : )
31
32#define OFFSET(sym, str, mem) \
33 DEFINE(sym, offsetof(struct str, mem));
34
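/* Illustration of the mechanism (hypothetical offset value): DEFINE()
 * plants marker lines such as
 *
 *     ->PT_EAX $24 offsetof(struct pt_regs, eax)
 *
 * in the compiler's assembly output; the build then post-processes those
 * "->" markers into "#define PT_EAX 24" style constants in a generated
 * asm-offsets header that assembly code can include.
 */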
35/* workaround for a warning with -Wmissing-prototypes */
36void foo(void);
37
38void foo(void)
39{
40 OFFSET(SIGCONTEXT_eax, sigcontext, eax);
41 OFFSET(SIGCONTEXT_ebx, sigcontext, ebx);
42 OFFSET(SIGCONTEXT_ecx, sigcontext, ecx);
43 OFFSET(SIGCONTEXT_edx, sigcontext, edx);
44 OFFSET(SIGCONTEXT_esi, sigcontext, esi);
45 OFFSET(SIGCONTEXT_edi, sigcontext, edi);
46 OFFSET(SIGCONTEXT_ebp, sigcontext, ebp);
47 OFFSET(SIGCONTEXT_esp, sigcontext, esp);
48 OFFSET(SIGCONTEXT_eip, sigcontext, eip);
49 BLANK();
50
51 OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
52 OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
53 OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
54 OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
55 OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
56 OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
57 OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
58 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
59 BLANK();
60
61 OFFSET(TI_task, thread_info, task);
62 OFFSET(TI_exec_domain, thread_info, exec_domain);
63 OFFSET(TI_flags, thread_info, flags);
64 OFFSET(TI_status, thread_info, status);
65 OFFSET(TI_preempt_count, thread_info, preempt_count);
66 OFFSET(TI_addr_limit, thread_info, addr_limit);
67 OFFSET(TI_restart_block, thread_info, restart_block);
68 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
69 OFFSET(TI_cpu, thread_info, cpu);
70 BLANK();
71
72 OFFSET(GDS_size, Xgt_desc_struct, size);
73 OFFSET(GDS_address, Xgt_desc_struct, address);
74 OFFSET(GDS_pad, Xgt_desc_struct, pad);
75 BLANK();
76
77 OFFSET(PT_EBX, pt_regs, ebx);
78 OFFSET(PT_ECX, pt_regs, ecx);
79 OFFSET(PT_EDX, pt_regs, edx);
80 OFFSET(PT_ESI, pt_regs, esi);
81 OFFSET(PT_EDI, pt_regs, edi);
82 OFFSET(PT_EBP, pt_regs, ebp);
83 OFFSET(PT_EAX, pt_regs, eax);
84 OFFSET(PT_DS, pt_regs, xds);
85 OFFSET(PT_ES, pt_regs, xes);
86 OFFSET(PT_FS, pt_regs, xfs);
87 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
88 OFFSET(PT_EIP, pt_regs, eip);
89 OFFSET(PT_CS, pt_regs, xcs);
90 OFFSET(PT_EFLAGS, pt_regs, eflags);
91 OFFSET(PT_OLDESP, pt_regs, esp);
92 OFFSET(PT_OLDSS, pt_regs, xss);
93 BLANK();
94
95 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
96 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
97 BLANK();
98
99 OFFSET(pbe_address, pbe, address);
100 OFFSET(pbe_orig_address, pbe, orig_address);
101 OFFSET(pbe_next, pbe, next);
102
103 /* Offset from the sysenter stack to tss.esp0 */
104 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
105 sizeof(struct tss_struct));
106
107 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
108 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
109 DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
110 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
111 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
112
113 DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
114
115 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
116
117#ifdef CONFIG_PARAVIRT
118 BLANK();
119 OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
120 OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
121 OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
122 OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
123 OFFSET(PARAVIRT_iret, paravirt_ops, iret);
124 OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
125#endif
126
127#ifdef CONFIG_XEN
128 BLANK();
129 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
130 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
131#endif
132
133#ifdef CONFIG_LGUEST_GUEST
134 BLANK();
135 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
136 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
137 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
138 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
139 OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
140 OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
141 OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
142 OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
143 OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
144 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
145 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
146#endif
147}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
new file mode 100644
index 000000000000..778953bc636c
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -0,0 +1,85 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/stddef.h>
10#include <linux/errno.h>
11#include <linux/hardirq.h>
12#include <linux/suspend.h>
13#include <asm/pda.h>
14#include <asm/processor.h>
15#include <asm/segment.h>
16#include <asm/thread_info.h>
17#include <asm/ia32.h>
18
19#define DEFINE(sym, val) \
20 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
21
22#define BLANK() asm volatile("\n->" : : )
23
24#define __NO_STUBS 1
25#undef __SYSCALL
26#undef _ASM_X86_64_UNISTD_H_
27#define __SYSCALL(nr, sym) [nr] = 1,
28static char syscalls[] = {
29#include <asm/unistd.h>
30};
31
32int main(void)
33{
34#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
35 ENTRY(state);
36 ENTRY(flags);
37 ENTRY(thread);
38 ENTRY(pid);
39 BLANK();
40#undef ENTRY
41#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
42 ENTRY(flags);
43 ENTRY(addr_limit);
44 ENTRY(preempt_count);
45 ENTRY(status);
46 BLANK();
47#undef ENTRY
48#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
49 ENTRY(kernelstack);
50 ENTRY(oldrsp);
51 ENTRY(pcurrent);
52 ENTRY(irqcount);
53 ENTRY(cpunumber);
54 ENTRY(irqstackptr);
55 ENTRY(data_offset);
56 BLANK();
57#undef ENTRY
58#ifdef CONFIG_IA32_EMULATION
59#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
60 ENTRY(eax);
61 ENTRY(ebx);
62 ENTRY(ecx);
63 ENTRY(edx);
64 ENTRY(esi);
65 ENTRY(edi);
66 ENTRY(ebp);
67 ENTRY(esp);
68 ENTRY(eip);
69 BLANK();
70#undef ENTRY
71 DEFINE(IA32_RT_SIGFRAME_sigcontext,
72 offsetof (struct rt_sigframe32, uc.uc_mcontext));
73 BLANK();
74#endif
75 DEFINE(pbe_address, offsetof(struct pbe, address));
76 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
77 DEFINE(pbe_next, offsetof(struct pbe, next));
78 BLANK();
79 DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
80 BLANK();
81 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
82 BLANK();
83 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
84 return 0;
85}
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
new file mode 100644
index 000000000000..06d3e5a14d9d
--- /dev/null
+++ b/arch/x86/kernel/audit_64.c
@@ -0,0 +1,81 @@
1#include <linux/init.h>
2#include <linux/types.h>
3#include <linux/audit.h>
4#include <asm/unistd.h>
5
6static unsigned dir_class[] = {
7#include <asm-generic/audit_dir_write.h>
8~0U
9};
10
11static unsigned read_class[] = {
12#include <asm-generic/audit_read.h>
13~0U
14};
15
16static unsigned write_class[] = {
17#include <asm-generic/audit_write.h>
18~0U
19};
20
21static unsigned chattr_class[] = {
22#include <asm-generic/audit_change_attr.h>
23~0U
24};
25
26static unsigned signal_class[] = {
27#include <asm-generic/audit_signal.h>
28~0U
29};
30
31int audit_classify_arch(int arch)
32{
33#ifdef CONFIG_IA32_EMULATION
34 if (arch == AUDIT_ARCH_I386)
35 return 1;
36#endif
37 return 0;
38}
39
40int audit_classify_syscall(int abi, unsigned syscall)
41{
42#ifdef CONFIG_IA32_EMULATION
43 extern int ia32_classify_syscall(unsigned);
44 if (abi == AUDIT_ARCH_I386)
45 return ia32_classify_syscall(syscall);
46#endif
47 switch(syscall) {
48 case __NR_open:
49 return 2;
50 case __NR_openat:
51 return 3;
52 case __NR_execve:
53 return 5;
54 default:
55 return 0;
56 }
57}
58
59static int __init audit_classes_init(void)
60{
61#ifdef CONFIG_IA32_EMULATION
62 extern __u32 ia32_dir_class[];
63 extern __u32 ia32_write_class[];
64 extern __u32 ia32_read_class[];
65 extern __u32 ia32_chattr_class[];
66 extern __u32 ia32_signal_class[];
67 audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
68 audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
69 audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
70 audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
71 audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class);
72#endif
73 audit_register_class(AUDIT_CLASS_WRITE, write_class);
74 audit_register_class(AUDIT_CLASS_READ, read_class);
75 audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
76 audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
77 audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
78 return 0;
79}
80
81__initcall(audit_classes_init);
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
new file mode 100644
index 000000000000..0b9860530a6b
--- /dev/null
+++ b/arch/x86/kernel/bootflag.c
@@ -0,0 +1,98 @@
1/*
2 * Implement 'Simple Boot Flag Specification 2.0'
3 */
4
5
6#include <linux/types.h>
7#include <linux/kernel.h>
8#include <linux/init.h>
9#include <linux/string.h>
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/acpi.h>
13#include <asm/io.h>
14
15#include <linux/mc146818rtc.h>
16
17
18#define SBF_RESERVED (0x78)
19#define SBF_PNPOS (1<<0)
20#define SBF_BOOTING (1<<1)
21#define SBF_DIAG (1<<2)
22#define SBF_PARITY (1<<7)
23
24
25int sbf_port __initdata = -1; /* set via acpi_boot_init() */
26
27
28static int __init parity(u8 v)
29{
30 int x = 0;
31 int i;
32
33 for(i=0;i<8;i++)
34 {
35 x^=(v&1);
36 v>>=1;
37 }
38 return x;
39}
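/*
 * For illustration: parity() returns the XOR of all eight bits, so
 * parity(0x03) == 0 (two bits set) and parity(0x01) == 1. sbf_write()
 * below uses it to force odd parity over the whole byte.
 */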
40
41static void __init sbf_write(u8 v)
42{
43 unsigned long flags;
44 if(sbf_port != -1)
45 {
46 v &= ~SBF_PARITY;
47 if(!parity(v))
48 v|=SBF_PARITY;
49
50 printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v);
51
52 spin_lock_irqsave(&rtc_lock, flags);
53 CMOS_WRITE(v, sbf_port);
54 spin_unlock_irqrestore(&rtc_lock, flags);
55 }
56}
57
58static u8 __init sbf_read(void)
59{
60 u8 v;
61 unsigned long flags;
62 if(sbf_port == -1)
63 return 0;
64 spin_lock_irqsave(&rtc_lock, flags);
65 v = CMOS_READ(sbf_port);
66 spin_unlock_irqrestore(&rtc_lock, flags);
67 return v;
68}
69
70static int __init sbf_value_valid(u8 v)
71{
72 if(v&SBF_RESERVED) /* Reserved bits */
73 return 0;
74 if(!parity(v))
75 return 0;
76 return 1;
77}
78
79static int __init sbf_init(void)
80{
81 u8 v;
82 if(sbf_port == -1)
83 return 0;
84 v = sbf_read();
85 if(!sbf_value_valid(v))
86 printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v);
87
88 v &= ~SBF_RESERVED;
89 v &= ~SBF_BOOTING;
90 v &= ~SBF_DIAG;
91#if defined(CONFIG_ISAPNP)
92 v |= SBF_PNPOS;
93#endif
94 sbf_write(v);
95 return 0;
96}
97
98module_init(sbf_init);
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c
new file mode 100644
index 000000000000..4e5e9d364d63
--- /dev/null
+++ b/arch/x86/kernel/bugs_64.c
@@ -0,0 +1,24 @@
1/*
2 * arch/x86_64/kernel/bugs.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 * Copyright (C) 2000 SuSE
6 */
7
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <asm/alternative.h>
11#include <asm/bugs.h>
12#include <asm/processor.h>
13#include <asm/mtrr.h>
14
15void __init check_bugs(void)
16{
17 identify_cpu(&boot_cpu_data);
18 mtrr_bp_init();
19#if !defined(CONFIG_SMP)
20 printk("CPU: ");
21 print_cpu_info(&boot_cpu_data);
22#endif
23 alternative_instructions();
24}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
new file mode 100644
index 000000000000..778396c78d65
--- /dev/null
+++ b/arch/x86/kernel/cpu/Makefile
@@ -0,0 +1,20 @@
1#
2# Makefile for x86-compatible CPU details and quirks
3#
4
5obj-y := common.o proc.o bugs.o
6
7obj-y += amd.o
8obj-y += cyrix.o
9obj-y += centaur.o
10obj-y += transmeta.o
11obj-y += intel.o intel_cacheinfo.o addon_cpuid_features.o
12obj-y += nexgen.o
13obj-y += umc.o
14
15obj-$(CONFIG_X86_MCE) += mcheck/
16
17obj-$(CONFIG_MTRR) += mtrr/
18obj-$(CONFIG_CPU_FREQ) += cpufreq/
19
20obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
new file mode 100644
index 000000000000..3e91d3ee26ec
--- /dev/null
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -0,0 +1,50 @@
1
2/*
3 * Routines to identify additional CPU features that are scattered in
4 * CPUID space.
5 */
6
7#include <linux/cpu.h>
8
9#include <asm/processor.h>
10
11struct cpuid_bit {
12 u16 feature;
13 u8 reg;
14 u8 bit;
15 u32 level;
16};
17
18enum cpuid_regs {
19 CR_EAX = 0,
20 CR_ECX,
21 CR_EDX,
22 CR_EBX
23};
24
25void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
26{
27 u32 max_level;
28 u32 regs[4];
29 const struct cpuid_bit *cb;
30
31 static const struct cpuid_bit cpuid_bits[] = {
32 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
33 { 0, 0, 0, 0 }
34 };
35
36 for (cb = cpuid_bits; cb->feature; cb++) {
37
38 /* Verify that the level is valid */
39 max_level = cpuid_eax(cb->level & 0xffff0000);
40 if (max_level < cb->level ||
41 max_level > (cb->level | 0xffff))
42 continue;
43
44 cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
45 &regs[CR_ECX], &regs[CR_EDX]);
46
47 if (regs[cb->reg] & (1 << cb->bit))
48 set_bit(cb->feature, c->x86_capability);
49 }
50}
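/*
 * Illustration (not part of the original file): for the X86_FEATURE_IDA
 * entry above, cb->level is 0x00000006, so the loop first reads CPUID
 * leaf 0 to confirm the maximum standard leaf is at least 6, then runs
 * CPUID(6) and tests bit 1 of EAX before setting the capability bit.
 */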
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
new file mode 100644
index 000000000000..dcf6bbb1c7c0
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd.c
@@ -0,0 +1,337 @@
1#include <linux/init.h>
2#include <linux/bitops.h>
3#include <linux/mm.h>
4#include <asm/io.h>
5#include <asm/processor.h>
6#include <asm/apic.h>
7
8#include "cpu.h"
9
10/*
11 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
12 * misexecution of code under Linux. Owners of such processors should
13 * contact AMD for precise details and a CPU swap.
14 *
15 * See http://www.multimania.com/poulot/k6bug.html
16 * http://www.amd.com/K6/k6docs/revgd.html
17 *
18 * The following test is erm.. interesting. AMD neglected to up
19 * the chip setting when fixing the bug but they also tweaked some
20 * performance at the same time..
21 */
22
23extern void vide(void);
24__asm__(".align 4\nvide: ret");
25
26#ifdef CONFIG_X86_LOCAL_APIC
27#define ENABLE_C1E_MASK 0x18000000
28#define CPUID_PROCESSOR_SIGNATURE 1
29#define CPUID_XFAM 0x0ff00000
30#define CPUID_XFAM_K8 0x00000000
31#define CPUID_XFAM_10H 0x00100000
32#define CPUID_XFAM_11H 0x00200000
33#define CPUID_XMOD 0x000f0000
34#define CPUID_XMOD_REV_F 0x00040000
35
36/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
37static __cpuinit int amd_apic_timer_broken(void)
38{
39 u32 lo, hi;
40 u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
41 switch (eax & CPUID_XFAM) {
42 case CPUID_XFAM_K8:
43 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
44 break;
45 case CPUID_XFAM_10H:
46 case CPUID_XFAM_11H:
47 rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
48 if (lo & ENABLE_C1E_MASK)
49 return 1;
50 break;
51 default:
52 /* err on the side of caution */
53 return 1;
54 }
55 return 0;
56}
57#endif
58
59int force_mwait __cpuinitdata;
60
61static void __cpuinit init_amd(struct cpuinfo_x86 *c)
62{
63 u32 l, h;
64 int mbytes = num_physpages >> (20-PAGE_SHIFT);
65 int r;
66
67#ifdef CONFIG_SMP
68 unsigned long long value;
69
70 /* Disable TLB flush filter by setting HWCR.FFDIS on K8
71 * bit 6 of msr C001_0015
72 *
73 * Errata 63 for SH-B3 steppings
74 * Errata 122 for all steppings (F+ have it disabled by default)
75 */
76 if (c->x86 == 15) {
77 rdmsrl(MSR_K7_HWCR, value);
78 value |= 1 << 6;
79 wrmsrl(MSR_K7_HWCR, value);
80 }
81#endif
82
83 /*
84 * FIXME: We should handle the K5 here. Set up the write
85 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
86 * no bus pipeline)
87 */
88
89 /* Bit 31 in normal CPUID is used for the nonstandard 3DNow! ID;
90 3DNow! is identified by bit 31 in extended CPUID (1*32+31) anyway */
91 clear_bit(0*32+31, c->x86_capability);
92
93 r = get_model_name(c);
94
95 switch(c->x86)
96 {
97 case 4:
98 /*
99 * General Systems BIOSen alias the cpu frequency registers
100 * of the Elan at 0x000df000. Unfortunately, one of the Linux
101 * drivers subsequently pokes it, and changes the CPU speed.
102 * Workaround: remove the unneeded alias.
103 */
104#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
105#define CBAR_ENB (0x80000000)
106#define CBAR_KEY (0X000000CB)
107 if (c->x86_model==9 || c->x86_model == 10) {
108 if (inl (CBAR) & CBAR_ENB)
109 outl (0 | CBAR_KEY, CBAR);
110 }
111 break;
112 case 5:
113 if( c->x86_model < 6 )
114 {
115 /* Based on AMD doc 20734R - June 2000 */
116 if ( c->x86_model == 0 ) {
117 clear_bit(X86_FEATURE_APIC, c->x86_capability);
118 set_bit(X86_FEATURE_PGE, c->x86_capability);
119 }
120 break;
121 }
122
123 if ( c->x86_model == 6 && c->x86_mask == 1 ) {
124 const int K6_BUG_LOOP = 1000000;
125 int n;
126 void (*f_vide)(void);
127 unsigned long d, d2;
128
129 printk(KERN_INFO "AMD K6 stepping B detected - ");
130
131 /*
132 * It looks like AMD fixed the 2.6.2 bug and improved indirect
133 * calls at the same time.
134 */
135
136 n = K6_BUG_LOOP;
137 f_vide = vide;
138 rdtscl(d);
139 while (n--)
140 f_vide();
141 rdtscl(d2);
142 d = d2-d;
143
144 if (d > 20*K6_BUG_LOOP)
145 printk("system stability may be impaired when more than 32 MB are used.\n");
146 else
147 printk("probably OK (after B9730xxxx).\n");
148 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
149 }
150
151 /* K6 with old style WHCR */
152 if (c->x86_model < 8 ||
153 (c->x86_model== 8 && c->x86_mask < 8)) {
154 /* We can only write allocate on the low 508Mb */
155 if(mbytes>508)
156 mbytes=508;
157
158 rdmsr(MSR_K6_WHCR, l, h);
159 if ((l&0x0000FFFF)==0) {
160 unsigned long flags;
161 l=(1<<0)|((mbytes/4)<<1);
162 local_irq_save(flags);
163 wbinvd();
164 wrmsr(MSR_K6_WHCR, l, h);
165 local_irq_restore(flags);
166 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
167 mbytes);
168 }
169 break;
170 }
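/*
 * Worked example of the arithmetic above (illustrative): with the 508 MB
 * cap, l = (1 << 0) | ((508 / 4) << 1) = 1 | 254 = 0xff, which is then
 * written back as the low 32 bits of MSR_K6_WHCR (h is preserved from
 * the earlier rdmsr).
 */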
171
172 if ((c->x86_model == 8 && c->x86_mask >7) ||
173 c->x86_model == 9 || c->x86_model == 13) {
174 /* The more serious chips .. */
175
176 if(mbytes>4092)
177 mbytes=4092;
178
179 rdmsr(MSR_K6_WHCR, l, h);
180 if ((l&0xFFFF0000)==0) {
181 unsigned long flags;
182 l=((mbytes>>2)<<22)|(1<<16);
183 local_irq_save(flags);
184 wbinvd();
185 wrmsr(MSR_K6_WHCR, l, h);
186 local_irq_restore(flags);
187 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
188 mbytes);
189 }
190
191 /* Set MTRR capability flag if appropriate */
192 if (c->x86_model == 13 || c->x86_model == 9 ||
193 (c->x86_model == 8 && c->x86_mask >= 8))
194 set_bit(X86_FEATURE_K6_MTRR, c->x86_capability);
195 break;
196 }
197
198 if (c->x86_model == 10) {
199 /* AMD Geode LX is model 10 */
200 /* placeholder for any needed mods */
201 break;
202 }
203 break;
204 case 6: /* An Athlon/Duron */
205
206 /* Bit 15 of Athlon specific MSR 15 needs to be 0
207 * to enable SSE on Palomino/Morgan/Barton CPUs.
208 * If the BIOS didn't enable it already, enable it here.
209 */
210 if (c->x86_model >= 6 && c->x86_model <= 10) {
211 if (!cpu_has(c, X86_FEATURE_XMM)) {
212 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
213 rdmsr(MSR_K7_HWCR, l, h);
214 l &= ~0x00008000;
215 wrmsr(MSR_K7_HWCR, l, h);
216 set_bit(X86_FEATURE_XMM, c->x86_capability);
217 }
218 }
219
220 /* It's been determined by AMD that Athlons since model 8 stepping 1
221 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx,
222 * as per AMD technical note 27212 0.2.
223 */
224 if ((c->x86_model == 8 && c->x86_mask>=1) || (c->x86_model > 8)) {
225 rdmsr(MSR_K7_CLK_CTL, l, h);
226 if ((l & 0xfff00000) != 0x20000000) {
227 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
228 ((l & 0x000fffff)|0x20000000));
229 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
230 }
231 }
232 break;
233 }
234
235 switch (c->x86) {
236 case 15:
237 /* Use K8 tuning for Fam10h and Fam11h */
238 case 0x10:
239 case 0x11:
240 set_bit(X86_FEATURE_K8, c->x86_capability);
241 break;
242 case 6:
243 set_bit(X86_FEATURE_K7, c->x86_capability);
244 break;
245 }
246 if (c->x86 >= 6)
247 set_bit(X86_FEATURE_FXSAVE_LEAK, c->x86_capability);
248
249 display_cacheinfo(c);
250
251 if (cpuid_eax(0x80000000) >= 0x80000008) {
252 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
253 }
254
255 if (cpuid_eax(0x80000000) >= 0x80000007) {
256 c->x86_power = cpuid_edx(0x80000007);
257 if (c->x86_power & (1<<8))
258 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
259 }
260
261#ifdef CONFIG_X86_HT
262 /*
263 * On an AMD multi-core setup the lower bits of the APIC id
264 * distinguish the cores.
265 */
266 if (c->x86_max_cores > 1) {
267 int cpu = smp_processor_id();
268 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
269
270 if (bits == 0) {
271 while ((1 << bits) < c->x86_max_cores)
272 bits++;
273 }
274 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
275 c->phys_proc_id >>= bits;
276 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
277 cpu, c->x86_max_cores, c->cpu_core_id);
278 }
279#endif
280
281 if (cpuid_eax(0x80000000) >= 0x80000006) {
282 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000))
283 num_cache_leaves = 4;
284 else
285 num_cache_leaves = 3;
286 }
287
288#ifdef CONFIG_X86_LOCAL_APIC
289 if (amd_apic_timer_broken())
290 local_apic_timer_disabled = 1;
291#endif
292
293 if (c->x86 == 0x10 && !force_mwait)
294 clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
295
296 /* K6s report MCEs but don't actually have all the MSRs */
297 if (c->x86 < 6)
298 clear_bit(X86_FEATURE_MCE, c->x86_capability);
299}
300
301static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
302{
303 /* AMD errata T13 (order #21922) */
304 if ((c->x86 == 6)) {
305 if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */
306 size = 64;
307 if (c->x86_model == 4 &&
308 (c->x86_mask==0 || c->x86_mask==1)) /* Tbird rev A1/A2 */
309 size = 256;
310 }
311 return size;
312}
313
314static struct cpu_dev amd_cpu_dev __cpuinitdata = {
315 .c_vendor = "AMD",
316 .c_ident = { "AuthenticAMD" },
317 .c_models = {
318 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
319 {
320 [3] = "486 DX/2",
321 [7] = "486 DX/2-WB",
322 [8] = "486 DX/4",
323 [9] = "486 DX/4-WB",
324 [14] = "Am5x86-WT",
325 [15] = "Am5x86-WB"
326 }
327 },
328 },
329 .c_init = init_amd,
330 .c_size_cache = amd_size_cache,
331};
332
333int __init amd_init_cpu(void)
334{
335 cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev;
336 return 0;
337}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
new file mode 100644
index 000000000000..59266f03d1cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -0,0 +1,192 @@
1/*
2 * arch/i386/cpu/bugs.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * Cyrix stuff, June 1998 by:
7 * - Rafael R. Reilova (moved everything from head.S),
8 * <rreilova@ececs.uc.edu>
9 * - Channing Corn (tests & fixes),
10 * - Andrew D. Balsa (code cleanup).
11 */
12#include <linux/init.h>
13#include <linux/utsname.h>
14#include <asm/bugs.h>
15#include <asm/processor.h>
16#include <asm/i387.h>
17#include <asm/msr.h>
18#include <asm/paravirt.h>
19#include <asm/alternative.h>
20
21static int __init no_halt(char *s)
22{
23 boot_cpu_data.hlt_works_ok = 0;
24 return 1;
25}
26
27__setup("no-hlt", no_halt);
28
29static int __init mca_pentium(char *s)
30{
31 mca_pentium_flag = 1;
32 return 1;
33}
34
35__setup("mca-pentium", mca_pentium);
36
37static int __init no_387(char *s)
38{
39 boot_cpu_data.hard_math = 0;
40 write_cr0(0xE | read_cr0());
41 return 1;
42}
43
44__setup("no387", no_387);
45
46static double __initdata x = 4195835.0;
47static double __initdata y = 3145727.0;
48
49/*
50 * This used to check for exceptions..
51 * However, it turns out that to support that,
52 * the XMM trap handlers basically had to
53 * be buggy. So let's have a correct XMM trap
54 * handler, and forget about printing out
55 * some status at boot.
56 *
57 * We should really only care about bugs here
58 * anyway. Not features.
59 */
60static void __init check_fpu(void)
61{
62 if (!boot_cpu_data.hard_math) {
63#ifndef CONFIG_MATH_EMULATION
64 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
65 printk(KERN_EMERG "Giving up.\n");
66 for (;;) ;
67#endif
68 return;
69 }
70
71/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
72 /* Test for the divl bug.. */
73 __asm__("fninit\n\t"
74 "fldl %1\n\t"
75 "fdivl %2\n\t"
76 "fmull %2\n\t"
77 "fldl %1\n\t"
78 "fsubp %%st,%%st(1)\n\t"
79 "fistpl %0\n\t"
80 "fwait\n\t"
81 "fninit"
82 : "=m" (*&boot_cpu_data.fdiv_bug)
83 : "m" (*&x), "m" (*&y));
84 if (boot_cpu_data.fdiv_bug)
85 printk("Hmm, FPU with FDIV bug.\n");
86}
87
88static void __init check_hlt(void)
89{
90 if (paravirt_enabled())
91 return;
92
93 printk(KERN_INFO "Checking 'hlt' instruction... ");
94 if (!boot_cpu_data.hlt_works_ok) {
95 printk("disabled\n");
96 return;
97 }
98 halt();
99 halt();
100 halt();
101 halt();
102 printk("OK.\n");
103}
104
105/*
106 * Most 386 processors have a bug where a POPAD can lock the
107 * machine even from user space.
108 */
109
110static void __init check_popad(void)
111{
112#ifndef CONFIG_X86_POPAD_OK
113 int res, inp = (int) &res;
114
115 printk(KERN_INFO "Checking for popad bug... ");
116 __asm__ __volatile__(
117 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
118 : "=&a" (res)
119 : "d" (inp)
120 : "ecx", "edi" );
121 /* If this fails, it means that any user program may lock the CPU hard. Too bad. */
122 if (res != 12345678) printk( "Buggy.\n" );
123 else printk( "OK.\n" );
124#endif
125}
126
127/*
128 * Check whether we are able to run this kernel safely on SMP.
129 *
130 * - In order to run on an i386, we need to be compiled for i386
131 * (due to the lack of "invlpg" and working WP on an i386)
132 * - In order to run on anything without a TSC, we need to be
133 * compiled for a i486.
134 * - In order to support the local APIC on a buggy Pentium machine,
135 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
136 * which happens implicitly if compiled for a Pentium or lower
137 * (unless an advanced selection of CPU features is used), since
138 * such a config otherwise implies a properly working local APIC
139 * without the need to do extra reads from the APIC.
140*/
141
142static void __init check_config(void)
143{
144/*
145 * We'd better not be an i386 if we're configured to use some
146 * i486+ only features! (WP works in supervisor mode and the
147 * new "invlpg" and "bswap" instructions)
148 */
149#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP)
150 if (boot_cpu_data.x86 == 3)
151 panic("Kernel requires i486+ for 'invlpg' and other features");
152#endif
153
154/*
155 * If we configured ourselves for a TSC, we'd better have one!
156 */
157#ifdef CONFIG_X86_TSC
158 if (!cpu_has_tsc && !tsc_disable)
159 panic("Kernel compiled for Pentium+, requires TSC feature!");
160#endif
161
162/*
163 * If we were told we had a good local APIC, check for buggy Pentia,
164 * i.e. all B steppings and the C2 stepping of P54C when using their
165 * integrated APIC (see 11AP erratum in "Pentium Processor
166 * Specification Update").
167 */
168#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
169 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
170 && cpu_has_apic
171 && boot_cpu_data.x86 == 5
172 && boot_cpu_data.x86_model == 2
173 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
174 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
175#endif
176}
177
178
179void __init check_bugs(void)
180{
181 identify_boot_cpu();
182#ifndef CONFIG_SMP
183 printk("CPU: ");
184 print_cpu_info(&boot_cpu_data);
185#endif
186 check_config();
187 check_fpu();
188 check_hlt();
189 check_popad();
190 init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
191 alternative_instructions();
192}
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
new file mode 100644
index 000000000000..473eac883c7b
--- /dev/null
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -0,0 +1,471 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <linux/bitops.h>
4#include <asm/processor.h>
5#include <asm/msr.h>
6#include <asm/e820.h>
7#include <asm/mtrr.h>
8#include "cpu.h"
9
10#ifdef CONFIG_X86_OOSTORE
11
12static u32 __cpuinit power2(u32 x)
13{
14 u32 s=1;
15 while(s<=x)
16 s<<=1;
17 return s>>=1;
18}
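/*
 * Worked example (illustrative): power2(100) doubles s through 1, 2, 4,
 * ... 128, stops once s exceeds x, and returns s >> 1 == 64, the largest
 * power of two <= x (and 0 for x == 0).
 */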
19
20
21/*
22 * Set up an actual MCR
23 */
24
25static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
26{
27 u32 lo, hi;
28
29 hi = base & ~0xFFF;
30 lo = ~(size-1); /* Size is a power of 2 so this makes a mask */
31 lo &= ~0xFFF; /* Remove the ctrl value bits */
32 lo |= key; /* Attribute we wish to set */
33 wrmsr(reg+MSR_IDT_MCR0, lo, hi);
34 mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */
35}
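/*
 * Hypothetical example of the encoding above: base = 0, size = 512K,
 * key = 31 gives hi = 0 and lo = (~(0x7ffff) & ~0xfff) | 31 = 0xfff8001f,
 * i.e. an address mask over the covered range plus the attribute key in
 * the low bits.
 */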
36
37/*
38 * Figure what we can cover with MCR's
39 *
40 * Shortcut: We know you can't put 4Gig of RAM on a winchip
41 */
42
43static u32 __cpuinit ramtop(void) /* 16388 */
44{
45 int i;
46 u32 top = 0;
47 u32 clip = 0xFFFFFFFFUL;
48
49 for (i = 0; i < e820.nr_map; i++) {
50 unsigned long start, end;
51
52 if (e820.map[i].addr > 0xFFFFFFFFUL)
53 continue;
54 /*
55 * Don't MCR over reserved space. Ignore the ISA hole;
56 * we frob around that catastrophe already.
57 */
58
59 if (e820.map[i].type == E820_RESERVED)
60 {
61 if(e820.map[i].addr >= 0x100000UL && e820.map[i].addr < clip)
62 clip = e820.map[i].addr;
63 continue;
64 }
65 start = e820.map[i].addr;
66 end = e820.map[i].addr + e820.map[i].size;
67 if (start >= end)
68 continue;
69 if (end > top)
70 top = end;
71 }
72 /* Everything below 'top' should be RAM except for the ISA hole.
73 Because of the limited MCRs we want to map NV/ACPI into our
74 MCR range for gunk in RAM.
75
76 Clip might cause us to MCR insufficient RAM but that is an
77 acceptable failure mode and should only bite obscure boxes with
78 a VESA hole at 15Mb.
79
80 The second case where Clip kicks in is when the EBDA is marked
81 as reserved. Again we fail safe with reasonable results.
82 */
83
84 if(top>clip)
85 top=clip;
86
87 return top;
88}
89
90/*
91 * Compute a set of MCR's to give maximum coverage
92 */
93
94static int __cpuinit centaur_mcr_compute(int nr, int key)
95{
96 u32 mem = ramtop();
97 u32 root = power2(mem);
98 u32 base = root;
99 u32 top = root;
100 u32 floor = 0;
101 int ct = 0;
102
103 while(ct<nr)
104 {
105 u32 fspace = 0;
106
107 /*
108 * Find the largest block we will fill going upwards
109 */
110
111 u32 high = power2(mem-top);
112
113 /*
114 * Find the largest block we will fill going downwards
115 */
116
117 u32 low = base/2;
118
119 /*
120 * Don't fill below 1Mb going downwards as there
121 * is an ISA hole in the way.
122 */
123
124 if(base <= 1024*1024)
125 low = 0;
126
127 /*
128 * See how much space we could cover by filling below
129 * the ISA hole
130 */
131
132 if(floor == 0)
133 fspace = 512*1024;
134 else if(floor ==512*1024)
135 fspace = 128*1024;
136
137 /* And forget ROM space */
138
139 /*
140 * Now install the largest coverage we get
141 */
142
143 if(fspace > high && fspace > low)
144 {
145 centaur_mcr_insert(ct, floor, fspace, key);
146 floor += fspace;
147 }
148 else if(high > low)
149 {
150 centaur_mcr_insert(ct, top, high, key);
151 top += high;
152 }
153 else if(low > 0)
154 {
155 base -= low;
156 centaur_mcr_insert(ct, base, low, key);
157 }
158 else break;
159 ct++;
160 }
161 /*
162 * We loaded ct values. We now need to set the mask. The caller
163 * must do this bit.
164 */
165
166 return ct;
167}
168
169static void __cpuinit centaur_create_optimal_mcr(void)
170{
171 int i;
172 /*
173 * Allocate up to 6 mcrs to mark as much of ram as possible
174 * as write combining and weak write ordered.
175 *
176 * To experiment with: Linux never uses stack operations for
177 * mmio spaces so we could globally enable stack operation wc
178 *
179 * Load the registers with type 31 - full write combining, all
180 * writes weakly ordered.
181 */
182 int used = centaur_mcr_compute(6, 31);
183
184 /*
185 * Wipe unused MCRs
186 */
187
188 for(i=used;i<8;i++)
189 wrmsr(MSR_IDT_MCR0+i, 0, 0);
190}
191
192static void __cpuinit winchip2_create_optimal_mcr(void)
193{
194 u32 lo, hi;
195 int i;
196
197 /*
198 * Allocate up to 6 mcrs to mark as much of ram as possible
199 * as write combining, weak store ordered.
200 *
201 * Load the registers with type 25
202 * 8 - weak write ordering
203 * 16 - weak read ordering
204 * 1 - write combining
205 */
206
207 int used = centaur_mcr_compute(6, 25);
208
209 /*
210 * Mark the registers we are using.
211 */
212
213 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
214 for(i=0;i<used;i++)
215 lo|=1<<(9+i);
216 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
217
218 /*
219 * Wipe unused MCRs
220 */
221
222 for(i=used;i<8;i++)
223 wrmsr(MSR_IDT_MCR0+i, 0, 0);
224}
225
226/*
227 * Handle the MCR key on the Winchip 2.
228 */
229
230static void __cpuinit winchip2_unprotect_mcr(void)
231{
232 u32 lo, hi;
233 u32 key;
234
235 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
236 lo&=~0x1C0; /* blank bits 8-6 */
237 key = (lo>>17) & 7;
238 lo |= key<<6; /* replace with unlock key */
239 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
240}
241
242static void __cpuinit winchip2_protect_mcr(void)
243{
244 u32 lo, hi;
245
246 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
247 lo&=~0x1C0; /* blank bits 8-6 */
248 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
249}
250#endif /* CONFIG_X86_OOSTORE */
251
252#define ACE_PRESENT (1 << 6)
253#define ACE_ENABLED (1 << 7)
254#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
255
256#define RNG_PRESENT (1 << 2)
257#define RNG_ENABLED (1 << 3)
258#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
259
260static void __cpuinit init_c3(struct cpuinfo_x86 *c)
261{
262 u32 lo, hi;
263
264 /* Test for Centaur Extended Feature Flags presence */
265 if (cpuid_eax(0xC0000000) >= 0xC0000001) {
266 u32 tmp = cpuid_edx(0xC0000001);
267
268 /* enable ACE unit, if present and disabled */
269 if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
270 rdmsr (MSR_VIA_FCR, lo, hi);
271 lo |= ACE_FCR; /* enable ACE unit */
272 wrmsr (MSR_VIA_FCR, lo, hi);
273 printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
274 }
275
276 /* enable RNG unit, if present and disabled */
277 if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
278 rdmsr (MSR_VIA_RNG, lo, hi);
279 lo |= RNG_ENABLE; /* enable RNG unit */
280 wrmsr (MSR_VIA_RNG, lo, hi);
281 printk(KERN_INFO "CPU: Enabled h/w RNG\n");
282 }
283
284 /* store Centaur Extended Feature Flags as
285 * word 5 of the CPU capability bit array
286 */
287 c->x86_capability[5] = cpuid_edx(0xC0000001);
288 }
289
 290	/* Cyrix III family needs CX8 & PGE explicitly enabled. */
291 if (c->x86_model >=6 && c->x86_model <= 9) {
292 rdmsr (MSR_VIA_FCR, lo, hi);
293 lo |= (1<<1 | 1<<7);
294 wrmsr (MSR_VIA_FCR, lo, hi);
295 set_bit(X86_FEATURE_CX8, c->x86_capability);
296 }
297
 298	/* Before Nehemiah, the C3s had 3DNow! */
299 if (c->x86_model >=6 && c->x86_model <9)
300 set_bit(X86_FEATURE_3DNOW, c->x86_capability);
301
302 get_model_name(c);
303 display_cacheinfo(c);
304}
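The ACE and RNG checks above both rely on the same "present but not yet enabled" mask test. The sketch below (a hypothetical stand-alone illustration, not kernel code) walks the four possible flag combinations to show when the MSR write would actually be performed:

	#include <stdio.h>

	#define ACE_PRESENT (1 << 6)
	#define ACE_ENABLED (1 << 7)

	/* Sketch of the (tmp & (PRESENT | ENABLED)) == PRESENT test used
	 * for the ACE and RNG units, over all four flag combinations. */
	int main(void)
	{
		unsigned cases[] = { 0, ACE_PRESENT, ACE_ENABLED,
				     ACE_PRESENT | ACE_ENABLED };
		unsigned i;

		for (i = 0; i < 4; i++) {
			unsigned tmp = cases[i];
			int need_enable =
				(tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT;
			printf("present=%d enabled=%d -> enable now? %d\n",
			       !!(tmp & ACE_PRESENT), !!(tmp & ACE_ENABLED),
			       need_enable);
		}
		return 0;
	}

Only the "present, not enabled" case triggers the FCR/RNG MSR update; already-enabled or absent units are left alone.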
305
306static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
307{
308 enum {
309 ECX8=1<<1,
310 EIERRINT=1<<2,
311 DPM=1<<3,
312 DMCE=1<<4,
313 DSTPCLK=1<<5,
314 ELINEAR=1<<6,
315 DSMC=1<<7,
316 DTLOCK=1<<8,
317 EDCTLB=1<<8,
318 EMMX=1<<9,
319 DPDC=1<<11,
320 EBRPRED=1<<12,
321 DIC=1<<13,
322 DDC=1<<14,
323 DNA=1<<15,
324 ERETSTK=1<<16,
325 E2MMX=1<<19,
326 EAMD3D=1<<20,
327 };
328
329 char *name;
330 u32 fcr_set=0;
331 u32 fcr_clr=0;
332 u32 lo,hi,newlo;
333 u32 aa,bb,cc,dd;
334
 335	/* Bit 31 in normal CPUID is used for a nonstandard 3DNow! ID;
 336	   3DNow! is identified by bit 31 in extended CPUID (1*32+31) anyway */
337 clear_bit(0*32+31, c->x86_capability);
338
339 switch (c->x86) {
340
341 case 5:
342 switch(c->x86_model) {
343 case 4:
344 name="C6";
345 fcr_set=ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
346 fcr_clr=DPDC;
347 printk(KERN_NOTICE "Disabling bugged TSC.\n");
348 clear_bit(X86_FEATURE_TSC, c->x86_capability);
349#ifdef CONFIG_X86_OOSTORE
350 centaur_create_optimal_mcr();
351 /* Enable
352 write combining on non-stack, non-string
353 write combining on string, all types
354 weak write ordering
355
356 The C6 original lacks weak read order
357
358 Note 0x120 is write only on Winchip 1 */
359
360 wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
361#endif
362 break;
363 case 8:
364 switch(c->x86_mask) {
365 default:
366 name="2";
367 break;
368 case 7 ... 9:
369 name="2A";
370 break;
371 case 10 ... 15:
372 name="2B";
373 break;
374 }
375 fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D;
376 fcr_clr=DPDC;
377#ifdef CONFIG_X86_OOSTORE
378 winchip2_unprotect_mcr();
379 winchip2_create_optimal_mcr();
380 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
381 /* Enable
382 write combining on non-stack, non-string
383 write combining on string, all types
384 weak write ordering
385 */
386 lo|=31;
387 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
388 winchip2_protect_mcr();
389#endif
390 break;
391 case 9:
392 name="3";
393 fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D;
394 fcr_clr=DPDC;
395#ifdef CONFIG_X86_OOSTORE
396 winchip2_unprotect_mcr();
397 winchip2_create_optimal_mcr();
398 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
399 /* Enable
400 write combining on non-stack, non-string
401 write combining on string, all types
402 weak write ordering
403 */
404 lo|=31;
405 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
406 winchip2_protect_mcr();
407#endif
408 break;
409 default:
410 name="??";
411 }
412
413 rdmsr(MSR_IDT_FCR1, lo, hi);
414 newlo=(lo|fcr_set) & (~fcr_clr);
415
416 if (newlo!=lo) {
417 printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n", lo, newlo );
418 wrmsr(MSR_IDT_FCR1, newlo, hi );
419 } else {
420 printk(KERN_INFO "Centaur FCR is 0x%X\n",lo);
421 }
422 /* Emulate MTRRs using Centaur's MCR. */
423 set_bit(X86_FEATURE_CENTAUR_MCR, c->x86_capability);
424 /* Report CX8 */
425 set_bit(X86_FEATURE_CX8, c->x86_capability);
426 /* Set 3DNow! on Winchip 2 and above. */
427 if (c->x86_model >=8)
428 set_bit(X86_FEATURE_3DNOW, c->x86_capability);
429 /* See if we can find out some more. */
430 if ( cpuid_eax(0x80000000) >= 0x80000005 ) {
431 /* Yes, we can. */
432 cpuid(0x80000005,&aa,&bb,&cc,&dd);
433 /* Add L1 data and code cache sizes. */
434 c->x86_cache_size = (cc>>24)+(dd>>24);
435 }
436 sprintf( c->x86_model_id, "WinChip %s", name );
437 break;
438
439 case 6:
440 init_c3(c);
441 break;
442 }
443}
444
445static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size)
446{
447 /* VIA C3 CPUs (670-68F) need further shifting. */
448 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
449 size >>= 8;
450
451 /* VIA also screwed up Nehemiah stepping 1, and made
452 it return '65KB' instead of '64KB'
453 - Note, it seems this may only be in engineering samples. */
454 if ((c->x86==6) && (c->x86_model==9) && (c->x86_mask==1) && (size==65))
455 size -=1;
456
457 return size;
458}
459
460static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
461 .c_vendor = "Centaur",
462 .c_ident = { "CentaurHauls" },
463 .c_init = init_centaur,
464 .c_size_cache = centaur_size_cache,
465};
466
467int __init centaur_init_cpu(void)
468{
469 cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev;
470 return 0;
471}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
new file mode 100644
index 000000000000..d506201d397c
--- /dev/null
+++ b/arch/x86/kernel/cpu/common.c
@@ -0,0 +1,733 @@
1#include <linux/init.h>
2#include <linux/string.h>
3#include <linux/delay.h>
4#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/semaphore.h>
9#include <asm/processor.h>
10#include <asm/i387.h>
11#include <asm/msr.h>
12#include <asm/io.h>
13#include <asm/mmu_context.h>
14#include <asm/mtrr.h>
15#include <asm/mce.h>
16#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h>
18#include <asm/apic.h>
19#include <mach_apic.h>
20#endif
21
22#include "cpu.h"
23
24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
25 [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
26 [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
28 [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
29 /*
30 * Segments used for calling PnP BIOS have byte granularity.
 31	 * The code segments and data segments have fixed 64k limits;
32 * the transfer segment sizes are set at run time.
33 */
34 [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
35 [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
36 [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
37 [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
38 [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
39 /*
40 * The APM segments have byte granularity and their bases
41 * are set at run time. All have 64k limits.
42 */
43 [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
44 /* 16-bit code */
45 [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
46 [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
47
48 [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
49 [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
50} };
51EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
52
53static int cachesize_override __cpuinitdata = -1;
54static int disable_x86_fxsr __cpuinitdata;
55static int disable_x86_serial_nr __cpuinitdata = 1;
56static int disable_x86_sep __cpuinitdata;
57
58struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
59
60extern int disable_pse;
61
62static void __cpuinit default_init(struct cpuinfo_x86 * c)
63{
64 /* Not much we can do here... */
65 /* Check if at least it has cpuid */
66 if (c->cpuid_level == -1) {
67 /* No cpuid. It must be an ancient CPU */
68 if (c->x86 == 4)
69 strcpy(c->x86_model_id, "486");
70 else if (c->x86 == 3)
71 strcpy(c->x86_model_id, "386");
72 }
73}
74
75static struct cpu_dev __cpuinitdata default_cpu = {
76 .c_init = default_init,
77 .c_vendor = "Unknown",
78};
79static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
80
81static int __init cachesize_setup(char *str)
82{
83 get_option (&str, &cachesize_override);
84 return 1;
85}
86__setup("cachesize=", cachesize_setup);
87
88int __cpuinit get_model_name(struct cpuinfo_x86 *c)
89{
90 unsigned int *v;
91 char *p, *q;
92
93 if (cpuid_eax(0x80000000) < 0x80000004)
94 return 0;
95
96 v = (unsigned int *) c->x86_model_id;
97 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
98 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
99 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
100 c->x86_model_id[48] = 0;
101
102 /* Intel chips right-justify this string for some dumb reason;
103 undo that brain damage */
104 p = q = &c->x86_model_id[0];
105 while ( *p == ' ' )
106 p++;
107 if ( p != q ) {
108 while ( *p )
109 *q++ = *p++;
110 while ( q <= &c->x86_model_id[48] )
111 *q++ = '\0'; /* Zero-pad the rest */
112 }
113
114 return 1;
115}
116
117
118void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
119{
120 unsigned int n, dummy, ecx, edx, l2size;
121
122 n = cpuid_eax(0x80000000);
123
124 if (n >= 0x80000005) {
125 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
126 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
127 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
128 c->x86_cache_size=(ecx>>24)+(edx>>24);
129 }
130
 131	if (n < 0x80000006)	/* Some chips just have a large L1. */
132 return;
133
134 ecx = cpuid_ecx(0x80000006);
135 l2size = ecx >> 16;
136
137 /* do processor-specific cache resizing */
138 if (this_cpu->c_size_cache)
139 l2size = this_cpu->c_size_cache(c,l2size);
140
141 /* Allow user to override all this if necessary. */
142 if (cachesize_override != -1)
143 l2size = cachesize_override;
144
145 if ( l2size == 0 )
146 return; /* Again, no L2 cache is possible */
147
148 c->x86_cache_size = l2size;
149
150 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
151 l2size, ecx & 0xFF);
152}
153
154/* Naming convention should be: <Name> [(<Codename>)] */
 155/* This table is only used if init_<vendor>() below doesn't set the name; */
156/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
157
158/* Look up CPU names by table lookup. */
159static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
160{
161 struct cpu_model_info *info;
162
163 if ( c->x86_model >= 16 )
164 return NULL; /* Range check */
165
166 if (!this_cpu)
167 return NULL;
168
169 info = this_cpu->c_models;
170
171 while (info && info->family) {
172 if (info->family == c->x86)
173 return info->model_names[c->x86_model];
174 info++;
175 }
176 return NULL; /* Not found */
177}
178
179
180static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
181{
182 char *v = c->x86_vendor_id;
183 int i;
184 static int printed;
185
186 for (i = 0; i < X86_VENDOR_NUM; i++) {
187 if (cpu_devs[i]) {
188 if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
189 (cpu_devs[i]->c_ident[1] &&
190 !strcmp(v,cpu_devs[i]->c_ident[1]))) {
191 c->x86_vendor = i;
192 if (!early)
193 this_cpu = cpu_devs[i];
194 return;
195 }
196 }
197 }
198 if (!printed) {
199 printed++;
200 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
201 printk(KERN_ERR "CPU: Your system may be unstable.\n");
202 }
203 c->x86_vendor = X86_VENDOR_UNKNOWN;
204 this_cpu = &default_cpu;
205}
206
207
208static int __init x86_fxsr_setup(char * s)
209{
210 /* Tell all the other CPU's to not use it... */
211 disable_x86_fxsr = 1;
212
213 /*
214 * ... and clear the bits early in the boot_cpu_data
215 * so that the bootup process doesn't try to do this
216 * either.
217 */
218 clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
219 clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
220 return 1;
221}
222__setup("nofxsr", x86_fxsr_setup);
223
224
225static int __init x86_sep_setup(char * s)
226{
227 disable_x86_sep = 1;
228 return 1;
229}
230__setup("nosep", x86_sep_setup);
231
232
233/* Standard macro to see if a specific flag is changeable */
234static inline int flag_is_changeable_p(u32 flag)
235{
236 u32 f1, f2;
237
238 asm("pushfl\n\t"
239 "pushfl\n\t"
240 "popl %0\n\t"
241 "movl %0,%1\n\t"
242 "xorl %2,%0\n\t"
243 "pushl %0\n\t"
244 "popfl\n\t"
245 "pushfl\n\t"
246 "popl %0\n\t"
247 "popfl\n\t"
248 : "=&r" (f1), "=&r" (f2)
249 : "ir" (flag));
250
251 return ((f1^f2) & flag) != 0;
252}
253
254
255/* Probe for the CPUID instruction */
256static int __cpuinit have_cpuid_p(void)
257{
258 return flag_is_changeable_p(X86_EFLAGS_ID);
259}
260
261void __init cpu_detect(struct cpuinfo_x86 *c)
262{
263 /* Get vendor name */
264 cpuid(0x00000000, &c->cpuid_level,
265 (int *)&c->x86_vendor_id[0],
266 (int *)&c->x86_vendor_id[8],
267 (int *)&c->x86_vendor_id[4]);
268
269 c->x86 = 4;
270 if (c->cpuid_level >= 0x00000001) {
271 u32 junk, tfms, cap0, misc;
272 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
273 c->x86 = (tfms >> 8) & 15;
274 c->x86_model = (tfms >> 4) & 15;
275 if (c->x86 == 0xf)
276 c->x86 += (tfms >> 20) & 0xff;
277 if (c->x86 >= 0x6)
278 c->x86_model += ((tfms >> 16) & 0xF) << 4;
279 c->x86_mask = tfms & 15;
280 if (cap0 & (1<<19))
281 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
282 }
283}
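To make the family/model/stepping arithmetic in cpu_detect() concrete, here is a minimal stand-alone sketch (not part of the patch) decoding a hypothetical CPUID.1:EAX signature value of 0x000106A5, including the extended-family and extended-model adjustments:

	#include <stdio.h>

	/* Sketch of the signature decode done in cpu_detect(), applied to
	 * a hypothetical tfms value of 0x000106A5. */
	int main(void)
	{
		unsigned tfms = 0x000106A5;
		unsigned family = (tfms >> 8) & 15;	/* 0x6 */
		unsigned model  = (tfms >> 4) & 15;	/* 0xA */
		unsigned step   = tfms & 15;		/* 0x5 */

		if (family == 0xf)			/* extended family */
			family += (tfms >> 20) & 0xff;
		if (family >= 0x6)			/* extended model  */
			model += ((tfms >> 16) & 0xF) << 4;	/* -> 0x1A */

		printf("family %#x model %#x stepping %#x\n", family, model, step);
		return 0;
	}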
284
285/* Do minimum CPU detection early.
286 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
287 The others are not touched to avoid unwanted side effects.
288
289 WARNING: this function is only called on the BP. Don't add code here
290 that is supposed to run on all CPUs. */
291static void __init early_cpu_detect(void)
292{
293 struct cpuinfo_x86 *c = &boot_cpu_data;
294
295 c->x86_cache_alignment = 32;
296
297 if (!have_cpuid_p())
298 return;
299
300 cpu_detect(c);
301
302 get_cpu_vendor(c, 1);
303}
304
305static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
306{
307 u32 tfms, xlvl;
308 int ebx;
309
310 if (have_cpuid_p()) {
311 /* Get vendor name */
312 cpuid(0x00000000, &c->cpuid_level,
313 (int *)&c->x86_vendor_id[0],
314 (int *)&c->x86_vendor_id[8],
315 (int *)&c->x86_vendor_id[4]);
316
317 get_cpu_vendor(c, 0);
318 /* Initialize the standard set of capabilities */
319 /* Note that the vendor-specific code below might override */
320
321 /* Intel-defined flags: level 0x00000001 */
322 if ( c->cpuid_level >= 0x00000001 ) {
323 u32 capability, excap;
324 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
325 c->x86_capability[0] = capability;
326 c->x86_capability[4] = excap;
327 c->x86 = (tfms >> 8) & 15;
328 c->x86_model = (tfms >> 4) & 15;
329 if (c->x86 == 0xf)
330 c->x86 += (tfms >> 20) & 0xff;
331 if (c->x86 >= 0x6)
332 c->x86_model += ((tfms >> 16) & 0xF) << 4;
333 c->x86_mask = tfms & 15;
334#ifdef CONFIG_X86_HT
335 c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
336#else
337 c->apicid = (ebx >> 24) & 0xFF;
338#endif
339 if (c->x86_capability[0] & (1<<19))
340 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
341 } else {
342 /* Have CPUID level 0 only - unheard of */
343 c->x86 = 4;
344 }
345
346 /* AMD-defined flags: level 0x80000001 */
347 xlvl = cpuid_eax(0x80000000);
348 if ( (xlvl & 0xffff0000) == 0x80000000 ) {
349 if ( xlvl >= 0x80000001 ) {
350 c->x86_capability[1] = cpuid_edx(0x80000001);
351 c->x86_capability[6] = cpuid_ecx(0x80000001);
352 }
353 if ( xlvl >= 0x80000004 )
354 get_model_name(c); /* Default name */
355 }
356
357 init_scattered_cpuid_features(c);
358 }
359
360 early_intel_workaround(c);
361
362#ifdef CONFIG_X86_HT
363 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
364#endif
365}
366
367static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
368{
369 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
370 /* Disable processor serial number */
371 unsigned long lo,hi;
372 rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
373 lo |= 0x200000;
374 wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
375 printk(KERN_NOTICE "CPU serial number disabled.\n");
376 clear_bit(X86_FEATURE_PN, c->x86_capability);
377
378 /* Disabling the serial number may affect the cpuid level */
379 c->cpuid_level = cpuid_eax(0);
380 }
381}
382
383static int __init x86_serial_nr_setup(char *s)
384{
385 disable_x86_serial_nr = 0;
386 return 1;
387}
388__setup("serialnumber", x86_serial_nr_setup);
389
390
391
392/*
393 * This does the hard work of actually picking apart the CPU stuff...
394 */
395static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
396{
397 int i;
398
399 c->loops_per_jiffy = loops_per_jiffy;
400 c->x86_cache_size = -1;
401 c->x86_vendor = X86_VENDOR_UNKNOWN;
402 c->cpuid_level = -1; /* CPUID not detected */
403 c->x86_model = c->x86_mask = 0; /* So far unknown... */
404 c->x86_vendor_id[0] = '\0'; /* Unset */
405 c->x86_model_id[0] = '\0'; /* Unset */
406 c->x86_max_cores = 1;
407 c->x86_clflush_size = 32;
408 memset(&c->x86_capability, 0, sizeof c->x86_capability);
409
410 if (!have_cpuid_p()) {
411 /* First of all, decide if this is a 486 or higher */
412 /* It's a 486 if we can modify the AC flag */
413 if ( flag_is_changeable_p(X86_EFLAGS_AC) )
414 c->x86 = 4;
415 else
416 c->x86 = 3;
417 }
418
419 generic_identify(c);
420
421 printk(KERN_DEBUG "CPU: After generic identify, caps:");
422 for (i = 0; i < NCAPINTS; i++)
423 printk(" %08lx", c->x86_capability[i]);
424 printk("\n");
425
426 if (this_cpu->c_identify) {
427 this_cpu->c_identify(c);
428
429 printk(KERN_DEBUG "CPU: After vendor identify, caps:");
430 for (i = 0; i < NCAPINTS; i++)
431 printk(" %08lx", c->x86_capability[i]);
432 printk("\n");
433 }
434
435 /*
436 * Vendor-specific initialization. In this section we
437 * canonicalize the feature flags, meaning if there are
438 * features a certain CPU supports which CPUID doesn't
439 * tell us, CPUID claiming incorrect flags, or other bugs,
440 * we handle them here.
441 *
442 * At the end of this section, c->x86_capability better
443 * indicate the features this CPU genuinely supports!
444 */
445 if (this_cpu->c_init)
446 this_cpu->c_init(c);
447
448 /* Disable the PN if appropriate */
449 squash_the_stupid_serial_number(c);
450
451 /*
452 * The vendor-specific functions might have changed features. Now
453 * we do "generic changes."
454 */
455
456 /* TSC disabled? */
457 if ( tsc_disable )
458 clear_bit(X86_FEATURE_TSC, c->x86_capability);
459
460 /* FXSR disabled? */
461 if (disable_x86_fxsr) {
462 clear_bit(X86_FEATURE_FXSR, c->x86_capability);
463 clear_bit(X86_FEATURE_XMM, c->x86_capability);
464 }
465
466 /* SEP disabled? */
467 if (disable_x86_sep)
468 clear_bit(X86_FEATURE_SEP, c->x86_capability);
469
470 if (disable_pse)
471 clear_bit(X86_FEATURE_PSE, c->x86_capability);
472
473 /* If the model name is still unset, do table lookup. */
474 if ( !c->x86_model_id[0] ) {
475 char *p;
476 p = table_lookup_model(c);
477 if ( p )
478 strcpy(c->x86_model_id, p);
479 else
480 /* Last resort... */
481 sprintf(c->x86_model_id, "%02x/%02x",
482 c->x86, c->x86_model);
483 }
484
485 /* Now the feature flags better reflect actual CPU features! */
486
487 printk(KERN_DEBUG "CPU: After all inits, caps:");
488 for (i = 0; i < NCAPINTS; i++)
489 printk(" %08lx", c->x86_capability[i]);
490 printk("\n");
491
492 /*
493 * On SMP, boot_cpu_data holds the common feature set between
494 * all CPUs; so make sure that we indicate which features are
495 * common between the CPUs. The first time this routine gets
496 * executed, c == &boot_cpu_data.
497 */
498 if ( c != &boot_cpu_data ) {
499 /* AND the already accumulated flags with these */
500 for ( i = 0 ; i < NCAPINTS ; i++ )
501 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
502 }
503
504 /* Init Machine Check Exception if available. */
505 mcheck_init(c);
506}
507
508void __init identify_boot_cpu(void)
509{
510 identify_cpu(&boot_cpu_data);
511 sysenter_setup();
512 enable_sep_cpu();
513 mtrr_bp_init();
514}
515
516void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
517{
518 BUG_ON(c == &boot_cpu_data);
519 identify_cpu(c);
520 enable_sep_cpu();
521 mtrr_ap_init();
522}
523
524#ifdef CONFIG_X86_HT
525void __cpuinit detect_ht(struct cpuinfo_x86 *c)
526{
527 u32 eax, ebx, ecx, edx;
528 int index_msb, core_bits;
529
530 cpuid(1, &eax, &ebx, &ecx, &edx);
531
532 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
533 return;
534
535 smp_num_siblings = (ebx & 0xff0000) >> 16;
536
537 if (smp_num_siblings == 1) {
538 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
539 } else if (smp_num_siblings > 1 ) {
540
541 if (smp_num_siblings > NR_CPUS) {
 542			printk(KERN_WARNING "CPU: Unsupported number of "
 543					"siblings %d\n", smp_num_siblings);
544 smp_num_siblings = 1;
545 return;
546 }
547
548 index_msb = get_count_order(smp_num_siblings);
549 c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
550
551 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
552 c->phys_proc_id);
553
554 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
555
556 index_msb = get_count_order(smp_num_siblings) ;
557
558 core_bits = get_count_order(c->x86_max_cores);
559
560 c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
561 ((1 << core_bits) - 1);
562
563 if (c->x86_max_cores > 1)
564 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
565 c->cpu_core_id);
566 }
567}
568#endif
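The sibling/core-ID derivation in detect_ht() is easiest to follow with numbers plugged in. The sketch below (a stand-alone illustration with hypothetical values; count_order() is a minimal stand-in for the kernel's get_count_order()) shows the math for a CPU with initial APIC ID 5, 4 logical siblings per package and 2 cores:

	#include <stdio.h>

	/* Minimal stand-in for get_count_order(): ceil(log2(n)). */
	static int count_order(unsigned n)
	{
		int order = 0;
		while ((1u << order) < n)
			order++;
		return order;
	}

	int main(void)
	{
		unsigned apicid = 5, siblings = 4, max_cores = 2;

		int index_msb = count_order(siblings);			/* 2 */
		unsigned phys_proc_id = apicid >> index_msb;		/* 1 */

		unsigned threads_per_core = siblings / max_cores;	/* 2 */
		index_msb = count_order(threads_per_core);		/* 1 */
		int core_bits = count_order(max_cores);			/* 1 */
		unsigned core_id = (apicid >> index_msb)
				   & ((1u << core_bits) - 1);		/* 0 */

		printf("package %u core %u\n", phys_proc_id, core_id);
		return 0;
	}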
569
570void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
571{
572 char *vendor = NULL;
573
574 if (c->x86_vendor < X86_VENDOR_NUM)
575 vendor = this_cpu->c_vendor;
576 else if (c->cpuid_level >= 0)
577 vendor = c->x86_vendor_id;
578
579 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
580 printk("%s ", vendor);
581
582 if (!c->x86_model_id[0])
583 printk("%d86", c->x86);
584 else
585 printk("%s", c->x86_model_id);
586
587 if (c->x86_mask || c->cpuid_level >= 0)
588 printk(" stepping %02x\n", c->x86_mask);
589 else
590 printk("\n");
591}
592
593cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
594
595/* This is hacky. :)
596 * We're emulating future behavior.
597 * In the future, the cpu-specific init functions will be called implicitly
598 * via the magic of initcalls.
599 * They will insert themselves into the cpu_devs structure.
600 * Then, when cpu_init() is called, we can just iterate over that array.
601 */
602
603extern int intel_cpu_init(void);
604extern int cyrix_init_cpu(void);
605extern int nsc_init_cpu(void);
606extern int amd_init_cpu(void);
607extern int centaur_init_cpu(void);
608extern int transmeta_init_cpu(void);
609extern int nexgen_init_cpu(void);
610extern int umc_init_cpu(void);
611
612void __init early_cpu_init(void)
613{
614 intel_cpu_init();
615 cyrix_init_cpu();
616 nsc_init_cpu();
617 amd_init_cpu();
618 centaur_init_cpu();
619 transmeta_init_cpu();
620 nexgen_init_cpu();
621 umc_init_cpu();
622 early_cpu_detect();
623
624#ifdef CONFIG_DEBUG_PAGEALLOC
625 /* pse is not compatible with on-the-fly unmapping,
626 * disable it even if the cpus claim to support it.
627 */
628 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
629 disable_pse = 1;
630#endif
631}
632
633/* Make sure %fs is initialized properly in idle threads */
634struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
635{
636 memset(regs, 0, sizeof(struct pt_regs));
637 regs->xfs = __KERNEL_PERCPU;
638 return regs;
639}
640
641/* Current gdt points %fs at the "master" per-cpu area: after this,
642 * it's on the real one. */
643void switch_to_new_gdt(void)
644{
645 struct Xgt_desc_struct gdt_descr;
646
647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
648 gdt_descr.size = GDT_SIZE - 1;
649 load_gdt(&gdt_descr);
650 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
651}
652
653/*
654 * cpu_init() initializes state that is per-CPU. Some data is already
655 * initialized (naturally) in the bootstrap process, such as the GDT
656 * and IDT. We reload them nevertheless, this function acts as a
657 * 'CPU state barrier', nothing should get across.
658 */
659void __cpuinit cpu_init(void)
660{
661 int cpu = smp_processor_id();
662 struct task_struct *curr = current;
663 struct tss_struct * t = &per_cpu(init_tss, cpu);
664 struct thread_struct *thread = &curr->thread;
665
666 if (cpu_test_and_set(cpu, cpu_initialized)) {
667 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
668 for (;;) local_irq_enable();
669 }
670
671 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
672
673 if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
674 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
675 if (tsc_disable && cpu_has_tsc) {
676 printk(KERN_NOTICE "Disabling TSC...\n");
677 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
678 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
679 set_in_cr4(X86_CR4_TSD);
680 }
681
682 load_idt(&idt_descr);
683 switch_to_new_gdt();
684
685 /*
686 * Set up and load the per-CPU TSS and LDT
687 */
688 atomic_inc(&init_mm.mm_count);
689 curr->active_mm = &init_mm;
690 if (curr->mm)
691 BUG();
692 enter_lazy_tlb(&init_mm, curr);
693
694 load_esp0(t, thread);
695 set_tss_desc(cpu,t);
696 load_TR_desc();
697 load_LDT(&init_mm.context);
698
699#ifdef CONFIG_DOUBLEFAULT
700 /* Set up doublefault TSS pointer in the GDT */
701 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
702#endif
703
704 /* Clear %gs. */
705 asm volatile ("mov %0, %%gs" : : "r" (0));
706
707 /* Clear all 6 debug registers: */
708 set_debugreg(0, 0);
709 set_debugreg(0, 1);
710 set_debugreg(0, 2);
711 set_debugreg(0, 3);
712 set_debugreg(0, 6);
713 set_debugreg(0, 7);
714
715 /*
716 * Force FPU initialization:
717 */
718 current_thread_info()->status = 0;
719 clear_used_math();
720 mxcsr_feature_mask_init();
721}
722
723#ifdef CONFIG_HOTPLUG_CPU
724void __cpuinit cpu_uninit(void)
725{
726 int cpu = raw_smp_processor_id();
727 cpu_clear(cpu, cpu_initialized);
728
729 /* lazy TLB state */
730 per_cpu(cpu_tlbstate, cpu).state = 0;
731 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
732}
733#endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
new file mode 100644
index 000000000000..2f6432cef6ff
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -0,0 +1,28 @@
1
2struct cpu_model_info {
3 int vendor;
4 int family;
5 char *model_names[16];
6};
7
8/* attempt to consolidate cpu attributes */
9struct cpu_dev {
10 char * c_vendor;
11
12 /* some have two possibilities for cpuid string */
13 char * c_ident[2];
14
15 struct cpu_model_info c_models[4];
16
17 void (*c_init)(struct cpuinfo_x86 * c);
18 void (*c_identify)(struct cpuinfo_x86 * c);
19 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
20};
21
22extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
23
24extern int get_model_name(struct cpuinfo_x86 *c);
25extern void display_cacheinfo(struct cpuinfo_x86 *c);
26
27extern void early_intel_workaround(struct cpuinfo_x86 *c);
28
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
new file mode 100644
index 000000000000..d8c6f132dc7a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -0,0 +1,250 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE
16 depends on ACPI_PROCESSOR
17 help
18 This driver adds a CPUFreq driver which utilizes the ACPI
19 Processor Performance States.
20 This driver also supports Intel Enhanced Speedstep.
21
22 For details, take a look at <file:Documentation/cpu-freq/>.
23
24 If in doubt, say N.
25
26config ELAN_CPUFREQ
27 tristate "AMD Elan SC400 and SC410"
28 select CPU_FREQ_TABLE
29 depends on X86_ELAN
30 ---help---
31 This adds the CPUFreq driver for AMD Elan SC400 and SC410
32 processors.
33
34 You need to specify the processor maximum speed as boot
35 parameter: elanfreq=maxspeed (in kHz) or as module
36 parameter "max_freq".
37
38 For details, take a look at <file:Documentation/cpu-freq/>.
39
40 If in doubt, say N.
41
42config SC520_CPUFREQ
43 tristate "AMD Elan SC520"
44 select CPU_FREQ_TABLE
45 depends on X86_ELAN
46 ---help---
47 This adds the CPUFreq driver for AMD Elan SC520 processor.
48
49 For details, take a look at <file:Documentation/cpu-freq/>.
50
51 If in doubt, say N.
52
53
54config X86_POWERNOW_K6
55 tristate "AMD Mobile K6-2/K6-3 PowerNow!"
56 select CPU_FREQ_TABLE
57 help
58 This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
59 AMD K6-3+ processors.
60
61 For details, take a look at <file:Documentation/cpu-freq/>.
62
63 If in doubt, say N.
64
65config X86_POWERNOW_K7
66 tristate "AMD Mobile Athlon/Duron PowerNow!"
67 select CPU_FREQ_TABLE
68 help
69 This adds the CPUFreq driver for mobile AMD K7 mobile processors.
70
71 For details, take a look at <file:Documentation/cpu-freq/>.
72
73 If in doubt, say N.
74
75config X86_POWERNOW_K7_ACPI
76 bool
77 depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
78 depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
79 default y
80
81config X86_POWERNOW_K8
82 tristate "AMD Opteron/Athlon64 PowerNow!"
83 select CPU_FREQ_TABLE
84 depends on EXPERIMENTAL
85 help
86 This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors.
87
88 For details, take a look at <file:Documentation/cpu-freq/>.
89
90 If in doubt, say N.
91
92config X86_POWERNOW_K8_ACPI
93 bool "ACPI Support"
94 select ACPI_PROCESSOR
95 depends on ACPI && X86_POWERNOW_K8
96 default y
97 help
 98	  This provides access to the K8's Processor Performance States via ACPI.
99 This driver is probably required for CPUFreq to work with multi-socket and
100 SMP systems. It is not required on at least some single-socket yet
101 multi-core systems, even if SMP is enabled.
102
103 It is safe to say Y here.
104
105config X86_GX_SUSPMOD
106 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
107 depends on PCI
108 help
 109	  This adds the CPUFreq driver for NatSemi Geode processors which
110 support suspend modulation.
111
112 For details, take a look at <file:Documentation/cpu-freq/>.
113
114 If in doubt, say N.
115
116config X86_SPEEDSTEP_CENTRINO
117 tristate "Intel Enhanced SpeedStep"
118 select CPU_FREQ_TABLE
119 select X86_SPEEDSTEP_CENTRINO_TABLE
120 help
121 This adds the CPUFreq driver for Enhanced SpeedStep enabled
122 mobile CPUs. This means Intel Pentium M (Centrino) CPUs. However,
123 you also need to say Y to "Use ACPI tables to decode..." below
124 [which might imply enabling ACPI] if you want to use this driver
125 on non-Banias CPUs.
126
127 For details, take a look at <file:Documentation/cpu-freq/>.
128
129 If in doubt, say N.
130
131config X86_SPEEDSTEP_CENTRINO_TABLE
132 bool "Built-in tables for Banias CPUs"
133 depends on X86_SPEEDSTEP_CENTRINO
134 default y
135 help
136 Use built-in tables for Banias CPUs if ACPI encoding
137 is not available.
138
139 If in doubt, say N.
140
141config X86_SPEEDSTEP_ICH
142 tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
143 select CPU_FREQ_TABLE
144 help
145 This adds the CPUFreq driver for certain mobile Intel Pentium III
146 (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
147 mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
148 ICH3 or ICH4 southbridge.
149
150 For details, take a look at <file:Documentation/cpu-freq/>.
151
152 If in doubt, say N.
153
154config X86_SPEEDSTEP_SMI
155 tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
156 select CPU_FREQ_TABLE
157 depends on EXPERIMENTAL
158 help
159 This adds the CPUFreq driver for certain mobile Intel Pentium III
160 (Coppermine), all mobile Intel Pentium III-M (Tualatin)
161 on systems which have an Intel 440BX/ZX/MX southbridge.
162
163 For details, take a look at <file:Documentation/cpu-freq/>.
164
165 If in doubt, say N.
166
167config X86_P4_CLOCKMOD
168 tristate "Intel Pentium 4 clock modulation"
169 select CPU_FREQ_TABLE
170 help
171 This adds the CPUFreq driver for Intel Pentium 4 / XEON
172 processors.
173
174 For details, take a look at <file:Documentation/cpu-freq/>.
175
176 If in doubt, say N.
177
178config X86_CPUFREQ_NFORCE2
179 tristate "nVidia nForce2 FSB changing"
180 depends on EXPERIMENTAL
181 help
182 This adds the CPUFreq driver for FSB changing on nVidia nForce2
183 platforms.
184
185 For details, take a look at <file:Documentation/cpu-freq/>.
186
187 If in doubt, say N.
188
189config X86_LONGRUN
190 tristate "Transmeta LongRun"
191 help
192 This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
193 which support LongRun.
194
195 For details, take a look at <file:Documentation/cpu-freq/>.
196
197 If in doubt, say N.
198
199config X86_LONGHAUL
200 tristate "VIA Cyrix III Longhaul"
201 select CPU_FREQ_TABLE
202 depends on ACPI_PROCESSOR
203 help
204 This adds the CPUFreq driver for VIA Samuel/CyrixIII,
205 VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
206 processors.
207
208 For details, take a look at <file:Documentation/cpu-freq/>.
209
210 If in doubt, say N.
211
212config X86_E_POWERSAVER
213 tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)"
214 select CPU_FREQ_TABLE
215 depends on EXPERIMENTAL
216 help
217 This adds the CPUFreq driver for VIA C7 processors.
218
219 If in doubt, say N.
220
221comment "shared options"
222
223config X86_ACPI_CPUFREQ_PROC_INTF
224 bool "/proc/acpi/processor/../performance interface (deprecated)"
225 depends on PROC_FS
226 depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K7_ACPI || X86_POWERNOW_K8_ACPI
227 help
228 This enables the deprecated /proc/acpi/processor/../performance
229 interface. While it is helpful for debugging, the generic,
230 cross-architecture cpufreq interfaces should be used.
231
232 If in doubt, say N.
233
234config X86_SPEEDSTEP_LIB
235 tristate
236 default X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD
237
238config X86_SPEEDSTEP_RELAXED_CAP_CHECK
239 bool "Relaxed speedstep capability checks"
240 depends on (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
241 help
 242	  Don't perform all of the checks that would normally be done for a
 243	  speedstep-capable system. Some ancient or strange systems, though
 244	  speedstep capable, don't always indicate that they are. This
245 option lets the probing code bypass some of those checks if the
246 parameter "relaxed_check=1" is passed to the module.
247
248endif # CPU_FREQ
249
250endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
new file mode 100644
index 000000000000..560f7760dae5
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -0,0 +1,16 @@
1obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
2obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
3obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
4obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
5obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
6obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
7obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
8obj-$(CONFIG_X86_LONGRUN) += longrun.o
9obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
10obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
11obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
12obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
13obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
14obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
15obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
16obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
new file mode 100644
index 000000000000..b6434a7ef8b2
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -0,0 +1,799 @@
1/*
2 * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.4 $)
3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
8 *
9 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or (at
14 * your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write to the Free Software Foundation, Inc.,
23 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
24 *
25 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 */
27
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/smp.h>
32#include <linux/sched.h>
33#include <linux/cpufreq.h>
34#include <linux/compiler.h>
35#include <linux/dmi.h>
36
37#include <linux/acpi.h>
38#include <acpi/processor.h>
39
40#include <asm/io.h>
41#include <asm/msr.h>
42#include <asm/processor.h>
43#include <asm/cpufeature.h>
44#include <asm/delay.h>
45#include <asm/uaccess.h>
46
47#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg)
48
49MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
50MODULE_DESCRIPTION("ACPI Processor P-States Driver");
51MODULE_LICENSE("GPL");
52
53enum {
54 UNDEFINED_CAPABLE = 0,
55 SYSTEM_INTEL_MSR_CAPABLE,
56 SYSTEM_IO_CAPABLE,
57};
58
59#define INTEL_MSR_RANGE (0xffff)
60#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
61
62struct acpi_cpufreq_data {
63 struct acpi_processor_performance *acpi_data;
64 struct cpufreq_frequency_table *freq_table;
65 unsigned int max_freq;
66 unsigned int resume;
67 unsigned int cpu_feature;
68};
69
70static struct acpi_cpufreq_data *drv_data[NR_CPUS];
71/* acpi_perf_data is a pointer to percpu data. */
72static struct acpi_processor_performance *acpi_perf_data;
73
74static struct cpufreq_driver acpi_cpufreq_driver;
75
76static unsigned int acpi_pstate_strict;
77
78static int check_est_cpu(unsigned int cpuid)
79{
80 struct cpuinfo_x86 *cpu = &cpu_data[cpuid];
81
82 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
83 !cpu_has(cpu, X86_FEATURE_EST))
84 return 0;
85
86 return 1;
87}
88
89static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
90{
91 struct acpi_processor_performance *perf;
92 int i;
93
94 perf = data->acpi_data;
95
96 for (i=0; i<perf->state_count; i++) {
97 if (value == perf->states[i].status)
98 return data->freq_table[i].frequency;
99 }
100 return 0;
101}
102
103static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
104{
105 int i;
106 struct acpi_processor_performance *perf;
107
108 msr &= INTEL_MSR_RANGE;
109 perf = data->acpi_data;
110
111 for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
112 if (msr == perf->states[data->freq_table[i].index].status)
113 return data->freq_table[i].frequency;
114 }
115 return data->freq_table[0].frequency;
116}
117
118static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
119{
120 switch (data->cpu_feature) {
121 case SYSTEM_INTEL_MSR_CAPABLE:
122 return extract_msr(val, data);
123 case SYSTEM_IO_CAPABLE:
124 return extract_io(val, data);
125 default:
126 return 0;
127 }
128}
129
130struct msr_addr {
131 u32 reg;
132};
133
134struct io_addr {
135 u16 port;
136 u8 bit_width;
137};
138
139typedef union {
140 struct msr_addr msr;
141 struct io_addr io;
142} drv_addr_union;
143
144struct drv_cmd {
145 unsigned int type;
146 cpumask_t mask;
147 drv_addr_union addr;
148 u32 val;
149};
150
151static void do_drv_read(struct drv_cmd *cmd)
152{
153 u32 h;
154
155 switch (cmd->type) {
156 case SYSTEM_INTEL_MSR_CAPABLE:
157 rdmsr(cmd->addr.msr.reg, cmd->val, h);
158 break;
159 case SYSTEM_IO_CAPABLE:
160 acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
161 &cmd->val,
162 (u32)cmd->addr.io.bit_width);
163 break;
164 default:
165 break;
166 }
167}
168
169static void do_drv_write(struct drv_cmd *cmd)
170{
171 u32 lo, hi;
172
173 switch (cmd->type) {
174 case SYSTEM_INTEL_MSR_CAPABLE:
175 rdmsr(cmd->addr.msr.reg, lo, hi);
176 lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
177 wrmsr(cmd->addr.msr.reg, lo, hi);
178 break;
179 case SYSTEM_IO_CAPABLE:
180 acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
181 cmd->val,
182 (u32)cmd->addr.io.bit_width);
183 break;
184 default:
185 break;
186 }
187}
188
189static void drv_read(struct drv_cmd *cmd)
190{
191 cpumask_t saved_mask = current->cpus_allowed;
192 cmd->val = 0;
193
194 set_cpus_allowed(current, cmd->mask);
195 do_drv_read(cmd);
196 set_cpus_allowed(current, saved_mask);
197}
198
199static void drv_write(struct drv_cmd *cmd)
200{
201 cpumask_t saved_mask = current->cpus_allowed;
202 unsigned int i;
203
204 for_each_cpu_mask(i, cmd->mask) {
205 set_cpus_allowed(current, cpumask_of_cpu(i));
206 do_drv_write(cmd);
207 }
208
209 set_cpus_allowed(current, saved_mask);
210 return;
211}
212
213static u32 get_cur_val(cpumask_t mask)
214{
215 struct acpi_processor_performance *perf;
216 struct drv_cmd cmd;
217
218 if (unlikely(cpus_empty(mask)))
219 return 0;
220
221 switch (drv_data[first_cpu(mask)]->cpu_feature) {
222 case SYSTEM_INTEL_MSR_CAPABLE:
223 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
224 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
225 break;
226 case SYSTEM_IO_CAPABLE:
227 cmd.type = SYSTEM_IO_CAPABLE;
228 perf = drv_data[first_cpu(mask)]->acpi_data;
229 cmd.addr.io.port = perf->control_register.address;
230 cmd.addr.io.bit_width = perf->control_register.bit_width;
231 break;
232 default:
233 return 0;
234 }
235
236 cmd.mask = mask;
237
238 drv_read(&cmd);
239
240 dprintk("get_cur_val = %u\n", cmd.val);
241
242 return cmd.val;
243}
244
245/*
246 * Return the measured active (C0) frequency on this CPU since last call
247 * to this function.
248 * Input: cpu number
249 * Return: Average CPU frequency in terms of max frequency (zero on error)
250 *
251 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
252 * over a period of time, while CPU is in C0 state.
253 * IA32_MPERF counts at the rate of max advertised frequency
254 * IA32_APERF counts at the rate of actual CPU frequency
255 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
256 * no meaning should be associated with absolute values of these MSRs.
257 */
258static unsigned int get_measured_perf(unsigned int cpu)
259{
260 union {
261 struct {
262 u32 lo;
263 u32 hi;
264 } split;
265 u64 whole;
266 } aperf_cur, mperf_cur;
267
268 cpumask_t saved_mask;
269 unsigned int perf_percent;
270 unsigned int retval;
271
272 saved_mask = current->cpus_allowed;
273 set_cpus_allowed(current, cpumask_of_cpu(cpu));
274 if (get_cpu() != cpu) {
275 /* We were not able to run on requested processor */
276 put_cpu();
277 return 0;
278 }
279
280 rdmsr(MSR_IA32_APERF, aperf_cur.split.lo, aperf_cur.split.hi);
281 rdmsr(MSR_IA32_MPERF, mperf_cur.split.lo, mperf_cur.split.hi);
282
283 wrmsr(MSR_IA32_APERF, 0,0);
284 wrmsr(MSR_IA32_MPERF, 0,0);
285
286#ifdef __i386__
287 /*
 288	 * We don't want to do a 64-bit divide with a 32-bit kernel.
 289	 * Get an approximate value instead; return failure in case we
 290	 * cannot get even an approximation.
291 */
292 if (unlikely(aperf_cur.split.hi || mperf_cur.split.hi)) {
293 int shift_count;
294 u32 h;
295
296 h = max_t(u32, aperf_cur.split.hi, mperf_cur.split.hi);
297 shift_count = fls(h);
298
299 aperf_cur.whole >>= shift_count;
300 mperf_cur.whole >>= shift_count;
301 }
302
303 if (((unsigned long)(-1) / 100) < aperf_cur.split.lo) {
304 int shift_count = 7;
305 aperf_cur.split.lo >>= shift_count;
306 mperf_cur.split.lo >>= shift_count;
307 }
308
309 if (aperf_cur.split.lo && mperf_cur.split.lo)
310 perf_percent = (aperf_cur.split.lo * 100) / mperf_cur.split.lo;
311 else
312 perf_percent = 0;
313
314#else
315 if (unlikely(((unsigned long)(-1) / 100) < aperf_cur.whole)) {
316 int shift_count = 7;
317 aperf_cur.whole >>= shift_count;
318 mperf_cur.whole >>= shift_count;
319 }
320
321 if (aperf_cur.whole && mperf_cur.whole)
322 perf_percent = (aperf_cur.whole * 100) / mperf_cur.whole;
323 else
324 perf_percent = 0;
325
326#endif
327
328 retval = drv_data[cpu]->max_freq * perf_percent / 100;
329
330 put_cpu();
331 set_cpus_allowed(current, saved_mask);
332
333 dprintk("cpu %d: performance percent %d\n", cpu, perf_percent);
334 return retval;
335}
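The APERF/MPERF ratio used above reduces to simple integer arithmetic once the counter deltas are in hand. The following stand-alone sketch (hypothetical counter deltas and maximum frequency, not taken from the patch) shows the percentage and the effective frequency that would be reported:

	#include <stdio.h>
	#include <stdint.h>

	/* Sketch of the measured-performance math: with these made-up
	 * deltas the CPU ran at ~75% of its maximum advertised frequency. */
	int main(void)
	{
		uint64_t aperf = 1500000, mperf = 2000000;	/* hypothetical deltas   */
		unsigned max_freq_khz = 2400000;		/* 2.4 GHz, hypothetical */

		unsigned perf_percent = (unsigned)(aperf * 100 / mperf);	/* 75      */
		unsigned measured_khz = max_freq_khz / 100 * perf_percent;	/* 1800000 */

		printf("%u%% of max -> %u kHz\n", perf_percent, measured_khz);
		return 0;
	}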
336
337static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
338{
339 struct acpi_cpufreq_data *data = drv_data[cpu];
340 unsigned int freq;
341
342 dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
343
344 if (unlikely(data == NULL ||
345 data->acpi_data == NULL || data->freq_table == NULL)) {
346 return 0;
347 }
348
349 freq = extract_freq(get_cur_val(cpumask_of_cpu(cpu)), data);
350 dprintk("cur freq = %u\n", freq);
351
352 return freq;
353}
354
355static unsigned int check_freqs(cpumask_t mask, unsigned int freq,
356 struct acpi_cpufreq_data *data)
357{
358 unsigned int cur_freq;
359 unsigned int i;
360
361 for (i=0; i<100; i++) {
362 cur_freq = extract_freq(get_cur_val(mask), data);
363 if (cur_freq == freq)
364 return 1;
365 udelay(10);
366 }
367 return 0;
368}
369
370static int acpi_cpufreq_target(struct cpufreq_policy *policy,
371 unsigned int target_freq, unsigned int relation)
372{
373 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
374 struct acpi_processor_performance *perf;
375 struct cpufreq_freqs freqs;
376 cpumask_t online_policy_cpus;
377 struct drv_cmd cmd;
378 unsigned int next_state = 0; /* Index into freq_table */
379 unsigned int next_perf_state = 0; /* Index into perf table */
380 unsigned int i;
381 int result = 0;
382
383 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
384
385 if (unlikely(data == NULL ||
386 data->acpi_data == NULL || data->freq_table == NULL)) {
387 return -ENODEV;
388 }
389
390 perf = data->acpi_data;
391 result = cpufreq_frequency_table_target(policy,
392 data->freq_table,
393 target_freq,
394 relation, &next_state);
395 if (unlikely(result))
396 return -ENODEV;
397
398#ifdef CONFIG_HOTPLUG_CPU
399 /* cpufreq holds the hotplug lock, so we are safe from here on */
400 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus);
401#else
402 online_policy_cpus = policy->cpus;
403#endif
404
405 next_perf_state = data->freq_table[next_state].index;
406 if (perf->state == next_perf_state) {
407 if (unlikely(data->resume)) {
408 dprintk("Called after resume, resetting to P%d\n",
409 next_perf_state);
410 data->resume = 0;
411 } else {
412 dprintk("Already at target state (P%d)\n",
413 next_perf_state);
414 return 0;
415 }
416 }
417
418 switch (data->cpu_feature) {
419 case SYSTEM_INTEL_MSR_CAPABLE:
420 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
421 cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
422 cmd.val = (u32) perf->states[next_perf_state].control;
423 break;
424 case SYSTEM_IO_CAPABLE:
425 cmd.type = SYSTEM_IO_CAPABLE;
426 cmd.addr.io.port = perf->control_register.address;
427 cmd.addr.io.bit_width = perf->control_register.bit_width;
428 cmd.val = (u32) perf->states[next_perf_state].control;
429 break;
430 default:
431 return -ENODEV;
432 }
433
434 cpus_clear(cmd.mask);
435
436 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
437 cmd.mask = online_policy_cpus;
438 else
439 cpu_set(policy->cpu, cmd.mask);
440
441 freqs.old = perf->states[perf->state].core_frequency * 1000;
442 freqs.new = data->freq_table[next_state].frequency;
443 for_each_cpu_mask(i, cmd.mask) {
444 freqs.cpu = i;
445 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
446 }
447
448 drv_write(&cmd);
449
450 if (acpi_pstate_strict) {
451 if (!check_freqs(cmd.mask, freqs.new, data)) {
452 dprintk("acpi_cpufreq_target failed (%d)\n",
453 policy->cpu);
454 return -EAGAIN;
455 }
456 }
457
458 for_each_cpu_mask(i, cmd.mask) {
459 freqs.cpu = i;
460 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
461 }
462 perf->state = next_perf_state;
463
464 return result;
465}
466
467static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
468{
469 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
470
471 dprintk("acpi_cpufreq_verify\n");
472
473 return cpufreq_frequency_table_verify(policy, data->freq_table);
474}
475
476static unsigned long
477acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
478{
479 struct acpi_processor_performance *perf = data->acpi_data;
480
481 if (cpu_khz) {
482 /* search the closest match to cpu_khz */
483 unsigned int i;
484 unsigned long freq;
485 unsigned long freqn = perf->states[0].core_frequency * 1000;
486
487 for (i=0; i<(perf->state_count-1); i++) {
488 freq = freqn;
489 freqn = perf->states[i+1].core_frequency * 1000;
490 if ((2 * cpu_khz) > (freqn + freq)) {
491 perf->state = i;
492 return freq;
493 }
494 }
495 perf->state = perf->state_count-1;
496 return freqn;
497 } else {
498 /* assume CPU is at P0... */
499 perf->state = 0;
500 return perf->states[0].core_frequency * 1000;
501 }
502}
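The "closest match to cpu_khz" rule above picks the first P-state whose midpoint with the next lower state lies below the measured cpu_khz. A minimal stand-alone sketch with made-up frequencies (not part of the patch) makes the midpoint comparison explicit:

	#include <stdio.h>

	/* Sketch of the guess_freq midpoint rule, using hypothetical
	 * P-state frequencies (kHz) and a measured cpu_khz of 2100000. */
	int main(void)
	{
		unsigned long states[] = { 2400000, 2000000, 1600000 };
		unsigned long cpu_khz = 2100000;
		unsigned n = sizeof(states) / sizeof(states[0]);
		unsigned i, state = n - 1;

		for (i = 0; i < n - 1; i++) {
			unsigned long freq = states[i], freqn = states[i + 1];
			/* pick states[i] once cpu_khz is above the midpoint
			 * of states[i] and states[i+1] */
			if (2 * cpu_khz > freq + freqn) {
				state = i;
				break;
			}
		}
		printf("guessed P%u at %lu kHz\n", state, states[state]); /* P1, 2000000 */
		return 0;
	}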
503
504/*
505 * acpi_cpufreq_early_init - initialize ACPI P-States library
506 *
507 * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
508 * in order to determine correct frequency and voltage pairings. We can
509 * do _PDC and _PSD and find out the processor dependency for the
510 * actual init that will happen later...
511 */
512static int __init acpi_cpufreq_early_init(void)
513{
514 dprintk("acpi_cpufreq_early_init\n");
515
516 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
517 if (!acpi_perf_data) {
518 dprintk("Memory allocation error for acpi_perf_data.\n");
519 return -ENOMEM;
520 }
521
522 /* Do initialization in ACPI core */
523 acpi_processor_preregister_performance(acpi_perf_data);
524 return 0;
525}
526
527#ifdef CONFIG_SMP
528/*
 529 * Some BIOSes do SW_ANY coordination internally, either setting it up
 530 * in hardware or doing it in BIOS firmware, without informing the OS.
 531 * If not detected, this has the side effect of making the CPU run at a
 532 * different speed than the OS intended. Detect it and handle it cleanly.
533 */
534static int bios_with_sw_any_bug;
535
536static int sw_any_bug_found(const struct dmi_system_id *d)
537{
538 bios_with_sw_any_bug = 1;
539 return 0;
540}
541
542static const struct dmi_system_id sw_any_bug_dmi_table[] = {
543 {
544 .callback = sw_any_bug_found,
545 .ident = "Supermicro Server X6DLP",
546 .matches = {
547 DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
548 DMI_MATCH(DMI_BIOS_VERSION, "080010"),
549 DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
550 },
551 },
552 { }
553};
554#endif
555
556static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
557{
558 unsigned int i;
559 unsigned int valid_states = 0;
560 unsigned int cpu = policy->cpu;
561 struct acpi_cpufreq_data *data;
562 unsigned int result = 0;
563 struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
564 struct acpi_processor_performance *perf;
565
566 dprintk("acpi_cpufreq_cpu_init\n");
567
568 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
569 if (!data)
570 return -ENOMEM;
571
572 data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
573 drv_data[cpu] = data;
574
575 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
576 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
577
578 result = acpi_processor_register_performance(data->acpi_data, cpu);
579 if (result)
580 goto err_free;
581
582 perf = data->acpi_data;
583 policy->shared_type = perf->shared_type;
584
585 /*
586 * Will let policy->cpus know about dependency only when software
587 * coordination is required.
588 */
589 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
590 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
591 policy->cpus = perf->shared_cpu_map;
592 }
593
594#ifdef CONFIG_SMP
595 dmi_check_system(sw_any_bug_dmi_table);
596 if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) {
597 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
598 policy->cpus = cpu_core_map[cpu];
599 }
600#endif
601
602 /* capability check */
603 if (perf->state_count <= 1) {
604 dprintk("No P-States\n");
605 result = -ENODEV;
606 goto err_unreg;
607 }
608
609 if (perf->control_register.space_id != perf->status_register.space_id) {
610 result = -ENODEV;
611 goto err_unreg;
612 }
613
614 switch (perf->control_register.space_id) {
615 case ACPI_ADR_SPACE_SYSTEM_IO:
616 dprintk("SYSTEM IO addr space\n");
617 data->cpu_feature = SYSTEM_IO_CAPABLE;
618 break;
619 case ACPI_ADR_SPACE_FIXED_HARDWARE:
620 dprintk("HARDWARE addr space\n");
621 if (!check_est_cpu(cpu)) {
622 result = -ENODEV;
623 goto err_unreg;
624 }
625 data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
626 break;
627 default:
628 dprintk("Unknown addr space %d\n",
629 (u32) (perf->control_register.space_id));
630 result = -ENODEV;
631 goto err_unreg;
632 }
633
634 data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
635 (perf->state_count+1), GFP_KERNEL);
636 if (!data->freq_table) {
637 result = -ENOMEM;
638 goto err_unreg;
639 }
640
641 /* detect transition latency */
642 policy->cpuinfo.transition_latency = 0;
643 for (i=0; i<perf->state_count; i++) {
644 if ((perf->states[i].transition_latency * 1000) >
645 policy->cpuinfo.transition_latency)
646 policy->cpuinfo.transition_latency =
647 perf->states[i].transition_latency * 1000;
648 }
649 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
650
651 data->max_freq = perf->states[0].core_frequency * 1000;
652 /* table init */
653 for (i=0; i<perf->state_count; i++) {
654 if (i>0 && perf->states[i].core_frequency >=
655 data->freq_table[valid_states-1].frequency / 1000)
656 continue;
657
658 data->freq_table[valid_states].index = i;
659 data->freq_table[valid_states].frequency =
660 perf->states[i].core_frequency * 1000;
661 valid_states++;
662 }
663 data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
664 perf->state = 0;
665
666 result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
667 if (result)
668 goto err_freqfree;
669
670 switch (perf->control_register.space_id) {
671 case ACPI_ADR_SPACE_SYSTEM_IO:
672 /* Current speed is unknown and not detectable by IO port */
673 policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
674 break;
675 case ACPI_ADR_SPACE_FIXED_HARDWARE:
676 acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
677 policy->cur = get_cur_freq_on_cpu(cpu);
678 break;
679 default:
680 break;
681 }
682
683 /* notify BIOS that we exist */
684 acpi_processor_notify_smm(THIS_MODULE);
685
686 /* Check for APERF/MPERF support in hardware */
687 if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
688 unsigned int ecx;
689 ecx = cpuid_ecx(6);
690 if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
691 acpi_cpufreq_driver.getavg = get_measured_perf;
692 }
693
694 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
695 for (i = 0; i < perf->state_count; i++)
696 dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
697 (i == perf->state ? '*' : ' '), i,
698 (u32) perf->states[i].core_frequency,
699 (u32) perf->states[i].power,
700 (u32) perf->states[i].transition_latency);
701
702 cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
703
704 /*
705 * the first call to ->target() should result in us actually
706 * writing something to the appropriate registers.
707 */
708 data->resume = 1;
709
710 return result;
711
712err_freqfree:
713 kfree(data->freq_table);
714err_unreg:
715 acpi_processor_unregister_performance(perf, cpu);
716err_free:
717 kfree(data);
718 drv_data[cpu] = NULL;
719
720 return result;
721}
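
The table-init loop above copies the ACPI P-states (core_frequency values in MHz, expected to arrive sorted highest first) into a cpufreq table in kHz, skipping any state that does not strictly lower the frequency. A minimal stand-alone sketch of that de-duplication step, using invented state data rather than real ACPI output:

#include <stdio.h>

#define TABLE_END ~0u

int main(void)
{
	/* hypothetical P-state core frequencies in MHz, highest first,
	 * with a duplicate as BIOSes sometimes report */
	unsigned int mhz[] = { 2000, 2000, 1600, 1200, 800 };
	unsigned int khz[6];		/* state_count + 1 entries */
	unsigned int i, valid = 0;

	for (i = 0; i < 5; i++) {
		/* skip states that do not decrease the frequency */
		if (i > 0 && mhz[i] >= khz[valid - 1] / 1000)
			continue;
		khz[valid++] = mhz[i] * 1000;
	}
	khz[valid] = TABLE_END;

	for (i = 0; khz[i] != TABLE_END; i++)
		printf("entry %u: %u kHz\n", i, khz[i]);
	return 0;
}

Run on its own this prints four entries; the duplicate 2000 MHz state is dropped, much as the driver drops duplicate BIOS-reported states.
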
722
723static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
724{
725 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
726
727 dprintk("acpi_cpufreq_cpu_exit\n");
728
729 if (data) {
730 cpufreq_frequency_table_put_attr(policy->cpu);
731 drv_data[policy->cpu] = NULL;
732 acpi_processor_unregister_performance(data->acpi_data,
733 policy->cpu);
734 kfree(data);
735 }
736
737 return 0;
738}
739
740static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
741{
742 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
743
744 dprintk("acpi_cpufreq_resume\n");
745
746 data->resume = 1;
747
748 return 0;
749}
750
751static struct freq_attr *acpi_cpufreq_attr[] = {
752 &cpufreq_freq_attr_scaling_available_freqs,
753 NULL,
754};
755
756static struct cpufreq_driver acpi_cpufreq_driver = {
757 .verify = acpi_cpufreq_verify,
758 .target = acpi_cpufreq_target,
759 .init = acpi_cpufreq_cpu_init,
760 .exit = acpi_cpufreq_cpu_exit,
761 .resume = acpi_cpufreq_resume,
762 .name = "acpi-cpufreq",
763 .owner = THIS_MODULE,
764 .attr = acpi_cpufreq_attr,
765};
766
767static int __init acpi_cpufreq_init(void)
768{
769 int ret;
770
771 dprintk("acpi_cpufreq_init\n");
772
773 ret = acpi_cpufreq_early_init();
774 if (ret)
775 return ret;
776
777 return cpufreq_register_driver(&acpi_cpufreq_driver);
778}
779
780static void __exit acpi_cpufreq_exit(void)
781{
782 dprintk("acpi_cpufreq_exit\n");
783
784 cpufreq_unregister_driver(&acpi_cpufreq_driver);
785
786 free_percpu(acpi_perf_data);
787
788 return;
789}
790
791module_param(acpi_pstate_strict, uint, 0644);
792MODULE_PARM_DESC(acpi_pstate_strict,
793 "value 0 or non-zero. non-zero -> strict ACPI checks are "
794 "performed during frequency changes.");
795
796late_initcall(acpi_cpufreq_init);
797module_exit(acpi_cpufreq_exit);
798
799MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
new file mode 100644
index 000000000000..66acd5039918
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -0,0 +1,441 @@
1/*
2 * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 * Based upon reverse engineered information
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/moduleparam.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/pci.h>
16#include <linux/delay.h>
17
18#define NFORCE2_XTAL 25
19#define NFORCE2_BOOTFSB 0x48
20#define NFORCE2_PLLENABLE 0xa8
21#define NFORCE2_PLLREG 0xa4
22#define NFORCE2_PLLADR 0xa0
23#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
24
25#define NFORCE2_MIN_FSB 50
26#define NFORCE2_SAFE_DISTANCE 50
27
28/* Delay in ms between FSB changes */
29//#define NFORCE2_DELAY 10
30
31/* nforce2_chipset:
32 * FSB is changed using the chipset
33 */
34static struct pci_dev *nforce2_chipset_dev;
35
36/* fid:
37 * multiplier * 10
38 */
39static int fid = 0;
40
41/* min_fsb, max_fsb:
42 * minimum and maximum FSB (= FSB at boot time)
43 */
44static int min_fsb = 0;
45static int max_fsb = 0;
46
47MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
48MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
49MODULE_LICENSE("GPL");
50
51module_param(fid, int, 0444);
52module_param(min_fsb, int, 0444);
53
54MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
55MODULE_PARM_DESC(min_fsb,
56 "Minimum FSB to use, if not defined: current FSB - 50");
57
58#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
59
60/**
61 * nforce2_calc_fsb - calculate FSB
62 * @pll: PLL value
63 *
64 * Calculates FSB from PLL value
65 */
66static int nforce2_calc_fsb(int pll)
67{
68 unsigned char mul, div;
69
70 mul = (pll >> 8) & 0xff;
71 div = pll & 0xff;
72
73 if (div > 0)
74 return NFORCE2_XTAL * mul / div;
75
76 return 0;
77}
78
79/**
80 * nforce2_calc_pll - calculate PLL value
81 * @fsb: FSB
82 *
83 * Calculate PLL value for given FSB
84 */
85static int nforce2_calc_pll(unsigned int fsb)
86{
87 unsigned char xmul, xdiv;
88 unsigned char mul = 0, div = 0;
89 int tried = 0;
90
91 /* Try to calculate multiplier and divider up to 4 times */
92 while (((mul == 0) || (div == 0)) && (tried <= 3)) {
93 for (xdiv = 2; xdiv <= 0x80; xdiv++)
94 for (xmul = 1; xmul <= 0xfe; xmul++)
95 if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
96 fsb + tried) {
97 mul = xmul;
98 div = xdiv;
99 }
100 tried++;
101 }
102
103 if ((mul == 0) || (div == 0))
104 return -1;
105
106 return NFORCE2_PLL(mul, div);
107}
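
To make the register arithmetic concrete, here is a small stand-alone sketch (not driver code) of the same FSB/PLL conversion, using the 25 MHz crystal and the mul/div encoding from the NFORCE2_PLL macro above; the mul and div values are purely illustrative:

#include <stdio.h>

#define XTAL 25				/* NFORCE2_XTAL, in MHz */
#define PLL(mul, div) (0x100000 | ((mul) << 8) | (div))

static int calc_fsb(int pll)
{
	unsigned char mul = (pll >> 8) & 0xff;
	unsigned char div = pll & 0xff;

	return div ? XTAL * mul / div : 0;
}

int main(void)
{
	/* e.g. mul = 16, div = 2 encodes 25 * 16 / 2 = 200 MHz FSB */
	int pll = PLL(16, 2);

	printf("pll 0x%x -> fsb %d MHz\n", pll, calc_fsb(pll));
	return 0;
}

nforce2_calc_pll() simply brute-forces this relation in reverse, searching mul/div pairs until one reproduces the requested FSB.
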
108
109/**
110 * nforce2_write_pll - write PLL value to chipset
111 * @pll: PLL value
112 *
113 * Writes new FSB PLL value to chipset
114 */
115static void nforce2_write_pll(int pll)
116{
117 int temp;
118
119 /* Set the pll addr. to 0x00 */
120 pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLADR, 0);
121
122 /* Now write the value in all 64 registers */
123 for (temp = 0; temp <= 0x3f; temp++)
124 pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, pll);
125
126 return;
127}
128
129/**
130 * nforce2_fsb_read - Read FSB
131 *
132 * Read FSB from chipset
133 * If bootfsb != 0, return FSB at boot-time
134 */
135static unsigned int nforce2_fsb_read(int bootfsb)
136{
137 struct pci_dev *nforce2_sub5;
138 u32 fsb, temp = 0;
139
140 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
141 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
142 0x01EF,PCI_ANY_ID,PCI_ANY_ID,NULL);
143 if (!nforce2_sub5)
144 return 0;
145
146 pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
147 fsb /= 1000000;
148
149 /* Check if PLL register is already set */
150 pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
151
152 if(bootfsb || !temp)
153 return fsb;
154
155 /* Use PLL register FSB value */
156 pci_read_config_dword(nforce2_chipset_dev,NFORCE2_PLLREG, &temp);
157 fsb = nforce2_calc_fsb(temp);
158
159 return fsb;
160}
161
162/**
163 * nforce2_set_fsb - set new FSB
164 * @fsb: New FSB
165 *
166 * Sets new FSB
167 */
168static int nforce2_set_fsb(unsigned int fsb)
169{
170 u32 temp = 0;
171 unsigned int tfsb;
172 int diff;
173 int pll = 0;
174
175 if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
176 printk(KERN_ERR "cpufreq: FSB %d is out of range!\n", fsb);
177 return -EINVAL;
178 }
179
180 tfsb = nforce2_fsb_read(0);
181 if (!tfsb) {
182 printk(KERN_ERR "cpufreq: Error while reading the FSB\n");
183 return -EINVAL;
184 }
185
186 /* First write? Then set actual value */
187 pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
188 if (!temp) {
189 pll = nforce2_calc_pll(tfsb);
190
191 if (pll < 0)
192 return -EINVAL;
193
194 nforce2_write_pll(pll);
195 }
196
197 /* Enable write access */
198 temp = 0x01;
199 pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8)temp);
200
201 diff = tfsb - fsb;
202
203 if (!diff)
204 return 0;
205
206 while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
207 if (diff < 0)
208 tfsb++;
209 else
210 tfsb--;
211
212 /* Calculate the PLL reg. value */
213 if ((pll = nforce2_calc_pll(tfsb)) == -1)
214 return -EINVAL;
215
216 nforce2_write_pll(pll);
217#ifdef NFORCE2_DELAY
218 mdelay(NFORCE2_DELAY);
219#endif
220 }
221
222 temp = 0x40;
223 pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLADR, (u8)temp);
224
225 return 0;
226}
227
228/**
229 * nforce2_get - get the CPU frequency
230 * @cpu: CPU number
231 *
232 * Returns the CPU frequency
233 */
234static unsigned int nforce2_get(unsigned int cpu)
235{
236 if (cpu)
237 return 0;
238 return nforce2_fsb_read(0) * fid * 100;
239}
240
241/**
242 * nforce2_target - set a new CPUFreq policy
243 * @policy: new policy
244 * @target_freq: the target frequency
245 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
246 *
247 * Sets a new CPUFreq policy.
248 */
249static int nforce2_target(struct cpufreq_policy *policy,
250 unsigned int target_freq, unsigned int relation)
251{
252// unsigned long flags;
253 struct cpufreq_freqs freqs;
254 unsigned int target_fsb;
255
256 if ((target_freq > policy->max) || (target_freq < policy->min))
257 return -EINVAL;
258
259 target_fsb = target_freq / (fid * 100);
260
261 freqs.old = nforce2_get(policy->cpu);
262 freqs.new = target_fsb * fid * 100;
 262	freqs.cpu = 0;		/* Only one CPU on nForce2 platforms */
264
265 if (freqs.old == freqs.new)
266 return 0;
267
268 dprintk("Old CPU frequency %d kHz, new %d kHz\n",
269 freqs.old, freqs.new);
270
271 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
272
273 /* Disable IRQs */
274 //local_irq_save(flags);
275
276 if (nforce2_set_fsb(target_fsb) < 0)
277 printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
278 target_fsb);
279 else
280 dprintk("Changed FSB successfully to %d\n",
281 target_fsb);
282
283 /* Enable IRQs */
284 //local_irq_restore(flags);
285
286 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
287
288 return 0;
289}
290
291/**
292 * nforce2_verify - verifies a new CPUFreq policy
293 * @policy: new policy
294 */
295static int nforce2_verify(struct cpufreq_policy *policy)
296{
297 unsigned int fsb_pol_max;
298
299 fsb_pol_max = policy->max / (fid * 100);
300
301 if (policy->min < (fsb_pol_max * fid * 100))
302 policy->max = (fsb_pol_max + 1) * fid * 100;
303
304 cpufreq_verify_within_limits(policy,
305 policy->cpuinfo.min_freq,
306 policy->cpuinfo.max_freq);
307 return 0;
308}
309
310static int nforce2_cpu_init(struct cpufreq_policy *policy)
311{
312 unsigned int fsb;
313 unsigned int rfid;
314
315 /* capability check */
316 if (policy->cpu != 0)
317 return -ENODEV;
318
319 /* Get current FSB */
320 fsb = nforce2_fsb_read(0);
321
322 if (!fsb)
323 return -EIO;
324
325 /* FIX: Get FID from CPU */
326 if (!fid) {
327 if (!cpu_khz) {
328 printk(KERN_WARNING
329 "cpufreq: cpu_khz not set, can't calculate multiplier!\n");
330 return -ENODEV;
331 }
332
333 fid = cpu_khz / (fsb * 100);
334 rfid = fid % 5;
335
336 if (rfid) {
337 if (rfid > 2)
338 fid += 5 - rfid;
339 else
340 fid -= rfid;
341 }
342 }
343
344 printk(KERN_INFO "cpufreq: FSB currently at %i MHz, FID %d.%d\n", fsb,
345 fid / 10, fid % 10);
346
347 /* Set maximum FSB to FSB at boot time */
348 max_fsb = nforce2_fsb_read(1);
349
350 if(!max_fsb)
351 return -EIO;
352
353 if (!min_fsb)
354 min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
355
356 if (min_fsb < NFORCE2_MIN_FSB)
357 min_fsb = NFORCE2_MIN_FSB;
358
359 /* cpuinfo and default policy values */
360 policy->cpuinfo.min_freq = min_fsb * fid * 100;
361 policy->cpuinfo.max_freq = max_fsb * fid * 100;
362 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
363 policy->cur = nforce2_get(policy->cpu);
364 policy->min = policy->cpuinfo.min_freq;
365 policy->max = policy->cpuinfo.max_freq;
366 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
367
368 return 0;
369}
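
The FID auto-detection above divides cpu_khz by (FSB * 100) and then rounds to the nearest multiple of 5, i.e. to the nearest half-step multiplier. A tiny stand-alone illustration of that rounding, with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned int cpu_khz = 1570000;	/* hypothetical TSC-derived speed */
	unsigned int fsb = 133;		/* hypothetical FSB in MHz */
	int fid, rfid;

	fid = cpu_khz / (fsb * 100);	/* multiplier * 10 */
	rfid = fid % 5;
	if (rfid) {			/* round to nearest half step */
		if (rfid > 2)
			fid += 5 - rfid;
		else
			fid -= rfid;
	}
	printf("FID %d.%d\n", fid / 10, fid % 10);
	return 0;
}

With these sample values the raw quotient is 118, which rounds up to 120, i.e. a 12.0x multiplier.
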
370
371static int nforce2_cpu_exit(struct cpufreq_policy *policy)
372{
373 return 0;
374}
375
376static struct cpufreq_driver nforce2_driver = {
377 .name = "nforce2",
378 .verify = nforce2_verify,
379 .target = nforce2_target,
380 .get = nforce2_get,
381 .init = nforce2_cpu_init,
382 .exit = nforce2_cpu_exit,
383 .owner = THIS_MODULE,
384};
385
386/**
387 * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
388 *
389 * Detects nForce2 A2 and C1 stepping
390 *
391 */
392static unsigned int nforce2_detect_chipset(void)
393{
394 nforce2_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
395 PCI_DEVICE_ID_NVIDIA_NFORCE2,
396 PCI_ANY_ID, PCI_ANY_ID, NULL);
397
398 if (nforce2_chipset_dev == NULL)
399 return -ENODEV;
400
401 printk(KERN_INFO "cpufreq: Detected nForce2 chipset revision %X\n",
402 nforce2_chipset_dev->revision);
403 printk(KERN_INFO
404 "cpufreq: FSB changing is maybe unstable and can lead to crashes and data loss.\n");
405
406 return 0;
407}
408
409/**
410 * nforce2_init - initializes the nForce2 CPUFreq driver
411 *
412 * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
 413 * devices, -EINVAL on problems during initialization, and zero on
414 * success.
415 */
416static int __init nforce2_init(void)
417{
418 /* TODO: do we need to detect the processor? */
419
420 /* detect chipset */
421 if (nforce2_detect_chipset()) {
422 printk(KERN_ERR "cpufreq: No nForce2 chipset.\n");
423 return -ENODEV;
424 }
425
426 return cpufreq_register_driver(&nforce2_driver);
427}
428
429/**
430 * nforce2_exit - unregisters cpufreq module
431 *
432 * Unregisters nForce2 FSB change support.
433 */
434static void __exit nforce2_exit(void)
435{
436 cpufreq_unregister_driver(&nforce2_driver);
437}
438
439module_init(nforce2_init);
440module_exit(nforce2_exit);
441
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
new file mode 100644
index 000000000000..f43d98e11cc7
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@ -0,0 +1,334 @@
1/*
2 * Based on documentation provided by Dave Jones. Thanks!
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/ioport.h>
14#include <linux/slab.h>
15
16#include <asm/msr.h>
17#include <asm/tsc.h>
18#include <asm/timex.h>
19#include <asm/io.h>
20#include <asm/delay.h>
21
22#define EPS_BRAND_C7M 0
23#define EPS_BRAND_C7 1
24#define EPS_BRAND_EDEN 2
25#define EPS_BRAND_C3 3
26
27struct eps_cpu_data {
28 u32 fsb;
29 struct cpufreq_frequency_table freq_table[];
30};
31
32static struct eps_cpu_data *eps_cpu[NR_CPUS];
33
34
35static unsigned int eps_get(unsigned int cpu)
36{
37 struct eps_cpu_data *centaur;
38 u32 lo, hi;
39
40 if (cpu)
41 return 0;
42 centaur = eps_cpu[cpu];
43 if (centaur == NULL)
44 return 0;
45
46 /* Return current frequency */
47 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
48 return centaur->fsb * ((lo >> 8) & 0xff);
49}
50
51static int eps_set_state(struct eps_cpu_data *centaur,
52 unsigned int cpu,
53 u32 dest_state)
54{
55 struct cpufreq_freqs freqs;
56 u32 lo, hi;
57 int err = 0;
58 int i;
59
60 freqs.old = eps_get(cpu);
61 freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
62 freqs.cpu = cpu;
63 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
64
65 /* Wait while CPU is busy */
66 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
67 i = 0;
68 while (lo & ((1 << 16) | (1 << 17))) {
69 udelay(16);
70 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
71 i++;
72 if (unlikely(i > 64)) {
73 err = -ENODEV;
74 goto postchange;
75 }
76 }
77 /* Set new multiplier and voltage */
78 wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
79 /* Wait until transition end */
80 i = 0;
81 do {
82 udelay(16);
83 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
84 i++;
85 if (unlikely(i > 64)) {
86 err = -ENODEV;
87 goto postchange;
88 }
89 } while (lo & ((1 << 16) | (1 << 17)));
90
91 /* Return current frequency */
92postchange:
93 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
94 freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
95
96 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
97 return err;
98}
99
100static int eps_target(struct cpufreq_policy *policy,
101 unsigned int target_freq,
102 unsigned int relation)
103{
104 struct eps_cpu_data *centaur;
105 unsigned int newstate = 0;
106 unsigned int cpu = policy->cpu;
107 unsigned int dest_state;
108 int ret;
109
110 if (unlikely(eps_cpu[cpu] == NULL))
111 return -ENODEV;
112 centaur = eps_cpu[cpu];
113
114 if (unlikely(cpufreq_frequency_table_target(policy,
115 &eps_cpu[cpu]->freq_table[0],
116 target_freq,
117 relation,
118 &newstate))) {
119 return -EINVAL;
120 }
121
122 /* Make frequency transition */
123 dest_state = centaur->freq_table[newstate].index & 0xffff;
124 ret = eps_set_state(centaur, cpu, dest_state);
125 if (ret)
126 printk(KERN_ERR "eps: Timeout!\n");
127 return ret;
128}
129
130static int eps_verify(struct cpufreq_policy *policy)
131{
132 return cpufreq_frequency_table_verify(policy,
133 &eps_cpu[policy->cpu]->freq_table[0]);
134}
135
136static int eps_cpu_init(struct cpufreq_policy *policy)
137{
138 unsigned int i;
139 u32 lo, hi;
140 u64 val;
141 u8 current_multiplier, current_voltage;
142 u8 max_multiplier, max_voltage;
143 u8 min_multiplier, min_voltage;
144 u8 brand;
145 u32 fsb;
146 struct eps_cpu_data *centaur;
147 struct cpufreq_frequency_table *f_table;
148 int k, step, voltage;
149 int ret;
150 int states;
151
152 if (policy->cpu != 0)
153 return -ENODEV;
154
155 /* Check brand */
156 printk("eps: Detected VIA ");
157 rdmsr(0x1153, lo, hi);
158 brand = (((lo >> 2) ^ lo) >> 18) & 3;
159 switch(brand) {
160 case EPS_BRAND_C7M:
161 printk("C7-M\n");
162 break;
163 case EPS_BRAND_C7:
164 printk("C7\n");
165 break;
166 case EPS_BRAND_EDEN:
167 printk("Eden\n");
168 break;
169 case EPS_BRAND_C3:
170 printk("C3\n");
171 return -ENODEV;
172 break;
173 }
174 /* Enable Enhanced PowerSaver */
175 rdmsrl(MSR_IA32_MISC_ENABLE, val);
176 if (!(val & 1 << 16)) {
177 val |= 1 << 16;
178 wrmsrl(MSR_IA32_MISC_ENABLE, val);
179 /* Can be locked at 0 */
180 rdmsrl(MSR_IA32_MISC_ENABLE, val);
181 if (!(val & 1 << 16)) {
182 printk("eps: Can't enable Enhanced PowerSaver\n");
183 return -ENODEV;
184 }
185 }
186
187 /* Print voltage and multiplier */
188 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
189 current_voltage = lo & 0xff;
190 printk("eps: Current voltage = %dmV\n", current_voltage * 16 + 700);
191 current_multiplier = (lo >> 8) & 0xff;
192 printk("eps: Current multiplier = %d\n", current_multiplier);
193
194 /* Print limits */
195 max_voltage = hi & 0xff;
196 printk("eps: Highest voltage = %dmV\n", max_voltage * 16 + 700);
197 max_multiplier = (hi >> 8) & 0xff;
198 printk("eps: Highest multiplier = %d\n", max_multiplier);
199 min_voltage = (hi >> 16) & 0xff;
200 printk("eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700);
201 min_multiplier = (hi >> 24) & 0xff;
202 printk("eps: Lowest multiplier = %d\n", min_multiplier);
203
204 /* Sanity checks */
205 if (current_multiplier == 0 || max_multiplier == 0
206 || min_multiplier == 0)
207 return -EINVAL;
208 if (current_multiplier > max_multiplier
209 || max_multiplier <= min_multiplier)
210 return -EINVAL;
211 if (current_voltage > 0x1c || max_voltage > 0x1c)
212 return -EINVAL;
213 if (max_voltage < min_voltage)
214 return -EINVAL;
215
216 /* Calc FSB speed */
217 fsb = cpu_khz / current_multiplier;
218 /* Calc number of p-states supported */
219 if (brand == EPS_BRAND_C7M)
220 states = max_multiplier - min_multiplier + 1;
221 else
222 states = 2;
223
224 /* Allocate private data and frequency table for current cpu */
225 centaur = kzalloc(sizeof(struct eps_cpu_data)
226 + (states + 1) * sizeof(struct cpufreq_frequency_table),
227 GFP_KERNEL);
228 if (!centaur)
229 return -ENOMEM;
230 eps_cpu[0] = centaur;
231
232 /* Copy basic values */
233 centaur->fsb = fsb;
234
235 /* Fill frequency and MSR value table */
236 f_table = &centaur->freq_table[0];
237 if (brand != EPS_BRAND_C7M) {
238 f_table[0].frequency = fsb * min_multiplier;
239 f_table[0].index = (min_multiplier << 8) | min_voltage;
240 f_table[1].frequency = fsb * max_multiplier;
241 f_table[1].index = (max_multiplier << 8) | max_voltage;
242 f_table[2].frequency = CPUFREQ_TABLE_END;
243 } else {
244 k = 0;
245 step = ((max_voltage - min_voltage) * 256)
246 / (max_multiplier - min_multiplier);
247 for (i = min_multiplier; i <= max_multiplier; i++) {
248 voltage = (k * step) / 256 + min_voltage;
249 f_table[k].frequency = fsb * i;
250 f_table[k].index = (i << 8) | voltage;
251 k++;
252 }
253 f_table[k].frequency = CPUFREQ_TABLE_END;
254 }
255
256 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
257 policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
258 policy->cur = fsb * current_multiplier;
259
260 ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
261 if (ret) {
262 kfree(centaur);
263 return ret;
264 }
265
266 cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
267 return 0;
268}
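
The MSR fields decoded above pack a voltage index in the low byte (reported as index * 16 + 700 mV) and a multiplier in the next byte. A tiny stand-alone decoder for an invented status word, just to illustrate the bit layout the driver relies on:

#include <stdio.h>

int main(void)
{
	/* hypothetical low word of MSR_IA32_PERF_STATUS */
	unsigned int lo = (12 << 8) | 0x10;	/* mult 12, voltage index 0x10 */
	unsigned int voltage_idx = lo & 0xff;
	unsigned int multiplier = (lo >> 8) & 0xff;

	printf("multiplier = %u\n", multiplier);
	printf("voltage    = %u mV\n", voltage_idx * 16 + 700);
	return 0;
}
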
269
270static int eps_cpu_exit(struct cpufreq_policy *policy)
271{
272 unsigned int cpu = policy->cpu;
273 struct eps_cpu_data *centaur;
274 u32 lo, hi;
275
276 if (eps_cpu[cpu] == NULL)
277 return -ENODEV;
278 centaur = eps_cpu[cpu];
279
280 /* Get max frequency */
281 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
282 /* Set max frequency */
283 eps_set_state(centaur, cpu, hi & 0xffff);
284 /* Bye */
285 cpufreq_frequency_table_put_attr(policy->cpu);
286 kfree(eps_cpu[cpu]);
287 eps_cpu[cpu] = NULL;
288 return 0;
289}
290
291static struct freq_attr* eps_attr[] = {
292 &cpufreq_freq_attr_scaling_available_freqs,
293 NULL,
294};
295
296static struct cpufreq_driver eps_driver = {
297 .verify = eps_verify,
298 .target = eps_target,
299 .init = eps_cpu_init,
300 .exit = eps_cpu_exit,
301 .get = eps_get,
302 .name = "e_powersaver",
303 .owner = THIS_MODULE,
304 .attr = eps_attr,
305};
306
307static int __init eps_init(void)
308{
309 struct cpuinfo_x86 *c = cpu_data;
310
311 /* This driver will work only on Centaur C7 processors with
312 * Enhanced SpeedStep/PowerSaver registers */
313 if (c->x86_vendor != X86_VENDOR_CENTAUR
314 || c->x86 != 6 || c->x86_model != 10)
315 return -ENODEV;
316 if (!cpu_has(c, X86_FEATURE_EST))
317 return -ENODEV;
318
319 if (cpufreq_register_driver(&eps_driver))
320 return -EINVAL;
321 return 0;
322}
323
324static void __exit eps_exit(void)
325{
326 cpufreq_unregister_driver(&eps_driver);
327}
328
 329MODULE_AUTHOR("Rafał Bilski <rafalbilski@interia.pl>");
 330MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPUs.");
331MODULE_LICENSE("GPL");
332
333module_init(eps_init);
334module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
new file mode 100644
index 000000000000..f317276afa7a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -0,0 +1,309 @@
1/*
2 * elanfreq: cpufreq driver for the AMD ELAN family
3 *
4 * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
5 *
6 * Parts of this code are (c) Sven Geggus <sven@geggus.net>
7 *
8 * All Rights Reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <linux/slab.h>
24#include <linux/delay.h>
25#include <linux/cpufreq.h>
26
27#include <asm/msr.h>
28#include <asm/timex.h>
29#include <asm/io.h>
30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
33
34/* Module parameter */
35static int max_freq;
36
37struct s_elan_multiplier {
38 int clock; /* frequency in kHz */
39 int val40h; /* PMU Force Mode register */
40 int val80h; /* CPU Clock Speed Register */
41};
42
43/*
44 * It is important that the frequencies
45 * are listed in ascending order here!
46 */
47struct s_elan_multiplier elan_multiplier[] = {
48 {1000, 0x02, 0x18},
49 {2000, 0x02, 0x10},
50 {4000, 0x02, 0x08},
51 {8000, 0x00, 0x00},
52 {16000, 0x00, 0x02},
53 {33000, 0x00, 0x04},
54 {66000, 0x01, 0x04},
55 {99000, 0x01, 0x05}
56};
57
58static struct cpufreq_frequency_table elanfreq_table[] = {
59 {0, 1000},
60 {1, 2000},
61 {2, 4000},
62 {3, 8000},
63 {4, 16000},
64 {5, 33000},
65 {6, 66000},
66 {7, 99000},
67 {0, CPUFREQ_TABLE_END},
68};
69
70
71/**
72 * elanfreq_get_cpu_frequency: determine current cpu speed
73 *
 74 * Determines the frequency at which the Elan SoC's CPU is currently
 75 * running. Frequencies from 1 to 33 MHz are generated the normal
 76 * way; 66 and 99 MHz are called "Hyperspeed Mode" and keep the
 77 * rest of the chip running at 33 MHz.
78 */
79
80static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
81{
82 u8 clockspeed_reg; /* Clock Speed Register */
83
84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable();
88
89 if ((clockspeed_reg & 0xE0) == 0xE0)
90 return 0;
91
92 /* Are we in CPU clock multiplied mode (66/99 MHz)? */
93 if ((clockspeed_reg & 0xE0) == 0xC0) {
94 if ((clockspeed_reg & 0x01) == 0)
95 return 66000;
96 else
97 return 99000;
98 }
99
100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0)
102 return 33000;
103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000);
105}
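
For reference, a short stand-alone sketch of the same Clock Speed Register decoding; the sample register values are invented, whereas on real hardware the byte is read from port 0x23 after selecting index 0x80:

#include <stdio.h>

/* decode the Elan CPU Clock Speed Register, mirroring
 * elanfreq_get_cpu_frequency() above */
static unsigned int decode_clockspeed(unsigned char reg)
{
	if ((reg & 0xE0) == 0xE0)
		return 0;
	if ((reg & 0xE0) == 0xC0)	/* hyperspeed: 66 or 99 MHz */
		return (reg & 0x01) ? 99000 : 66000;
	if ((reg & 0xE0) == 0xA0)	/* 33 MHz is not 32 MHz... */
		return 33000;
	return (1 << ((reg & 0xE0) >> 5)) * 1000;
}

int main(void)
{
	unsigned char samples[] = { 0x00, 0x60, 0xA0, 0xC0, 0xC1 };
	unsigned int i;

	for (i = 0; i < sizeof(samples); i++)
		printf("reg 0x%02x -> %u kHz\n",
		       samples[i], decode_clockspeed(samples[i]));
	return 0;
}
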
106
107
108/**
109 * elanfreq_set_cpu_frequency: Change the CPU core frequency
110 * @cpu: cpu number
111 * @freq: frequency in kHz
112 *
113 * This function takes a frequency value and changes the CPU frequency
114 * according to this. Note that the frequency has to be checked by
115 * elanfreq_validatespeed() for correctness!
116 *
117 * There is no return value.
118 */
119
120static void elanfreq_set_cpu_state (unsigned int state)
121{
122 struct cpufreq_freqs freqs;
123
124 freqs.old = elanfreq_get_cpu_frequency(0);
125 freqs.new = elan_multiplier[state].clock;
126 freqs.cpu = 0; /* elanfreq.c is UP only driver */
127
128 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
129
130 printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
131 elan_multiplier[state].clock);
132
133
134 /*
135 * Access to the Elan's internal registers is indexed via
136 * 0x22: Chip Setup & Control Register Index Register (CSCI)
137 * 0x23: Chip Setup & Control Register Data Register (CSCD)
138 *
139 */
140
141 /*
142 * 0x40 is the Power Management Unit's Force Mode Register.
143 * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency)
144 */
145
146 local_irq_disable();
147 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */
148 outb_p(0x00,REG_CSCDR);
149 local_irq_enable(); /* wait till internal pipelines and */
150 udelay(1000); /* buffers have cleaned up */
151
152 local_irq_disable();
153
154 /* now, set the CPU clock speed register (0x80) */
155 outb_p(0x80,REG_CSCIR);
156 outb_p(elan_multiplier[state].val80h,REG_CSCDR);
157
158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
159 outb_p(0x40,REG_CSCIR);
160 outb_p(elan_multiplier[state].val40h,REG_CSCDR);
161 udelay(10000);
162 local_irq_enable();
163
164 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
165};
166
167
168/**
169 * elanfreq_validatespeed: test if frequency range is valid
170 * @policy: the policy to validate
171 *
172 * This function checks if a given frequency range in kHz is valid
173 * for the hardware supported by the driver.
174 */
175
176static int elanfreq_verify (struct cpufreq_policy *policy)
177{
178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
179}
180
181static int elanfreq_target (struct cpufreq_policy *policy,
182 unsigned int target_freq,
183 unsigned int relation)
184{
185 unsigned int newstate = 0;
186
187 if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], target_freq, relation, &newstate))
188 return -EINVAL;
189
190 elanfreq_set_cpu_state(newstate);
191
192 return 0;
193}
194
195
196/*
197 * Module init and exit code
198 */
199
200static int elanfreq_cpu_init(struct cpufreq_policy *policy)
201{
202 struct cpuinfo_x86 *c = cpu_data;
203 unsigned int i;
204 int result;
205
206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model!=10))
209 return -ENODEV;
210
211 /* max freq */
212 if (!max_freq)
213 max_freq = elanfreq_get_cpu_frequency(0);
214
215 /* table init */
216 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 }
220
221 /* cpuinfo and default policy values */
222 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
223 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
224 policy->cur = elanfreq_get_cpu_frequency(0);
225
226 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
227 if (result)
228 return (result);
229
230 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
231 return 0;
232}
233
234
235static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
236{
237 cpufreq_frequency_table_put_attr(policy->cpu);
238 return 0;
239}
240
241
242#ifndef MODULE
243/**
244 * elanfreq_setup - elanfreq command line parameter parsing
245 *
246 * elanfreq command line parameter. Use:
247 * elanfreq=66000
248 * to set the maximum CPU frequency to 66 MHz. Note that in
249 * case you do not give this boot parameter, the maximum
250 * frequency will fall back to _current_ CPU frequency which
251 * might be lower. If you build this as a module, use the
252 * max_freq module parameter instead.
253 */
254static int __init elanfreq_setup(char *str)
255{
256 max_freq = simple_strtoul(str, &str, 0);
257 printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
258 return 1;
259}
260__setup("elanfreq=", elanfreq_setup);
261#endif
262
263
264static struct freq_attr* elanfreq_attr[] = {
265 &cpufreq_freq_attr_scaling_available_freqs,
266 NULL,
267};
268
269
270static struct cpufreq_driver elanfreq_driver = {
271 .get = elanfreq_get_cpu_frequency,
272 .verify = elanfreq_verify,
273 .target = elanfreq_target,
274 .init = elanfreq_cpu_init,
275 .exit = elanfreq_cpu_exit,
276 .name = "elanfreq",
277 .owner = THIS_MODULE,
278 .attr = elanfreq_attr,
279};
280
281
282static int __init elanfreq_init(void)
283{
284 struct cpuinfo_x86 *c = cpu_data;
285
286 /* Test if we have the right hardware */
287 if ((c->x86_vendor != X86_VENDOR_AMD) ||
288 (c->x86 != 4) || (c->x86_model!=10)) {
289 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
290 return -ENODEV;
291 }
292 return cpufreq_register_driver(&elanfreq_driver);
293}
294
295
296static void __exit elanfreq_exit(void)
297{
298 cpufreq_unregister_driver(&elanfreq_driver);
299}
300
301
302module_param (max_freq, int, 0444);
303
304MODULE_LICENSE("GPL");
305MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
306MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
307
308module_init(elanfreq_init);
309module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
new file mode 100644
index 000000000000..461dabc4e495
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -0,0 +1,495 @@
1/*
2 * Cyrix MediaGX and NatSemi Geode Suspend Modulation
3 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
4 * (C) 2002 Hiroshi Miura <miura@da-cha.org>
5 * All Rights Reserved
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation
10 *
11 * The author(s) of this software shall not be held liable for damages
12 * of any nature resulting due to the use of this software. This
13 * software is provided AS-IS with no warranties.
14 *
 15 * Theoretical note:
16 *
17 * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
18 *
 19 * CPU frequency control on the NatSemi Geode GX1/GXLV processor and CS55x0
 20 * is based on Suspend Modulation.
 21 *
 22 * Suspend Modulation works by asserting and de-asserting the SUSP# pin
 23 * to the CPU (GX1/GXLV) for configurable durations. While SUSP# is
 24 * asserted the CPU enters an idle state; GX1 stops its core clock, so
 25 * power consumption is reduced.
26 *
 27 * Suspend Modulation's OFF/ON durations are configurable
28 * with 'Suspend Modulation OFF Count Register'
29 * and 'Suspend Modulation ON Count Register'.
30 * These registers are 8bit counters that represent the number of
31 * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF)
32 * to the processor.
33 *
34 * These counters define a ratio which is the effective frequency
35 * of operation of the system.
36 *
37 * OFF Count
38 * F_eff = Fgx * ----------------------
39 * OFF Count + ON Count
40 *
41 * 0 <= On Count, Off Count <= 255
42 *
43 * From these limits, we can get register values
44 *
45 * off_duration + on_duration <= MAX_DURATION
46 * on_duration = off_duration * (stock_freq - freq) / freq
47 *
48 * off_duration = (freq * DURATION) / stock_freq
49 * on_duration = DURATION - off_duration
50 *
51 *
52 *---------------------------------------------------------------------------
53 *
54 * ChangeLog:
55 * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
56 * - fix on/off register mistake
57 * - fix cpu_khz calc when it stops cpu modulation.
58 *
59 * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
60 * - rewrite for Cyrix MediaGX Cx5510/5520 and
61 * NatSemi Geode Cs5530(A).
62 *
63 * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
64 * - cs5530_mod patch for 2.4.19-rc1.
65 *
66 *---------------------------------------------------------------------------
67 *
68 * Todo
69 * Test on machines with 5510, 5530, 5530A
70 */
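
A quick numeric check of the relations above: for a requested frequency, off_duration = freq * DURATION / stock_freq and on_duration is the remainder, so F_eff = Fgx * OFF / (OFF + ON) approximates the request within the 32 us quantisation. A stand-alone sketch with purely illustrative values (DURATION taken as the 255 default used for max_duration below):

#include <stdio.h>

#define DURATION 255			/* default max_duration */

int main(void)
{
	unsigned int stock_khz = 200000;	/* hypothetical 200 MHz GX */
	unsigned int want_khz = 75000;		/* requested effective speed */

	/* split DURATION into off/on counts as described above */
	unsigned int off = (unsigned long long)want_khz * DURATION / stock_khz;
	unsigned int on = DURATION - off;

	/* effective frequency the modulation actually yields */
	unsigned int eff = (unsigned long long)stock_khz * off / (off + on);

	printf("off=%u on=%u -> F_eff=%u kHz (wanted %u)\n",
	       off, on, eff, want_khz);
	return 0;
}
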
71
72/************************************************************************
73 * Suspend Modulation - Definitions *
74 ************************************************************************/
75
76#include <linux/kernel.h>
77#include <linux/module.h>
78#include <linux/init.h>
79#include <linux/smp.h>
80#include <linux/cpufreq.h>
81#include <linux/pci.h>
82#include <asm/processor-cyrix.h>
83#include <asm/errno.h>
84
85/* PCI config registers, all at F0 */
86#define PCI_PMER1 0x80 /* power management enable register 1 */
87#define PCI_PMER2 0x81 /* power management enable register 2 */
88#define PCI_PMER3 0x82 /* power management enable register 3 */
89#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
90#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
91#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
92#define PCI_MODON 0x95 /* suspend modulation ON counter register */
93#define PCI_SUSCFG 0x96 /* suspend configuration register */
94
95/* PMER1 bits */
96#define GPM (1<<0) /* global power management */
97#define GIT (1<<1) /* globally enable PM device idle timers */
98#define GTR (1<<2) /* globally enable IO traps */
99#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
100#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
101
102/* SUSCFG bits */
103#define SUSMOD (1<<0) /* enable/disable suspend modulation */
 104/* the bits below are supported only on cs5530 (after rev.1.2)/cs5530A */
105#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
106 /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
107#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
 108/* the bits below are supported only on cs5530A */
109#define PWRSVE_ISA (1<<3) /* stop ISA clock */
110#define PWRSVE (1<<4) /* active idle */
111
112struct gxfreq_params {
113 u8 on_duration;
114 u8 off_duration;
115 u8 pci_suscfg;
116 u8 pci_pmer1;
117 u8 pci_pmer2;
118 struct pci_dev *cs55x0;
119};
120
121static struct gxfreq_params *gx_params;
122static int stock_freq;
123
124/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
125static int pci_busclk = 0;
126module_param (pci_busclk, int, 0444);
127
128/* maximum duration for which the cpu may be suspended
129 * (32us * MAX_DURATION). If no parameter is given, this defaults
130 * to 255.
131 * Note that this leads to a maximum of 8 ms(!) where the CPU clock
132 * is suspended -- processing power is just 0.39% of what it used to be,
133 * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
134static int max_duration = 255;
135module_param (max_duration, int, 0444);
136
137/* For the default policy, we want at least some processing power
138 * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
139 */
140#define POLICY_MIN_DIV 20
141
142
143#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "gx-suspmod", msg)
144
145/**
 146 * we can detect the core multiplier from dir0_lsb
147 * from GX1 datasheet p.56,
148 * MULT[3:0]:
149 * 0000 = SYSCLK multiplied by 4 (test only)
150 * 0001 = SYSCLK multiplied by 10
151 * 0010 = SYSCLK multiplied by 4
152 * 0011 = SYSCLK multiplied by 6
153 * 0100 = SYSCLK multiplied by 9
154 * 0101 = SYSCLK multiplied by 5
155 * 0110 = SYSCLK multiplied by 7
156 * 0111 = SYSCLK multiplied by 8
157 * of 33.3MHz
158 **/
159static int gx_freq_mult[16] = {
160 4, 10, 4, 6, 9, 5, 7, 8,
161 0, 0, 0, 0, 0, 0, 0, 0
162};
163
164
165/****************************************************************
166 * Low Level chipset interface *
167 ****************************************************************/
168static struct pci_device_id gx_chipset_tbl[] __initdata = {
169 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, PCI_ANY_ID, PCI_ANY_ID },
170 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, PCI_ANY_ID, PCI_ANY_ID },
171 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, PCI_ANY_ID, PCI_ANY_ID },
172 { 0, },
173};
174
175/**
176 * gx_detect_chipset:
177 *
178 **/
179static __init struct pci_dev *gx_detect_chipset(void)
180{
181 struct pci_dev *gx_pci = NULL;
182
183 /* check if CPU is a MediaGX or a Geode. */
184 if ((current_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
185 (current_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
186 dprintk("error: no MediaGX/Geode processor found!\n");
187 return NULL;
188 }
189
190 /* detect which companion chip is used */
191 while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) {
192 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
193 return gx_pci;
194 }
195
196 dprintk("error: no supported chipset found!\n");
197 return NULL;
198}
199
200/**
201 * gx_get_cpuspeed:
202 *
 203 * Finds the effective frequency at which the Cyrix MediaGX/NatSemi Geode CPU runs.
204 */
205static unsigned int gx_get_cpuspeed(unsigned int cpu)
206{
207 if ((gx_params->pci_suscfg & SUSMOD) == 0)
208 return stock_freq;
209
210 return (stock_freq * gx_params->off_duration)
211 / (gx_params->on_duration + gx_params->off_duration);
212}
213
214/**
215 * gx_validate_speed:
 216 * find the closest achievable speed and the matching on/off durations
217 *
218 **/
219
220static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, u8 *off_duration)
221{
222 unsigned int i;
223 u8 tmp_on, tmp_off;
224 int old_tmp_freq = stock_freq;
225 int tmp_freq;
226
227 *off_duration=1;
228 *on_duration=0;
229
230 for (i=max_duration; i>0; i--) {
231 tmp_off = ((khz * i) / stock_freq) & 0xff;
232 tmp_on = i - tmp_off;
233 tmp_freq = (stock_freq * tmp_off) / i;
234 /* if this relation is closer to khz, use this. If it's equal,
235 * prefer it, too - lower latency */
236 if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
237 *on_duration = tmp_on;
238 *off_duration = tmp_off;
239 old_tmp_freq = tmp_freq;
240 }
241 }
242
243 return old_tmp_freq;
244}
245
246
247/**
248 * gx_set_cpuspeed:
249 * set cpu speed in khz.
250 **/
251
252static void gx_set_cpuspeed(unsigned int khz)
253{
254 u8 suscfg, pmer1;
255 unsigned int new_khz;
256 unsigned long flags;
257 struct cpufreq_freqs freqs;
258
259 freqs.cpu = 0;
260 freqs.old = gx_get_cpuspeed(0);
261
262 new_khz = gx_validate_speed(khz, &gx_params->on_duration, &gx_params->off_duration);
263
264 freqs.new = new_khz;
265
266 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
267 local_irq_save(flags);
268
 269	if (new_khz != stock_freq) { /* new khz == 100% of CPU speed is a special case */
270 switch (gx_params->cs55x0->device) {
271 case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
272 pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
273 /* FIXME: need to test other values -- Zwane,Miura */
274 pci_write_config_byte(gx_params->cs55x0, PCI_IRQTC, 4); /* typical 2 to 4ms */
275 pci_write_config_byte(gx_params->cs55x0, PCI_VIDTC, 100);/* typical 50 to 100ms */
276 pci_write_config_byte(gx_params->cs55x0, PCI_PMER1, pmer1);
277
278 if (gx_params->cs55x0->revision < 0x10) { /* CS5530(rev 1.2, 1.3) */
279 suscfg = gx_params->pci_suscfg | SUSMOD;
280 } else { /* CS5530A,B.. */
281 suscfg = gx_params->pci_suscfg | SUSMOD | PWRSVE;
282 }
283 break;
284 case PCI_DEVICE_ID_CYRIX_5520:
285 case PCI_DEVICE_ID_CYRIX_5510:
286 suscfg = gx_params->pci_suscfg | SUSMOD;
287 break;
288 default:
289 local_irq_restore(flags);
290 dprintk("fatal: try to set unknown chipset.\n");
291 return;
292 }
293 } else {
294 suscfg = gx_params->pci_suscfg & ~(SUSMOD);
295 gx_params->off_duration = 0;
296 gx_params->on_duration = 0;
297 dprintk("suspend modulation disabled: cpu runs 100 percent speed.\n");
298 }
299
300 pci_write_config_byte(gx_params->cs55x0, PCI_MODOFF, gx_params->off_duration);
301 pci_write_config_byte(gx_params->cs55x0, PCI_MODON, gx_params->on_duration);
302
303 pci_write_config_byte(gx_params->cs55x0, PCI_SUSCFG, suscfg);
304 pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
305
306 local_irq_restore(flags);
307
308 gx_params->pci_suscfg = suscfg;
309
310 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
311
312 dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
313 gx_params->on_duration * 32, gx_params->off_duration * 32);
314 dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
315}
316
317/****************************************************************
318 * High level functions *
319 ****************************************************************/
320
321/*
322 * cpufreq_gx_verify: test if frequency range is valid
323 *
324 * This function checks if a given frequency range in kHz is valid
325 * for the hardware supported by the driver.
326 */
327
328static int cpufreq_gx_verify(struct cpufreq_policy *policy)
329{
330 unsigned int tmp_freq = 0;
331 u8 tmp1, tmp2;
332
333 if (!stock_freq || !policy)
334 return -EINVAL;
335
336 policy->cpu = 0;
337 cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq);
338
339 /* it needs to be assured that at least one supported frequency is
340 * within policy->min and policy->max. If it is not, policy->max
 341 * needs to be increased until one frequency is supported.
342 * policy->min may not be decreased, though. This way we guarantee a
343 * specific processing capacity.
344 */
345 tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
346 if (tmp_freq < policy->min)
347 tmp_freq += stock_freq / max_duration;
348 policy->min = tmp_freq;
349 if (policy->min > policy->max)
350 policy->max = tmp_freq;
351 tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
352 if (tmp_freq > policy->max)
353 tmp_freq -= stock_freq / max_duration;
354 policy->max = tmp_freq;
355 if (policy->max < policy->min)
356 policy->max = policy->min;
357 cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq);
358
359 return 0;
360}
361
362/*
363 * cpufreq_gx_target:
364 *
365 */
366static int cpufreq_gx_target(struct cpufreq_policy *policy,
367 unsigned int target_freq,
368 unsigned int relation)
369{
370 u8 tmp1, tmp2;
371 unsigned int tmp_freq;
372
373 if (!stock_freq || !policy)
374 return -EINVAL;
375
376 policy->cpu = 0;
377
378 tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
379 while (tmp_freq < policy->min) {
380 tmp_freq += stock_freq / max_duration;
381 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
382 }
383 while (tmp_freq > policy->max) {
384 tmp_freq -= stock_freq / max_duration;
385 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
386 }
387
388 gx_set_cpuspeed(tmp_freq);
389
390 return 0;
391}
392
393static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
394{
395 unsigned int maxfreq, curfreq;
396
397 if (!policy || policy->cpu != 0)
398 return -ENODEV;
399
400 /* determine maximum frequency */
401 if (pci_busclk) {
402 maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
403 } else if (cpu_khz) {
404 maxfreq = cpu_khz;
405 } else {
406 maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
407 }
408 stock_freq = maxfreq;
409 curfreq = gx_get_cpuspeed(0);
410
411 dprintk("cpu max frequency is %d.\n", maxfreq);
412 dprintk("cpu current frequency is %dkHz.\n",curfreq);
413
414 /* setup basic struct for cpufreq API */
415 policy->cpu = 0;
416
417 if (max_duration < POLICY_MIN_DIV)
418 policy->min = maxfreq / max_duration;
419 else
420 policy->min = maxfreq / POLICY_MIN_DIV;
421 policy->max = maxfreq;
422 policy->cur = curfreq;
423 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
424 policy->cpuinfo.min_freq = maxfreq / max_duration;
425 policy->cpuinfo.max_freq = maxfreq;
426 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
427
428 return 0;
429}
430
431/*
432 * cpufreq_gx_init:
433 * MediaGX/Geode GX initialize cpufreq driver
434 */
435static struct cpufreq_driver gx_suspmod_driver = {
436 .get = gx_get_cpuspeed,
437 .verify = cpufreq_gx_verify,
438 .target = cpufreq_gx_target,
439 .init = cpufreq_gx_cpu_init,
440 .name = "gx-suspmod",
441 .owner = THIS_MODULE,
442};
443
444static int __init cpufreq_gx_init(void)
445{
446 int ret;
447 struct gxfreq_params *params;
448 struct pci_dev *gx_pci;
449
450 /* Test if we have the right hardware */
451 if ((gx_pci = gx_detect_chipset()) == NULL)
452 return -ENODEV;
453
454 /* check whether module parameters are sane */
455 if (max_duration > 0xff)
456 max_duration = 0xff;
457
458 dprintk("geode suspend modulation available.\n");
459
460 params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
461 if (params == NULL)
462 return -ENOMEM;
463
464 params->cs55x0 = gx_pci;
465 gx_params = params;
466
467 /* keep cs55x0 configurations */
468 pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
469 pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
470 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
471 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
472 pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration));
473
474 if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) {
475 kfree(params);
476 return ret; /* register error! */
477 }
478
479 return 0;
480}
481
482static void __exit cpufreq_gx_exit(void)
483{
484 cpufreq_unregister_driver(&gx_suspmod_driver);
485 pci_dev_put(gx_params->cs55x0);
486 kfree(gx_params);
487}
488
489MODULE_AUTHOR ("Hiroshi Miura <miura@da-cha.org>");
490MODULE_DESCRIPTION ("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
491MODULE_LICENSE ("GPL");
492
493module_init(cpufreq_gx_init);
494module_exit(cpufreq_gx_exit);
495
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
new file mode 100644
index 000000000000..f0cce3c2dc3a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -0,0 +1,1024 @@
1/*
2 * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon datasheets & sample CPUs kindly provided by VIA.
7 *
 8 * VIA currently has 3 different versions of Longhaul.
9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
11 * Version 2 of longhaul is backward compatible with v1, but adds
12 * LONGHAUL MSR for purpose of both frequency and voltage scaling.
13 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
14 * Version 3 of longhaul got renamed to Powersaver and redesigned
15 * to use only the POWERSAVER MSR at 0x110a.
16 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
 17 * It is pretty much the same, feature-wise, as longhaul v2, though
 18 * there is provision for scaling the FSB too; this doesn't work
 19 * well in practice, so we don't even try to use it.
20 *
21 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
22 */
23
24#include <linux/kernel.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/init.h>
28#include <linux/cpufreq.h>
29#include <linux/pci.h>
30#include <linux/slab.h>
31#include <linux/string.h>
32#include <linux/delay.h>
33
34#include <asm/msr.h>
35#include <asm/timex.h>
36#include <asm/io.h>
37#include <asm/acpi.h>
38#include <linux/acpi.h>
39#include <acpi/processor.h>
40
41#include "longhaul.h"
42
43#define PFX "longhaul: "
44
45#define TYPE_LONGHAUL_V1 1
46#define TYPE_LONGHAUL_V2 2
47#define TYPE_POWERSAVER 3
48
49#define CPU_SAMUEL 1
50#define CPU_SAMUEL2 2
51#define CPU_EZRA 3
52#define CPU_EZRA_T 4
53#define CPU_NEHEMIAH 5
54#define CPU_NEHEMIAH_C 6
55
56/* Flags */
57#define USE_ACPI_C3 (1 << 1)
58#define USE_NORTHBRIDGE (1 << 2)
59
60static int cpu_model;
61static unsigned int numscales=16;
62static unsigned int fsb;
63
64static const struct mV_pos *vrm_mV_table;
65static const unsigned char *mV_vrm_table;
66
67static unsigned int highest_speed, lowest_speed; /* kHz */
68static unsigned int minmult, maxmult;
69static int can_scale_voltage;
70static struct acpi_processor *pr = NULL;
71static struct acpi_processor_cx *cx = NULL;
72static u32 acpi_regs_addr;
73static u8 longhaul_flags;
74static unsigned int longhaul_index;
75
76/* Module parameters */
77static int scale_voltage;
78static int disable_acpi_c3;
79static int revid_errata;
80
81#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
82
83
84/* Clock ratios multiplied by 10 */
85static int clock_ratio[32];
86static int eblcr_table[32];
87static int longhaul_version;
88static struct cpufreq_frequency_table *longhaul_table;
89
90#ifdef CONFIG_CPU_FREQ_DEBUG
91static char speedbuffer[8];
92
93static char *print_speed(int speed)
94{
95 if (speed < 1000) {
96 snprintf(speedbuffer, sizeof(speedbuffer),"%dMHz", speed);
97 return speedbuffer;
98 }
99
100 if (speed%1000 == 0)
101 snprintf(speedbuffer, sizeof(speedbuffer),
102 "%dGHz", speed/1000);
103 else
104 snprintf(speedbuffer, sizeof(speedbuffer),
105 "%d.%dGHz", speed/1000, (speed%1000)/100);
106
107 return speedbuffer;
108}
109#endif
110
111
112static unsigned int calc_speed(int mult)
113{
114 int khz;
115 khz = (mult/10)*fsb;
116 if (mult%10)
117 khz += fsb/2;
118 khz *= 1000;
119 return khz;
120}
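
Because the multiplier is stored times ten, calc_speed() turns, for example, mult = 65 (a 6.5x ratio) on a 133 MHz FSB into (65/10)*133 + 133/2 = 864, returned as 864000 kHz. A trivial stand-alone copy of that arithmetic, with an invented FSB value:

#include <stdio.h>

/* same arithmetic as calc_speed(): multiplier is stored * 10 */
static unsigned int calc_speed(int mult, int fsb_mhz)
{
	int khz = (mult / 10) * fsb_mhz;

	if (mult % 10)
		khz += fsb_mhz / 2;
	return khz * 1000;
}

int main(void)
{
	/* hypothetical 6.5x multiplier on a 133 MHz bus */
	printf("%u kHz\n", calc_speed(65, 133));
	return 0;
}
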
121
122
123static int longhaul_get_cpu_mult(void)
124{
125 unsigned long invalue=0,lo, hi;
126
127 rdmsr (MSR_IA32_EBL_CR_POWERON, lo, hi);
128 invalue = (lo & (1<<22|1<<23|1<<24|1<<25)) >>22;
129 if (longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) {
130 if (lo & (1<<27))
131 invalue+=16;
132 }
133 return eblcr_table[invalue];
134}
135
136/* For processor with BCR2 MSR */
137
138static void do_longhaul1(unsigned int clock_ratio_index)
139{
140 union msr_bcr2 bcr2;
141
142 rdmsrl(MSR_VIA_BCR2, bcr2.val);
143 /* Enable software clock multiplier */
144 bcr2.bits.ESOFTBF = 1;
145 bcr2.bits.CLOCKMUL = clock_ratio_index & 0xff;
146
147 /* Sync to timer tick */
148 safe_halt();
149 /* Change frequency on next halt or sleep */
150 wrmsrl(MSR_VIA_BCR2, bcr2.val);
151 /* Invoke transition */
152 ACPI_FLUSH_CPU_CACHE();
153 halt();
154
155 /* Disable software clock multiplier */
156 local_irq_disable();
157 rdmsrl(MSR_VIA_BCR2, bcr2.val);
158 bcr2.bits.ESOFTBF = 0;
159 wrmsrl(MSR_VIA_BCR2, bcr2.val);
160}
161
162/* For processor with Longhaul MSR */
163
164static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
165 unsigned int dir)
166{
167 union msr_longhaul longhaul;
168 u32 t;
169
170 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
171 /* Setup new frequency */
172 if (!revid_errata)
173 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
174 else
175 longhaul.bits.RevisionKey = 0;
176 longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf;
177 longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
178 /* Setup new voltage */
179 if (can_scale_voltage)
180 longhaul.bits.SoftVID = (clock_ratio_index >> 8) & 0x1f;
181 /* Sync to timer tick */
182 safe_halt();
183 /* Raise voltage if necessary */
184 if (can_scale_voltage && dir) {
185 longhaul.bits.EnableSoftVID = 1;
186 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
187 /* Change voltage */
188 if (!cx_address) {
189 ACPI_FLUSH_CPU_CACHE();
190 halt();
191 } else {
192 ACPI_FLUSH_CPU_CACHE();
193 /* Invoke C3 */
194 inb(cx_address);
195 /* Dummy op - must do something useless after P_LVL3
196 * read */
197 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
198 }
199 longhaul.bits.EnableSoftVID = 0;
200 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
201 }
202
203 /* Change frequency on next halt or sleep */
204 longhaul.bits.EnableSoftBusRatio = 1;
205 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
206 if (!cx_address) {
207 ACPI_FLUSH_CPU_CACHE();
208 halt();
209 } else {
210 ACPI_FLUSH_CPU_CACHE();
211 /* Invoke C3 */
212 inb(cx_address);
213 /* Dummy op - must do something useless after P_LVL3 read */
214 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
215 }
216 /* Disable bus ratio bit */
217 longhaul.bits.EnableSoftBusRatio = 0;
218 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
219
220 /* Reduce voltage if necessary */
221 if (can_scale_voltage && !dir) {
222 longhaul.bits.EnableSoftVID = 1;
223 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
224 /* Change voltage */
225 if (!cx_address) {
226 ACPI_FLUSH_CPU_CACHE();
227 halt();
228 } else {
229 ACPI_FLUSH_CPU_CACHE();
230 /* Invoke C3 */
231 inb(cx_address);
232 /* Dummy op - must do something useless after P_LVL3
233 * read */
234 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
235 }
236 longhaul.bits.EnableSoftVID = 0;
237 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
238 }
239}
240
241/**
242 * longhaul_set_cpu_frequency()
243 * @clock_ratio_index : bitpattern of the new multiplier.
244 *
245 * Sets a new clock ratio.
246 */
247
248static void longhaul_setstate(unsigned int table_index)
249{
250 unsigned int clock_ratio_index;
251 int speed, mult;
252 struct cpufreq_freqs freqs;
253 unsigned long flags;
254 unsigned int pic1_mask, pic2_mask;
255 u16 bm_status = 0;
256 u32 bm_timeout = 1000;
257 unsigned int dir = 0;
258
259 clock_ratio_index = longhaul_table[table_index].index;
260 /* Safety precautions */
261 mult = clock_ratio[clock_ratio_index & 0x1f];
262 if (mult == -1)
263 return;
264 speed = calc_speed(mult);
265 if ((speed > highest_speed) || (speed < lowest_speed))
266 return;
267 /* Voltage transition before frequency transition? */
268 if (can_scale_voltage && longhaul_index < table_index)
269 dir = 1;
270
271 freqs.old = calc_speed(longhaul_get_cpu_mult());
272 freqs.new = speed;
273 freqs.cpu = 0; /* longhaul.c is UP only driver */
274
275 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
276
277 dprintk ("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
278 fsb, mult/10, mult%10, print_speed(speed/1000));
279retry_loop:
280 preempt_disable();
281 local_irq_save(flags);
282
283 pic2_mask = inb(0xA1);
284 pic1_mask = inb(0x21); /* works on C3. save mask. */
285 outb(0xFF,0xA1); /* Overkill */
286 outb(0xFE,0x21); /* TMR0 only */
287
288 /* Wait while PCI bus is busy. */
289 if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
290 || ((pr != NULL) && pr->flags.bm_control))) {
291 bm_status = inw(acpi_regs_addr);
292 bm_status &= 1 << 4;
293 while (bm_status && bm_timeout) {
294 outw(1 << 4, acpi_regs_addr);
295 bm_timeout--;
296 bm_status = inw(acpi_regs_addr);
297 bm_status &= 1 << 4;
298 }
299 }
300
301 if (longhaul_flags & USE_NORTHBRIDGE) {
302 /* Disable AGP and PCI arbiters */
303 outb(3, 0x22);
304 } else if ((pr != NULL) && pr->flags.bm_control) {
305 /* Disable bus master arbitration */
306 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
307 }
308 switch (longhaul_version) {
309
310 /*
311 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
312 * Software controlled multipliers only.
313 */
314 case TYPE_LONGHAUL_V1:
315 do_longhaul1(clock_ratio_index);
316 break;
317
318 /*
319 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
320 *
321 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
322 * Nehemiah can do FSB scaling too, but this has never been proven
323 * to work in practice.
324 */
325 case TYPE_LONGHAUL_V2:
326 case TYPE_POWERSAVER:
327 if (longhaul_flags & USE_ACPI_C3) {
328 /* Don't allow wakeup */
329 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
330 do_powersaver(cx->address, clock_ratio_index, dir);
331 } else {
332 do_powersaver(0, clock_ratio_index, dir);
333 }
334 break;
335 }
336
337 if (longhaul_flags & USE_NORTHBRIDGE) {
338 /* Enable arbiters */
339 outb(0, 0x22);
340 } else if ((pr != NULL) && pr->flags.bm_control) {
341 /* Enable bus master arbitration */
342 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
343 }
344 outb(pic2_mask,0xA1); /* restore mask */
345 outb(pic1_mask,0x21);
346
347 local_irq_restore(flags);
348 preempt_enable();
349
350 freqs.new = calc_speed(longhaul_get_cpu_mult());
351 /* Check if requested frequency is set. */
352 if (unlikely(freqs.new != speed)) {
353 printk(KERN_INFO PFX "Failed to set requested frequency!\n");
354 /* Revision ID = 1 but processor is expecting revision key
355 * equal to 0. Jumpers at the bottom of processor will change
356 * multiplier and FSB, but will not change bits in Longhaul
357 * MSR nor enable voltage scaling. */
358 if (!revid_errata) {
359 printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
360 "option.\n");
361 revid_errata = 1;
362 msleep(200);
363 goto retry_loop;
364 }
365 /* Why ACPI C3 sometimes doesn't work is a mystery to me, but it
366 * does happen: the processor enters the ACPI C3 state yet the
367 * frequency doesn't change. I tried poking various bits in the
368 * northbridge registers, but without success. */
369 if (longhaul_flags & USE_ACPI_C3) {
370 printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
371 longhaul_flags &= ~USE_ACPI_C3;
372 if (revid_errata) {
373 printk(KERN_INFO PFX "Disabling \"Ignore "
374 "Revision ID\" option.\n");
375 revid_errata = 0;
376 }
377 msleep(200);
378 goto retry_loop;
379 }
380 /* This shouldn't happen. Longhaul ver. 2 was reported not
381 * working on processors without voltage scaling, but with
382 * RevID = 1. The RevID errata handling should make things
383 * right; this is just to be 100% sure. */
384 if (longhaul_version == TYPE_LONGHAUL_V2) {
385 printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
386 longhaul_version = TYPE_LONGHAUL_V1;
387 msleep(200);
388 goto retry_loop;
389 }
390 }
391 /* Report true CPU frequency */
392 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
393
394 if (!bm_timeout)
395 printk(KERN_INFO PFX "Warning: Timeout while waiting for idle PCI bus.\n");
396}
397
398/*
399 * Centaur decided to make life a little more tricky.
400 * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
401 * Samuel2 and above have to try and guess what the FSB is.
402 * We do this by assuming we booted at maximum multiplier, and interpolate
403 * between that value multiplied by possible FSBs and cpu_mhz which
404 * was calculated at boot time. Really ugly, but no other way to do this.
405 */
406
407#define ROUNDING 0xf
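/*
 * Editorial note, purely illustrative (values assumed, not from the original
 * source): for a C3 measured at cpu_khz ~= 1000000 (speed = 1000 MHz) running
 * at a 7.5x multiplier (mult = 75), guess_fsb() below tries each candidate:
 *   speeds[2] = 1333:  f_max = (1333 * 75 + 50) / 100 + 15 / 2 = 1007,
 *                      f_min = 1007 - 15 = 992,
 * and 992 <= 1000 <= 1007, so it returns 1333 / 10 = 133 (MHz FSB).
 */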
408
409static int guess_fsb(int mult)
410{
411 int speed = cpu_khz / 1000;
412 int i;
413 int speeds[] = { 666, 1000, 1333, 2000 };
414 int f_max, f_min;
415
416 for (i = 0; i < 4; i++) {
417 f_max = ((speeds[i] * mult) + 50) / 100;
418 f_max += (ROUNDING / 2);
419 f_min = f_max - ROUNDING;
420 if ((speed <= f_max) && (speed >= f_min))
421 return speeds[i] / 10;
422 }
423 return 0;
424}
425
426
427static int __init longhaul_get_ranges(void)
428{
429 unsigned int i, j, k = 0;
430 unsigned int ratio;
431 int mult;
432
433 /* Get current frequency */
434 mult = longhaul_get_cpu_mult();
435 if (mult == -1) {
436 printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
437 return -EINVAL;
438 }
439 fsb = guess_fsb(mult);
440 if (fsb == 0) {
441 printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
442 return -EINVAL;
443 }
444 /* Get max multiplier - as we always did.
445 * Longhaul MSR is useful only when voltage scaling is enabled.
446 * C3 is booting at max anyway. */
447 maxmult = mult;
448 /* Get min multiplier */
449 switch (cpu_model) {
450 case CPU_NEHEMIAH:
451 minmult = 50;
452 break;
453 case CPU_NEHEMIAH_C:
454 minmult = 40;
455 break;
456 default:
457 minmult = 30;
458 break;
459 }
460
461 dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n",
462 minmult/10, minmult%10, maxmult/10, maxmult%10);
463
464 highest_speed = calc_speed(maxmult);
465 lowest_speed = calc_speed(minmult);
466 dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
467 print_speed(lowest_speed/1000),
468 print_speed(highest_speed/1000));
469
470 if (lowest_speed == highest_speed) {
471 printk (KERN_INFO PFX "highestspeed == lowest, aborting.\n");
472 return -EINVAL;
473 }
474 if (lowest_speed > highest_speed) {
475 printk (KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
476 lowest_speed, highest_speed);
477 return -EINVAL;
478 }
479
480 longhaul_table = kmalloc((numscales + 1) * sizeof(struct cpufreq_frequency_table), GFP_KERNEL);
481 if(!longhaul_table)
482 return -ENOMEM;
483
484 for (j = 0; j < numscales; j++) {
485 ratio = clock_ratio[j];
486 if (ratio == -1)
487 continue;
488 if (ratio > maxmult || ratio < minmult)
489 continue;
490 longhaul_table[k].frequency = calc_speed(ratio);
491 longhaul_table[k].index = j;
492 k++;
493 }
494 if (k <= 1) {
495 kfree(longhaul_table);
496 return -ENODEV;
497 }
498 /* Sort */
499 for (j = 0; j < k - 1; j++) {
500 unsigned int min_f, min_i;
501 min_f = longhaul_table[j].frequency;
502 min_i = j;
503 for (i = j + 1; i < k; i++) {
504 if (longhaul_table[i].frequency < min_f) {
505 min_f = longhaul_table[i].frequency;
506 min_i = i;
507 }
508 }
509 if (min_i != j) {
510 unsigned int temp;
511 temp = longhaul_table[j].frequency;
512 longhaul_table[j].frequency = longhaul_table[min_i].frequency;
513 longhaul_table[min_i].frequency = temp;
514 temp = longhaul_table[j].index;
515 longhaul_table[j].index = longhaul_table[min_i].index;
516 longhaul_table[min_i].index = temp;
517 }
518 }
519
520 longhaul_table[k].frequency = CPUFREQ_TABLE_END;
521
522 /* Find index we are running on */
523 for (j = 0; j < k; j++) {
524 if (clock_ratio[longhaul_table[j].index & 0x1f] == mult) {
525 longhaul_index = j;
526 break;
527 }
528 }
529 return 0;
530}
531
532
533static void __init longhaul_setup_voltagescaling(void)
534{
535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid;
537 unsigned int j, speed, pos, kHz_step, numvscales;
538 int min_vid_speed;
539
540 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
541 if (!(longhaul.bits.RevisionID & 1)) {
542 printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
543 return;
544 }
545
546 if (!longhaul.bits.VRMRev) {
547 printk(KERN_INFO PFX "VRM 8.5\n");
548 vrm_mV_table = &vrm85_mV[0];
549 mV_vrm_table = &mV_vrm85[0];
550 } else {
551 printk(KERN_INFO PFX "Mobile VRM\n");
552 if (cpu_model < CPU_NEHEMIAH)
553 return;
554 vrm_mV_table = &mobilevrm_mV[0];
555 mV_vrm_table = &mV_mobilevrm[0];
556 }
557
558 minvid = vrm_mV_table[longhaul.bits.MinimumVID];
559 maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
560
561 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
562 printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
563 "Voltage scaling disabled.\n",
564 minvid.mV/1000, minvid.mV%1000, maxvid.mV/1000, maxvid.mV%1000);
565 return;
566 }
567
568 if (minvid.mV == maxvid.mV) {
569 printk (KERN_INFO PFX "Claims to support voltage scaling but min & max are "
570 "both %d.%03d. Voltage scaling disabled\n",
571 maxvid.mV/1000, maxvid.mV%1000);
572 return;
573 }
574
575 /* How many voltage steps */
576 numvscales = maxvid.pos - minvid.pos + 1;
577 printk(KERN_INFO PFX
578 "Max VID=%d.%03d "
579 "Min VID=%d.%03d, "
580 "%d possible voltage scales\n",
581 maxvid.mV/1000, maxvid.mV%1000,
582 minvid.mV/1000, minvid.mV%1000,
583 numvscales);
584
585 /* Calculate max frequency at min voltage */
586 j = longhaul.bits.MinMHzBR;
587 if (longhaul.bits.MinMHzBR4)
588 j += 16;
589 min_vid_speed = eblcr_table[j];
590 if (min_vid_speed == -1)
591 return;
592 switch (longhaul.bits.MinMHzFSB) {
593 case 0:
594 min_vid_speed *= 13333;
595 break;
596 case 1:
597 min_vid_speed *= 10000;
598 break;
599 case 3:
600 min_vid_speed *= 6666;
601 break;
602 default:
603 return;
604 break;
605 }
606 if (min_vid_speed >= highest_speed)
607 return;
608 /* Calculate kHz for one voltage step */
609 kHz_step = (highest_speed - min_vid_speed) / numvscales;
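 /*
  * Illustrative example (assumed numbers, not from the original code):
  * with highest_speed = 1000000 kHz, min_vid_speed = 733000 kHz and
  * numvscales = 9, kHz_step is ~29666 kHz; the loop below then maps an
  * 866000 kHz entry to pos = (866000 - 733000) / 29666 + minvid.pos,
  * i.e. four VID positions above the minimum voltage.
  */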
610
611 j = 0;
612 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
613 speed = longhaul_table[j].frequency;
614 if (speed > min_vid_speed)
615 pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
616 else
617 pos = minvid.pos;
618 longhaul_table[j].index |= mV_vrm_table[pos] << 8;
619 vid = vrm_mV_table[mV_vrm_table[pos]];
620 printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", speed, j, vid.mV);
621 j++;
622 }
623
624 can_scale_voltage = 1;
625 printk(KERN_INFO PFX "Voltage scaling enabled.\n");
626}
627
628
629static int longhaul_verify(struct cpufreq_policy *policy)
630{
631 return cpufreq_frequency_table_verify(policy, longhaul_table);
632}
633
634
635static int longhaul_target(struct cpufreq_policy *policy,
636 unsigned int target_freq, unsigned int relation)
637{
638 unsigned int table_index = 0;
639 unsigned int i;
640 unsigned int dir = 0;
641 u8 vid, current_vid;
642
643 if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, relation, &table_index))
644 return -EINVAL;
645
646 /* Don't set same frequency again */
647 if (longhaul_index == table_index)
648 return 0;
649
650 if (!can_scale_voltage)
651 longhaul_setstate(table_index);
652 else {
653 /* On the test system, voltage transitions exceeding a single
654 * step up or down were turning the motherboard off. Both
655 * "ondemand" and "userspace" governors are unsafe here. The C7
656 * does this stepping in hardware; the C3 is older, so we have
657 * to do it in software, one voltage step at a time. */
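 /* Illustrative walk-through (assumed indices): moving up from
  * longhaul_index = 2 to table_index = 5, the loop below visits
  * i = 2, 3, 4 and, whenever an entry's VID differs from the current
  * one, calls longhaul_setstate(i) and sleeps 200 ms before moving on;
  * the final longhaul_setstate(table_index) completes the transition. */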
658 i = longhaul_index;
659 current_vid = (longhaul_table[longhaul_index].index >> 8) & 0x1f;
660 if (table_index > longhaul_index)
661 dir = 1;
662 while (i != table_index) {
663 vid = (longhaul_table[i].index >> 8) & 0x1f;
664 if (vid != current_vid) {
665 longhaul_setstate(i);
666 current_vid = vid;
667 msleep(200);
668 }
669 if (dir)
670 i++;
671 else
672 i--;
673 }
674 longhaul_setstate(table_index);
675 }
676 longhaul_index = table_index;
677 return 0;
678}
679
680
681static unsigned int longhaul_get(unsigned int cpu)
682{
683 if (cpu)
684 return 0;
685 return calc_speed(longhaul_get_cpu_mult());
686}
687
688static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
689 u32 nesting_level,
690 void *context, void **return_value)
691{
692 struct acpi_device *d;
693
694 if ( acpi_bus_get_device(obj_handle, &d) ) {
695 return 0;
696 }
697 *return_value = (void *)acpi_driver_data(d);
698 return 1;
699}
700
701/* VIA doesn't support the PM2 register, but has something similar */
702static int enable_arbiter_disable(void)
703{
704 struct pci_dev *dev;
705 int status = 1;
706 int reg;
707 u8 pci_cmd;
708
709 /* Find PLE133 host bridge */
710 reg = 0x78;
711 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
712 NULL);
713 /* Find CLE266 host bridge */
714 if (dev == NULL) {
715 reg = 0x76;
716 dev = pci_get_device(PCI_VENDOR_ID_VIA,
717 PCI_DEVICE_ID_VIA_862X_0, NULL);
718 /* Find CN400 V-Link host bridge */
719 if (dev == NULL)
720 dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
721 }
722 if (dev != NULL) {
723 /* Enable access to port 0x22 */
724 pci_read_config_byte(dev, reg, &pci_cmd);
725 if (!(pci_cmd & 1<<7)) {
726 pci_cmd |= 1<<7;
727 pci_write_config_byte(dev, reg, pci_cmd);
728 pci_read_config_byte(dev, reg, &pci_cmd);
729 if (!(pci_cmd & 1<<7)) {
730 printk(KERN_ERR PFX
731 "Can't enable access to port 0x22.\n");
732 status = 0;
733 }
734 }
735 pci_dev_put(dev);
736 return status;
737 }
738 return 0;
739}
740
741static int longhaul_setup_southbridge(void)
742{
743 struct pci_dev *dev;
744 u8 pci_cmd;
745
746 /* Find VT8235 southbridge */
747 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
748 if (dev == NULL)
749 /* Find VT8237 southbridge */
750 dev = pci_get_device(PCI_VENDOR_ID_VIA,
751 PCI_DEVICE_ID_VIA_8237, NULL);
752 if (dev != NULL) {
753 /* Set transition time to max */
754 pci_read_config_byte(dev, 0xec, &pci_cmd);
755 pci_cmd &= ~(1 << 2);
756 pci_write_config_byte(dev, 0xec, pci_cmd);
757 pci_read_config_byte(dev, 0xe4, &pci_cmd);
758 pci_cmd &= ~(1 << 7);
759 pci_write_config_byte(dev, 0xe4, pci_cmd);
760 pci_read_config_byte(dev, 0xe5, &pci_cmd);
761 pci_cmd |= 1 << 7;
762 pci_write_config_byte(dev, 0xe5, pci_cmd);
763 /* Get address of ACPI registers block*/
764 pci_read_config_byte(dev, 0x81, &pci_cmd);
765 if (pci_cmd & 1 << 7) {
766 pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
767 acpi_regs_addr &= 0xff00;
768 printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", acpi_regs_addr);
769 }
770
771 pci_dev_put(dev);
772 return 1;
773 }
774 return 0;
775}
776
777static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
778{
779 struct cpuinfo_x86 *c = cpu_data;
780 char *cpuname=NULL;
781 int ret;
782 u32 lo, hi;
783
784 /* Check what we have on this motherboard */
785 switch (c->x86_model) {
786 case 6:
787 cpu_model = CPU_SAMUEL;
788 cpuname = "C3 'Samuel' [C5A]";
789 longhaul_version = TYPE_LONGHAUL_V1;
790 memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio));
791 memcpy (eblcr_table, samuel1_eblcr, sizeof(samuel1_eblcr));
792 break;
793
794 case 7:
795 switch (c->x86_mask) {
796 case 0:
797 longhaul_version = TYPE_LONGHAUL_V1;
798 cpu_model = CPU_SAMUEL2;
799 cpuname = "C3 'Samuel 2' [C5B]";
800 /* Note, this is not a typo, early Samuel2's had
801 * Samuel1 ratios. */
802 memcpy(clock_ratio, samuel1_clock_ratio,
803 sizeof(samuel1_clock_ratio));
804 memcpy(eblcr_table, samuel2_eblcr,
805 sizeof(samuel2_eblcr));
806 break;
807 case 1 ... 15:
808 longhaul_version = TYPE_LONGHAUL_V1;
809 if (c->x86_mask < 8) {
810 cpu_model = CPU_SAMUEL2;
811 cpuname = "C3 'Samuel 2' [C5B]";
812 } else {
813 cpu_model = CPU_EZRA;
814 cpuname = "C3 'Ezra' [C5C]";
815 }
816 memcpy(clock_ratio, ezra_clock_ratio,
817 sizeof(ezra_clock_ratio));
818 memcpy(eblcr_table, ezra_eblcr,
819 sizeof(ezra_eblcr));
820 break;
821 }
822 break;
823
824 case 8:
825 cpu_model = CPU_EZRA_T;
826 cpuname = "C3 'Ezra-T' [C5M]";
827 longhaul_version = TYPE_POWERSAVER;
828 numscales=32;
829 memcpy (clock_ratio, ezrat_clock_ratio, sizeof(ezrat_clock_ratio));
830 memcpy (eblcr_table, ezrat_eblcr, sizeof(ezrat_eblcr));
831 break;
832
833 case 9:
834 longhaul_version = TYPE_POWERSAVER;
835 numscales = 32;
836 memcpy(clock_ratio,
837 nehemiah_clock_ratio,
838 sizeof(nehemiah_clock_ratio));
839 memcpy(eblcr_table, nehemiah_eblcr, sizeof(nehemiah_eblcr));
840 switch (c->x86_mask) {
841 case 0 ... 1:
842 cpu_model = CPU_NEHEMIAH;
843 cpuname = "C3 'Nehemiah A' [C5XLOE]";
844 break;
845 case 2 ... 4:
846 cpu_model = CPU_NEHEMIAH;
847 cpuname = "C3 'Nehemiah B' [C5XLOH]";
848 break;
849 case 5 ... 15:
850 cpu_model = CPU_NEHEMIAH_C;
851 cpuname = "C3 'Nehemiah C' [C5P]";
852 break;
853 }
854 break;
855
856 default:
857 cpuname = "Unknown";
858 break;
859 }
860 /* Check Longhaul ver. 2 */
861 if (longhaul_version == TYPE_LONGHAUL_V2) {
862 rdmsr(MSR_VIA_LONGHAUL, lo, hi);
863 if (lo == 0 && hi == 0)
864 /* Looks like MSR isn't present */
865 longhaul_version = TYPE_LONGHAUL_V1;
866 }
867
868 printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
869 switch (longhaul_version) {
870 case TYPE_LONGHAUL_V1:
871 case TYPE_LONGHAUL_V2:
872 printk ("Longhaul v%d supported.\n", longhaul_version);
873 break;
874 case TYPE_POWERSAVER:
875 printk ("Powersaver supported.\n");
876 break;
877 };
878
879 /* Doesn't hurt */
880 longhaul_setup_southbridge();
881
882 /* Find ACPI data for processor */
883 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
884 ACPI_UINT32_MAX, &longhaul_walk_callback,
885 NULL, (void *)&pr);
886
887 /* Check ACPI support for C3 state */
888 if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
889 cx = &pr->power.states[ACPI_STATE_C3];
890 if (cx->address > 0 && cx->latency <= 1000)
891 longhaul_flags |= USE_ACPI_C3;
892 }
893 /* Disable if it isn't working */
894 if (disable_acpi_c3)
895 longhaul_flags &= ~USE_ACPI_C3;
896 /* Check if northbridge is friendly */
897 if (enable_arbiter_disable())
898 longhaul_flags |= USE_NORTHBRIDGE;
899
900 /* Check ACPI support for bus master arbiter disable */
901 if (!(longhaul_flags & USE_ACPI_C3
902 || longhaul_flags & USE_NORTHBRIDGE)
903 && ((pr == NULL) || !(pr->flags.bm_control))) {
904 printk(KERN_ERR PFX
905 "No ACPI support. Unsupported northbridge.\n");
906 return -ENODEV;
907 }
908
909 if (longhaul_flags & USE_NORTHBRIDGE)
910 printk(KERN_INFO PFX "Using northbridge support.\n");
911 if (longhaul_flags & USE_ACPI_C3)
912 printk(KERN_INFO PFX "Using ACPI support.\n");
913
914 ret = longhaul_get_ranges();
915 if (ret != 0)
916 return ret;
917
918 if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
919 longhaul_setup_voltagescaling();
920
921 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
922 policy->cpuinfo.transition_latency = 200000; /* nsec */
923 policy->cur = calc_speed(longhaul_get_cpu_mult());
924
925 ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
926 if (ret)
927 return ret;
928
929 cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
930
931 return 0;
932}
933
934static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
935{
936 cpufreq_frequency_table_put_attr(policy->cpu);
937 return 0;
938}
939
940static struct freq_attr* longhaul_attr[] = {
941 &cpufreq_freq_attr_scaling_available_freqs,
942 NULL,
943};
944
945static struct cpufreq_driver longhaul_driver = {
946 .verify = longhaul_verify,
947 .target = longhaul_target,
948 .get = longhaul_get,
949 .init = longhaul_cpu_init,
950 .exit = __devexit_p(longhaul_cpu_exit),
951 .name = "longhaul",
952 .owner = THIS_MODULE,
953 .attr = longhaul_attr,
954};
955
956
957static int __init longhaul_init(void)
958{
959 struct cpuinfo_x86 *c = cpu_data;
960
961 if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
962 return -ENODEV;
963
964#ifdef CONFIG_SMP
965 if (num_online_cpus() > 1) {
966 printk(KERN_ERR PFX "More than 1 CPU detected, longhaul disabled.\n");
967 return -ENODEV;
968 }
969#endif
970#ifdef CONFIG_X86_IO_APIC
971 if (cpu_has_apic) {
972 printk(KERN_ERR PFX "APIC detected. Longhaul is currently broken in this configuration.\n");
973 return -ENODEV;
974 }
975#endif
976 switch (c->x86_model) {
977 case 6 ... 9:
978 return cpufreq_register_driver(&longhaul_driver);
979 case 10:
980 printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
981 default:
982 break;
983 }
984
985 return -ENODEV;
986}
987
988
989static void __exit longhaul_exit(void)
990{
991 int i;
992
993 for (i=0; i < numscales; i++) {
994 if (clock_ratio[i] == maxmult) {
995 longhaul_setstate(i);
996 break;
997 }
998 }
999
1000 cpufreq_unregister_driver(&longhaul_driver);
1001 kfree(longhaul_table);
1002}
1003
1004/* Even if the BIOS exports an ACPI C3 state, and it is used
1005 * successfully when the CPU is idle, this state doesn't
1006 * trigger a frequency transition in some cases. */
1007module_param (disable_acpi_c3, int, 0644);
1008MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1009/* Change CPU voltage along with frequency. Very useful for saving
1010 * power, but most VIA C3 processors don't support it. */
1011module_param (scale_voltage, int, 0644);
1012MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1013/* Force revision key to 0 for processors which don't
1014 * support voltage scaling but advertise themselves as if
1015 * they did. */
1016module_param(revid_errata, int, 0644);
1017MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1018
1019MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
1020MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
1021MODULE_LICENSE ("GPL");
1022
1023late_initcall(longhaul_init);
1024module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
new file mode 100644
index 000000000000..4fcc320997df
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -0,0 +1,353 @@
1/*
2 * longhaul.h
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * VIA-specific information
8 */
9
10union msr_bcr2 {
11 struct {
12 unsigned Reseved:19, // 18:0
13 ESOFTBF:1, // 19
14 Reserved2:3, // 22:20
15 CLOCKMUL:4, // 26:23
16 Reserved3:5; // 31:27
17 } bits;
18 unsigned long val;
19};
20
21union msr_longhaul {
22 struct {
23 unsigned RevisionID:4, // 3:0
24 RevisionKey:4, // 7:4
25 EnableSoftBusRatio:1, // 8
26 EnableSoftVID:1, // 9
27 EnableSoftBSEL:1, // 10
28 Reserved:3, // 13:11
29 SoftBusRatio4:1, // 14
30 VRMRev:1, // 15
31 SoftBusRatio:4, // 19:16
32 SoftVID:5, // 24:20
33 Reserved2:3, // 27:25
34 SoftBSEL:2, // 29:28
35 Reserved3:2, // 31:30
36 MaxMHzBR:4, // 35:32
37 MaximumVID:5, // 40:36
38 MaxMHzFSB:2, // 42:41
39 MaxMHzBR4:1, // 43
40 Reserved4:4, // 47:44
41 MinMHzBR:4, // 51:48
42 MinimumVID:5, // 56:52
43 MinMHzFSB:2, // 58:57
44 MinMHzBR4:1, // 59
45 Reserved5:4; // 63:60
46 } bits;
47 unsigned long long val;
48};
49
50/*
51 * Clock ratio tables. Div/Mod by 10 to get ratio.
52 * The eblcr ones specify the ratio read from the CPU.
53 * The clock_ratio ones specify what to write to the CPU.
54 */
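/*
 * For example (illustrative, assuming a 133 MHz front-side bus): an entry of
 * 65 encodes a 6.5x multiplier (65 / 10 = 6, 65 % 10 = 5), which corresponds
 * to roughly 6.5 * 133 = 866 MHz; entries of -1 mark reserved encodings.
 */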
55
56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */
59static const int __initdata samuel1_clock_ratio[16] = {
60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */
63 -1, /* 0011 -> RESERVED */
64 -1, /* 0100 -> RESERVED */
65 35, /* 0101 -> 3.5x */
66 45, /* 0110 -> 4.5x */
67 55, /* 0111 -> 5.5x */
68 60, /* 1000 -> 6.0x */
69 70, /* 1001 -> 7.0x */
70 80, /* 1010 -> 8.0x */
71 50, /* 1011 -> 5.0x */
72 65, /* 1100 -> 6.5x */
73 75, /* 1101 -> 7.5x */
74 -1, /* 1110 -> RESERVED */
75 -1, /* 1111 -> RESERVED */
76};
77
78static const int __initdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */
82 -1, /* 0011 -> RESERVED */
83 55, /* 0100 -> 5.5x */
84 35, /* 0101 -> 3.5x */
85 45, /* 0110 -> 4.5x */
86 -1, /* 0111 -> RESERVED */
87 -1, /* 1000 -> RESERVED */
88 70, /* 1001 -> 7.0x */
89 80, /* 1010 -> 8.0x */
90 60, /* 1011 -> 6.0x */
91 -1, /* 1100 -> RESERVED */
92 75, /* 1101 -> 7.5x */
93 -1, /* 1110 -> RESERVED */
94 65, /* 1111 -> 6.5x */
95};
96
97/*
98 * VIA C3 Samuel2 Stepping 1->15
99 */
100static const int __initdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */
104 100, /* 0011 -> 10.0x */
105 55, /* 0100 -> 5.5x */
106 35, /* 0101 -> 3.5x */
107 45, /* 0110 -> 4.5x */
108 110, /* 0111 -> 11.0x */
109 90, /* 1000 -> 9.0x */
110 70, /* 1001 -> 7.0x */
111 80, /* 1010 -> 8.0x */
112 60, /* 1011 -> 6.0x */
113 120, /* 1100 -> 12.0x */
114 75, /* 1101 -> 7.5x */
115 130, /* 1110 -> 13.0x */
116 65, /* 1111 -> 6.5x */
117};
118
119/*
120 * VIA C3 Ezra
121 */
122static const int __initdata ezra_clock_ratio[16] = {
123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */
126 90, /* 0011 -> 9.0x */
127 95, /* 0100 -> 9.5x */
128 35, /* 0101 -> 3.5x */
129 45, /* 0110 -> 4.5x */
130 55, /* 0111 -> 5.5x */
131 60, /* 1000 -> 6.0x */
132 70, /* 1001 -> 7.0x */
133 80, /* 1010 -> 8.0x */
134 50, /* 1011 -> 5.0x */
135 65, /* 1100 -> 6.5x */
136 75, /* 1101 -> 7.5x */
137 85, /* 1110 -> 8.5x */
138 120, /* 1111 -> 12.0x */
139};
140
141static const int __initdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */
145 100, /* 0011 -> 10.0x */
146 55, /* 0100 -> 5.5x */
147 35, /* 0101 -> 3.5x */
148 45, /* 0110 -> 4.5x */
149 95, /* 0111 -> 9.5x */
150 90, /* 1000 -> 9.0x */
151 70, /* 1001 -> 7.0x */
152 80, /* 1010 -> 8.0x */
153 60, /* 1011 -> 6.0x */
154 120, /* 1100 -> 12.0x */
155 75, /* 1101 -> 7.5x */
156 85, /* 1110 -> 8.5x */
157 65, /* 1111 -> 6.5x */
158};
159
160/*
161 * VIA C3 (Ezra-T) [C5M].
162 */
163static const int __initdata ezrat_clock_ratio[32] = {
164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */
167 90, /* 0011 -> 9.0x */
168 95, /* 0100 -> 9.5x */
169 35, /* 0101 -> 3.5x */
170 45, /* 0110 -> 4.5x */
171 55, /* 0111 -> 5.5x */
172 60, /* 1000 -> 6.0x */
173 70, /* 1001 -> 7.0x */
174 80, /* 1010 -> 8.0x */
175 50, /* 1011 -> 5.0x */
176 65, /* 1100 -> 6.5x */
177 75, /* 1101 -> 7.5x */
178 85, /* 1110 -> 8.5x */
179 120, /* 1111 -> 12.0x */
180
181 -1, /* 0000 -> RESERVED (10.0x) */
182 110, /* 0001 -> 11.0x */
183 -1, /* 0010 -> 12.0x */
184 -1, /* 0011 -> RESERVED (9.0x)*/
185 105, /* 0100 -> 10.5x */
186 115, /* 0101 -> 11.5x */
187 125, /* 0110 -> 12.5x */
188 135, /* 0111 -> 13.5x */
189 140, /* 1000 -> 14.0x */
190 150, /* 1001 -> 15.0x */
191 160, /* 1010 -> 16.0x */
192 130, /* 1011 -> 13.0x */
193 145, /* 1100 -> 14.5x */
194 155, /* 1101 -> 15.5x */
195 -1, /* 1110 -> RESERVED (13.0x) */
196 -1, /* 1111 -> RESERVED (12.0x) */
197};
198
199static const int __initdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */
203 100, /* 0011 -> 10.0x */
204 55, /* 0100 -> 5.5x */
205 35, /* 0101 -> 3.5x */
206 45, /* 0110 -> 4.5x */
207 95, /* 0111 -> 9.5x */
208 90, /* 1000 -> 9.0x */
209 70, /* 1001 -> 7.0x */
210 80, /* 1010 -> 8.0x */
211 60, /* 1011 -> 6.0x */
212 120, /* 1100 -> 12.0x */
213 75, /* 1101 -> 7.5x */
214 85, /* 1110 -> 8.5x */
215 65, /* 1111 -> 6.5x */
216
217 -1, /* 0000 -> RESERVED (9.0x) */
218 110, /* 0001 -> 11.0x */
219 120, /* 0010 -> 12.0x */
220 -1, /* 0011 -> RESERVED (10.0x)*/
221 135, /* 0100 -> 13.5x */
222 115, /* 0101 -> 11.5x */
223 125, /* 0110 -> 12.5x */
224 105, /* 0111 -> 10.5x */
225 130, /* 1000 -> 13.0x */
226 150, /* 1001 -> 15.0x */
227 160, /* 1010 -> 16.0x */
228 140, /* 1011 -> 14.0x */
229 -1, /* 1100 -> RESERVED (12.0x) */
230 155, /* 1101 -> 15.5x */
231 -1, /* 1110 -> RESERVED (13.0x) */
232 145, /* 1111 -> 14.5x */
233};
234
235/*
236 * VIA C3 Nehemiah
237 */
238static const int __initdata nehemiah_clock_ratio[32] = {
239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */
242 90, /* 0011 -> 9.0x */
243 95, /* 0100 -> 9.5x */
244 -1, /* 0101 -> RESERVED */
245 45, /* 0110 -> 4.5x */
246 55, /* 0111 -> 5.5x */
247 60, /* 1000 -> 6.0x */
248 70, /* 1001 -> 7.0x */
249 80, /* 1010 -> 8.0x */
250 50, /* 1011 -> 5.0x */
251 65, /* 1100 -> 6.5x */
252 75, /* 1101 -> 7.5x */
253 85, /* 1110 -> 8.5x */
254 120, /* 1111 -> 12.0x */
255 -1, /* 0000 -> 10.0x */
256 110, /* 0001 -> 11.0x */
257 -1, /* 0010 -> 12.0x */
258 -1, /* 0011 -> 9.0x */
259 105, /* 0100 -> 10.5x */
260 115, /* 0101 -> 11.5x */
261 125, /* 0110 -> 12.5x */
262 135, /* 0111 -> 13.5x */
263 140, /* 1000 -> 14.0x */
264 150, /* 1001 -> 15.0x */
265 160, /* 1010 -> 16.0x */
266 130, /* 1011 -> 13.0x */
267 145, /* 1100 -> 14.5x */
268 155, /* 1101 -> 15.5x */
269 -1, /* 1110 -> RESERVED (13.0x) */
270 -1, /* 1111 -> 12.0x */
271};
272
273static const int __initdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */
277 100, /* 0011 -> 10.0x */
278 55, /* 0100 -> 5.5x */
279 -1, /* 0101 -> RESERVED */
280 45, /* 0110 -> 4.5x */
281 95, /* 0111 -> 9.5x */
282 90, /* 1000 -> 9.0x */
283 70, /* 1001 -> 7.0x */
284 80, /* 1010 -> 8.0x */
285 60, /* 1011 -> 6.0x */
286 120, /* 1100 -> 12.0x */
287 75, /* 1101 -> 7.5x */
288 85, /* 1110 -> 8.5x */
289 65, /* 1111 -> 6.5x */
290 90, /* 0000 -> 9.0x */
291 110, /* 0001 -> 11.0x */
292 120, /* 0010 -> 12.0x */
293 100, /* 0011 -> 10.0x */
294 135, /* 0100 -> 13.5x */
295 115, /* 0101 -> 11.5x */
296 125, /* 0110 -> 12.5x */
297 105, /* 0111 -> 10.5x */
298 130, /* 1000 -> 13.0x */
299 150, /* 1001 -> 15.0x */
300 160, /* 1010 -> 16.0x */
301 140, /* 1011 -> 14.0x */
302 120, /* 1100 -> 12.0x */
303 155, /* 1101 -> 15.5x */
304 -1, /* 1110 -> RESERVED (13.0x) */
305 145 /* 1111 -> 14.5x */
306};
307
308/*
309 * Voltage scales. Div/Mod by 1000 to get actual voltage.
310 * Which scale to use depends on the VRM type in use.
311 */
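/*
 * For example (illustrative): the vrm85_mV[] entry {1250, 8} means
 * 1250 / 1000 = 1.250 V at position 8 of the VRM 8.5 scale, while the
 * mV_vrm85[] array below maps each position back to its 5-bit VID code.
 */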
312
313struct mV_pos {
314 unsigned short mV;
315 unsigned short pos;
316};
317
318static const struct mV_pos __initdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
322 {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
323 {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
324 {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
325 {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327};
328
329static const unsigned char __initdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334};
335
336static const struct mV_pos __initdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
340 {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
341 {975, 15}, {950, 14}, {925, 13}, {900, 12},
342 {875, 11}, {850, 10}, {825, 9}, {800, 8},
343 {775, 7}, {750, 6}, {725, 5}, {700, 4},
344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345};
346
347static const unsigned char __initdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
351 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
352};
353
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
new file mode 100644
index 000000000000..b2689514295a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -0,0 +1,325 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/slab.h>
13#include <linux/cpufreq.h>
14
15#include <asm/msr.h>
16#include <asm/processor.h>
17#include <asm/timex.h>
18
19#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longrun", msg)
20
21static struct cpufreq_driver longrun_driver;
22
23/**
24 * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
25 * values into per cent values. In TMTA microcode, the following is valid:
26 * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
27 */
28static unsigned int longrun_low_freq, longrun_high_freq;
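/*
 * Worked example (illustrative values, not from the original source): with
 * longrun_low_freq = 300000 kHz and longrun_high_freq = 1000000 kHz, an MSR
 * boundary value of 50 corresponds to
 *   300000 + 50 * ((1000000 - 300000) / 100) = 650000 kHz,
 * i.e. the 50% performance level.
 */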
29
30
31/**
32 * longrun_get_policy - get the current LongRun policy
33 * @policy: struct cpufreq_policy where current policy is written into
34 *
35 * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
36 * and MSR_TMTA_LONGRUN_CTRL
37 */
38static void __init longrun_get_policy(struct cpufreq_policy *policy)
39{
40 u32 msr_lo, msr_hi;
41
42 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
43 dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
44 if (msr_lo & 0x01)
45 policy->policy = CPUFREQ_POLICY_PERFORMANCE;
46 else
47 policy->policy = CPUFREQ_POLICY_POWERSAVE;
48
49 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
50 dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
51 msr_lo &= 0x0000007F;
52 msr_hi &= 0x0000007F;
53
54 if ( longrun_high_freq <= longrun_low_freq ) {
55 /* Assume degenerate Longrun table */
56 policy->min = policy->max = longrun_high_freq;
57 } else {
58 policy->min = longrun_low_freq + msr_lo *
59 ((longrun_high_freq - longrun_low_freq) / 100);
60 policy->max = longrun_low_freq + msr_hi *
61 ((longrun_high_freq - longrun_low_freq) / 100);
62 }
63 policy->cpu = 0;
64}
65
66
67/**
68 * longrun_set_policy - sets a new CPUFreq policy
69 * @policy: new policy
70 *
71 * Sets a new CPUFreq policy on LongRun-capable processors. This function
72 * has to be called with cpufreq_driver locked.
73 */
74static int longrun_set_policy(struct cpufreq_policy *policy)
75{
76 u32 msr_lo, msr_hi;
77 u32 pctg_lo, pctg_hi;
78
79 if (!policy)
80 return -EINVAL;
81
82 if ( longrun_high_freq <= longrun_low_freq ) {
83 /* Assume degenerate Longrun table */
84 pctg_lo = pctg_hi = 100;
85 } else {
86 pctg_lo = (policy->min - longrun_low_freq) /
87 ((longrun_high_freq - longrun_low_freq) / 100);
88 pctg_hi = (policy->max - longrun_low_freq) /
89 ((longrun_high_freq - longrun_low_freq) / 100);
90 }
91
92 if (pctg_hi > 100)
93 pctg_hi = 100;
94 if (pctg_lo > pctg_hi)
95 pctg_lo = pctg_hi;
96
97 /* performance or economy mode */
98 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
99 msr_lo &= 0xFFFFFFFE;
100 switch (policy->policy) {
101 case CPUFREQ_POLICY_PERFORMANCE:
102 msr_lo |= 0x00000001;
103 break;
104 case CPUFREQ_POLICY_POWERSAVE:
105 break;
106 }
107 wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
108
109 /* lower and upper boundary */
110 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
111 msr_lo &= 0xFFFFFF80;
112 msr_hi &= 0xFFFFFF80;
113 msr_lo |= pctg_lo;
114 msr_hi |= pctg_hi;
115 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
116
117 return 0;
118}
119
120
121/**
122 * longrun_verify_policy - verifies a new CPUFreq policy
123 * @policy: the policy to verify
124 *
125 * Validates a new CPUFreq policy. This function has to be called with
126 * cpufreq_driver locked.
127 */
128static int longrun_verify_policy(struct cpufreq_policy *policy)
129{
130 if (!policy)
131 return -EINVAL;
132
133 policy->cpu = 0;
134 cpufreq_verify_within_limits(policy,
135 policy->cpuinfo.min_freq,
136 policy->cpuinfo.max_freq);
137
138 if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
139 (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
140 return -EINVAL;
141
142 return 0;
143}
144
145static unsigned int longrun_get(unsigned int cpu)
146{
147 u32 eax, ebx, ecx, edx;
148
149 if (cpu)
150 return 0;
151
152 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
153 dprintk("cpuid eax is %u\n", eax);
154
155 return (eax * 1000);
156}
157
158/**
159 * longrun_determine_freqs - determines the lowest and highest possible core frequency
160 * @low_freq: an int to put the lowest frequency into
161 * @high_freq: an int to put the highest frequency into
162 *
163 * Determines the lowest and highest possible core frequencies on this CPU.
164 * This is necessary to calculate the performance percentage according to
165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */
168static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq)
170{
171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi;
173 u32 eax, ebx, ecx, edx;
174 u32 try_hi;
175 struct cpuinfo_x86 *c = cpu_data;
176
177 if (!low_freq || !high_freq)
178 return -EINVAL;
179
180 if (cpu_has(c, X86_FEATURE_LRTI)) {
181 /* if the LongRun Table Interface is present, the
182 * detection is a bit easier:
183 * For minimum frequency, read out the maximum
184 * level (msr_hi), write that into "currently
185 * selected level", and read out the frequency.
186 * For maximum frequency, read out level zero.
187 */
188 /* minimum */
189 rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
190 wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
191 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
192 *low_freq = msr_lo * 1000; /* to kHz */
193
194 /* maximum */
195 wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
196 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
197 *high_freq = msr_lo * 1000; /* to kHz */
198
199 dprintk("longrun table interface told %u - %u kHz\n", *low_freq, *high_freq);
200
201 if (*low_freq > *high_freq)
202 *low_freq = *high_freq;
203 return 0;
204 }
205
206 /* set the upper border to the value determined during TSC init */
207 *high_freq = (cpu_khz / 1000);
208 *high_freq = *high_freq * 1000;
209 dprintk("high frequency is %u kHz\n", *high_freq);
210
211 /* get current borders */
212 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
213 save_lo = msr_lo & 0x0000007F;
214 save_hi = msr_hi & 0x0000007F;
215
216 /* if current perf_pctg is larger than 90%, we need to decrease the
217 * upper limit to make the calculation more accurate.
218 */
219 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
220 /* try decreasing in 10% steps, some processors react only
221 * on some barrier values */
222 for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -=10) {
223 /* set to 0 to try_hi perf_pctg */
224 msr_lo &= 0xFFFFFF80;
225 msr_hi &= 0xFFFFFF80;
226 msr_hi |= try_hi;
227 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
228
229 /* read out current core MHz and current perf_pctg */
230 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
231
232 /* restore values */
233 wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
234 }
235 dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
236
237 /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
238 * equals
239 * low_freq * ( 1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
240 *
241 * high_freq * perf_pctg is stored temporarily into "ebx".
242 */
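 /*
  * Illustrative numbers (assumed): with cpu_khz ~= 1000000 (so
  * high_freq = 1000 MHz), ecx = 50 (%) and eax = 700 (MHz):
  *   ebx = (1000 * 50) / 100        = 500 MHz,
  *   edx = ((700 - 500) * 100) / 50 = 400 MHz,
  * giving *low_freq = 400000 kHz.
  */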
243 ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
244
245 if ((ecx > 95) || (ecx == 0) || (eax < ebx))
246 return -EIO;
247
248 edx = ((eax - ebx) * 100) / (100 - ecx);
249 *low_freq = edx * 1000; /* back to kHz */
250
251 dprintk("low frequency is %u kHz\n", *low_freq);
252
253 if (*low_freq > *high_freq)
254 *low_freq = *high_freq;
255
256 return 0;
257}
258
259
260static int __init longrun_cpu_init(struct cpufreq_policy *policy)
261{
262 int result = 0;
263
264 /* capability check */
265 if (policy->cpu != 0)
266 return -ENODEV;
267
268 /* detect low and high frequency */
269 result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
270 if (result)
271 return result;
272
273 /* cpuinfo and default policy values */
274 policy->cpuinfo.min_freq = longrun_low_freq;
275 policy->cpuinfo.max_freq = longrun_high_freq;
276 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
277 longrun_get_policy(policy);
278
279 return 0;
280}
281
282
283static struct cpufreq_driver longrun_driver = {
284 .flags = CPUFREQ_CONST_LOOPS,
285 .verify = longrun_verify_policy,
286 .setpolicy = longrun_set_policy,
287 .get = longrun_get,
288 .init = longrun_cpu_init,
289 .name = "longrun",
290 .owner = THIS_MODULE,
291};
292
293
294/**
295 * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
296 *
297 * Initializes the LongRun support.
298 */
299static int __init longrun_init(void)
300{
301 struct cpuinfo_x86 *c = cpu_data;
302
303 if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
304 !cpu_has(c, X86_FEATURE_LONGRUN))
305 return -ENODEV;
306
307 return cpufreq_register_driver(&longrun_driver);
308}
309
310
311/**
312 * longrun_exit - unregisters LongRun support
313 */
314static void __exit longrun_exit(void)
315{
316 cpufreq_unregister_driver(&longrun_driver);
317}
318
319
320MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>");
321MODULE_DESCRIPTION ("LongRun driver for Transmeta Crusoe and Efficeon processors.");
322MODULE_LICENSE ("GPL");
323
324module_init(longrun_init);
325module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
new file mode 100644
index 000000000000..4c76b511e194
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -0,0 +1,316 @@
1/*
2 * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
5 * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
6 * (C) 2002 Tora T. Engstad
7 * All Rights Reserved
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * The author(s) of this software shall not be held liable for damages
15 * of any nature resulting due to the use of this software. This
16 * software is provided AS-IS with no warranties.
17 *
18 * Date Errata Description
19 * 20020525 N44, O17 12.5% or 25% DC causes lockup
20 *
21 */
22
23#include <linux/kernel.h>
24#include <linux/module.h>
25#include <linux/init.h>
26#include <linux/smp.h>
27#include <linux/cpufreq.h>
28#include <linux/slab.h>
29#include <linux/cpumask.h>
30
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/timex.h>
34
35#include "speedstep-lib.h"
36
37#define PFX "p4-clockmod: "
38#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "p4-clockmod", msg)
39
40/*
41 * Duty Cycle (3 bits); note that DC_DISABLE is not specified in the
42 * Intel docs, it is just used here to mean disable
43 */
44enum {
45 DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
46 DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
47};
48
49#define DC_ENTRIES 8
50
51
52static int has_N44_O17_errata[NR_CPUS];
53static unsigned int stock_freq;
54static struct cpufreq_driver p4clockmod_driver;
55static unsigned int cpufreq_p4_get(unsigned int cpu);
56
57static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
58{
59 u32 l, h;
60
61 if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV))
62 return -EINVAL;
63
64 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
65
66 if (l & 0x01)
67 dprintk("CPU#%d currently thermal throttled\n", cpu);
68
69 if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT))
70 newstate = DC_38PT;
71
72 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
73 if (newstate == DC_DISABLE) {
74 dprintk("CPU#%d disabling modulation\n", cpu);
75 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
76 } else {
77 dprintk("CPU#%d setting duty cycle to %d%%\n",
78 cpu, ((125 * newstate) / 10));
79 /* bits 63 - 5 : reserved
80 * bit 4 : enable/disable
81 * bits 3-1 : duty cycle
82 * bit 0 : reserved
83 */
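  /*
   * Illustrative example (not from the original source): for
   * newstate = DC_50PT (4) the effective duty cycle is 4/8 = 50%,
   * and the write below becomes l = (l & ~0xe) | (1 << 4) | (4 << 1),
   * i.e. modulation enabled with the duty-cycle field set to 100b.
   */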
84 l = (l & ~14);
85 l = l | (1<<4) | ((newstate & 0x7)<<1);
86 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
87 }
88
89 return 0;
90}
91
92
93static struct cpufreq_frequency_table p4clockmod_table[] = {
94 {DC_RESV, CPUFREQ_ENTRY_INVALID},
95 {DC_DFLT, 0},
96 {DC_25PT, 0},
97 {DC_38PT, 0},
98 {DC_50PT, 0},
99 {DC_64PT, 0},
100 {DC_75PT, 0},
101 {DC_88PT, 0},
102 {DC_DISABLE, 0},
103 {DC_RESV, CPUFREQ_TABLE_END},
104};
105
106
107static int cpufreq_p4_target(struct cpufreq_policy *policy,
108 unsigned int target_freq,
109 unsigned int relation)
110{
111 unsigned int newstate = DC_RESV;
112 struct cpufreq_freqs freqs;
113 int i;
114
115 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], target_freq, relation, &newstate))
116 return -EINVAL;
117
118 freqs.old = cpufreq_p4_get(policy->cpu);
119 freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
120
121 if (freqs.new == freqs.old)
122 return 0;
123
124 /* notifiers */
125 for_each_cpu_mask(i, policy->cpus) {
126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 }
129
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3
132 */
133 for_each_cpu_mask(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135
136 /* notifiers */
137 for_each_cpu_mask(i, policy->cpus) {
138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 }
141
142 return 0;
143}
144
145
146static int cpufreq_p4_verify(struct cpufreq_policy *policy)
147{
148 return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
149}
150
151
152static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
153{
154 if (c->x86 == 0x06) {
155 if (cpu_has(c, X86_FEATURE_EST))
156 printk(KERN_WARNING PFX "Warning: EST-capable CPU detected. "
157 "The acpi-cpufreq module offers voltage scaling"
158 " in addition to frequency scaling. You should use "
159 "that instead of p4-clockmod, if possible.\n");
160 switch (c->x86_model) {
161 case 0x0E: /* Core */
162 case 0x0F: /* Core Duo */
163 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
164 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE);
165 case 0x0D: /* Pentium M (Dothan) */
166 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
167 /* fall through */
168 case 0x09: /* Pentium M (Banias) */
169 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM);
170 }
171 }
172
173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n");
175 return 0;
176 }
177
178 /* on P-4s, the TSC runs with constant frequency independently of
179 * whether throttling is active or not. */
180 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
181
182 if (speedstep_detect_processor() == SPEEDSTEP_PROCESSOR_P4M) {
183 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
184 "The speedstep-ich or acpi cpufreq modules offer "
185 "voltage scaling in addition to frequency scaling. "
186 "You should use either one instead of p4-clockmod, "
187 "if possible.\n");
188 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4M);
189 }
190
191 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4D);
192}
193
194
195
196static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
197{
198 struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
199 int cpuid = 0;
200 unsigned int i;
201
202#ifdef CONFIG_SMP
203 policy->cpus = cpu_sibling_map[policy->cpu];
204#endif
205
206 /* Errata workaround */
207 cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
208 switch (cpuid) {
209 case 0x0f07:
210 case 0x0f0a:
211 case 0x0f11:
212 case 0x0f12:
213 has_N44_O17_errata[policy->cpu] = 1;
214 dprintk("has errata -- disabling low frequencies\n");
215 }
216
217 /* get max frequency */
218 stock_freq = cpufreq_p4_get_frequency(c);
219 if (!stock_freq)
220 return -EINVAL;
221
222 /* table init */
223 for (i=1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
224 if ((i<2) && (has_N44_O17_errata[policy->cpu]))
225 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
226 else
227 p4clockmod_table[i].frequency = (stock_freq * i)/8;
228 }
229 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
230
231 /* cpuinfo and default policy values */
232 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
233 policy->cpuinfo.transition_latency = 1000000; /* assumed */
234 policy->cur = stock_freq;
235
236 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
237}
238
239
240static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
241{
242 cpufreq_frequency_table_put_attr(policy->cpu);
243 return 0;
244}
245
246static unsigned int cpufreq_p4_get(unsigned int cpu)
247{
248 u32 l, h;
249
250 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
251
252 if (l & 0x10) {
253 l = l >> 1;
254 l &= 0x7;
255 } else
256 l = DC_DISABLE;
257
258 if (l != DC_DISABLE)
259 return (stock_freq * l / 8);
260
261 return stock_freq;
262}
263
264static struct freq_attr* p4clockmod_attr[] = {
265 &cpufreq_freq_attr_scaling_available_freqs,
266 NULL,
267};
268
269static struct cpufreq_driver p4clockmod_driver = {
270 .verify = cpufreq_p4_verify,
271 .target = cpufreq_p4_target,
272 .init = cpufreq_p4_cpu_init,
273 .exit = cpufreq_p4_cpu_exit,
274 .get = cpufreq_p4_get,
275 .name = "p4-clockmod",
276 .owner = THIS_MODULE,
277 .attr = p4clockmod_attr,
278};
279
280
281static int __init cpufreq_p4_init(void)
282{
283 struct cpuinfo_x86 *c = cpu_data;
284 int ret;
285
286 /*
287 * THERM_CONTROL is architectural for IA32 now, so
288 * we can rely on the capability checks
289 */
290 if (c->x86_vendor != X86_VENDOR_INTEL)
291 return -ENODEV;
292
293 if (!test_bit(X86_FEATURE_ACPI, c->x86_capability) ||
294 !test_bit(X86_FEATURE_ACC, c->x86_capability))
295 return -ENODEV;
296
297 ret = cpufreq_register_driver(&p4clockmod_driver);
298 if (!ret)
299 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock Modulation available\n");
300
301 return (ret);
302}
303
304
305static void __exit cpufreq_p4_exit(void)
306{
307 cpufreq_unregister_driver(&p4clockmod_driver);
308}
309
310
311MODULE_AUTHOR ("Zwane Mwaikambo <zwane@commfireservices.com>");
312MODULE_DESCRIPTION ("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
313MODULE_LICENSE ("GPL");
314
315late_initcall(cpufreq_p4_init);
316module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
new file mode 100644
index 000000000000..f89524051e4a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -0,0 +1,256 @@
1/*
2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, Dominik Brodowski.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/init.h>
13#include <linux/cpufreq.h>
14#include <linux/ioport.h>
15#include <linux/slab.h>
16
17#include <asm/msr.h>
18#include <asm/timex.h>
19#include <asm/io.h>
20
21
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
23 as it is unused */
24
25static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier;
27
28
29/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
30static struct cpufreq_frequency_table clock_ratio[] = {
31 {45, /* 000 -> 4.5x */ 0},
32 {50, /* 001 -> 5.0x */ 0},
33 {40, /* 010 -> 4.0x */ 0},
34 {55, /* 011 -> 5.5x */ 0},
35 {20, /* 100 -> 2.0x */ 0},
36 {30, /* 101 -> 3.0x */ 0},
37 {60, /* 110 -> 6.0x */ 0},
38 {35, /* 111 -> 3.5x */ 0},
39 {0, CPUFREQ_TABLE_END}
40};
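/*
 * Illustrative example (assumed values): a 550 MHz K6-2+ running at its
 * maximum 5.5x multiplier gives busfreq = 550000 / 55 = 10000 (a 100 MHz FSB
 * in 10 kHz units); the 4.5x entry above then maps to 10000 * 45 = 450000 kHz.
 */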
41
42
43/**
44 * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
45 *
46 * Returns the current setting of the frequency multiplier. Core clock
47 * speed is the Front-Side Bus frequency multiplied by this value.
48 */
49static int powernow_k6_get_cpu_multiplier(void)
50{
51 u64 invalue = 0;
52 u32 msrval;
53
54 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59
60 return clock_ratio[(invalue >> 5)&7].index;
61}
62
63
64/**
65 * powernow_k6_set_state - set the PowerNow! multiplier
66 * @best_i: clock_ratio[best_i] is the target multiplier
67 *
68 * Tries to change the PowerNow! multiplier
69 */
70static void powernow_k6_set_state (unsigned int best_i)
71{
72 unsigned long outvalue=0, invalue=0;
73 unsigned long msrval;
74 struct cpufreq_freqs freqs;
75
76 if (clock_ratio[best_i].index > max_multiplier) {
77 printk(KERN_ERR "cpufreq: invalid target frequency\n");
78 return;
79 }
80
81 freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
82 freqs.new = busfreq * clock_ratio[best_i].index;
83 freqs.cpu = 0; /* powernow-k6.c is UP only driver */
84
85 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
86
87 /* we now need to transform best_i to the BVC format, see AMD#23446 */
88
89 outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
90
91 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99
100 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
101
102 return;
103}
104
105
106/**
107 * powernow_k6_verify - verifies a new CPUfreq policy
108 * @policy: new policy
109 *
110 * Policy must be within lowest and highest possible CPU Frequency,
111 * and at least one possible state must be within min and max.
112 */
113static int powernow_k6_verify(struct cpufreq_policy *policy)
114{
115 return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
116}
117
118
119/**
120 * powernow_k6_target - sets a new CPUFreq policy
121 * @policy: new policy
122 * @target_freq: the target frequency
123 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
124 *
125 * sets a new CPUFreq policy
126 */
127static int powernow_k6_target (struct cpufreq_policy *policy,
128 unsigned int target_freq,
129 unsigned int relation)
130{
131 unsigned int newstate = 0;
132
133 if (cpufreq_frequency_table_target(policy, &clock_ratio[0], target_freq, relation, &newstate))
134 return -EINVAL;
135
136 powernow_k6_set_state(newstate);
137
138 return 0;
139}
140
141
142static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
143{
144 unsigned int i;
145 int result;
146
147 if (policy->cpu != 0)
148 return -ENODEV;
149
150 /* get frequencies */
151 max_multiplier = powernow_k6_get_cpu_multiplier();
152 busfreq = cpu_khz / max_multiplier;
153
154 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else
159 clock_ratio[i].frequency = busfreq * clock_ratio[i].index;
160 }
161
162 /* cpuinfo and default policy values */
163 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
164 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
165 policy->cur = busfreq * max_multiplier;
166
167 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
168 if (result)
169 return (result);
170
171 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
172
173 return 0;
174}
175
176
177static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
178{
179 unsigned int i;
180 for (i=0; i<8; i++) {
181 if (i==max_multiplier)
182 powernow_k6_set_state(i);
183 }
184 cpufreq_frequency_table_put_attr(policy->cpu);
185 return 0;
186}
187
188static unsigned int powernow_k6_get(unsigned int cpu)
189{
190 return busfreq * powernow_k6_get_cpu_multiplier();
191}
192
193static struct freq_attr* powernow_k6_attr[] = {
194 &cpufreq_freq_attr_scaling_available_freqs,
195 NULL,
196};
197
198static struct cpufreq_driver powernow_k6_driver = {
199 .verify = powernow_k6_verify,
200 .target = powernow_k6_target,
201 .init = powernow_k6_cpu_init,
202 .exit = powernow_k6_cpu_exit,
203 .get = powernow_k6_get,
204 .name = "powernow-k6",
205 .owner = THIS_MODULE,
206 .attr = powernow_k6_attr,
207};
208
209
210/**
211 * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
212 *
213 * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
214 * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero
215 * on success.
216 */
217static int __init powernow_k6_init(void)
218{
219 struct cpuinfo_x86 *c = cpu_data;
220
221 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
222 ((c->x86_model != 12) && (c->x86_model != 13)))
223 return -ENODEV;
224
225 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
226 printk("cpufreq: PowerNow IOPORT region already used.\n");
227 return -EIO;
228 }
229
230 if (cpufreq_register_driver(&powernow_k6_driver)) {
231 release_region (POWERNOW_IOPORT, 16);
232 return -EINVAL;
233 }
234
235 return 0;
236}
237
238
239/**
240 * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
241 *
242 * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
243 */
244static void __exit powernow_k6_exit(void)
245{
246 cpufreq_unregister_driver(&powernow_k6_driver);
247 release_region (POWERNOW_IOPORT, 16);
248}
249
250
251MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
252MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
253MODULE_LICENSE ("GPL");
254
255module_init(powernow_k6_init);
256module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
new file mode 100644
index 000000000000..ca3e1d341889
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -0,0 +1,703 @@
1/*
2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 * Based upon datasheets & sample CPUs kindly provided by AMD.
8 *
9 * Errata 5: Processor may fail to execute a FID/VID change in presence of interrupt.
10 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
11 * Errata 15: Processors with half frequency multipliers may hang upon wakeup from disconnect.
12 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
13 */
14
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/moduleparam.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/slab.h>
21#include <linux/string.h>
22#include <linux/dmi.h>
23
24#include <asm/msr.h>
25#include <asm/timer.h>
26#include <asm/timex.h>
27#include <asm/io.h>
28#include <asm/system.h>
29
30#ifdef CONFIG_X86_POWERNOW_K7_ACPI
31#include <linux/acpi.h>
32#include <acpi/processor.h>
33#endif
34
35#include "powernow-k7.h"
36
37#define PFX "powernow: "
38
39
40struct psb_s {
41 u8 signature[10];
42 u8 tableversion;
43 u8 flags;
44 u16 settlingtime;
45 u8 reserved1;
46 u8 numpst;
47};
48
49struct pst_s {
50 u32 cpuid;
51 u8 fsbspeed;
52 u8 maxfid;
53 u8 startvid;
54 u8 numpstates;
55};
56
57#ifdef CONFIG_X86_POWERNOW_K7_ACPI
58union powernow_acpi_control_t {
59 struct {
60 unsigned long fid:5,
61 vid:5,
62 sgtc:20,
63 res1:2;
64 } bits;
65 unsigned long val;
66};
67#endif
68
69#ifdef CONFIG_CPU_FREQ_DEBUG
70/* divide by 1000 to get VCore voltage in V. */
71static const int mobile_vid_table[32] = {
72 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
73 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
74 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
75 1075, 1050, 1025, 1000, 975, 950, 925, 0,
76};
77#endif
78
79/* divide by 10 to get the frequency multiplier. */
80static const int fid_codes[32] = {
81 110, 115, 120, 125, 50, 55, 60, 65,
82 70, 75, 80, 85, 90, 95, 100, 105,
83 30, 190, 40, 200, 130, 135, 140, 210,
84 150, 225, 160, 165, 170, 180, -1, -1,
85};
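/*
 * A worked example of how this table is used (illustrative only, assuming
 * a 100 MHz front-side bus, i.e. fsb == 100000 kHz): a FID of 0 selects
 * fid_codes[0] == 110, an 11.0x multiplier, so get_ranges() below computes
 * fsb * fid_codes[0] / 10 == 100000 * 110 / 10 == 1100000 kHz (1100 MHz).
 * The -1 entries appear to mark unused FID codes.
 */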
86
87/* This parameter is used to force ACPI instead of the legacy method for
88 * configuration purposes.
89 */
90
91static int acpi_force;
92
93static struct cpufreq_frequency_table *powernow_table;
94
95static unsigned int can_scale_bus;
96static unsigned int can_scale_vid;
97static unsigned int minimum_speed=-1;
98static unsigned int maximum_speed;
99static unsigned int number_scales;
100static unsigned int fsb;
101static unsigned int latency;
102static char have_a0;
103
104#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k7", msg)
105
106static int check_fsb(unsigned int fsbspeed)
107{
108 int delta;
109 unsigned int f = fsb / 1000;
110
111 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
112 return (delta < 5);
113}
114
115static int check_powernow(void)
116{
117 struct cpuinfo_x86 *c = cpu_data;
118 unsigned int maxei, eax, ebx, ecx, edx;
119
120 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 !=6)) {
121#ifdef MODULE
122 printk (KERN_INFO PFX "This module only works with AMD K7 CPUs\n");
123#endif
124 return 0;
125 }
126
127 /* Get maximum capabilities */
128 maxei = cpuid_eax (0x80000000);
129 if (maxei < 0x80000007) { /* Any powernow info ? */
130#ifdef MODULE
131 printk (KERN_INFO PFX "No powernow capabilities detected\n");
132#endif
133 return 0;
134 }
135
136 if ((c->x86_model == 6) && (c->x86_mask == 0)) {
137 printk (KERN_INFO PFX "K7 660[A0] core detected, enabling errata workarounds\n");
138 have_a0 = 1;
139 }
140
141 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
142
143 /* Check we can actually do something before we say anything.*/
144 if (!(edx & (1 << 1 | 1 << 2)))
145 return 0;
146
147 printk (KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
148
149 if (edx & 1 << 1) {
150 printk ("frequency");
151 can_scale_bus=1;
152 }
153
154 if ((edx & (1 << 1 | 1 << 2)) == 0x6)
155 printk (" and ");
156
157 if (edx & 1 << 2) {
158 printk ("voltage");
159 can_scale_vid=1;
160 }
161
162 printk (".\n");
163 return 1;
164}
165
166
167static int get_ranges (unsigned char *pst)
168{
169 unsigned int j;
170 unsigned int speed;
171 u8 fid, vid;
172
173 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * (number_scales + 1)), GFP_KERNEL);
174 if (!powernow_table)
175 return -ENOMEM;
176
177 for (j=0 ; j < number_scales; j++) {
178 fid = *pst++;
179
180 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
181 powernow_table[j].index = fid; /* lower 8 bits */
182
183 speed = powernow_table[j].frequency;
184
185 if ((fid_codes[fid] % 10)==5) {
186#ifdef CONFIG_X86_POWERNOW_K7_ACPI
187 if (have_a0 == 1)
188 powernow_table[j].frequency = CPUFREQ_ENTRY_INVALID;
189#endif
190 }
191
192 if (speed < minimum_speed)
193 minimum_speed = speed;
194 if (speed > maximum_speed)
195 maximum_speed = speed;
196
197 vid = *pst++;
198 powernow_table[j].index |= (vid << 8); /* upper 8 bits */
199
200 dprintk (" FID: 0x%x (%d.%dx [%dMHz]) "
201 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
202 fid_codes[fid] % 10, speed/1000, vid,
203 mobile_vid_table[vid]/1000,
204 mobile_vid_table[vid]%1000);
205 }
206 powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
207 powernow_table[number_scales].index = 0;
208
209 return 0;
210}
211
212
213static void change_FID(int fid)
214{
215 union msr_fidvidctl fidvidctl;
216
217 rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
218 if (fidvidctl.bits.FID != fid) {
219 fidvidctl.bits.SGTC = latency;
220 fidvidctl.bits.FID = fid;
221 fidvidctl.bits.VIDC = 0;
222 fidvidctl.bits.FIDC = 1;
223 wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
224 }
225}
226
227
228static void change_VID(int vid)
229{
230 union msr_fidvidctl fidvidctl;
231
232 rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
233 if (fidvidctl.bits.VID != vid) {
234 fidvidctl.bits.SGTC = latency;
235 fidvidctl.bits.VID = vid;
236 fidvidctl.bits.FIDC = 0;
237 fidvidctl.bits.VIDC = 1;
238 wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
239 }
240}
241
242
243static void change_speed (unsigned int index)
244{
245 u8 fid, vid;
246 struct cpufreq_freqs freqs;
247 union msr_fidvidstatus fidvidstatus;
248 int cfid;
249
250 /* fid are the lower 8 bits of the index we stored into
251 * the cpufreq frequency table in powernow_decode_bios,
252 * vid are the upper 8 bits.
253 */
254
255 fid = powernow_table[index].index & 0xFF;
256 vid = (powernow_table[index].index & 0xFF00) >> 8;
257
258 freqs.cpu = 0;
259
260 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
261 cfid = fidvidstatus.bits.CFID;
262 freqs.old = fsb * fid_codes[cfid] / 10;
263
264 freqs.new = powernow_table[index].frequency;
265
266 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
267
268 /* Now do the magic poking into the MSRs. */
269
270 if (have_a0 == 1) /* A0 errata 5 */
271 local_irq_disable();
272
273 if (freqs.old > freqs.new) {
274 /* Going down, so change FID first */
275 change_FID(fid);
276 change_VID(vid);
277 } else {
278 /* Going up, so change VID first */
279 change_VID(vid);
280 change_FID(fid);
281 }
282
283
284 if (have_a0 == 1)
285 local_irq_enable();
286
287 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
288}
289
290
291#ifdef CONFIG_X86_POWERNOW_K7_ACPI
292
293static struct acpi_processor_performance *acpi_processor_perf;
294
295static int powernow_acpi_init(void)
296{
297 int i;
298 int retval = 0;
299 union powernow_acpi_control_t pc;
300
301 if (acpi_processor_perf != NULL && powernow_table != NULL) {
302 retval = -EINVAL;
303 goto err0;
304 }
305
306 acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
307 GFP_KERNEL);
308 if (!acpi_processor_perf) {
309 retval = -ENOMEM;
310 goto err0;
311 }
312
313 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
314 retval = -EIO;
315 goto err1;
316 }
317
318 if (acpi_processor_perf->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) {
319 retval = -ENODEV;
320 goto err2;
321 }
322
323 if (acpi_processor_perf->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) {
324 retval = -ENODEV;
325 goto err2;
326 }
327
328 number_scales = acpi_processor_perf->state_count;
329
330 if (number_scales < 2) {
331 retval = -ENODEV;
332 goto err2;
333 }
334
335 powernow_table = kzalloc((number_scales + 1) * (sizeof(struct cpufreq_frequency_table)), GFP_KERNEL);
336 if (!powernow_table) {
337 retval = -ENOMEM;
338 goto err2;
339 }
340
341 pc.val = (unsigned long) acpi_processor_perf->states[0].control;
342 for (i = 0; i < number_scales; i++) {
343 u8 fid, vid;
344 struct acpi_processor_px *state =
345 &acpi_processor_perf->states[i];
346 unsigned int speed, speed_mhz;
347
348 pc.val = (unsigned long) state->control;
349 dprintk ("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
350 i,
351 (u32) state->core_frequency,
352 (u32) state->power,
353 (u32) state->transition_latency,
354 (u32) state->control,
355 pc.bits.sgtc);
356
357 vid = pc.bits.vid;
358 fid = pc.bits.fid;
359
360 powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
361 powernow_table[i].index = fid; /* lower 8 bits */
362 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
363
364 speed = powernow_table[i].frequency;
365 speed_mhz = speed / 1000;
366
367 /* processor_perflib will multiply the MHz value by 1000 to
368 * get a KHz value (e.g. 1266000). However, powernow-k7 works
369 * with true KHz values (e.g. 1266768). To ensure that all
370 * powernow frequencies are available, we must ensure that
371 * ACPI doesn't restrict them, so we round up the MHz value
372 * to ensure that perflib's computed KHz value is greater than
373 * or equal to powernow's KHz value.
374 */
375 if (speed % 1000 > 0)
376 speed_mhz++;
377
378 if ((fid_codes[fid] % 10)==5) {
379 if (have_a0 == 1)
380 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
381 }
382
383 dprintk (" FID: 0x%x (%d.%dx [%dMHz]) "
384 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
385 fid_codes[fid] % 10, speed_mhz, vid,
386 mobile_vid_table[vid]/1000,
387 mobile_vid_table[vid]%1000);
388
389 if (state->core_frequency != speed_mhz) {
390 state->core_frequency = speed_mhz;
391 dprintk(" Corrected ACPI frequency to %d\n",
392 speed_mhz);
393 }
394
395 if (latency < pc.bits.sgtc)
396 latency = pc.bits.sgtc;
397
398 if (speed < minimum_speed)
399 minimum_speed = speed;
400 if (speed > maximum_speed)
401 maximum_speed = speed;
402 }
403
404 powernow_table[i].frequency = CPUFREQ_TABLE_END;
405 powernow_table[i].index = 0;
406
407 /* notify BIOS that we exist */
408 acpi_processor_notify_smm(THIS_MODULE);
409
410 return 0;
411
412err2:
413 acpi_processor_unregister_performance(acpi_processor_perf, 0);
414err1:
415 kfree(acpi_processor_perf);
416err0:
417 printk(KERN_WARNING PFX "ACPI perflib cannot be used on this platform\n");
418 acpi_processor_perf = NULL;
419 return retval;
420}
421#else
422static int powernow_acpi_init(void)
423{
424 printk(KERN_INFO PFX "no support for ACPI processor found."
425 " Please recompile your kernel with ACPI processor\n");
426 return -EINVAL;
427}
428#endif
429
430static int powernow_decode_bios (int maxfid, int startvid)
431{
432 struct psb_s *psb;
433 struct pst_s *pst;
434 unsigned int i, j;
435 unsigned char *p;
436 unsigned int etuple;
437 unsigned int ret;
438
439 etuple = cpuid_eax(0x80000001);
440
441 for (i=0xC0000; i < 0xffff0 ; i+=16) {
442
443 p = phys_to_virt(i);
444
445 if (memcmp(p, "AMDK7PNOW!", 10) == 0){
446 dprintk ("Found PSB header at %p\n", p);
447 psb = (struct psb_s *) p;
448 dprintk ("Table version: 0x%x\n", psb->tableversion);
449 if (psb->tableversion != 0x12) {
450 printk (KERN_INFO PFX "Sorry, only v1.2 tables supported right now\n");
451 return -ENODEV;
452 }
453
454 dprintk ("Flags: 0x%x\n", psb->flags);
455 if ((psb->flags & 1)==0) {
456 dprintk ("Mobile voltage regulator\n");
457 } else {
458 dprintk ("Desktop voltage regulator\n");
459 }
460
461 latency = psb->settlingtime;
462 if (latency < 100) {
463 printk (KERN_INFO PFX "BIOS set settling time to %d microseconds."
464 "Should be at least 100. Correcting.\n", latency);
465 latency = 100;
466 }
467 dprintk ("Settling Time: %d microseconds.\n", psb->settlingtime);
468 dprintk ("Has %d PST tables. (Only dumping ones relevant to this CPU).\n", psb->numpst);
469
470 p += sizeof (struct psb_s);
471
472 pst = (struct pst_s *) p;
473
474 for (j=0; j<psb->numpst; j++) {
475 pst = (struct pst_s *) p;
476 number_scales = pst->numpstates;
477
478 if ((etuple == pst->cpuid) && check_fsb(pst->fsbspeed) &&
479 (maxfid==pst->maxfid) && (startvid==pst->startvid))
480 {
481 dprintk ("PST:%d (@%p)\n", j, pst);
482 dprintk (" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
483 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
484
485 ret = get_ranges ((char *) pst + sizeof (struct pst_s));
486 return ret;
487 } else {
488 unsigned int k;
489 p = (char *) pst + sizeof (struct pst_s);
490 for (k=0; k<number_scales; k++)
491 p+=2;
492 }
493 }
494 printk (KERN_INFO PFX "No PST tables match this cpuid (0x%x)\n", etuple);
495 printk (KERN_INFO PFX "This is indicative of a broken BIOS.\n");
496
497 return -EINVAL;
498 }
499 p++;
500 }
501
502 return -ENODEV;
503}
504
505
506static int powernow_target (struct cpufreq_policy *policy,
507 unsigned int target_freq,
508 unsigned int relation)
509{
510 unsigned int newstate;
511
512 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, relation, &newstate))
513 return -EINVAL;
514
515 change_speed(newstate);
516
517 return 0;
518}
519
520
521static int powernow_verify (struct cpufreq_policy *policy)
522{
523 return cpufreq_frequency_table_verify(policy, powernow_table);
524}
525
526/*
527 * We use the fact that the bus frequency is somehow
528 * a multiple of 100000/3 kHz, and compute sgtc according
529 * to this multiple.
530 * That way we better match how AMD intends this to work,
531 * and get the same kind of behaviour already tested under
532 * the "well-known" other OS.
533 */
534static int __init fixup_sgtc(void)
535{
536 unsigned int sgtc;
537 unsigned int m;
538
539 m = fsb / 3333;
540 if ((m % 10) >= 5)
541 m += 5;
542
543 m /= 10;
544
545 sgtc = 100 * m * latency;
546 sgtc = sgtc / 3;
547 if (sgtc > 0xfffff) {
548 printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
549 sgtc = 0xfffff;
550 }
551 return sgtc;
552}
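/*
 * A worked instance of the computation above (illustrative only), assuming
 * fsb == 100000 kHz (100 MHz bus) and the minimum settling time latency == 100:
 *
 *   m    = 100000 / 3333 = 30;   (30 % 10) < 5, so no rounding up
 *   m   /= 10             -> 3
 *   sgtc = 100 * 3 * 100  = 30000
 *   sgtc = 30000 / 3      = 10000   (well below the 0xfffff clamp)
 */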
553
554static unsigned int powernow_get(unsigned int cpu)
555{
556 union msr_fidvidstatus fidvidstatus;
557 unsigned int cfid;
558
559 if (cpu)
560 return 0;
561 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
562 cfid = fidvidstatus.bits.CFID;
563
564 return (fsb * fid_codes[cfid] / 10);
565}
566
567
568static int __init acer_cpufreq_pst(struct dmi_system_id *d)
569{
570 printk(KERN_WARNING "%s laptop with broken PST tables in BIOS detected.\n", d->ident);
571 printk(KERN_WARNING "You need to downgrade to 3A21 (09/09/2002), or try a newer BIOS than 3A71 (01/20/2003)\n");
572 printk(KERN_WARNING "cpufreq scaling has been disabled as a result of this.\n");
573 return 0;
574}
575
576/*
577 * Some Athlon laptops have really broken PST tables.
578 * A BIOS update is all that can save them.
579 * Mention this, and disable cpufreq.
580 */
581static struct dmi_system_id __initdata powernow_dmi_table[] = {
582 {
583 .callback = acer_cpufreq_pst,
584 .ident = "Acer Aspire",
585 .matches = {
586 DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
587 DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
588 },
589 },
590 { }
591};
592
593static int __init powernow_cpu_init (struct cpufreq_policy *policy)
594{
595 union msr_fidvidstatus fidvidstatus;
596 int result;
597
598 if (policy->cpu != 0)
599 return -ENODEV;
600
601 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
602
603 recalibrate_cpu_khz();
604
605 fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
606 if (!fsb) {
607 printk(KERN_WARNING PFX "can not determine bus frequency\n");
608 return -EINVAL;
609 }
610 dprintk("FSB: %3dMHz\n", fsb/1000);
611
612 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
613 printk (KERN_INFO PFX "PSB/PST known to be broken. Trying ACPI instead\n");
614 result = powernow_acpi_init();
615 } else {
616 result = powernow_decode_bios(fidvidstatus.bits.MFID, fidvidstatus.bits.SVID);
617 if (result) {
618 printk (KERN_INFO PFX "Trying ACPI perflib\n");
619 maximum_speed = 0;
620 minimum_speed = -1;
621 latency = 0;
622 result = powernow_acpi_init();
623 if (result) {
624 printk (KERN_INFO PFX "ACPI and legacy methods failed\n");
625 printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.html\n");
626 }
627 } else {
628 /* SGTC uses the bus clock as its timer */
629 latency = fixup_sgtc();
630 printk(KERN_INFO PFX "SGTC: %d\n", latency);
631 }
632 }
633
634 if (result)
635 return result;
636
637 printk (KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
638 minimum_speed/1000, maximum_speed/1000);
639
640 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
641
642 policy->cpuinfo.transition_latency = cpufreq_scale(2000000UL, fsb, latency);
643
644 policy->cur = powernow_get(0);
645
646 cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
647
648 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
649}
650
651static int powernow_cpu_exit (struct cpufreq_policy *policy) {
652 cpufreq_frequency_table_put_attr(policy->cpu);
653
654#ifdef CONFIG_X86_POWERNOW_K7_ACPI
655 if (acpi_processor_perf) {
656 acpi_processor_unregister_performance(acpi_processor_perf, 0);
657 kfree(acpi_processor_perf);
658 }
659#endif
660
661 kfree(powernow_table);
662 return 0;
663}
664
665static struct freq_attr* powernow_table_attr[] = {
666 &cpufreq_freq_attr_scaling_available_freqs,
667 NULL,
668};
669
670static struct cpufreq_driver powernow_driver = {
671 .verify = powernow_verify,
672 .target = powernow_target,
673 .get = powernow_get,
674 .init = powernow_cpu_init,
675 .exit = powernow_cpu_exit,
676 .name = "powernow-k7",
677 .owner = THIS_MODULE,
678 .attr = powernow_table_attr,
679};
680
681static int __init powernow_init (void)
682{
683 if (check_powernow()==0)
684 return -ENODEV;
685 return cpufreq_register_driver(&powernow_driver);
686}
687
688
689static void __exit powernow_exit (void)
690{
691 cpufreq_unregister_driver(&powernow_driver);
692}
693
694module_param(acpi_force, int, 0444);
695MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
696
697MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
698MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
699MODULE_LICENSE ("GPL");
700
701late_initcall(powernow_init);
702module_exit(powernow_exit);
703
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
new file mode 100644
index 000000000000..f8a63b3664e3
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
@@ -0,0 +1,44 @@
1/*
2 * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * AMD-specific information
8 *
9 */
10
11union msr_fidvidctl {
12 struct {
13 unsigned FID:5, // 4:0
14 reserved1:3, // 7:5
15 VID:5, // 12:8
16 reserved2:3, // 15:13
17 FIDC:1, // 16
18 VIDC:1, // 17
19 reserved3:2, // 19:18
20 FIDCHGRATIO:1, // 20
21 reserved4:11, // 31:21
22 SGTC:20, // 51:32
23 reserved5:12; // 63:52
24 } bits;
25 unsigned long long val;
26};
27
28union msr_fidvidstatus {
29 struct {
30 unsigned CFID:5, // 4:0
31 reserved1:3, // 7:5
32 SFID:5, // 12:8
33 reserved2:3, // 15:13
34 MFID:5, // 20:16
35 reserved3:11, // 31:21
36 CVID:5, // 36:32
37 reserved4:3, // 39:37
38 SVID:5, // 44:40
39 reserved5:3, // 47:45
40 MVID:5, // 52:48
41 reserved6:11; // 63:53
42 } bits;
43 unsigned long long val;
44};
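
/*
 * A minimal usage sketch (illustrative, not part of the driver itself):
 * decoding a raw 64-bit value read from MSR_K7_FID_VID_STATUS through the
 * union above, the same way powernow-k7.c does after rdmsrl():
 *
 *   union msr_fidvidstatus st;
 *   st.val = raw_msr_value;      // raw_msr_value obtained via rdmsrl()
 *   cfid = st.bits.CFID;         // current FID, bits 4:0
 *   cvid = st.bits.CVID;         // current VID, bits 36:32
 */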
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
new file mode 100644
index 000000000000..34ed53a06730
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -0,0 +1,1363 @@
1/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Support : mark.langsdorf@amd.com
8 *
9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz>
13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 *
16 * Valuable input gratefully received from Dave Jones, Pavel Machek,
17 * Dominik Brodowski, Jacob Shin, and others.
18 * Originally developed by Paul Devriendt.
19 * Processor information obtained from Chapter 9 (Power and Thermal Management)
20 * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
21 * Opteron Processors" available for download from www.amd.com
22 *
23 * Tables for specific CPUs can be inferred from
24 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
25 */
26
27#include <linux/kernel.h>
28#include <linux/smp.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/cpufreq.h>
32#include <linux/slab.h>
33#include <linux/string.h>
34#include <linux/cpumask.h>
35#include <linux/sched.h> /* for current / set_cpus_allowed() */
36
37#include <asm/msr.h>
38#include <asm/io.h>
39#include <asm/delay.h>
40
41#ifdef CONFIG_X86_POWERNOW_K8_ACPI
42#include <linux/acpi.h>
43#include <linux/mutex.h>
44#include <acpi/processor.h>
45#endif
46
47#define PFX "powernow-k8: "
48#define BFX PFX "BIOS error: "
49#define VERSION "version 2.00.00"
50#include "powernow-k8.h"
51
52/* serialize freq changes */
53static DEFINE_MUTEX(fidvid_mutex);
54
55static struct powernow_k8_data *powernow_data[NR_CPUS];
56
57static int cpu_family = CPU_OPTERON;
58
59#ifndef CONFIG_SMP
60static cpumask_t cpu_core_map[1];
61#endif
62
63/* Return a frequency in MHz, given an input fid */
64static u32 find_freq_from_fid(u32 fid)
65{
66 return 800 + (fid * 100);
67}
68
69
70/* Return a frequency in KHz, given an input fid */
71static u32 find_khz_freq_from_fid(u32 fid)
72{
73 return 1000 * find_freq_from_fid(fid);
74}
75
76/* Return a frequency in MHz, given an input fid and did */
77static u32 find_freq_from_fiddid(u32 fid, u32 did)
78{
79 return 100 * (fid + 0x10) >> did;
80}
81
82static u32 find_khz_freq_from_fiddid(u32 fid, u32 did)
83{
84 return 1000 * find_freq_from_fiddid(fid, did);
85}
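/*
 * Worked instances of the two conversions above (illustrative only):
 *   fid only:      fid = 2            -> 800 + 2 * 100          = 1000 MHz
 *   fid and did:   fid = 2, did = 0   -> 100 * (2 + 0x10) >> 0  = 1800 MHz
 *                  fid = 2, did = 1   -> 100 * (2 + 0x10) >> 1  =  900 MHz
 */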
86
87static u32 find_fid_from_pstate(u32 pstate)
88{
89 u32 hi, lo;
90 rdmsr(MSR_PSTATE_DEF_BASE + pstate, lo, hi);
91 return lo & HW_PSTATE_FID_MASK;
92}
93
94static u32 find_did_from_pstate(u32 pstate)
95{
96 u32 hi, lo;
97 rdmsr(MSR_PSTATE_DEF_BASE + pstate, lo, hi);
98 return (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;
99}
100
101/* Return the vco fid for an input fid
102 *
103 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
104 * only from corresponding high fids. This returns "high" fid corresponding to
105 * "low" one.
106 */
107static u32 convert_fid_to_vco_fid(u32 fid)
108{
109 if (fid < HI_FID_TABLE_BOTTOM)
110 return 8 + (2 * fid);
111 else
112 return fid;
113}
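/*
 * Example (illustrative only, assuming 0x2 is below HI_FID_TABLE_BOTTOM):
 * a "low" fid of 0x2 maps to VCO fid 8 + 2 * 2 = 0xc, while any fid at or
 * above HI_FID_TABLE_BOTTOM is already a VCO fid and is returned unchanged.
 */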
114
115/*
116 * Return 1 if the pending bit is set. Unless we just instructed the processor
117 * to transition to a new state, seeing this bit set is really bad news.
118 */
119static int pending_bit_stuck(void)
120{
121 u32 lo, hi;
122
123 if (cpu_family == CPU_HW_PSTATE)
124 return 0;
125
126 rdmsr(MSR_FIDVID_STATUS, lo, hi);
127 return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
128}
129
130/*
131 * Update the global current fid / vid values from the status msr.
132 * Returns 1 on error.
133 */
134static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
135{
136 u32 lo, hi;
137 u32 i = 0;
138
139 if (cpu_family == CPU_HW_PSTATE) {
140 rdmsr(MSR_PSTATE_STATUS, lo, hi);
141 i = lo & HW_PSTATE_MASK;
142 rdmsr(MSR_PSTATE_DEF_BASE + i, lo, hi);
143 data->currfid = lo & HW_PSTATE_FID_MASK;
144 data->currdid = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;
145 return 0;
146 }
147 do {
148 if (i++ > 10000) {
149 dprintk("detected change pending stuck\n");
150 return 1;
151 }
152 rdmsr(MSR_FIDVID_STATUS, lo, hi);
153 } while (lo & MSR_S_LO_CHANGE_PENDING);
154
155 data->currvid = hi & MSR_S_HI_CURRENT_VID;
156 data->currfid = lo & MSR_S_LO_CURRENT_FID;
157
158 return 0;
159}
160
161/* the isochronous relief time */
162static void count_off_irt(struct powernow_k8_data *data)
163{
164 udelay((1 << data->irt) * 10);
165 return;
166}
167
168/* the voltage stabilization time */
169static void count_off_vst(struct powernow_k8_data *data)
170{
171 udelay(data->vstable * VST_UNITS_20US);
172 return;
173}
174
175/* need to init the control msr to a safe value (for each cpu) */
176static void fidvid_msr_init(void)
177{
178 u32 lo, hi;
179 u8 fid, vid;
180
181 rdmsr(MSR_FIDVID_STATUS, lo, hi);
182 vid = hi & MSR_S_HI_CURRENT_VID;
183 fid = lo & MSR_S_LO_CURRENT_FID;
184 lo = fid | (vid << MSR_C_LO_VID_SHIFT);
185 hi = MSR_C_HI_STP_GNT_BENIGN;
186 dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
187 wrmsr(MSR_FIDVID_CTL, lo, hi);
188}
189
190
191/* write the new fid value along with the other control fields to the msr */
192static int write_new_fid(struct powernow_k8_data *data, u32 fid)
193{
194 u32 lo;
195 u32 savevid = data->currvid;
196 u32 i = 0;
197
198 if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
199 printk(KERN_ERR PFX "internal error - overflow on fid write\n");
200 return 1;
201 }
202
203 lo = fid | (data->currvid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID;
204
205 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
206 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
207
208 do {
209 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
210 if (i++ > 100) {
211 printk(KERN_ERR PFX "Hardware error - pending bit very stuck - no further pstate changes possible\n");
212 return 1;
213 }
214 } while (query_current_values_with_pending_wait(data));
215
216 count_off_irt(data);
217
218 if (savevid != data->currvid) {
219 printk(KERN_ERR PFX "vid change on fid trans, old 0x%x, new 0x%x\n",
220 savevid, data->currvid);
221 return 1;
222 }
223
224 if (fid != data->currfid) {
225 printk(KERN_ERR PFX "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
226 data->currfid);
227 return 1;
228 }
229
230 return 0;
231}
232
233/* Write a new vid to the hardware */
234static int write_new_vid(struct powernow_k8_data *data, u32 vid)
235{
236 u32 lo;
237 u32 savefid = data->currfid;
238 int i = 0;
239
240 if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
241 printk(KERN_ERR PFX "internal error - overflow on vid write\n");
242 return 1;
243 }
244
245 lo = data->currfid | (vid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID;
246
247 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
248 vid, lo, STOP_GRANT_5NS);
249
250 do {
251 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
252 if (i++ > 100) {
253 printk(KERN_ERR PFX "internal error - pending bit very stuck - no further pstate changes possible\n");
254 return 1;
255 }
256 } while (query_current_values_with_pending_wait(data));
257
258 if (savefid != data->currfid) {
259 printk(KERN_ERR PFX "fid changed on vid trans, old 0x%x new 0x%x\n",
260 savefid, data->currfid);
261 return 1;
262 }
263
264 if (vid != data->currvid) {
265 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, curr 0x%x\n", vid,
266 data->currvid);
267 return 1;
268 }
269
270 return 0;
271}
272
273/*
274 * Reduce the vid towards reqvid, stepping down by at most 'step' at a time.
275 * Decreasing vid codes represent increasing voltages:
276 * vid of 0 is 1.550V, vid of 0x1e is 0.800V (25mV per code), vid of VID_OFF is off.
277 */
278static int decrease_vid_code_by_step(struct powernow_k8_data *data, u32 reqvid, u32 step)
279{
280 if ((data->currvid - reqvid) > step)
281 reqvid = data->currvid - step;
282
283 if (write_new_vid(data, reqvid))
284 return 1;
285
286 count_off_vst(data);
287
288 return 0;
289}
290
291/* Change hardware pstate by single MSR write */
292static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
293{
294 wrmsr(MSR_PSTATE_CTRL, pstate, 0);
295 data->currfid = find_fid_from_pstate(pstate);
296 return 0;
297}
298
299/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
300static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid)
301{
302 if (core_voltage_pre_transition(data, reqvid))
303 return 1;
304
305 if (core_frequency_transition(data, reqfid))
306 return 1;
307
308 if (core_voltage_post_transition(data, reqvid))
309 return 1;
310
311 if (query_current_values_with_pending_wait(data))
312 return 1;
313
314 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
315 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, curr 0x%x 0x%x\n",
316 smp_processor_id(),
317 reqfid, reqvid, data->currfid, data->currvid);
318 return 1;
319 }
320
321 dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
322 smp_processor_id(), data->currfid, data->currvid);
323
324 return 0;
325}
326
327/* Phase 1 - core voltage transition ... setup voltage */
328static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid)
329{
330 u32 rvosteps = data->rvo;
331 u32 savefid = data->currfid;
332 u32 maxvid, lo;
333
334 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, reqvid 0x%x, rvo 0x%x\n",
335 smp_processor_id(),
336 data->currfid, data->currvid, reqvid, data->rvo);
337
338 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
339 maxvid = 0x1f & (maxvid >> 16);
340 dprintk("ph1 maxvid=0x%x\n", maxvid);
341 if (reqvid < maxvid) /* lower numbers are higher voltages */
342 reqvid = maxvid;
343
344 while (data->currvid > reqvid) {
345 dprintk("ph1: curr 0x%x, req vid 0x%x\n",
346 data->currvid, reqvid);
347 if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
348 return 1;
349 }
350
351 while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) {
352 if (data->currvid == maxvid) {
353 rvosteps = 0;
354 } else {
355 dprintk("ph1: changing vid for rvo, req 0x%x\n",
356 data->currvid - 1);
357 if (decrease_vid_code_by_step(data, data->currvid - 1, 1))
358 return 1;
359 rvosteps--;
360 }
361 }
362
363 if (query_current_values_with_pending_wait(data))
364 return 1;
365
366 if (savefid != data->currfid) {
367 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", data->currfid);
368 return 1;
369 }
370
371 dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
372 data->currfid, data->currvid);
373
374 return 0;
375}
376
377/* Phase 2 - core frequency transition */
378static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
379{
380 u32 vcoreqfid, vcocurrfid, vcofiddiff, fid_interval, savevid = data->currvid;
381
382 if ((reqfid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) {
383 printk(KERN_ERR PFX "ph2: illegal lo-lo transition 0x%x 0x%x\n",
384 reqfid, data->currfid);
385 return 1;
386 }
387
388 if (data->currfid == reqfid) {
389 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid);
390 return 0;
391 }
392
393 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, reqfid 0x%x\n",
394 smp_processor_id(),
395 data->currfid, data->currvid, reqfid);
396
397 vcoreqfid = convert_fid_to_vco_fid(reqfid);
398 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
399 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
400 : vcoreqfid - vcocurrfid;
401
402 while (vcofiddiff > 2) {
403 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
404
405 if (reqfid > data->currfid) {
406 if (data->currfid > LO_FID_TABLE_TOP) {
407 if (write_new_fid(data, data->currfid + fid_interval)) {
408 return 1;
409 }
410 } else {
411 if (write_new_fid
412 (data, 2 + convert_fid_to_vco_fid(data->currfid))) {
413 return 1;
414 }
415 }
416 } else {
417 if (write_new_fid(data, data->currfid - fid_interval))
418 return 1;
419 }
420
421 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
422 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
423 : vcoreqfid - vcocurrfid;
424 }
425
426 if (write_new_fid(data, reqfid))
427 return 1;
428
429 if (query_current_values_with_pending_wait(data))
430 return 1;
431
432 if (data->currfid != reqfid) {
433 printk(KERN_ERR PFX
434 "ph2: mismatch, failed fid transition, curr 0x%x, req 0x%x\n",
435 data->currfid, reqfid);
436 return 1;
437 }
438
439 if (savevid != data->currvid) {
440 printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
441 savevid, data->currvid);
442 return 1;
443 }
444
445 dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
446 data->currfid, data->currvid);
447
448 return 0;
449}
450
451/* Phase 3 - core voltage transition flow ... jump to the final vid. */
452static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid)
453{
454 u32 savefid = data->currfid;
455 u32 savereqvid = reqvid;
456
457 dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
458 smp_processor_id(),
459 data->currfid, data->currvid);
460
461 if (reqvid != data->currvid) {
462 if (write_new_vid(data, reqvid))
463 return 1;
464
465 if (savefid != data->currfid) {
466 printk(KERN_ERR PFX
467 "ph3: bad fid change, save 0x%x, curr 0x%x\n",
468 savefid, data->currfid);
469 return 1;
470 }
471
472 if (data->currvid != reqvid) {
473 printk(KERN_ERR PFX
474 "ph3: failed vid transition\n, req 0x%x, curr 0x%x",
475 reqvid, data->currvid);
476 return 1;
477 }
478 }
479
480 if (query_current_values_with_pending_wait(data))
481 return 1;
482
483 if (savereqvid != data->currvid) {
484 dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
485 return 1;
486 }
487
488 if (savefid != data->currfid) {
489 dprintk("ph3 failed, currfid changed 0x%x\n",
490 data->currfid);
491 return 1;
492 }
493
494 dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
495 data->currfid, data->currvid);
496
497 return 0;
498}
499
500static int check_supported_cpu(unsigned int cpu)
501{
502 cpumask_t oldmask = CPU_MASK_ALL;
503 u32 eax, ebx, ecx, edx;
504 unsigned int rc = 0;
505
506 oldmask = current->cpus_allowed;
507 set_cpus_allowed(current, cpumask_of_cpu(cpu));
508
509 if (smp_processor_id() != cpu) {
510 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
511 goto out;
512 }
513
514 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
515 goto out;
516
517 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
518 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
519 ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
520 goto out;
521
522 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
523 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
524 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
525 printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax);
526 goto out;
527 }
528
529 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
530 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
531 printk(KERN_INFO PFX
532 "No frequency change capabilities detected\n");
533 goto out;
534 }
535
536 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
537 if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) {
538 printk(KERN_INFO PFX "Power state transitions not supported\n");
539 goto out;
540 }
541 } else { /* must be a HW Pstate capable processor */
542 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
543 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
544 cpu_family = CPU_HW_PSTATE;
545 else
546 goto out;
547 }
548
549 rc = 1;
550
551out:
552 set_cpus_allowed(current, oldmask);
553 return rc;
554}
555
556static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid)
557{
558 unsigned int j;
559 u8 lastfid = 0xff;
560
561 for (j = 0; j < data->numps; j++) {
562 if (pst[j].vid > LEAST_VID) {
563 printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid);
564 return -EINVAL;
565 }
566 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */
567 printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j);
568 return -ENODEV;
569 }
570 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */
571 printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j);
572 return -ENODEV;
573 }
574 if (pst[j].fid > MAX_FID) {
575 printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j);
576 return -ENODEV;
577 }
578 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
579 /* Only first fid is allowed to be in "low" range */
580 printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid);
581 return -EINVAL;
582 }
583 if (pst[j].fid < lastfid)
584 lastfid = pst[j].fid;
585 }
586 if (lastfid & 1) {
587 printk(KERN_ERR BFX "lastfid invalid\n");
588 return -EINVAL;
589 }
590 if (lastfid > LO_FID_TABLE_TOP)
591 printk(KERN_INFO BFX "first fid not from lo freq table\n");
592
593 return 0;
594}
595
596static void print_basics(struct powernow_k8_data *data)
597{
598 int j;
599 for (j = 0; j < data->numps; j++) {
600 if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) {
601 if (cpu_family == CPU_HW_PSTATE) {
602 printk(KERN_INFO PFX " %d : fid 0x%x did 0x%x (%d MHz)\n",
603 j,
604 (data->powernow_table[j].index & 0xff00) >> 8,
605 (data->powernow_table[j].index & 0xff0000) >> 16,
606 data->powernow_table[j].frequency/1000);
607 } else {
608 printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n",
609 j,
610 data->powernow_table[j].index & 0xff,
611 data->powernow_table[j].frequency/1000,
612 data->powernow_table[j].index >> 8);
613 }
614 }
615 }
616 if (data->batps)
617 printk(KERN_INFO PFX "Only %d pstates on battery\n", data->batps);
618}
619
620static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid)
621{
622 struct cpufreq_frequency_table *powernow_table;
623 unsigned int j;
624
625 if (data->batps) { /* use ACPI support to get full speed on mains power */
626 printk(KERN_WARNING PFX "Only %d pstates usable (use ACPI driver for full range)\n", data->batps);
627 data->numps = data->batps;
628 }
629
630 for ( j=1; j<data->numps; j++ ) {
631 if (pst[j-1].fid >= pst[j].fid) {
632 printk(KERN_ERR PFX "PST out of sequence\n");
633 return -EINVAL;
634 }
635 }
636
637 if (data->numps < 2) {
638 printk(KERN_ERR PFX "no p states to transition\n");
639 return -ENODEV;
640 }
641
642 if (check_pst_table(data, pst, maxvid))
643 return -EINVAL;
644
645 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
646 * (data->numps + 1)), GFP_KERNEL);
647 if (!powernow_table) {
648 printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
649 return -ENOMEM;
650 }
651
652 for (j = 0; j < data->numps; j++) {
653 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
654 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
655 powernow_table[j].frequency = find_khz_freq_from_fid(pst[j].fid);
656 }
657 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
658 powernow_table[data->numps].index = 0;
659
660 if (query_current_values_with_pending_wait(data)) {
661 kfree(powernow_table);
662 return -EIO;
663 }
664
665 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
666 data->powernow_table = powernow_table;
667 if (first_cpu(cpu_core_map[data->cpu]) == data->cpu)
668 print_basics(data);
669
670 for (j = 0; j < data->numps; j++)
671 if ((pst[j].fid==data->currfid) && (pst[j].vid==data->currvid))
672 return 0;
673
674 dprintk("currfid/vid do not match PST, ignoring\n");
675 return 0;
676}
677
678/* Find and validate the PSB/PST table in BIOS. */
679static int find_psb_table(struct powernow_k8_data *data)
680{
681 struct psb_s *psb;
682 unsigned int i;
683 u32 mvs;
684 u8 maxvid;
685 u32 cpst = 0;
686 u32 thiscpuid;
687
688 for (i = 0xc0000; i < 0xffff0; i += 0x10) {
689 /* Scan BIOS looking for the signature. */
690 /* It cannot be at 0xffff0 - it is too big. */
691
692 psb = phys_to_virt(i);
693 if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
694 continue;
695
696 dprintk("found PSB header at 0x%p\n", psb);
697
698 dprintk("table vers: 0x%x\n", psb->tableversion);
699 if (psb->tableversion != PSB_VERSION_1_4) {
700 printk(KERN_ERR BFX "PSB table is not v1.4\n");
701 return -ENODEV;
702 }
703
704 dprintk("flags: 0x%x\n", psb->flags1);
705 if (psb->flags1) {
706 printk(KERN_ERR BFX "unknown flags\n");
707 return -ENODEV;
708 }
709
710 data->vstable = psb->vstable;
711 dprintk("voltage stabilization time: %d(*20us)\n", data->vstable);
712
713 dprintk("flags2: 0x%x\n", psb->flags2);
714 data->rvo = psb->flags2 & 3;
715 data->irt = ((psb->flags2) >> 2) & 3;
716 mvs = ((psb->flags2) >> 4) & 3;
717 data->vidmvs = 1 << mvs;
718 data->batps = ((psb->flags2) >> 6) & 3;
719
720 dprintk("ramp voltage offset: %d\n", data->rvo);
721 dprintk("isochronous relief time: %d\n", data->irt);
722 dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
723
724 dprintk("numpst: 0x%x\n", psb->num_tables);
725 cpst = psb->num_tables;
726 if ((psb->cpuid == 0x00000fc0) || (psb->cpuid == 0x00000fe0) ){
727 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
728 if ((thiscpuid == 0x00000fc0) || (thiscpuid == 0x00000fe0) ) {
729 cpst = 1;
730 }
731 }
732 if (cpst != 1) {
733 printk(KERN_ERR BFX "numpst must be 1\n");
734 return -ENODEV;
735 }
736
737 data->plllock = psb->plllocktime;
738 dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
739 dprintk("maxfid: 0x%x\n", psb->maxfid);
740 dprintk("maxvid: 0x%x\n", psb->maxvid);
741 maxvid = psb->maxvid;
742
743 data->numps = psb->numps;
744 dprintk("numpstates: 0x%x\n", data->numps);
745 return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid);
746 }
747 /*
748 * If you see this message, complain to BIOS manufacturer. If
749 * he tells you "we do not support Linux" or some similar
750 * nonsense, remember that Windows 2000 uses the same legacy
751 * mechanism that the old Linux PSB driver uses. Tell them it
752 * is broken with Windows 2000.
753 *
754 * The reference to the AMD documentation is chapter 9 in the
755 * BIOS and Kernel Developer's Guide, which is available on
756 * www.amd.com
757 */
758 printk(KERN_ERR PFX "BIOS error - no PSB or ACPI _PSS objects\n");
759 return -ENODEV;
760}
761
762#ifdef CONFIG_X86_POWERNOW_K8_ACPI
763static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index)
764{
765 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
766 return;
767
768 data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK;
769 data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK;
770 data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
771 data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK;
772 data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK);
773 data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK;
774}
775
776static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
777{
778 struct cpufreq_frequency_table *powernow_table;
779 int ret_val;
780
781 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
782 dprintk("register performance failed: bad ACPI data\n");
783 return -EIO;
784 }
785
786 /* verify the data contained in the ACPI structures */
787 if (data->acpi_data.state_count <= 1) {
788 dprintk("No ACPI P-States\n");
789 goto err_out;
790 }
791
792 if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
793 (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
794 dprintk("Invalid control/status registers (%x - %x)\n",
795 data->acpi_data.control_register.space_id,
796 data->acpi_data.status_register.space_id);
797 goto err_out;
798 }
799
800 /* fill in data->powernow_table */
801 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
802 * (data->acpi_data.state_count + 1)), GFP_KERNEL);
803 if (!powernow_table) {
804 dprintk("powernow_table memory alloc failure\n");
805 goto err_out;
806 }
807
808 if (cpu_family == CPU_HW_PSTATE)
809 ret_val = fill_powernow_table_pstate(data, powernow_table);
810 else
811 ret_val = fill_powernow_table_fidvid(data, powernow_table);
812 if (ret_val)
813 goto err_out_mem;
814
815 powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END;
816 powernow_table[data->acpi_data.state_count].index = 0;
817 data->powernow_table = powernow_table;
818
819 /* fill in data */
820 data->numps = data->acpi_data.state_count;
821 if (first_cpu(cpu_core_map[data->cpu]) == data->cpu)
822 print_basics(data);
823 powernow_k8_acpi_pst_values(data, 0);
824
825 /* notify BIOS that we exist */
826 acpi_processor_notify_smm(THIS_MODULE);
827
828 return 0;
829
830err_out_mem:
831 kfree(powernow_table);
832
833err_out:
834 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
835
836 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
837 data->acpi_data.state_count = 0;
838
839 return -ENODEV;
840}
841
842static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
843{
844 int i;
845
846 for (i = 0; i < data->acpi_data.state_count; i++) {
847 u32 index;
848 u32 hi = 0, lo = 0;
849 u32 fid;
850 u32 did;
851
852 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
853 if (index > MAX_HW_PSTATE) {
854 printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index);
855 printk(KERN_ERR PFX "Please report to BIOS manufacturer\n");
856 }
857 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
858 if (!(hi & HW_PSTATE_VALID_MASK)) {
859 dprintk("invalid pstate %d, ignoring\n", index);
860 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
861 continue;
862 }
863
864 fid = lo & HW_PSTATE_FID_MASK;
865 did = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;
866
867 dprintk(" %d : fid 0x%x, did 0x%x\n", index, fid, did);
868
869 powernow_table[i].index = index | (fid << HW_FID_INDEX_SHIFT) | (did << HW_DID_INDEX_SHIFT);
870
871 powernow_table[i].frequency = find_khz_freq_from_fiddid(fid, did);
872
873 if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) {
874 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n",
875 powernow_table[i].frequency,
876 (unsigned int) (data->acpi_data.states[i].core_frequency * 1000));
877 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
878 continue;
879 }
880 }
881 return 0;
882}
883
884static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
885{
886 int i;
887 int cntlofreq = 0;
888 for (i = 0; i < data->acpi_data.state_count; i++) {
889 u32 fid;
890 u32 vid;
891
892 if (data->exttype) {
893 fid = data->acpi_data.states[i].status & EXT_FID_MASK;
894 vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK;
895 } else {
896 fid = data->acpi_data.states[i].control & FID_MASK;
897 vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK;
898 }
899
900 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
901
902 powernow_table[i].index = fid; /* lower 8 bits */
903 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
904 powernow_table[i].frequency = find_khz_freq_from_fid(fid);
905
906 /* verify frequency is OK */
907 if ((powernow_table[i].frequency > (MAX_FREQ * 1000)) ||
908 (powernow_table[i].frequency < (MIN_FREQ * 1000))) {
909 dprintk("invalid freq %u kHz, ignoring\n", powernow_table[i].frequency);
910 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
911 continue;
912 }
913
914 /* verify voltage is OK - BIOSs are using "off" to indicate invalid */
915 if (vid == VID_OFF) {
916 dprintk("invalid vid %u, ignoring\n", vid);
917 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
918 continue;
919 }
920
921 /* verify only 1 entry from the lo frequency table */
922 if (fid < HI_FID_TABLE_BOTTOM) {
923 if (cntlofreq) {
924 /* if both entries are the same, ignore this one ... */
925 if ((powernow_table[i].frequency != powernow_table[cntlofreq].frequency) ||
926 (powernow_table[i].index != powernow_table[cntlofreq].index)) {
927 printk(KERN_ERR PFX "Too many lo freq table entries\n");
928 return 1;
929 }
930
931 dprintk("double low frequency table entry, ignoring it.\n");
932 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
933 continue;
934 } else
935 cntlofreq = i;
936 }
937
938 if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) {
939 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n",
940 powernow_table[i].frequency,
941 (unsigned int) (data->acpi_data.states[i].core_frequency * 1000));
942 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
943 continue;
944 }
945 }
946 return 0;
947}
948
949static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
950{
951 if (data->acpi_data.state_count)
952 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
953}
954
955#else
956static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
957static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
958static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
959#endif /* CONFIG_X86_POWERNOW_K8_ACPI */
960
961/* Take a frequency, and issue the fid/vid transition command */
962static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned int index)
963{
964 u32 fid = 0;
965 u32 vid = 0;
966 int res, i;
967 struct cpufreq_freqs freqs;
968
969 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
970
971 /* fid/vid correctness check for k8 */
972 /* fid are the lower 8 bits of the index we stored into
973 * the cpufreq frequency table in find_psb_table, vid
974 * are the upper 8 bits.
975 */
976 fid = data->powernow_table[index].index & 0xFF;
977 vid = (data->powernow_table[index].index & 0xFF00) >> 8;
978
979 dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
980
981 if (query_current_values_with_pending_wait(data))
982 return 1;
983
984 if ((data->currvid == vid) && (data->currfid == fid)) {
985 dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
986 fid, vid);
987 return 0;
988 }
989
990 if ((fid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) {
991 printk(KERN_ERR PFX
992 "ignoring illegal change in lo freq table-%x to 0x%x\n",
993 data->currfid, fid);
994 return 1;
995 }
996
997 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
998 smp_processor_id(), fid, vid);
999 freqs.old = find_khz_freq_from_fid(data->currfid);
1000 freqs.new = find_khz_freq_from_fid(fid);
1001
1002 for_each_cpu_mask(i, *(data->available_cores)) {
1003 freqs.cpu = i;
1004 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1005 }
1006
1007 res = transition_fid_vid(data, fid, vid);
1008 freqs.new = find_khz_freq_from_fid(data->currfid);
1009
1010 for_each_cpu_mask(i, *(data->available_cores)) {
1011 freqs.cpu = i;
1012 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1013 }
1014 return res;
1015}
1016
1017/* Take a frequency, and issue the hardware pstate transition command */
1018static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned int index)
1019{
1020 u32 fid = 0;
1021 u32 did = 0;
1022 u32 pstate = 0;
1023 int res, i;
1024 struct cpufreq_freqs freqs;
1025
1026 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1027
1028 /* get fid did for hardware pstate transition */
1029 pstate = index & HW_PSTATE_MASK;
1030 if (pstate > MAX_HW_PSTATE)
1031 return 0;
1032 fid = (index & HW_FID_INDEX_MASK) >> HW_FID_INDEX_SHIFT;
1033 did = (index & HW_DID_INDEX_MASK) >> HW_DID_INDEX_SHIFT;
1034 freqs.old = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1035 freqs.new = find_khz_freq_from_fiddid(fid, did);
1036
1037 for_each_cpu_mask(i, *(data->available_cores)) {
1038 freqs.cpu = i;
1039 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1040 }
1041
1042 res = transition_pstate(data, pstate);
1043 data->currfid = find_fid_from_pstate(pstate);
1044 data->currdid = find_did_from_pstate(pstate);
1045 freqs.new = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1046
1047 for_each_cpu_mask(i, *(data->available_cores)) {
1048 freqs.cpu = i;
1049 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1050 }
1051 return res;
1052}
1053
1054/* Driver entry point to switch to the target frequency */
1055static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
1056{
1057 cpumask_t oldmask = CPU_MASK_ALL;
1058 struct powernow_k8_data *data = powernow_data[pol->cpu];
1059 u32 checkfid;
1060 u32 checkvid;
1061 unsigned int newstate;
1062 int ret = -EIO;
1063
1064 if (!data)
1065 return -EINVAL;
1066
1067 checkfid = data->currfid;
1068 checkvid = data->currvid;
1069
1070 /* only run on specific CPU from here on */
1071 oldmask = current->cpus_allowed;
1072 set_cpus_allowed(current, cpumask_of_cpu(pol->cpu));
1073
1074 if (smp_processor_id() != pol->cpu) {
1075 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1076 goto err_out;
1077 }
1078
1079 if (pending_bit_stuck()) {
1080 printk(KERN_ERR PFX "failing targ, change pending bit set\n");
1081 goto err_out;
1082 }
1083
1084 dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
1085 pol->cpu, targfreq, pol->min, pol->max, relation);
1086
1087 if (query_current_values_with_pending_wait(data))
1088 goto err_out;
1089
1090 if (cpu_family == CPU_HW_PSTATE)
1091 dprintk("targ: curr fid 0x%x, did 0x%x\n",
1092 data->currfid, data->currdid);
1093 else {
1094 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
1095 data->currfid, data->currvid);
1096
1097 if ((checkvid != data->currvid) || (checkfid != data->currfid)) {
1098 printk(KERN_INFO PFX
1099 "error - out of sync, fix 0x%x 0x%x, vid 0x%x 0x%x\n",
1100 checkfid, data->currfid, checkvid, data->currvid);
1101 }
1102 }
1103
1104 if (cpufreq_frequency_table_target(pol, data->powernow_table, targfreq, relation, &newstate))
1105 goto err_out;
1106
1107 mutex_lock(&fidvid_mutex);
1108
1109 powernow_k8_acpi_pst_values(data, newstate);
1110
1111 if (cpu_family == CPU_HW_PSTATE)
1112 ret = transition_frequency_pstate(data, newstate);
1113 else
1114 ret = transition_frequency_fidvid(data, newstate);
1115 if (ret) {
1116 printk(KERN_ERR PFX "transition frequency failed\n");
1117 ret = 1;
1118 mutex_unlock(&fidvid_mutex);
1119 goto err_out;
1120 }
1121 mutex_unlock(&fidvid_mutex);
1122
1123 if (cpu_family == CPU_HW_PSTATE)
1124 pol->cur = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1125 else
1126 pol->cur = find_khz_freq_from_fid(data->currfid);
1127 ret = 0;
1128
1129err_out:
1130 set_cpus_allowed(current, oldmask);
1131 return ret;
1132}
1133
1134/* Driver entry point to verify the policy and range of frequencies */
1135static int powernowk8_verify(struct cpufreq_policy *pol)
1136{
1137 struct powernow_k8_data *data = powernow_data[pol->cpu];
1138
1139 if (!data)
1140 return -EINVAL;
1141
1142 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1143}
1144
1145/* per CPU init entry point to the driver */
1146static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1147{
1148 struct powernow_k8_data *data;
1149 cpumask_t oldmask = CPU_MASK_ALL;
1150 int rc;
1151
1152 if (!cpu_online(pol->cpu))
1153 return -ENODEV;
1154
1155 if (!check_supported_cpu(pol->cpu))
1156 return -ENODEV;
1157
1158 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
1159 if (!data) {
1160 printk(KERN_ERR PFX "unable to alloc powernow_k8_data\n");
1161 return -ENOMEM;
1162 }
1163
1164 data->cpu = pol->cpu;
1165
1166 if (powernow_k8_cpu_init_acpi(data)) {
1167 /*
1168 * Use the PSB BIOS structure. This is only available on
1169 * a UP version, and is deprecated by AMD.
1170 */
1171 if (num_online_cpus() != 1) {
1172 printk(KERN_ERR PFX "MP systems not supported by PSB BIOS structure\n");
1173 kfree(data);
1174 return -ENODEV;
1175 }
1176 if (pol->cpu != 0) {
1177 printk(KERN_ERR PFX "No _PSS objects for CPU other than CPU0\n");
1178 kfree(data);
1179 return -ENODEV;
1180 }
1181 rc = find_psb_table(data);
1182 if (rc) {
1183 kfree(data);
1184 return -ENODEV;
1185 }
1186 }
1187
1188 /* only run on specific CPU from here on */
1189 oldmask = current->cpus_allowed;
1190 set_cpus_allowed(current, cpumask_of_cpu(pol->cpu));
1191
1192 if (smp_processor_id() != pol->cpu) {
1193 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1194 goto err_out;
1195 }
1196
1197 if (pending_bit_stuck()) {
1198 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1199 goto err_out;
1200 }
1201
1202 if (query_current_values_with_pending_wait(data))
1203 goto err_out;
1204
1205 if (cpu_family == CPU_OPTERON)
1206 fidvid_msr_init();
1207
1208 /* run on any CPU again */
1209 set_cpus_allowed(current, oldmask);
1210
1211 pol->governor = CPUFREQ_DEFAULT_GOVERNOR;
1212 if (cpu_family == CPU_HW_PSTATE)
1213 pol->cpus = cpumask_of_cpu(pol->cpu);
1214 else
1215 pol->cpus = cpu_core_map[pol->cpu];
1216 data->available_cores = &(pol->cpus);
1217
1218 /* Take a crude guess here.
1219	 * That guess was in microseconds, so multiply by 1000 */
1220 pol->cpuinfo.transition_latency = (((data->rvo + 8) * data->vstable * VST_UNITS_20US)
1221 + (3 * (1 << data->irt) * 10)) * 1000;
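	/*
	 * Worked example (hypothetical values, not from this patch): with
	 * rvo = 0, vstable = 5 and irt = 2 the estimate above evaluates to
	 * ((0 + 8) * 5 * 20 + 3 * (1 << 2) * 10) * 1000 = 920000 ns,
	 * i.e. roughly 0.9 ms of transition latency.
	 */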
1222
1223 if (cpu_family == CPU_HW_PSTATE)
1224 pol->cur = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1225 else
1226 pol->cur = find_khz_freq_from_fid(data->currfid);
1227 dprintk("policy current frequency %d kHz\n", pol->cur);
1228
1229 /* min/max the cpu is capable of */
1230 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1231 printk(KERN_ERR PFX "invalid powernow_table\n");
1232 powernow_k8_cpu_exit_acpi(data);
1233 kfree(data->powernow_table);
1234 kfree(data);
1235 return -EINVAL;
1236 }
1237
1238 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1239
1240 if (cpu_family == CPU_HW_PSTATE)
1241 dprintk("cpu_init done, current fid 0x%x, did 0x%x\n",
1242 data->currfid, data->currdid);
1243 else
1244 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1245 data->currfid, data->currvid);
1246
1247 powernow_data[pol->cpu] = data;
1248
1249 return 0;
1250
1251err_out:
1252 set_cpus_allowed(current, oldmask);
1253 powernow_k8_cpu_exit_acpi(data);
1254
1255 kfree(data);
1256 return -ENODEV;
1257}
1258
1259static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
1260{
1261 struct powernow_k8_data *data = powernow_data[pol->cpu];
1262
1263 if (!data)
1264 return -EINVAL;
1265
1266 powernow_k8_cpu_exit_acpi(data);
1267
1268 cpufreq_frequency_table_put_attr(pol->cpu);
1269
1270 kfree(data->powernow_table);
1271 kfree(data);
1272
1273 return 0;
1274}
1275
1276static unsigned int powernowk8_get (unsigned int cpu)
1277{
1278 struct powernow_k8_data *data;
1279 cpumask_t oldmask = current->cpus_allowed;
1280 unsigned int khz = 0;
1281
1282 data = powernow_data[first_cpu(cpu_core_map[cpu])];
1283
1284 if (!data)
1285 return -EINVAL;
1286
1287 set_cpus_allowed(current, cpumask_of_cpu(cpu));
1288 if (smp_processor_id() != cpu) {
1289 printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu);
1290 set_cpus_allowed(current, oldmask);
1291 return 0;
1292 }
1293
1294 if (query_current_values_with_pending_wait(data))
1295 goto out;
1296
1297 if (cpu_family == CPU_HW_PSTATE)
1298 khz = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1299 else
1300 khz = find_khz_freq_from_fid(data->currfid);
1301
1302
1303out:
1304 set_cpus_allowed(current, oldmask);
1305 return khz;
1306}
1307
1308static struct freq_attr* powernow_k8_attr[] = {
1309 &cpufreq_freq_attr_scaling_available_freqs,
1310 NULL,
1311};
1312
1313static struct cpufreq_driver cpufreq_amd64_driver = {
1314 .verify = powernowk8_verify,
1315 .target = powernowk8_target,
1316 .init = powernowk8_cpu_init,
1317 .exit = __devexit_p(powernowk8_cpu_exit),
1318 .get = powernowk8_get,
1319 .name = "powernow-k8",
1320 .owner = THIS_MODULE,
1321 .attr = powernow_k8_attr,
1322};
1323
1324/* driver entry point for init */
1325static int __cpuinit powernowk8_init(void)
1326{
1327 unsigned int i, supported_cpus = 0;
1328 unsigned int booted_cores = 1;
1329
1330 for_each_online_cpu(i) {
1331 if (check_supported_cpu(i))
1332 supported_cpus++;
1333 }
1334
1335#ifdef CONFIG_SMP
1336 booted_cores = cpu_data[0].booted_cores;
1337#endif
1338
1339 if (supported_cpus == num_online_cpus()) {
1340 printk(KERN_INFO PFX "Found %d %s "
1341 "processors (%d cpu cores) (" VERSION ")\n",
1342 supported_cpus/booted_cores,
1343 boot_cpu_data.x86_model_id, supported_cpus);
1344 return cpufreq_register_driver(&cpufreq_amd64_driver);
1345 }
1346
1347 return -ENODEV;
1348}
1349
1350/* driver entry point for term */
1351static void __exit powernowk8_exit(void)
1352{
1353 dprintk("exit\n");
1354
1355 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1356}
1357
1358MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>");
1359MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1360MODULE_LICENSE("GPL");
1361
1362late_initcall(powernowk8_init);
1363module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
new file mode 100644
index 000000000000..b06c812208ca
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -0,0 +1,232 @@
1/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8struct powernow_k8_data {
9 unsigned int cpu;
10
11 u32 numps; /* number of p-states */
12 u32 batps; /* number of p-states supported on battery */
13
14 /* these values are constant when the PSB is used to determine
15 * vid/fid pairings, but are modified during the ->target() call
16 * when ACPI is used */
17 u32 rvo; /* ramp voltage offset */
18 u32 irt; /* isochronous relief time */
19 u32 vidmvs; /* usable value calculated from mvs */
20 u32 vstable; /* voltage stabilization time, units 20 us */
21 u32 plllock; /* pll lock time, units 1 us */
22 u32 exttype; /* extended interface = 1 */
23
24 /* keep track of the current fid / vid or did */
25 u32 currvid, currfid, currdid;
26
27 /* the powernow_table includes all frequency and vid/fid pairings:
28 * fid are the lower 8 bits of the index, vid are the upper 8 bits.
29 * frequency is in kHz */
30 struct cpufreq_frequency_table *powernow_table;
31
32#ifdef CONFIG_X86_POWERNOW_K8_ACPI
33 /* the acpi table needs to be kept. it's only available if ACPI was
34 * used to determine valid frequency/vid/fid states */
35 struct acpi_processor_performance acpi_data;
36#endif
37 /* we need to keep track of associated cores, but let cpufreq
38 * handle hotplug events - so just point at cpufreq pol->cpus
39 * structure */
40 cpumask_t *available_cores;
41};
42
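/*
 * Illustrative sketch, not part of this patch: splitting a powernow_table
 * index into its fid/vid halves, per the layout described in the comment
 * above (fid in bits 7:0, vid in bits 15:8 of the index).
 */
static inline void example_split_index(u32 index, u32 *fid, u32 *vid)
{
	*fid = index & 0xff;		/* lower 8 bits */
	*vid = (index >> 8) & 0xff;	/* upper 8 bits */
}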
43
44/* processor's cpuid instruction support */
45#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
46#define CPUID_XFAM 0x0ff00000 /* extended family */
47#define CPUID_XFAM_K8 0
48#define CPUID_XMOD 0x000f0000 /* extended model */
49#define CPUID_XMOD_REV_MASK 0x00080000
50#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
51#define CPUID_USE_XFAM_XMOD 0x00000f00
52#define CPUID_GET_MAX_CAPABILITIES 0x80000000
53#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
54#define P_STATE_TRANSITION_CAPABLE 6
55
56/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
57/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
58/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
59/* the register number is placed in ecx, and the data is returned in edx:eax. */
60
61#define MSR_FIDVID_CTL 0xc0010041
62#define MSR_FIDVID_STATUS 0xc0010042
63
64/* Field definitions within the FID VID Low Control MSR : */
65#define MSR_C_LO_INIT_FID_VID 0x00010000
66#define MSR_C_LO_NEW_VID 0x00003f00
67#define MSR_C_LO_NEW_FID 0x0000003f
68#define MSR_C_LO_VID_SHIFT 8
69
70/* Field definitions within the FID VID High Control MSR : */
71#define MSR_C_HI_STP_GNT_TO 0x000fffff
72
73/* Field definitions within the FID VID Low Status MSR : */
74#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
75#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
76#define MSR_S_LO_MAX_FID 0x003f0000
77#define MSR_S_LO_START_FID 0x00003f00
78#define MSR_S_LO_CURRENT_FID 0x0000003f
79
80/* Field definitions within the FID VID High Status MSR : */
81#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
82#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
83#define MSR_S_HI_START_VID 0x00003f00
84#define MSR_S_HI_CURRENT_VID 0x0000003f
85#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
86
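/*
 * Illustrative sketch, not part of this patch: reading the FID/VID status
 * MSR and extracting the current fid/vid with the masks above.  Assumes the
 * rdmsr() helper from <asm/msr.h>; the query_current_values_* helpers in
 * powernow-k8.c do the real work, including the change-pending wait.
 */
static inline void example_read_fidvid_status(u32 *fid, u32 *vid)
{
	u32 lo, hi;

	rdmsr(MSR_FIDVID_STATUS, lo, hi);	/* register number goes in ecx */
	*fid = lo & MSR_S_LO_CURRENT_FID;	/* low word, bits 5:0 */
	*vid = hi & MSR_S_HI_CURRENT_VID;	/* high word, bits 5:0 */
}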
87
88/* Hardware Pstate _PSS and MSR definitions */
89#define USE_HW_PSTATE 0x00000080
90#define HW_PSTATE_FID_MASK 0x0000003f
91#define HW_PSTATE_DID_MASK 0x000001c0
92#define HW_PSTATE_DID_SHIFT 6
93#define HW_PSTATE_MASK 0x00000007
94#define HW_PSTATE_VALID_MASK 0x80000000
95#define HW_FID_INDEX_SHIFT 8
96#define HW_FID_INDEX_MASK 0x0000ff00
97#define HW_DID_INDEX_SHIFT 16
98#define HW_DID_INDEX_MASK 0x00ff0000
99#define HW_WATTS_MASK 0xff
100#define HW_PWR_DVR_MASK 0x300
101#define HW_PWR_DVR_SHIFT 8
102#define HW_PWR_MAX_MULT 3
103#define MAX_HW_PSTATE 8 /* hw pstate supports up to 8 */
104#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
105#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
106#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
107
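/*
 * Illustrative sketch, not part of this patch: decoding one family 0x10
 * P-state definition MSR (low word) with the masks above.  The core
 * frequency formula 100 MHz * (fid + 0x10) >> did is an assumption of this
 * sketch; it mirrors the fid/did handling in powernow-k8.c.
 */
static inline unsigned int example_pstate_def_to_khz(u32 lo)
{
	u32 fid = lo & HW_PSTATE_FID_MASK;
	u32 did = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;

	return (100000 * (fid + 0x10)) >> did;
}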
108/* define the two driver architectures */
109#define CPU_OPTERON 0
110#define CPU_HW_PSTATE 1
111
112
113/*
114 * There are restrictions that frequencies have to follow:
115 * - only 1 entry in the low fid table ( <=1.4GHz )
116 * - lowest entry in the high fid table must be >= 2 * the entry in the
117 * low fid table
118 * - lowest entry in the high fid table must be <= 200MHz + 2 * the entry
119 * in the low fid table
120 * - the parts can only step at <= 200 MHz intervals, odd fid values are
121 * supported in revision G and later revisions.
122 * - lowest frequency must be >= interprocessor hypertransport link speed
123 * (only applies to MP systems obviously)
124 */
125
126/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
127#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
128#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
129
130#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
131#define HI_VCOFREQ_TABLE_BOTTOM 1600
132
133#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
134
135#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
136#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
137
138#define MIN_FREQ 800 /* Min and max freqs, per spec */
139#define MAX_FREQ 5000
140
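/*
 * Illustrative sketch, not part of this patch: the fid -> frequency mapping
 * used by the fid/vid (K8) code path.  Each fid step of 1 corresponds to
 * 100 MHz on top of the 800 MHz MIN_FREQ, which is what
 * find_khz_freq_from_fid() in powernow-k8.c returns in kHz.
 */
static inline unsigned int example_fid_to_khz(u32 fid)
{
	return 1000 * (MIN_FREQ + 100 * fid);
}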
141#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
142#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
143
144#define VID_OFF 0x3f
145
146#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
147
148#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
149
150#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
151#define VST_UNITS_20US   20	/* Voltage Stabilization Time is in units of 20us */
152
153/*
154 * Most values of interest are encoded in a single field of the _PSS
155 * entries: the "control" value; a decoding sketch follows the masks below.
156 */
157
158#define IRT_SHIFT 30
159#define RVO_SHIFT 28
160#define EXT_TYPE_SHIFT 27
161#define PLL_L_SHIFT 20
162#define MVS_SHIFT 18
163#define VST_SHIFT 11
164#define VID_SHIFT 6
165#define IRT_MASK 3
166#define RVO_MASK 3
167#define EXT_TYPE_MASK 1
168#define PLL_L_MASK 0x7f
169#define MVS_MASK 3
170#define VST_MASK 0x7f
171#define VID_MASK 0x1f
172#define FID_MASK 0x1f
173#define EXT_VID_MASK 0x3f
174#define EXT_FID_MASK 0x3f
175
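/*
 * Illustrative sketch, not part of this patch: unpacking an ACPI _PSS
 * "control" value with the shift/mask pairs above.  The assignments mirror
 * what powernow_k8_acpi_pst_values() stores in struct powernow_k8_data.
 */
static inline void example_decode_pss_control(u32 control,
					      struct powernow_k8_data *data)
{
	data->irt     = (control >> IRT_SHIFT) & IRT_MASK;
	data->rvo     = (control >> RVO_SHIFT) & RVO_MASK;
	data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
	data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
	data->vidmvs  = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
	data->vstable = (control >> VST_SHIFT) & VST_MASK;
}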
176
177/*
178 * Version 1.4 of the PSB table. This table is constructed by the BIOS
179 * to tell the OS's power management driver which VIDs and FIDs are
180 * supported by this particular processor.
181 * If the data in the PSB / PST is wrong, then this driver will program the
182 * wrong values into hardware, which is very likely to lead to a crash.
183 */
184
185#define PSB_ID_STRING "AMDK7PNOW!"
186#define PSB_ID_STRING_LEN 10
187
188#define PSB_VERSION_1_4 0x14
189
190struct psb_s {
191 u8 signature[10];
192 u8 tableversion;
193 u8 flags1;
194 u16 vstable;
195 u8 flags2;
196 u8 num_tables;
197 u32 cpuid;
198 u8 plllocktime;
199 u8 maxfid;
200 u8 maxvid;
201 u8 numps;
202};
203
204/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
205struct pst_s {
206 u8 fid;
207 u8 vid;
208};
209
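/*
 * Illustrative sketch, not part of this patch: how a PSB table is located.
 * find_psb_table() in powernow-k8.c scans the BIOS area for PSB_ID_STRING
 * and reads the struct psb_s header, followed by numps struct pst_s
 * fid/vid pairs.  The address range and the phys_to_virt() mapping are
 * assumptions of this sketch (it needs <asm/io.h> and <linux/string.h>).
 */
static inline struct psb_s *example_find_psb(void)
{
	unsigned int i;

	for (i = 0xc0000; i < 0xffff0; i += 0x10) {
		struct psb_s *psb = phys_to_virt(i);

		if (memcmp(psb->signature, PSB_ID_STRING,
			   PSB_ID_STRING_LEN) == 0 &&
		    psb->tableversion == PSB_VERSION_1_4)
			return psb;	/* pst_s entries follow this header */
	}
	return NULL;
}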
210#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
211
212static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid);
213static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
214static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
215
216static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
217
218#ifdef CONFIG_X86_POWERNOW_K8_ACPI
219static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
220static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
221#endif
222
223#ifdef CONFIG_SMP
224static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
225{
226}
227#else
228static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
229{
230 cpu_set(0, cpu_sharedcore_mask[0]);
231}
232#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
new file mode 100644
index 000000000000..b8fb4b521c62
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
@@ -0,0 +1,191 @@
1/*
2 * sc520_freq.c: cpufreq driver for the AMD Elan sc520
3 *
4 * Copyright (C) 2005 Sean Young <sean@mess.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Based on elanfreq.c
12 *
13 * 2005-03-30: - initial revision
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19
20#include <linux/delay.h>
21#include <linux/cpufreq.h>
22
23#include <asm/msr.h>
24#include <asm/timex.h>
25#include <asm/io.h>
26
27#define MMCR_BASE 0xfffef000 /* The default base address */
28#define OFFS_CPUCTL 0x2 /* CPU Control Register */
29
30static __u8 __iomem *cpuctl;
31
32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "sc520_freq", msg)
33
34static struct cpufreq_frequency_table sc520_freq_table[] = {
35 {0x01, 100000},
36 {0x02, 133000},
37 {0, CPUFREQ_TABLE_END},
38};
39
40static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
41{
42 u8 clockspeed_reg = *cpuctl;
43
44 switch (clockspeed_reg & 0x03) {
45 default:
46 printk(KERN_ERR "sc520_freq: error: cpuctl register has unexpected value %02x\n", clockspeed_reg);
47 case 0x01:
48 return 100000;
49 case 0x02:
50 return 133000;
51 }
52}
53
54static void sc520_freq_set_cpu_state (unsigned int state)
55{
56
57 struct cpufreq_freqs freqs;
58 u8 clockspeed_reg;
59
60 freqs.old = sc520_freq_get_cpu_frequency(0);
61 freqs.new = sc520_freq_table[state].frequency;
62 freqs.cpu = 0; /* AMD Elan is UP */
63
64 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
65
66 dprintk("attempting to set frequency to %i kHz\n",
67 sc520_freq_table[state].frequency);
68
69 local_irq_disable();
70
71 clockspeed_reg = *cpuctl & ~0x03;
72 *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
73
74 local_irq_enable();
75
76 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
77}
78
79static int sc520_freq_verify (struct cpufreq_policy *policy)
80{
81 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
82}
83
84static int sc520_freq_target (struct cpufreq_policy *policy,
85 unsigned int target_freq,
86 unsigned int relation)
87{
88 unsigned int newstate = 0;
89
90 if (cpufreq_frequency_table_target(policy, sc520_freq_table, target_freq, relation, &newstate))
91 return -EINVAL;
92
93 sc520_freq_set_cpu_state(newstate);
94
95 return 0;
96}
97
98
99/*
100 * Module init and exit code
101 */
102
103static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
104{
105 struct cpuinfo_x86 *c = cpu_data;
106 int result;
107
108 /* capability check */
109 if (c->x86_vendor != X86_VENDOR_AMD ||
110 c->x86 != 4 || c->x86_model != 9)
111 return -ENODEV;
112
113 /* cpuinfo and default policy values */
114 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
115 policy->cpuinfo.transition_latency = 1000000; /* 1ms */
116 policy->cur = sc520_freq_get_cpu_frequency(0);
117
118 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
119 if (result)
120 return (result);
121
122 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
123
124 return 0;
125}
126
127
128static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
129{
130 cpufreq_frequency_table_put_attr(policy->cpu);
131 return 0;
132}
133
134
135static struct freq_attr* sc520_freq_attr[] = {
136 &cpufreq_freq_attr_scaling_available_freqs,
137 NULL,
138};
139
140
141static struct cpufreq_driver sc520_freq_driver = {
142 .get = sc520_freq_get_cpu_frequency,
143 .verify = sc520_freq_verify,
144 .target = sc520_freq_target,
145 .init = sc520_freq_cpu_init,
146 .exit = sc520_freq_cpu_exit,
147 .name = "sc520_freq",
148 .owner = THIS_MODULE,
149 .attr = sc520_freq_attr,
150};
151
152
153static int __init sc520_freq_init(void)
154{
155 struct cpuinfo_x86 *c = cpu_data;
156 int err;
157
158 /* Test if we have the right hardware */
159 if(c->x86_vendor != X86_VENDOR_AMD ||
160 c->x86 != 4 || c->x86_model != 9) {
161 dprintk("no Elan SC520 processor found!\n");
162 return -ENODEV;
163 }
164 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
165 if(!cpuctl) {
166 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
167 return -ENOMEM;
168 }
169
170 err = cpufreq_register_driver(&sc520_freq_driver);
171 if (err)
172 iounmap(cpuctl);
173
174 return err;
175}
176
177
178static void __exit sc520_freq_exit(void)
179{
180 cpufreq_unregister_driver(&sc520_freq_driver);
181 iounmap(cpuctl);
182}
183
184
185MODULE_LICENSE("GPL");
186MODULE_AUTHOR("Sean Young <sean@mess.org>");
187MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
188
189module_init(sc520_freq_init);
190module_exit(sc520_freq_exit);
191
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
new file mode 100644
index 000000000000..6c5dc2c85aeb
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -0,0 +1,634 @@
1/*
2 * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
3 * M (part of the Centrino chipset).
4 *
5 * Since the original Pentium M, most new Intel CPUs support Enhanced
6 * SpeedStep.
7 *
8 * Despite the "SpeedStep" in the name, this is almost entirely unlike
9 * traditional SpeedStep.
10 *
11 * Modelled on speedstep.c
12 *
13 * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/sched.h> /* current */
21#include <linux/delay.h>
22#include <linux/compiler.h>
23
24#include <asm/msr.h>
25#include <asm/processor.h>
26#include <asm/cpufeature.h>
27
28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk"
30
31#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
32
33#define INTEL_MSR_RANGE (0xffff)
34
35struct cpu_id
36{
37 __u8 x86; /* CPU family */
38 __u8 x86_model; /* model */
39 __u8 x86_mask; /* stepping */
40};
41
42enum {
43 CPU_BANIAS,
44 CPU_DOTHAN_A1,
45 CPU_DOTHAN_A2,
46 CPU_DOTHAN_B0,
47 CPU_MP4HT_D0,
48 CPU_MP4HT_E0,
49};
50
51static const struct cpu_id cpu_ids[] = {
52 [CPU_BANIAS] = { 6, 9, 5 },
53 [CPU_DOTHAN_A1] = { 6, 13, 1 },
54 [CPU_DOTHAN_A2] = { 6, 13, 2 },
55 [CPU_DOTHAN_B0] = { 6, 13, 6 },
56 [CPU_MP4HT_D0] = {15, 3, 4 },
57 [CPU_MP4HT_E0] = {15, 4, 1 },
58};
59#define N_IDS ARRAY_SIZE(cpu_ids)
60
61struct cpu_model
62{
63 const struct cpu_id *cpu_id;
64 const char *model_name;
65 unsigned max_freq; /* max clock in kHz */
66
67 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
68};
69static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x);
70
71/* Operating points for current CPU */
72static struct cpu_model *centrino_model[NR_CPUS];
73static const struct cpu_id *centrino_cpu[NR_CPUS];
74
75static struct cpufreq_driver centrino_driver;
76
77#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
78
79/* Computes the correct form for IA32_PERF_CTL MSR for a particular
80 frequency/voltage operating point; frequency in MHz, volts in mV.
81 This is stored as "index" in the structure. */
82#define OP(mhz, mv) \
83 { \
84 .frequency = (mhz) * 1000, \
85 .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
86 }
87
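/*
 * Illustrative sketch, not part of this patch: inverting the OP() encoding
 * above, i.e. recovering MHz and mV from a stored "index".  The 700 mV base
 * and 16 mV step mirror the macro; the driver itself goes the other way via
 * extract_clock().
 */
static inline void example_decode_op_index(unsigned int index,
					   unsigned int *mhz, unsigned int *mv)
{
	*mhz = ((index >> 8) & 0xff) * 100;
	*mv  = (index & 0xff) * 16 + 700;
}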
88/*
89 * These voltage tables were derived from the Intel Pentium M
90 * datasheet, document 25261202.pdf, Table 5. I have verified they
91 * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
92 * M.
93 */
94
95/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
96static struct cpufreq_frequency_table banias_900[] =
97{
98 OP(600, 844),
99 OP(800, 988),
100 OP(900, 1004),
101 { .frequency = CPUFREQ_TABLE_END }
102};
103
104/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
105static struct cpufreq_frequency_table banias_1000[] =
106{
107 OP(600, 844),
108 OP(800, 972),
109 OP(900, 988),
110 OP(1000, 1004),
111 { .frequency = CPUFREQ_TABLE_END }
112};
113
114/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
115static struct cpufreq_frequency_table banias_1100[] =
116{
117 OP( 600, 956),
118 OP( 800, 1020),
119 OP( 900, 1100),
120 OP(1000, 1164),
121 OP(1100, 1180),
122 { .frequency = CPUFREQ_TABLE_END }
123};
124
125
126/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
127static struct cpufreq_frequency_table banias_1200[] =
128{
129 OP( 600, 956),
130 OP( 800, 1004),
131 OP( 900, 1020),
132 OP(1000, 1100),
133 OP(1100, 1164),
134 OP(1200, 1180),
135 { .frequency = CPUFREQ_TABLE_END }
136};
137
138/* Intel Pentium M processor 1.30GHz (Banias) */
139static struct cpufreq_frequency_table banias_1300[] =
140{
141 OP( 600, 956),
142 OP( 800, 1260),
143 OP(1000, 1292),
144 OP(1200, 1356),
145 OP(1300, 1388),
146 { .frequency = CPUFREQ_TABLE_END }
147};
148
149/* Intel Pentium M processor 1.40GHz (Banias) */
150static struct cpufreq_frequency_table banias_1400[] =
151{
152 OP( 600, 956),
153 OP( 800, 1180),
154 OP(1000, 1308),
155 OP(1200, 1436),
156 OP(1400, 1484),
157 { .frequency = CPUFREQ_TABLE_END }
158};
159
160/* Intel Pentium M processor 1.50GHz (Banias) */
161static struct cpufreq_frequency_table banias_1500[] =
162{
163 OP( 600, 956),
164 OP( 800, 1116),
165 OP(1000, 1228),
166 OP(1200, 1356),
167 OP(1400, 1452),
168 OP(1500, 1484),
169 { .frequency = CPUFREQ_TABLE_END }
170};
171
172/* Intel Pentium M processor 1.60GHz (Banias) */
173static struct cpufreq_frequency_table banias_1600[] =
174{
175 OP( 600, 956),
176 OP( 800, 1036),
177 OP(1000, 1164),
178 OP(1200, 1276),
179 OP(1400, 1420),
180 OP(1600, 1484),
181 { .frequency = CPUFREQ_TABLE_END }
182};
183
184/* Intel Pentium M processor 1.70GHz (Banias) */
185static struct cpufreq_frequency_table banias_1700[] =
186{
187 OP( 600, 956),
188 OP( 800, 1004),
189 OP(1000, 1116),
190 OP(1200, 1228),
191 OP(1400, 1308),
192 OP(1700, 1484),
193 { .frequency = CPUFREQ_TABLE_END }
194};
195#undef OP
196
197#define _BANIAS(cpuid, max, name) \
198{ .cpu_id = cpuid, \
199 .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
200 .max_freq = (max)*1000, \
201 .op_points = banias_##max, \
202}
203#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
204
205/* CPU models, their operating frequency range, and freq/voltage
206 operating points */
207static struct cpu_model models[] =
208{
209 _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
210 BANIAS(1000),
211 BANIAS(1100),
212 BANIAS(1200),
213 BANIAS(1300),
214 BANIAS(1400),
215 BANIAS(1500),
216 BANIAS(1600),
217 BANIAS(1700),
218
219 /* NULL model_name is a wildcard */
220 { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
221 { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
222 { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
223 { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
224 { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
225
226 { NULL, }
227};
228#undef _BANIAS
229#undef BANIAS
230
231static int centrino_cpu_init_table(struct cpufreq_policy *policy)
232{
233 struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu];
234 struct cpu_model *model;
235
236 for(model = models; model->cpu_id != NULL; model++)
237 if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
238 (model->model_name == NULL ||
239 strcmp(cpu->x86_model_id, model->model_name) == 0))
240 break;
241
242 if (model->cpu_id == NULL) {
243 /* No match at all */
244 dprintk("no support for CPU model \"%s\": "
245 "send /proc/cpuinfo to " MAINTAINER "\n",
246 cpu->x86_model_id);
247 return -ENOENT;
248 }
249
250 if (model->op_points == NULL) {
251 /* Matched a non-match */
252 dprintk("no table support for CPU model \"%s\"\n",
253 cpu->x86_model_id);
254 dprintk("try using the acpi-cpufreq driver\n");
255 return -ENOENT;
256 }
257
258 centrino_model[policy->cpu] = model;
259
260 dprintk("found \"%s\": max frequency: %dkHz\n",
261 model->model_name, model->max_freq);
262
263 return 0;
264}
265
266#else
267static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; }
268#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
269
270static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x)
271{
272 if ((c->x86 == x->x86) &&
273 (c->x86_model == x->x86_model) &&
274 (c->x86_mask == x->x86_mask))
275 return 1;
276 return 0;
277}
278
279/* To be called only after centrino_model is initialized */
280static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
281{
282 int i;
283
284 /*
285 * Extract clock in kHz from PERF_CTL value
286 * for centrino, as some DSDTs are buggy.
287 * Ideally, this can be done using the acpi_data structure.
288 */
289 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) ||
290 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) ||
291 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) {
292 msr = (msr >> 8) & 0xff;
293 return msr * 100000;
294 }
295
296 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points))
297 return 0;
298
299 msr &= 0xffff;
300 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) {
301 if (msr == centrino_model[cpu]->op_points[i].index)
302 return centrino_model[cpu]->op_points[i].frequency;
303 }
304 if (failsafe)
305 return centrino_model[cpu]->op_points[i-1].frequency;
306 else
307 return 0;
308}
309
310/* Return the current CPU frequency in kHz */
311static unsigned int get_cur_freq(unsigned int cpu)
312{
313 unsigned l, h;
314 unsigned clock_freq;
315 cpumask_t saved_mask;
316
317 saved_mask = current->cpus_allowed;
318 set_cpus_allowed(current, cpumask_of_cpu(cpu));
319 if (smp_processor_id() != cpu)
320 return 0;
321
322 rdmsr(MSR_IA32_PERF_STATUS, l, h);
323 clock_freq = extract_clock(l, cpu, 0);
324
325 if (unlikely(clock_freq == 0)) {
326 /*
327 * On some CPUs, we can see transient MSR values (which are
328 * not present in _PSS), while CPU is doing some automatic
329 * P-state transition (like TM2). Get the last freq set
330 * in PERF_CTL.
331 */
332 rdmsr(MSR_IA32_PERF_CTL, l, h);
333 clock_freq = extract_clock(l, cpu, 1);
334 }
335
336 set_cpus_allowed(current, saved_mask);
337 return clock_freq;
338}
339
340
341static int centrino_cpu_init(struct cpufreq_policy *policy)
342{
343 struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu];
344 unsigned freq;
345 unsigned l, h;
346 int ret;
347 int i;
348
349 /* Only Intel makes Enhanced Speedstep-capable CPUs */
350 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST))
351 return -ENODEV;
352
353 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
354 centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
355
356 if (policy->cpu != 0)
357 return -ENODEV;
358
359 for (i = 0; i < N_IDS; i++)
360 if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
361 break;
362
363 if (i != N_IDS)
364 centrino_cpu[policy->cpu] = &cpu_ids[i];
365
366 if (!centrino_cpu[policy->cpu]) {
367 dprintk("found unsupported CPU with "
368 "Enhanced SpeedStep: send /proc/cpuinfo to "
369 MAINTAINER "\n");
370 return -ENODEV;
371 }
372
373 if (centrino_cpu_init_table(policy)) {
374 return -ENODEV;
375 }
376
377 /* Check to see if Enhanced SpeedStep is enabled, and try to
378 enable it if not. */
379 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
380
381 if (!(l & (1<<16))) {
382 l |= (1<<16);
383 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
384 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
385
386 /* check to see if it stuck */
387 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
388 if (!(l & (1<<16))) {
389 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n");
390 return -ENODEV;
391 }
392 }
393
394 freq = get_cur_freq(policy->cpu);
395
396 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
397 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */
398 policy->cur = freq;
399
400 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
401
402 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points);
403 if (ret)
404 return (ret);
405
406 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu);
407
408 return 0;
409}
410
411static int centrino_cpu_exit(struct cpufreq_policy *policy)
412{
413 unsigned int cpu = policy->cpu;
414
415 if (!centrino_model[cpu])
416 return -ENODEV;
417
418 cpufreq_frequency_table_put_attr(cpu);
419
420 centrino_model[cpu] = NULL;
421
422 return 0;
423}
424
425/**
426 * centrino_verify - verifies a new CPUFreq policy
427 * @policy: new policy
428 *
429 * Limit must be within this model's frequency range, with at least one
430 * border included.
431 */
432static int centrino_verify (struct cpufreq_policy *policy)
433{
434 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points);
435}
436
437/**
438 * centrino_target - set a new CPUFreq policy
439 * @policy: new policy
440 * @target_freq: the target frequency
441 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
442 *
443 * Sets a new CPUFreq policy.
444 */
445static int centrino_target (struct cpufreq_policy *policy,
446 unsigned int target_freq,
447 unsigned int relation)
448{
449 unsigned int newstate = 0;
450 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
451 struct cpufreq_freqs freqs;
452 cpumask_t online_policy_cpus;
453 cpumask_t saved_mask;
454 cpumask_t set_mask;
455 cpumask_t covered_cpus;
456 int retval = 0;
457 unsigned int j, k, first_cpu, tmp;
458
459 if (unlikely(centrino_model[cpu] == NULL))
460 return -ENODEV;
461
462 if (unlikely(cpufreq_frequency_table_target(policy,
463 centrino_model[cpu]->op_points,
464 target_freq,
465 relation,
466 &newstate))) {
467 return -EINVAL;
468 }
469
470#ifdef CONFIG_HOTPLUG_CPU
471 /* cpufreq holds the hotplug lock, so we are safe from here on */
472 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus);
473#else
474 online_policy_cpus = policy->cpus;
475#endif
476
477 saved_mask = current->cpus_allowed;
478 first_cpu = 1;
479 cpus_clear(covered_cpus);
480 for_each_cpu_mask(j, online_policy_cpus) {
481 /*
482 * Support for SMP systems.
483 * Make sure we are running on CPU that wants to change freq
484 */
485 cpus_clear(set_mask);
486 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
487 cpus_or(set_mask, set_mask, online_policy_cpus);
488 else
489 cpu_set(j, set_mask);
490
491 set_cpus_allowed(current, set_mask);
492 preempt_disable();
493 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) {
494 dprintk("couldn't limit to CPUs in this domain\n");
495 retval = -EAGAIN;
496 if (first_cpu) {
497 /* We haven't started the transition yet. */
498 goto migrate_end;
499 }
500 preempt_enable();
501 break;
502 }
503
504 msr = centrino_model[cpu]->op_points[newstate].index;
505
506 if (first_cpu) {
507 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
508 if (msr == (oldmsr & 0xffff)) {
509 dprintk("no change needed - msr was and needs "
510 "to be %x\n", oldmsr);
511 retval = 0;
512 goto migrate_end;
513 }
514
515 freqs.old = extract_clock(oldmsr, cpu, 0);
516 freqs.new = extract_clock(msr, cpu, 0);
517
518 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
519 target_freq, freqs.old, freqs.new, msr);
520
521 for_each_cpu_mask(k, online_policy_cpus) {
522 freqs.cpu = k;
523 cpufreq_notify_transition(&freqs,
524 CPUFREQ_PRECHANGE);
525 }
526
527 first_cpu = 0;
528 /* all but 16 LSB are reserved, treat them with care */
529 oldmsr &= ~0xffff;
530 msr &= 0xffff;
531 oldmsr |= msr;
532 }
533
534 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
535 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
536 preempt_enable();
537 break;
538 }
539
540 cpu_set(j, covered_cpus);
541 preempt_enable();
542 }
543
544 for_each_cpu_mask(k, online_policy_cpus) {
545 freqs.cpu = k;
546 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
547 }
548
549 if (unlikely(retval)) {
550 /*
551 * We have failed halfway through the frequency change.
552 * We have sent callbacks to policy->cpus and
553		 * MSRs have already been written on covered_cpus.
554 * Best effort undo..
555 */
556
557 if (!cpus_empty(covered_cpus)) {
558 for_each_cpu_mask(j, covered_cpus) {
559 set_cpus_allowed(current, cpumask_of_cpu(j));
560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
561 }
562 }
563
564 tmp = freqs.new;
565 freqs.new = freqs.old;
566 freqs.old = tmp;
567 for_each_cpu_mask(j, online_policy_cpus) {
568 freqs.cpu = j;
569 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 }
572 }
573 set_cpus_allowed(current, saved_mask);
574 return 0;
575
576migrate_end:
577 preempt_enable();
578 set_cpus_allowed(current, saved_mask);
579 return 0;
580}
581
582static struct freq_attr* centrino_attr[] = {
583 &cpufreq_freq_attr_scaling_available_freqs,
584 NULL,
585};
586
587static struct cpufreq_driver centrino_driver = {
588 .name = "centrino", /* should be speedstep-centrino,
589 but there's a 16 char limit */
590 .init = centrino_cpu_init,
591 .exit = centrino_cpu_exit,
592 .verify = centrino_verify,
593 .target = centrino_target,
594 .get = get_cur_freq,
595 .attr = centrino_attr,
596 .owner = THIS_MODULE,
597};
598
599
600/**
601 * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
602 *
603 * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
604 * unsupported devices, -ENOENT if there's no voltage table for this
605 * particular CPU model, -EINVAL on problems during initialization,
606 * and zero on success.
607 *
608 * This is quite picky. Not only does the CPU have to advertise the
609 * "est" flag in the cpuid capability flags, we look for a specific
610 * CPU model and stepping, and we need to have the exact model name in
611 * our voltage tables. That is, be paranoid about not releasing
612 * someone's valuable magic smoke.
613 */
614static int __init centrino_init(void)
615{
616 struct cpuinfo_x86 *cpu = cpu_data;
617
618 if (!cpu_has(cpu, X86_FEATURE_EST))
619 return -ENODEV;
620
621 return cpufreq_register_driver(&centrino_driver);
622}
623
624static void __exit centrino_exit(void)
625{
626 cpufreq_unregister_driver(&centrino_driver);
627}
628
629MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
630MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
631MODULE_LICENSE ("GPL");
632
633late_initcall(centrino_init);
634module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
new file mode 100644
index 000000000000..a5b2346faf1f
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -0,0 +1,440 @@
1/*
2 * (C) 2001 Dave Jones, Arjan van de ven.
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon reverse engineered information, and on Intel documentation
7 * for chipsets ICH2-M and ICH3-M.
8 *
9 * Many thanks to Ducrot Bruno for finding and fixing the last
10 * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
11 * for extensive testing.
12 *
13 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
14 */
15
16
17/*********************************************************************
18 * SPEEDSTEP - DEFINITIONS *
19 *********************************************************************/
20
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/init.h>
24#include <linux/cpufreq.h>
25#include <linux/pci.h>
26#include <linux/slab.h>
27#include <linux/sched.h>
28
29#include "speedstep-lib.h"
30
31
32/* speedstep_chipset:
33 * It is necessary to know which chipset is used. As accesses to
34 * this device occur at various places in this module, we need a
35 * static struct pci_dev * pointing to that device.
36 */
37static struct pci_dev *speedstep_chipset_dev;
38
39
40/* speedstep_processor
41 */
42static unsigned int speedstep_processor = 0;
43
44static u32 pmbase;
45
46/*
47 * There are only two frequency states for each processor. Values
48 * are in kHz for the time being.
49 */
50static struct cpufreq_frequency_table speedstep_freqs[] = {
51 {SPEEDSTEP_HIGH, 0},
52 {SPEEDSTEP_LOW, 0},
53 {0, CPUFREQ_TABLE_END},
54};
55
56
57#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-ich", msg)
58
59
60/**
61 * speedstep_find_register - read the PMBASE address
62 *
63 * Returns: -ENODEV if no register could be found
64 */
65static int speedstep_find_register (void)
66{
67 if (!speedstep_chipset_dev)
68 return -ENODEV;
69
70 /* get PMBASE */
71 pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
72 if (!(pmbase & 0x01)) {
73 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
74 return -ENODEV;
75 }
76
77 pmbase &= 0xFFFFFFFE;
78 if (!pmbase) {
79 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
80 return -ENODEV;
81 }
82
83 dprintk("pmbase is 0x%x\n", pmbase);
84 return 0;
85}
86
87/**
88 * speedstep_set_state - set the SpeedStep state
89 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
90 *
91 * Tries to change the SpeedStep state.
92 */
93static void speedstep_set_state (unsigned int state)
94{
95 u8 pm2_blk;
96 u8 value;
97 unsigned long flags;
98
99 if (state > 0x1)
100 return;
101
102 /* Disable IRQs */
103 local_irq_save(flags);
104
105 /* read state */
106 value = inb(pmbase + 0x50);
107
108 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
109
110 /* write new state */
111 value &= 0xFE;
112 value |= state;
113
114 dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
115
116 /* Disable bus master arbitration */
117 pm2_blk = inb(pmbase + 0x20);
118 pm2_blk |= 0x01;
119 outb(pm2_blk, (pmbase + 0x20));
120
121 /* Actual transition */
122 outb(value, (pmbase + 0x50));
123
124 /* Restore bus master arbitration */
125 pm2_blk &= 0xfe;
126 outb(pm2_blk, (pmbase + 0x20));
127
128 /* check if transition was successful */
129 value = inb(pmbase + 0x50);
130
131 /* Enable IRQs */
132 local_irq_restore(flags);
133
134 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
135
136 if (state == (value & 0x1)) {
137 dprintk("change to %u MHz succeeded\n", (speedstep_get_processor_frequency(speedstep_processor) / 1000));
138 } else {
139 printk (KERN_ERR "cpufreq: change failed - I/O error\n");
140 }
141
142 return;
143}
144
145
146/**
147 * speedstep_activate - activate SpeedStep control in the chipset
148 *
149 * Tries to activate the SpeedStep status and control registers.
150 * Returns -EINVAL on an unsupported chipset, and zero on success.
151 */
152static int speedstep_activate (void)
153{
154 u16 value = 0;
155
156 if (!speedstep_chipset_dev)
157 return -EINVAL;
158
159 pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
160 if (!(value & 0x08)) {
161 value |= 0x08;
162 dprintk("activating SpeedStep (TM) registers\n");
163 pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
164 }
165
166 return 0;
167}
168
169
170/**
171 * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
172 *
173 * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
174 * the LPC bridge / PM module which contains all power-management
175 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
176 * chipset, or zero on failure.
177 */
178static unsigned int speedstep_detect_chipset (void)
179{
180 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
181 PCI_DEVICE_ID_INTEL_82801DB_12,
182 PCI_ANY_ID,
183 PCI_ANY_ID,
184 NULL);
185 if (speedstep_chipset_dev)
186 return 4; /* 4-M */
187
188 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
189 PCI_DEVICE_ID_INTEL_82801CA_12,
190 PCI_ANY_ID,
191 PCI_ANY_ID,
192 NULL);
193 if (speedstep_chipset_dev)
194 return 3; /* 3-M */
195
196
197 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
198 PCI_DEVICE_ID_INTEL_82801BA_10,
199 PCI_ANY_ID,
200 PCI_ANY_ID,
201 NULL);
202 if (speedstep_chipset_dev) {
203 /* speedstep.c causes lockups on Dell Inspirons 8000 and
204 * 8100 which use a pretty old revision of the 82815
205		 * host bridge. Abort on these systems.
206 */
207 static struct pci_dev *hostbridge;
208
209 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
210 PCI_DEVICE_ID_INTEL_82815_MC,
211 PCI_ANY_ID,
212 PCI_ANY_ID,
213 NULL);
214
215 if (!hostbridge)
216 return 2; /* 2-M */
217
218 if (hostbridge->revision < 5) {
219 dprintk("hostbridge does not support speedstep\n");
220 speedstep_chipset_dev = NULL;
221 pci_dev_put(hostbridge);
222 return 0;
223 }
224
225 pci_dev_put(hostbridge);
226 return 2; /* 2-M */
227 }
228
229 return 0;
230}
231
232static unsigned int _speedstep_get(cpumask_t cpus)
233{
234 unsigned int speed;
235 cpumask_t cpus_allowed;
236
237 cpus_allowed = current->cpus_allowed;
238 set_cpus_allowed(current, cpus);
239 speed = speedstep_get_processor_frequency(speedstep_processor);
240 set_cpus_allowed(current, cpus_allowed);
241 dprintk("detected %u kHz as current frequency\n", speed);
242 return speed;
243}
244
245static unsigned int speedstep_get(unsigned int cpu)
246{
247 return _speedstep_get(cpumask_of_cpu(cpu));
248}
249
250/**
251 * speedstep_target - set a new CPUFreq policy
252 * @policy: new policy
253 * @target_freq: the target frequency
254 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
255 *
256 * Sets a new CPUFreq policy.
257 */
258static int speedstep_target (struct cpufreq_policy *policy,
259 unsigned int target_freq,
260 unsigned int relation)
261{
262 unsigned int newstate = 0;
263 struct cpufreq_freqs freqs;
264 cpumask_t cpus_allowed;
265 int i;
266
267 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
268 return -EINVAL;
269
270 freqs.old = _speedstep_get(policy->cpus);
271 freqs.new = speedstep_freqs[newstate].frequency;
272 freqs.cpu = policy->cpu;
273
274 dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
275
276 /* no transition necessary */
277 if (freqs.old == freqs.new)
278 return 0;
279
280 cpus_allowed = current->cpus_allowed;
281
282 for_each_cpu_mask(i, policy->cpus) {
283 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 }
286
287 /* switch to physical CPU where state is to be changed */
288 set_cpus_allowed(current, policy->cpus);
289
290 speedstep_set_state(newstate);
291
292 /* allow to be run on all CPUs */
293 set_cpus_allowed(current, cpus_allowed);
294
295 for_each_cpu_mask(i, policy->cpus) {
296 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 }
299
300 return 0;
301}
302
303
304/**
305 * speedstep_verify - verifies a new CPUFreq policy
306 * @policy: new policy
307 *
308 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
309 * at least one border included.
310 */
311static int speedstep_verify (struct cpufreq_policy *policy)
312{
313 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
314}
315
316
317static int speedstep_cpu_init(struct cpufreq_policy *policy)
318{
319 int result = 0;
320 unsigned int speed;
321 cpumask_t cpus_allowed;
322
323 /* only run on CPU to be set, or on its sibling */
324#ifdef CONFIG_SMP
325 policy->cpus = cpu_sibling_map[policy->cpu];
326#endif
327
328 cpus_allowed = current->cpus_allowed;
329 set_cpus_allowed(current, policy->cpus);
330
331 /* detect low and high frequency and transition latency */
332 result = speedstep_get_freqs(speedstep_processor,
333 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
334 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
335 &policy->cpuinfo.transition_latency,
336 &speedstep_set_state);
337 set_cpus_allowed(current, cpus_allowed);
338 if (result)
339 return result;
340
341 /* get current speed setting */
342 speed = _speedstep_get(policy->cpus);
343 if (!speed)
344 return -EIO;
345
346 dprintk("currently at %s speed setting - %i MHz\n",
347 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high",
348 (speed / 1000));
349
350 /* cpuinfo and default policy values */
351 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
352 policy->cur = speed;
353
354 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
355 if (result)
356 return (result);
357
358 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
359
360 return 0;
361}
362
363
364static int speedstep_cpu_exit(struct cpufreq_policy *policy)
365{
366 cpufreq_frequency_table_put_attr(policy->cpu);
367 return 0;
368}
369
370static struct freq_attr* speedstep_attr[] = {
371 &cpufreq_freq_attr_scaling_available_freqs,
372 NULL,
373};
374
375
376static struct cpufreq_driver speedstep_driver = {
377 .name = "speedstep-ich",
378 .verify = speedstep_verify,
379 .target = speedstep_target,
380 .init = speedstep_cpu_init,
381 .exit = speedstep_cpu_exit,
382 .get = speedstep_get,
383 .owner = THIS_MODULE,
384 .attr = speedstep_attr,
385};
386
387
388/**
389 * speedstep_init - initializes the SpeedStep CPUFreq driver
390 *
391 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
392 * devices, -EINVAL on problems during initialization, and zero on
393 * success.
394 */
395static int __init speedstep_init(void)
396{
397 /* detect processor */
398 speedstep_processor = speedstep_detect_processor();
399 if (!speedstep_processor) {
400 dprintk("Intel(R) SpeedStep(TM) capable processor not found\n");
401 return -ENODEV;
402 }
403
404 /* detect chipset */
405 if (!speedstep_detect_chipset()) {
406 dprintk("Intel(R) SpeedStep(TM) for this chipset not (yet) available.\n");
407 return -ENODEV;
408 }
409
410 /* activate speedstep support */
411 if (speedstep_activate()) {
412 pci_dev_put(speedstep_chipset_dev);
413 return -EINVAL;
414 }
415
416 if (speedstep_find_register())
417 return -ENODEV;
418
419 return cpufreq_register_driver(&speedstep_driver);
420}
421
422
423/**
424 * speedstep_exit - unregisters SpeedStep support
425 *
426 * Unregisters SpeedStep support.
427 */
428static void __exit speedstep_exit(void)
429{
430 pci_dev_put(speedstep_chipset_dev);
431 cpufreq_unregister_driver(&speedstep_driver);
432}
433
434
435MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
436MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
437MODULE_LICENSE ("GPL");
438
439module_init(speedstep_init);
440module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
new file mode 100644
index 000000000000..b1acc8ce3167
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -0,0 +1,444 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/init.h>
15#include <linux/cpufreq.h>
16#include <linux/slab.h>
17
18#include <asm/msr.h>
19#include "speedstep-lib.h"
20
21#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-lib", msg)
22
23#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
24static int relaxed_check = 0;
25#else
26#define relaxed_check 0
27#endif
28
29/*********************************************************************
30 * GET PROCESSOR CORE SPEED IN KHZ *
31 *********************************************************************/
32
33static unsigned int pentium3_get_frequency (unsigned int processor)
34{
35 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
36 struct {
37 unsigned int ratio; /* Frequency Multiplier (x10) */
38 u8 bitmap; /* power on configuration bits
39 [27, 25:22] (in MSR 0x2a) */
40 } msr_decode_mult [] = {
41 { 30, 0x01 },
42 { 35, 0x05 },
43 { 40, 0x02 },
44 { 45, 0x06 },
45 { 50, 0x00 },
46 { 55, 0x04 },
47 { 60, 0x0b },
48 { 65, 0x0f },
49 { 70, 0x09 },
50 { 75, 0x0d },
51 { 80, 0x0a },
52 { 85, 0x26 },
53 { 90, 0x20 },
54 { 100, 0x2b },
55 { 0, 0xff } /* error or unknown value */
56 };
57
58 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
59 struct {
60 unsigned int value; /* Front Side Bus speed in MHz */
61 u8 bitmap; /* power on configuration bits [18: 19]
62 (in MSR 0x2a) */
63 } msr_decode_fsb [] = {
64 { 66, 0x0 },
65 { 100, 0x2 },
66 { 133, 0x1 },
67 { 0, 0xff}
68 };
69
70 u32 msr_lo, msr_tmp;
71 int i = 0, j = 0;
72
73 /* read MSR 0x2a - we only need the low 32 bits */
74 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
75 dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
76 msr_tmp = msr_lo;
77
78 /* decode the FSB */
79 msr_tmp &= 0x00c0000;
80 msr_tmp >>= 18;
81 while (msr_tmp != msr_decode_fsb[i].bitmap) {
82 if (msr_decode_fsb[i].bitmap == 0xff)
83 return 0;
84 i++;
85 }
86
87 /* decode the multiplier */
88 if (processor == SPEEDSTEP_PROCESSOR_PIII_C_EARLY) {
89 dprintk("workaround for early PIIIs\n");
90 msr_lo &= 0x03c00000;
91 } else
92 msr_lo &= 0x0bc00000;
93 msr_lo >>= 22;
94 while (msr_lo != msr_decode_mult[j].bitmap) {
95 if (msr_decode_mult[j].bitmap == 0xff)
96 return 0;
97 j++;
98 }
99
100 dprintk("speed is %u\n", (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
101
102 return (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100);
103}
104
105
106static unsigned int pentiumM_get_frequency(void)
107{
108 u32 msr_lo, msr_tmp;
109
110 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
111 dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
112
113 /* see table B-2 of 24547212.pdf */
114 if (msr_lo & 0x00040000) {
115 printk(KERN_DEBUG "speedstep-lib: PM - invalid FSB: 0x%x 0x%x\n", msr_lo, msr_tmp);
116 return 0;
117 }
118
119 msr_tmp = (msr_lo >> 22) & 0x1f;
120 dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * 100 * 1000));
121
122 return (msr_tmp * 100 * 1000);
123}
124
125static unsigned int pentium_core_get_frequency(void)
126{
127 u32 fsb = 0;
128 u32 msr_lo, msr_tmp;
129
130 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
131 /* see table B-2 of 25366920.pdf */
132 switch (msr_lo & 0x07) {
133 case 5:
134 fsb = 100000;
135 break;
136 case 1:
137 fsb = 133333;
138 break;
139 case 3:
140 fsb = 166667;
141 break;
142 default:
143		printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value\n");
144 }
145
146 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
147 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
148
149 msr_tmp = (msr_lo >> 22) & 0x1f;
150 dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * fsb));
151
152 return (msr_tmp * fsb);
153}
154
155
156static unsigned int pentium4_get_frequency(void)
157{
158 struct cpuinfo_x86 *c = &boot_cpu_data;
159 u32 msr_lo, msr_hi, mult;
160 unsigned int fsb = 0;
161
162 rdmsr(0x2c, msr_lo, msr_hi);
163
164 dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
165
166 /* decode the FSB: see IA-32 Intel (C) Architecture Software
167	 * Developer's Manual, Volume 3: System Programming Guide,
168 * revision #12 in Table B-1: MSRs in the Pentium 4 and
169 * Intel Xeon Processors, on page B-4 and B-5.
170 */
171 if (c->x86_model < 2)
172 fsb = 100 * 1000;
173 else {
174 u8 fsb_code = (msr_lo >> 16) & 0x7;
175 switch (fsb_code) {
176 case 0:
177 fsb = 100 * 1000;
178 break;
179 case 1:
180 fsb = 13333 * 10;
181 break;
182 case 2:
183 fsb = 200 * 1000;
184 break;
185 }
186 }
187
188 if (!fsb)
189 printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n");
190
191 /* Multiplier. */
192 if (c->x86_model < 2)
193 mult = msr_lo >> 27;
194 else
195 mult = msr_lo >> 24;
196
197 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult));
198
199 return (fsb * mult);
200}
201
202
203unsigned int speedstep_get_processor_frequency(unsigned int processor)
204{
205 switch (processor) {
206 case SPEEDSTEP_PROCESSOR_PCORE:
207 return pentium_core_get_frequency();
208 case SPEEDSTEP_PROCESSOR_PM:
209 return pentiumM_get_frequency();
210 case SPEEDSTEP_PROCESSOR_P4D:
211 case SPEEDSTEP_PROCESSOR_P4M:
212 return pentium4_get_frequency();
213 case SPEEDSTEP_PROCESSOR_PIII_T:
214 case SPEEDSTEP_PROCESSOR_PIII_C:
215 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY:
216 return pentium3_get_frequency(processor);
217 default:
218 return 0;
219 };
220 return 0;
221}
222EXPORT_SYMBOL_GPL(speedstep_get_processor_frequency);
223
224
225/*********************************************************************
226 * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
227 *********************************************************************/
228
229unsigned int speedstep_detect_processor (void)
230{
231 struct cpuinfo_x86 *c = cpu_data;
232 u32 ebx, msr_lo, msr_hi;
233
234 dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
235
236 if ((c->x86_vendor != X86_VENDOR_INTEL) ||
237 ((c->x86 != 6) && (c->x86 != 0xF)))
238 return 0;
239
240 if (c->x86 == 0xF) {
241 /* Intel Mobile Pentium 4-M
242 * or Intel Mobile Pentium 4 with 533 MHz FSB */
243 if (c->x86_model != 2)
244 return 0;
245
246 ebx = cpuid_ebx(0x00000001);
247 ebx &= 0x000000FF;
248
249 dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
250
251 switch (c->x86_mask) {
252 case 4:
253 /*
254 * B-stepping [M-P4-M]
255 * sample has ebx = 0x0f, production has 0x0e.
256 */
257 if ((ebx == 0x0e) || (ebx == 0x0f))
258 return SPEEDSTEP_PROCESSOR_P4M;
259 break;
260 case 7:
261 /*
262 * C-stepping [M-P4-M]
263 * needs to have ebx=0x0e, else it's a celeron:
264 * cf. 25130917.pdf / page 7, footnote 5 even
265 * though 25072120.pdf / page 7 doesn't say
266 * samples are only of B-stepping...
267 */
268 if (ebx == 0x0e)
269 return SPEEDSTEP_PROCESSOR_P4M;
270 break;
271 case 9:
272 /*
273 * D-stepping [M-P4-M or M-P4/533]
274 *
275 * this is totally strange: CPUID 0x0F29 is
276 * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
277 * The latter need to be sorted out as they don't
278 * support speedstep.
279 * Celerons with CPUID 0x0F29 may have either
280 * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
281 * specific.
282 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
283 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
284 * also, M-P4M HTs have ebx=0x8, too
285 * For now, they are distinguished by the model_id string
286 */
287 if ((ebx == 0x0e) || (strstr(c->x86_model_id,"Mobile Intel(R) Pentium(R) 4") != NULL))
288 return SPEEDSTEP_PROCESSOR_P4M;
289 break;
290 default:
291 break;
292 }
293 return 0;
294 }
295
296 switch (c->x86_model) {
297 case 0x0B: /* Intel PIII [Tualatin] */
298 /* cpuid_ebx(1) is 0x04 for desktop PIII, 0x06 for mobile PIII-M */
299 ebx = cpuid_ebx(0x00000001);
300 dprintk("ebx is %x\n", ebx);
301
302 ebx &= 0x000000FF;
303
304 if (ebx != 0x06)
305 return 0;
306
307 /* So far all PIII-M processors support SpeedStep. See
308 * Intel's 24540640.pdf of June 2003
309 */
310 return SPEEDSTEP_PROCESSOR_PIII_T;
311
312 case 0x08: /* Intel PIII [Coppermine] */
313
314 /* all mobile PIII Coppermines have FSB 100 MHz
315 * ==> sort out a few desktop PIIIs. */
316 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
317 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", msr_lo, msr_hi);
318 msr_lo &= 0x00c0000;
319 if (msr_lo != 0x0080000)
320 return 0;
321
322 /*
323 * If the processor is a mobile version,
324	 * platform ID has bit 50 set;
325 * it has SpeedStep technology if either
326 * bit 56 or 57 is set
327 */
328 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
329	dprintk("Coppermine: MSR_IA32_PLATFORM_ID is 0x%x, 0x%x\n", msr_lo, msr_hi);
330 if ((msr_hi & (1<<18)) && (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
331 if (c->x86_mask == 0x01) {
332 dprintk("early PIII version\n");
333 return SPEEDSTEP_PROCESSOR_PIII_C_EARLY;
334 } else
335 return SPEEDSTEP_PROCESSOR_PIII_C;
336 }
337
338 default:
339 return 0;
340 }
341}
342EXPORT_SYMBOL_GPL(speedstep_detect_processor);
343
344
345/*********************************************************************
346 * DETECT SPEEDSTEP SPEEDS *
347 *********************************************************************/
348
349unsigned int speedstep_get_freqs(unsigned int processor,
350 unsigned int *low_speed,
351 unsigned int *high_speed,
352 unsigned int *transition_latency,
353 void (*set_state) (unsigned int state))
354{
355 unsigned int prev_speed;
356 unsigned int ret = 0;
357 unsigned long flags;
358 struct timeval tv1, tv2;
359
360 if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
361 return -EINVAL;
362
363 dprintk("trying to determine both speeds\n");
364
365 /* get current speed */
366 prev_speed = speedstep_get_processor_frequency(processor);
367 if (!prev_speed)
368 return -EIO;
369
370 dprintk("previous speed is %u\n", prev_speed);
371
372 local_irq_save(flags);
373
374 /* switch to low state */
375 set_state(SPEEDSTEP_LOW);
376 *low_speed = speedstep_get_processor_frequency(processor);
377 if (!*low_speed) {
378 ret = -EIO;
379 goto out;
380 }
381
382 dprintk("low speed is %u\n", *low_speed);
383
384 /* start latency measurement */
385 if (transition_latency)
386 do_gettimeofday(&tv1);
387
388 /* switch to high state */
389 set_state(SPEEDSTEP_HIGH);
390
391 /* end latency measurement */
392 if (transition_latency)
393 do_gettimeofday(&tv2);
394
395 *high_speed = speedstep_get_processor_frequency(processor);
396 if (!*high_speed) {
397 ret = -EIO;
398 goto out;
399 }
400
401 dprintk("high speed is %u\n", *high_speed);
402
403 if (*low_speed == *high_speed) {
404 ret = -ENODEV;
405 goto out;
406 }
407
408 /* switch to previous state, if necessary */
409 if (*high_speed != prev_speed)
410 set_state(SPEEDSTEP_LOW);
411
412 if (transition_latency) {
413 *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
414 tv2.tv_usec - tv1.tv_usec;
415 dprintk("transition latency is %u uSec\n", *transition_latency);
416
417 /* convert uSec to nSec and add 20% for safety reasons */
418 *transition_latency *= 1200;
419
420 /* check if the latency measurement is too high or too low
421 * and set it to a safe value (500uSec) in that case
422 */
423 if (*transition_latency > 10000000 || *transition_latency < 50000) {
424 			printk (KERN_WARNING "speedstep: measured frequency transition latency seems out of "
425 				"range (%u nSec), falling back to a safe value of %u nSec.\n",
426 *transition_latency, 500000);
427 *transition_latency = 500000;
428 }
429 }
430
431out:
432 local_irq_restore(flags);
433 return (ret);
434}
435EXPORT_SYMBOL_GPL(speedstep_get_freqs);
436
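/*
 * Worked example for the latency handling above (illustrative values):
 * if the two do_gettimeofday() samples differ by 500 usec, the raw
 * latency is 500; "*transition_latency *= 1200" turns that into
 * 600000 nsec (usec -> nsec plus a 20% safety margin), which lies
 * inside the accepted [50000, 10000000] nsec window and is kept as-is.
 */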
437#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
438module_param(relaxed_check, int, 0444);
439MODULE_PARM_DESC(relaxed_check, "Don't do all checks for speedstep capability.");
440#endif
441
442MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>");
443MODULE_DESCRIPTION ("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
444MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
new file mode 100644
index 000000000000..b11bcc608cac
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -0,0 +1,49 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11
12
13/* processors */
14
15#define SPEEDSTEP_PROCESSOR_PIII_C_EARLY 0x00000001 /* Coppermine core */
16#define SPEEDSTEP_PROCESSOR_PIII_C 0x00000002 /* Coppermine core */
17#define SPEEDSTEP_PROCESSOR_PIII_T 0x00000003 /* Tualatin core */
18#define SPEEDSTEP_PROCESSOR_P4M 0x00000004 /* P4-M */
19
20/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_processor_frequency() call. */
23#define SPEEDSTEP_PROCESSOR_PM 0xFFFFFF03 /* Pentium M */
24#define SPEEDSTEP_PROCESSOR_P4D 0xFFFFFF04 /* desktop P4 */
25#define SPEEDSTEP_PROCESSOR_PCORE 0xFFFFFF05 /* Core */
26
27/* speedstep states -- only two of them */
28
29#define SPEEDSTEP_HIGH 0x00000000
30#define SPEEDSTEP_LOW 0x00000001
31
32
33/* detect a speedstep-capable processor */
34extern unsigned int speedstep_detect_processor (void);
35
36/* detect the current speed (in kHz) of the processor */
37extern unsigned int speedstep_get_processor_frequency(unsigned int processor);
38
39
40/* detect the low and high speeds of the processor. The callback
41 * set_state is invoked with either SPEEDSTEP_HIGH or SPEEDSTEP_LOW
42 * as its argument; it must switch the frequency without initiating
43 * any cpufreq_notify_transition calls of its own.
44 */
45extern unsigned int speedstep_get_freqs(unsigned int processor,
46 unsigned int *low_speed,
47 unsigned int *high_speed,
48 unsigned int *transition_latency,
49 void (*set_state) (unsigned int state));
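/*
 * Usage sketch (illustrative only; it mirrors the way speedstep-smi.c
 * calls this helper, with a hypothetical driver callback my_set_state
 * that switches the CPU to SPEEDSTEP_LOW or SPEEDSTEP_HIGH):
 *
 *	unsigned int low, high;
 *	unsigned int ret = speedstep_get_freqs(speedstep_detect_processor(),
 *					       &low, &high, NULL,
 *					       &my_set_state);
 *	if (!ret)
 *		printk(KERN_INFO "low %u kHz, high %u kHz\n", low, high);
 */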
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
new file mode 100644
index 000000000000..e1c509aa3054
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -0,0 +1,424 @@
1/*
2 * Intel SpeedStep SMI driver.
3 *
4 * (C) 2003 Hiroshi Miura <miura@da-cha.org>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 */
9
10
11/*********************************************************************
12 * SPEEDSTEP - DEFINITIONS *
13 *********************************************************************/
14
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/moduleparam.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/slab.h>
21#include <linux/delay.h>
22#include <asm/ist.h>
23#include <asm/io.h>
24
25#include "speedstep-lib.h"
26
27/* speedstep system management interface port/command.
28 *
29 * These parameters are obtained from the IST-SMI BIOS call.
30 * If the user supplies them as module parameters, those values
31 * are used instead.
32 */
33static int smi_port = 0;
34static int smi_cmd = 0;
35static unsigned int smi_sig = 0;
36
37/* info about the processor */
38static unsigned int speedstep_processor = 0;
39
40/*
41 * There are only two frequency states for each processor. Values
42 * are in kHz for the time being.
43 */
44static struct cpufreq_frequency_table speedstep_freqs[] = {
45 {SPEEDSTEP_HIGH, 0},
46 {SPEEDSTEP_LOW, 0},
47 {0, CPUFREQ_TABLE_END},
48};
49
50#define GET_SPEEDSTEP_OWNER 0
51#define GET_SPEEDSTEP_STATE 1
52#define SET_SPEEDSTEP_STATE 2
53#define GET_SPEEDSTEP_FREQS 4
54
55/* how often shall the SMI call be tried if it failed, e.g. because
56 * of DMA activity going on? */
57#define SMI_TRIES 5
58
59#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-smi", msg)
60
61/**
62 * speedstep_smi_ownership
63 */
64static int speedstep_smi_ownership (void)
65{
66 u32 command, result, magic;
67 u32 function = GET_SPEEDSTEP_OWNER;
68 unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
69
70 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
71 magic = virt_to_phys(magic_data);
72
73 dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port);
74
75 __asm__ __volatile__(
76 "out %%al, (%%dx)\n"
77 : "=D" (result)
78 : "a" (command), "b" (function), "c" (0), "d" (smi_port),
79 "D" (0), "S" (magic)
80 : "memory"
81 );
82
83 dprintk("result is %x\n", result);
84
85 return result;
86}
87
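/*
 * Example of the command encoding above (hypothetical values): with
 * smi_sig = 0x47534943 and smi_cmd = 0x82, command becomes
 * (0x47534943 & 0xffffff00) | (0x82 & 0xff) = 0x47534982, i.e. the
 * BIOS signature with its low byte replaced by the SMI command byte.
 */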
88/**
89 * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
90 * @low: the low frequency value is placed here
91 * @high: the high frequency value is placed here
92 *
93 * Only available on later SpeedStep-enabled systems; returns bogus results or
94 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
95 * shows that the latter occurs if !(ist_info.event & 0xFFFF).
96 */
97static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
98{
99 u32 command, result = 0, edi, high_mhz, low_mhz;
100 u32 state=0;
101 u32 function = GET_SPEEDSTEP_FREQS;
102
103 if (!(ist_info.event & 0xFFFF)) {
104 dprintk("bug #1422 -- can't read freqs from BIOS\n");
105 return -ENODEV;
106 }
107
108 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
109
110 dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port);
111
112 __asm__ __volatile__("movl $0, %%edi\n"
113 "out %%al, (%%dx)\n"
114 : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi)
115 : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
116 );
117
118 dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz);
119
120 /* abort if results are obviously incorrect... */
121 if ((high_mhz + low_mhz) < 600)
122 return -EINVAL;
123
124 *high = high_mhz * 1000;
125 *low = low_mhz * 1000;
126
127 return result;
128}
129
130/**
131 * speedstep_get_state - read the current SpeedStep state
132 *
133 * Returns SPEEDSTEP_LOW or SPEEDSTEP_HIGH.
134 */
135static int speedstep_get_state (void)
136{
137 u32 function=GET_SPEEDSTEP_STATE;
138 u32 result, state, edi, command;
139
140 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
141
142 dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port);
143
144 __asm__ __volatile__("movl $0, %%edi\n"
145 "out %%al, (%%dx)\n"
146 : "=a" (result), "=b" (state), "=D" (edi)
147 : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0)
148 );
149
150 dprintk("state is %x, result is %x\n", state, result);
151
152 return (state & 1);
153}
154
155
156/**
157 * speedstep_set_state - set the SpeedStep state
158 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
159 *
160 */
161static void speedstep_set_state (unsigned int state)
162{
163 unsigned int result = 0, command, new_state;
164 unsigned long flags;
165 unsigned int function=SET_SPEEDSTEP_STATE;
166 unsigned int retry = 0;
167
168 if (state > 0x1)
169 return;
170
171 /* Disable IRQs */
172 local_irq_save(flags);
173
174 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
175
176 dprintk("trying to set frequency to state %u with command %x at port %x\n", state, command, smi_port);
177
178 do {
179 if (retry) {
180 dprintk("retry %u, previous result %u, waiting...\n", retry, result);
181 mdelay(retry * 50);
182 }
183 retry++;
184 __asm__ __volatile__(
185 "movl $0, %%edi\n"
186 "out %%al, (%%dx)\n"
187 : "=b" (new_state), "=D" (result)
188 : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
189 );
190 } while ((new_state != state) && (retry <= SMI_TRIES));
191
192 /* enable IRQs */
193 local_irq_restore(flags);
194
195 if (new_state == state) {
196 dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result);
197 } else {
198 printk(KERN_ERR "cpufreq: change failed with new_state %u and result %u\n", new_state, result);
199 }
200
201 return;
202}
203
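/*
 * Timing note for the retry loop above (derived from the code itself,
 * not from any spec): with SMI_TRIES == 5 the SMI call is attempted at
 * most six times, with back-off delays of 50 + 100 + 150 + 200 + 250 ms
 * between attempts, i.e. up to roughly 750 ms of busy-waiting in the
 * worst case, all with interrupts disabled.
 */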
204
205/**
206 * speedstep_target - set a new CPUFreq policy
207 * @policy: new policy
208 * @target_freq: new freq
209 * @relation:
210 *
211 * Sets a new CPUFreq policy/freq.
212 */
213static int speedstep_target (struct cpufreq_policy *policy,
214 unsigned int target_freq, unsigned int relation)
215{
216 unsigned int newstate = 0;
217 struct cpufreq_freqs freqs;
218
219 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
220 return -EINVAL;
221
222 freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
223 freqs.new = speedstep_freqs[newstate].frequency;
224 freqs.cpu = 0; /* speedstep.c is UP only driver */
225
226 if (freqs.old == freqs.new)
227 return 0;
228
229 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
230 speedstep_set_state(newstate);
231 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
232
233 return 0;
234}
235
236
237/**
238 * speedstep_verify - verifies a new CPUFreq policy
239 * @policy: new policy
240 *
241 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
242 * at least one border included.
243 */
244static int speedstep_verify (struct cpufreq_policy *policy)
245{
246 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
247}
248
249
250static int speedstep_cpu_init(struct cpufreq_policy *policy)
251{
252 int result;
253 unsigned int speed,state;
254
255 /* capability check */
256 if (policy->cpu != 0)
257 return -ENODEV;
258
259 result = speedstep_smi_ownership();
260 if (result) {
261 		dprintk("failed to acquire ownership of the SMI interface.\n");
262 return -EINVAL;
263 }
264
265 /* detect low and high frequency */
266 result = speedstep_smi_get_freqs(&speedstep_freqs[SPEEDSTEP_LOW].frequency,
267 &speedstep_freqs[SPEEDSTEP_HIGH].frequency);
268 if (result) {
269 		/* fall back to the speedstep-lib.c detection mechanism: try both states out */
270 dprintk("could not detect low and high frequencies by SMI call.\n");
271 result = speedstep_get_freqs(speedstep_processor,
272 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
273 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
274 NULL,
275 &speedstep_set_state);
276
277 if (result) {
278 dprintk("could not detect two different speeds -- aborting.\n");
279 return result;
280 } else
281 dprintk("workaround worked.\n");
282 }
283
284 /* get current speed setting */
285 state = speedstep_get_state();
286 speed = speedstep_freqs[state].frequency;
287
288 dprintk("currently at %s speed setting - %i MHz\n",
289 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high",
290 (speed / 1000));
291
292 /* cpuinfo and default policy values */
293 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
294 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
295 policy->cur = speed;
296
297 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
298 if (result)
299 return (result);
300
301 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
302
303 return 0;
304}
305
306static int speedstep_cpu_exit(struct cpufreq_policy *policy)
307{
308 cpufreq_frequency_table_put_attr(policy->cpu);
309 return 0;
310}
311
312static unsigned int speedstep_get(unsigned int cpu)
313{
314 if (cpu)
315 return -ENODEV;
316 return speedstep_get_processor_frequency(speedstep_processor);
317}
318
319
320static int speedstep_resume(struct cpufreq_policy *policy)
321{
322 int result = speedstep_smi_ownership();
323
324 if (result)
325 		dprintk("failed to re-acquire ownership of the SMI interface.\n");
326
327 return result;
328}
329
330static struct freq_attr* speedstep_attr[] = {
331 &cpufreq_freq_attr_scaling_available_freqs,
332 NULL,
333};
334
335static struct cpufreq_driver speedstep_driver = {
336 .name = "speedstep-smi",
337 .verify = speedstep_verify,
338 .target = speedstep_target,
339 .init = speedstep_cpu_init,
340 .exit = speedstep_cpu_exit,
341 .get = speedstep_get,
342 .resume = speedstep_resume,
343 .owner = THIS_MODULE,
344 .attr = speedstep_attr,
345};
346
347/**
348 * speedstep_init - initializes the SpeedStep CPUFreq driver
349 *
350 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
351 * BIOS, -EINVAL on problems during initialization, and zero on
352 * success.
353 */
354static int __init speedstep_init(void)
355{
356 speedstep_processor = speedstep_detect_processor();
357
358 switch (speedstep_processor) {
359 case SPEEDSTEP_PROCESSOR_PIII_T:
360 case SPEEDSTEP_PROCESSOR_PIII_C:
361 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY:
362 break;
363 default:
364 speedstep_processor = 0;
365 }
366
367 if (!speedstep_processor) {
368 dprintk ("No supported Intel CPU detected.\n");
369 return -ENODEV;
370 }
371
372 dprintk("signature:0x%.8lx, command:0x%.8lx, event:0x%.8lx, perf_level:0x%.8lx.\n",
373 ist_info.signature, ist_info.command, ist_info.event, ist_info.perf_level);
374
375 	/* Error out if there is neither an IST-SMI BIOS nor user-supplied
376 	   port/command parameters; sig = 'ISGE', aka 'Intel Speedstep Gate E' */
377 if ((ist_info.signature != 0x47534943) && (
378 (smi_port == 0) || (smi_cmd == 0)))
379 return -ENODEV;
380
381 if (smi_sig == 1)
382 smi_sig = 0x47534943;
383 else
384 smi_sig = ist_info.signature;
385
386 	/* set up smi_port from the module parameter or from the BIOS */
387 if ((smi_port > 0xff) || (smi_port < 0))
388 return -EINVAL;
389 else if (smi_port == 0)
390 smi_port = ist_info.command & 0xff;
391
392 if ((smi_cmd > 0xff) || (smi_cmd < 0))
393 return -EINVAL;
394 else if (smi_cmd == 0)
395 smi_cmd = (ist_info.command >> 16) & 0xff;
396
397 return cpufreq_register_driver(&speedstep_driver);
398}
399
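/*
 * Worked example of the ist_info.command decoding above (hypothetical
 * BIOS value): with ist_info.command == 0x008200b2 and no module
 * parameters given, smi_port = 0x008200b2 & 0xff = 0xb2 and
 * smi_cmd = (0x008200b2 >> 16) & 0xff = 0x82, matching the Intel
 * defaults mentioned in the module parameter descriptions below.
 */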
400
401/**
402 * speedstep_exit - unregisters SpeedStep support
403 *
404 * Unregisters SpeedStep support.
405 */
406static void __exit speedstep_exit(void)
407{
408 cpufreq_unregister_driver(&speedstep_driver);
409}
410
411module_param(smi_port, int, 0444);
412module_param(smi_cmd, int, 0444);
413module_param(smi_sig, uint, 0444);
414
415MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value -- Intel's default setting is 0xb2");
416MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value -- Intel's default setting is 0x82");
417MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the SMI interface.");
418
419MODULE_AUTHOR ("Hiroshi Miura");
420MODULE_DESCRIPTION ("Speedstep driver for IST applet SMI interface.");
421MODULE_LICENSE ("GPL");
422
423module_init(speedstep_init);
424module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
new file mode 100644
index 000000000000..122d2d75aa9f
--- /dev/null
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -0,0 +1,463 @@
1#include <linux/init.h>
2#include <linux/bitops.h>
3#include <linux/delay.h>
4#include <linux/pci.h>
5#include <asm/dma.h>
6#include <asm/io.h>
7#include <asm/processor-cyrix.h>
8#include <asm/timer.h>
9#include <asm/pci-direct.h>
10#include <asm/tsc.h>
11
12#include "cpu.h"
13
14/*
15 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info about the CPU
16 */
17static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
18{
19 unsigned char ccr2, ccr3;
20 unsigned long flags;
21
22 /* we test for DEVID by checking whether CCR3 is writable */
23 local_irq_save(flags);
24 ccr3 = getCx86(CX86_CCR3);
25 setCx86(CX86_CCR3, ccr3 ^ 0x80);
26 getCx86(0xc0); /* dummy to change bus */
27
28 if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. */
29 ccr2 = getCx86(CX86_CCR2);
30 setCx86(CX86_CCR2, ccr2 ^ 0x04);
31 getCx86(0xc0); /* dummy */
32
33 if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */
34 *dir0 = 0xfd;
35 else { /* Cx486S A step */
36 setCx86(CX86_CCR2, ccr2);
37 *dir0 = 0xfe;
38 }
39 }
40 else {
41 setCx86(CX86_CCR3, ccr3); /* restore CCR3 */
42
43 /* read DIR0 and DIR1 CPU registers */
44 *dir0 = getCx86(CX86_DIR0);
45 *dir1 = getCx86(CX86_DIR1);
46 }
47 local_irq_restore(flags);
48}
49
50/*
51 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
52 * order to identify the Cyrix CPU model after we're out of setup.c
53 *
54 * Actually since bugs.h doesn't even reference this perhaps someone should
55 * fix the documentation ???
56 */
57static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
58
59static char Cx86_model[][9] __cpuinitdata = {
60 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
61 "M II ", "Unknown"
62};
63static char Cx486_name[][5] __cpuinitdata = {
64 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
65 "SRx2", "DRx2"
66};
67static char Cx486S_name[][4] __cpuinitdata = {
68 "S", "S2", "Se", "S2e"
69};
70static char Cx486D_name[][4] __cpuinitdata = {
71 "DX", "DX2", "?", "?", "?", "DX4"
72};
73static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
74static char cyrix_model_mult1[] __cpuinitdata = "12??43";
75static char cyrix_model_mult2[] __cpuinitdata = "12233445";
76
77/*
78 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
79 * BIOSes for compatibility with DOS games. This makes the udelay loop
80 * work correctly, and improves performance.
81 *
82 * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
83 */
84
85extern void calibrate_delay(void) __init;
86
87static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
88{
89 unsigned long flags;
90
91 if (Cx86_dir0_msb == 3) {
92 unsigned char ccr3, ccr5;
93
94 local_irq_save(flags);
95 ccr3 = getCx86(CX86_CCR3);
96 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
97 ccr5 = getCx86(CX86_CCR5);
98 if (ccr5 & 2)
99 setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */
100 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
101 local_irq_restore(flags);
102
103 if (ccr5 & 2) { /* possible wrong calibration done */
104 printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
105 calibrate_delay();
106 c->loops_per_jiffy = loops_per_jiffy;
107 }
108 }
109}
110
111
112static void __cpuinit set_cx86_reorder(void)
113{
114 u8 ccr3;
115
116 printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
117 ccr3 = getCx86(CX86_CCR3);
118 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN  */
119
120 /* Load/Store Serialize to mem access disable (=reorder it)  */
121 setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
122 /* set load/store serialize from 1GB to 4GB */
123 ccr3 |= 0xe0;
124 setCx86(CX86_CCR3, ccr3);
125}
126
127static void __cpuinit set_cx86_memwb(void)
128{
129 u32 cr0;
130
131 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
132
133 /* CCR2 bit 2: unlock NW bit */
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
135 /* set 'Not Write-through' */
136 cr0 = 0x20000000;
137 write_cr0(read_cr0() | cr0);
138 /* CCR2 bit 2: lock NW bit and set WT1 */
139 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
140}
141
142static void __cpuinit set_cx86_inc(void)
143{
144 unsigned char ccr3;
145
146 printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n");
147
148 ccr3 = getCx86(CX86_CCR3);
149 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN  */
150 /* PCR1 -- Performance Control */
151 /* Incrementor on, whatever that is */
152 setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
153 /* PCR0 -- Performance Control */
154 /* Incrementor Margin 10 */
155 setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
156 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
157}
158
159/*
160 * Configure later MediaGX and/or Geode processor.
161 */
162
163static void __cpuinit geode_configure(void)
164{
165 unsigned long flags;
166 u8 ccr3;
167 local_irq_save(flags);
168
169 /* Suspend on halt power saving and enable #SUSP pin */
170 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
171
172 ccr3 = getCx86(CX86_CCR3);
173 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
174
175
176 /* FPU fast, DTE cache, Mem bypass */
177 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
178 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
179
180 set_cx86_memwb();
181 set_cx86_reorder();
182 set_cx86_inc();
183
184 local_irq_restore(flags);
185}
186
187
188static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
189{
190 unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
191 char *buf = c->x86_model_id;
192 const char *p = NULL;
193
194 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
195 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
196 clear_bit(0*32+31, c->x86_capability);
197
198 /* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */
199 if ( test_bit(1*32+24, c->x86_capability) ) {
200 clear_bit(1*32+24, c->x86_capability);
201 set_bit(X86_FEATURE_CXMMX, c->x86_capability);
202 }
203
204 do_cyrix_devid(&dir0, &dir1);
205
206 check_cx686_slop(c);
207
208 Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */
209 dir0_lsn = dir0 & 0xf; /* model or clock multiplier */
210
211 /* common case step number/rev -- exceptions handled below */
212 c->x86_model = (dir1 >> 4) + 1;
213 c->x86_mask = dir1 & 0xf;
214
215 /* Now cook; the original recipe is by Channing Corn, from Cyrix.
216 * We do the same thing for each generation: we work out
217 * the model, multiplier and stepping. Black magic included,
218 * to make the silicon step/rev numbers match the printed ones.
219 */
220
221 switch (dir0_msn) {
222 unsigned char tmp;
223
224 case 0: /* Cx486SLC/DLC/SRx/DRx */
225 p = Cx486_name[dir0_lsn & 7];
226 break;
227
228 case 1: /* Cx486S/DX/DX2/DX4 */
229 p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5]
230 : Cx486S_name[dir0_lsn & 3];
231 break;
232
233 case 2: /* 5x86 */
234 Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
235 p = Cx86_cb+2;
236 break;
237
238 case 3: /* 6x86/6x86L */
239 Cx86_cb[1] = ' ';
240 Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
241 if (dir1 > 0x21) { /* 686L */
242 Cx86_cb[0] = 'L';
243 p = Cx86_cb;
244 (c->x86_model)++;
245 } else /* 686 */
246 p = Cx86_cb+1;
247 /* Emulate MTRRs using Cyrix's ARRs. */
248 set_bit(X86_FEATURE_CYRIX_ARR, c->x86_capability);
249 /* 6x86's contain this bug */
250 c->coma_bug = 1;
251 break;
252
253 case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
254#ifdef CONFIG_PCI
255 {
256 u32 vendor, device;
257 /* It isn't really a PCI quirk directly, but the cure is the
258 same. The MediaGX has deep magic SMM stuff that handles the
259 		   SB emulation. It throws away the FIFO on disable_dma() which
260 is wrong and ruins the audio.
261
262 Bug2: VSA1 has a wrap bug so that using maximum sized DMA
263 causes bad things. According to NatSemi VSA2 has another
264 bug to do with 'hlt'. I've not seen any boards using VSA2
265 and X doesn't seem to support it either so who cares 8).
266 VSA1 we work around however.
267 */
268
269 printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
270 isa_dma_bridge_buggy = 2;
271
272 /* We do this before the PCI layer is running. However we
273 are safe here as we know the bridge must be a Cyrix
274 companion and must be present */
275 vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
276 device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
277
278 /*
279 * The 5510/5520 companion chips have a funky PIT.
280 */
281 if (vendor == PCI_VENDOR_ID_CYRIX &&
282 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
283 mark_tsc_unstable("cyrix 5510/5520 detected");
284 }
285#endif
286 		c->x86_cache_size = 16;	/* Yep, 16K integrated cache, that's it */
287
288 		/* GXm supports extended cpuid levels, a la AMD */
289 if (c->cpuid_level == 2) {
290 /* Enable cxMMX extensions (GX1 Datasheet 54) */
291 setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
292
293 /*
294 * GXm : 0x30 ... 0x5f GXm datasheet 51
295 * GXlv: 0x6x GXlv datasheet 54
296 * ? : 0x7x
297 * GX1 : 0x8x GX1 datasheet 56
298 */
299 			if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
300 geode_configure();
301 get_model_name(c); /* get CPU marketing name */
302 return;
303 }
304 else { /* MediaGX */
305 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
306 p = Cx86_cb+2;
307 c->x86_model = (dir1 & 0x20) ? 1 : 2;
308 }
309 break;
310
311 case 5: /* 6x86MX/M II */
312 if (dir1 > 7)
313 {
314 dir0_msn++; /* M II */
315 /* Enable MMX extensions (App note 108) */
316 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
317 }
318 else
319 {
320 c->coma_bug = 1; /* 6x86MX, it has the bug. */
321 }
322 tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
323 Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
324 p = Cx86_cb+tmp;
325 if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20))
326 (c->x86_model)++;
327 /* Emulate MTRRs using Cyrix's ARRs. */
328 set_bit(X86_FEATURE_CYRIX_ARR, c->x86_capability);
329 break;
330
331 case 0xf: /* Cyrix 486 without DEVID registers */
332 switch (dir0_lsn) {
333 case 0xd: /* either a 486SLC or DLC w/o DEVID */
334 dir0_msn = 0;
335 p = Cx486_name[(c->hard_math) ? 1 : 0];
336 break;
337
338 case 0xe: /* a 486S A step */
339 dir0_msn = 0;
340 p = Cx486S_name[0];
341 break;
342 }
343 break;
344
345 default: /* unknown (shouldn't happen, we know everyone ;-) */
346 dir0_msn = 7;
347 break;
348 }
349 strcpy(buf, Cx86_model[dir0_msn & 7]);
350 if (p) strcat(buf, p);
351 return;
352}
353
354/*
355 * Handle National Semiconductor branded processors
356 */
357static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
358{
359 /* There may be GX1 processors in the wild that are branded
360 * NSC and not Cyrix.
361 *
362 * This function only handles the GX processor, and kicks
363 * everything else to the Cyrix init function above - that should
364 * cover any processors that might have been branded differently
365 * after NSC acquired Cyrix.
366 *
367 * If this breaks your GX1 horribly, please e-mail
368 * info-linux@ldcmail.amd.com to tell us.
369 */
370
371 	/* Handle the GX (formerly known as the GX2) */
372
373 if (c->x86 == 5 && c->x86_model == 5)
374 display_cacheinfo(c);
375 else
376 init_cyrix(c);
377}
378
379/*
380 * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
381 * by the fact that they preserve the flags across the division of 5/2.
382 * PII and PPro exhibit this behavior too, but they have cpuid available.
383 */
384
385/*
386 * Perform the Cyrix 5/2 test. A Cyrix won't change
387 * the flags, while other 486 chips will.
388 */
389static inline int test_cyrix_52div(void)
390{
391 unsigned int test;
392
393 __asm__ __volatile__(
394 "sahf\n\t" /* clear flags (%eax = 0x0005) */
395 "div %b2\n\t" /* divide 5 by 2 */
396 "lahf" /* store flags into %ah */
397 : "=a" (test)
398 : "0" (5), "q" (2)
399 : "cc");
400
401 /* AH is 0x02 on Cyrix after the divide.. */
402 return (unsigned char) (test >> 8) == 0x02;
403}
404
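/*
 * Why the check above compares against 0x02 (illustrative reasoning):
 * EAX is loaded with 5, so AH is 0 and "sahf" writes an all-clear flag
 * image; bit 1 of EFLAGS always reads back as 1, so "lahf" yields
 * AH == 0x02 exactly when the "div" left the arithmetic flags
 * untouched -- which Cyrix parts do, while other 486s modify them.
 */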
405static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
406{
407 /* Detect Cyrix with disabled CPUID */
408 if ( c->x86 == 4 && test_cyrix_52div() ) {
409 unsigned char dir0, dir1;
410
411 strcpy(c->x86_vendor_id, "CyrixInstead");
412 c->x86_vendor = X86_VENDOR_CYRIX;
413
414 /* Actually enable cpuid on the older cyrix */
415
416 /* Retrieve CPU revisions */
417
418 do_cyrix_devid(&dir0, &dir1);
419
420 dir0>>=4;
421
422 /* Check it is an affected model */
423
424 if (dir0 == 5 || dir0 == 3)
425 {
426 unsigned char ccr3;
427 unsigned long flags;
428 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
429 local_irq_save(flags);
430 ccr3 = getCx86(CX86_CCR3);
431 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
432 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */
433 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
434 local_irq_restore(flags);
435 }
436 }
437}
438
439static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
440 .c_vendor = "Cyrix",
441 .c_ident = { "CyrixInstead" },
442 .c_init = init_cyrix,
443 .c_identify = cyrix_identify,
444};
445
446int __init cyrix_init_cpu(void)
447{
448 cpu_devs[X86_VENDOR_CYRIX] = &cyrix_cpu_dev;
449 return 0;
450}
451
452static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
453 .c_vendor = "NSC",
454 .c_ident = { "Geode by NSC" },
455 .c_init = init_nsc,
456};
457
458int __init nsc_init_cpu(void)
459{
460 cpu_devs[X86_VENDOR_NSC] = &nsc_cpu_dev;
461 return 0;
462}
463
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
new file mode 100644
index 000000000000..dc4e08147b1f
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel.c
@@ -0,0 +1,333 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3
4#include <linux/string.h>
5#include <linux/bitops.h>
6#include <linux/smp.h>
7#include <linux/thread_info.h>
8#include <linux/module.h>
9
10#include <asm/processor.h>
11#include <asm/msr.h>
12#include <asm/uaccess.h>
13
14#include "cpu.h"
15
16#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h>
18#include <asm/apic.h>
19#include <mach_apic.h>
20#endif
21
22extern int trap_init_f00f_bug(void);
23
24#ifdef CONFIG_X86_INTEL_USERCOPY
25/*
26 * Alignment at which movsl is preferred for bulk memory copies.
27 */
28struct movsl_mask movsl_mask __read_mostly;
29#endif
30
31void __cpuinit early_intel_workaround(struct cpuinfo_x86 *c)
32{
33 if (c->x86_vendor != X86_VENDOR_INTEL)
34 return;
35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
37 c->x86_cache_alignment = 128;
38}
39
40/*
41 * Early probe support logic for ppro memory erratum #50
42 *
43 * This is called before we do cpu ident work
44 */
45
46int __cpuinit ppro_with_ram_bug(void)
47{
48 /* Uses data from early_cpu_detect now */
49 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
50 boot_cpu_data.x86 == 6 &&
51 boot_cpu_data.x86_model == 1 &&
52 boot_cpu_data.x86_mask < 8) {
53 printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
54 return 1;
55 }
56 return 0;
57}
58
59
60/*
61 * P4 Xeon errata 037 workaround.
62 * Hardware prefetcher may cause stale data to be loaded into the cache.
63 */
64static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
65{
66 unsigned long lo, hi;
67
68 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
69 rdmsr (MSR_IA32_MISC_ENABLE, lo, hi);
70 if ((lo & (1<<9)) == 0) {
71 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
72 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
73 lo |= (1<<9); /* Disable hw prefetching */
74 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
75 }
76 }
77}
78
79
80/*
81 * find out the number of processor cores on the die
82 */
83static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
84{
85 unsigned int eax, ebx, ecx, edx;
86
87 if (c->cpuid_level < 4)
88 return 1;
89
90 /* Intel has a non-standard dependency on %ecx for this CPUID level. */
91 cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
92 if (eax & 0x1f)
93 return ((eax >> 26) + 1);
94 else
95 return 1;
96}
97
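/*
 * Example for the leaf-4 core count above (illustrative register
 * value): with eax == 0x04000121 returned by cpuid_count(4, 0, ...),
 * the cache-type field (eax & 0x1f) is non-zero and bits 31:26 hold
 * 0x04000121 >> 26 = 1, so the function reports 1 + 1 = 2 cores.
 */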
98static void __cpuinit init_intel(struct cpuinfo_x86 *c)
99{
100 unsigned int l2 = 0;
101 char *p = NULL;
102
103#ifdef CONFIG_X86_F00F_BUG
104 /*
105 * All current models of Pentium and Pentium with MMX technology CPUs
106 * have the F0 0F bug, which lets nonprivileged users lock up the system.
107 	 * Note that the workaround should only be initialized once...
108 */
109 c->f00f_bug = 0;
110 if (!paravirt_enabled() && c->x86 == 5) {
111 static int f00f_workaround_enabled = 0;
112
113 c->f00f_bug = 1;
114 if ( !f00f_workaround_enabled ) {
115 trap_init_f00f_bug();
116 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
117 f00f_workaround_enabled = 1;
118 }
119 }
120#endif
121
122 select_idle_routine(c);
123 l2 = init_intel_cacheinfo(c);
124 if (c->cpuid_level > 9 ) {
125 unsigned eax = cpuid_eax(10);
126 /* Check for version and the number of counters */
127 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
128 set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability);
129 }
130
131 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
132 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
133 clear_bit(X86_FEATURE_SEP, c->x86_capability);
134
135 /* Names for the Pentium II/Celeron processors
136 detectable only by also checking the cache size.
137 Dixon is NOT a Celeron. */
138 if (c->x86 == 6) {
139 switch (c->x86_model) {
140 case 5:
141 if (c->x86_mask == 0) {
142 if (l2 == 0)
143 p = "Celeron (Covington)";
144 else if (l2 == 256)
145 p = "Mobile Pentium II (Dixon)";
146 }
147 break;
148
149 case 6:
150 if (l2 == 128)
151 p = "Celeron (Mendocino)";
152 else if (c->x86_mask == 0 || c->x86_mask == 5)
153 p = "Celeron-A";
154 break;
155
156 case 8:
157 if (l2 == 128)
158 p = "Celeron (Coppermine)";
159 break;
160 }
161 }
162
163 if ( p )
164 strcpy(c->x86_model_id, p);
165
166 c->x86_max_cores = num_cpu_cores(c);
167
168 detect_ht(c);
169
170 /* Work around errata */
171 Intel_errata_workarounds(c);
172
173#ifdef CONFIG_X86_INTEL_USERCOPY
174 /*
175 * Set up the preferred alignment for movsl bulk memory moves
176 */
177 switch (c->x86) {
178 case 4: /* 486: untested */
179 break;
180 case 5: /* Old Pentia: untested */
181 break;
182 case 6: /* PII/PIII only like movsl with 8-byte alignment */
183 movsl_mask.mask = 7;
184 break;
185 case 15: /* P4 is OK down to 8-byte alignment */
186 movsl_mask.mask = 7;
187 break;
188 }
189#endif
190
191 if (c->x86 == 15) {
192 set_bit(X86_FEATURE_P4, c->x86_capability);
193 set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
194 }
195 if (c->x86 == 6)
196 set_bit(X86_FEATURE_P3, c->x86_capability);
197 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
198 (c->x86 == 0x6 && c->x86_model >= 0x0e))
199 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
200
201 if (cpu_has_ds) {
202 unsigned int l1;
203 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
204 if (!(l1 & (1<<11)))
205 set_bit(X86_FEATURE_BTS, c->x86_capability);
206 if (!(l1 & (1<<12)))
207 set_bit(X86_FEATURE_PEBS, c->x86_capability);
208 }
209}
210
211static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
212{
213 /* Intel PIII Tualatin. This comes in two flavours.
214 * One has 256 KB of cache, the other 512 KB. We have no way
215 * to determine which, so we use a boot-time override
216 * for the 512 KB model, and assume 256 KB otherwise.
217 */
218 if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
219 size = 256;
220 return size;
221}
222
223static struct cpu_dev intel_cpu_dev __cpuinitdata = {
224 .c_vendor = "Intel",
225 .c_ident = { "GenuineIntel" },
226 .c_models = {
227 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
228 {
229 [0] = "486 DX-25/33",
230 [1] = "486 DX-50",
231 [2] = "486 SX",
232 [3] = "486 DX/2",
233 [4] = "486 SL",
234 [5] = "486 SX/2",
235 [7] = "486 DX/2-WB",
236 [8] = "486 DX/4",
237 [9] = "486 DX/4-WB"
238 }
239 },
240 { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names =
241 {
242 [0] = "Pentium 60/66 A-step",
243 [1] = "Pentium 60/66",
244 [2] = "Pentium 75 - 200",
245 [3] = "OverDrive PODP5V83",
246 [4] = "Pentium MMX",
247 [7] = "Mobile Pentium 75 - 200",
248 [8] = "Mobile Pentium MMX"
249 }
250 },
251 { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
252 {
253 [0] = "Pentium Pro A-step",
254 [1] = "Pentium Pro",
255 [3] = "Pentium II (Klamath)",
256 [4] = "Pentium II (Deschutes)",
257 [5] = "Pentium II (Deschutes)",
258 [6] = "Mobile Pentium II",
259 [7] = "Pentium III (Katmai)",
260 [8] = "Pentium III (Coppermine)",
261 [10] = "Pentium III (Cascades)",
262 [11] = "Pentium III (Tualatin)",
263 }
264 },
265 { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names =
266 {
267 [0] = "Pentium 4 (Unknown)",
268 [1] = "Pentium 4 (Willamette)",
269 [2] = "Pentium 4 (Northwood)",
270 [4] = "Pentium 4 (Foster)",
271 [5] = "Pentium 4 (Foster)",
272 }
273 },
274 },
275 .c_init = init_intel,
276 .c_size_cache = intel_size_cache,
277};
278
279__init int intel_cpu_init(void)
280{
281 cpu_devs[X86_VENDOR_INTEL] = &intel_cpu_dev;
282 return 0;
283}
284
285#ifndef CONFIG_X86_CMPXCHG
286unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
287{
288 u8 prev;
289 unsigned long flags;
290
291 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
292 local_irq_save(flags);
293 prev = *(u8 *)ptr;
294 if (prev == old)
295 *(u8 *)ptr = new;
296 local_irq_restore(flags);
297 return prev;
298}
299EXPORT_SYMBOL(cmpxchg_386_u8);
300
301unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
302{
303 u16 prev;
304 unsigned long flags;
305
306 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
307 local_irq_save(flags);
308 prev = *(u16 *)ptr;
309 if (prev == old)
310 *(u16 *)ptr = new;
311 local_irq_restore(flags);
312 return prev;
313}
314EXPORT_SYMBOL(cmpxchg_386_u16);
315
316unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
317{
318 u32 prev;
319 unsigned long flags;
320
321 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
322 local_irq_save(flags);
323 prev = *(u32 *)ptr;
324 if (prev == old)
325 *(u32 *)ptr = new;
326 local_irq_restore(flags);
327 return prev;
328}
329EXPORT_SYMBOL(cmpxchg_386_u32);
330#endif
331
332// arch_initcall(intel_cpu_init);
333
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
new file mode 100644
index 000000000000..db6c25aa5776
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -0,0 +1,806 @@
1/*
2 *	Routines to identify caches on Intel CPUs.
3 *
4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */
9
10#include <linux/init.h>
11#include <linux/slab.h>
12#include <linux/device.h>
13#include <linux/compiler.h>
14#include <linux/cpu.h>
15#include <linux/sched.h>
16
17#include <asm/processor.h>
18#include <asm/smp.h>
19
20#define LVL_1_INST 1
21#define LVL_1_DATA 2
22#define LVL_2 3
23#define LVL_3 4
24#define LVL_TRACE 5
25
26struct _cache_table
27{
28 unsigned char descriptor;
29 char cache_type;
30 short size;
31};
32
33/* all the cache descriptor types we care about (no TLB entries) */
34static struct _cache_table cache_table[] __cpuinitdata =
35{
36 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
37 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
38 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
39 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
40 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
41 { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
42 { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */
43 { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */
44 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
45 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
46 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
47 { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */
48 { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */
49 { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */
50 { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */
51 { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
52 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
53 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
54 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
55 { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */
56 { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */
57 { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */
58 { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */
59 { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
60 { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */
61 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
62 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */
63 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */
64 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
65 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
66 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
67 { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
68 { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
69 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
70 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
71 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
72 { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */
73 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
74 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
75 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
76 { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
77 { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */
78 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
79 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
80 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
81 { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */
82 { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */
83 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
84 { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */
85 { 0x00, 0, 0}
86};
87
88
89enum _cache_type
90{
91 CACHE_TYPE_NULL = 0,
92 CACHE_TYPE_DATA = 1,
93 CACHE_TYPE_INST = 2,
94 CACHE_TYPE_UNIFIED = 3
95};
96
97union _cpuid4_leaf_eax {
98 struct {
99 enum _cache_type type:5;
100 unsigned int level:3;
101 unsigned int is_self_initializing:1;
102 unsigned int is_fully_associative:1;
103 unsigned int reserved:4;
104 unsigned int num_threads_sharing:12;
105 unsigned int num_cores_on_die:6;
106 } split;
107 u32 full;
108};
109
110union _cpuid4_leaf_ebx {
111 struct {
112 unsigned int coherency_line_size:12;
113 unsigned int physical_line_partition:10;
114 unsigned int ways_of_associativity:10;
115 } split;
116 u32 full;
117};
118
119union _cpuid4_leaf_ecx {
120 struct {
121 unsigned int number_of_sets:32;
122 } split;
123 u32 full;
124};
125
126struct _cpuid4_info {
127 union _cpuid4_leaf_eax eax;
128 union _cpuid4_leaf_ebx ebx;
129 union _cpuid4_leaf_ecx ecx;
130 unsigned long size;
131 cpumask_t shared_cpu_map;
132};
133
134unsigned short num_cache_leaves;
135
136/* AMD doesn't have CPUID4. Emulate it here to report the same
137 information to the user. This makes some assumptions about the machine:
138   L2 not shared, no SMT etc., which is currently true on AMD CPUs.
139
140 In theory the TLBs could be reported as fake type (they are in "dummy").
141 Maybe later */
142union l1_cache {
143 struct {
144 unsigned line_size : 8;
145 unsigned lines_per_tag : 8;
146 unsigned assoc : 8;
147 unsigned size_in_kb : 8;
148 };
149 unsigned val;
150};
151
152union l2_cache {
153 struct {
154 unsigned line_size : 8;
155 unsigned lines_per_tag : 4;
156 unsigned assoc : 4;
157 unsigned size_in_kb : 16;
158 };
159 unsigned val;
160};
161
162union l3_cache {
163 struct {
164 unsigned line_size : 8;
165 unsigned lines_per_tag : 4;
166 unsigned assoc : 4;
167 unsigned res : 2;
168 unsigned size_encoded : 14;
169 };
170 unsigned val;
171};
172
173static const unsigned short assocs[] = {
174 [1] = 1, [2] = 2, [4] = 4, [6] = 8,
175 [8] = 16, [0xa] = 32, [0xb] = 48,
176 [0xc] = 64,
177 [0xf] = 0xffff // ??
178};
179
180static const unsigned char levels[] = { 1, 1, 2, 3 };
181static const unsigned char types[] = { 1, 2, 3, 3 };
182
183static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
184 union _cpuid4_leaf_ebx *ebx,
185 union _cpuid4_leaf_ecx *ecx)
186{
187 unsigned dummy;
188 unsigned line_size, lines_per_tag, assoc, size_in_kb;
189 union l1_cache l1i, l1d;
190 union l2_cache l2;
191 union l3_cache l3;
192 union l1_cache *l1 = &l1d;
193
194 eax->full = 0;
195 ebx->full = 0;
196 ecx->full = 0;
197
198 cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
199 cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
200
201 switch (leaf) {
202 case 1:
203 l1 = &l1i;
204 case 0:
205 if (!l1->val)
206 return;
207 assoc = l1->assoc;
208 line_size = l1->line_size;
209 lines_per_tag = l1->lines_per_tag;
210 size_in_kb = l1->size_in_kb;
211 break;
212 case 2:
213 if (!l2.val)
214 return;
215 assoc = l2.assoc;
216 line_size = l2.line_size;
217 lines_per_tag = l2.lines_per_tag;
218 /* cpu_data has errata corrections for K7 applied */
219 size_in_kb = current_cpu_data.x86_cache_size;
220 break;
221 case 3:
222 if (!l3.val)
223 return;
224 assoc = l3.assoc;
225 line_size = l3.line_size;
226 lines_per_tag = l3.lines_per_tag;
227 size_in_kb = l3.size_encoded * 512;
228 break;
229 default:
230 return;
231 }
232
233 eax->split.is_self_initializing = 1;
234 eax->split.type = types[leaf];
235 eax->split.level = levels[leaf];
236 if (leaf == 3)
237 eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
238 else
239 eax->split.num_threads_sharing = 0;
240 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
241
242
243 if (assoc == 0xf)
244 eax->split.is_fully_associative = 1;
245 ebx->split.coherency_line_size = line_size - 1;
246 ebx->split.ways_of_associativity = assocs[assoc] - 1;
247 ebx->split.physical_line_partition = lines_per_tag - 1;
248 ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
249 (ebx->split.ways_of_associativity + 1) - 1;
250}
251
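/*
 * Worked example for the emulation above (hypothetical 0x80000006
 * values): a 512 KB, 16-way L2 with 64-byte lines reports assoc = 8
 * (the AMD encoding for 16-way), so ways_of_associativity becomes
 * assocs[8] - 1 = 15, coherency_line_size becomes 63, and
 * number_of_sets = (512 * 1024) / 64 / 16 - 1 = 511.
 */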
252static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
253{
254 union _cpuid4_leaf_eax eax;
255 union _cpuid4_leaf_ebx ebx;
256 union _cpuid4_leaf_ecx ecx;
257 unsigned edx;
258
259 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
260 amd_cpuid4(index, &eax, &ebx, &ecx);
261 else
262 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
263 if (eax.split.type == CACHE_TYPE_NULL)
264 return -EIO; /* better error ? */
265
266 this_leaf->eax = eax;
267 this_leaf->ebx = ebx;
268 this_leaf->ecx = ecx;
269 this_leaf->size = (ecx.split.number_of_sets + 1) *
270 (ebx.split.coherency_line_size + 1) *
271 (ebx.split.physical_line_partition + 1) *
272 (ebx.split.ways_of_associativity + 1);
273 return 0;
274}
275
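/*
 * Worked example of the size computation above (illustrative leaf-4
 * output for a 32 KB, 8-way L1 data cache with 64-byte lines):
 * number_of_sets = 63, coherency_line_size = 63,
 * physical_line_partition = 0, ways_of_associativity = 7, hence
 * size = 64 * 64 * 1 * 8 = 32768 bytes.
 */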
276static int __cpuinit find_num_cache_leaves(void)
277{
278 unsigned int eax, ebx, ecx, edx;
279 union _cpuid4_leaf_eax cache_eax;
280 int i = -1;
281
282 do {
283 ++i;
284 /* Do cpuid(4) loop to find out num_cache_leaves */
285 cpuid_count(4, i, &eax, &ebx, &ecx, &edx);
286 cache_eax.full = eax;
287 } while (cache_eax.split.type != CACHE_TYPE_NULL);
288 return i;
289}
290
291unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
292{
293 unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
294 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
295 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
296 unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
297#ifdef CONFIG_X86_HT
298 unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
299#endif
300
301 if (c->cpuid_level > 3) {
302 static int is_initialized;
303
304 if (is_initialized == 0) {
305 /* Init num_cache_leaves from boot CPU */
306 num_cache_leaves = find_num_cache_leaves();
307 is_initialized++;
308 }
309
310 /*
311 		 * Whenever possible use cpuid(4), the deterministic cache
312 		 * parameters leaf, to find the cache details
313 */
314 for (i = 0; i < num_cache_leaves; i++) {
315 struct _cpuid4_info this_leaf;
316
317 int retval;
318
319 retval = cpuid4_cache_lookup(i, &this_leaf);
320 if (retval >= 0) {
321 switch(this_leaf.eax.split.level) {
322 case 1:
323 if (this_leaf.eax.split.type ==
324 CACHE_TYPE_DATA)
325 new_l1d = this_leaf.size/1024;
326 else if (this_leaf.eax.split.type ==
327 CACHE_TYPE_INST)
328 new_l1i = this_leaf.size/1024;
329 break;
330 case 2:
331 new_l2 = this_leaf.size/1024;
332 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
333 index_msb = get_count_order(num_threads_sharing);
334 l2_id = c->apicid >> index_msb;
335 break;
336 case 3:
337 new_l3 = this_leaf.size/1024;
338 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
339 index_msb = get_count_order(num_threads_sharing);
340 l3_id = c->apicid >> index_msb;
341 break;
342 default:
343 break;
344 }
345 }
346 }
347 }
348 /*
349 * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
350 * trace cache
351 */
352 if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
353 /* supports eax=2 call */
354 int i, j, n;
355 int regs[4];
356 unsigned char *dp = (unsigned char *)regs;
357 int only_trace = 0;
358
359 if (num_cache_leaves != 0 && c->x86 == 15)
360 only_trace = 1;
361
362 /* Number of times to iterate */
363 n = cpuid_eax(2) & 0xFF;
364
365 for ( i = 0 ; i < n ; i++ ) {
366 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
367
368 /* If bit 31 is set, this is an unknown format */
369 for ( j = 0 ; j < 3 ; j++ ) {
370 if ( regs[j] < 0 ) regs[j] = 0;
371 }
372
373 /* Byte 0 is level count, not a descriptor */
374 for ( j = 1 ; j < 16 ; j++ ) {
375 unsigned char des = dp[j];
376 unsigned char k = 0;
377
378 /* look up this descriptor in the table */
379 while (cache_table[k].descriptor != 0)
380 {
381 if (cache_table[k].descriptor == des) {
382 if (only_trace && cache_table[k].cache_type != LVL_TRACE)
383 break;
384 switch (cache_table[k].cache_type) {
385 case LVL_1_INST:
386 l1i += cache_table[k].size;
387 break;
388 case LVL_1_DATA:
389 l1d += cache_table[k].size;
390 break;
391 case LVL_2:
392 l2 += cache_table[k].size;
393 break;
394 case LVL_3:
395 l3 += cache_table[k].size;
396 break;
397 case LVL_TRACE:
398 trace += cache_table[k].size;
399 break;
400 }
401
402 break;
403 }
404
405 k++;
406 }
407 }
408 }
409 }
410
411 if (new_l1d)
412 l1d = new_l1d;
413
414 if (new_l1i)
415 l1i = new_l1i;
416
417 if (new_l2) {
418 l2 = new_l2;
419#ifdef CONFIG_X86_HT
420 cpu_llc_id[cpu] = l2_id;
421#endif
422 }
423
424 if (new_l3) {
425 l3 = new_l3;
426#ifdef CONFIG_X86_HT
427 cpu_llc_id[cpu] = l3_id;
428#endif
429 }
430
431 if (trace)
432 printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
433 else if ( l1i )
434 printk (KERN_INFO "CPU: L1 I cache: %dK", l1i);
435
436 if (l1d)
437 printk(", L1 D cache: %dK\n", l1d);
438 else
439 printk("\n");
440
441 if (l2)
442 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
443
444 if (l3)
445 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
446
447 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
448
449 return l2;
450}
451
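/*
 * Example of the descriptor walk above (illustrative cpuid(2) output):
 * EAX = 0x665b5001 means byte 0 (0x01) is only the iteration count;
 * the remaining descriptor bytes 0x50, 0x5b and 0x66 are looked up in
 * cache_table[], where only 0x66 matches (8 KB L1 data) -- the other
 * two are TLB descriptors, which the table deliberately omits.
 */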
452/* pointer to _cpuid4_info array (for each cache leaf) */
453static struct _cpuid4_info *cpuid4_info[NR_CPUS];
454#define CPUID4_INFO_IDX(x,y) (&((cpuid4_info[x])[y]))
455
456#ifdef CONFIG_SMP
457static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
458{
459 struct _cpuid4_info *this_leaf, *sibling_leaf;
460 unsigned long num_threads_sharing;
461 int index_msb, i;
462 struct cpuinfo_x86 *c = cpu_data;
463
464 this_leaf = CPUID4_INFO_IDX(cpu, index);
465 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
466
467 if (num_threads_sharing == 1)
468 cpu_set(cpu, this_leaf->shared_cpu_map);
469 else {
470 index_msb = get_count_order(num_threads_sharing);
471
472 for_each_online_cpu(i) {
473 if (c[i].apicid >> index_msb ==
474 c[cpu].apicid >> index_msb) {
475 cpu_set(i, this_leaf->shared_cpu_map);
476 if (i != cpu && cpuid4_info[i]) {
477 sibling_leaf = CPUID4_INFO_IDX(i, index);
478 cpu_set(cpu, sibling_leaf->shared_cpu_map);
479 }
480 }
481 }
482 }
483}
484static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
485{
486 struct _cpuid4_info *this_leaf, *sibling_leaf;
487 int sibling;
488
489 this_leaf = CPUID4_INFO_IDX(cpu, index);
490 for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) {
491 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
492 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
493 }
494}
495#else
496static void __init cache_shared_cpu_map_setup(unsigned int cpu, int index) {}
497static void __init cache_remove_shared_cpu_map(unsigned int cpu, int index) {}
498#endif
499
500static void free_cache_attributes(unsigned int cpu)
501{
502 kfree(cpuid4_info[cpu]);
503 cpuid4_info[cpu] = NULL;
504}
505
506static int __cpuinit detect_cache_attributes(unsigned int cpu)
507{
508 struct _cpuid4_info *this_leaf;
509 unsigned long j;
510 int retval;
511 cpumask_t oldmask;
512
513 if (num_cache_leaves == 0)
514 return -ENOENT;
515
516 cpuid4_info[cpu] = kzalloc(
517 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
518 if (cpuid4_info[cpu] == NULL)
519 return -ENOMEM;
520
521 oldmask = current->cpus_allowed;
522 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
523 if (retval)
524 goto out;
525
526 /* Do cpuid and store the results */
527 retval = 0;
528 for (j = 0; j < num_cache_leaves; j++) {
529 this_leaf = CPUID4_INFO_IDX(cpu, j);
530 retval = cpuid4_cache_lookup(j, this_leaf);
531 if (unlikely(retval < 0))
532 break;
533 cache_shared_cpu_map_setup(cpu, j);
534 }
535 set_cpus_allowed(current, oldmask);
536
537out:
538 if (retval)
539 free_cache_attributes(cpu);
540 return retval;
541}
542
543#ifdef CONFIG_SYSFS
544
545#include <linux/kobject.h>
546#include <linux/sysfs.h>
547
548extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
549
550/* pointer to kobject for cpuX/cache */
551static struct kobject * cache_kobject[NR_CPUS];
552
553struct _index_kobject {
554 struct kobject kobj;
555 unsigned int cpu;
556 unsigned short index;
557};
558
559/* pointer to array of kobjects for cpuX/cache/indexY */
560static struct _index_kobject *index_kobject[NR_CPUS];
561#define INDEX_KOBJECT_PTR(x,y) (&((index_kobject[x])[y]))
562
563#define show_one_plus(file_name, object, val) \
564static ssize_t show_##file_name \
565 (struct _cpuid4_info *this_leaf, char *buf) \
566{ \
567 return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \
568}
569
570show_one_plus(level, eax.split.level, 0);
571show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1);
572show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
573show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
574show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
575
576static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf)
577{
578 return sprintf (buf, "%luK\n", this_leaf->size / 1024);
579}
580
581static ssize_t show_shared_cpu_map(struct _cpuid4_info *this_leaf, char *buf)
582{
583 char mask_str[NR_CPUS];
584 cpumask_scnprintf(mask_str, NR_CPUS, this_leaf->shared_cpu_map);
585 return sprintf(buf, "%s\n", mask_str);
586}
587
588static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
589 switch(this_leaf->eax.split.type) {
590 case CACHE_TYPE_DATA:
591 return sprintf(buf, "Data\n");
592 break;
593 case CACHE_TYPE_INST:
594 return sprintf(buf, "Instruction\n");
595 break;
596 case CACHE_TYPE_UNIFIED:
597 return sprintf(buf, "Unified\n");
598 break;
599 default:
600 return sprintf(buf, "Unknown\n");
601 break;
602 }
603}
604
605struct _cache_attr {
606 struct attribute attr;
607 ssize_t (*show)(struct _cpuid4_info *, char *);
608 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
609};
610
611#define define_one_ro(_name) \
612static struct _cache_attr _name = \
613 __ATTR(_name, 0444, show_##_name, NULL)
614
615define_one_ro(level);
616define_one_ro(type);
617define_one_ro(coherency_line_size);
618define_one_ro(physical_line_partition);
619define_one_ro(ways_of_associativity);
620define_one_ro(number_of_sets);
621define_one_ro(size);
622define_one_ro(shared_cpu_map);
623
624static struct attribute * default_attrs[] = {
625 &type.attr,
626 &level.attr,
627 &coherency_line_size.attr,
628 &physical_line_partition.attr,
629 &ways_of_associativity.attr,
630 &number_of_sets.attr,
631 &size.attr,
632 &shared_cpu_map.attr,
633 NULL
634};
635
636#define to_object(k) container_of(k, struct _index_kobject, kobj)
637#define to_attr(a) container_of(a, struct _cache_attr, attr)
638
639static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
640{
641 struct _cache_attr *fattr = to_attr(attr);
642 struct _index_kobject *this_leaf = to_object(kobj);
643 ssize_t ret;
644
645 ret = fattr->show ?
646 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
647 buf) :
648 0;
649 return ret;
650}
651
652static ssize_t store(struct kobject * kobj, struct attribute * attr,
653 const char * buf, size_t count)
654{
655 return 0;
656}
657
658static struct sysfs_ops sysfs_ops = {
659 .show = show,
660 .store = store,
661};
662
663static struct kobj_type ktype_cache = {
664 .sysfs_ops = &sysfs_ops,
665 .default_attrs = default_attrs,
666};
667
668static struct kobj_type ktype_percpu_entry = {
669 .sysfs_ops = &sysfs_ops,
670};
671
672static void cpuid4_cache_sysfs_exit(unsigned int cpu)
673{
674 kfree(cache_kobject[cpu]);
675 kfree(index_kobject[cpu]);
676 cache_kobject[cpu] = NULL;
677 index_kobject[cpu] = NULL;
678 free_cache_attributes(cpu);
679}
680
681static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
682{
683
684 if (num_cache_leaves == 0)
685 return -ENOENT;
686
687 detect_cache_attributes(cpu);
688 if (cpuid4_info[cpu] == NULL)
689 return -ENOENT;
690
691 /* Allocate all required memory */
692 cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
693 if (unlikely(cache_kobject[cpu] == NULL))
694 goto err_out;
695
696 index_kobject[cpu] = kzalloc(
697 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
698 if (unlikely(index_kobject[cpu] == NULL))
699 goto err_out;
700
701 return 0;
702
703err_out:
704 cpuid4_cache_sysfs_exit(cpu);
705 return -ENOMEM;
706}
707
708/* Add/Remove cache interface for CPU device */
709static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
710{
711 unsigned int cpu = sys_dev->id;
712 unsigned long i, j;
713 struct _index_kobject *this_object;
714 int retval = 0;
715
716 retval = cpuid4_cache_sysfs_init(cpu);
717 if (unlikely(retval < 0))
718 return retval;
719
720 cache_kobject[cpu]->parent = &sys_dev->kobj;
721 kobject_set_name(cache_kobject[cpu], "%s", "cache");
722 cache_kobject[cpu]->ktype = &ktype_percpu_entry;
723 retval = kobject_register(cache_kobject[cpu]);
724
725 for (i = 0; i < num_cache_leaves; i++) {
726 this_object = INDEX_KOBJECT_PTR(cpu,i);
727 this_object->cpu = cpu;
728 this_object->index = i;
729 this_object->kobj.parent = cache_kobject[cpu];
730 kobject_set_name(&(this_object->kobj), "index%1lu", i);
731 this_object->kobj.ktype = &ktype_cache;
732 retval = kobject_register(&(this_object->kobj));
733 if (unlikely(retval)) {
734 for (j = 0; j < i; j++) {
735 kobject_unregister(
736 &(INDEX_KOBJECT_PTR(cpu,j)->kobj));
737 }
738 kobject_unregister(cache_kobject[cpu]);
739 cpuid4_cache_sysfs_exit(cpu);
740 break;
741 }
742 }
743 return retval;
744}
745
746static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
747{
748 unsigned int cpu = sys_dev->id;
749 unsigned long i;
750
751 if (cpuid4_info[cpu] == NULL)
752 return;
753 for (i = 0; i < num_cache_leaves; i++) {
754 cache_remove_shared_cpu_map(cpu, i);
755 kobject_unregister(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
756 }
757 kobject_unregister(cache_kobject[cpu]);
758 cpuid4_cache_sysfs_exit(cpu);
759 return;
760}
761
762static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
763 unsigned long action, void *hcpu)
764{
765 unsigned int cpu = (unsigned long)hcpu;
766 struct sys_device *sys_dev;
767
768 sys_dev = get_cpu_sysdev(cpu);
769 switch (action) {
770 case CPU_ONLINE:
771 case CPU_ONLINE_FROZEN:
772 cache_add_dev(sys_dev);
773 break;
774 case CPU_DEAD:
775 case CPU_DEAD_FROZEN:
776 cache_remove_dev(sys_dev);
777 break;
778 }
779 return NOTIFY_OK;
780}
781
782static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier =
783{
784 .notifier_call = cacheinfo_cpu_callback,
785};
786
787static int __cpuinit cache_sysfs_init(void)
788{
789 int i;
790
791 if (num_cache_leaves == 0)
792 return 0;
793
794 register_hotcpu_notifier(&cacheinfo_cpu_notifier);
795
796 for_each_online_cpu(i) {
797 cacheinfo_cpu_callback(&cacheinfo_cpu_notifier, CPU_ONLINE,
798 (void *)(long)i);
799 }
800
801 return 0;
802}
803
804device_initcall(cache_sysfs_init);
805
806#endif
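
The show_one_plus() attributes above export the raw CPUID leaf-4 fields filled in by cpuid4_cache_lookup(), each plus one because the hardware stores them minus one; under the standard leaf-4 encoding the cache size is the product of those decoded fields, which is what show_size() reports in kilobytes. A standalone sketch of that arithmetic (not from this patch), assuming the standard leaf-4 field layout and made-up values for a 32K, 8-way cache with 64-byte lines:

#include <stdio.h>

/* Standalone sketch: how the sysfs values exported above relate to the
 * raw CPUID leaf-4 fields.  The +1 in show_one_plus() undoes the
 * "stored minus one" encoding; the size is the product of the decoded
 * fields.  The numbers below are made up. */
int main(void)
{
	unsigned int ways_minus_1 = 7;		/* EBX[31:22] */
	unsigned int partitions_minus_1 = 0;	/* EBX[21:12] */
	unsigned int line_size_minus_1 = 63;	/* EBX[11:0]  */
	unsigned int sets_minus_1 = 63;		/* ECX[31:0]  */

	unsigned long size = (ways_minus_1 + 1) *
			     (partitions_minus_1 + 1) *
			     (line_size_minus_1 + 1) *
			     (sets_minus_1 + 1);

	printf("%luK\n", size / 1024);		/* prints 32K, as show_size() would */
	return 0;
}
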
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
new file mode 100644
index 000000000000..f1ebe1c1c17a
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -0,0 +1,2 @@
1obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o
2obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
new file mode 100644
index 000000000000..eef63e3630c2
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -0,0 +1,102 @@
1/*
2 * Athlon/Hammer specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h>
10#include <linux/smp.h>
11
12#include <asm/processor.h>
13#include <asm/system.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18/* Machine Check Handler For AMD Athlon/Duron */
19static fastcall void k7_machine_check(struct pt_regs * regs, long error_code)
20{
21 int recover=1;
22 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth;
24 int i;
25
26 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0;
29
30 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl);
32
33 for (i=1; i<nr_mce_banks; i++) {
34 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
35 if (high&(1<<31)) {
36 if (high & (1<<29))
37 recover |= 1;
38 if (high & (1<<25))
39 recover |= 2;
40 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
41 high &= ~(1<<31);
42 if (high & (1<<27)) {
43 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
44 printk ("[%08x%08x]", ahigh, alow);
45 }
46 if (high & (1<<26)) {
47 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
48 printk (" at %08x%08x", ahigh, alow);
49 }
50 printk ("\n");
51 /* Clear it */
52 wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
53 /* Serialize */
54 wmb();
55 add_taint(TAINT_MACHINE_CHECK);
56 }
57 }
58
59 if (recover&2)
60 panic ("CPU context corrupt");
61 if (recover&1)
62 panic ("Unable to continue");
63 printk (KERN_EMERG "Attempting to continue.\n");
64 mcgstl &= ~(1<<2);
65 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
66}
67
68
69/* AMD K7 machine check is Intel like */
70void amd_mcheck_init(struct cpuinfo_x86 *c)
71{
72 u32 l, h;
73 int i;
74
75 if (!cpu_has(c, X86_FEATURE_MCE))
76 return;
77
78 machine_check_vector = k7_machine_check;
79 wmb();
80
81 printk (KERN_INFO "Intel machine check architecture supported.\n");
82 rdmsr (MSR_IA32_MCG_CAP, l, h);
83 if (l & (1<<8)) /* Control register present ? */
84 wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
85 nr_mce_banks = l & 0xff;
86
87 /* Clear status for MC index 0 separately; we don't touch CTL,
88 * as some K7 Athlons cause spurious MCEs when it's enabled. */
89 if (boot_cpu_data.x86 == 6) {
90 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
91 i = 1;
92 } else
93 i = 0;
94 for (; i<nr_mce_banks; i++) {
95 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
96 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
97 }
98
99 set_in_cr4 (X86_CR4_MCE);
100 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
101 smp_processor_id());
102}
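
The bank loop in k7_machine_check() above tests individual bits of the high half of each 64-bit MCi_STATUS register. A standalone sketch naming those bits, assuming the architectural MCA status layout; the macros are defined locally here and are not taken from this patch:

#include <stdio.h>

/* Standalone sketch of the MCi_STATUS bit tests in k7_machine_check()
 * above.  'high' is the upper 32 bits of the 64-bit status register, so
 * bit 31 here is architectural bit 63, and so on.  The sample value is
 * made up. */
#define MCI_STATUS_VAL   (1u << 31)	/* bit 63: entry valid                */
#define MCI_STATUS_UC    (1u << 29)	/* bit 61: uncorrected error          */
#define MCI_STATUS_MISCV (1u << 27)	/* bit 59: MCi_MISC holds extra info  */
#define MCI_STATUS_ADDRV (1u << 26)	/* bit 58: MCi_ADDR holds an address  */
#define MCI_STATUS_PCC   (1u << 25)	/* bit 57: processor context corrupt  */

int main(void)
{
	unsigned int high = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_ADDRV;
	int recover = 1;

	if (high & MCI_STATUS_VAL) {
		if (high & MCI_STATUS_UC)
			recover |= 1;	/* "Unable to continue" case above */
		if (high & MCI_STATUS_PCC)
			recover |= 2;	/* "CPU context corrupt" case above */
		printf("valid error, addr %s, recover=%d\n",
		       (high & MCI_STATUS_ADDRV) ? "logged" : "absent", recover);
	}
	return 0;
}
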
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 000000000000..34c781eddee4
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,90 @@
1/*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/mce.h>
16
17#include "mce.h"
18
19int mce_disabled = 0;
20int nr_mce_banks;
21
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23
24/* Handle unconfigured int18 (should never happen) */
25static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code)
26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28}
29
30/* Call the installed machine check handler for this CPU setup. */
31void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32
33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c)
35{
36 if (mce_disabled==1)
37 return;
38
39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD:
41 amd_mcheck_init(c);
42 break;
43
44 case X86_VENDOR_INTEL:
45 if (c->x86==5)
46 intel_p5_mcheck_init(c);
47 if (c->x86==6)
48 intel_p6_mcheck_init(c);
49 if (c->x86==15)
50 intel_p4_mcheck_init(c);
51 break;
52
53 case X86_VENDOR_CENTAUR:
54 if (c->x86==5)
55 winchip_mcheck_init(c);
56 break;
57
58 default:
59 break;
60 }
61}
62
63static unsigned long old_cr4 __initdata;
64
65void __init stop_mce(void)
66{
67 old_cr4 = read_cr4();
68 clear_in_cr4(X86_CR4_MCE);
69}
70
71void __init restart_mce(void)
72{
73 if (old_cr4 & X86_CR4_MCE)
74 set_in_cr4(X86_CR4_MCE);
75}
76
77static int __init mcheck_disable(char *str)
78{
79 mce_disabled = 1;
80 return 1;
81}
82
83static int __init mcheck_enable(char *str)
84{
85 mce_disabled = -1;
86 return 1;
87}
88
89__setup("nomce", mcheck_disable);
90__setup("mce", mcheck_enable);
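
mce_disabled above is effectively a tri-state, and the two __setup() handlers are the only places that change it from the default. A short summary of the values (taken from this file and from the check in p5.c further down), with a purely hypothetical bootloader line:

    (default)           mce_disabled ==  0   vendor init runs; the P5 handler stays off
    boot with "nomce"   mce_disabled ==  1   mcheck_init() returns immediately
    boot with "mce"     mce_disabled == -1   the only value intel_p5_mcheck_init() accepts

    Hypothetical boot entry forcing P5 reporting on:
        kernel /vmlinuz ro root=/dev/sda1 mce
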
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
new file mode 100644
index 000000000000..81fb6e2d35f3
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -0,0 +1,14 @@
1#include <linux/init.h>
2#include <asm/mce.h>
3
4void amd_mcheck_init(struct cpuinfo_x86 *c);
5void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
6void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
7void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8void winchip_mcheck_init(struct cpuinfo_x86 *c);
9
10/* Call the installed machine check handler for this CPU setup. */
11extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
12
13extern int nr_mce_banks;
14
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
new file mode 100644
index 000000000000..bf39409b3838
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -0,0 +1,91 @@
1/*
2 * Non Fatal Machine Check Exception Reporting
3 *
4 * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk>
5 *
6 * This file contains routines to check for non-fatal MCEs every 15s
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/jiffies.h>
14#include <linux/workqueue.h>
15#include <linux/interrupt.h>
16#include <linux/smp.h>
17#include <linux/module.h>
18
19#include <asm/processor.h>
20#include <asm/system.h>
21#include <asm/msr.h>
22
23#include "mce.h"
24
25static int firstbank;
26
27#define MCE_RATE 15*HZ /* timer rate is 15s */
28
29static void mce_checkregs (void *info)
30{
31 u32 low, high;
32 int i;
33
34 for (i=firstbank; i<nr_mce_banks; i++) {
35 rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
36
37 if (high & (1<<31)) {
38 printk(KERN_INFO "MCE: The hardware reports a non "
39 "fatal, correctable incident occurred on "
40 "CPU %d.\n",
41 smp_processor_id());
42 printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
43
44 /* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
45 wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
46
47 /* Serialize */
48 wmb();
49 add_taint(TAINT_MACHINE_CHECK);
50 }
51 }
52}
53
54static void mce_work_fn(struct work_struct *work);
55static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
56
57static void mce_work_fn(struct work_struct *work)
58{
59 on_each_cpu(mce_checkregs, NULL, 1, 1);
60 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
61}
62
63static int __init init_nonfatal_mce_checker(void)
64{
65 struct cpuinfo_x86 *c = &boot_cpu_data;
66
67 /* Check for MCE support */
68 if (!cpu_has(c, X86_FEATURE_MCE))
69 return -ENODEV;
70
71 /* Check for PPro style MCA */
72 if (!cpu_has(c, X86_FEATURE_MCA))
73 return -ENODEV;
74
75 /* Some Athlons misbehave when we frob bank 0 */
76 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
77 boot_cpu_data.x86 == 6)
78 firstbank = 1;
79 else
80 firstbank = 0;
81
82 /*
83 * Check for non-fatal errors every MCE_RATE s
84 */
85 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
86 printk(KERN_INFO "Machine check exception polling timer started.\n");
87 return 0;
88}
89module_init(init_nonfatal_mce_checker);
90
91MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
new file mode 100644
index 000000000000..1509edfb2313
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -0,0 +1,253 @@
1/*
2 * P4 specific Machine Check Exception Reporting
3 */
4
5#include <linux/init.h>
6#include <linux/types.h>
7#include <linux/kernel.h>
8#include <linux/interrupt.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/msr.h>
14#include <asm/apic.h>
15
16#include <asm/therm_throt.h>
17
18#include "mce.h"
19
20/* as supported by the P4/Xeon family */
21struct intel_mce_extended_msrs {
22 u32 eax;
23 u32 ebx;
24 u32 ecx;
25 u32 edx;
26 u32 esi;
27 u32 edi;
28 u32 ebp;
29 u32 esp;
30 u32 eflags;
31 u32 eip;
32 /* u32 *reserved[]; */
33};
34
35static int mce_num_extended_msrs = 0;
36
37
38#ifdef CONFIG_X86_MCE_P4THERMAL
39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
42 smp_processor_id());
43 add_taint(TAINT_MACHINE_CHECK);
44}
45
46/* P4/Xeon Thermal transition interrupt handler */
47static void intel_thermal_interrupt(struct pt_regs *regs)
48{
49 __u64 msr_val;
50
51 ack_APIC_irq();
52
53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
54 therm_throt_process(msr_val & 0x1);
55}
56
57/* Thermal interrupt handler for this CPU setup */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
59
60fastcall void smp_thermal_interrupt(struct pt_regs *regs)
61{
62 irq_enter();
63 vendor_thermal_interrupt(regs);
64 irq_exit();
65}
66
67/* P4/Xeon Thermal regulation detect and init */
68static void intel_init_thermal(struct cpuinfo_x86 *c)
69{
70 u32 l, h;
71 unsigned int cpu = smp_processor_id();
72
73 /* Thermal monitoring */
74 if (!cpu_has(c, X86_FEATURE_ACPI))
75 return; /* -ENODEV */
76
77 /* Clock modulation */
78 if (!cpu_has(c, X86_FEATURE_ACC))
79 return; /* -ENODEV */
80
81 /* first check if it's enabled already, in which case there might
82 * be some SMM goo which handles it, so we can't even install a handler
83 * since it might already be delivered via SMI -zwanem.
84 */
85 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
86 h = apic_read(APIC_LVTTHMR);
87 if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
88 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
89 cpu);
90 return; /* -EBUSY */
91 }
92
93 /* check whether a vector already exists, temporarily masked? */
94 if (h & APIC_VECTOR_MASK) {
95 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
96 "installed\n",
97 cpu, (h & APIC_VECTOR_MASK));
98 return; /* -EBUSY */
99 }
100
101 /* The temperature transition interrupt handler setup */
102 h = THERMAL_APIC_VECTOR; /* our delivery vector */
103 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
104 apic_write_around(APIC_LVTTHMR, h);
105
106 rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
107 wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
108
109 /* ok we're good to go... */
110 vendor_thermal_interrupt = intel_thermal_interrupt;
111
112 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
113 wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
114
115 l = apic_read (APIC_LVTTHMR);
116 apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
117 printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
118
119 /* enable thermal throttle processing */
120 atomic_set(&therm_throt_en, 1);
121 return;
122}
123#endif /* CONFIG_X86_MCE_P4THERMAL */
124
125
126/* P4/Xeon Extended MCE MSR retrieval */
127static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
128{
129 u32 h;
130
131 rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
132 rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
133 rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
134 rdmsr (MSR_IA32_MCG_EDX, r->edx, h);
135 rdmsr (MSR_IA32_MCG_ESI, r->esi, h);
136 rdmsr (MSR_IA32_MCG_EDI, r->edi, h);
137 rdmsr (MSR_IA32_MCG_EBP, r->ebp, h);
138 rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
139 rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
140 rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
141}
142
143static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
144{
145 int recover=1;
146 u32 alow, ahigh, high, low;
147 u32 mcgstl, mcgsth;
148 int i;
149
150 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
151 if (mcgstl & (1<<0)) /* Recoverable ? */
152 recover=0;
153
154 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
155 smp_processor_id(), mcgsth, mcgstl);
156
157 if (mce_num_extended_msrs > 0) {
158 struct intel_mce_extended_msrs dbg;
159 intel_get_extended_msrs(&dbg);
160 printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
161 smp_processor_id(), dbg.eip, dbg.eflags);
162 printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
163 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx);
164 printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
165 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
166 }
167
168 for (i=0; i<nr_mce_banks; i++) {
169 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
170 if (high & (1<<31)) {
171 if (high & (1<<29))
172 recover |= 1;
173 if (high & (1<<25))
174 recover |= 2;
175 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
176 high &= ~(1<<31);
177 if (high & (1<<27)) {
178 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
179 printk ("[%08x%08x]", ahigh, alow);
180 }
181 if (high & (1<<26)) {
182 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
183 printk (" at %08x%08x", ahigh, alow);
184 }
185 printk ("\n");
186 }
187 }
188
189 if (recover & 2)
190 panic ("CPU context corrupt");
191 if (recover & 1)
192 panic ("Unable to continue");
193
194 printk(KERN_EMERG "Attempting to continue.\n");
195 /*
196 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
197 * recoverable/continuable. This will allow the BIOS to look at the MSRs
198 * for errors if the OS could not log the error.
199 */
200 for (i=0; i<nr_mce_banks; i++) {
201 u32 msr;
202 msr = MSR_IA32_MC0_STATUS+i*4;
203 rdmsr (msr, low, high);
204 if (high&(1<<31)) {
205 /* Clear it */
206 wrmsr(msr, 0UL, 0UL);
207 /* Serialize */
208 wmb();
209 add_taint(TAINT_MACHINE_CHECK);
210 }
211 }
212 mcgstl &= ~(1<<2);
213 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
214}
215
216
217void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
218{
219 u32 l, h;
220 int i;
221
222 machine_check_vector = intel_machine_check;
223 wmb();
224
225 printk (KERN_INFO "Intel machine check architecture supported.\n");
226 rdmsr (MSR_IA32_MCG_CAP, l, h);
227 if (l & (1<<8)) /* Control register present ? */
228 wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
229 nr_mce_banks = l & 0xff;
230
231 for (i=0; i<nr_mce_banks; i++) {
232 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
233 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
234 }
235
236 set_in_cr4 (X86_CR4_MCE);
237 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
238 smp_processor_id());
239
240 /* Check for P4/Xeon extended MCE MSRs */
241 rdmsr (MSR_IA32_MCG_CAP, l, h);
242 if (l & (1<<9)) {/* MCG_EXT_P */
243 mce_num_extended_msrs = (l >> 16) & 0xff;
244 printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
245 " available\n",
246 smp_processor_id(), mce_num_extended_msrs);
247
248#ifdef CONFIG_X86_MCE_P4THERMAL
249 /* Check for P4/Xeon Thermal monitor */
250 intel_init_thermal(c);
251#endif
252 }
253}
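
intel_p4_mcheck_init() above pulls three things out of the low word of IA32_MCG_CAP: the bank count in bits 7:0, the MCG_CTL presence bit 8, and (when MCG_EXT_P, bit 9, is set) the extended-state MSR count in bits 23:16. A standalone sketch of that decode with a made-up register value:

#include <stdio.h>

/* Standalone sketch of the MCG_CAP decoding done in intel_p4_mcheck_init()
 * above.  The value below is made up: 4 banks, MCG_CTL present, extended
 * state present, 12 extended MSRs. */
int main(void)
{
	unsigned int l = (12 << 16) | (1 << 9) | (1 << 8) | 4; /* low word of IA32_MCG_CAP */

	unsigned int nr_banks = l & 0xff;		/* bits  7:0  bank count  */
	int has_mcg_ctl       = !!(l & (1 << 8));	/* bit   8    MCG_CTL_P   */
	int has_extended      = !!(l & (1 << 9));	/* bit   9    MCG_EXT_P   */
	unsigned int ext_msrs = (l >> 16) & 0xff;	/* bits 23:16 MCG_EXT_CNT */

	printf("banks=%u ctl=%d ext=%d ext_msrs=%u\n",
	       nr_banks, has_mcg_ctl, has_extended, ext_msrs);
	/* prints: banks=4 ctl=1 ext=1 ext_msrs=12 */
	return 0;
}
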
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
new file mode 100644
index 000000000000..94bc43d950cf
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -0,0 +1,53 @@
1/*
2 * P5 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h>
10#include <linux/smp.h>
11
12#include <asm/processor.h>
13#include <asm/system.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18/* Machine check handler for Pentium class Intel */
19static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code)
20{
21 u32 loaddr, hi, lotype;
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
23 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
24 printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
25 if(lotype&(1<<5))
26 printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
27 add_taint(TAINT_MACHINE_CHECK);
28}
29
30/* Set up machine check reporting for processors with Intel style MCE */
31void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
32{
33 u32 l, h;
34
35 /*Check for MCE support */
36 if( !cpu_has(c, X86_FEATURE_MCE) )
37 return;
38
39 /* Default P5 to off as it's often misconnected */
40 if(mce_disabled != -1)
41 return;
42 machine_check_vector = pentium_machine_check;
43 wmb();
44
45 /* Read registers before enabling */
46 rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
47 rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
48 printk(KERN_INFO "Intel old style machine check architecture supported.\n");
49
50 /* Enable MCE */
51 set_in_cr4(X86_CR4_MCE);
52 printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
53}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
new file mode 100644
index 000000000000..deeae42ce199
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -0,0 +1,119 @@
1/*
2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h>
10#include <linux/smp.h>
11
12#include <asm/processor.h>
13#include <asm/system.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18/* Machine Check Handler For PII/PIII */
19static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
20{
21 int recover=1;
22 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth;
24 int i;
25
26 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0;
29
30 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl);
32
33 for (i=0; i<nr_mce_banks; i++) {
34 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
35 if (high & (1<<31)) {
36 if (high & (1<<29))
37 recover |= 1;
38 if (high & (1<<25))
39 recover |= 2;
40 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
41 high &= ~(1<<31);
42 if (high & (1<<27)) {
43 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
44 printk ("[%08x%08x]", ahigh, alow);
45 }
46 if (high & (1<<26)) {
47 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
48 printk (" at %08x%08x", ahigh, alow);
49 }
50 printk ("\n");
51 }
52 }
53
54 if (recover & 2)
55 panic ("CPU context corrupt");
56 if (recover & 1)
57 panic ("Unable to continue");
58
59 printk (KERN_EMERG "Attempting to continue.\n");
60 /*
61 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
62 * recoverable/continuable. This will allow the BIOS to look at the MSRs
63 * for errors if the OS could not log the error.
64 */
65 for (i=0; i<nr_mce_banks; i++) {
66 unsigned int msr;
67 msr = MSR_IA32_MC0_STATUS+i*4;
68 rdmsr (msr,low, high);
69 if (high & (1<<31)) {
70 /* Clear it */
71 wrmsr (msr, 0UL, 0UL);
72 /* Serialize */
73 wmb();
74 add_taint(TAINT_MACHINE_CHECK);
75 }
76 }
77 mcgstl &= ~(1<<2);
78 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
79}
80
81/* Set up machine check reporting for processors with Intel style MCE */
82void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
83{
84 u32 l, h;
85 int i;
86
87 /* Check for MCE support */
88 if (!cpu_has(c, X86_FEATURE_MCE))
89 return;
90
91 /* Check for PPro style MCA */
92 if (!cpu_has(c, X86_FEATURE_MCA))
93 return;
94
95 /* Ok machine check is available */
96 machine_check_vector = intel_machine_check;
97 wmb();
98
99 printk (KERN_INFO "Intel machine check architecture supported.\n");
100 rdmsr (MSR_IA32_MCG_CAP, l, h);
101 if (l & (1<<8)) /* Control register present ? */
102 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
103 nr_mce_banks = l & 0xff;
104
105 /*
106 * Following the example in IA-32 SDM Vol 3:
107 * - MC0_CTL should not be written
108 * - Status registers on all banks should be cleared on reset
109 */
110 for (i=1; i<nr_mce_banks; i++)
111 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
112
113 for (i=0; i<nr_mce_banks; i++)
114 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
115
116 set_in_cr4 (X86_CR4_MCE);
117 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
118 smp_processor_id());
119}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
new file mode 100644
index 000000000000..1203dc5ab87a
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -0,0 +1,186 @@
1/*
2 * linux/arch/x86/kernel/cpu/mcheck/therm_throt.c
3 *
4 * Thermal throttle event support code (such as syslog messaging and rate
5 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
6 * This allows consistent reporting of CPU thermal throttle events.
7 *
8 * Maintains a counter in /sys that keeps track of the number of thermal
9 * events, such that the user knows how bad the thermal problem might be
10 * (since the logging to syslog and mcelog is rate limited).
11 *
12 * Author: Dmitriy Zavin (dmitriyz@google.com)
13 *
14 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
15 * Inspired by Ross Biro's and Al Borchers' counter code.
16 */
17
18#include <linux/percpu.h>
19#include <linux/sysdev.h>
20#include <linux/cpu.h>
21#include <asm/cpu.h>
22#include <linux/notifier.h>
23#include <linux/jiffies.h>
24#include <asm/therm_throt.h>
25
26/* How long to wait between reporting thermal events */
27#define CHECK_INTERVAL (300 * HZ)
28
29static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
30static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
31atomic_t therm_throt_en = ATOMIC_INIT(0);
32
33#ifdef CONFIG_SYSFS
34#define define_therm_throt_sysdev_one_ro(_name) \
35 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
36
37#define define_therm_throt_sysdev_show_func(name) \
38static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
39 char *buf) \
40{ \
41 unsigned int cpu = dev->id; \
42 ssize_t ret; \
43 \
44 preempt_disable(); /* CPU hotplug */ \
45 if (cpu_online(cpu)) \
46 ret = sprintf(buf, "%lu\n", \
47 per_cpu(thermal_throttle_##name, cpu)); \
48 else \
49 ret = 0; \
50 preempt_enable(); \
51 \
52 return ret; \
53}
54
55define_therm_throt_sysdev_show_func(count);
56define_therm_throt_sysdev_one_ro(count);
57
58static struct attribute *thermal_throttle_attrs[] = {
59 &attr_count.attr,
60 NULL
61};
62
63static struct attribute_group thermal_throttle_attr_group = {
64 .attrs = thermal_throttle_attrs,
65 .name = "thermal_throttle"
66};
67#endif /* CONFIG_SYSFS */
68
69/***
70 * therm_throt_process - Process thermal throttling event from interrupt
71 * @curr: Whether the condition is current or not (boolean), since the
72 * thermal interrupt normally gets called both when the thermal
73 * event begins and once the event has ended.
74 *
75 * This function is called by the thermal interrupt after the
76 * IRQ has been acknowledged.
77 *
78 * It will take care of rate limiting and printing messages to the syslog.
79 *
80 * Returns: 0 : Event should NOT be further logged, i.e. still in
81 * "timeout" from previous log message.
82 * 1 : Event should be logged further, and a message has been
83 * printed to the syslog.
84 */
85int therm_throt_process(int curr)
86{
87 unsigned int cpu = smp_processor_id();
88 __u64 tmp_jiffs = get_jiffies_64();
89
90 if (curr)
91 __get_cpu_var(thermal_throttle_count)++;
92
93 if (time_before64(tmp_jiffs, __get_cpu_var(next_check)))
94 return 0;
95
96 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
97
98 /* if we just entered the thermal event */
99 if (curr) {
100 printk(KERN_CRIT "CPU%d: Temperature above threshold, "
101 "cpu clock throttled (total events = %lu)\n", cpu,
102 __get_cpu_var(thermal_throttle_count));
103
104 add_taint(TAINT_MACHINE_CHECK);
105 } else {
106 printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
107 }
108
109 return 1;
110}
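
therm_throt_process() above rate-limits syslog output with a per-CPU next_check timestamp while still counting every event. A standalone simulation of that behaviour (not kernel code) over ten minutes of once-per-second throttle interrupts, with an illustrative HZ value:

#include <stdio.h>

/* Standalone sketch of the rate limiting in therm_throt_process(): a
 * per-CPU "next_check" timestamp gates syslog output to once per
 * CHECK_INTERVAL (300 * HZ jiffies, i.e. five minutes), while the
 * per-CPU event counter still increments on every throttle interrupt. */
#define HZ_EXAMPLE     250		/* illustrative jiffies-per-second */
#define CHECK_INTERVAL (300 * HZ_EXAMPLE)

int main(void)
{
	unsigned long long now = 0, next_check = 0;
	unsigned long count = 0;
	int i, logged = 0;

	/* simulate a throttle interrupt firing once a second for 10 minutes */
	for (i = 0; i < 600; i++, now += HZ_EXAMPLE) {
		count++;			/* thermal_throttle_count++ */
		if (now < next_check)
			continue;		/* still in timeout: return 0 */
		next_check = now + CHECK_INTERVAL;
		logged++;			/* return 1: message hits syslog */
	}
	printf("events=%lu, syslog lines=%d\n", count, logged);
	/* prints: events=600, syslog lines=2 */
	return 0;
}
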
111
112#ifdef CONFIG_SYSFS
113/* Add/Remove thermal_throttle interface for CPU device */
114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115{
116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
117}
118
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
120{
121 return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
122}
123
124/* Mutex protecting device creation against CPU hotplug */
125static DEFINE_MUTEX(therm_cpu_lock);
126
127/* Get notified when a cpu comes on/off. Be hotplug friendly. */
128static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
129 unsigned long action,
130 void *hcpu)
131{
132 unsigned int cpu = (unsigned long)hcpu;
133 struct sys_device *sys_dev;
134 int err;
135
136 sys_dev = get_cpu_sysdev(cpu);
137 switch (action) {
138 case CPU_ONLINE:
139 case CPU_ONLINE_FROZEN:
140 mutex_lock(&therm_cpu_lock);
141 err = thermal_throttle_add_dev(sys_dev);
142 mutex_unlock(&therm_cpu_lock);
143 WARN_ON(err);
144 break;
145 case CPU_DEAD:
146 case CPU_DEAD_FROZEN:
147 mutex_lock(&therm_cpu_lock);
148 thermal_throttle_remove_dev(sys_dev);
149 mutex_unlock(&therm_cpu_lock);
150 break;
151 }
152 return NOTIFY_OK;
153}
154
155static struct notifier_block thermal_throttle_cpu_notifier =
156{
157 .notifier_call = thermal_throttle_cpu_callback,
158};
159
160static __init int thermal_throttle_init_device(void)
161{
162 unsigned int cpu = 0;
163 int err;
164
165 if (!atomic_read(&therm_throt_en))
166 return 0;
167
168 register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
169
170#ifdef CONFIG_HOTPLUG_CPU
171 mutex_lock(&therm_cpu_lock);
172#endif
173 /* connect live CPUs to sysfs */
174 for_each_online_cpu(cpu) {
175 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu));
176 WARN_ON(err);
177 }
178#ifdef CONFIG_HOTPLUG_CPU
179 mutex_unlock(&therm_cpu_lock);
180#endif
181
182 return 0;
183}
184
185device_initcall(thermal_throttle_init_device);
186#endif /* CONFIG_SYSFS */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
new file mode 100644
index 000000000000..9e424b6c293d
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -0,0 +1,36 @@
1/*
2 * IDT Winchip specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/msr.h>
14
15#include "mce.h"
16
17/* Machine check handler for WinChip C6 */
18static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code)
19{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
21 add_taint(TAINT_MACHINE_CHECK);
22}
23
24/* Set up machine check reporting on the Winchip C6 series */
25void winchip_mcheck_init(struct cpuinfo_x86 *c)
26{
27 u32 lo, hi;
28 machine_check_vector = winchip_machine_check;
29 wmb();
30 rdmsr(MSR_IDT_FCR1, lo, hi);
31 lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */
32 lo&= ~(1<<4); /* Enable MCE */
33 wrmsr(MSR_IDT_FCR1, lo, hi);
34 set_in_cr4(X86_CR4_MCE);
35 printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
36}
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
new file mode 100644
index 000000000000..191fc0533649
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -0,0 +1,3 @@
1obj-y := main.o if.o generic.o state.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
new file mode 100644
index 000000000000..0949cdbf848a
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -0,0 +1,121 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3#include <asm/mtrr.h>
4#include <asm/msr.h>
5
6#include "mtrr.h"
7
8static void
9amd_get_mtrr(unsigned int reg, unsigned long *base,
10 unsigned long *size, mtrr_type * type)
11{
12 unsigned long low, high;
13
14 rdmsr(MSR_K6_UWCCR, low, high);
15 /* Upper dword is region 1, lower is region 0 */
16 if (reg == 1)
17 low = high;
18 /* The base is masked off to the correct alignment */
19 *base = (low & 0xFFFE0000) >> PAGE_SHIFT;
20 *type = 0;
21 if (low & 1)
22 *type = MTRR_TYPE_UNCACHABLE;
23 if (low & 2)
24 *type = MTRR_TYPE_WRCOMB;
25 if (!(low & 3)) {
26 *size = 0;
27 return;
28 }
29 /*
30 * This needs a little explaining. The size is stored as an
31 * inverted mask of bits of 128K granularity, 15 bits long, offset
32 * by 2 bits.
33 *
34 * So to get the size we invert the mask and add 1 at the lowest
35 * mask bit (4, as it's 2 bits in). This gives us a size we then
36 * shift to turn into 128K blocks.
37 *
38 * eg 111 1111 1111 1100 is 512K
39 *
40 * invert 000 0000 0000 0011
41 * +1 000 0000 0000 0100
42 * *128K ...
43 */
44 low = (~low) & 0x1FFFC;
45 *size = (low + 4) << (15 - PAGE_SHIFT);
46 return;
47}
48
49static void amd_set_mtrr(unsigned int reg, unsigned long base,
50 unsigned long size, mtrr_type type)
51/* [SUMMARY] Set variable MTRR register on the local CPU.
52 <reg> The register to set.
53 <base> The base address of the region.
54 <size> The size of the region. If this is 0 the region is disabled.
55 <type> The type of the region.
56 <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
57 be done externally.
58 [RETURNS] Nothing.
59*/
60{
61 u32 regs[2];
62
63 /*
64 * Low is MTRR0 , High MTRR 1
65 */
66 rdmsr(MSR_K6_UWCCR, regs[0], regs[1]);
67 /*
68 * Blank to disable
69 */
70 if (size == 0)
71 regs[reg] = 0;
72 else
73 /* Set the register to the base, the type (off by one) and an
74 inverted bitmask of the size. The size is the only odd
75 bit. We are fed, say, 512K. We invert this and get 111 1111
76 1111 1011, but if you subtract one and invert you get the
77 desired 111 1111 1111 1100 mask.
78
79 But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */
80 regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC)
81 | (base << PAGE_SHIFT) | (type + 1);
82
83 /*
84 * The writeback rule is quite specific. See the manual. It's:
85 * disable local interrupts, write back the cache, set the MTRR.
86 */
87 wbinvd();
88 wrmsr(MSR_K6_UWCCR, regs[0], regs[1]);
89}
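
The two comments above describe the same UWCCR size field from both directions: amd_set_mtrr() stores -size scaled into a 15-bit field two bits in, and amd_get_mtrr() inverts, adds 4 and shifts back. A standalone round trip of the 512K example (not part of this patch), with PAGE_SHIFT assumed to be 12:

#include <stdio.h>

/* Standalone round trip of the K6 UWCCR size encoding described in the
 * comments above, using the 512K example.  PAGE_SHIFT is assumed to be
 * 12 (4K pages), as on i386. */
#define PAGE_SHIFT_EX 12

int main(void)
{
	unsigned long size = (512 * 1024) >> PAGE_SHIFT_EX;	/* 512K = 128 pages */
	unsigned long low, inv, decoded;

	/* encode, as in amd_set_mtrr(): -size scaled into the mask field */
	low = -size >> (15 - PAGE_SHIFT_EX) & 0x0001FFFC;

	/* decode, as in amd_get_mtrr(): invert, add the lowest mask bit, rescale */
	inv = (~low) & 0x1FFFC;
	decoded = (inv + 4) << (15 - PAGE_SHIFT_EX);		/* back to pages */

	printf("low=0x%05lx decoded=%luK\n",
	       low, (decoded << PAGE_SHIFT_EX) >> 10);
	/* prints: low=0x1fff0 decoded=512K */
	return 0;
}
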
90
91static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
92{
93 /* Apply the K6 block alignment and size rules
94 In order
95 o Uncached or gathering only
96 o 128K or bigger block
97 o Power of 2 block
98 o base suitably aligned to the power
99 */
100 if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT))
101 || (size & ~(size - 1)) - size || (base & (size - 1)))
102 return -EINVAL;
103 return 0;
104}
105
106static struct mtrr_ops amd_mtrr_ops = {
107 .vendor = X86_VENDOR_AMD,
108 .set = amd_set_mtrr,
109 .get = amd_get_mtrr,
110 .get_free_region = generic_get_free_region,
111 .validate_add_page = amd_validate_add_page,
112 .have_wrcomb = positive_have_wrcomb,
113};
114
115int __init amd_init_mtrr(void)
116{
117 set_mtrr_ops(&amd_mtrr_ops);
118 return 0;
119}
120
121//arch_initcall(amd_mtrr_init);
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
new file mode 100644
index 000000000000..cb9aa3a7a7ab
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -0,0 +1,224 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3#include <asm/mtrr.h>
4#include <asm/msr.h>
5#include "mtrr.h"
6
7static struct {
8 unsigned long high;
9 unsigned long low;
10} centaur_mcr[8];
11
12static u8 centaur_mcr_reserved;
13static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
14
15/*
16 * Report boot time MCR setups
17 */
18
19static int
20centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
21/* [SUMMARY] Get a free MTRR.
22 <base> The starting (base) address of the region.
23 <size> The size (in bytes) of the region.
24 [RETURNS] The index of the region on success, else -ENOSPC on error.
25*/
26{
27 int i, max;
28 mtrr_type ltype;
29 unsigned long lbase, lsize;
30
31 max = num_var_ranges;
32 if (replace_reg >= 0 && replace_reg < max)
33 return replace_reg;
34 for (i = 0; i < max; ++i) {
35 if (centaur_mcr_reserved & (1 << i))
36 continue;
37 mtrr_if->get(i, &lbase, &lsize, &ltype);
38 if (lsize == 0)
39 return i;
40 }
41 return -ENOSPC;
42}
43
44void
45mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
46{
47 centaur_mcr[mcr].low = lo;
48 centaur_mcr[mcr].high = hi;
49}
50
51static void
52centaur_get_mcr(unsigned int reg, unsigned long *base,
53 unsigned long *size, mtrr_type * type)
54{
55 *base = centaur_mcr[reg].high >> PAGE_SHIFT;
56 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
57 *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */
58 if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2))
59 *type = MTRR_TYPE_UNCACHABLE;
60 if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25)
61 *type = MTRR_TYPE_WRBACK;
62 if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31)
63 *type = MTRR_TYPE_WRBACK;
64
65}
66
67static void centaur_set_mcr(unsigned int reg, unsigned long base,
68 unsigned long size, mtrr_type type)
69{
70 unsigned long low, high;
71
72 if (size == 0) {
73 /* Disable */
74 high = low = 0;
75 } else {
76 high = base << PAGE_SHIFT;
77 if (centaur_mcr_type == 0)
78 low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */
79 else {
80 if (type == MTRR_TYPE_UNCACHABLE)
81 low = -size << PAGE_SHIFT | 0x02; /* NC */
82 else
83 low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */
84 }
85 }
86 centaur_mcr[reg].high = high;
87 centaur_mcr[reg].low = low;
88 wrmsr(MSR_IDT_MCR0 + reg, low, high);
89}
90
91#if 0
92/*
93 * Initialise the later (saner) Winchip MCR variant. In this version
94 * the BIOS can pass us the registers it has used (but not their values)
95 * and the control register is read/write
96 */
97
98static void __init
99centaur_mcr1_init(void)
100{
101 unsigned i;
102 u32 lo, hi;
103
104 /* Unfortunately, MCR's are read-only, so there is no way to
105 * find out what the bios might have done.
106 */
107
108 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
109 if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */
110 lo &= ~0x1C0; /* clear key */
111 lo |= 0x040; /* set key to 1 */
112 wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */
113 }
114
115 centaur_mcr_type = 1;
116
117 /*
118 * Clear any unconfigured MCR's.
119 */
120
121 for (i = 0; i < 8; ++i) {
122 if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) {
123 if (!(lo & (1 << (9 + i))))
124 wrmsr(MSR_IDT_MCR0 + i, 0, 0);
125 else
126 /*
127 * If the BIOS set up an MCR we cannot see it
128 * but we don't wish to obliterate it
129 */
130 centaur_mcr_reserved |= (1 << i);
131 }
132 }
133 /*
134 * Throw the main write-combining switch...
135 * However if OOSTORE is enabled then people have already done far
136 * cleverer things and we should behave.
137 */
138
139 lo |= 15; /* Write combine enables */
140 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
141}
142
143/*
144 * Initialise the original WinChip with read-only MCR registers,
145 * no used-register bitmask for the BIOS to pass on, and a write-only control
146 */
147
148static void __init
149centaur_mcr0_init(void)
150{
151 unsigned i;
152
153 /* Unfortunately, MCR's are read-only, so there is no way to
154 * find out what the bios might have done.
155 */
156
157 /* Clear any unconfigured MCR's.
158 * This way we are sure that the centaur_mcr array contains the actual
159 * values. The disadvantage is that any BIOS tweaks are thus undone.
160 *
161 */
162 for (i = 0; i < 8; ++i) {
163 if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0)
164 wrmsr(MSR_IDT_MCR0 + i, 0, 0);
165 }
166
167 wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */
168}
169
170/*
171 * Initialise Winchip series MCR registers
172 */
173
174static void __init
175centaur_mcr_init(void)
176{
177 struct set_mtrr_context ctxt;
178
179 set_mtrr_prepare_save(&ctxt);
180 set_mtrr_cache_disable(&ctxt);
181
182 if (boot_cpu_data.x86_model == 4)
183 centaur_mcr0_init();
184 else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9)
185 centaur_mcr1_init();
186
187 set_mtrr_done(&ctxt);
188}
189#endif
190
191static int centaur_validate_add_page(unsigned long base,
192 unsigned long size, unsigned int type)
193{
194 /*
195 * FIXME: Winchip2 supports uncached
196 */
197 if (type != MTRR_TYPE_WRCOMB &&
198 (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
199 printk(KERN_WARNING
200 "mtrr: only write-combining%s supported\n",
201 centaur_mcr_type ? " and uncacheable are"
202 : " is");
203 return -EINVAL;
204 }
205 return 0;
206}
207
208static struct mtrr_ops centaur_mtrr_ops = {
209 .vendor = X86_VENDOR_CENTAUR,
210// .init = centaur_mcr_init,
211 .set = centaur_set_mcr,
212 .get = centaur_get_mcr,
213 .get_free_region = centaur_get_free_region,
214 .validate_add_page = centaur_validate_add_page,
215 .have_wrcomb = positive_have_wrcomb,
216};
217
218int __init centaur_init_mtrr(void)
219{
220 set_mtrr_ops(&centaur_mtrr_ops);
221 return 0;
222}
223
224//arch_initcall(centaur_init_mtrr);
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
new file mode 100644
index 000000000000..2287d4863a8a
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -0,0 +1,380 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3#include <asm/mtrr.h>
4#include <asm/msr.h>
5#include <asm/io.h>
6#include <asm/processor-cyrix.h>
7#include "mtrr.h"
8
9int arr3_protected;
10
11static void
12cyrix_get_arr(unsigned int reg, unsigned long *base,
13 unsigned long *size, mtrr_type * type)
14{
15 unsigned long flags;
16 unsigned char arr, ccr3, rcr, shift;
17
18 arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
19
20 /* Save flags and disable interrupts */
21 local_irq_save(flags);
22
23 ccr3 = getCx86(CX86_CCR3);
24 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
25 ((unsigned char *) base)[3] = getCx86(arr);
26 ((unsigned char *) base)[2] = getCx86(arr + 1);
27 ((unsigned char *) base)[1] = getCx86(arr + 2);
28 rcr = getCx86(CX86_RCR_BASE + reg);
29 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
30
31 /* Enable interrupts if it was enabled previously */
32 local_irq_restore(flags);
33 shift = ((unsigned char *) base)[1] & 0x0f;
34 *base >>= PAGE_SHIFT;
35
36 /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
37 * Note: shift==0xf means 4G, this is unsupported.
38 */
39 if (shift)
40 *size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1);
41 else
42 *size = 0;
43
44 /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */
45 if (reg < 7) {
46 switch (rcr) {
47 case 1:
48 *type = MTRR_TYPE_UNCACHABLE;
49 break;
50 case 8:
51 *type = MTRR_TYPE_WRBACK;
52 break;
53 case 9:
54 *type = MTRR_TYPE_WRCOMB;
55 break;
56 case 24:
57 default:
58 *type = MTRR_TYPE_WRTHROUGH;
59 break;
60 }
61 } else {
62 switch (rcr) {
63 case 0:
64 *type = MTRR_TYPE_UNCACHABLE;
65 break;
66 case 8:
67 *type = MTRR_TYPE_WRCOMB;
68 break;
69 case 9:
70 *type = MTRR_TYPE_WRBACK;
71 break;
72 case 25:
73 default:
74 *type = MTRR_TYPE_WRTHROUGH;
75 break;
76 }
77 }
78}
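
cyrix_get_arr() above recovers the region size from a 4-bit shift value: ARR0-ARR6 start at one 4K page, ARR7 at 0x40 pages (256K), and each step doubles the size. A standalone table of the first few values (not part of this patch):

#include <stdio.h>

/* Standalone decode of the ARR size nibble used in cyrix_get_arr()
 * above; sizes are shown in KB.  shift == 0 means the ARR is disabled,
 * and shift == 0xf on ARR7 (4G) is unsupported per the comment above. */
int main(void)
{
	unsigned int shift;

	for (shift = 1; shift <= 4; shift++) {
		unsigned long arr0_6 = (0x1UL << (shift - 1)) * 4;	/* 4K pages -> KB */
		unsigned long arr7   = (0x40UL << (shift - 1)) * 4;
		printf("shift=%u ARR0-6=%luK ARR7=%luK\n", shift, arr0_6, arr7);
	}
	/*
	 * shift=1 ARR0-6=4K ARR7=256K
	 * shift=2 ARR0-6=8K ARR7=512K
	 * shift=3 ARR0-6=16K ARR7=1024K
	 * shift=4 ARR0-6=32K ARR7=2048K
	 */
	return 0;
}
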
79
80static int
81cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
82/* [SUMMARY] Get a free ARR.
83 <base> The starting (base) address of the region.
84 <size> The size (in bytes) of the region.
85 [RETURNS] The index of the region on success, else -ENOSPC on error.
86*/
87{
88 int i;
89 mtrr_type ltype;
90 unsigned long lbase, lsize;
91
92 switch (replace_reg) {
93 case 7:
94 if (size < 0x40)
95 break;
96 case 6:
97 case 5:
98 case 4:
99 return replace_reg;
100 case 3:
101 if (arr3_protected)
102 break;
103 case 2:
104 case 1:
105 case 0:
106 return replace_reg;
107 }
108 /* If we are to set up a region >32M then look at ARR7 immediately */
109 if (size > 0x2000) {
110 cyrix_get_arr(7, &lbase, &lsize, &ltype);
111 if (lsize == 0)
112 return 7;
113 /* Else try ARR0-ARR6 first */
114 } else {
115 for (i = 0; i < 7; i++) {
116 cyrix_get_arr(i, &lbase, &lsize, &ltype);
117 if ((i == 3) && arr3_protected)
118 continue;
119 if (lsize == 0)
120 return i;
121 }
122 /* ARR0-ARR6 aren't free, try ARR7 but its size must be at least 256K */
123 cyrix_get_arr(i, &lbase, &lsize, &ltype);
124 if ((lsize == 0) && (size >= 0x40))
125 return i;
126 }
127 return -ENOSPC;
128}
129
130static u32 cr4 = 0;
131static u32 ccr3;
132
133static void prepare_set(void)
134{
135 u32 cr0;
136
137 /* Save value of CR4 and clear Page Global Enable (bit 7) */
138 if ( cpu_has_pge ) {
139 cr4 = read_cr4();
140 write_cr4(cr4 & ~X86_CR4_PGE);
141 }
142
143 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
144 a side-effect */
145 cr0 = read_cr0() | 0x40000000;
146 wbinvd();
147 write_cr0(cr0);
148 wbinvd();
149
150 /* Cyrix ARRs - everything else was excluded at the top */
151 ccr3 = getCx86(CX86_CCR3);
152
153 /* Cyrix ARRs - everything else was excluded at the top */
154 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
155
156}
157
158static void post_set(void)
159{
160 /* Flush caches and TLBs */
161 wbinvd();
162
163 /* Cyrix ARRs - everything else was excluded at the top */
164 setCx86(CX86_CCR3, ccr3);
165
166 /* Enable caches */
167 write_cr0(read_cr0() & 0xbfffffff);
168
169 /* Restore value of CR4 */
170 if ( cpu_has_pge )
171 write_cr4(cr4);
172}
173
174static void cyrix_set_arr(unsigned int reg, unsigned long base,
175 unsigned long size, mtrr_type type)
176{
177 unsigned char arr, arr_type, arr_size;
178
179 arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
180
181 /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */
182 if (reg >= 7)
183 size >>= 6;
184
185 size &= 0x7fff; /* make sure arr_size <= 14 */
186 for (arr_size = 0; size; arr_size++, size >>= 1) ;
187
188 if (reg < 7) {
189 switch (type) {
190 case MTRR_TYPE_UNCACHABLE:
191 arr_type = 1;
192 break;
193 case MTRR_TYPE_WRCOMB:
194 arr_type = 9;
195 break;
196 case MTRR_TYPE_WRTHROUGH:
197 arr_type = 24;
198 break;
199 default:
200 arr_type = 8;
201 break;
202 }
203 } else {
204 switch (type) {
205 case MTRR_TYPE_UNCACHABLE:
206 arr_type = 0;
207 break;
208 case MTRR_TYPE_WRCOMB:
209 arr_type = 8;
210 break;
211 case MTRR_TYPE_WRTHROUGH:
212 arr_type = 25;
213 break;
214 default:
215 arr_type = 9;
216 break;
217 }
218 }
219
220 prepare_set();
221
222 base <<= PAGE_SHIFT;
223 setCx86(arr, ((unsigned char *) &base)[3]);
224 setCx86(arr + 1, ((unsigned char *) &base)[2]);
225 setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size);
226 setCx86(CX86_RCR_BASE + reg, arr_type);
227
228 post_set();
229}
230
231typedef struct {
232 unsigned long base;
233 unsigned long size;
234 mtrr_type type;
235} arr_state_t;
236
237static arr_state_t arr_state[8] = {
238 {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
239 {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
240};
241
242static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
243
244static void cyrix_set_all(void)
245{
246 int i;
247
248 prepare_set();
249
250 /* the CCRs are not contiguous */
251 for (i = 0; i < 4; i++)
252 setCx86(CX86_CCR0 + i, ccr_state[i]);
253 for (; i < 7; i++)
254 setCx86(CX86_CCR4 + i, ccr_state[i]);
255 for (i = 0; i < 8; i++)
256 cyrix_set_arr(i, arr_state[i].base,
257 arr_state[i].size, arr_state[i].type);
258
259 post_set();
260}
261
262#if 0
263/*
264 * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection
265 * with the SMM (System Management Mode) mode. So we need the following:
266 * Check whether SMI_LOCK (CCR3 bit 0) is set
267 * if it is set, write a warning message: ARR3 cannot be changed!
268 * (it cannot be changed until the next processor reset)
269 * if it is reset, then we can change it, set all the needed bits:
270 * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset)
271 * - disable access to SMM memory (CCR1 bit 2 reset)
272 * - disable SMM mode (CCR1 bit 1 reset)
273 * - disable write protection of ARR3 (CCR6 bit 1 reset)
274 * - (maybe) disable ARR3
275 * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set)
276 */
277static void __init
278cyrix_arr_init(void)
279{
280 struct set_mtrr_context ctxt;
281 unsigned char ccr[7];
282 int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 };
283#ifdef CONFIG_SMP
284 int i;
285#endif
286
287 /* flush cache and enable MAPEN */
288 set_mtrr_prepare_save(&ctxt);
289 set_mtrr_cache_disable(&ctxt);
290
291 /* Save all CCRs locally */
292 ccr[0] = getCx86(CX86_CCR0);
293 ccr[1] = getCx86(CX86_CCR1);
294 ccr[2] = getCx86(CX86_CCR2);
295 ccr[3] = ctxt.ccr3;
296 ccr[4] = getCx86(CX86_CCR4);
297 ccr[5] = getCx86(CX86_CCR5);
298 ccr[6] = getCx86(CX86_CCR6);
299
300 if (ccr[3] & 1) {
301 ccrc[3] = 1;
302 arr3_protected = 1;
303 } else {
304 /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and
305 * access to SMM memory through ARR3 (bit 7).
306 */
307 if (ccr[1] & 0x80) {
308 ccr[1] &= 0x7f;
309 ccrc[1] |= 0x80;
310 }
311 if (ccr[1] & 0x04) {
312 ccr[1] &= 0xfb;
313 ccrc[1] |= 0x04;
314 }
315 if (ccr[1] & 0x02) {
316 ccr[1] &= 0xfd;
317 ccrc[1] |= 0x02;
318 }
319 arr3_protected = 0;
320 if (ccr[6] & 0x02) {
321 ccr[6] &= 0xfd;
322 ccrc[6] = 1; /* Disable write protection of ARR3 */
323 setCx86(CX86_CCR6, ccr[6]);
324 }
325 /* Disable ARR3. This is safe now that we disabled SMM. */
326 /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */
327 }
328 /* If we changed CCR1 in memory, change it in the processor, too. */
329 if (ccrc[1])
330 setCx86(CX86_CCR1, ccr[1]);
331
332 /* Enable ARR usage by the processor */
333 if (!(ccr[5] & 0x20)) {
334 ccr[5] |= 0x20;
335 ccrc[5] = 1;
336 setCx86(CX86_CCR5, ccr[5]);
337 }
338#ifdef CONFIG_SMP
339 for (i = 0; i < 7; i++)
340 ccr_state[i] = ccr[i];
341 for (i = 0; i < 8; i++)
342 cyrix_get_arr(i,
343 &arr_state[i].base, &arr_state[i].size,
344 &arr_state[i].type);
345#endif
346
347 set_mtrr_done(&ctxt); /* flush cache and disable MAPEN */
348
349 if (ccrc[5])
350 printk(KERN_INFO "mtrr: ARR usage was not enabled, enabled manually\n");
351 if (ccrc[3])
352 printk(KERN_INFO "mtrr: ARR3 cannot be changed\n");
353/*
354 if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n");
355 if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n");
356 if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n");
357*/
358 if (ccrc[6])
359 printk(KERN_INFO "mtrr: ARR3 was write protected, unprotected\n");
360}
361#endif
362
363static struct mtrr_ops cyrix_mtrr_ops = {
364 .vendor = X86_VENDOR_CYRIX,
365// .init = cyrix_arr_init,
366 .set_all = cyrix_set_all,
367 .set = cyrix_set_arr,
368 .get = cyrix_get_arr,
369 .get_free_region = cyrix_get_free_region,
370 .validate_add_page = generic_validate_add_page,
371 .have_wrcomb = positive_have_wrcomb,
372};
373
374int __init cyrix_init_mtrr(void)
375{
376 set_mtrr_ops(&cyrix_mtrr_ops);
377 return 0;
378}
379
380//arch_initcall(cyrix_init_mtrr);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
new file mode 100644
index 000000000000..56f64e34829f
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -0,0 +1,509 @@
1/* This only handles 32-bit MTRRs on 32-bit hosts. This is strictly wrong
2 because MTRRs can span up to 40 bits (36 bits on most modern x86) */
3#include <linux/init.h>
4#include <linux/slab.h>
5#include <linux/mm.h>
6#include <linux/module.h>
7#include <asm/io.h>
8#include <asm/mtrr.h>
9#include <asm/msr.h>
10#include <asm/system.h>
11#include <asm/cpufeature.h>
12#include <asm/tlbflush.h>
13#include "mtrr.h"
14
15struct mtrr_state {
16 struct mtrr_var_range *var_ranges;
17 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
18 unsigned char enabled;
19 unsigned char have_fixed;
20 mtrr_type def_type;
21};
22
23struct fixed_range_block {
24 int base_msr; /* start address of an MTRR block */
25 int ranges; /* number of MTRRs in this block */
26};
27
28static struct fixed_range_block fixed_range_blocks[] = {
29 { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */
30 { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */
31 { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */
32 {}
33};
34
35static unsigned long smp_changes_mask;
36static struct mtrr_state mtrr_state = {};
37
38#undef MODULE_PARAM_PREFIX
39#define MODULE_PARAM_PREFIX "mtrr."
40
41static int mtrr_show;
42module_param_named(show, mtrr_show, bool, 0);
43
44/* Get the MSR pair relating to a var range */
45static void
46get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
47{
48 rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
49 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
50}
51
52static void
53get_fixed_ranges(mtrr_type * frs)
54{
55 unsigned int *p = (unsigned int *) frs;
56 int i;
57
58 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
59
60 for (i = 0; i < 2; i++)
61 rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
62 for (i = 0; i < 8; i++)
63 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
64}
65
66void mtrr_save_fixed_ranges(void *info)
67{
68 if (cpu_has_mtrr)
69 get_fixed_ranges(mtrr_state.fixed_ranges);
70}
71
72static void print_fixed(unsigned base, unsigned step, const mtrr_type*types)
73{
74 unsigned i;
75
76 for (i = 0; i < 8; ++i, ++types, base += step)
77 printk(KERN_INFO "MTRR %05X-%05X %s\n",
78 base, base + step - 1, mtrr_attrib_to_str(*types));
79}
80
81/* Grab all of the MTRR state for this CPU into *state */
82void __init get_mtrr_state(void)
83{
84 unsigned int i;
85 struct mtrr_var_range *vrs;
86 unsigned lo, dummy;
87
88 if (!mtrr_state.var_ranges) {
89 mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range),
90 GFP_KERNEL);
91 if (!mtrr_state.var_ranges)
92 return;
93 }
94 vrs = mtrr_state.var_ranges;
95
96 rdmsr(MTRRcap_MSR, lo, dummy);
97 mtrr_state.have_fixed = (lo >> 8) & 1;
98
99 for (i = 0; i < num_var_ranges; i++)
100 get_mtrr_var_range(i, &vrs[i]);
101 if (mtrr_state.have_fixed)
102 get_fixed_ranges(mtrr_state.fixed_ranges);
103
104 rdmsr(MTRRdefType_MSR, lo, dummy);
105 mtrr_state.def_type = (lo & 0xff);
106 mtrr_state.enabled = (lo & 0xc00) >> 10;
107
108 if (mtrr_show) {
109 int high_width;
110
111 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
112 if (mtrr_state.have_fixed) {
113 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
114 mtrr_state.enabled & 1 ? "en" : "dis");
115 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
116 for (i = 0; i < 2; ++i)
117 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
118 for (i = 0; i < 8; ++i)
119 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
120 }
121 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
122 mtrr_state.enabled & 2 ? "en" : "dis");
123 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
124 for (i = 0; i < num_var_ranges; ++i) {
125 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
126 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
127 i,
128 high_width,
129 mtrr_state.var_ranges[i].base_hi,
130 mtrr_state.var_ranges[i].base_lo >> 12,
131 high_width,
132 mtrr_state.var_ranges[i].mask_hi,
133 mtrr_state.var_ranges[i].mask_lo >> 12,
134 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
135 else
136 printk(KERN_INFO "MTRR %u disabled\n", i);
137 }
138 }
139}
140
141/* Some BIOSes are broken and don't set all MTRRs the same! */
142void __init mtrr_state_warn(void)
143{
144 unsigned long mask = smp_changes_mask;
145
146 if (!mask)
147 return;
148 if (mask & MTRR_CHANGE_MASK_FIXED)
149 printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n");
150 if (mask & MTRR_CHANGE_MASK_VARIABLE)
151 printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n");
152 if (mask & MTRR_CHANGE_MASK_DEFTYPE)
153 printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n");
154 printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
155 printk(KERN_INFO "mtrr: corrected configuration.\n");
156}
157
158/* Doesn't attempt to pass an error out to MTRR users,
159   because it's quite complicated in some cases and probably not
160   worth it; the best error handling is to ignore it. */
161void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
162{
163 if (wrmsr_safe(msr, a, b) < 0)
164 printk(KERN_ERR
165 "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
166 smp_processor_id(), msr, a, b);
167}
168
169/**
170 * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
171 * see AMD publication no. 24593, chapter 3.2.1 for more information
172 */
173static inline void k8_enable_fixed_iorrs(void)
174{
175 unsigned lo, hi;
176
177 rdmsr(MSR_K8_SYSCFG, lo, hi);
178 mtrr_wrmsr(MSR_K8_SYSCFG, lo
179 | K8_MTRRFIXRANGE_DRAM_ENABLE
180 | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
181}
182
183/**
184 * Checks and updates a fixed-range MTRR if it differs from the value it
185 * should have. If K8 extensions are wanted, update the K8 SYSCFG MSR also.
186 * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
187 * \param msr MSR address of the MTRR which should be checked and updated
188 * \param changed pointer which indicates whether the MTRR needed to be changed
189 * \param msrwords pointer to the MSR values which the MSR should have
190 */
191static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
192{
193 unsigned lo, hi;
194
195 rdmsr(msr, lo, hi);
196
197 if (lo != msrwords[0] || hi != msrwords[1]) {
198 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
199 boot_cpu_data.x86 == 15 &&
200 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
201 k8_enable_fixed_iorrs();
202 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
203 *changed = TRUE;
204 }
205}
206
207int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
208/* [SUMMARY] Get a free MTRR.
209 <base> The starting (base) address of the region.
210 <size> The size (in bytes) of the region.
211 [RETURNS] The index of the region on success, else -ENOSPC on error.
212*/
213{
214 int i, max;
215 mtrr_type ltype;
216 unsigned long lbase, lsize;
217
218 max = num_var_ranges;
219 if (replace_reg >= 0 && replace_reg < max)
220 return replace_reg;
221 for (i = 0; i < max; ++i) {
222 mtrr_if->get(i, &lbase, &lsize, &ltype);
223 if (lsize == 0)
224 return i;
225 }
226 return -ENOSPC;
227}
228
229static void generic_get_mtrr(unsigned int reg, unsigned long *base,
230 unsigned long *size, mtrr_type *type)
231{
232 unsigned int mask_lo, mask_hi, base_lo, base_hi;
233
234 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
235 if ((mask_lo & 0x800) == 0) {
236 /* Invalid (i.e. free) range */
237 *base = 0;
238 *size = 0;
239 *type = 0;
240 return;
241 }
242
243 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
244
245 /* Work out the shifted address mask. */
246 mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT)
247 | mask_lo >> PAGE_SHIFT;
248
249 /* This works correctly if size is a power of two, i.e. a
250 contiguous range. */
251 *size = -mask_lo;
252 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
253 *type = base_lo & 0xff;
254}
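
A worked instance of the decode above may help (a sketch, assuming PAGE_SHIFT = 12 and a 36-bit physical address width, so the 32-bit view of size_or_mask is 0xff000000): a 64 MB write-combining region at 0xd8000000 is programmed as base_lo = 0xd8000001 (type WC in bits 7:0), base_hi = 0, mask_lo = 0xfc000800 (valid bit 11 set) and mask_hi = 0xf. generic_get_mtrr() then forms mask_lo = 0xff000000 | (0xf << 20) | (0xfc000800 >> 12) = 0xffffc000, so *size = -mask_lo = 0x4000 pages (64 MB) and *base = 0xd8000 pages, i.e. 0xd8000000.
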
255
256/**
257 * Checks and updates the fixed-range MTRRs if they differ from the saved set
258 * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges()
259 */
260static int set_fixed_ranges(mtrr_type * frs)
261{
262 unsigned long long *saved = (unsigned long long *) frs;
263 int changed = FALSE;
264 int block=-1, range;
265
266 while (fixed_range_blocks[++block].ranges)
267 for (range=0; range < fixed_range_blocks[block].ranges; range++)
268 set_fixed_range(fixed_range_blocks[block].base_msr + range,
269 &changed, (unsigned int *) saved++);
270
271 return changed;
272}
273
274/* Set the MSR pair relating to a var range. Returns TRUE if
275 changes are made */
276static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
277{
278 unsigned int lo, hi;
279 int changed = FALSE;
280
281 rdmsr(MTRRphysBase_MSR(index), lo, hi);
282 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
283 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
284 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
285 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
286 changed = TRUE;
287 }
288
289 rdmsr(MTRRphysMask_MSR(index), lo, hi);
290
291 if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
292 || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
293 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
294 mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
295 changed = TRUE;
296 }
297 return changed;
298}
299
300static u32 deftype_lo, deftype_hi;
301
302static unsigned long set_mtrr_state(void)
303/* [SUMMARY] Set the MTRR state for this CPU.
304 <state> The MTRR state information to read.
305 <ctxt> Some relevant CPU context.
306 [NOTE] The CPU must already be in a safe state for MTRR changes.
307 [RETURNS] 0 if no changes made, else a mask indication what was changed.
308*/
309{
310 unsigned int i;
311 unsigned long change_mask = 0;
312
313 for (i = 0; i < num_var_ranges; i++)
314 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
315 change_mask |= MTRR_CHANGE_MASK_VARIABLE;
316
317 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
318 change_mask |= MTRR_CHANGE_MASK_FIXED;
319
320 /* Set_mtrr_restore restores the old value of MTRRdefType,
321 so to set it we fiddle with the saved value */
322 if ((deftype_lo & 0xff) != mtrr_state.def_type
323 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
324 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10);
325 change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
326 }
327
328 return change_mask;
329}
330
331
332static unsigned long cr4 = 0;
333static DEFINE_SPINLOCK(set_atomicity_lock);
334
335/*
336 * Since we are disabling the cache don't allow any interrupts - they
337 * would run extremely slow and would only increase the pain. The caller must
338 * ensure that local interrupts are disabled and are reenabled after post_set()
339 * has been called.
340 */
341
342static void prepare_set(void) __acquires(set_atomicity_lock)
343{
344 unsigned long cr0;
345
346 /* Note that this is not ideal, since the cache is only flushed/disabled
347 for this CPU while the MTRRs are changed, but changing this requires
348 more invasive changes to the way the kernel boots */
349
350 spin_lock(&set_atomicity_lock);
351
352 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
353 cr0 = read_cr0() | 0x40000000; /* set CD flag */
354 write_cr0(cr0);
355 wbinvd();
356
357 /* Save value of CR4 and clear Page Global Enable (bit 7) */
358 if ( cpu_has_pge ) {
359 cr4 = read_cr4();
360 write_cr4(cr4 & ~X86_CR4_PGE);
361 }
362
363 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
364 __flush_tlb();
365
366 /* Save MTRR state */
367 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
368
369 /* Disable MTRRs, and set the default type to uncached */
370 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
371}
372
373static void post_set(void) __releases(set_atomicity_lock)
374{
375 /* Flush TLBs (no need to flush caches - they are disabled) */
376 __flush_tlb();
377
378 /* Intel (P6) standard MTRRs */
379 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
380
381 /* Enable caches */
382 write_cr0(read_cr0() & 0xbfffffff);
383
384 /* Restore value of CR4 */
385 if ( cpu_has_pge )
386 write_cr4(cr4);
387 spin_unlock(&set_atomicity_lock);
388}
389
390static void generic_set_all(void)
391{
392 unsigned long mask, count;
393 unsigned long flags;
394
395 local_irq_save(flags);
396 prepare_set();
397
398 /* Actually set the state */
399 mask = set_mtrr_state();
400
401 post_set();
402 local_irq_restore(flags);
403
404 /* Use the atomic bitops to update the global mask */
405 for (count = 0; count < sizeof mask * 8; ++count) {
406 if (mask & 0x01)
407 set_bit(count, &smp_changes_mask);
408 mask >>= 1;
409 }
410
411}
412
413static void generic_set_mtrr(unsigned int reg, unsigned long base,
414 unsigned long size, mtrr_type type)
415/* [SUMMARY] Set variable MTRR register on the local CPU.
416 <reg> The register to set.
417 <base> The base address of the region.
418 <size> The size of the region. If this is 0 the region is disabled.
419 <type> The type of the region.
420 <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
421 be done externally.
422 [RETURNS] Nothing.
423*/
424{
425 unsigned long flags;
426 struct mtrr_var_range *vr;
427
428 vr = &mtrr_state.var_ranges[reg];
429
430 local_irq_save(flags);
431 prepare_set();
432
433 if (size == 0) {
434 /* The invalid bit is kept in the mask, so we simply clear the
435 relevant mask register to disable a range. */
436 mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0);
437 memset(vr, 0, sizeof(struct mtrr_var_range));
438 } else {
439 vr->base_lo = base << PAGE_SHIFT | type;
440 vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
441 vr->mask_lo = -size << PAGE_SHIFT | 0x800;
442 vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
443
444 mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi);
445 mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
446 }
447
448 post_set();
449 local_irq_restore(flags);
450}
451
452int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
453{
454 unsigned long lbase, last;
455
456 /* For Intel PPro stepping <= 7, must be 4 MiB aligned
457 and not touch 0x70000000->0x7003FFFF */
458 if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
459 boot_cpu_data.x86_model == 1 &&
460 boot_cpu_data.x86_mask <= 7) {
461 if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
462 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
463 return -EINVAL;
464 }
465 if (!(base + size < 0x70000 || base > 0x7003F) &&
466 (type == MTRR_TYPE_WRCOMB
467 || type == MTRR_TYPE_WRBACK)) {
468 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
469 return -EINVAL;
470 }
471 }
472
473 /* Check upper bits of base and last are equal and lower bits are 0
474 for base and 1 for last */
475 last = base + size - 1;
476 for (lbase = base; !(lbase & 1) && (last & 1);
477 lbase = lbase >> 1, last = last >> 1) ;
478 if (lbase != last) {
479 printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n",
480 base, size);
481 return -EINVAL;
482 }
483 return 0;
484}
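
Two worked instances of the alignment loop above (values illustrative, in pages): base = 0x40000 with size = 0x10000 gives last = 0x4ffff; after sixteen right shifts both lbase and last collapse to 0x4, so the request is accepted. With base = 0x40000 and size = 0xc000, last = 0x4bfff; the loop stops with lbase = 0x10 and last = 0x12, they differ, and the request is rejected with the warning above.
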
485
486
487static int generic_have_wrcomb(void)
488{
489 unsigned long config, dummy;
490 rdmsr(MTRRcap_MSR, config, dummy);
491 return (config & (1 << 10));
492}
493
494int positive_have_wrcomb(void)
495{
496 return 1;
497}
498
499/* generic structure...
500 */
501struct mtrr_ops generic_mtrr_ops = {
502 .use_intel_if = 1,
503 .set_all = generic_set_all,
504 .get = generic_get_mtrr,
505 .get_free_region = generic_get_free_region,
506 .set = generic_set_mtrr,
507 .validate_add_page = generic_validate_add_page,
508 .have_wrcomb = generic_have_wrcomb,
509};
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
new file mode 100644
index 000000000000..c7d8f1756745
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -0,0 +1,439 @@
1#include <linux/init.h>
2#include <linux/proc_fs.h>
3#include <linux/capability.h>
4#include <linux/ctype.h>
5#include <linux/module.h>
6#include <linux/seq_file.h>
7#include <asm/uaccess.h>
8
9#define LINE_SIZE 80
10
11#include <asm/mtrr.h>
12#include "mtrr.h"
13
14/* RED-PEN: this is accessed without any locking */
15extern unsigned int *usage_table;
16
17
18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19
20static const char *const mtrr_strings[MTRR_NUM_TYPES] =
21{
22 "uncachable", /* 0 */
23 "write-combining", /* 1 */
24 "?", /* 2 */
25 "?", /* 3 */
26 "write-through", /* 4 */
27 "write-protect", /* 5 */
28 "write-back", /* 6 */
29};
30
31const char *mtrr_attrib_to_str(int x)
32{
33 return (x <= 6) ? mtrr_strings[x] : "?";
34}
35
36#ifdef CONFIG_PROC_FS
37
38static int
39mtrr_file_add(unsigned long base, unsigned long size,
40 unsigned int type, char increment, struct file *file, int page)
41{
42 int reg, max;
43 unsigned int *fcount = FILE_FCOUNT(file);
44
45 max = num_var_ranges;
46 if (fcount == NULL) {
47 fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
48 if (!fcount)
49 return -ENOMEM;
50 FILE_FCOUNT(file) = fcount;
51 }
52 if (!page) {
53 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
54 return -EINVAL;
55 base >>= PAGE_SHIFT;
56 size >>= PAGE_SHIFT;
57 }
58 reg = mtrr_add_page(base, size, type, 1);
59 if (reg >= 0)
60 ++fcount[reg];
61 return reg;
62}
63
64static int
65mtrr_file_del(unsigned long base, unsigned long size,
66 struct file *file, int page)
67{
68 int reg;
69 unsigned int *fcount = FILE_FCOUNT(file);
70
71 if (!page) {
72 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
73 return -EINVAL;
74 base >>= PAGE_SHIFT;
75 size >>= PAGE_SHIFT;
76 }
77 reg = mtrr_del_page(-1, base, size);
78 if (reg < 0)
79 return reg;
80 if (fcount == NULL)
81 return reg;
82 if (fcount[reg] < 1)
83 return -EINVAL;
84 --fcount[reg];
85 return reg;
86}
87
88/* RED-PEN: seq_file can seek now. this is ignored. */
89static ssize_t
90mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
91/* Format of control line:
92 "base=%Lx size=%Lx type=%s" OR:
93 "disable=%d"
94*/
95{
96 int i, err;
97 unsigned long reg;
98 unsigned long long base, size;
99 char *ptr;
100 char line[LINE_SIZE];
101 size_t linelen;
102
103 if (!capable(CAP_SYS_ADMIN))
104 return -EPERM;
105 if (!len)
106 return -EINVAL;
107 memset(line, 0, LINE_SIZE);
108 if (len > LINE_SIZE)
109 len = LINE_SIZE;
110 if (copy_from_user(line, buf, len - 1))
111 return -EFAULT;
112 linelen = strlen(line);
113 ptr = line + linelen - 1;
114 if (linelen && *ptr == '\n')
115 *ptr = '\0';
116 if (!strncmp(line, "disable=", 8)) {
117 reg = simple_strtoul(line + 8, &ptr, 0);
118 err = mtrr_del_page(reg, 0, 0);
119 if (err < 0)
120 return err;
121 return len;
122 }
123 if (strncmp(line, "base=", 5))
124 return -EINVAL;
125 base = simple_strtoull(line + 5, &ptr, 0);
126 for (; isspace(*ptr); ++ptr) ;
127 if (strncmp(ptr, "size=", 5))
128 return -EINVAL;
129 size = simple_strtoull(ptr + 5, &ptr, 0);
130 if ((base & 0xfff) || (size & 0xfff))
131 return -EINVAL;
132 for (; isspace(*ptr); ++ptr) ;
133 if (strncmp(ptr, "type=", 5))
134 return -EINVAL;
135 ptr += 5;
136 for (; isspace(*ptr); ++ptr) ;
137 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
138 if (strcmp(ptr, mtrr_strings[i]))
139 continue;
140 base >>= PAGE_SHIFT;
141 size >>= PAGE_SHIFT;
142 err =
143 mtrr_add_page((unsigned long) base, (unsigned long) size, i,
144 1);
145 if (err < 0)
146 return err;
147 return len;
148 }
149 return -EINVAL;
150}
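
The control-line format parsed above is the one user space writes into /proc/mtrr. A minimal user-space sketch is shown below; the base and size values are illustrative only, and the write requires CAP_SYS_ADMIN.

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* same syntax as mtrr_write() expects: "base=... size=... type=..." */
		const char *cmd = "base=0xd8000000 size=0x4000000 type=write-combining\n";
		int fd = open("/proc/mtrr", O_WRONLY);

		if (fd < 0) {
			perror("open /proc/mtrr");
			return 1;
		}
		if (write(fd, cmd, strlen(cmd)) < 0)
			perror("write /proc/mtrr");
		close(fd);
		return 0;
	}

A region added this way can later be released by writing "disable=<regnum>" to the same file, which mtrr_write() routes to mtrr_del_page().
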
151
152static long
153mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
154{
155 int err = 0;
156 mtrr_type type;
157 unsigned long size;
158 struct mtrr_sentry sentry;
159 struct mtrr_gentry gentry;
160 void __user *arg = (void __user *) __arg;
161
162 switch (cmd) {
163 case MTRRIOC_ADD_ENTRY:
164 case MTRRIOC_SET_ENTRY:
165 case MTRRIOC_DEL_ENTRY:
166 case MTRRIOC_KILL_ENTRY:
167 case MTRRIOC_ADD_PAGE_ENTRY:
168 case MTRRIOC_SET_PAGE_ENTRY:
169 case MTRRIOC_DEL_PAGE_ENTRY:
170 case MTRRIOC_KILL_PAGE_ENTRY:
171 if (copy_from_user(&sentry, arg, sizeof sentry))
172 return -EFAULT;
173 break;
174 case MTRRIOC_GET_ENTRY:
175 case MTRRIOC_GET_PAGE_ENTRY:
176 if (copy_from_user(&gentry, arg, sizeof gentry))
177 return -EFAULT;
178 break;
179#ifdef CONFIG_COMPAT
180 case MTRRIOC32_ADD_ENTRY:
181 case MTRRIOC32_SET_ENTRY:
182 case MTRRIOC32_DEL_ENTRY:
183 case MTRRIOC32_KILL_ENTRY:
184 case MTRRIOC32_ADD_PAGE_ENTRY:
185 case MTRRIOC32_SET_PAGE_ENTRY:
186 case MTRRIOC32_DEL_PAGE_ENTRY:
187 case MTRRIOC32_KILL_PAGE_ENTRY: {
188 struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg;
189 err = get_user(sentry.base, &s32->base);
190 err |= get_user(sentry.size, &s32->size);
191 err |= get_user(sentry.type, &s32->type);
192 if (err)
193 return err;
194 break;
195 }
196 case MTRRIOC32_GET_ENTRY:
197 case MTRRIOC32_GET_PAGE_ENTRY: {
198 struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg;
199 err = get_user(gentry.regnum, &g32->regnum);
200 err |= get_user(gentry.base, &g32->base);
201 err |= get_user(gentry.size, &g32->size);
202 err |= get_user(gentry.type, &g32->type);
203 if (err)
204 return err;
205 break;
206 }
207#endif
208 }
209
210 switch (cmd) {
211 default:
212 return -ENOTTY;
213 case MTRRIOC_ADD_ENTRY:
214#ifdef CONFIG_COMPAT
215 case MTRRIOC32_ADD_ENTRY:
216#endif
217 if (!capable(CAP_SYS_ADMIN))
218 return -EPERM;
219 err =
220 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1,
221 file, 0);
222 break;
223 case MTRRIOC_SET_ENTRY:
224#ifdef CONFIG_COMPAT
225 case MTRRIOC32_SET_ENTRY:
226#endif
227 if (!capable(CAP_SYS_ADMIN))
228 return -EPERM;
229 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0);
230 break;
231 case MTRRIOC_DEL_ENTRY:
232#ifdef CONFIG_COMPAT
233 case MTRRIOC32_DEL_ENTRY:
234#endif
235 if (!capable(CAP_SYS_ADMIN))
236 return -EPERM;
237 err = mtrr_file_del(sentry.base, sentry.size, file, 0);
238 break;
239 case MTRRIOC_KILL_ENTRY:
240#ifdef CONFIG_COMPAT
241 case MTRRIOC32_KILL_ENTRY:
242#endif
243 if (!capable(CAP_SYS_ADMIN))
244 return -EPERM;
245 err = mtrr_del(-1, sentry.base, sentry.size);
246 break;
247 case MTRRIOC_GET_ENTRY:
248#ifdef CONFIG_COMPAT
249 case MTRRIOC32_GET_ENTRY:
250#endif
251 if (gentry.regnum >= num_var_ranges)
252 return -EINVAL;
253 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
254
255 /* Hide entries that go above 4GB */
256 if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
257 || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
258 gentry.base = gentry.size = gentry.type = 0;
259 else {
260 gentry.base <<= PAGE_SHIFT;
261 gentry.size = size << PAGE_SHIFT;
262 gentry.type = type;
263 }
264
265 break;
266 case MTRRIOC_ADD_PAGE_ENTRY:
267#ifdef CONFIG_COMPAT
268 case MTRRIOC32_ADD_PAGE_ENTRY:
269#endif
270 if (!capable(CAP_SYS_ADMIN))
271 return -EPERM;
272 err =
273 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1,
274 file, 1);
275 break;
276 case MTRRIOC_SET_PAGE_ENTRY:
277#ifdef CONFIG_COMPAT
278 case MTRRIOC32_SET_PAGE_ENTRY:
279#endif
280 if (!capable(CAP_SYS_ADMIN))
281 return -EPERM;
282 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0);
283 break;
284 case MTRRIOC_DEL_PAGE_ENTRY:
285#ifdef CONFIG_COMPAT
286 case MTRRIOC32_DEL_PAGE_ENTRY:
287#endif
288 if (!capable(CAP_SYS_ADMIN))
289 return -EPERM;
290 err = mtrr_file_del(sentry.base, sentry.size, file, 1);
291 break;
292 case MTRRIOC_KILL_PAGE_ENTRY:
293#ifdef CONFIG_COMPAT
294 case MTRRIOC32_KILL_PAGE_ENTRY:
295#endif
296 if (!capable(CAP_SYS_ADMIN))
297 return -EPERM;
298 err = mtrr_del_page(-1, sentry.base, sentry.size);
299 break;
300 case MTRRIOC_GET_PAGE_ENTRY:
301#ifdef CONFIG_COMPAT
302 case MTRRIOC32_GET_PAGE_ENTRY:
303#endif
304 if (gentry.regnum >= num_var_ranges)
305 return -EINVAL;
306 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
307 /* Hide entries that would overflow */
308 if (size != (__typeof__(gentry.size))size)
309 gentry.base = gentry.size = gentry.type = 0;
310 else {
311 gentry.size = size;
312 gentry.type = type;
313 }
314 break;
315 }
316
317 if (err)
318 return err;
319
320 switch(cmd) {
321 case MTRRIOC_GET_ENTRY:
322 case MTRRIOC_GET_PAGE_ENTRY:
323 if (copy_to_user(arg, &gentry, sizeof gentry))
324 err = -EFAULT;
325 break;
326#ifdef CONFIG_COMPAT
327 case MTRRIOC32_GET_ENTRY:
328 case MTRRIOC32_GET_PAGE_ENTRY: {
329 struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg;
330 err = put_user(gentry.base, &g32->base);
331 err |= put_user(gentry.size, &g32->size);
332 err |= put_user(gentry.regnum, &g32->regnum);
333 err |= put_user(gentry.type, &g32->type);
334 break;
335 }
336#endif
337 }
338 return err;
339}
340
341static int
342mtrr_close(struct inode *ino, struct file *file)
343{
344 int i, max;
345 unsigned int *fcount = FILE_FCOUNT(file);
346
347 if (fcount != NULL) {
348 max = num_var_ranges;
349 for (i = 0; i < max; ++i) {
350 while (fcount[i] > 0) {
351 mtrr_del(i, 0, 0);
352 --fcount[i];
353 }
354 }
355 kfree(fcount);
356 FILE_FCOUNT(file) = NULL;
357 }
358 return single_release(ino, file);
359}
360
361static int mtrr_seq_show(struct seq_file *seq, void *offset);
362
363static int mtrr_open(struct inode *inode, struct file *file)
364{
365 if (!mtrr_if)
366 return -EIO;
367 if (!mtrr_if->get)
368 return -ENXIO;
369 return single_open(file, mtrr_seq_show, NULL);
370}
371
372static const struct file_operations mtrr_fops = {
373 .owner = THIS_MODULE,
374 .open = mtrr_open,
375 .read = seq_read,
376 .llseek = seq_lseek,
377 .write = mtrr_write,
378 .unlocked_ioctl = mtrr_ioctl,
379 .compat_ioctl = mtrr_ioctl,
380 .release = mtrr_close,
381};
382
383
384static struct proc_dir_entry *proc_root_mtrr;
385
386
387static int mtrr_seq_show(struct seq_file *seq, void *offset)
388{
389 char factor;
390 int i, max, len;
391 mtrr_type type;
392 unsigned long base, size;
393
394 len = 0;
395 max = num_var_ranges;
396 for (i = 0; i < max; i++) {
397 mtrr_if->get(i, &base, &size, &type);
398 if (size == 0)
399 usage_table[i] = 0;
400 else {
401 if (size < (0x100000 >> PAGE_SHIFT)) {
402 /* less than 1MB */
403 factor = 'K';
404 size <<= PAGE_SHIFT - 10;
405 } else {
406 factor = 'M';
407 size >>= 20 - PAGE_SHIFT;
408 }
409 /* RED-PEN: base can be > 32bit */
410 len += seq_printf(seq,
411 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
412 i, base, base >> (20 - PAGE_SHIFT), size, factor,
413 mtrr_attrib_to_str(type), usage_table[i]);
414 }
415 }
416 return 0;
417}
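
With illustrative values, the format string above produces /proc/mtrr lines such as:

	reg00: base=0x00000000 (   0MB), size=1024MB: write-back, count=1

i.e. the register number, the base (printed as a 4 KiB page number with "000" appended, giving the byte address), the size scaled to KB or MB, the attribute string from mtrr_attrib_to_str(), and the usage count.
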
418
419static int __init mtrr_if_init(void)
420{
421 struct cpuinfo_x86 *c = &boot_cpu_data;
422
423 if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
424 (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
425 (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
426 (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
427 return -ENODEV;
428
429 proc_root_mtrr =
430 create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root);
431 if (proc_root_mtrr) {
432 proc_root_mtrr->owner = THIS_MODULE;
433 proc_root_mtrr->proc_fops = &mtrr_fops;
434 }
435 return 0;
436}
437
438arch_initcall(mtrr_if_init);
439#endif /* CONFIG_PROC_FS */
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
new file mode 100644
index 000000000000..c48b6fea5ab4
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -0,0 +1,768 @@
1/* Generic MTRR (Memory Type Range Register) driver.
2
3 Copyright (C) 1997-2000 Richard Gooch
4 Copyright (c) 2002 Patrick Mochel
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public
17 License along with this library; if not, write to the Free
18 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20 Richard Gooch may be reached by email at rgooch@atnf.csiro.au
21 The postal address is:
22 Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
23
24 Source: "Pentium Pro Family Developer's Manual, Volume 3:
25 Operating System Writer's Guide" (Intel document number 242692),
26 section 11.11.7
27
28 This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
29 on 6-7 March 2002.
30 Source: Intel Architecture Software Developers Manual, Volume 3:
31 System Programming Guide; Section 9.11. (1997 edition - PPro).
32*/
33
34#include <linux/module.h>
35#include <linux/init.h>
36#include <linux/pci.h>
37#include <linux/smp.h>
38#include <linux/cpu.h>
39#include <linux/mutex.h>
40
41#include <asm/mtrr.h>
42
43#include <asm/uaccess.h>
44#include <asm/processor.h>
45#include <asm/msr.h>
46#include "mtrr.h"
47
48u32 num_var_ranges = 0;
49
50unsigned int *usage_table;
51static DEFINE_MUTEX(mtrr_mutex);
52
53u64 size_or_mask, size_and_mask;
54
55static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
56
57struct mtrr_ops * mtrr_if = NULL;
58
59static void set_mtrr(unsigned int reg, unsigned long base,
60 unsigned long size, mtrr_type type);
61
62#ifndef CONFIG_X86_64
63extern int arr3_protected;
64#else
65#define arr3_protected 0
66#endif
67
68void set_mtrr_ops(struct mtrr_ops * ops)
69{
70 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
71 mtrr_ops[ops->vendor] = ops;
72}
73
74/* Returns non-zero if we have the write-combining memory type */
75static int have_wrcomb(void)
76{
77 struct pci_dev *dev;
78 u8 rev;
79
80 if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) {
81		/* ServerWorks LE chipsets < rev 6 have problems with write-combining.
82		   Don't allow it and leave room for other chipsets to be tagged */
83 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
84 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
85 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
86 if (rev <= 5) {
87 printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
88 pci_dev_put(dev);
89 return 0;
90 }
91 }
92		/* Intel 450NX errata #23. Non-ascending cacheline evictions to
93		   write-combining memory may result in data corruption */
94 if (dev->vendor == PCI_VENDOR_ID_INTEL &&
95 dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
96 printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
97 pci_dev_put(dev);
98 return 0;
99 }
100 pci_dev_put(dev);
101 }
102 return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0);
103}
104
105/* This function determines the number of variable MTRRs */
106static void __init set_num_var_ranges(void)
107{
108 unsigned long config = 0, dummy;
109
110 if (use_intel()) {
111 rdmsr(MTRRcap_MSR, config, dummy);
112 } else if (is_cpu(AMD))
113 config = 2;
114 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
115 config = 8;
116 num_var_ranges = config & 0xff;
117}
118
119static void __init init_table(void)
120{
121 int i, max;
122
123 max = num_var_ranges;
124 if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
125 == NULL) {
126 printk(KERN_ERR "mtrr: could not allocate\n");
127 return;
128 }
129 for (i = 0; i < max; i++)
130 usage_table[i] = 1;
131}
132
133struct set_mtrr_data {
134 atomic_t count;
135 atomic_t gate;
136 unsigned long smp_base;
137 unsigned long smp_size;
138 unsigned int smp_reg;
139 mtrr_type smp_type;
140};
141
142#ifdef CONFIG_SMP
143
144static void ipi_handler(void *info)
145/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
146 [RETURNS] Nothing.
147*/
148{
149 struct set_mtrr_data *data = info;
150 unsigned long flags;
151
152 local_irq_save(flags);
153
154 atomic_dec(&data->count);
155 while(!atomic_read(&data->gate))
156 cpu_relax();
157
158 /* The master has cleared me to execute */
159 if (data->smp_reg != ~0U)
160 mtrr_if->set(data->smp_reg, data->smp_base,
161 data->smp_size, data->smp_type);
162 else
163 mtrr_if->set_all();
164
165 atomic_dec(&data->count);
166 while(atomic_read(&data->gate))
167 cpu_relax();
168
169 atomic_dec(&data->count);
170 local_irq_restore(flags);
171}
172
173#endif
174
175static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
176 return type1 == MTRR_TYPE_UNCACHABLE ||
177 type2 == MTRR_TYPE_UNCACHABLE ||
178 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
179 (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
180}
181
182/**
183 * set_mtrr - update mtrrs on all processors
184 * @reg: mtrr in question
185 * @base: mtrr base
186 * @size: mtrr size
187 * @type: mtrr type
188 *
189 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
190 *
191 * 1. Send IPI to do the following:
192 * 2. Disable Interrupts
193 * 3. Wait for all procs to do so
194 * 4. Enter no-fill cache mode
195 * 5. Flush caches
196 * 6. Clear PGE bit
197 * 7. Flush all TLBs
198 * 8. Disable all range registers
199 * 9. Update the MTRRs
200 * 10. Enable all range registers
201 * 11. Flush all TLBs and caches again
202 * 12. Enter normal cache mode and reenable caching
203 * 13. Set PGE
204 * 14. Wait for buddies to catch up
205 * 15. Enable interrupts.
206 *
207 * What does that mean for us? Well, first we set data.count to the number
208 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
209 * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
210 * Meanwhile, they are waiting for that flag to be set. Once it's set, each
211 * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it
212 * differently, so we call mtrr_if->set() callback and let them take care of it.
213 * When they're done, they again decrement data->count and wait for data.gate to
214 * be reset.
215 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag.
216 * Everyone then enables interrupts and we all continue on.
217 *
218 * Note that the mechanism is the same for UP systems, too; all the SMP stuff
219 * becomes nops.
220 */
221static void set_mtrr(unsigned int reg, unsigned long base,
222 unsigned long size, mtrr_type type)
223{
224 struct set_mtrr_data data;
225 unsigned long flags;
226
227 data.smp_reg = reg;
228 data.smp_base = base;
229 data.smp_size = size;
230 data.smp_type = type;
231 atomic_set(&data.count, num_booting_cpus() - 1);
232 /* make sure data.count is visible before unleashing other CPUs */
233 smp_wmb();
234 atomic_set(&data.gate,0);
235
236 /* Start the ball rolling on other CPUs */
237 if (smp_call_function(ipi_handler, &data, 1, 0) != 0)
238 panic("mtrr: timed out waiting for other CPUs\n");
239
240 local_irq_save(flags);
241
242 while(atomic_read(&data.count))
243 cpu_relax();
244
245 /* ok, reset count and toggle gate */
246 atomic_set(&data.count, num_booting_cpus() - 1);
247 smp_wmb();
248 atomic_set(&data.gate,1);
249
250 /* do our MTRR business */
251
252 /* HACK!
253 * We use this same function to initialize the mtrrs on boot.
254 * The state of the boot cpu's mtrrs has been saved, and we want
255 * to replicate across all the APs.
256 * If we're doing that @reg is set to something special...
257 */
258 if (reg != ~0U)
259 mtrr_if->set(reg,base,size,type);
260
261 /* wait for the others */
262 while(atomic_read(&data.count))
263 cpu_relax();
264
265 atomic_set(&data.count, num_booting_cpus() - 1);
266 smp_wmb();
267 atomic_set(&data.gate,0);
268
269 /*
270 * Wait here for everyone to have seen the gate change
271 * So we're the last ones to touch 'data'
272 */
273 while(atomic_read(&data.count))
274 cpu_relax();
275
276 local_irq_restore(flags);
277}
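
The fifteen-step recipe above reduces to a three-phase count/gate handshake between the CPU driving the change and everyone else. The sketch below models just that synchronisation in user space with C11 atomics and pthreads; NCPUS and the printf() calls are illustrative stand-ins for the IPI handler, disabled interrupts and mtrr_if->set(), so this is an illustration of the protocol only, not kernel code.

	#include <stdatomic.h>
	#include <pthread.h>
	#include <stdio.h>

	#define NCPUS 4

	static atomic_int count;
	static atomic_int gate;

	static void wait_count_zero(void)
	{
		while (atomic_load(&count))
			;			/* cpu_relax() in the kernel */
	}

	static void *ap_thread(void *arg)
	{
		atomic_fetch_sub(&count, 1);	/* phase 1: "interrupts disabled" */
		while (!atomic_load(&gate))
			;
		printf("cpu %ld: updating MTRRs\n", (long)arg);	/* mtrr_if->set() */
		atomic_fetch_sub(&count, 1);	/* phase 2: update done */
		while (atomic_load(&gate))
			;
		atomic_fetch_sub(&count, 1);	/* phase 3: ready to resume */
		return NULL;
	}

	int main(void)
	{
		pthread_t t[NCPUS - 1];
		long i;

		atomic_store(&count, NCPUS - 1);
		atomic_store(&gate, 0);
		for (i = 0; i < NCPUS - 1; i++)
			pthread_create(&t[i], NULL, ap_thread, (void *)i);

		wait_count_zero();		/* everyone quiesced */
		atomic_store(&count, NCPUS - 1);
		atomic_store(&gate, 1);		/* release them to update */
		printf("boot cpu: updating MTRRs\n");
		wait_count_zero();		/* everyone has updated */
		atomic_store(&count, NCPUS - 1);
		atomic_store(&gate, 0);		/* let them finish up */
		wait_count_zero();
		for (i = 0; i < NCPUS - 1; i++)
			pthread_join(t[i], NULL);
		return 0;
	}
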
278
279/**
280 * mtrr_add_page - Add a memory type region
281 * @base: Physical base address of region in pages (in units of 4 kB!)
282 * @size: Physical size of region in pages (4 kB)
283 * @type: Type of MTRR desired
284 * @increment: If this is true do usage counting on the region
285 *
286 * Memory type region registers control the caching on newer Intel and
287 * non Intel processors. This function allows drivers to request an
288 * MTRR is added. The details and hardware specifics of each processor's
289 * implementation are hidden from the caller, but nevertheless the
290 * caller should expect to need to provide a power of two size on an
291 * equivalent power of two boundary.
292 *
293 * If the region cannot be added either because all regions are in use
294 * or the CPU cannot support it a negative value is returned. On success
295 * the register number for this entry is returned, but should be treated
296 * as a cookie only.
297 *
298 * On a multiprocessor machine the changes are made to all processors.
299 * This is required on x86 by the Intel processors.
300 *
301 * The available types are
302 *
303 * %MTRR_TYPE_UNCACHABLE - No caching
304 *
305 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
306 *
307 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
308 *
309 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
310 *
311 * BUGS: Needs a quiet flag for the cases where drivers do not mind
312 * failures and do not wish system log messages to be sent.
313 */
314
315int mtrr_add_page(unsigned long base, unsigned long size,
316 unsigned int type, char increment)
317{
318 int i, replace, error;
319 mtrr_type ltype;
320 unsigned long lbase, lsize;
321
322 if (!mtrr_if)
323 return -ENXIO;
324
325 if ((error = mtrr_if->validate_add_page(base,size,type)))
326 return error;
327
328 if (type >= MTRR_NUM_TYPES) {
329 printk(KERN_WARNING "mtrr: type: %u invalid\n", type);
330 return -EINVAL;
331 }
332
333 /* If the type is WC, check that this processor supports it */
334 if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
335 printk(KERN_WARNING
336 "mtrr: your processor doesn't support write-combining\n");
337 return -ENOSYS;
338 }
339
340 if (!size) {
341 printk(KERN_WARNING "mtrr: zero sized request\n");
342 return -EINVAL;
343 }
344
345 if (base & size_or_mask || size & size_or_mask) {
346 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
347 return -EINVAL;
348 }
349
350 error = -EINVAL;
351 replace = -1;
352
353 /* No CPU hotplug when we change MTRR entries */
354 lock_cpu_hotplug();
355 /* Search for existing MTRR */
356 mutex_lock(&mtrr_mutex);
357 for (i = 0; i < num_var_ranges; ++i) {
358 mtrr_if->get(i, &lbase, &lsize, &ltype);
359 if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase)
360 continue;
361 /* At this point we know there is some kind of overlap/enclosure */
362 if (base < lbase || base + size - 1 > lbase + lsize - 1) {
363 if (base <= lbase && base + size - 1 >= lbase + lsize - 1) {
364 /* New region encloses an existing region */
365 if (type == ltype) {
366 replace = replace == -1 ? i : -2;
367 continue;
368 }
369 else if (types_compatible(type, ltype))
370 continue;
371 }
372 printk(KERN_WARNING
373 "mtrr: 0x%lx000,0x%lx000 overlaps existing"
374 " 0x%lx000,0x%lx000\n", base, size, lbase,
375 lsize);
376 goto out;
377 }
378 /* New region is enclosed by an existing region */
379 if (ltype != type) {
380 if (types_compatible(type, ltype))
381 continue;
382 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
383 base, size, mtrr_attrib_to_str(ltype),
384 mtrr_attrib_to_str(type));
385 goto out;
386 }
387 if (increment)
388 ++usage_table[i];
389 error = i;
390 goto out;
391 }
392 /* Search for an empty MTRR */
393 i = mtrr_if->get_free_region(base, size, replace);
394 if (i >= 0) {
395 set_mtrr(i, base, size, type);
396 if (likely(replace < 0))
397 usage_table[i] = 1;
398 else {
399 usage_table[i] = usage_table[replace] + !!increment;
400 if (unlikely(replace != i)) {
401 set_mtrr(replace, 0, 0, 0);
402 usage_table[replace] = 0;
403 }
404 }
405 } else
406 printk(KERN_INFO "mtrr: no more MTRRs available\n");
407 error = i;
408 out:
409 mutex_unlock(&mtrr_mutex);
410 unlock_cpu_hotplug();
411 return error;
412}
413
414static int mtrr_check(unsigned long base, unsigned long size)
415{
416 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
417 printk(KERN_WARNING
418 "mtrr: size and base must be multiples of 4 kiB\n");
419 printk(KERN_DEBUG
420 "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
421 dump_stack();
422 return -1;
423 }
424 return 0;
425}
426
427/**
428 * mtrr_add - Add a memory type region
429 * @base: Physical base address of region
430 * @size: Physical size of region
431 * @type: Type of MTRR desired
432 * @increment: If this is true do usage counting on the region
433 *
434 * Memory type region registers control the caching on newer Intel and
435 * non Intel processors. This function allows drivers to request an
436 * MTRR is added. The details and hardware specifics of each processor's
437 * implementation are hidden from the caller, but nevertheless the
438 * caller should expect to need to provide a power of two size on an
439 * equivalent power of two boundary.
440 *
441 * If the region cannot be added either because all regions are in use
442 * or the CPU cannot support it a negative value is returned. On success
443 * the register number for this entry is returned, but should be treated
444 * as a cookie only.
445 *
446 * On a multiprocessor machine the changes are made to all processors.
447 * This is required on x86 by the Intel processors.
448 *
449 * The available types are
450 *
451 * %MTRR_TYPE_UNCACHABLE - No caching
452 *
453 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
454 *
455 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
456 *
457 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
458 *
459 * BUGS: Needs a quiet flag for the cases where drivers do not mind
460 * failures and do not wish system log messages to be sent.
461 */
462
463int
464mtrr_add(unsigned long base, unsigned long size, unsigned int type,
465 char increment)
466{
467 if (mtrr_check(base, size))
468 return -EINVAL;
469 return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
470 increment);
471}
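
Typical driver usage of the interface documented above looks like the sketch below (the framebuffer base and size are illustrative; a real driver would take them from its PCI BAR). The return value is kept only as the cookie to pass back to mtrr_del().

	static int example_fb_mtrr = -1;	/* hypothetical driver state */

	static void example_fb_enable_wc(unsigned long fb_base, unsigned long fb_size)
	{
		/* e.g. fb_base = 0xd8000000, fb_size = 0x4000000 (64 MB) */
		example_fb_mtrr = mtrr_add(fb_base, fb_size, MTRR_TYPE_WRCOMB, 1);
		if (example_fb_mtrr < 0)
			printk(KERN_INFO "example: write-combining not enabled\n");
	}

	static void example_fb_disable_wc(unsigned long fb_base, unsigned long fb_size)
	{
		if (example_fb_mtrr >= 0)
			mtrr_del(example_fb_mtrr, fb_base, fb_size);
	}
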
472
473/**
474 * mtrr_del_page - delete a memory type region
475 * @reg: Register returned by mtrr_add
476 * @base: Physical base address
477 * @size: Size of region
478 *
479 * If register is supplied then base and size are ignored. This is
480 * how drivers should call it.
481 *
482 * Releases an MTRR region. If the usage count drops to zero the
483 * register is freed and the region returns to default state.
484 * On success the register is returned, on failure a negative error
485 * code.
486 */
487
488int mtrr_del_page(int reg, unsigned long base, unsigned long size)
489{
490 int i, max;
491 mtrr_type ltype;
492 unsigned long lbase, lsize;
493 int error = -EINVAL;
494
495 if (!mtrr_if)
496 return -ENXIO;
497
498 max = num_var_ranges;
499 /* No CPU hotplug when we change MTRR entries */
500 lock_cpu_hotplug();
501 mutex_lock(&mtrr_mutex);
502 if (reg < 0) {
503 /* Search for existing MTRR */
504 for (i = 0; i < max; ++i) {
505 mtrr_if->get(i, &lbase, &lsize, &ltype);
506 if (lbase == base && lsize == size) {
507 reg = i;
508 break;
509 }
510 }
511 if (reg < 0) {
512 printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
513 size);
514 goto out;
515 }
516 }
517 if (reg >= max) {
518 printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
519 goto out;
520 }
521 if (is_cpu(CYRIX) && !use_intel()) {
522 if ((reg == 3) && arr3_protected) {
523 printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n");
524 goto out;
525 }
526 }
527 mtrr_if->get(reg, &lbase, &lsize, &ltype);
528 if (lsize < 1) {
529 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
530 goto out;
531 }
532 if (usage_table[reg] < 1) {
533 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
534 goto out;
535 }
536 if (--usage_table[reg] < 1)
537 set_mtrr(reg, 0, 0, 0);
538 error = reg;
539 out:
540 mutex_unlock(&mtrr_mutex);
541 unlock_cpu_hotplug();
542 return error;
543}
544/**
545 * mtrr_del - delete a memory type region
546 * @reg: Register returned by mtrr_add
547 * @base: Physical base address
548 * @size: Size of region
549 *
550 * If register is supplied then base and size are ignored. This is
551 * how drivers should call it.
552 *
553 * Releases an MTRR region. If the usage count drops to zero the
554 * register is freed and the region returns to default state.
555 * On success the register is returned, on failure a negative error
556 * code.
557 */
558
559int
560mtrr_del(int reg, unsigned long base, unsigned long size)
561{
562 if (mtrr_check(base, size))
563 return -EINVAL;
564 return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
565}
566
567EXPORT_SYMBOL(mtrr_add);
568EXPORT_SYMBOL(mtrr_del);
569
570/* HACK ALERT!
571 * These should be called implicitly, but we can't yet until all the initcall
572 * stuff is done...
573 */
574extern void amd_init_mtrr(void);
575extern void cyrix_init_mtrr(void);
576extern void centaur_init_mtrr(void);
577
578static void __init init_ifs(void)
579{
580#ifndef CONFIG_X86_64
581 amd_init_mtrr();
582 cyrix_init_mtrr();
583 centaur_init_mtrr();
584#endif
585}
586
587/* The suspend/resume methods are only for CPUs without MTRRs. CPUs using the
588 * generic MTRR driver don't require this.
589 */
590struct mtrr_value {
591 mtrr_type ltype;
592 unsigned long lbase;
593 unsigned long lsize;
594};
595
596static struct mtrr_value * mtrr_state;
597
598static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
599{
600 int i;
601 int size = num_var_ranges * sizeof(struct mtrr_value);
602
603 mtrr_state = kzalloc(size,GFP_ATOMIC);
604 if (!mtrr_state)
605 return -ENOMEM;
606
607 for (i = 0; i < num_var_ranges; i++) {
608 mtrr_if->get(i,
609 &mtrr_state[i].lbase,
610 &mtrr_state[i].lsize,
611 &mtrr_state[i].ltype);
612 }
613 return 0;
614}
615
616static int mtrr_restore(struct sys_device * sysdev)
617{
618 int i;
619
620 for (i = 0; i < num_var_ranges; i++) {
621 if (mtrr_state[i].lsize)
622 set_mtrr(i,
623 mtrr_state[i].lbase,
624 mtrr_state[i].lsize,
625 mtrr_state[i].ltype);
626 }
627 kfree(mtrr_state);
628 return 0;
629}
630
631
632
633static struct sysdev_driver mtrr_sysdev_driver = {
634 .suspend = mtrr_save,
635 .resume = mtrr_restore,
636};
637
638
639/**
640 * mtrr_bp_init - initialize mtrrs on the boot CPU
641 *
642 * This needs to be called early; before any of the other CPUs are
643 * initialized (i.e. before smp_init()).
644 *
645 */
646void __init mtrr_bp_init(void)
647{
648 init_ifs();
649
650 if (cpu_has_mtrr) {
651 mtrr_if = &generic_mtrr_ops;
652 size_or_mask = 0xff000000; /* 36 bits */
653 size_and_mask = 0x00f00000;
654
655		/* This is an AMD-specific MSR, but we assume (hope?) that
656		   Intel will implement it too when they extend the address
657		   bus of the Xeon. */
658 if (cpuid_eax(0x80000000) >= 0x80000008) {
659 u32 phys_addr;
660 phys_addr = cpuid_eax(0x80000008) & 0xff;
661 /* CPUID workaround for Intel 0F33/0F34 CPU */
662 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
663 boot_cpu_data.x86 == 0xF &&
664 boot_cpu_data.x86_model == 0x3 &&
665 (boot_cpu_data.x86_mask == 0x3 ||
666 boot_cpu_data.x86_mask == 0x4))
667 phys_addr = 36;
668
669 size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
670 size_and_mask = ~size_or_mask & 0xfffff00000ULL;
671 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
672 boot_cpu_data.x86 == 6) {
673 /* VIA C* family have Intel style MTRRs, but
674 don't support PAE */
675 size_or_mask = 0xfff00000; /* 32 bits */
676 size_and_mask = 0;
677 }
678 } else {
679 switch (boot_cpu_data.x86_vendor) {
680 case X86_VENDOR_AMD:
681 if (cpu_has_k6_mtrr) {
682 /* Pre-Athlon (K6) AMD CPU MTRRs */
683 mtrr_if = mtrr_ops[X86_VENDOR_AMD];
684 size_or_mask = 0xfff00000; /* 32 bits */
685 size_and_mask = 0;
686 }
687 break;
688 case X86_VENDOR_CENTAUR:
689 if (cpu_has_centaur_mcr) {
690 mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
691 size_or_mask = 0xfff00000; /* 32 bits */
692 size_and_mask = 0;
693 }
694 break;
695 case X86_VENDOR_CYRIX:
696 if (cpu_has_cyrix_arr) {
697 mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
698 size_or_mask = 0xfff00000; /* 32 bits */
699 size_and_mask = 0;
700 }
701 break;
702 default:
703 break;
704 }
705 }
706
707 if (mtrr_if) {
708 set_num_var_ranges();
709 init_table();
710 if (use_intel())
711 get_mtrr_state();
712 }
713}
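
A worked instance of the mask setup above (a sketch, assuming PAGE_SHIFT = 12): with a CPUID-reported physical address width of 40 bits, phys_addr - PAGE_SHIFT = 28, so size_or_mask = ~((1ULL << 28) - 1) = 0xfffffffff0000000 and size_and_mask = ~size_or_mask & 0xfffff00000 = 0x0ff00000. Both masks operate on page-frame numbers: mtrr_add_page() rejects any base or size with bits set in size_or_mask, and generic_set_mtrr() uses size_and_mask to keep only the implemented high address bits when it programs base_hi and mask_hi.
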
714
715void mtrr_ap_init(void)
716{
717 unsigned long flags;
718
719 if (!mtrr_if || !use_intel())
720 return;
721	/*
722	 * Ideally we should hold mtrr_mutex here to avoid MTRR entries being
723	 * changed, but this routine is called at CPU boot time and holding the
724	 * lock there breaks it. This routine is called in two cases: 1. very
725	 * early during software resume, when there are absolutely no MTRR
726	 * entry changes; 2. at CPU hot-add time. We let mtrr_add/del_page hold
727	 * the cpuhotplug lock to prevent MTRR entry changes.
728	 */
729 local_irq_save(flags);
730
731 mtrr_if->set_all();
732
733 local_irq_restore(flags);
734}
735
736/**
737 * Save current fixed-range MTRR state of the BSP
738 */
739void mtrr_save_state(void)
740{
741 int cpu = get_cpu();
742
743 if (cpu == 0)
744 mtrr_save_fixed_ranges(NULL);
745 else
746 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1);
747 put_cpu();
748}
749
750static int __init mtrr_init_finialize(void)
751{
752 if (!mtrr_if)
753 return 0;
754 if (use_intel())
755 mtrr_state_warn();
756 else {
757		/* These CPUs have no MTRRs and do not seem to support SMP. They
758		 * have specific drivers; we use a tricky method to support
759		 * suspend/resume for them.
760		 * TBD: is there any system with such a CPU which supports
761		 * suspend/resume? If not, we should remove this code.
762		 */
763 sysdev_driver_register(&cpu_sysdev_class,
764 &mtrr_sysdev_driver);
765 }
766 return 0;
767}
768subsys_initcall(mtrr_init_finialize);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
new file mode 100644
index 000000000000..289dfe6030e3
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -0,0 +1,98 @@
1/*
2 * local mtrr defines.
3 */
4
5#ifndef TRUE
6#define TRUE 1
7#define FALSE 0
8#endif
9
10#define MTRRcap_MSR 0x0fe
11#define MTRRdefType_MSR 0x2ff
12
13#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
14#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
15
16#define NUM_FIXED_RANGES 88
17#define MTRRfix64K_00000_MSR 0x250
18#define MTRRfix16K_80000_MSR 0x258
19#define MTRRfix16K_A0000_MSR 0x259
20#define MTRRfix4K_C0000_MSR 0x268
21#define MTRRfix4K_C8000_MSR 0x269
22#define MTRRfix4K_D0000_MSR 0x26a
23#define MTRRfix4K_D8000_MSR 0x26b
24#define MTRRfix4K_E0000_MSR 0x26c
25#define MTRRfix4K_E8000_MSR 0x26d
26#define MTRRfix4K_F0000_MSR 0x26e
27#define MTRRfix4K_F8000_MSR 0x26f
28
29#define MTRR_CHANGE_MASK_FIXED 0x01
30#define MTRR_CHANGE_MASK_VARIABLE 0x02
31#define MTRR_CHANGE_MASK_DEFTYPE 0x04
32
33/* In the Intel processor's MTRR interface, the MTRR type is always held in
34 an 8 bit field: */
35typedef u8 mtrr_type;
36
37struct mtrr_ops {
38 u32 vendor;
39 u32 use_intel_if;
40// void (*init)(void);
41 void (*set)(unsigned int reg, unsigned long base,
42 unsigned long size, mtrr_type type);
43 void (*set_all)(void);
44
45 void (*get)(unsigned int reg, unsigned long *base,
46 unsigned long *size, mtrr_type * type);
47 int (*get_free_region)(unsigned long base, unsigned long size,
48 int replace_reg);
49 int (*validate_add_page)(unsigned long base, unsigned long size,
50 unsigned int type);
51 int (*have_wrcomb)(void);
52};
53
54extern int generic_get_free_region(unsigned long base, unsigned long size,
55 int replace_reg);
56extern int generic_validate_add_page(unsigned long base, unsigned long size,
57 unsigned int type);
58
59extern struct mtrr_ops generic_mtrr_ops;
60
61extern int positive_have_wrcomb(void);
62
63/* library functions for processor-specific routines */
64struct set_mtrr_context {
65 unsigned long flags;
66 unsigned long cr4val;
67 u32 deftype_lo;
68 u32 deftype_hi;
69 u32 ccr3;
70};
71
72struct mtrr_var_range {
73 u32 base_lo;
74 u32 base_hi;
75 u32 mask_lo;
76 u32 mask_hi;
77};
78
79void set_mtrr_done(struct set_mtrr_context *ctxt);
80void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
81void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
82
83void get_mtrr_state(void);
84
85extern void set_mtrr_ops(struct mtrr_ops * ops);
86
87extern u64 size_or_mask, size_and_mask;
88extern struct mtrr_ops * mtrr_if;
89
90#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
91#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
92
93extern unsigned int num_var_ranges;
94
95void mtrr_state_warn(void);
96const char *mtrr_attrib_to_str(int x);
97void mtrr_wrmsr(unsigned, unsigned, unsigned);
98
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
new file mode 100644
index 000000000000..49e20c2afcdf
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -0,0 +1,79 @@
1#include <linux/mm.h>
2#include <linux/init.h>
3#include <asm/io.h>
4#include <asm/mtrr.h>
5#include <asm/msr.h>
6#include <asm/processor-cyrix.h>
7#include "mtrr.h"
8
9
10/* Put the processor into a state where MTRRs can be safely set */
11void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
12{
13 unsigned int cr0;
14
15 /* Disable interrupts locally */
16 local_irq_save(ctxt->flags);
17
18 if (use_intel() || is_cpu(CYRIX)) {
19
20 /* Save value of CR4 and clear Page Global Enable (bit 7) */
21 if ( cpu_has_pge ) {
22 ctxt->cr4val = read_cr4();
23 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
24 }
25
26 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
27 a side-effect */
28 cr0 = read_cr0() | 0x40000000;
29 wbinvd();
30 write_cr0(cr0);
31 wbinvd();
32
33 if (use_intel())
34 /* Save MTRR state */
35 rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
36 else
37			/* Cyrix ARRs - everything else was excluded at the top */
38 ctxt->ccr3 = getCx86(CX86_CCR3);
39 }
40}
41
42void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
43{
44 if (use_intel())
45 /* Disable MTRRs, and set the default type to uncached */
46 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
47 ctxt->deftype_hi);
48 else if (is_cpu(CYRIX))
49		/* Cyrix ARRs - everything else was excluded at the top */
50 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
51}
52
53/* Restore the processor after a set_mtrr_prepare */
54void set_mtrr_done(struct set_mtrr_context *ctxt)
55{
56 if (use_intel() || is_cpu(CYRIX)) {
57
58 /* Flush caches and TLBs */
59 wbinvd();
60
61 /* Restore MTRRdefType */
62 if (use_intel())
63 /* Intel (P6) standard MTRRs */
64 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
65 else
66 /* Cyrix ARRs - everything else was excluded at the top */
67 setCx86(CX86_CCR3, ctxt->ccr3);
68
69 /* Enable caches */
70 write_cr0(read_cr0() & 0xbfffffff);
71
72 /* Restore value of CR4 */
73 if ( cpu_has_pge )
74 write_cr4(ctxt->cr4val);
75 }
76 /* Re-enable interrupts locally (if enabled previously) */
77 local_irq_restore(ctxt->flags);
78}
79
diff --git a/arch/x86/kernel/cpu/nexgen.c b/arch/x86/kernel/cpu/nexgen.c
new file mode 100644
index 000000000000..961fbe1a748f
--- /dev/null
+++ b/arch/x86/kernel/cpu/nexgen.c
@@ -0,0 +1,60 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <linux/string.h>
4#include <asm/processor.h>
5
6#include "cpu.h"
7
8/*
9 * Detect a NexGen CPU running without BIOS hypercode new enough
10 * to have CPUID. (Thanks to Herbert Oppmann)
11 */
12
13static int __cpuinit deep_magic_nexgen_probe(void)
14{
15 int ret;
16
17 __asm__ __volatile__ (
18 " movw $0x5555, %%ax\n"
19 " xorw %%dx,%%dx\n"
20 " movw $2, %%cx\n"
21 " divw %%cx\n"
22 " movl $0, %%eax\n"
23 " jnz 1f\n"
24 " movl $1, %%eax\n"
25 "1:\n"
26 : "=a" (ret) : : "cx", "dx" );
27 return ret;
28}
29
30static void __cpuinit init_nexgen(struct cpuinfo_x86 * c)
31{
32 c->x86_cache_size = 256; /* A few had 1 MB... */
33}
34
35static void __cpuinit nexgen_identify(struct cpuinfo_x86 * c)
36{
37 /* Detect NexGen with old hypercode */
38 if ( deep_magic_nexgen_probe() ) {
39 strcpy(c->x86_vendor_id, "NexGenDriven");
40 }
41}
42
43static struct cpu_dev nexgen_cpu_dev __cpuinitdata = {
44 .c_vendor = "Nexgen",
45 .c_ident = { "NexGenDriven" },
46 .c_models = {
47 { .vendor = X86_VENDOR_NEXGEN,
48 .family = 5,
49 .model_names = { [1] = "Nx586" }
50 },
51 },
52 .c_init = init_nexgen,
53 .c_identify = nexgen_identify,
54};
55
56int __init nexgen_init_cpu(void)
57{
58 cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev;
59 return 0;
60}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
new file mode 100644
index 000000000000..93fecd4b03de
--- /dev/null
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -0,0 +1,713 @@
1/* local apic based NMI watchdog for various CPUs.
2 This file also handles reservation of performance counters for coordination
3 with other users (like oprofile).
4
5 Note that these events normally don't tick when the CPU idles. This means
6 the frequency varies with CPU load.
7
8 Original code for K7/P6 written by Keith Owens */
9
10#include <linux/percpu.h>
11#include <linux/module.h>
12#include <linux/kernel.h>
13#include <linux/bitops.h>
14#include <linux/smp.h>
15#include <linux/nmi.h>
16#include <asm/apic.h>
17#include <asm/intel_arch_perfmon.h>
18
19struct nmi_watchdog_ctlblk {
20 unsigned int cccr_msr;
21 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
22 unsigned int evntsel_msr; /* the MSR to select the events to handle */
23};
24
25/* Interface defining a CPU specific perfctr watchdog */
26struct wd_ops {
27 int (*reserve)(void);
28 void (*unreserve)(void);
29 int (*setup)(unsigned nmi_hz);
30 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
31 void (*stop)(void);
32 unsigned perfctr;
33 unsigned evntsel;
34 u64 checkbit;
35};
36
37static struct wd_ops *wd_ops;
38
39/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
40 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
41 */
42#define NMI_MAX_COUNTER_BITS 66
43
44/* perfctr_nmi_owner tracks the ownership of the perfctr registers;
45 * evntsel_nmi_owner tracks the ownership of the event selection registers.
46 * Different performance counters / event selections may be reserved by
47 * different subsystems; this reservation system just tries to coordinate
48 * things a little.
49 */
50static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
51static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
52
53static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
54
55/* converts an msr to an appropriate reservation bit */
56static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
57{
58 /* returns the bit offset of the performance counter register */
59 switch (boot_cpu_data.x86_vendor) {
60 case X86_VENDOR_AMD:
61 return (msr - MSR_K7_PERFCTR0);
62 case X86_VENDOR_INTEL:
63 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
64 return (msr - MSR_ARCH_PERFMON_PERFCTR0);
65
66 switch (boot_cpu_data.x86) {
67 case 6:
68 return (msr - MSR_P6_PERFCTR0);
69 case 15:
70 return (msr - MSR_P4_BPU_PERFCTR0);
71 }
72 }
73 return 0;
74}
75
76/* converts an msr to an appropriate reservation bit */
77/* returns the bit offset of the event selection register */
78static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
79{
80 /* returns the bit offset of the event selection register */
81 switch (boot_cpu_data.x86_vendor) {
82 case X86_VENDOR_AMD:
83 return (msr - MSR_K7_EVNTSEL0);
84 case X86_VENDOR_INTEL:
85 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
86 return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
87
88 switch (boot_cpu_data.x86) {
89 case 6:
90 return (msr - MSR_P6_EVNTSEL0);
91 case 15:
92 return (msr - MSR_P4_BSU_ESCR0);
93 }
94 }
95 return 0;
96
97}
98
 99/* checks whether a counter bit is available (hack for oprofile) */
100int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
101{
102 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
103
104 return (!test_bit(counter, perfctr_nmi_owner));
105}
106
 107/* checks an msr for availability */
108int avail_to_resrv_perfctr_nmi(unsigned int msr)
109{
110 unsigned int counter;
111
112 counter = nmi_perfctr_msr_to_bit(msr);
113 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
114
115 return (!test_bit(counter, perfctr_nmi_owner));
116}
117
118int reserve_perfctr_nmi(unsigned int msr)
119{
120 unsigned int counter;
121
122 counter = nmi_perfctr_msr_to_bit(msr);
123 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
124
125 if (!test_and_set_bit(counter, perfctr_nmi_owner))
126 return 1;
127 return 0;
128}
129
130void release_perfctr_nmi(unsigned int msr)
131{
132 unsigned int counter;
133
134 counter = nmi_perfctr_msr_to_bit(msr);
135 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
136
137 clear_bit(counter, perfctr_nmi_owner);
138}
139
140int reserve_evntsel_nmi(unsigned int msr)
141{
142 unsigned int counter;
143
144 counter = nmi_evntsel_msr_to_bit(msr);
145 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
146
147 if (!test_and_set_bit(counter, evntsel_nmi_owner))
148 return 1;
149 return 0;
150}
151
152void release_evntsel_nmi(unsigned int msr)
153{
154 unsigned int counter;
155
156 counter = nmi_evntsel_msr_to_bit(msr);
157 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
158
159 clear_bit(counter, evntsel_nmi_owner);
160}
161
162EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
163EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
164EXPORT_SYMBOL(reserve_perfctr_nmi);
165EXPORT_SYMBOL(release_perfctr_nmi);
166EXPORT_SYMBOL(reserve_evntsel_nmi);
167EXPORT_SYMBOL(release_evntsel_nmi);
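The exports above are the whole coordination contract: whichever subsystem first wins the test_and_set_bit() owns that counter/event-select bit until it releases it. A minimal sketch (illustrative only, not part of this file) of how another perfctr user such as a profiler might claim and release a K7 counter pair through these helpers:

static int example_claim_k7_counter(void)
{
	/* reserve_*() return 1 on success, 0 if someone else owns the bit */
	if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0))
		return -EBUSY;
	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) {
		release_perfctr_nmi(MSR_K7_PERFCTR0);
		return -EBUSY;
	}
	/* ... program MSR_K7_EVNTSEL0 / MSR_K7_PERFCTR0 as needed ... */
	return 0;
}

static void example_release_k7_counter(void)
{
	release_evntsel_nmi(MSR_K7_EVNTSEL0);
	release_perfctr_nmi(MSR_K7_PERFCTR0);
}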
168
169void disable_lapic_nmi_watchdog(void)
170{
171 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
172
173 if (atomic_read(&nmi_active) <= 0)
174 return;
175
176 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
177 wd_ops->unreserve();
178
179 BUG_ON(atomic_read(&nmi_active) != 0);
180}
181
182void enable_lapic_nmi_watchdog(void)
183{
184 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
185
186 /* are we already enabled */
187 if (atomic_read(&nmi_active) != 0)
188 return;
189
190 /* are we lapic aware */
191 if (!wd_ops)
192 return;
193 if (!wd_ops->reserve()) {
194 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
195 return;
196 }
197
198 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
199 touch_nmi_watchdog();
200}
201
202/*
203 * Activate the NMI watchdog via the local APIC.
204 */
205
206static unsigned int adjust_for_32bit_ctr(unsigned int hz)
207{
208 u64 counter_val;
209 unsigned int retval = hz;
210
211 /*
212 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
213 * are writable, with higher bits sign extending from bit 31.
 214	 * So we can only program the counter with values that fit in 31 bits;
 215	 * bit 31 must be set so the sign extension keeps bits 32 and up at 1.
 216	 * Find the smallest nmi_hz for which the programmed count still fits.
217 */
218 counter_val = (u64)cpu_khz * 1000;
219 do_div(counter_val, retval);
220 if (counter_val > 0x7fffffffULL) {
221 u64 count = (u64)cpu_khz * 1000;
222 do_div(count, 0x7fffffffUL);
223 retval = count + 1;
224 }
225 return retval;
226}
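Worked example (hypothetical 3 GHz part, cpu_khz = 3,000,000, requested nmi_hz = 1): counter_val = 3,000,000 * 1000 / 1 = 3,000,000,000, which exceeds 0x7fffffff (2,147,483,647), so retval becomes 3,000,000,000 / 2,147,483,647 + 1 = 2. On such a CPU the watchdog therefore has to fire at least twice per second, because a full second's worth of cycles does not fit in 31 bits.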
227
228static void
229write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz)
230{
231 u64 count = (u64)cpu_khz * 1000;
232
233 do_div(count, nmi_hz);
234 if(descr)
235 Dprintk("setting %s to -0x%08Lx\n", descr, count);
236 wrmsrl(perfctr_msr, 0 - count);
237}
238
239static void write_watchdog_counter32(unsigned int perfctr_msr,
240 const char *descr, unsigned nmi_hz)
241{
242 u64 count = (u64)cpu_khz * 1000;
243
244 do_div(count, nmi_hz);
245 if(descr)
246 Dprintk("setting %s to -0x%08Lx\n", descr, count);
247 wrmsr(perfctr_msr, (u32)(-count), 0);
248}
249
250/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface
251 nicely stable so there is not much variety */
252
253#define K7_EVNTSEL_ENABLE (1 << 22)
254#define K7_EVNTSEL_INT (1 << 20)
255#define K7_EVNTSEL_OS (1 << 17)
256#define K7_EVNTSEL_USR (1 << 16)
257#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
258#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
259
260static int setup_k7_watchdog(unsigned nmi_hz)
261{
262 unsigned int perfctr_msr, evntsel_msr;
263 unsigned int evntsel;
264 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
265
266 perfctr_msr = wd_ops->perfctr;
267 evntsel_msr = wd_ops->evntsel;
268
269 wrmsrl(perfctr_msr, 0UL);
270
271 evntsel = K7_EVNTSEL_INT
272 | K7_EVNTSEL_OS
273 | K7_EVNTSEL_USR
274 | K7_NMI_EVENT;
275
276 /* setup the timer */
277 wrmsr(evntsel_msr, evntsel, 0);
278 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
279 apic_write(APIC_LVTPC, APIC_DM_NMI);
280 evntsel |= K7_EVNTSEL_ENABLE;
281 wrmsr(evntsel_msr, evntsel, 0);
282
283 wd->perfctr_msr = perfctr_msr;
284 wd->evntsel_msr = evntsel_msr;
285 wd->cccr_msr = 0; //unused
286 return 1;
287}
288
289static void single_msr_stop_watchdog(void)
290{
291 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
292
293 wrmsr(wd->evntsel_msr, 0, 0);
294}
295
296static int single_msr_reserve(void)
297{
298 if (!reserve_perfctr_nmi(wd_ops->perfctr))
299 return 0;
300
301 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
302 release_perfctr_nmi(wd_ops->perfctr);
303 return 0;
304 }
305 return 1;
306}
307
308static void single_msr_unreserve(void)
309{
310 release_evntsel_nmi(wd_ops->evntsel);
311 release_perfctr_nmi(wd_ops->perfctr);
312}
313
314static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
315{
316 /* start the cycle over again */
317 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
318}
319
320static struct wd_ops k7_wd_ops = {
321 .reserve = single_msr_reserve,
322 .unreserve = single_msr_unreserve,
323 .setup = setup_k7_watchdog,
324 .rearm = single_msr_rearm,
325 .stop = single_msr_stop_watchdog,
326 .perfctr = MSR_K7_PERFCTR0,
327 .evntsel = MSR_K7_EVNTSEL0,
328 .checkbit = 1ULL<<47,
329};
330
331/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */
332
333#define P6_EVNTSEL0_ENABLE (1 << 22)
334#define P6_EVNTSEL_INT (1 << 20)
335#define P6_EVNTSEL_OS (1 << 17)
336#define P6_EVNTSEL_USR (1 << 16)
337#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
338#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
339
340static int setup_p6_watchdog(unsigned nmi_hz)
341{
342 unsigned int perfctr_msr, evntsel_msr;
343 unsigned int evntsel;
344 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
345
346 perfctr_msr = wd_ops->perfctr;
347 evntsel_msr = wd_ops->evntsel;
348
349 /* KVM doesn't implement this MSR */
350 if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
351 return 0;
352
353 evntsel = P6_EVNTSEL_INT
354 | P6_EVNTSEL_OS
355 | P6_EVNTSEL_USR
356 | P6_NMI_EVENT;
357
358 /* setup the timer */
359 wrmsr(evntsel_msr, evntsel, 0);
360 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
361 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
362 apic_write(APIC_LVTPC, APIC_DM_NMI);
363 evntsel |= P6_EVNTSEL0_ENABLE;
364 wrmsr(evntsel_msr, evntsel, 0);
365
366 wd->perfctr_msr = perfctr_msr;
367 wd->evntsel_msr = evntsel_msr;
368 wd->cccr_msr = 0; //unused
369 return 1;
370}
371
372static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
373{
 374	/* P6-based Pentium M needs to re-unmask
 375	 * the apic vector, but doing so doesn't hurt
 376	 * other P6 variants.
 377	 * ArchPerfmon/Core Duo also needs this */
378 apic_write(APIC_LVTPC, APIC_DM_NMI);
379 /* P6/ARCH_PERFMON has 32 bit counter write */
380 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
381}
382
383static struct wd_ops p6_wd_ops = {
384 .reserve = single_msr_reserve,
385 .unreserve = single_msr_unreserve,
386 .setup = setup_p6_watchdog,
387 .rearm = p6_rearm,
388 .stop = single_msr_stop_watchdog,
389 .perfctr = MSR_P6_PERFCTR0,
390 .evntsel = MSR_P6_EVNTSEL0,
391 .checkbit = 1ULL<<39,
392};
393
394/* Intel P4 performance counters. By far the most complicated of all. */
395
396#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
397#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
398#define P4_ESCR_OS (1<<3)
399#define P4_ESCR_USR (1<<2)
400#define P4_CCCR_OVF_PMI0 (1<<26)
401#define P4_CCCR_OVF_PMI1 (1<<27)
402#define P4_CCCR_THRESHOLD(N) ((N)<<20)
403#define P4_CCCR_COMPLEMENT (1<<19)
404#define P4_CCCR_COMPARE (1<<18)
405#define P4_CCCR_REQUIRED (3<<16)
406#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
407#define P4_CCCR_ENABLE (1<<12)
408#define P4_CCCR_OVF (1<<31)
409
410/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
411 CRU_ESCR0 (with any non-null event selector) through a complemented
412 max threshold. [IA32-Vol3, Section 14.9.9] */
413
414static int setup_p4_watchdog(unsigned nmi_hz)
415{
416 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
417 unsigned int evntsel, cccr_val;
418 unsigned int misc_enable, dummy;
419 unsigned int ht_num;
420 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
421
422 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
423 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
424 return 0;
425
426#ifdef CONFIG_SMP
427 /* detect which hyperthread we are on */
428 if (smp_num_siblings == 2) {
429 unsigned int ebx, apicid;
430
431 ebx = cpuid_ebx(1);
432 apicid = (ebx >> 24) & 0xff;
433 ht_num = apicid & 1;
434 } else
435#endif
436 ht_num = 0;
437
438 /* performance counters are shared resources
439 * assign each hyperthread its own set
440 * (re-use the ESCR0 register, seems safe
441 * and keeps the cccr_val the same)
442 */
443 if (!ht_num) {
444 /* logical cpu 0 */
445 perfctr_msr = MSR_P4_IQ_PERFCTR0;
446 evntsel_msr = MSR_P4_CRU_ESCR0;
447 cccr_msr = MSR_P4_IQ_CCCR0;
448 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
449 } else {
450 /* logical cpu 1 */
451 perfctr_msr = MSR_P4_IQ_PERFCTR1;
452 evntsel_msr = MSR_P4_CRU_ESCR0;
453 cccr_msr = MSR_P4_IQ_CCCR1;
454 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
455 }
456
457 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
458 | P4_ESCR_OS
459 | P4_ESCR_USR;
460
461 cccr_val |= P4_CCCR_THRESHOLD(15)
462 | P4_CCCR_COMPLEMENT
463 | P4_CCCR_COMPARE
464 | P4_CCCR_REQUIRED;
465
466 wrmsr(evntsel_msr, evntsel, 0);
467 wrmsr(cccr_msr, cccr_val, 0);
468 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
469 apic_write(APIC_LVTPC, APIC_DM_NMI);
470 cccr_val |= P4_CCCR_ENABLE;
471 wrmsr(cccr_msr, cccr_val, 0);
472 wd->perfctr_msr = perfctr_msr;
473 wd->evntsel_msr = evntsel_msr;
474 wd->cccr_msr = cccr_msr;
475 return 1;
476}
477
478static void stop_p4_watchdog(void)
479{
480 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
481 wrmsr(wd->cccr_msr, 0, 0);
482 wrmsr(wd->evntsel_msr, 0, 0);
483}
484
485static int p4_reserve(void)
486{
487 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
488 return 0;
489#ifdef CONFIG_SMP
490 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
491 goto fail1;
492#endif
493 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
494 goto fail2;
495 /* RED-PEN why is ESCR1 not reserved here? */
496 return 1;
497 fail2:
498#ifdef CONFIG_SMP
499 if (smp_num_siblings > 1)
500 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
501 fail1:
502#endif
503 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
504 return 0;
505}
506
507static void p4_unreserve(void)
508{
509#ifdef CONFIG_SMP
510 if (smp_num_siblings > 1)
511 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
512#endif
513 release_evntsel_nmi(MSR_P4_CRU_ESCR0);
514 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
515}
516
517static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
518{
519 unsigned dummy;
520 /*
521 * P4 quirks:
 522	 * - An overflowed perfctr will assert its interrupt
523 * until the OVF flag in its CCCR is cleared.
524 * - LVTPC is masked on interrupt and must be
525 * unmasked by the LVTPC handler.
526 */
527 rdmsrl(wd->cccr_msr, dummy);
528 dummy &= ~P4_CCCR_OVF;
529 wrmsrl(wd->cccr_msr, dummy);
530 apic_write(APIC_LVTPC, APIC_DM_NMI);
531 /* start the cycle over again */
532 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
533}
534
535static struct wd_ops p4_wd_ops = {
536 .reserve = p4_reserve,
537 .unreserve = p4_unreserve,
538 .setup = setup_p4_watchdog,
539 .rearm = p4_rearm,
540 .stop = stop_p4_watchdog,
541 /* RED-PEN this is wrong for the other sibling */
542 .perfctr = MSR_P4_BPU_PERFCTR0,
543 .evntsel = MSR_P4_BSU_ESCR0,
544 .checkbit = 1ULL<<39,
545};
546
547/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully
548 all future Intel CPUs. */
549
550#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
551#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
552
553static int setup_intel_arch_watchdog(unsigned nmi_hz)
554{
555 unsigned int ebx;
556 union cpuid10_eax eax;
557 unsigned int unused;
558 unsigned int perfctr_msr, evntsel_msr;
559 unsigned int evntsel;
560 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
561
562 /*
563 * Check whether the Architectural PerfMon supports
564 * Unhalted Core Cycles Event or not.
565 * NOTE: Corresponding bit = 0 in ebx indicates event present.
566 */
567 cpuid(10, &(eax.full), &ebx, &unused, &unused);
568 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
569 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
570 return 0;
571
572 perfctr_msr = wd_ops->perfctr;
573 evntsel_msr = wd_ops->evntsel;
574
575 wrmsrl(perfctr_msr, 0UL);
576
577 evntsel = ARCH_PERFMON_EVENTSEL_INT
578 | ARCH_PERFMON_EVENTSEL_OS
579 | ARCH_PERFMON_EVENTSEL_USR
580 | ARCH_PERFMON_NMI_EVENT_SEL
581 | ARCH_PERFMON_NMI_EVENT_UMASK;
582
583 /* setup the timer */
584 wrmsr(evntsel_msr, evntsel, 0);
585 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
586 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
587 apic_write(APIC_LVTPC, APIC_DM_NMI);
588 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
589 wrmsr(evntsel_msr, evntsel, 0);
590
591 wd->perfctr_msr = perfctr_msr;
592 wd->evntsel_msr = evntsel_msr;
593 wd->cccr_msr = 0; //unused
594 wd_ops->checkbit = 1ULL << (eax.split.bit_width - 1);
595 return 1;
596}
597
598static struct wd_ops intel_arch_wd_ops = {
599 .reserve = single_msr_reserve,
600 .unreserve = single_msr_unreserve,
601 .setup = setup_intel_arch_watchdog,
602 .rearm = p6_rearm,
603 .stop = single_msr_stop_watchdog,
604 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
605 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
606};
607
608static struct wd_ops coreduo_wd_ops = {
609 .reserve = single_msr_reserve,
610 .unreserve = single_msr_unreserve,
611 .setup = setup_intel_arch_watchdog,
612 .rearm = p6_rearm,
613 .stop = single_msr_stop_watchdog,
614 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
615 .evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
616};
617
618static void probe_nmi_watchdog(void)
619{
620 switch (boot_cpu_data.x86_vendor) {
621 case X86_VENDOR_AMD:
622 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
623 boot_cpu_data.x86 != 16)
624 return;
625 wd_ops = &k7_wd_ops;
626 break;
627 case X86_VENDOR_INTEL:
 628		/* Work around Core Duo (Yonah) erratum AE49 where perfctr1
629 doesn't have a working enable bit. */
630 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
631 wd_ops = &coreduo_wd_ops;
632 break;
633 }
634 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
635 wd_ops = &intel_arch_wd_ops;
636 break;
637 }
638 switch (boot_cpu_data.x86) {
639 case 6:
640 if (boot_cpu_data.x86_model > 0xd)
641 return;
642
643 wd_ops = &p6_wd_ops;
644 break;
645 case 15:
646 if (boot_cpu_data.x86_model > 0x4)
647 return;
648
649 wd_ops = &p4_wd_ops;
650 break;
651 default:
652 return;
653 }
654 break;
655 }
656}
657
658/* Interface to nmi.c */
659
660int lapic_watchdog_init(unsigned nmi_hz)
661{
662 if (!wd_ops) {
663 probe_nmi_watchdog();
664 if (!wd_ops)
665 return -1;
666
667 if (!wd_ops->reserve()) {
668 printk(KERN_ERR
669 "NMI watchdog: cannot reserve perfctrs\n");
670 return -1;
671 }
672 }
673
674 if (!(wd_ops->setup(nmi_hz))) {
675 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
676 raw_smp_processor_id());
677 return -1;
678 }
679
680 return 0;
681}
682
683void lapic_watchdog_stop(void)
684{
685 if (wd_ops)
686 wd_ops->stop();
687}
688
689unsigned lapic_adjust_nmi_hz(unsigned hz)
690{
691 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
692 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
693 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
694 hz = adjust_for_32bit_ctr(hz);
695 return hz;
696}
697
698int lapic_wd_event(unsigned nmi_hz)
699{
700 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
701 u64 ctr;
702 rdmsrl(wd->perfctr_msr, ctr);
703 if (ctr & wd_ops->checkbit) { /* perfctr still running? */
704 return 0;
705 }
706 wd_ops->rearm(wd, nmi_hz);
707 return 1;
708}
709
710int lapic_watchdog_ok(void)
711{
712 return wd_ops != NULL;
713}
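A rough sketch (illustrative only; the real nmi.c side also handles per-cpu enable flags, unknown-NMI accounting and so on) of how this interface is meant to be driven:

/* boot / CPU-online path */
void example_watchdog_setup(unsigned nmi_hz)
{
	if (lapic_watchdog_init(nmi_hz) < 0)
		return;				/* no usable perfctr watchdog here */
	nmi_hz = lapic_adjust_nmi_hz(nmi_hz);	/* clamp for 32-bit counters */
	/* ... remember nmi_hz for the NMI handler below ... */
}

/* NMI handler path */
int example_watchdog_tick(unsigned nmi_hz)
{
	if (lapic_wd_event(nmi_hz))
		return 1;	/* our counter rolled over and was re-armed */
	return 0;		/* still counting, or not our perfctr at all */
}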
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
new file mode 100644
index 000000000000..1e31b6caffb1
--- /dev/null
+++ b/arch/x86/kernel/cpu/proc.c
@@ -0,0 +1,192 @@
1#include <linux/smp.h>
2#include <linux/timex.h>
3#include <linux/string.h>
4#include <asm/semaphore.h>
5#include <linux/seq_file.h>
6#include <linux/cpufreq.h>
7
8/*
9 * Get CPU information for use by the procfs.
10 */
11static int show_cpuinfo(struct seq_file *m, void *v)
12{
13 /*
14 * These flag bits must match the definitions in <asm/cpufeature.h>.
15 * NULL means this bit is undefined or reserved; either way it doesn't
16 * have meaning as far as Linux is concerned. Note that it's important
17 * to realize there is a difference between this table and CPUID -- if
18 * applications want to get the raw CPUID data, they should access
19 * /dev/cpu/<cpu_nr>/cpuid instead.
20 */
21 static const char * const x86_cap_flags[] = {
22 /* Intel-defined */
23 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
24 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
25 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
26 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
27
28 /* AMD-defined */
29 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
30 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
31 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
32 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
33 "3dnowext", "3dnow",
34
35 /* Transmeta-defined */
36 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
37 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
39 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
40
41 /* Other (Linux-defined) */
42 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
43 NULL, NULL, NULL, NULL,
44 "constant_tsc", "up", NULL, "arch_perfmon",
45 "pebs", "bts", NULL, "sync_rdtsc",
46 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
47 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
48
49 /* Intel-defined (#2) */
50 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
51 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
52 NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
53 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
54
55 /* VIA/Cyrix/Centaur-defined */
56 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
57 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
58 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
59 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
60
61 /* AMD-defined (#2) */
62 "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
63 "altmovcr8", "abm", "sse4a",
64 "misalignsse", "3dnowprefetch",
65 "osvw", "ibs", NULL, NULL, NULL, NULL,
66 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
67 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68
69 /* Auxiliary (Linux-defined) */
70 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
71 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
72 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
73 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
74 };
75 static const char * const x86_power_flags[] = {
76 "ts", /* temperature sensor */
77 "fid", /* frequency id control */
78 "vid", /* voltage id control */
79 "ttp", /* thermal trip */
80 "tm",
81 "stc",
82 "100mhzsteps",
83 "hwpstate",
84 "", /* constant_tsc - moved to flags */
85 /* nothing */
86 };
87 struct cpuinfo_x86 *c = v;
88 int i, n = c - cpu_data;
89 int fpu_exception;
90
91#ifdef CONFIG_SMP
92 if (!cpu_online(n))
93 return 0;
94#endif
95 seq_printf(m, "processor\t: %d\n"
96 "vendor_id\t: %s\n"
97 "cpu family\t: %d\n"
98 "model\t\t: %d\n"
99 "model name\t: %s\n",
100 n,
101 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
102 c->x86,
103 c->x86_model,
104 c->x86_model_id[0] ? c->x86_model_id : "unknown");
105
106 if (c->x86_mask || c->cpuid_level >= 0)
107 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
108 else
109 seq_printf(m, "stepping\t: unknown\n");
110
111 if ( cpu_has(c, X86_FEATURE_TSC) ) {
112 unsigned int freq = cpufreq_quick_get(n);
113 if (!freq)
114 freq = cpu_khz;
115 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
116 freq / 1000, (freq % 1000));
117 }
118
119 /* Cache size */
120 if (c->x86_cache_size >= 0)
121 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
122#ifdef CONFIG_X86_HT
123 if (c->x86_max_cores * smp_num_siblings > 1) {
124 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
125 seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[n]));
126 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
127 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
128 }
129#endif
130
131 /* We use exception 16 if we have hardware math and we've either seen it or the CPU claims it is internal */
132 fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
133 seq_printf(m, "fdiv_bug\t: %s\n"
134 "hlt_bug\t\t: %s\n"
135 "f00f_bug\t: %s\n"
136 "coma_bug\t: %s\n"
137 "fpu\t\t: %s\n"
138 "fpu_exception\t: %s\n"
139 "cpuid level\t: %d\n"
140 "wp\t\t: %s\n"
141 "flags\t\t:",
142 c->fdiv_bug ? "yes" : "no",
143 c->hlt_works_ok ? "no" : "yes",
144 c->f00f_bug ? "yes" : "no",
145 c->coma_bug ? "yes" : "no",
146 c->hard_math ? "yes" : "no",
147 fpu_exception ? "yes" : "no",
148 c->cpuid_level,
149 c->wp_works_ok ? "yes" : "no");
150
151 for ( i = 0 ; i < 32*NCAPINTS ; i++ )
152 if ( test_bit(i, c->x86_capability) &&
153 x86_cap_flags[i] != NULL )
154 seq_printf(m, " %s", x86_cap_flags[i]);
155
156 for (i = 0; i < 32; i++)
157 if (c->x86_power & (1 << i)) {
158 if (i < ARRAY_SIZE(x86_power_flags) &&
159 x86_power_flags[i])
160 seq_printf(m, "%s%s",
161 x86_power_flags[i][0]?" ":"",
162 x86_power_flags[i]);
163 else
164 seq_printf(m, " [%d]", i);
165 }
166
167 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
168 c->loops_per_jiffy/(500000/HZ),
169 (c->loops_per_jiffy/(5000/HZ)) % 100);
170 seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
171
172 return 0;
173}
174
175static void *c_start(struct seq_file *m, loff_t *pos)
176{
177 return *pos < NR_CPUS ? cpu_data + *pos : NULL;
178}
179static void *c_next(struct seq_file *m, void *v, loff_t *pos)
180{
181 ++*pos;
182 return c_start(m, pos);
183}
184static void c_stop(struct seq_file *m, void *v)
185{
186}
187struct seq_operations cpuinfo_op = {
188 .start = c_start,
189 .next = c_next,
190 .stop = c_stop,
191 .show = show_cpuinfo,
192};
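With the seq_file iterator wired up this way, each online CPU gets one block from show_cpuinfo(); on a hypothetical uniprocessor part the rendered /proc/cpuinfo looks roughly like this (values invented for illustration):

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 13
model name	: (model string reported by the CPU)
stepping	: 8
cpu MHz		: 1598.000
cache size	: 2048 KB
fdiv_bug	: no
...
flags		: fpu vme de pse tsc msr ...
bogomips	: 3194.88
clflush size	: 64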
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
new file mode 100644
index 000000000000..200fb3f9ebfb
--- /dev/null
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -0,0 +1,116 @@
1#include <linux/kernel.h>
2#include <linux/mm.h>
3#include <linux/init.h>
4#include <asm/processor.h>
5#include <asm/msr.h>
6#include "cpu.h"
7
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{
10 unsigned int cap_mask, uk, max, dummy;
11 unsigned int cms_rev1, cms_rev2;
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65];
14
15 get_model_name(c); /* Same as AMD/Cyrix */
16 display_cacheinfo(c);
17
18 /* Print CMS and CPU revision */
19 max = cpuid_eax(0x80860000);
20 cpu_rev = 0;
21 if ( max >= 0x80860001 ) {
22 cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
23 if (cpu_rev != 0x02000000) {
24 printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
25 (cpu_rev >> 24) & 0xff,
26 (cpu_rev >> 16) & 0xff,
27 (cpu_rev >> 8) & 0xff,
28 cpu_rev & 0xff,
29 cpu_freq);
30 }
31 }
32 if ( max >= 0x80860002 ) {
33 cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
34 if (cpu_rev == 0x02000000) {
35 printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
36 new_cpu_rev, cpu_freq);
37 }
38 printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
39 (cms_rev1 >> 24) & 0xff,
40 (cms_rev1 >> 16) & 0xff,
41 (cms_rev1 >> 8) & 0xff,
42 cms_rev1 & 0xff,
43 cms_rev2);
44 }
45 if ( max >= 0x80860006 ) {
46 cpuid(0x80860003,
47 (void *)&cpu_info[0],
48 (void *)&cpu_info[4],
49 (void *)&cpu_info[8],
50 (void *)&cpu_info[12]);
51 cpuid(0x80860004,
52 (void *)&cpu_info[16],
53 (void *)&cpu_info[20],
54 (void *)&cpu_info[24],
55 (void *)&cpu_info[28]);
56 cpuid(0x80860005,
57 (void *)&cpu_info[32],
58 (void *)&cpu_info[36],
59 (void *)&cpu_info[40],
60 (void *)&cpu_info[44]);
61 cpuid(0x80860006,
62 (void *)&cpu_info[48],
63 (void *)&cpu_info[52],
64 (void *)&cpu_info[56],
65 (void *)&cpu_info[60]);
66 cpu_info[64] = '\0';
67 printk(KERN_INFO "CPU: %s\n", cpu_info);
68 }
69
70 /* Unhide possibly hidden capability flags */
71 rdmsr(0x80860004, cap_mask, uk);
72 wrmsr(0x80860004, ~0, uk);
73 c->x86_capability[0] = cpuid_edx(0x00000001);
74 wrmsr(0x80860004, cap_mask, uk);
75
76 /* All Transmeta CPUs have a constant TSC */
77 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
78
79 /* If we can run i686 user-space code, call us an i686 */
80#define USER686 ((1 << X86_FEATURE_TSC)|\
81 (1 << X86_FEATURE_CX8)|\
82 (1 << X86_FEATURE_CMOV))
83 if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686)
84 c->x86 = 6;
85
86#ifdef CONFIG_SYSCTL
87 /* randomize_va_space slows us down enormously;
88 it probably triggers retranslation of x86->native bytecode */
89 randomize_va_space = 0;
90#endif
91}
92
93static void __cpuinit transmeta_identify(struct cpuinfo_x86 * c)
94{
95 u32 xlvl;
96
97 /* Transmeta-defined flags: level 0x80860001 */
98 xlvl = cpuid_eax(0x80860000);
99 if ( (xlvl & 0xffff0000) == 0x80860000 ) {
100 if ( xlvl >= 0x80860001 )
101 c->x86_capability[2] = cpuid_edx(0x80860001);
102 }
103}
104
105static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
106 .c_vendor = "Transmeta",
107 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
108 .c_init = init_transmeta,
109 .c_identify = transmeta_identify,
110};
111
112int __init transmeta_init_cpu(void)
113{
114 cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev;
115 return 0;
116}
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
new file mode 100644
index 000000000000..a7a4e75bdcd7
--- /dev/null
+++ b/arch/x86/kernel/cpu/umc.c
@@ -0,0 +1,26 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <asm/processor.h>
4#include "cpu.h"
5
 6/* UMC chips appear to be only 386 or 486, so no special init takes place.
7 */
8
9static struct cpu_dev umc_cpu_dev __cpuinitdata = {
10 .c_vendor = "UMC",
11 .c_ident = { "UMC UMC UMC" },
12 .c_models = {
13 { .vendor = X86_VENDOR_UMC, .family = 4, .model_names =
14 {
15 [1] = "U5D",
16 [2] = "U5S",
17 }
18 },
19 },
20};
21
22int __init umc_init_cpu(void)
23{
24 cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev;
25 return 0;
26}
diff --git a/arch/x86/kernel/cpufreq/Kconfig b/arch/x86/kernel/cpufreq/Kconfig
new file mode 100644
index 000000000000..a3fd51926cbd
--- /dev/null
+++ b/arch/x86/kernel/cpufreq/Kconfig
@@ -0,0 +1,108 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_POWERNOW_K8
14 tristate "AMD Opteron/Athlon64 PowerNow!"
15 select CPU_FREQ_TABLE
16 help
17 This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors.
18
19 To compile this driver as a module, choose M here: the
20 module will be called powernow-k8.
21
22 For details, take a look at <file:Documentation/cpu-freq/>.
23
24 If in doubt, say N.
25
26config X86_POWERNOW_K8_ACPI
27 bool
28 depends on X86_POWERNOW_K8 && ACPI_PROCESSOR
29 depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
30 default y
31
32config X86_SPEEDSTEP_CENTRINO
33 tristate "Intel Enhanced SpeedStep (deprecated)"
34 select CPU_FREQ_TABLE
35 depends on ACPI_PROCESSOR
36 help
37 This is deprecated and this functionality is now merged into
38 acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
39 speedstep_centrino.
40 This adds the CPUFreq driver for Enhanced SpeedStep enabled
41 mobile CPUs. This means Intel Pentium M (Centrino) CPUs
42 or 64bit enabled Intel Xeons.
43
44 To compile this driver as a module, choose M here: the
45 module will be called speedstep-centrino.
46
47 For details, take a look at <file:Documentation/cpu-freq/>.
48
49 If in doubt, say N.
50
51config X86_ACPI_CPUFREQ
52 tristate "ACPI Processor P-States driver"
53 select CPU_FREQ_TABLE
54 depends on ACPI_PROCESSOR
55 help
56 This driver adds a CPUFreq driver which utilizes the ACPI
57 Processor Performance States.
58 This driver also supports Intel Enhanced Speedstep.
59
60 To compile this driver as a module, choose M here: the
61 module will be called acpi-cpufreq.
62
63 For details, take a look at <file:Documentation/cpu-freq/>.
64
65 If in doubt, say N.
66
67comment "shared options"
68
69config X86_ACPI_CPUFREQ_PROC_INTF
70 bool "/proc/acpi/processor/../performance interface (deprecated)"
71 depends on PROC_FS
72 depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K8_ACPI
73 help
74 This enables the deprecated /proc/acpi/processor/../performance
75 interface. While it is helpful for debugging, the generic,
76 cross-architecture cpufreq interfaces should be used.
77
78 If in doubt, say N.
79
80config X86_P4_CLOCKMOD
81 tristate "Intel Pentium 4 clock modulation"
82 depends on EMBEDDED
83 select CPU_FREQ_TABLE
84 help
85 This adds the clock modulation driver for Intel Pentium 4 / XEON
86 processors. When enabled it will lower CPU temperature by skipping
87 clocks.
88
 89	  This driver should only be used in exceptional
90 circumstances when very low power is needed because it causes severe
91 slowdowns and noticeable latencies. Normally Speedstep should be used
92 instead.
93
94 To compile this driver as a module, choose M here: the
95 module will be called p4-clockmod.
96
97 For details, take a look at <file:Documentation/cpu-freq/>.
98
99 Unless you are absolutely sure say N.
100
101
102config X86_SPEEDSTEP_LIB
103 tristate
104 default X86_P4_CLOCKMOD
105
106endif
107
108endmenu
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
new file mode 100644
index 000000000000..5c2faa10e9fa
--- /dev/null
+++ b/arch/x86/kernel/cpuid.c
@@ -0,0 +1,242 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * cpuid.c
15 *
16 * x86 CPUID access device
17 *
18 * This device is accessed by lseek() to the appropriate CPUID level
19 * and then read in chunks of 16 bytes. A larger size means multiple
20 * reads of consecutive levels.
21 *
22 * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
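An illustrative user-space sketch (not part of this driver) of the access pattern just described, assuming the /dev/cpu/0/cpuid node already exists:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];	/* eax, ebx, ecx, edx of one CPUID level */
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0)
		return 1;
	/* the file offset selects the CPUID level; each level is 16 bytes */
	if (pread(fd, regs, sizeof(regs), 0) != sizeof(regs))
		return 1;
	printf("max standard CPUID level: %u\n", regs[0]);
	close(fd);
	return 0;
}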
25
26#include <linux/module.h>
27
28#include <linux/types.h>
29#include <linux/errno.h>
30#include <linux/fcntl.h>
31#include <linux/init.h>
32#include <linux/poll.h>
33#include <linux/smp.h>
34#include <linux/major.h>
35#include <linux/fs.h>
36#include <linux/smp_lock.h>
37#include <linux/device.h>
38#include <linux/cpu.h>
39#include <linux/notifier.h>
40
41#include <asm/processor.h>
42#include <asm/msr.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static struct class *cpuid_class;
47
48#ifdef CONFIG_SMP
49
50struct cpuid_command {
51 u32 reg;
52 u32 *data;
53};
54
55static void cpuid_smp_cpuid(void *cmd_block)
56{
57 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
58
59 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
60 &cmd->data[3]);
61}
62
63static inline void do_cpuid(int cpu, u32 reg, u32 * data)
64{
65 struct cpuid_command cmd;
66
67 preempt_disable();
68 if (cpu == smp_processor_id()) {
69 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
70 } else {
71 cmd.reg = reg;
72 cmd.data = data;
73
74 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
75 }
76 preempt_enable();
77}
78#else /* ! CONFIG_SMP */
79
80static inline void do_cpuid(int cpu, u32 reg, u32 * data)
81{
82 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
83}
84
85#endif /* ! CONFIG_SMP */
86
87static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
88{
89 loff_t ret;
90
91 lock_kernel();
92
93 switch (orig) {
94 case 0:
95 file->f_pos = offset;
96 ret = file->f_pos;
97 break;
98 case 1:
99 file->f_pos += offset;
100 ret = file->f_pos;
101 break;
102 default:
103 ret = -EINVAL;
104 }
105
106 unlock_kernel();
107 return ret;
108}
109
110static ssize_t cpuid_read(struct file *file, char __user *buf,
111 size_t count, loff_t * ppos)
112{
113 char __user *tmp = buf;
114 u32 data[4];
115 u32 reg = *ppos;
116 int cpu = iminor(file->f_path.dentry->d_inode);
117
118 if (count % 16)
119 return -EINVAL; /* Invalid chunk size */
120
121 for (; count; count -= 16) {
122 do_cpuid(cpu, reg, data);
123 if (copy_to_user(tmp, &data, 16))
124 return -EFAULT;
125 tmp += 16;
126 *ppos = reg++;
127 }
128
129 return tmp - buf;
130}
131
132static int cpuid_open(struct inode *inode, struct file *file)
133{
134 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
135 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
136
137 if (cpu >= NR_CPUS || !cpu_online(cpu))
138 return -ENXIO; /* No such CPU */
139 if (c->cpuid_level < 0)
140 return -EIO; /* CPUID not supported */
141
142 return 0;
143}
144
145/*
146 * File operations we support
147 */
148static const struct file_operations cpuid_fops = {
149 .owner = THIS_MODULE,
150 .llseek = cpuid_seek,
151 .read = cpuid_read,
152 .open = cpuid_open,
153};
154
155static int cpuid_device_create(int i)
156{
157 int err = 0;
158 struct device *dev;
159
160 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i);
161 if (IS_ERR(dev))
162 err = PTR_ERR(dev);
163 return err;
164}
165
166static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
167{
168 unsigned int cpu = (unsigned long)hcpu;
169
170 switch (action) {
171 case CPU_ONLINE:
172 case CPU_ONLINE_FROZEN:
173 cpuid_device_create(cpu);
174 break;
175 case CPU_DEAD:
176 case CPU_DEAD_FROZEN:
177 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
178 break;
179 }
180 return NOTIFY_OK;
181}
182
183static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
184{
185 .notifier_call = cpuid_class_cpu_callback,
186};
187
188static int __init cpuid_init(void)
189{
190 int i, err = 0;
191 i = 0;
192
193 if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) {
194 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
195 CPUID_MAJOR);
196 err = -EBUSY;
197 goto out;
198 }
199 cpuid_class = class_create(THIS_MODULE, "cpuid");
200 if (IS_ERR(cpuid_class)) {
201 err = PTR_ERR(cpuid_class);
202 goto out_chrdev;
203 }
204 for_each_online_cpu(i) {
205 err = cpuid_device_create(i);
206 if (err != 0)
207 goto out_class;
208 }
209 register_hotcpu_notifier(&cpuid_class_cpu_notifier);
210
211 err = 0;
212 goto out;
213
214out_class:
215 i = 0;
216 for_each_online_cpu(i) {
217 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i));
218 }
219 class_destroy(cpuid_class);
220out_chrdev:
221 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
222out:
223 return err;
224}
225
226static void __exit cpuid_exit(void)
227{
228 int cpu = 0;
229
230 for_each_online_cpu(cpu)
231 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
232 class_destroy(cpuid_class);
233 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
234 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
235}
236
237module_init(cpuid_init);
238module_exit(cpuid_exit);
239
240MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
241MODULE_DESCRIPTION("x86 generic CPUID driver");
242MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/crash_32.c b/arch/x86/kernel/crash_32.c
new file mode 100644
index 000000000000..53589d1b1a05
--- /dev/null
+++ b/arch/x86/kernel/crash_32.c
@@ -0,0 +1,137 @@
1/*
2 * Architecture specific (i386) functions for kexec based crash dumps.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 *
6 * Copyright (C) IBM Corporation, 2004. All rights reserved.
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/smp.h>
14#include <linux/reboot.h>
15#include <linux/kexec.h>
16#include <linux/delay.h>
17#include <linux/elf.h>
18#include <linux/elfcore.h>
19
20#include <asm/processor.h>
21#include <asm/hardirq.h>
22#include <asm/nmi.h>
23#include <asm/hw_irq.h>
24#include <asm/apic.h>
25#include <linux/kdebug.h>
26#include <asm/smp.h>
27
28#include <mach_ipi.h>
29
30
 31/* This keeps track of which cpu is crashing. */
32static int crashing_cpu;
33
34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
35static atomic_t waiting_for_crash_ipi;
36
37static int crash_nmi_callback(struct notifier_block *self,
38 unsigned long val, void *data)
39{
40 struct pt_regs *regs;
41 struct pt_regs fixed_regs;
42 int cpu;
43
44 if (val != DIE_NMI_IPI)
45 return NOTIFY_OK;
46
47 regs = ((struct die_args *)data)->regs;
48 cpu = raw_smp_processor_id();
49
 50	/* Don't do anything if this handler is invoked on the crashing cpu.
 51	 * Otherwise, the system will completely hang. The crashing cpu can get
 52	 * an NMI if the system was initially booted with the nmi_watchdog parameter.
53 */
54 if (cpu == crashing_cpu)
55 return NOTIFY_STOP;
56 local_irq_disable();
57
58 if (!user_mode_vm(regs)) {
59 crash_fixup_ss_esp(&fixed_regs, regs);
60 regs = &fixed_regs;
61 }
62 crash_save_cpu(regs, cpu);
63 disable_local_APIC();
64 atomic_dec(&waiting_for_crash_ipi);
65 /* Assume hlt works */
66 halt();
67 for (;;)
68 cpu_relax();
69
70 return 1;
71}
72
73static void smp_send_nmi_allbutself(void)
74{
75 cpumask_t mask = cpu_online_map;
76 cpu_clear(safe_smp_processor_id(), mask);
77 if (!cpus_empty(mask))
78 send_IPI_mask(mask, NMI_VECTOR);
79}
80
81static struct notifier_block crash_nmi_nb = {
82 .notifier_call = crash_nmi_callback,
83};
84
85static void nmi_shootdown_cpus(void)
86{
87 unsigned long msecs;
88
89 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
90 /* Would it be better to replace the trap vector here? */
91 if (register_die_notifier(&crash_nmi_nb))
92 return; /* return what? */
93 /* Ensure the new callback function is set before sending
94 * out the NMI
95 */
96 wmb();
97
98 smp_send_nmi_allbutself();
99
100 msecs = 1000; /* Wait at most a second for the other cpus to stop */
101 while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
102 mdelay(1);
103 msecs--;
104 }
105
106 /* Leave the nmi callback set */
107 disable_local_APIC();
108}
109#else
110static void nmi_shootdown_cpus(void)
111{
 112	/* There are no cpus to shoot down */
113}
114#endif
115
116void machine_crash_shutdown(struct pt_regs *regs)
117{
118 /* This function is only called after the system
119 * has panicked or is otherwise in a critical state.
120 * The minimum amount of code to allow a kexec'd kernel
121 * to run successfully needs to happen here.
122 *
123 * In practice this means shooting down the other cpus in
124 * an SMP system.
125 */
126 /* The kernel is broken so disable interrupts */
127 local_irq_disable();
128
 129	/* Make a note of the crashing cpu. Will be used in the NMI callback. */
130 crashing_cpu = safe_smp_processor_id();
131 nmi_shootdown_cpus();
132 lapic_shutdown();
133#if defined(CONFIG_X86_IO_APIC)
134 disable_IO_APIC();
135#endif
136 crash_save_cpu(regs, safe_smp_processor_id());
137}
diff --git a/arch/x86/kernel/crash_64.c b/arch/x86/kernel/crash_64.c
new file mode 100644
index 000000000000..13432a1ae904
--- /dev/null
+++ b/arch/x86/kernel/crash_64.c
@@ -0,0 +1,135 @@
1/*
2 * Architecture specific (x86_64) functions for kexec based crash dumps.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 *
6 * Copyright (C) IBM Corporation, 2004. All rights reserved.
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/smp.h>
14#include <linux/irq.h>
15#include <linux/reboot.h>
16#include <linux/kexec.h>
17#include <linux/delay.h>
18#include <linux/elf.h>
19#include <linux/elfcore.h>
20#include <linux/kdebug.h>
21
22#include <asm/processor.h>
23#include <asm/hardirq.h>
24#include <asm/nmi.h>
25#include <asm/hw_irq.h>
26#include <asm/mach_apic.h>
27
 28/* This keeps track of which cpu is crashing. */
29static int crashing_cpu;
30
31#ifdef CONFIG_SMP
32static atomic_t waiting_for_crash_ipi;
33
34static int crash_nmi_callback(struct notifier_block *self,
35 unsigned long val, void *data)
36{
37 struct pt_regs *regs;
38 int cpu;
39
40 if (val != DIE_NMI_IPI)
41 return NOTIFY_OK;
42
43 regs = ((struct die_args *)data)->regs;
44 cpu = raw_smp_processor_id();
45
46 /*
 47	 * Don't do anything if this handler is invoked on the crashing cpu.
 48	 * Otherwise, the system will completely hang. The crashing cpu can get
 49	 * an NMI if the system was initially booted with the nmi_watchdog parameter.
50 */
51 if (cpu == crashing_cpu)
52 return NOTIFY_STOP;
53 local_irq_disable();
54
55 crash_save_cpu(regs, cpu);
56 disable_local_APIC();
57 atomic_dec(&waiting_for_crash_ipi);
58 /* Assume hlt works */
59 for(;;)
60 halt();
61
62 return 1;
63}
64
65static void smp_send_nmi_allbutself(void)
66{
67 send_IPI_allbutself(NMI_VECTOR);
68}
69
70/*
71 * This code is a best effort heuristic to get the
72 * other cpus to stop executing. So races with
73 * cpu hotplug shouldn't matter.
74 */
75
76static struct notifier_block crash_nmi_nb = {
77 .notifier_call = crash_nmi_callback,
78};
79
80static void nmi_shootdown_cpus(void)
81{
82 unsigned long msecs;
83
84 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
85 if (register_die_notifier(&crash_nmi_nb))
86 return; /* return what? */
87
88 /*
89 * Ensure the new callback function is set before sending
90 * out the NMI
91 */
92 wmb();
93
94 smp_send_nmi_allbutself();
95
96 msecs = 1000; /* Wait at most a second for the other cpus to stop */
97 while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
98 mdelay(1);
99 msecs--;
100 }
101 /* Leave the nmi callback set */
102 disable_local_APIC();
103}
104#else
105static void nmi_shootdown_cpus(void)
106{
 107	/* There are no cpus to shoot down */
108}
109#endif
110
111void machine_crash_shutdown(struct pt_regs *regs)
112{
113 /*
114 * This function is only called after the system
115 * has panicked or is otherwise in a critical state.
116 * The minimum amount of code to allow a kexec'd kernel
117 * to run successfully needs to happen here.
118 *
119 * In practice this means shooting down the other cpus in
120 * an SMP system.
121 */
122 /* The kernel is broken so disable interrupts */
123 local_irq_disable();
124
 125	/* Make a note of the crashing cpu. Will be used in the NMI callback. */
126 crashing_cpu = smp_processor_id();
127 nmi_shootdown_cpus();
128
129 if(cpu_has_apic)
130 disable_local_APIC();
131
132 disable_IO_APIC();
133
134 crash_save_cpu(regs, smp_processor_id());
135}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
new file mode 100644
index 000000000000..3f532df488bc
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -0,0 +1,74 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/errno.h>
9#include <linux/highmem.h>
10#include <linux/crash_dump.h>
11
12#include <asm/uaccess.h>
13
14static void *kdump_buf_page;
15
16/**
17 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied
19 * @buf: target memory address for the copy; this can be in kernel address
20 * space or user address space (see @userbuf)
21 * @csize: number of bytes to copy
22 * @offset: offset in bytes into the page (based on pfn) to begin the copy
23 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
24 * otherwise @buf is in kernel address space, use memcpy().
25 *
26 * Copy a page from "oldmem". For this page, there is no pte mapped
27 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
28 *
 29 * Calling copy_to_user() in atomic context is not desirable, so we first
 30 * copy the data to a pre-allocated kernel page and then copy it to user
 31 * space from non-atomic context.
32 */
33ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
34 size_t csize, unsigned long offset, int userbuf)
35{
36 void *vaddr;
37
38 if (!csize)
39 return 0;
40
41 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
42
43 if (!userbuf) {
44 memcpy(buf, (vaddr + offset), csize);
45 kunmap_atomic(vaddr, KM_PTE0);
46 } else {
47 if (!kdump_buf_page) {
48 printk(KERN_WARNING "Kdump: Kdump buffer page not"
49 " allocated\n");
50 return -EFAULT;
51 }
52 copy_page(kdump_buf_page, vaddr);
53 kunmap_atomic(vaddr, KM_PTE0);
54 if (copy_to_user(buf, (kdump_buf_page + offset), csize))
55 return -EFAULT;
56 }
57
58 return csize;
59}
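A sketch (illustrative only; the real caller lives in the generic vmcore code) of the kind of loop a dump reader runs on top of copy_oldmem_page(), assuming paddr is an old-kernel physical address and buf a user-space destination:

static ssize_t example_read_oldmem(char *buf, size_t count, u64 paddr)
{
	ssize_t total = 0;

	while (count) {
		unsigned long pfn = paddr >> PAGE_SHIFT;
		unsigned long offset = paddr & (PAGE_SIZE - 1);
		size_t chunk = min_t(size_t, count, PAGE_SIZE - offset);
		ssize_t ret = copy_oldmem_page(pfn, buf, chunk, offset, 1);

		if (ret < 0)
			return ret;
		buf += ret;
		paddr += ret;
		count -= ret;
		total += ret;
	}
	return total;
}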
60
61static int __init kdump_buf_page_init(void)
62{
63 int ret = 0;
64
65 kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
66 if (!kdump_buf_page) {
67 printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
68 " page\n");
69 ret = -ENOMEM;
70 }
71
72 return ret;
73}
74arch_initcall(kdump_buf_page_init);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
new file mode 100644
index 000000000000..942deac4d43a
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -0,0 +1,47 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/errno.h>
9#include <linux/crash_dump.h>
10
11#include <asm/uaccess.h>
12#include <asm/io.h>
13
14/**
15 * copy_oldmem_page - copy one page from "oldmem"
16 * @pfn: page frame number to be copied
17 * @buf: target memory address for the copy; this can be in kernel address
18 * space or user address space (see @userbuf)
19 * @csize: number of bytes to copy
20 * @offset: offset in bytes into the page (based on pfn) to begin the copy
21 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
22 * otherwise @buf is in kernel address space, use memcpy().
23 *
24 * Copy a page from "oldmem". For this page, there is no pte mapped
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf)
29{
30 void *vaddr;
31
32 if (!csize)
33 return 0;
34
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
36
37 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) {
39 iounmap(vaddr);
40 return -EFAULT;
41 }
42 } else
43 memcpy(buf, (vaddr + offset), csize);
44
45 iounmap(vaddr);
46 return csize;
47}
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
new file mode 100644
index 000000000000..40978af630e7
--- /dev/null
+++ b/arch/x86/kernel/doublefault_32.c
@@ -0,0 +1,70 @@
1#include <linux/mm.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/init_task.h>
5#include <linux/fs.h>
6
7#include <asm/uaccess.h>
8#include <asm/pgtable.h>
9#include <asm/processor.h>
10#include <asm/desc.h>
11
12#define DOUBLEFAULT_STACKSIZE (1024)
13static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
14#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
15
16#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
17
18static void doublefault_fn(void)
19{
20 struct Xgt_desc_struct gdt_desc = {0, 0};
21 unsigned long gdt, tss;
22
23 store_gdt(&gdt_desc);
24 gdt = gdt_desc.address;
25
26 printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
27
28 if (ptr_ok(gdt)) {
29 gdt += GDT_ENTRY_TSS << 3;
30 tss = *(u16 *)(gdt+2);
31 tss += *(u8 *)(gdt+4) << 16;
32 tss += *(u8 *)(gdt+7) << 24;
33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
34
35 if (ptr_ok(tss)) {
36 struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
37
38 printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp);
39
40 printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
41 t->eax, t->ebx, t->ecx, t->edx);
42 printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
43 t->esi, t->edi);
44 }
45 }
46
47 for (;;)
48 cpu_relax();
49}
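For reference, the three partial loads above reassemble the 32-bit TSS base from the GDT descriptor's split base fields: bits 15:0 sit at descriptor offset 2, bits 23:16 at offset 4 and bits 31:24 at offset 7. A hypothetical descriptor carrying bytes 0x34,0x12 at offset 2, 0xc0 at offset 4 and 0xff at offset 7 therefore decodes to a TSS base of 0xffc01234.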
50
51struct tss_struct doublefault_tss __cacheline_aligned = {
52 .x86_tss = {
53 .esp0 = STACK_START,
54 .ss0 = __KERNEL_DS,
55 .ldt = 0,
56 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
57
58 .eip = (unsigned long) doublefault_fn,
59 /* 0x2 bit is always set */
60 .eflags = X86_EFLAGS_SF | 0x2,
61 .esp = STACK_START,
62 .es = __USER_DS,
63 .cs = __KERNEL_CS,
64 .ss = __KERNEL_DS,
65 .ds = __USER_DS,
66 .fs = __KERNEL_PERCPU,
67
68 .__cr3 = __pa(swapper_pg_dir)
69 }
70};
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
new file mode 100644
index 000000000000..3c86b979a40a
--- /dev/null
+++ b/arch/x86/kernel/e820_32.c
@@ -0,0 +1,944 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/ioport.h>
6#include <linux/string.h>
7#include <linux/kexec.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h>
12#include <linux/uaccess.h>
13#include <linux/suspend.h>
14
15#include <asm/pgtable.h>
16#include <asm/page.h>
17#include <asm/e820.h>
18#include <asm/setup.h>
19
20#ifdef CONFIG_EFI
21int efi_enabled = 0;
22EXPORT_SYMBOL(efi_enabled);
23#endif
24
25struct e820map e820;
26struct change_member {
27 struct e820entry *pbios; /* pointer to original bios entry */
28 unsigned long long addr; /* address for this change point */
29};
30static struct change_member change_point_list[2*E820MAX] __initdata;
31static struct change_member *change_point[2*E820MAX] __initdata;
32static struct e820entry *overlap_list[E820MAX] __initdata;
33static struct e820entry new_bios[E820MAX] __initdata;
34/* For PCI or other memory-mapped resources */
35unsigned long pci_mem_start = 0x10000000;
36#ifdef CONFIG_PCI
37EXPORT_SYMBOL(pci_mem_start);
38#endif
39extern int user_defined_memmap;
40struct resource data_resource = {
41 .name = "Kernel data",
42 .start = 0,
43 .end = 0,
44 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
45};
46
47struct resource code_resource = {
48 .name = "Kernel code",
49 .start = 0,
50 .end = 0,
51 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
52};
53
54static struct resource system_rom_resource = {
55 .name = "System ROM",
56 .start = 0xf0000,
57 .end = 0xfffff,
58 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
59};
60
61static struct resource extension_rom_resource = {
62 .name = "Extension ROM",
63 .start = 0xe0000,
64 .end = 0xeffff,
65 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
66};
67
68static struct resource adapter_rom_resources[] = { {
69 .name = "Adapter ROM",
70 .start = 0xc8000,
71 .end = 0,
72 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
73}, {
74 .name = "Adapter ROM",
75 .start = 0,
76 .end = 0,
77 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
78}, {
79 .name = "Adapter ROM",
80 .start = 0,
81 .end = 0,
82 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
83}, {
84 .name = "Adapter ROM",
85 .start = 0,
86 .end = 0,
87 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
88}, {
89 .name = "Adapter ROM",
90 .start = 0,
91 .end = 0,
92 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
93}, {
94 .name = "Adapter ROM",
95 .start = 0,
96 .end = 0,
97 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
98} };
99
100static struct resource video_rom_resource = {
101 .name = "Video ROM",
102 .start = 0xc0000,
103 .end = 0xc7fff,
104 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
105};
106
107static struct resource video_ram_resource = {
108 .name = "Video RAM area",
109 .start = 0xa0000,
110 .end = 0xbffff,
111 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
112};
113
114static struct resource standard_io_resources[] = { {
115 .name = "dma1",
116 .start = 0x0000,
117 .end = 0x001f,
118 .flags = IORESOURCE_BUSY | IORESOURCE_IO
119}, {
120 .name = "pic1",
121 .start = 0x0020,
122 .end = 0x0021,
123 .flags = IORESOURCE_BUSY | IORESOURCE_IO
124}, {
125 .name = "timer0",
126 .start = 0x0040,
127 .end = 0x0043,
128 .flags = IORESOURCE_BUSY | IORESOURCE_IO
129}, {
130 .name = "timer1",
131 .start = 0x0050,
132 .end = 0x0053,
133 .flags = IORESOURCE_BUSY | IORESOURCE_IO
134}, {
135 .name = "keyboard",
136 .start = 0x0060,
137 .end = 0x006f,
138 .flags = IORESOURCE_BUSY | IORESOURCE_IO
139}, {
140 .name = "dma page reg",
141 .start = 0x0080,
142 .end = 0x008f,
143 .flags = IORESOURCE_BUSY | IORESOURCE_IO
144}, {
145 .name = "pic2",
146 .start = 0x00a0,
147 .end = 0x00a1,
148 .flags = IORESOURCE_BUSY | IORESOURCE_IO
149}, {
150 .name = "dma2",
151 .start = 0x00c0,
152 .end = 0x00df,
153 .flags = IORESOURCE_BUSY | IORESOURCE_IO
154}, {
155 .name = "fpu",
156 .start = 0x00f0,
157 .end = 0x00ff,
158 .flags = IORESOURCE_BUSY | IORESOURCE_IO
159} };
160
161#define ROMSIGNATURE 0xaa55
162
163static int __init romsignature(const unsigned char *rom)
164{
165 const unsigned short * const ptr = (const unsigned short *)rom;
166 unsigned short sig;
167
168 return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
169}
170
171static int __init romchecksum(const unsigned char *rom, unsigned long length)
172{
173 unsigned char sum, c;
174
175 for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
176 sum += c;
177 return !length && !sum;
178}
179
180static void __init probe_roms(void)
181{
182 const unsigned char *rom;
183 unsigned long start, length, upper;
184 unsigned char c;
185 int i;
186
187 /* video rom */
188 upper = adapter_rom_resources[0].start;
189 for (start = video_rom_resource.start; start < upper; start += 2048) {
190 rom = isa_bus_to_virt(start);
191 if (!romsignature(rom))
192 continue;
193
194 video_rom_resource.start = start;
195
196 if (probe_kernel_address(rom + 2, c) != 0)
197 continue;
198
199 /* 0 < length <= 0x7f * 512, historically */
200 length = c * 512;
201
202 /* if checksum okay, trust length byte */
203 if (length && romchecksum(rom, length))
204 video_rom_resource.end = start + length - 1;
205
206 request_resource(&iomem_resource, &video_rom_resource);
207 break;
208 }
209
210 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
211 if (start < upper)
212 start = upper;
213
214 /* system rom */
215 request_resource(&iomem_resource, &system_rom_resource);
216 upper = system_rom_resource.start;
217
218 /* check for extension rom (ignore length byte!) */
219 rom = isa_bus_to_virt(extension_rom_resource.start);
220 if (romsignature(rom)) {
221 length = extension_rom_resource.end - extension_rom_resource.start + 1;
222 if (romchecksum(rom, length)) {
223 request_resource(&iomem_resource, &extension_rom_resource);
224 upper = extension_rom_resource.start;
225 }
226 }
227
228 /* check for adapter roms on 2k boundaries */
229 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
230 rom = isa_bus_to_virt(start);
231 if (!romsignature(rom))
232 continue;
233
234 if (probe_kernel_address(rom + 2, c) != 0)
235 continue;
236
237 /* 0 < length <= 0x7f * 512, historically */
238 length = c * 512;
239
240 /* but accept any length that fits if checksum okay */
241 if (!length || start + length > upper || !romchecksum(rom, length))
242 continue;
243
244 adapter_rom_resources[i].start = start;
245 adapter_rom_resources[i].end = start + length - 1;
246 request_resource(&iomem_resource, &adapter_rom_resources[i]);
247
248 start = adapter_rom_resources[i++].end & ~2047UL;
249 }
250}
251
252/*
253 * Request address space for all standard RAM and ROM resources
254 * and also for regions reported as reserved by the e820.
255 */
256static void __init
257legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
258{
259 int i;
260
261 probe_roms();
262 for (i = 0; i < e820.nr_map; i++) {
263 struct resource *res;
264#ifndef CONFIG_RESOURCES_64BIT
265 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
266 continue;
267#endif
268 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
269 switch (e820.map[i].type) {
270 case E820_RAM: res->name = "System RAM"; break;
271 case E820_ACPI: res->name = "ACPI Tables"; break;
272 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
273 default: res->name = "reserved";
274 }
275 res->start = e820.map[i].addr;
276 res->end = res->start + e820.map[i].size - 1;
277 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
278 if (request_resource(&iomem_resource, res)) {
279 kfree(res);
280 continue;
281 }
282 if (e820.map[i].type == E820_RAM) {
283 /*
284 * We don't know which RAM region contains kernel data,
285 * so we try it repeatedly and let the resource manager
286 * test it.
287 */
288 request_resource(res, code_resource);
289 request_resource(res, data_resource);
290#ifdef CONFIG_KEXEC
291 request_resource(res, &crashk_res);
292#endif
293 }
294 }
295}
296
297/*
298 * Request address space for all standard resources
299 *
300 * This is called just before pcibios_init(), which is also a
301 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
302 */
303static int __init request_standard_resources(void)
304{
305 int i;
306
307 printk("Setting up standard PCI resources\n");
308 if (efi_enabled)
309 efi_initialize_iomem_resources(&code_resource, &data_resource);
310 else
311 legacy_init_iomem_resources(&code_resource, &data_resource);
312
313 /* EFI systems may still have VGA */
314 request_resource(&iomem_resource, &video_ram_resource);
315
316 /* request I/O space for devices used on all i[345]86 PCs */
317 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
318 request_resource(&ioport_resource, &standard_io_resources[i]);
319 return 0;
320}
321
322subsys_initcall(request_standard_resources);
323
324#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
325/**
326 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
327 * correspond to e820 RAM areas and mark the corresponding pages as nosave for
328 * hibernation.
329 *
330 * This function requires the e820 map to be sorted and without any
331 * overlapping entries and assumes the first e820 area to be RAM.
332 */
333void __init e820_mark_nosave_regions(void)
334{
335 int i;
336 unsigned long pfn;
337
338 pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
339 for (i = 1; i < e820.nr_map; i++) {
340 struct e820entry *ei = &e820.map[i];
341
342 if (pfn < PFN_UP(ei->addr))
343 register_nosave_region(pfn, PFN_UP(ei->addr));
344
345 pfn = PFN_DOWN(ei->addr + ei->size);
346 if (ei->type != E820_RAM)
347 register_nosave_region(PFN_UP(ei->addr), pfn);
348
349 if (pfn >= max_low_pfn)
350 break;
351 }
352}
353#endif
354
355void __init add_memory_region(unsigned long long start,
356 unsigned long long size, int type)
357{
358 int x;
359
360 if (!efi_enabled) {
361 x = e820.nr_map;
362
363 if (x == E820MAX) {
364 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
365 return;
366 }
367
368 e820.map[x].addr = start;
369 e820.map[x].size = size;
370 e820.map[x].type = type;
371 e820.nr_map++;
372 }
373} /* add_memory_region */
374
375/*
376 * Sanitize the BIOS e820 map.
377 *
378 * Some e820 responses include overlapping entries. The following
379 * replaces the original e820 map with a new one, removing overlaps.
380 *
381 */
382int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
383{
384 struct change_member *change_tmp;
385 unsigned long current_type, last_type;
386 unsigned long long last_addr;
387 int chgidx, still_changing;
388 int overlap_entries;
389 int new_bios_entry;
390 int old_nr, new_nr, chg_nr;
391 int i;
392
393 /*
394 Visually we're performing the following (1,2,3,4 = memory types)...
395
396 Sample memory map (w/overlaps):
397 ____22__________________
398 ______________________4_
399 ____1111________________
400 _44_____________________
401 11111111________________
402 ____________________33__
403 ___________44___________
404 __________33333_________
405 ______________22________
406 ___________________2222_
407 _________111111111______
408 _____________________11_
409 _________________4______
410
411 Sanitized equivalent (no overlap):
412 1_______________________
413 _44_____________________
414 ___1____________________
415 ____22__________________
416 ______11________________
417 _________1______________
418 __________3_____________
419 ___________44___________
420 _____________33_________
421 _______________2________
422 ________________1_______
423 _________________4______
424 ___________________2____
425 ____________________33__
426 ______________________4_
427 */
428 /* if there's only one memory region, don't bother */
429 if (*pnr_map < 2) {
430 return -1;
431 }
432
433 old_nr = *pnr_map;
434
435 /* bail out if we find any unreasonable addresses in bios map */
436 for (i=0; i<old_nr; i++)
437 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
438 return -1;
439 }
440
441 /* create pointers for initial change-point information (for sorting) */
442 for (i=0; i < 2*old_nr; i++)
443 change_point[i] = &change_point_list[i];
444
445 /* record all known change-points (starting and ending addresses),
446 omitting those that are for empty memory regions */
447 chgidx = 0;
448 for (i=0; i < old_nr; i++) {
449 if (biosmap[i].size != 0) {
450 change_point[chgidx]->addr = biosmap[i].addr;
451 change_point[chgidx++]->pbios = &biosmap[i];
452 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
453 change_point[chgidx++]->pbios = &biosmap[i];
454 }
455 }
456 chg_nr = chgidx; /* true number of change-points */
457
458 /* sort change-point list by memory addresses (low -> high) */
459 still_changing = 1;
460 while (still_changing) {
461 still_changing = 0;
462 for (i=1; i < chg_nr; i++) {
463 /* if <current_addr> > <last_addr>, swap */
464 /* or, if current=<start_addr> & last=<end_addr>, swap */
465 if ((change_point[i]->addr < change_point[i-1]->addr) ||
466 ((change_point[i]->addr == change_point[i-1]->addr) &&
467 (change_point[i]->addr == change_point[i]->pbios->addr) &&
468 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
469 )
470 {
471 change_tmp = change_point[i];
472 change_point[i] = change_point[i-1];
473 change_point[i-1] = change_tmp;
474 still_changing=1;
475 }
476 }
477 }
478
479 /* create a new bios memory map, removing overlaps */
480 overlap_entries=0; /* number of entries in the overlap table */
481 new_bios_entry=0; /* index for creating new bios map entries */
482 last_type = 0; /* start with undefined memory type */
483 last_addr = 0; /* start with 0 as last starting address */
484 /* loop through change-points, determining effect on the new bios map */
485 for (chgidx=0; chgidx < chg_nr; chgidx++)
486 {
487 /* keep track of all overlapping bios entries */
488 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
489 {
490 /* add map entry to overlap list (> 1 entry implies an overlap) */
491 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
492 }
493 else
494 {
495 /* remove entry from list (order independent, so swap with last) */
496 for (i=0; i<overlap_entries; i++)
497 {
498 if (overlap_list[i] == change_point[chgidx]->pbios)
499 overlap_list[i] = overlap_list[overlap_entries-1];
500 }
501 overlap_entries--;
502 }
503 /* if there are overlapping entries, decide which "type" to use */
504 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
505 current_type = 0;
506 for (i=0; i<overlap_entries; i++)
507 if (overlap_list[i]->type > current_type)
508 current_type = overlap_list[i]->type;
509 /* continue building up new bios map based on this information */
510 if (current_type != last_type) {
511 if (last_type != 0) {
512 new_bios[new_bios_entry].size =
513 change_point[chgidx]->addr - last_addr;
514 /* move forward only if the new size was non-zero */
515 if (new_bios[new_bios_entry].size != 0)
516 if (++new_bios_entry >= E820MAX)
517 break; /* no more space left for new bios entries */
518 }
519 if (current_type != 0) {
520 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
521 new_bios[new_bios_entry].type = current_type;
522 last_addr=change_point[chgidx]->addr;
523 }
524 last_type = current_type;
525 }
526 }
527 new_nr = new_bios_entry; /* retain count for new bios entries */
528
529 /* copy new bios mapping into original location */
530 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
531 *pnr_map = new_nr;
532
533 return 0;
534}
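/*
 * Editorial note -- worked example, not part of the original source:
 * given the two overlapping BIOS entries
 *   { addr = 0x00000000, size = 0xa0000, type = 1 (usable RAM) }
 *   { addr = 0x00090000, size = 0x10000, type = 2 (reserved)   }
 * the change-point pass above keeps the larger type inside the overlap and
 * rewrites the map as
 *   { addr = 0x00000000, size = 0x90000, type = 1 }
 *   { addr = 0x00090000, size = 0x10000, type = 2 }
 * leaving *pnr_map = 2.
 */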
535
536/*
537 * Copy the BIOS e820 map into a safe place.
538 *
539 * Sanity-check it while we're at it..
540 *
541 * If we're lucky and live on a modern system, the setup code
542 * will have given us a memory map that we can use to properly
543 * set up memory. If we aren't, we'll fake a memory map.
544 *
545 * We check to see that the memory map contains at least 2 elements
546 * before we'll use it, because the detection code in setup.S may
547 * not be perfect and most every PC known to man has two memory
548 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
549 * thinkpad 560x, for example, does not cooperate with the memory
550 * detection code.)
551 */
552int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
553{
554 /* Only one memory region (or negative)? Ignore it */
555 if (nr_map < 2)
556 return -1;
557
558 do {
559 unsigned long long start = biosmap->addr;
560 unsigned long long size = biosmap->size;
561 unsigned long long end = start + size;
562 unsigned long type = biosmap->type;
563
564 /* Overflow in 64 bits? Ignore the memory map. */
565 if (start > end)
566 return -1;
567
568 /*
569 * Some BIOSes claim RAM in the 640k - 1M region.
570 * Not right. Fix it up.
571 */
572 if (type == E820_RAM) {
573 if (start < 0x100000ULL && end > 0xA0000ULL) {
574 if (start < 0xA0000ULL)
575 add_memory_region(start, 0xA0000ULL-start, type);
576 if (end <= 0x100000ULL)
577 continue;
578 start = 0x100000ULL;
579 size = end - start;
580 }
581 }
582 add_memory_region(start, size, type);
583 } while (biosmap++,--nr_map);
584 return 0;
585}
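/*
 * Editorial note -- worked example, not part of the original source:
 * a single BIOS entry { addr = 0, size = 0x200000, type = E820_RAM }
 * overlaps the 640k - 1M hole, so the fixup above records it as
 *   add_memory_region(0x000000, 0x0a0000, E820_RAM)    (0 - 640k)
 *   add_memory_region(0x100000, 0x100000, E820_RAM)    (1 MB - 2 MB)
 * and the 640k - 1M range is dropped from the map.
 */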
586
587/*
588 * Callback for efi_memory_walk.
589 */
590static int __init
591efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
592{
593 unsigned long *max_pfn = arg, pfn;
594
595 if (start < end) {
596 pfn = PFN_UP(end -1);
597 if (pfn > *max_pfn)
598 *max_pfn = pfn;
599 }
600 return 0;
601}
602
603static int __init
604efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
605{
606 memory_present(0, PFN_UP(start), PFN_DOWN(end));
607 return 0;
608}
609
610/*
611 * Find the highest page frame number we have available
612 */
613void __init find_max_pfn(void)
614{
615 int i;
616
617 max_pfn = 0;
618 if (efi_enabled) {
619 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
620 efi_memmap_walk(efi_memory_present_wrapper, NULL);
621 return;
622 }
623
624 for (i = 0; i < e820.nr_map; i++) {
625 unsigned long start, end;
626 /* RAM? */
627 if (e820.map[i].type != E820_RAM)
628 continue;
629 start = PFN_UP(e820.map[i].addr);
630 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
631 if (start >= end)
632 continue;
633 if (end > max_pfn)
634 max_pfn = end;
635 memory_present(0, start, end);
636 }
637}
638
639/*
640 * Free all available memory for boot time allocation. Used
641 * as a callback function by efi_memory_walk()
642 */
643
644static int __init
645free_available_memory(unsigned long start, unsigned long end, void *arg)
646{
647 /* check max_low_pfn */
648 if (start >= (max_low_pfn << PAGE_SHIFT))
649 return 0;
650 if (end >= (max_low_pfn << PAGE_SHIFT))
651 end = max_low_pfn << PAGE_SHIFT;
652 if (start < end)
653 free_bootmem(start, end - start);
654
655 return 0;
656}
657/*
658 * Register fully available low RAM pages with the bootmem allocator.
659 */
660void __init register_bootmem_low_pages(unsigned long max_low_pfn)
661{
662 int i;
663
664 if (efi_enabled) {
665 efi_memmap_walk(free_available_memory, NULL);
666 return;
667 }
668 for (i = 0; i < e820.nr_map; i++) {
669 unsigned long curr_pfn, last_pfn, size;
670 /*
671 * Reserve usable low memory
672 */
673 if (e820.map[i].type != E820_RAM)
674 continue;
675 /*
676 * We are rounding up the start address of usable memory:
677 */
678 curr_pfn = PFN_UP(e820.map[i].addr);
679 if (curr_pfn >= max_low_pfn)
680 continue;
681 /*
682 * ... and at the end of the usable range downwards:
683 */
684 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
685
686 if (last_pfn > max_low_pfn)
687 last_pfn = max_low_pfn;
688
689 /*
690 * .. finally, did all the rounding and playing
691 * around just make the area go away?
692 */
693 if (last_pfn <= curr_pfn)
694 continue;
695
696 size = last_pfn - curr_pfn;
697 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
698 }
699}
700
701void __init e820_register_memory(void)
702{
703 unsigned long gapstart, gapsize, round;
704 unsigned long long last;
705 int i;
706
707 /*
708 * Search for the biggest gap in the low 32 bits of the e820
709 * memory space.
710 */
711 last = 0x100000000ull;
712 gapstart = 0x10000000;
713 gapsize = 0x400000;
714 i = e820.nr_map;
715 while (--i >= 0) {
716 unsigned long long start = e820.map[i].addr;
717 unsigned long long end = start + e820.map[i].size;
718
719 /*
720 * Since "last" is at most 4GB, we know we'll
721 * fit in 32 bits if this condition is true
722 */
723 if (last > end) {
724 unsigned long gap = last - end;
725
726 if (gap > gapsize) {
727 gapsize = gap;
728 gapstart = end;
729 }
730 }
731 if (start < last)
732 last = start;
733 }
734
735 /*
736 * See how much we want to round up: start off with
737 * rounding to the next 1MB area.
738 */
739 round = 0x100000;
740 while ((gapsize >> 4) > round)
741 round += round;
742 /* Fun with two's complement */
743 pci_mem_start = (gapstart + round) & -round;
744
745 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
746 pci_mem_start, gapstart, gapsize);
747}
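/*
 * Editorial note -- worked example, not part of the original source:
 * with gapstart = 0xdff00000 and gapsize = 0x20000000 (512 MB), the loop
 * above doubles "round" from 1 MB until it is no smaller than gapsize/16,
 * giving round = 0x02000000 (32 MB).  Since -round is ~(round - 1) in
 * two's complement, (gapstart + round) & -round rounds up to the next
 * 32 MB boundary:
 *   (0xdff00000 + 0x02000000) & 0xfe000000 = 0xe0000000
 * so PCI resources would start at 0xe0000000.
 */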
748
749void __init print_memory_map(char *who)
750{
751 int i;
752
753 for (i = 0; i < e820.nr_map; i++) {
754 printk(" %s: %016Lx - %016Lx ", who,
755 e820.map[i].addr,
756 e820.map[i].addr + e820.map[i].size);
757 switch (e820.map[i].type) {
758 case E820_RAM: printk("(usable)\n");
759 break;
760 case E820_RESERVED:
761 printk("(reserved)\n");
762 break;
763 case E820_ACPI:
764 printk("(ACPI data)\n");
765 break;
766 case E820_NVS:
767 printk("(ACPI NVS)\n");
768 break;
769 default: printk("type %u\n", e820.map[i].type);
770 break;
771 }
772 }
773}
774
775static __init __always_inline void efi_limit_regions(unsigned long long size)
776{
777 unsigned long long current_addr = 0;
778 efi_memory_desc_t *md, *next_md;
779 void *p, *p1;
780 int i, j;
781
782 j = 0;
783 p1 = memmap.map;
784 for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
785 md = p;
786 next_md = p1;
787 current_addr = md->phys_addr +
788 PFN_PHYS(md->num_pages);
789 if (is_available_memory(md)) {
790 if (md->phys_addr >= size) continue;
791 memcpy(next_md, md, memmap.desc_size);
792 if (current_addr >= size) {
793 next_md->num_pages -=
794 PFN_UP(current_addr-size);
795 }
796 p1 += memmap.desc_size;
797 next_md = p1;
798 j++;
799 } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
800 EFI_MEMORY_RUNTIME) {
801 /* In order to make runtime services
802 * available we have to include runtime
803 * memory regions in memory map */
804 memcpy(next_md, md, memmap.desc_size);
805 p1 += memmap.desc_size;
806 next_md = p1;
807 j++;
808 }
809 }
810 memmap.nr_map = j;
811 memmap.map_end = memmap.map +
812 (memmap.nr_map * memmap.desc_size);
813}
814
815void __init limit_regions(unsigned long long size)
816{
817 unsigned long long current_addr;
818 int i;
819
820 print_memory_map("limit_regions start");
821 if (efi_enabled) {
822 efi_limit_regions(size);
823 return;
824 }
825 for (i = 0; i < e820.nr_map; i++) {
826 current_addr = e820.map[i].addr + e820.map[i].size;
827 if (current_addr < size)
828 continue;
829
830 if (e820.map[i].type != E820_RAM)
831 continue;
832
833 if (e820.map[i].addr >= size) {
834 /*
835 * This region starts past the end of the
836 * requested size, skip it completely.
837 */
838 e820.nr_map = i;
839 } else {
840 e820.nr_map = i + 1;
841 e820.map[i].size -= current_addr - size;
842 }
843 print_memory_map("limit_regions endfor");
844 return;
845 }
846 print_memory_map("limit_regions endfunc");
847}
848
849/*
850 * This function checks if any part of the range <start,end> is mapped
851 * with type.
852 */
853int
854e820_any_mapped(u64 start, u64 end, unsigned type)
855{
856 int i;
857 for (i = 0; i < e820.nr_map; i++) {
858 const struct e820entry *ei = &e820.map[i];
859 if (type && ei->type != type)
860 continue;
861 if (ei->addr >= end || ei->addr + ei->size <= start)
862 continue;
863 return 1;
864 }
865 return 0;
866}
867EXPORT_SYMBOL_GPL(e820_any_mapped);
868
869 /*
870 * This function checks if the entire range <start,end> is mapped with type.
871 *
872 * Note: this function only works correctly if the e820 table is sorted and
873 * non-overlapping, which is the case
874 */
875int __init
876e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
877{
878 u64 start = s;
879 u64 end = e;
880 int i;
881 for (i = 0; i < e820.nr_map; i++) {
882 struct e820entry *ei = &e820.map[i];
883 if (type && ei->type != type)
884 continue;
885 /* is the region (part) in overlap with the current region ?*/
886 if (ei->addr >= end || ei->addr + ei->size <= start)
887 continue;
888 /* if the region covers the beginning of <start,end> we move
889 * start to the end of that region, since the range is covered up to there
890 */
891 if (ei->addr <= start)
892 start = ei->addr + ei->size;
893 /* if start is now at or beyond end, we're done, full
894 * coverage */
895 if (start >= end)
896 return 1; /* we're done */
897 }
898 return 0;
899}
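/*
 * Editorial note -- hypothetical usage, not part of the original source:
 * a caller could check that the first megabyte is entirely usable RAM with
 *   e820_all_mapped(0, 0x100000, E820_RAM)
 * which returns 1 only when RAM entries cover the whole range, whereas
 * e820_any_mapped() above already returns 1 if any byte of the range is
 * covered by an entry of the requested type.
 */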
900
901static int __init parse_memmap(char *arg)
902{
903 if (!arg)
904 return -EINVAL;
905
906 if (strcmp(arg, "exactmap") == 0) {
907#ifdef CONFIG_CRASH_DUMP
908 /* If we are doing a crash dump, we
909 * still need to know the real mem
910 * size before original memory map is
911 * reset.
912 */
913 find_max_pfn();
914 saved_max_pfn = max_pfn;
915#endif
916 e820.nr_map = 0;
917 user_defined_memmap = 1;
918 } else {
919 /* If the user specifies memory size, we
920 * limit the BIOS-provided memory map to
921 * that size. exactmap can be used to specify
922 * the exact map. mem=number can be used to
923 * trim the existing memory map.
924 */
925 unsigned long long start_at, mem_size;
926
927 mem_size = memparse(arg, &arg);
928 if (*arg == '@') {
929 start_at = memparse(arg+1, &arg);
930 add_memory_region(start_at, mem_size, E820_RAM);
931 } else if (*arg == '#') {
932 start_at = memparse(arg+1, &arg);
933 add_memory_region(start_at, mem_size, E820_ACPI);
934 } else if (*arg == '$') {
935 start_at = memparse(arg+1, &arg);
936 add_memory_region(start_at, mem_size, E820_RESERVED);
937 } else {
938 limit_regions(mem_size);
939 user_defined_memmap = 1;
940 }
941 }
942 return 0;
943}
944early_param("memmap", parse_memmap);
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
new file mode 100644
index 000000000000..0f4d5e209e9b
--- /dev/null
+++ b/arch/x86/kernel/e820_64.c
@@ -0,0 +1,725 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 *
5 * Getting sanitize_e820_map() in sync with i386 version by applying change:
6 * - Provisions for empty E820 memory regions (reported by certain BIOSes).
7 * Alex Achenbach <xela@slit.de>, December 2002.
8 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
9 *
10 */
11#include <linux/kernel.h>
12#include <linux/types.h>
13#include <linux/init.h>
14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/suspend.h>
21#include <linux/pfn.h>
22
23#include <asm/pgtable.h>
24#include <asm/page.h>
25#include <asm/e820.h>
26#include <asm/proto.h>
27#include <asm/bootsetup.h>
28#include <asm/sections.h>
29
30struct e820map e820;
31
32/*
33 * PFN of last memory page.
34 */
35unsigned long end_pfn;
36EXPORT_SYMBOL(end_pfn);
37
38/*
39 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
40 * The direct mapping extends to end_pfn_map, so that we can directly access
41 * apertures, ACPI and other tables without having to play with fixmaps.
42 */
43unsigned long end_pfn_map;
44
45/*
46 * Last pfn which the user wants to use.
47 */
48static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
49
50extern struct resource code_resource, data_resource;
51
52/* Check for some hardcoded bad areas that early boot is not allowed to touch */
53static inline int bad_addr(unsigned long *addrp, unsigned long size)
54{
55 unsigned long addr = *addrp, last = addr + size;
56
57 /* various gunk below that needed for SMP startup */
58 if (addr < 0x8000) {
59 *addrp = PAGE_ALIGN(0x8000);
60 return 1;
61 }
62
63 /* direct mapping tables of the kernel */
64 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
65 *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
66 return 1;
67 }
68
69 /* initrd */
70#ifdef CONFIG_BLK_DEV_INITRD
71 if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
72 addr < INITRD_START+INITRD_SIZE) {
73 *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
74 return 1;
75 }
76#endif
77 /* kernel code */
78 if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
79 *addrp = PAGE_ALIGN(__pa_symbol(&_end));
80 return 1;
81 }
82
83 if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
84 *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
85 return 1;
86 }
87
88#ifdef CONFIG_NUMA
89 /* NUMA memory to node map */
90 if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
91 *addrp = nodemap_addr + nodemap_size;
92 return 1;
93 }
94#endif
95 /* XXX ramdisk image here? */
96 return 0;
97}
98
99/*
100 * This function checks if any part of the range <start,end> is mapped
101 * with type.
102 */
103int
104e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
105{
106 int i;
107 for (i = 0; i < e820.nr_map; i++) {
108 struct e820entry *ei = &e820.map[i];
109 if (type && ei->type != type)
110 continue;
111 if (ei->addr >= end || ei->addr + ei->size <= start)
112 continue;
113 return 1;
114 }
115 return 0;
116}
117EXPORT_SYMBOL_GPL(e820_any_mapped);
118
119/*
120 * This function checks if the entire range <start,end> is mapped with type.
121 *
122 * Note: this function only works correctly if the e820 table is sorted and
123 * non-overlapping, which is the case
124 */
125int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
126{
127 int i;
128 for (i = 0; i < e820.nr_map; i++) {
129 struct e820entry *ei = &e820.map[i];
130 if (type && ei->type != type)
131 continue;
132 /* is the region (part) in overlap with the current region ?*/
133 if (ei->addr >= end || ei->addr + ei->size <= start)
134 continue;
135
136 /* if the region covers the beginning of <start,end> we move
137 * start to the end of that region, since the range is covered up to there
138 */
139 if (ei->addr <= start)
140 start = ei->addr + ei->size;
141 /* if start is now at or beyond end, we're done, full coverage */
142 if (start >= end)
143 return 1; /* we're done */
144 }
145 return 0;
146}
147
148/*
149 * Find a free area in a specific range.
150 */
151unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
152{
153 int i;
154 for (i = 0; i < e820.nr_map; i++) {
155 struct e820entry *ei = &e820.map[i];
156 unsigned long addr = ei->addr, last;
157 if (ei->type != E820_RAM)
158 continue;
159 if (addr < start)
160 addr = start;
161 if (addr > ei->addr + ei->size)
162 continue;
163 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
164 ;
165 last = PAGE_ALIGN(addr) + size;
166 if (last > ei->addr + ei->size)
167 continue;
168 if (last > end)
169 continue;
170 return addr;
171 }
172 return -1UL;
173}
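/*
 * Editorial note -- clarification, not part of the original source:
 * find_e820_area() returns the start of a block of "size" bytes of
 * E820_RAM inside [start, end) that avoids the areas rejected by
 * bad_addr() (SMP trampoline, kernel image, initrd, EBDA, ...), or
 * -1UL if no such block exists; it is used for early allocations
 * before bootmem takes over.
 */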
174
175/*
176 * Find the highest page frame number we have available
177 */
178unsigned long __init e820_end_of_ram(void)
179{
180 unsigned long end_pfn = 0;
181 end_pfn = find_max_pfn_with_active_regions();
182
183 if (end_pfn > end_pfn_map)
184 end_pfn_map = end_pfn;
185 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
186 end_pfn_map = MAXMEM>>PAGE_SHIFT;
187 if (end_pfn > end_user_pfn)
188 end_pfn = end_user_pfn;
189 if (end_pfn > end_pfn_map)
190 end_pfn = end_pfn_map;
191
192 printk("end_pfn_map = %lu\n", end_pfn_map);
193 return end_pfn;
194}
195
196/*
197 * Mark e820 reserved areas as busy for the resource manager.
198 */
199void __init e820_reserve_resources(void)
200{
201 int i;
202 for (i = 0; i < e820.nr_map; i++) {
203 struct resource *res;
204 res = alloc_bootmem_low(sizeof(struct resource));
205 switch (e820.map[i].type) {
206 case E820_RAM: res->name = "System RAM"; break;
207 case E820_ACPI: res->name = "ACPI Tables"; break;
208 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
209 default: res->name = "reserved";
210 }
211 res->start = e820.map[i].addr;
212 res->end = res->start + e820.map[i].size - 1;
213 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
214 request_resource(&iomem_resource, res);
215 if (e820.map[i].type == E820_RAM) {
216 /*
217 * We don't know which RAM region contains kernel data,
218 * so we try it repeatedly and let the resource manager
219 * test it.
220 */
221 request_resource(res, &code_resource);
222 request_resource(res, &data_resource);
223#ifdef CONFIG_KEXEC
224 request_resource(res, &crashk_res);
225#endif
226 }
227 }
228}
229
230/*
231 * Find the ranges of physical addresses that do not correspond to
232 * e820 RAM areas and mark the corresponding pages as nosave for software
233 * suspend and suspend to RAM.
234 *
235 * This function requires the e820 map to be sorted and without any
236 * overlapping entries and assumes the first e820 area to be RAM.
237 */
238void __init e820_mark_nosave_regions(void)
239{
240 int i;
241 unsigned long paddr;
242
243 paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
244 for (i = 1; i < e820.nr_map; i++) {
245 struct e820entry *ei = &e820.map[i];
246
247 if (paddr < ei->addr)
248 register_nosave_region(PFN_DOWN(paddr),
249 PFN_UP(ei->addr));
250
251 paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
252 if (ei->type != E820_RAM)
253 register_nosave_region(PFN_UP(ei->addr),
254 PFN_DOWN(paddr));
255
256 if (paddr >= (end_pfn << PAGE_SHIFT))
257 break;
258 }
259}
260
261/*
262 * Finds an active region in the address range from start_pfn to end_pfn and
263 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
264 */
265static int __init e820_find_active_region(const struct e820entry *ei,
266 unsigned long start_pfn,
267 unsigned long end_pfn,
268 unsigned long *ei_startpfn,
269 unsigned long *ei_endpfn)
270{
271 *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
272 *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
273
274 /* Skip map entries smaller than a page */
275 if (*ei_startpfn >= *ei_endpfn)
276 return 0;
277
278 /* Check if end_pfn_map should be updated */
279 if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
280 end_pfn_map = *ei_endpfn;
281
282 /* Skip if map is outside the node */
283 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
284 *ei_startpfn >= end_pfn)
285 return 0;
286
287 /* Check for overlaps */
288 if (*ei_startpfn < start_pfn)
289 *ei_startpfn = start_pfn;
290 if (*ei_endpfn > end_pfn)
291 *ei_endpfn = end_pfn;
292
293 /* Obey end_user_pfn to save on memmap */
294 if (*ei_startpfn >= end_user_pfn)
295 return 0;
296 if (*ei_endpfn > end_user_pfn)
297 *ei_endpfn = end_user_pfn;
298
299 return 1;
300}
301
302/* Walk the e820 map and register active regions within a node */
303void __init
304e820_register_active_regions(int nid, unsigned long start_pfn,
305 unsigned long end_pfn)
306{
307 unsigned long ei_startpfn;
308 unsigned long ei_endpfn;
309 int i;
310
311 for (i = 0; i < e820.nr_map; i++)
312 if (e820_find_active_region(&e820.map[i],
313 start_pfn, end_pfn,
314 &ei_startpfn, &ei_endpfn))
315 add_active_range(nid, ei_startpfn, ei_endpfn);
316}
317
318/*
319 * Add a memory region to the kernel e820 map.
320 */
321void __init add_memory_region(unsigned long start, unsigned long size, int type)
322{
323 int x = e820.nr_map;
324
325 if (x == E820MAX) {
326 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
327 return;
328 }
329
330 e820.map[x].addr = start;
331 e820.map[x].size = size;
332 e820.map[x].type = type;
333 e820.nr_map++;
334}
335
336/*
337 * Find the hole size (in bytes) in the memory range.
338 * @start: starting address of the memory range to scan
339 * @end: ending address of the memory range to scan
340 */
341unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
342{
343 unsigned long start_pfn = start >> PAGE_SHIFT;
344 unsigned long end_pfn = end >> PAGE_SHIFT;
345 unsigned long ei_startpfn;
346 unsigned long ei_endpfn;
347 unsigned long ram = 0;
348 int i;
349
350 for (i = 0; i < e820.nr_map; i++) {
351 if (e820_find_active_region(&e820.map[i],
352 start_pfn, end_pfn,
353 &ei_startpfn, &ei_endpfn))
354 ram += ei_endpfn - ei_startpfn;
355 }
356 return end - start - (ram << PAGE_SHIFT);
357}
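/*
 * Editorial note -- worked example, not part of the original source:
 * if the only RAM entry below 4 GB is { addr = 0, size = 0xc0000000 },
 * then e820_hole_size(0, 0x100000000UL) finds 0xc0000 RAM pages and
 * returns 0x100000000 - 0xc0000000 = 0x40000000, i.e. a 1 GB hole
 * (typically the MMIO window below 4 GB).
 */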
358
359void __init e820_print_map(char *who)
360{
361 int i;
362
363 for (i = 0; i < e820.nr_map; i++) {
364 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
365 (unsigned long long) e820.map[i].addr,
366 (unsigned long long) (e820.map[i].addr + e820.map[i].size));
367 switch (e820.map[i].type) {
368 case E820_RAM: printk("(usable)\n");
369 break;
370 case E820_RESERVED:
371 printk("(reserved)\n");
372 break;
373 case E820_ACPI:
374 printk("(ACPI data)\n");
375 break;
376 case E820_NVS:
377 printk("(ACPI NVS)\n");
378 break;
379 default: printk("type %u\n", e820.map[i].type);
380 break;
381 }
382 }
383}
384
385/*
386 * Sanitize the BIOS e820 map.
387 *
388 * Some e820 responses include overlapping entries. The following
389 * replaces the original e820 map with a new one, removing overlaps.
390 *
391 */
392static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
393{
394 struct change_member {
395 struct e820entry *pbios; /* pointer to original bios entry */
396 unsigned long long addr; /* address for this change point */
397 };
398 static struct change_member change_point_list[2*E820MAX] __initdata;
399 static struct change_member *change_point[2*E820MAX] __initdata;
400 static struct e820entry *overlap_list[E820MAX] __initdata;
401 static struct e820entry new_bios[E820MAX] __initdata;
402 struct change_member *change_tmp;
403 unsigned long current_type, last_type;
404 unsigned long long last_addr;
405 int chgidx, still_changing;
406 int overlap_entries;
407 int new_bios_entry;
408 int old_nr, new_nr, chg_nr;
409 int i;
410
411 /*
412 Visually we're performing the following (1,2,3,4 = memory types)...
413
414 Sample memory map (w/overlaps):
415 ____22__________________
416 ______________________4_
417 ____1111________________
418 _44_____________________
419 11111111________________
420 ____________________33__
421 ___________44___________
422 __________33333_________
423 ______________22________
424 ___________________2222_
425 _________111111111______
426 _____________________11_
427 _________________4______
428
429 Sanitized equivalent (no overlap):
430 1_______________________
431 _44_____________________
432 ___1____________________
433 ____22__________________
434 ______11________________
435 _________1______________
436 __________3_____________
437 ___________44___________
438 _____________33_________
439 _______________2________
440 ________________1_______
441 _________________4______
442 ___________________2____
443 ____________________33__
444 ______________________4_
445 */
446
447 /* if there's only one memory region, don't bother */
448 if (*pnr_map < 2)
449 return -1;
450
451 old_nr = *pnr_map;
452
453 /* bail out if we find any unreasonable addresses in bios map */
454 for (i=0; i<old_nr; i++)
455 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
456 return -1;
457
458 /* create pointers for initial change-point information (for sorting) */
459 for (i=0; i < 2*old_nr; i++)
460 change_point[i] = &change_point_list[i];
461
462 /* record all known change-points (starting and ending addresses),
463 omitting those that are for empty memory regions */
464 chgidx = 0;
465 for (i=0; i < old_nr; i++) {
466 if (biosmap[i].size != 0) {
467 change_point[chgidx]->addr = biosmap[i].addr;
468 change_point[chgidx++]->pbios = &biosmap[i];
469 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
470 change_point[chgidx++]->pbios = &biosmap[i];
471 }
472 }
473 chg_nr = chgidx;
474
475 /* sort change-point list by memory addresses (low -> high) */
476 still_changing = 1;
477 while (still_changing) {
478 still_changing = 0;
479 for (i=1; i < chg_nr; i++) {
480 /* if <current_addr> > <last_addr>, swap */
481 /* or, if current=<start_addr> & last=<end_addr>, swap */
482 if ((change_point[i]->addr < change_point[i-1]->addr) ||
483 ((change_point[i]->addr == change_point[i-1]->addr) &&
484 (change_point[i]->addr == change_point[i]->pbios->addr) &&
485 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
486 )
487 {
488 change_tmp = change_point[i];
489 change_point[i] = change_point[i-1];
490 change_point[i-1] = change_tmp;
491 still_changing=1;
492 }
493 }
494 }
495
496 /* create a new bios memory map, removing overlaps */
497 overlap_entries=0; /* number of entries in the overlap table */
498 new_bios_entry=0; /* index for creating new bios map entries */
499 last_type = 0; /* start with undefined memory type */
500 last_addr = 0; /* start with 0 as last starting address */
501 /* loop through change-points, determining effect on the new bios map */
502 for (chgidx=0; chgidx < chg_nr; chgidx++)
503 {
504 /* keep track of all overlapping bios entries */
505 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
506 {
507 /* add map entry to overlap list (> 1 entry implies an overlap) */
508 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
509 }
510 else
511 {
512 /* remove entry from list (order independent, so swap with last) */
513 for (i=0; i<overlap_entries; i++)
514 {
515 if (overlap_list[i] == change_point[chgidx]->pbios)
516 overlap_list[i] = overlap_list[overlap_entries-1];
517 }
518 overlap_entries--;
519 }
520 /* if there are overlapping entries, decide which "type" to use */
521 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
522 current_type = 0;
523 for (i=0; i<overlap_entries; i++)
524 if (overlap_list[i]->type > current_type)
525 current_type = overlap_list[i]->type;
526 /* continue building up new bios map based on this information */
527 if (current_type != last_type) {
528 if (last_type != 0) {
529 new_bios[new_bios_entry].size =
530 change_point[chgidx]->addr - last_addr;
531 /* move forward only if the new size was non-zero */
532 if (new_bios[new_bios_entry].size != 0)
533 if (++new_bios_entry >= E820MAX)
534 break; /* no more space left for new bios entries */
535 }
536 if (current_type != 0) {
537 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
538 new_bios[new_bios_entry].type = current_type;
539 last_addr=change_point[chgidx]->addr;
540 }
541 last_type = current_type;
542 }
543 }
544 new_nr = new_bios_entry; /* retain count for new bios entries */
545
546 /* copy new bios mapping into original location */
547 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
548 *pnr_map = new_nr;
549
550 return 0;
551}
552
553/*
554 * Copy the BIOS e820 map into a safe place.
555 *
556 * Sanity-check it while we're at it..
557 *
558 * If we're lucky and live on a modern system, the setup code
559 * will have given us a memory map that we can use to properly
560 * set up memory. If we aren't, we'll fake a memory map.
561 */
562static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
563{
564 /* Only one memory region (or negative)? Ignore it */
565 if (nr_map < 2)
566 return -1;
567
568 do {
569 unsigned long start = biosmap->addr;
570 unsigned long size = biosmap->size;
571 unsigned long end = start + size;
572 unsigned long type = biosmap->type;
573
574 /* Overflow in 64 bits? Ignore the memory map. */
575 if (start > end)
576 return -1;
577
578 add_memory_region(start, size, type);
579 } while (biosmap++,--nr_map);
580 return 0;
581}
582
583void early_panic(char *msg)
584{
585 early_printk(msg);
586 panic(msg);
587}
588
589void __init setup_memory_region(void)
590{
591 /*
592 * Try to copy the BIOS-supplied E820-map.
593 *
594 * Otherwise fake a memory map; one section from 0k->640k,
595 * the next section from 1mb->appropriate_mem_k
596 */
597 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
598 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
599 early_panic("Cannot find a valid memory map");
600 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
601 e820_print_map("BIOS-e820");
602}
603
604static int __init parse_memopt(char *p)
605{
606 if (!p)
607 return -EINVAL;
608 end_user_pfn = memparse(p, &p);
609 end_user_pfn >>= PAGE_SHIFT;
610 return 0;
611}
612early_param("mem", parse_memopt);
613
614static int userdef __initdata;
615
616static int __init parse_memmap_opt(char *p)
617{
618 char *oldp;
619 unsigned long long start_at, mem_size;
620
621 if (!strcmp(p, "exactmap")) {
622#ifdef CONFIG_CRASH_DUMP
623 /* If we are doing a crash dump, we
624 * still need to know the real mem
625 * size before original memory map is
626 * reset.
627 */
628 e820_register_active_regions(0, 0, -1UL);
629 saved_max_pfn = e820_end_of_ram();
630 remove_all_active_ranges();
631#endif
632 end_pfn_map = 0;
633 e820.nr_map = 0;
634 userdef = 1;
635 return 0;
636 }
637
638 oldp = p;
639 mem_size = memparse(p, &p);
640 if (p == oldp)
641 return -EINVAL;
642 if (*p == '@') {
643 start_at = memparse(p+1, &p);
644 add_memory_region(start_at, mem_size, E820_RAM);
645 } else if (*p == '#') {
646 start_at = memparse(p+1, &p);
647 add_memory_region(start_at, mem_size, E820_ACPI);
648 } else if (*p == '$') {
649 start_at = memparse(p+1, &p);
650 add_memory_region(start_at, mem_size, E820_RESERVED);
651 } else {
652 end_user_pfn = (mem_size >> PAGE_SHIFT);
653 }
654 return *p == '\0' ? 0 : -EINVAL;
655}
656early_param("memmap", parse_memmap_opt);
657
658void __init finish_e820_parsing(void)
659{
660 if (userdef) {
661 printk(KERN_INFO "user-defined physical RAM map:\n");
662 e820_print_map("user");
663 }
664}
665
666unsigned long pci_mem_start = 0xaeedbabe;
667EXPORT_SYMBOL(pci_mem_start);
668
669/*
670 * Search for the biggest gap in the low 32 bits of the e820
671 * memory space. We pass this space to PCI to assign MMIO resources
672 * for hotplug or unconfigured devices.
673 * Hopefully the BIOS left enough space for this.
674 */
675__init void e820_setup_gap(void)
676{
677 unsigned long gapstart, gapsize, round;
678 unsigned long last;
679 int i;
680 int found = 0;
681
682 last = 0x100000000ull;
683 gapstart = 0x10000000;
684 gapsize = 0x400000;
685 i = e820.nr_map;
686 while (--i >= 0) {
687 unsigned long long start = e820.map[i].addr;
688 unsigned long long end = start + e820.map[i].size;
689
690 /*
691 * Since "last" is at most 4GB, we know we'll
692 * fit in 32 bits if this condition is true
693 */
694 if (last > end) {
695 unsigned long gap = last - end;
696
697 if (gap > gapsize) {
698 gapsize = gap;
699 gapstart = end;
700 found = 1;
701 }
702 }
703 if (start < last)
704 last = start;
705 }
706
707 if (!found) {
708 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
709 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
710 KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
711 }
712
713 /*
714 * See how much we want to round up: start off with
715 * rounding to the next 1MB area.
716 */
717 round = 0x100000;
718 while ((gapsize >> 4) > round)
719 round += round;
720 /* Fun with two's complement */
721 pci_mem_start = (gapstart + round) & -round;
722
723 printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
724 pci_mem_start, gapstart, gapsize);
725}
diff --git a/arch/x86/kernel/early-quirks_64.c b/arch/x86/kernel/early-quirks_64.c
new file mode 100644
index 000000000000..13aa4fd728f3
--- /dev/null
+++ b/arch/x86/kernel/early-quirks_64.c
@@ -0,0 +1,127 @@
1/* Various workarounds for chipset bugs.
2 This code runs very early and can't use the regular PCI subsystem.
3 The entries are keyed to PCI bridges which usually identify chipsets
4 uniquely.
5 This is only for whole classes of chipsets with specific problems which
6 need early invasive action (e.g. before the timers are initialized).
7 Most PCI device specific workarounds can be done later and should be
8 in standard PCI quirks.
9 Mainboard specific bugs should be handled by DMI entries.
10 CPU specific bugs in setup.c */
11
12#include <linux/pci.h>
13#include <linux/acpi.h>
14#include <linux/pci_ids.h>
15#include <asm/pci-direct.h>
16#include <asm/proto.h>
17#include <asm/iommu.h>
18#include <asm/dma.h>
19
20static void __init via_bugs(void)
21{
22#ifdef CONFIG_IOMMU
23 if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
24 !iommu_aperture_allowed) {
25 printk(KERN_INFO
26 "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
27 iommu_aperture_disabled = 1;
28 }
29#endif
30}
31
32#ifdef CONFIG_ACPI
33
34static int __init nvidia_hpet_check(struct acpi_table_header *header)
35{
36 return 0;
37}
38#endif
39
40static void __init nvidia_bugs(void)
41{
42#ifdef CONFIG_ACPI
43 /*
44 * All timer overrides on Nvidia are
45 * wrong unless HPET is enabled.
46 * Unfortunately that's not true on many Asus boards.
47 * We don't know yet how to detect this automatically, but
48 * at least allow a command line override.
49 */
50 if (acpi_use_timer_override)
51 return;
52
53 if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) {
54 acpi_skip_timer_override = 1;
55 printk(KERN_INFO "Nvidia board "
56 "detected. Ignoring ACPI "
57 "timer override.\n");
58 printk(KERN_INFO "If you got timer trouble "
59 "try acpi_use_timer_override\n");
60 }
61#endif
62 /* RED-PEN skip them on mptables too? */
63
64}
65
66static void __init ati_bugs(void)
67{
68 if (timer_over_8254 == 1) {
69 timer_over_8254 = 0;
70 printk(KERN_INFO
71 "ATI board detected. Disabling timer routing over 8254.\n");
72 }
73}
74
75struct chipset {
76 u16 vendor;
77 void (*f)(void);
78};
79
80static struct chipset early_qrk[] __initdata = {
81 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
82 { PCI_VENDOR_ID_VIA, via_bugs },
83 { PCI_VENDOR_ID_ATI, ati_bugs },
84 {}
85};
86
87void __init early_quirks(void)
88{
89 int num, slot, func;
90
91 if (!early_pci_allowed())
92 return;
93
94 /* Poor man's PCI discovery */
95 for (num = 0; num < 32; num++) {
96 for (slot = 0; slot < 32; slot++) {
97 for (func = 0; func < 8; func++) {
98 u32 class;
99 u32 vendor;
100 u8 type;
101 int i;
102 class = read_pci_config(num,slot,func,
103 PCI_CLASS_REVISION);
104 if (class == 0xffffffff)
105 break;
106
107 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
108 continue;
109
110 vendor = read_pci_config(num, slot, func,
111 PCI_VENDOR_ID);
112 vendor &= 0xffff;
113
114 for (i = 0; early_qrk[i].f; i++)
115 if (early_qrk[i].vendor == vendor) {
116 early_qrk[i].f();
117 return;
118 }
119
120 type = read_pci_config_byte(num, slot, func,
121 PCI_HEADER_TYPE);
122 if (!(type & 0x80))
123 break;
124 }
125 }
126 }
127}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
new file mode 100644
index 000000000000..fd9aff3f3890
--- /dev/null
+++ b/arch/x86/kernel/early_printk.c
@@ -0,0 +1,259 @@
1#include <linux/console.h>
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/string.h>
5#include <linux/screen_info.h>
6#include <asm/io.h>
7#include <asm/processor.h>
8#include <asm/fcntl.h>
9#include <xen/hvc-console.h>
10
11/* Simple VGA output */
12
13#ifdef __i386__
14#include <asm/setup.h>
15#else
16#include <asm/bootsetup.h>
17#endif
18#define VGABASE (__ISA_IO_base + 0xb8000)
19
20static int max_ypos = 25, max_xpos = 80;
21static int current_ypos = 25, current_xpos = 0;
22
23static void early_vga_write(struct console *con, const char *str, unsigned n)
24{
25 char c;
26 int i, k, j;
27
28 while ((c = *str++) != '\0' && n-- > 0) {
29 if (current_ypos >= max_ypos) {
30 /* scroll 1 line up */
31 for (k = 1, j = 0; k < max_ypos; k++, j++) {
32 for (i = 0; i < max_xpos; i++) {
33 writew(readw(VGABASE+2*(max_xpos*k+i)),
34 VGABASE + 2*(max_xpos*j + i));
35 }
36 }
37 for (i = 0; i < max_xpos; i++)
38 writew(0x720, VGABASE + 2*(max_xpos*j + i));
39 current_ypos = max_ypos-1;
40 }
41 if (c == '\n') {
42 current_xpos = 0;
43 current_ypos++;
44 } else if (c != '\r') {
45 writew(((0x7 << 8) | (unsigned short) c),
46 VGABASE + 2*(max_xpos*current_ypos +
47 current_xpos++));
48 if (current_xpos >= max_xpos) {
49 current_xpos = 0;
50 current_ypos++;
51 }
52 }
53 }
54}
55
56static struct console early_vga_console = {
57 .name = "earlyvga",
58 .write = early_vga_write,
59 .flags = CON_PRINTBUFFER,
60 .index = -1,
61};
62
63/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
64
65static int early_serial_base = 0x3f8; /* ttyS0 */
66
67#define XMTRDY 0x20
68
69#define DLAB 0x80
70
71#define TXR 0 /* Transmit register (WRITE) */
72#define RXR 0 /* Receive register (READ) */
73#define IER 1 /* Interrupt Enable */
74#define IIR 2 /* Interrupt ID */
75#define FCR 2 /* FIFO control */
76#define LCR 3 /* Line control */
77#define MCR 4 /* Modem control */
78#define LSR 5 /* Line Status */
79#define MSR 6 /* Modem Status */
80#define DLL 0 /* Divisor Latch Low */
81#define DLH 1 /* Divisor latch High */
82
83static int early_serial_putc(unsigned char ch)
84{
85 unsigned timeout = 0xffff;
86 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
87 cpu_relax();
88 outb(ch, early_serial_base + TXR);
89 return timeout ? 0 : -1;
90}
91
92static void early_serial_write(struct console *con, const char *s, unsigned n)
93{
94 while (*s && n-- > 0) {
95 if (*s == '\n')
96 early_serial_putc('\r');
97 early_serial_putc(*s);
98 s++;
99 }
100}
101
102#define DEFAULT_BAUD 9600
103
104static __init void early_serial_init(char *s)
105{
106 unsigned char c;
107 unsigned divisor;
108 unsigned baud = DEFAULT_BAUD;
109 char *e;
110
111 if (*s == ',')
112 ++s;
113
114 if (*s) {
115 unsigned port;
116 if (!strncmp(s,"0x",2)) {
117 early_serial_base = simple_strtoul(s, &e, 16);
118 } else {
119 static int bases[] = { 0x3f8, 0x2f8 };
120
121 if (!strncmp(s,"ttyS",4))
122 s += 4;
123 port = simple_strtoul(s, &e, 10);
124 if (port > 1 || s == e)
125 port = 0;
126 early_serial_base = bases[port];
127 }
128 s += strcspn(s, ",");
129 if (*s == ',')
130 s++;
131 }
132
133 outb(0x3, early_serial_base + LCR); /* 8n1 */
134 outb(0, early_serial_base + IER); /* no interrupt */
135 outb(0, early_serial_base + FCR); /* no fifo */
136 outb(0x3, early_serial_base + MCR); /* DTR + RTS */
137
138 if (*s) {
139 baud = simple_strtoul(s, &e, 0);
140 if (baud == 0 || s == e)
141 baud = DEFAULT_BAUD;
142 }
143
144 divisor = 115200 / baud;
145 c = inb(early_serial_base + LCR);
146 outb(c | DLAB, early_serial_base + LCR);
147 outb(divisor & 0xff, early_serial_base + DLL);
148 outb((divisor >> 8) & 0xff, early_serial_base + DLH);
149 outb(c & ~DLAB, early_serial_base + LCR);
150}
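/*
 * Editorial note -- worked example, not part of the original source:
 * the divisor programmed above is 115200 / baud, so
 *   115200 baud -> divisor 1
 *     9600 baud -> divisor 12 (the DEFAULT_BAUD fallback)
 * and it is written as DLL = divisor & 0xff, DLH = divisor >> 8 while the
 * DLAB bit is set in the line control register.
 */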
151
152static struct console early_serial_console = {
153 .name = "earlyser",
154 .write = early_serial_write,
155 .flags = CON_PRINTBUFFER,
156 .index = -1,
157};
158
159/* Console interface to a host file on AMD's SimNow! */
160
161static int simnow_fd;
162
163enum {
164 MAGIC1 = 0xBACCD00A,
165 MAGIC2 = 0xCA110000,
166 XOPEN = 5,
167 XWRITE = 4,
168};
169
170static noinline long simnow(long cmd, long a, long b, long c)
171{
172 long ret;
173 asm volatile("cpuid" :
174 "=a" (ret) :
175 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
176 return ret;
177}
178
179static void __init simnow_init(char *str)
180{
181 char *fn = "klog";
182 if (*str == '=')
183 fn = ++str;
184 /* error ignored */
185 simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
186}
187
188static void simnow_write(struct console *con, const char *s, unsigned n)
189{
190 simnow(XWRITE, simnow_fd, (unsigned long)s, n);
191}
192
193static struct console simnow_console = {
194 .name = "simnow",
195 .write = simnow_write,
196 .flags = CON_PRINTBUFFER,
197 .index = -1,
198};
199
200/* Direct interface for emergencies */
201struct console *early_console = &early_vga_console;
202static int early_console_initialized = 0;
203
204void early_printk(const char *fmt, ...)
205{
206 char buf[512];
207 int n;
208 va_list ap;
209
210 va_start(ap,fmt);
211 n = vscnprintf(buf,512,fmt,ap);
212 early_console->write(early_console,buf,n);
213 va_end(ap);
214}
215
216static int __initdata keep_early;
217
218static int __init setup_early_printk(char *buf)
219{
220 if (!buf)
221 return 0;
222
223 if (early_console_initialized)
224 return 0;
225 early_console_initialized = 1;
226
227 if (strstr(buf, "keep"))
228 keep_early = 1;
229
230 if (!strncmp(buf, "serial", 6)) {
231 early_serial_init(buf + 6);
232 early_console = &early_serial_console;
233 } else if (!strncmp(buf, "ttyS", 4)) {
234 early_serial_init(buf);
235 early_console = &early_serial_console;
236 } else if (!strncmp(buf, "vga", 3)
237 && SCREEN_INFO.orig_video_isVGA == 1) {
238 max_xpos = SCREEN_INFO.orig_video_cols;
239 max_ypos = SCREEN_INFO.orig_video_lines;
240 current_ypos = SCREEN_INFO.orig_y;
241 early_console = &early_vga_console;
242 } else if (!strncmp(buf, "simnow", 6)) {
243 simnow_init(buf + 6);
244 early_console = &simnow_console;
245 keep_early = 1;
246#ifdef CONFIG_HVC_XEN
247 } else if (!strncmp(buf, "xen", 3)) {
248 early_console = &xenboot_console;
249#endif
250 }
251
252 if (keep_early)
253 early_console->flags &= ~CON_BOOT;
254 else
255 early_console->flags |= CON_BOOT;
256 register_console(early_console);
257 return 0;
258}
259early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
new file mode 100644
index 000000000000..2452c6fbe992
--- /dev/null
+++ b/arch/x86/kernel/efi_32.c
@@ -0,0 +1,712 @@
1/*
2 * Extensible Firmware Interface
3 *
4 * Based on Extensible Firmware Interface Specification version 1.0
5 *
6 * Copyright (C) 1999 VA Linux Systems
7 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
8 * Copyright (C) 1999-2002 Hewlett-Packard Co.
9 * David Mosberger-Tang <davidm@hpl.hp.com>
10 * Stephane Eranian <eranian@hpl.hp.com>
11 *
12 * All EFI Runtime Services are not implemented yet as EFI only
13 * supports physical mode addressing on SoftSDV. This is to be fixed
14 * in a future version. --drummond 1999-07-20
15 *
16 * Implemented EFI runtime services and virtual mode calls. --davidm
17 *
18 * Goutham Rao: <goutham.rao@intel.com>
19 * Skip non-WB memory and ignore empty memory ranges.
20 */
21
22#include <linux/kernel.h>
23#include <linux/init.h>
24#include <linux/mm.h>
25#include <linux/types.h>
26#include <linux/time.h>
27#include <linux/spinlock.h>
28#include <linux/bootmem.h>
29#include <linux/ioport.h>
30#include <linux/module.h>
31#include <linux/efi.h>
32#include <linux/kexec.h>
33
34#include <asm/setup.h>
35#include <asm/io.h>
36#include <asm/page.h>
37#include <asm/pgtable.h>
38#include <asm/processor.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
41
42#define EFI_DEBUG 0
43#define PFX "EFI: "
44
45extern efi_status_t asmlinkage efi_call_phys(void *, ...);
46
47struct efi efi;
48EXPORT_SYMBOL(efi);
49static struct efi efi_phys;
50struct efi_memory_map memmap;
51
52/*
53 * We require an early boot_ioremap mapping mechanism initially
54 */
55extern void * boot_ioremap(unsigned long, unsigned long);
56
57/*
58 * To make EFI calls to its runtime services in physical addressing mode we
59 * need a prolog/epilog around each invocation to disable interrupts, to
60 * claim the EFI runtime service handler exclusively and to duplicate the
61 * kernel mapping into low virtual address space (0 - 3G).
62 */
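/*
 * Editorial note -- clarification based on the code below, not part of the
 * original source: the prolog temporarily copies the page-directory entries
 * for PAGE_OFFSET over the entries for virtual address 0, giving the
 * firmware 1:1 mappings for the physical addresses it uses, and the epilog
 * restores the saved entries; both flush the TLB afterwards.
 */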
63
64static unsigned long efi_rt_eflags;
65static DEFINE_SPINLOCK(efi_rt_lock);
66static pgd_t efi_bak_pg_dir_pointer[2];
67
68static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
69{
70 unsigned long cr4;
71 unsigned long temp;
72 struct Xgt_desc_struct gdt_descr;
73
74 spin_lock(&efi_rt_lock);
75 local_irq_save(efi_rt_eflags);
76
77 /*
78 * If I don't have PSE, I should just duplicate two entries in page
79 * directory. If I have PSE, I just need to duplicate one entry in
80 * page directory.
81 */
82 cr4 = read_cr4();
83
84 if (cr4 & X86_CR4_PSE) {
85 efi_bak_pg_dir_pointer[0].pgd =
86 swapper_pg_dir[pgd_index(0)].pgd;
87 swapper_pg_dir[0].pgd =
88 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
89 } else {
90 efi_bak_pg_dir_pointer[0].pgd =
91 swapper_pg_dir[pgd_index(0)].pgd;
92 efi_bak_pg_dir_pointer[1].pgd =
93 swapper_pg_dir[pgd_index(0x400000)].pgd;
94 swapper_pg_dir[pgd_index(0)].pgd =
95 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
96 temp = PAGE_OFFSET + 0x400000;
97 swapper_pg_dir[pgd_index(0x400000)].pgd =
98 swapper_pg_dir[pgd_index(temp)].pgd;
99 }
100
101 /*
102 * After the lock is released, the original page table is restored.
103 */
104 local_flush_tlb();
105
106 gdt_descr.address = __pa(get_cpu_gdt_table(0));
107 gdt_descr.size = GDT_SIZE - 1;
108 load_gdt(&gdt_descr);
109}
110
111static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
112{
113 unsigned long cr4;
114 struct Xgt_desc_struct gdt_descr;
115
116 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
117 gdt_descr.size = GDT_SIZE - 1;
118 load_gdt(&gdt_descr);
119
120 cr4 = read_cr4();
121
122 if (cr4 & X86_CR4_PSE) {
123 swapper_pg_dir[pgd_index(0)].pgd =
124 efi_bak_pg_dir_pointer[0].pgd;
125 } else {
126 swapper_pg_dir[pgd_index(0)].pgd =
127 efi_bak_pg_dir_pointer[0].pgd;
128 swapper_pg_dir[pgd_index(0x400000)].pgd =
129 efi_bak_pg_dir_pointer[1].pgd;
130 }
131
132 /*
133 * After the lock is released, the original page table is restored.
134 */
135 local_flush_tlb();
136
137 local_irq_restore(efi_rt_eflags);
138 spin_unlock(&efi_rt_lock);
139}
140
141static efi_status_t
142phys_efi_set_virtual_address_map(unsigned long memory_map_size,
143 unsigned long descriptor_size,
144 u32 descriptor_version,
145 efi_memory_desc_t *virtual_map)
146{
147 efi_status_t status;
148
149 efi_call_phys_prelog();
150 status = efi_call_phys(efi_phys.set_virtual_address_map,
151 memory_map_size, descriptor_size,
152 descriptor_version, virtual_map);
153 efi_call_phys_epilog();
154 return status;
155}
156
157static efi_status_t
158phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
159{
160 efi_status_t status;
161
162 efi_call_phys_prelog();
163 status = efi_call_phys(efi_phys.get_time, tm, tc);
164 efi_call_phys_epilog();
165 return status;
166}
167
168inline int efi_set_rtc_mmss(unsigned long nowtime)
169{
170 int real_seconds, real_minutes;
171 efi_status_t status;
172 efi_time_t eft;
173 efi_time_cap_t cap;
174
175 spin_lock(&efi_rt_lock);
176 status = efi.get_time(&eft, &cap);
177 spin_unlock(&efi_rt_lock);
178 if (status != EFI_SUCCESS)
179 panic("Ooops, efitime: can't read time!\n");
180 real_seconds = nowtime % 60;
181 real_minutes = nowtime / 60;
182
183 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
184 real_minutes += 30;
185 real_minutes %= 60;
186
187 eft.minute = real_minutes;
188 eft.second = real_seconds;
189
190 if (status != EFI_SUCCESS) {
191 printk("Ooops: efitime: can't read time!\n");
192 return -1;
193 }
194 return 0;
195}
196/*
197 * This is used during kernel init before runtime
198 * services have been remapped and also during suspend, therefore,
199 * we'll need to call both in physical and virtual modes.
200 */
201inline unsigned long efi_get_time(void)
202{
203 efi_status_t status;
204 efi_time_t eft;
205 efi_time_cap_t cap;
206
207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
215 if (status != EFI_SUCCESS)
216		printk("Oops: efitime: can't read time, status: 0x%lx\n", status);
217
218 return mktime(eft.year, eft.month, eft.day, eft.hour,
219 eft.minute, eft.second);
220}
221
222int is_available_memory(efi_memory_desc_t * md)
223{
224 if (!(md->attribute & EFI_MEMORY_WB))
225 return 0;
226
227 switch (md->type) {
228 case EFI_LOADER_CODE:
229 case EFI_LOADER_DATA:
230 case EFI_BOOT_SERVICES_CODE:
231 case EFI_BOOT_SERVICES_DATA:
232 case EFI_CONVENTIONAL_MEMORY:
233 return 1;
234 }
235 return 0;
236}
237
238/*
239 * We need to map the EFI memory map again after paging_init().
240 */
241void __init efi_map_memmap(void)
242{
243 memmap.map = NULL;
244
245 memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
246 (memmap.nr_map * memmap.desc_size));
247 if (memmap.map == NULL)
248 printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
249
250 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
251}
252
253#if EFI_DEBUG
254static void __init print_efi_memmap(void)
255{
256 efi_memory_desc_t *md;
257 void *p;
258 int i;
259
260 for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
261 md = p;
262 printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
263 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
264 i, md->type, md->attribute, md->phys_addr,
265 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
266 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
267 }
268}
269#endif /* EFI_DEBUG */
270
271/*
272 * Walks the EFI memory map and calls CALLBACK once for each EFI
273 * memory descriptor that has memory that is available for kernel use.
274 */
275void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
276{
277 int prev_valid = 0;
278 struct range {
279 unsigned long start;
280 unsigned long end;
281 } uninitialized_var(prev), curr;
282 efi_memory_desc_t *md;
283 unsigned long start, end;
284 void *p;
285
286 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
287 md = p;
288
289 if ((md->num_pages == 0) || (!is_available_memory(md)))
290 continue;
291
292 curr.start = md->phys_addr;
293 curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
294
295 if (!prev_valid) {
296 prev = curr;
297 prev_valid = 1;
298 } else {
299 if (curr.start < prev.start)
300 printk(KERN_INFO PFX "Unordered memory map\n");
301 if (prev.end == curr.start)
302 prev.end = curr.end;
303 else {
304 start =
305 (unsigned long) (PAGE_ALIGN(prev.start));
306 end = (unsigned long) (prev.end & PAGE_MASK);
307 if ((end > start)
308 && (*callback) (start, end, arg) < 0)
309 return;
310 prev = curr;
311 }
312 }
313 }
314 if (prev_valid) {
315 start = (unsigned long) PAGE_ALIGN(prev.start);
316 end = (unsigned long) (prev.end & PAGE_MASK);
317 if (end > start)
318 (*callback) (start, end, arg);
319 }
320}
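
A brief usage sketch (not part of the original file): a callback that tallies the free memory reported by the walker above. The parameter types mirror the unsigned long start/end values the walk passes to the callback; the names are illustrative only.

static unsigned long __initdata efi_free_bytes;

/* Accumulate the size of every available range the walk reports. */
static int __init add_free_range(unsigned long start, unsigned long end,
				 void *arg)
{
	*(unsigned long *)arg += end - start;
	return 0;			/* non-negative: keep walking */
}

static void __init count_efi_free_memory(void)
{
	efi_free_bytes = 0;
	efi_memmap_walk(add_free_range, &efi_free_bytes);
}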
321
322void __init efi_init(void)
323{
324 efi_config_table_t *config_tables;
325 efi_runtime_services_t *runtime;
326 efi_char16_t *c16;
327 char vendor[100] = "unknown";
328 unsigned long num_config_tables;
329 int i = 0;
330
331 memset(&efi, 0, sizeof(efi) );
332 memset(&efi_phys, 0, sizeof(efi_phys));
333
334 efi_phys.systab = EFI_SYSTAB;
335 memmap.phys_map = EFI_MEMMAP;
336 memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE;
337 memmap.desc_version = EFI_MEMDESC_VERSION;
338 memmap.desc_size = EFI_MEMDESC_SIZE;
339
340 efi.systab = (efi_system_table_t *)
341 boot_ioremap((unsigned long) efi_phys.systab,
342 sizeof(efi_system_table_t));
343 /*
344 * Verify the EFI Table
345 */
346 if (efi.systab == NULL)
347 printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
348 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
349 printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
350 if ((efi.systab->hdr.revision >> 16) == 0)
351 printk(KERN_ERR PFX "Warning: EFI system table version "
352 "%d.%02d, expected 1.00 or greater\n",
353 efi.systab->hdr.revision >> 16,
354 efi.systab->hdr.revision & 0xffff);
355
356 /*
357 * Grab some details from the system table
358 */
359 num_config_tables = efi.systab->nr_tables;
360 config_tables = (efi_config_table_t *)efi.systab->tables;
361 runtime = efi.systab->runtime;
362
363 /*
364 * Show what we know for posterity
365 */
366 c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
367 if (c16) {
368 for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
369 vendor[i] = *c16++;
370 vendor[i] = '\0';
371 } else
372 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
373
374	printk(KERN_INFO PFX "EFI v%u.%02u by %s\n",
375 efi.systab->hdr.revision >> 16,
376 efi.systab->hdr.revision & 0xffff, vendor);
377
378 /*
379 * Let's see what config tables the firmware passed to us.
380 */
381 config_tables = (efi_config_table_t *)
382 boot_ioremap((unsigned long) config_tables,
383 num_config_tables * sizeof(efi_config_table_t));
384
385 if (config_tables == NULL)
386 printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
387
388 efi.mps = EFI_INVALID_TABLE_ADDR;
389 efi.acpi = EFI_INVALID_TABLE_ADDR;
390 efi.acpi20 = EFI_INVALID_TABLE_ADDR;
391 efi.smbios = EFI_INVALID_TABLE_ADDR;
392 efi.sal_systab = EFI_INVALID_TABLE_ADDR;
393 efi.boot_info = EFI_INVALID_TABLE_ADDR;
394 efi.hcdp = EFI_INVALID_TABLE_ADDR;
395 efi.uga = EFI_INVALID_TABLE_ADDR;
396
397 for (i = 0; i < num_config_tables; i++) {
398 if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
399 efi.mps = config_tables[i].table;
400 printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
401 } else
402 if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
403 efi.acpi20 = config_tables[i].table;
404 printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
405 } else
406 if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
407 efi.acpi = config_tables[i].table;
408 printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
409 } else
410 if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
411 efi.smbios = config_tables[i].table;
412 printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
413 } else
414 if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
415 efi.hcdp = config_tables[i].table;
416 printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
417 } else
418 if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
419 efi.uga = config_tables[i].table;
420 printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
421 }
422 }
423 printk("\n");
424
425 /*
426 * Check out the runtime services table. We need to map
427 * the runtime services table so that we can grab the physical
428 * address of several of the EFI runtime functions, needed to
429 * set the firmware into virtual mode.
430 */
431
432 runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
433 runtime,
434 sizeof(efi_runtime_services_t));
435 if (runtime != NULL) {
436 /*
437 * We will only need *early* access to the following
438 * two EFI runtime services before set_virtual_address_map
439 * is invoked.
440 */
441 efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
442 efi_phys.set_virtual_address_map =
443 (efi_set_virtual_address_map_t *)
444 runtime->set_virtual_address_map;
445 } else
446 printk(KERN_ERR PFX "Could not map the runtime service table!\n");
447
448 /* Map the EFI memory map for use until paging_init() */
449 memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE);
450 if (memmap.map == NULL)
451 printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
452
453 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
454
455#if EFI_DEBUG
456 print_efi_memmap();
457#endif
458}
459
460static inline void __init check_range_for_systab(efi_memory_desc_t *md)
461{
462 if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
463 ((unsigned long)efi_phys.systab < md->phys_addr +
464 ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
465 unsigned long addr;
466
467 addr = md->virt_addr - md->phys_addr +
468 (unsigned long)efi_phys.systab;
469 efi.systab = (efi_system_table_t *)addr;
470 }
471}
472
473/*
474 * Wrap all the virtual calls in a way that forces the parameters on the stack.
475 */
476
477#define efi_call_virt(f, args...) \
478 ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
479
480static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
481{
482 return efi_call_virt(get_time, tm, tc);
483}
484
485static efi_status_t virt_efi_set_time (efi_time_t *tm)
486{
487 return efi_call_virt(set_time, tm);
488}
489
490static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled,
491 efi_bool_t *pending,
492 efi_time_t *tm)
493{
494 return efi_call_virt(get_wakeup_time, enabled, pending, tm);
495}
496
497static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled,
498 efi_time_t *tm)
499{
500 return efi_call_virt(set_wakeup_time, enabled, tm);
501}
502
503static efi_status_t virt_efi_get_variable (efi_char16_t *name,
504 efi_guid_t *vendor, u32 *attr,
505 unsigned long *data_size, void *data)
506{
507 return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
508}
509
510static efi_status_t virt_efi_get_next_variable (unsigned long *name_size,
511 efi_char16_t *name,
512 efi_guid_t *vendor)
513{
514 return efi_call_virt(get_next_variable, name_size, name, vendor);
515}
516
517static efi_status_t virt_efi_set_variable (efi_char16_t *name,
518 efi_guid_t *vendor,
519 unsigned long attr,
520 unsigned long data_size, void *data)
521{
522 return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
523}
524
525static efi_status_t virt_efi_get_next_high_mono_count (u32 *count)
526{
527 return efi_call_virt(get_next_high_mono_count, count);
528}
529
530static void virt_efi_reset_system (int reset_type, efi_status_t status,
531 unsigned long data_size,
532 efi_char16_t *data)
533{
534 efi_call_virt(reset_system, reset_type, status, data_size, data);
535}
536
537/*
538 * This function will switch the EFI runtime services to virtual mode.
539 * Essentially, look through the EFI memmap and map every region that
540 * has the runtime attribute bit set in its memory descriptor and update
541 * that memory descriptor with the virtual address obtained from ioremap().
542 * This enables the runtime services to be called without having to
543 * thunk back into physical mode for every invocation.
544 */
545
546void __init efi_enter_virtual_mode(void)
547{
548 efi_memory_desc_t *md;
549 efi_status_t status;
550 void *p;
551
552 efi.systab = NULL;
553
554 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
555 md = p;
556
557 if (!(md->attribute & EFI_MEMORY_RUNTIME))
558 continue;
559
560 md->virt_addr = (unsigned long)ioremap(md->phys_addr,
561 md->num_pages << EFI_PAGE_SHIFT);
562 if (!(unsigned long)md->virt_addr) {
563 printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
564 (unsigned long)md->phys_addr);
565 }
566 /* update the virtual address of the EFI system table */
567 check_range_for_systab(md);
568 }
569
570 BUG_ON(!efi.systab);
571
572 status = phys_efi_set_virtual_address_map(
573 memmap.desc_size * memmap.nr_map,
574 memmap.desc_size,
575 memmap.desc_version,
576 memmap.phys_map);
577
578 if (status != EFI_SUCCESS) {
579 printk (KERN_ALERT "You are screwed! "
580 "Unable to switch EFI into virtual mode "
581 "(status=%lx)\n", status);
582 panic("EFI call to SetVirtualAddressMap() failed!");
583 }
584
585 /*
586 * Now that EFI is in virtual mode, update the function
587 * pointers in the runtime service table to the new virtual addresses.
588 */
589
590 efi.get_time = virt_efi_get_time;
591 efi.set_time = virt_efi_set_time;
592 efi.get_wakeup_time = virt_efi_get_wakeup_time;
593 efi.set_wakeup_time = virt_efi_set_wakeup_time;
594 efi.get_variable = virt_efi_get_variable;
595 efi.get_next_variable = virt_efi_get_next_variable;
596 efi.set_variable = virt_efi_set_variable;
597 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
598 efi.reset_system = virt_efi_reset_system;
599}
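
Once the virtual-mode pointers above are installed, other kernel code can call the runtime services directly through the efi structure. A hypothetical sketch (the helper name, variable name and vendor GUID are illustrative, using the get_variable signature shown above):

static efi_status_t example_read_boot_current(u16 *value)
{
	static efi_char16_t name[] = {
		'B', 'o', 'o', 't', 'C', 'u', 'r', 'r', 'e', 'n', 't', 0
	};
	efi_guid_t vendor = EFI_GUID(0x8be4df61, 0x93ca, 0x11d2,
				     0xaa, 0x0d, 0x00, 0xe0, 0x98,
				     0x03, 0x2b, 0x8c);
	unsigned long size = sizeof(*value);
	u32 attr;

	return efi.get_variable(name, &vendor, &attr, &size, value);
}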
600
601void __init
602efi_initialize_iomem_resources(struct resource *code_resource,
603 struct resource *data_resource)
604{
605 struct resource *res;
606 efi_memory_desc_t *md;
607 void *p;
608
609 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
610 md = p;
611
612 if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
613 0x100000000ULL)
614 continue;
615 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
616 switch (md->type) {
617 case EFI_RESERVED_TYPE:
618 res->name = "Reserved Memory";
619 break;
620 case EFI_LOADER_CODE:
621 res->name = "Loader Code";
622 break;
623 case EFI_LOADER_DATA:
624 res->name = "Loader Data";
625 break;
626 case EFI_BOOT_SERVICES_DATA:
627 res->name = "BootServices Data";
628 break;
629 case EFI_BOOT_SERVICES_CODE:
630 res->name = "BootServices Code";
631 break;
632 case EFI_RUNTIME_SERVICES_CODE:
633 res->name = "Runtime Service Code";
634 break;
635 case EFI_RUNTIME_SERVICES_DATA:
636 res->name = "Runtime Service Data";
637 break;
638 case EFI_CONVENTIONAL_MEMORY:
639 res->name = "Conventional Memory";
640 break;
641 case EFI_UNUSABLE_MEMORY:
642 res->name = "Unusable Memory";
643 break;
644 case EFI_ACPI_RECLAIM_MEMORY:
645 res->name = "ACPI Reclaim";
646 break;
647 case EFI_ACPI_MEMORY_NVS:
648 res->name = "ACPI NVS";
649 break;
650 case EFI_MEMORY_MAPPED_IO:
651 res->name = "Memory Mapped IO";
652 break;
653 case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
654 res->name = "Memory Mapped IO Port Space";
655 break;
656 default:
657 res->name = "Reserved";
658 break;
659 }
660 res->start = md->phys_addr;
661 res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
662 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
663 if (request_resource(&iomem_resource, res) < 0)
664 printk(KERN_ERR PFX "Failed to allocate res %s : "
665 "0x%llx-0x%llx\n", res->name,
666 (unsigned long long)res->start,
667 (unsigned long long)res->end);
668 /*
669 * We don't know which region contains kernel data so we try
670 * it repeatedly and let the resource manager test it.
671 */
672 if (md->type == EFI_CONVENTIONAL_MEMORY) {
673 request_resource(res, code_resource);
674 request_resource(res, data_resource);
675#ifdef CONFIG_KEXEC
676 request_resource(res, &crashk_res);
677#endif
678 }
679 }
680}
681
682/*
683 * Convenience functions to obtain memory types and attributes
684 */
685
686u32 efi_mem_type(unsigned long phys_addr)
687{
688 efi_memory_desc_t *md;
689 void *p;
690
691 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
692 md = p;
693 if ((md->phys_addr <= phys_addr) && (phys_addr <
694		    (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT))))
695 return md->type;
696 }
697 return 0;
698}
699
700u64 efi_mem_attributes(unsigned long phys_addr)
701{
702 efi_memory_desc_t *md;
703 void *p;
704
705 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
706 md = p;
707 if ((md->phys_addr <= phys_addr) && (phys_addr <
708		    (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT))))
709 return md->attribute;
710 }
711 return 0;
712}
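
A small helper built on the two lookups above, shown only as an illustration (it is not part of the file): test whether a physical address lies in ordinary write-back conventional memory, using the same EFI_CONVENTIONAL_MEMORY and EFI_MEMORY_WB constants used earlier in this file.

static int phys_addr_is_wb_ram(unsigned long phys_addr)
{
	return efi_mem_type(phys_addr) == EFI_CONVENTIONAL_MEMORY &&
	       (efi_mem_attributes(phys_addr) & EFI_MEMORY_WB);
}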
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
new file mode 100644
index 000000000000..ef00bb77d7e4
--- /dev/null
+++ b/arch/x86/kernel/efi_stub_32.S
@@ -0,0 +1,122 @@
1/*
2 * EFI call stub for IA32.
3 *
4 * This stub allows us to make EFI calls in physical mode with interrupts
5 * turned off.
6 */
7
8#include <linux/linkage.h>
9#include <asm/page.h>
10
11/*
12 * efi_call_phys(void *, ...) is a variadic function.
13 * All callers of this function ensure that every parameter is 4 bytes wide.
14 */
15
16/*
17 * In the gcc calling convention EBX, ESP, EBP, ESI and EDI are callee-saved.
18 * Save all of them on entry and restore them on exit, no matter how many we
19 * actually use, because we cannot assume that the EFI runtime service
20 * functions follow the gcc calling convention as well.
21 */
22
23.text
24ENTRY(efi_call_phys)
25 /*
26 * 0. This function is only ever called from the Linux kernel, so CS is
27 * 0x0010 and DS/SS are 0x0018. EFI uses the same selector values and
28 * the corresponding GDT entries are identical, so the segment registers
29 * and the GDT contents are left untouched; only the GDT base register
30 * is switched in the prelog and epilog.
31 */
32
33 /*
34 * 1. We are currently running with EIP = <physical address> + PAGE_OFFSET.
35 * To switch smoothly from virtual mode to flat (physical) mode, the
36 * mapping of low virtual memory is created in the prelog and removed
37 * again in the epilog.
38 */
39 movl $1f, %edx
40 subl $__PAGE_OFFSET, %edx
41 jmp *%edx
421:
43
44 /*
45 * 2. Now on the top of stack is the return
46 * address in the caller of efi_call_phys(), then parameter 1,
47 * parameter 2, ..., param n. To make things easy, we save the return
48 * address of efi_call_phys in a global variable.
49 */
50 popl %edx
51 movl %edx, saved_return_addr
52 /* get the function pointer into ECX*/
53 popl %ecx
54 movl %ecx, efi_rt_function_ptr
55 movl $2f, %edx
56 subl $__PAGE_OFFSET, %edx
57 pushl %edx
58
59 /*
60 * 3. Clear PG bit in %CR0.
61 */
62 movl %cr0, %edx
63 andl $0x7fffffff, %edx
64 movl %edx, %cr0
65 jmp 1f
661:
67
68 /*
69 * 4. Adjust stack pointer.
70 */
71 subl $__PAGE_OFFSET, %esp
72
73 /*
74 * 5. Call the physical function.
75 */
76 jmp *%ecx
77
782:
79 /*
80 * 6. When the EFI runtime service returns, control arrives at the
81 * following instruction. Readjust the stack pointer first.
82 */
83 addl $__PAGE_OFFSET, %esp
84
85 /*
86 * 7. Restore PG bit
87 */
88 movl %cr0, %edx
89 orl $0x80000000, %edx
90 movl %edx, %cr0
91 jmp 1f
921:
93 /*
94 * 8. Now switch back from flat mode to virtual mode by jumping to a
95 * label whose address already includes PAGE_OFFSET.
96 */
97 movl $1f, %edx
98 jmp *%edx
991:
100
101 /*
102 * 9. Balance the stack. EAX contains the return value, so take care
103 * not to clobber it.
104 */
105 leal efi_rt_function_ptr, %edx
106 movl (%edx), %ecx
107 pushl %ecx
108
109 /*
110 * 10. Push the saved return address onto the stack and return.
111 */
112 leal saved_return_addr, %edx
113 movl (%edx), %ecx
114 pushl %ecx
115 ret
116.previous
117
118.data
119saved_return_addr:
120 .long 0
121efi_rt_function_ptr:
122 .long 0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
new file mode 100644
index 000000000000..290b7bc82da3
--- /dev/null
+++ b/arch/x86/kernel/entry_32.S
@@ -0,0 +1,1112 @@
1/*
2 * linux/arch/i386/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7/*
8 * entry.S contains the system-call and fault low-level handling routines.
9 * This also contains the timer-interrupt handler, as well as all interrupts
10 * and faults that can result in a task-switch.
11 *
12 * NOTE: This code handles signal-recognition, which happens every time
13 * after a timer-interrupt and after each system call.
14 *
15 * I changed all the .align's to 4 (16 byte alignment), as that's faster
16 * on a 486.
17 *
18 * Stack layout in 'syscall_exit':
19 * ptrace needs to have all regs on the stack.
20 * if the order here is changed, it needs to be
21 * updated in fork.c:copy_process, signal.c:do_signal,
22 * ptrace.c and ptrace.h
23 *
24 * 0(%esp) - %ebx
25 * 4(%esp) - %ecx
26 * 8(%esp) - %edx
27 * C(%esp) - %esi
28 * 10(%esp) - %edi
29 * 14(%esp) - %ebp
30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds
32 * 20(%esp) - %es
33 * 24(%esp) - %fs
34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %eip
36 * 30(%esp) - %cs
37 * 34(%esp) - %eflags
38 * 38(%esp) - %oldesp
39 * 3C(%esp) - %oldss
40 *
41 * "current" is in register %ebx during any slow entries.
42 */
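
For reference only, a C sketch mirroring the stack layout listed above; it is assumed to correspond to the struct pt_regs definition of this kernel generation (asm/ptrace.h is authoritative) and is shown purely to make the offsets easier to follow.

struct pt_regs {		/* offsets from the comment above */
	long ebx;		/* 0x00 */
	long ecx;		/* 0x04 */
	long edx;		/* 0x08 */
	long esi;		/* 0x0C */
	long edi;		/* 0x10 */
	long ebp;		/* 0x14 */
	long eax;		/* 0x18 */
	int  xds;		/* 0x1C */
	int  xes;		/* 0x20 */
	int  xfs;		/* 0x24 */
	long orig_eax;		/* 0x28 */
	long eip;		/* 0x2C */
	int  xcs;		/* 0x30 */
	long eflags;		/* 0x34 */
	long esp;		/* 0x38 */
	int  xss;		/* 0x3C */
};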
43
44#include <linux/linkage.h>
45#include <asm/thread_info.h>
46#include <asm/irqflags.h>
47#include <asm/errno.h>
48#include <asm/segment.h>
49#include <asm/smp.h>
50#include <asm/page.h>
51#include <asm/desc.h>
52#include <asm/percpu.h>
53#include <asm/dwarf2.h>
54#include "irq_vectors.h"
55
56/*
57 * We use macros for low-level operations which need to be overridden
58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
65 * Allowing a register to be clobbered can shrink the paravirt replacement
66 * enough to patch inline, increasing performance.
67 */
68
69#define nr_syscalls ((syscall_table_size)/4)
70
71CF_MASK = 0x00000001
72TF_MASK = 0x00000100
73IF_MASK = 0x00000200
74DF_MASK = 0x00000400
75NT_MASK = 0x00004000
76VM_MASK = 0x00020000
77
78#ifdef CONFIG_PREEMPT
79#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
80#else
81#define preempt_stop(clobbers)
82#define resume_kernel restore_nocheck
83#endif
84
85.macro TRACE_IRQS_IRET
86#ifdef CONFIG_TRACE_IRQFLAGS
87 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
88 jz 1f
89 TRACE_IRQS_ON
901:
91#endif
92.endm
93
94#ifdef CONFIG_VM86
95#define resume_userspace_sig check_userspace
96#else
97#define resume_userspace_sig resume_userspace
98#endif
99
100#define SAVE_ALL \
101 cld; \
102 pushl %fs; \
103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET fs, 0;*/\
105 pushl %es; \
106 CFI_ADJUST_CFA_OFFSET 4;\
107 /*CFI_REL_OFFSET es, 0;*/\
108 pushl %ds; \
109 CFI_ADJUST_CFA_OFFSET 4;\
110 /*CFI_REL_OFFSET ds, 0;*/\
111 pushl %eax; \
112 CFI_ADJUST_CFA_OFFSET 4;\
113 CFI_REL_OFFSET eax, 0;\
114 pushl %ebp; \
115 CFI_ADJUST_CFA_OFFSET 4;\
116 CFI_REL_OFFSET ebp, 0;\
117 pushl %edi; \
118 CFI_ADJUST_CFA_OFFSET 4;\
119 CFI_REL_OFFSET edi, 0;\
120 pushl %esi; \
121 CFI_ADJUST_CFA_OFFSET 4;\
122 CFI_REL_OFFSET esi, 0;\
123 pushl %edx; \
124 CFI_ADJUST_CFA_OFFSET 4;\
125 CFI_REL_OFFSET edx, 0;\
126 pushl %ecx; \
127 CFI_ADJUST_CFA_OFFSET 4;\
128 CFI_REL_OFFSET ecx, 0;\
129 pushl %ebx; \
130 CFI_ADJUST_CFA_OFFSET 4;\
131 CFI_REL_OFFSET ebx, 0;\
132 movl $(__USER_DS), %edx; \
133 movl %edx, %ds; \
134 movl %edx, %es; \
135 movl $(__KERNEL_PERCPU), %edx; \
136 movl %edx, %fs
137
138#define RESTORE_INT_REGS \
139 popl %ebx; \
140 CFI_ADJUST_CFA_OFFSET -4;\
141 CFI_RESTORE ebx;\
142 popl %ecx; \
143 CFI_ADJUST_CFA_OFFSET -4;\
144 CFI_RESTORE ecx;\
145 popl %edx; \
146 CFI_ADJUST_CFA_OFFSET -4;\
147 CFI_RESTORE edx;\
148 popl %esi; \
149 CFI_ADJUST_CFA_OFFSET -4;\
150 CFI_RESTORE esi;\
151 popl %edi; \
152 CFI_ADJUST_CFA_OFFSET -4;\
153 CFI_RESTORE edi;\
154 popl %ebp; \
155 CFI_ADJUST_CFA_OFFSET -4;\
156 CFI_RESTORE ebp;\
157 popl %eax; \
158 CFI_ADJUST_CFA_OFFSET -4;\
159 CFI_RESTORE eax
160
161#define RESTORE_REGS \
162 RESTORE_INT_REGS; \
1631: popl %ds; \
164 CFI_ADJUST_CFA_OFFSET -4;\
165 /*CFI_RESTORE ds;*/\
1662: popl %es; \
167 CFI_ADJUST_CFA_OFFSET -4;\
168 /*CFI_RESTORE es;*/\
1693: popl %fs; \
170 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE fs;*/\
172.pushsection .fixup,"ax"; \
1734: movl $0,(%esp); \
174 jmp 1b; \
1755: movl $0,(%esp); \
176 jmp 2b; \
1776: movl $0,(%esp); \
178 jmp 3b; \
179.section __ex_table,"a";\
180 .align 4; \
181 .long 1b,4b; \
182 .long 2b,5b; \
183 .long 3b,6b; \
184.popsection
185
186#define RING0_INT_FRAME \
187 CFI_STARTPROC simple;\
188 CFI_SIGNAL_FRAME;\
189 CFI_DEF_CFA esp, 3*4;\
190 /*CFI_OFFSET cs, -2*4;*/\
191 CFI_OFFSET eip, -3*4
192
193#define RING0_EC_FRAME \
194 CFI_STARTPROC simple;\
195 CFI_SIGNAL_FRAME;\
196 CFI_DEF_CFA esp, 4*4;\
197 /*CFI_OFFSET cs, -2*4;*/\
198 CFI_OFFSET eip, -3*4
199
200#define RING0_PTREGS_FRAME \
201 CFI_STARTPROC simple;\
202 CFI_SIGNAL_FRAME;\
203 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
204 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
205 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
206 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
207 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
208 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
209 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
210 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
211 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
212 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
213 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
214 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
215
216ENTRY(ret_from_fork)
217 CFI_STARTPROC
218 pushl %eax
219 CFI_ADJUST_CFA_OFFSET 4
220 call schedule_tail
221 GET_THREAD_INFO(%ebp)
222 popl %eax
223 CFI_ADJUST_CFA_OFFSET -4
224 pushl $0x0202 # Reset kernel eflags
225 CFI_ADJUST_CFA_OFFSET 4
226 popfl
227 CFI_ADJUST_CFA_OFFSET -4
228 jmp syscall_exit
229 CFI_ENDPROC
230END(ret_from_fork)
231
232/*
233 * Return to user mode is not as complex as all this looks,
234 * but we want the default path for a system call return to
235 * go as quickly as possible which is why some of this is
236 * less clear than it otherwise should be.
237 */
238
239 # userspace resumption stub bypassing syscall exit tracing
240 ALIGN
241 RING0_PTREGS_FRAME
242ret_from_exception:
243 preempt_stop(CLBR_ANY)
244ret_from_intr:
245 GET_THREAD_INFO(%ebp)
246check_userspace:
247 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
248 movb PT_CS(%esp), %al
249 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
250 cmpl $USER_RPL, %eax
251 jb resume_kernel # not returning to v8086 or userspace
252
253ENTRY(resume_userspace)
254 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
255 # setting need_resched or sigpending
256 # between sampling and the iret
257 movl TI_flags(%ebp), %ecx
258 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
259 # int/exception return?
260 jne work_pending
261 jmp restore_all
262END(ret_from_exception)
263
264#ifdef CONFIG_PREEMPT
265ENTRY(resume_kernel)
266 DISABLE_INTERRUPTS(CLBR_ANY)
267 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
268 jnz restore_nocheck
269need_resched:
270 movl TI_flags(%ebp), %ecx # need_resched set ?
271 testb $_TIF_NEED_RESCHED, %cl
272 jz restore_all
273 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
274 jz restore_all
275 call preempt_schedule_irq
276 jmp need_resched
277END(resume_kernel)
278#endif
279 CFI_ENDPROC
280
281/* SYSENTER_RETURN points to after the "sysenter" instruction in
282 the vsyscall page. See vsyscall-sysenter.S, which defines the symbol. */
283
284 # sysenter call handler stub
285ENTRY(sysenter_entry)
286 CFI_STARTPROC simple
287 CFI_SIGNAL_FRAME
288 CFI_DEF_CFA esp, 0
289 CFI_REGISTER esp, ebp
290 movl TSS_sysenter_esp0(%esp),%esp
291sysenter_past_esp:
292 /*
293 * No need to follow this irqs on/off section: the syscall
294 * disabled irqs and here we enable it straight after entry:
295 */
296 ENABLE_INTERRUPTS(CLBR_NONE)
297 pushl $(__USER_DS)
298 CFI_ADJUST_CFA_OFFSET 4
299 /*CFI_REL_OFFSET ss, 0*/
300 pushl %ebp
301 CFI_ADJUST_CFA_OFFSET 4
302 CFI_REL_OFFSET esp, 0
303 pushfl
304 CFI_ADJUST_CFA_OFFSET 4
305 pushl $(__USER_CS)
306 CFI_ADJUST_CFA_OFFSET 4
307 /*CFI_REL_OFFSET cs, 0*/
308 /*
309 * Push current_thread_info()->sysenter_return to the stack.
310 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
311 * pushed above; +8 corresponds to copy_thread's esp0 setting.
312 */
313 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
314 CFI_ADJUST_CFA_OFFSET 4
315 CFI_REL_OFFSET eip, 0
316
317/*
318 * Load the potential sixth argument from user stack.
319 * Careful about security.
320 */
321 cmpl $__PAGE_OFFSET-3,%ebp
322 jae syscall_fault
3231: movl (%ebp),%ebp
324.section __ex_table,"a"
325 .align 4
326 .long 1b,syscall_fault
327.previous
328
329 pushl %eax
330 CFI_ADJUST_CFA_OFFSET 4
331 SAVE_ALL
332 GET_THREAD_INFO(%ebp)
333
334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
335 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
336 jnz syscall_trace_entry
337 cmpl $(nr_syscalls), %eax
338 jae syscall_badsys
339 call *sys_call_table(,%eax,4)
340 movl %eax,PT_EAX(%esp)
341 DISABLE_INTERRUPTS(CLBR_ANY)
342 TRACE_IRQS_OFF
343 movl TI_flags(%ebp), %ecx
344 testw $_TIF_ALLWORK_MASK, %cx
345 jne syscall_exit_work
346/* if something modifies registers it must also disable sysexit */
347 movl PT_EIP(%esp), %edx
348 movl PT_OLDESP(%esp), %ecx
349 xorl %ebp,%ebp
350 TRACE_IRQS_ON
3511: mov PT_FS(%esp), %fs
352 ENABLE_INTERRUPTS_SYSEXIT
353 CFI_ENDPROC
354.pushsection .fixup,"ax"
3552: movl $0,PT_FS(%esp)
356 jmp 1b
357.section __ex_table,"a"
358 .align 4
359 .long 1b,2b
360.popsection
361ENDPROC(sysenter_entry)
362
363 # system call handler stub
364ENTRY(system_call)
365 RING0_INT_FRAME # can't unwind into user space anyway
366 pushl %eax # save orig_eax
367 CFI_ADJUST_CFA_OFFSET 4
368 SAVE_ALL
369 GET_THREAD_INFO(%ebp)
370 # system call tracing in operation / emulation
371 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
372 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
373 jnz syscall_trace_entry
374 cmpl $(nr_syscalls), %eax
375 jae syscall_badsys
376syscall_call:
377 call *sys_call_table(,%eax,4)
378 movl %eax,PT_EAX(%esp) # store the return value
379syscall_exit:
380 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
381 # setting need_resched or sigpending
382 # between sampling and the iret
383 TRACE_IRQS_OFF
384 testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
385 jz no_singlestep
386 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
387no_singlestep:
388 movl TI_flags(%ebp), %ecx
389 testw $_TIF_ALLWORK_MASK, %cx # current->work
390 jne syscall_exit_work
391
392restore_all:
393 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
394 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
395 # are returning to the kernel.
396 # See comments in process.c:copy_thread() for details.
397 movb PT_OLDSS(%esp), %ah
398 movb PT_CS(%esp), %al
399 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
400 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
401 CFI_REMEMBER_STATE
402 je ldt_ss # returning to user-space with LDT SS
403restore_nocheck:
404 TRACE_IRQS_IRET
405restore_nocheck_notrace:
406 RESTORE_REGS
407 addl $4, %esp # skip orig_eax/error_code
408 CFI_ADJUST_CFA_OFFSET -4
4091: INTERRUPT_RETURN
410.section .fixup,"ax"
411iret_exc:
412 pushl $0 # no error code
413 pushl $do_iret_error
414 jmp error_code
415.previous
416.section __ex_table,"a"
417 .align 4
418 .long 1b,iret_exc
419.previous
420
421 CFI_RESTORE_STATE
422ldt_ss:
423 larl PT_OLDSS(%esp), %eax
424 jnz restore_nocheck
425 testl $0x00400000, %eax # returning to 32bit stack?
426	jnz restore_nocheck		# all right, normal return
427
428#ifdef CONFIG_PARAVIRT
429 /*
430 * The kernel can't run on a non-flat stack if paravirt mode
431 * is active. Rather than try to fixup the high bits of
432 * ESP, bypass this code entirely. This may break DOSemu
433 * and/or Wine support in a paravirt VM, although the option
434 * is still available to implement the setting of the high
435 * 16-bits in the INTERRUPT_RETURN paravirt-op.
436 */
437 cmpl $0, paravirt_ops+PARAVIRT_enabled
438 jne restore_nocheck
439#endif
440
441 /* If returning to userspace with 16bit stack,
442 * try to fix the higher word of ESP, as the CPU
443 * won't restore it.
444 * This is an "official" bug of all the x86-compatible
445 * CPUs, which we can try to work around to make
446 * dosemu and wine happy. */
447 movl PT_OLDESP(%esp), %eax
448 movl %esp, %edx
449 call patch_espfix_desc
450 pushl $__ESPFIX_SS
451 CFI_ADJUST_CFA_OFFSET 4
452 pushl %eax
453 CFI_ADJUST_CFA_OFFSET 4
454 DISABLE_INTERRUPTS(CLBR_EAX)
455 TRACE_IRQS_OFF
456 lss (%esp), %esp
457 CFI_ADJUST_CFA_OFFSET -8
458 jmp restore_nocheck
459 CFI_ENDPROC
460ENDPROC(system_call)
461
462 # perform work that needs to be done immediately before resumption
463 ALIGN
464 RING0_PTREGS_FRAME # can't unwind into user space anyway
465work_pending:
466 testb $_TIF_NEED_RESCHED, %cl
467 jz work_notifysig
468work_resched:
469 call schedule
470 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
471 # setting need_resched or sigpending
472 # between sampling and the iret
473 TRACE_IRQS_OFF
474 movl TI_flags(%ebp), %ecx
475 andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
476 # than syscall tracing?
477 jz restore_all
478 testb $_TIF_NEED_RESCHED, %cl
479 jnz work_resched
480
481work_notifysig: # deal with pending signals and
482 # notify-resume requests
483#ifdef CONFIG_VM86
484 testl $VM_MASK, PT_EFLAGS(%esp)
485 movl %esp, %eax
486 jne work_notifysig_v86 # returning to kernel-space or
487 # vm86-space
488 xorl %edx, %edx
489 call do_notify_resume
490 jmp resume_userspace_sig
491
492 ALIGN
493work_notifysig_v86:
494 pushl %ecx # save ti_flags for do_notify_resume
495 CFI_ADJUST_CFA_OFFSET 4
496 call save_v86_state # %eax contains pt_regs pointer
497 popl %ecx
498 CFI_ADJUST_CFA_OFFSET -4
499 movl %eax, %esp
500#else
501 movl %esp, %eax
502#endif
503 xorl %edx, %edx
504 call do_notify_resume
505 jmp resume_userspace_sig
506END(work_pending)
507
508 # perform syscall exit tracing
509 ALIGN
510syscall_trace_entry:
511 movl $-ENOSYS,PT_EAX(%esp)
512 movl %esp, %eax
513 xorl %edx,%edx
514 call do_syscall_trace
515 cmpl $0, %eax
516 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
517 # so must skip actual syscall
518 movl PT_ORIG_EAX(%esp), %eax
519 cmpl $(nr_syscalls), %eax
520 jnae syscall_call
521 jmp syscall_exit
522END(syscall_trace_entry)
523
524 # perform syscall exit tracing
525 ALIGN
526syscall_exit_work:
527 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
528 jz work_pending
529 TRACE_IRQS_ON
530 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
531 # schedule() instead
532 movl %esp, %eax
533 movl $1, %edx
534 call do_syscall_trace
535 jmp resume_userspace
536END(syscall_exit_work)
537 CFI_ENDPROC
538
539 RING0_INT_FRAME # can't unwind into user space anyway
540syscall_fault:
541 pushl %eax # save orig_eax
542 CFI_ADJUST_CFA_OFFSET 4
543 SAVE_ALL
544 GET_THREAD_INFO(%ebp)
545 movl $-EFAULT,PT_EAX(%esp)
546 jmp resume_userspace
547END(syscall_fault)
548
549syscall_badsys:
550 movl $-ENOSYS,PT_EAX(%esp)
551 jmp resume_userspace
552END(syscall_badsys)
553 CFI_ENDPROC
554
555#define FIXUP_ESPFIX_STACK \
556	/* since we are on the wrong stack, we cannot do this in C :( */ \
557 PER_CPU(gdt_page, %ebx); \
558 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
559 addl %esp, %eax; \
560 pushl $__KERNEL_DS; \
561 CFI_ADJUST_CFA_OFFSET 4; \
562 pushl %eax; \
563 CFI_ADJUST_CFA_OFFSET 4; \
564 lss (%esp), %esp; \
565 CFI_ADJUST_CFA_OFFSET -8;
566#define UNWIND_ESPFIX_STACK \
567 movl %ss, %eax; \
568 /* see if on espfix stack */ \
569 cmpw $__ESPFIX_SS, %ax; \
570 jne 27f; \
571 movl $__KERNEL_DS, %eax; \
572 movl %eax, %ds; \
573 movl %eax, %es; \
574 /* switch to normal stack */ \
575 FIXUP_ESPFIX_STACK; \
57627:;
577
578/*
579 * Build the entry stubs and pointer table with
580 * some assembler magic.
581 */
582.data
583ENTRY(interrupt)
584.text
585
586ENTRY(irq_entries_start)
587 RING0_INT_FRAME
588vector=0
589.rept NR_IRQS
590 ALIGN
591 .if vector
592 CFI_ADJUST_CFA_OFFSET -4
593 .endif
5941: pushl $~(vector)
595 CFI_ADJUST_CFA_OFFSET 4
596 jmp common_interrupt
597 .previous
598 .long 1b
599 .text
600vector=vector+1
601.endr
602END(irq_entries_start)
603
604.previous
605END(interrupt)
606.previous
607
608/*
609 * the CPU automatically disables interrupts when executing an IRQ vector,
610 * so IRQ-flags tracing has to follow that:
611 */
612 ALIGN
613common_interrupt:
614 SAVE_ALL
615 TRACE_IRQS_OFF
616 movl %esp,%eax
617 call do_IRQ
618 jmp ret_from_intr
619ENDPROC(common_interrupt)
620 CFI_ENDPROC
621
622#define BUILD_INTERRUPT(name, nr) \
623ENTRY(name) \
624 RING0_INT_FRAME; \
625 pushl $~(nr); \
626 CFI_ADJUST_CFA_OFFSET 4; \
627 SAVE_ALL; \
628 TRACE_IRQS_OFF \
629 movl %esp,%eax; \
630 call smp_##name; \
631 jmp ret_from_intr; \
632 CFI_ENDPROC; \
633ENDPROC(name)
634
635/* The include is where all of the SMP etc. interrupts come from */
636#include "entry_arch.h"
637
638KPROBE_ENTRY(page_fault)
639 RING0_EC_FRAME
640 pushl $do_page_fault
641 CFI_ADJUST_CFA_OFFSET 4
642 ALIGN
643error_code:
644 /* the function address is in %fs's slot on the stack */
645 pushl %es
646 CFI_ADJUST_CFA_OFFSET 4
647 /*CFI_REL_OFFSET es, 0*/
648 pushl %ds
649 CFI_ADJUST_CFA_OFFSET 4
650 /*CFI_REL_OFFSET ds, 0*/
651 pushl %eax
652 CFI_ADJUST_CFA_OFFSET 4
653 CFI_REL_OFFSET eax, 0
654 pushl %ebp
655 CFI_ADJUST_CFA_OFFSET 4
656 CFI_REL_OFFSET ebp, 0
657 pushl %edi
658 CFI_ADJUST_CFA_OFFSET 4
659 CFI_REL_OFFSET edi, 0
660 pushl %esi
661 CFI_ADJUST_CFA_OFFSET 4
662 CFI_REL_OFFSET esi, 0
663 pushl %edx
664 CFI_ADJUST_CFA_OFFSET 4
665 CFI_REL_OFFSET edx, 0
666 pushl %ecx
667 CFI_ADJUST_CFA_OFFSET 4
668 CFI_REL_OFFSET ecx, 0
669 pushl %ebx
670 CFI_ADJUST_CFA_OFFSET 4
671 CFI_REL_OFFSET ebx, 0
672 cld
673 pushl %fs
674 CFI_ADJUST_CFA_OFFSET 4
675 /*CFI_REL_OFFSET fs, 0*/
676 movl $(__KERNEL_PERCPU), %ecx
677 movl %ecx, %fs
678 UNWIND_ESPFIX_STACK
679 popl %ecx
680 CFI_ADJUST_CFA_OFFSET -4
681 /*CFI_REGISTER es, ecx*/
682 movl PT_FS(%esp), %edi # get the function address
683 movl PT_ORIG_EAX(%esp), %edx # get the error code
684 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
685 mov %ecx, PT_FS(%esp)
686 /*CFI_REL_OFFSET fs, ES*/
687 movl $(__USER_DS), %ecx
688 movl %ecx, %ds
689 movl %ecx, %es
690 movl %esp,%eax # pt_regs pointer
691 call *%edi
692 jmp ret_from_exception
693 CFI_ENDPROC
694KPROBE_END(page_fault)
695
696ENTRY(coprocessor_error)
697 RING0_INT_FRAME
698 pushl $0
699 CFI_ADJUST_CFA_OFFSET 4
700 pushl $do_coprocessor_error
701 CFI_ADJUST_CFA_OFFSET 4
702 jmp error_code
703 CFI_ENDPROC
704END(coprocessor_error)
705
706ENTRY(simd_coprocessor_error)
707 RING0_INT_FRAME
708 pushl $0
709 CFI_ADJUST_CFA_OFFSET 4
710 pushl $do_simd_coprocessor_error
711 CFI_ADJUST_CFA_OFFSET 4
712 jmp error_code
713 CFI_ENDPROC
714END(simd_coprocessor_error)
715
716ENTRY(device_not_available)
717 RING0_INT_FRAME
718 pushl $-1 # mark this as an int
719 CFI_ADJUST_CFA_OFFSET 4
720 SAVE_ALL
721 GET_CR0_INTO_EAX
722 testl $0x4, %eax # EM (math emulation bit)
723 jne device_not_available_emulate
724 preempt_stop(CLBR_ANY)
725 call math_state_restore
726 jmp ret_from_exception
727device_not_available_emulate:
728 pushl $0 # temporary storage for ORIG_EIP
729 CFI_ADJUST_CFA_OFFSET 4
730 call math_emulate
731 addl $4, %esp
732 CFI_ADJUST_CFA_OFFSET -4
733 jmp ret_from_exception
734 CFI_ENDPROC
735END(device_not_available)
736
737/*
738 * Debug traps and NMI can happen at the one SYSENTER instruction
739 * that sets up the real kernel stack. Check here, since we can't
740 * allow the wrong stack to be used.
741 *
742 * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
743 * already pushed 3 words if it hits on the sysenter instruction:
744 * eflags, cs and eip.
745 *
746 * We just load the right stack, and push the three (known) values
747 * by hand onto the new stack - while updating the return eip past
748 * the instruction that would have done it for sysenter.
749 */
750#define FIX_STACK(offset, ok, label) \
751 cmpw $__KERNEL_CS,4(%esp); \
752 jne ok; \
753label: \
754 movl TSS_sysenter_esp0+offset(%esp),%esp; \
755 CFI_DEF_CFA esp, 0; \
756 CFI_UNDEFINED eip; \
757 pushfl; \
758 CFI_ADJUST_CFA_OFFSET 4; \
759 pushl $__KERNEL_CS; \
760 CFI_ADJUST_CFA_OFFSET 4; \
761 pushl $sysenter_past_esp; \
762 CFI_ADJUST_CFA_OFFSET 4; \
763 CFI_REL_OFFSET eip, 0
764
765KPROBE_ENTRY(debug)
766 RING0_INT_FRAME
767 cmpl $sysenter_entry,(%esp)
768 jne debug_stack_correct
769 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
770debug_stack_correct:
771 pushl $-1 # mark this as an int
772 CFI_ADJUST_CFA_OFFSET 4
773 SAVE_ALL
774 xorl %edx,%edx # error code 0
775 movl %esp,%eax # pt_regs pointer
776 call do_debug
777 jmp ret_from_exception
778 CFI_ENDPROC
779KPROBE_END(debug)
780
781/*
782 * NMI is doubly nasty. It can happen _while_ we're handling
783 * a debug fault, and the debug fault hasn't yet been able to
784 * clear up the stack. So we first check whether we got an
785 * NMI on the sysenter entry path, but after that we need to
786 * check whether we got an NMI on the debug path where the debug
787 * fault happened on the sysenter path.
788 */
789KPROBE_ENTRY(nmi)
790 RING0_INT_FRAME
791 pushl %eax
792 CFI_ADJUST_CFA_OFFSET 4
793 movl %ss, %eax
794 cmpw $__ESPFIX_SS, %ax
795 popl %eax
796 CFI_ADJUST_CFA_OFFSET -4
797 je nmi_espfix_stack
798 cmpl $sysenter_entry,(%esp)
799 je nmi_stack_fixup
800 pushl %eax
801 CFI_ADJUST_CFA_OFFSET 4
802 movl %esp,%eax
803 /* Do not access memory above the end of our stack page,
804 * it might not exist.
805 */
806 andl $(THREAD_SIZE-1),%eax
807 cmpl $(THREAD_SIZE-20),%eax
808 popl %eax
809 CFI_ADJUST_CFA_OFFSET -4
810 jae nmi_stack_correct
811 cmpl $sysenter_entry,12(%esp)
812 je nmi_debug_stack_check
813nmi_stack_correct:
814 /* We have a RING0_INT_FRAME here */
815 pushl %eax
816 CFI_ADJUST_CFA_OFFSET 4
817 SAVE_ALL
818 xorl %edx,%edx # zero error code
819 movl %esp,%eax # pt_regs pointer
820 call do_nmi
821 jmp restore_nocheck_notrace
822 CFI_ENDPROC
823
824nmi_stack_fixup:
825 RING0_INT_FRAME
826 FIX_STACK(12,nmi_stack_correct, 1)
827 jmp nmi_stack_correct
828
829nmi_debug_stack_check:
830 /* We have a RING0_INT_FRAME here */
831 cmpw $__KERNEL_CS,16(%esp)
832 jne nmi_stack_correct
833 cmpl $debug,(%esp)
834 jb nmi_stack_correct
835 cmpl $debug_esp_fix_insn,(%esp)
836 ja nmi_stack_correct
837 FIX_STACK(24,nmi_stack_correct, 1)
838 jmp nmi_stack_correct
839
840nmi_espfix_stack:
841 /* We have a RING0_INT_FRAME here.
842 *
843 * create the pointer to lss back
844 */
845 pushl %ss
846 CFI_ADJUST_CFA_OFFSET 4
847 pushl %esp
848 CFI_ADJUST_CFA_OFFSET 4
849 addw $4, (%esp)
850 /* copy the iret frame of 12 bytes */
851 .rept 3
852 pushl 16(%esp)
853 CFI_ADJUST_CFA_OFFSET 4
854 .endr
855 pushl %eax
856 CFI_ADJUST_CFA_OFFSET 4
857 SAVE_ALL
858 FIXUP_ESPFIX_STACK # %eax == %esp
859 xorl %edx,%edx # zero error code
860 call do_nmi
861 RESTORE_REGS
862 lss 12+4(%esp), %esp # back to espfix stack
863 CFI_ADJUST_CFA_OFFSET -24
8641: INTERRUPT_RETURN
865 CFI_ENDPROC
866.section __ex_table,"a"
867 .align 4
868 .long 1b,iret_exc
869.previous
870KPROBE_END(nmi)
871
872#ifdef CONFIG_PARAVIRT
873ENTRY(native_iret)
8741: iret
875.section __ex_table,"a"
876 .align 4
877 .long 1b,iret_exc
878.previous
879END(native_iret)
880
881ENTRY(native_irq_enable_sysexit)
882 sti
883 sysexit
884END(native_irq_enable_sysexit)
885#endif
886
887KPROBE_ENTRY(int3)
888 RING0_INT_FRAME
889 pushl $-1 # mark this as an int
890 CFI_ADJUST_CFA_OFFSET 4
891 SAVE_ALL
892 xorl %edx,%edx # zero error code
893 movl %esp,%eax # pt_regs pointer
894 call do_int3
895 jmp ret_from_exception
896 CFI_ENDPROC
897KPROBE_END(int3)
898
899ENTRY(overflow)
900 RING0_INT_FRAME
901 pushl $0
902 CFI_ADJUST_CFA_OFFSET 4
903 pushl $do_overflow
904 CFI_ADJUST_CFA_OFFSET 4
905 jmp error_code
906 CFI_ENDPROC
907END(overflow)
908
909ENTRY(bounds)
910 RING0_INT_FRAME
911 pushl $0
912 CFI_ADJUST_CFA_OFFSET 4
913 pushl $do_bounds
914 CFI_ADJUST_CFA_OFFSET 4
915 jmp error_code
916 CFI_ENDPROC
917END(bounds)
918
919ENTRY(invalid_op)
920 RING0_INT_FRAME
921 pushl $0
922 CFI_ADJUST_CFA_OFFSET 4
923 pushl $do_invalid_op
924 CFI_ADJUST_CFA_OFFSET 4
925 jmp error_code
926 CFI_ENDPROC
927END(invalid_op)
928
929ENTRY(coprocessor_segment_overrun)
930 RING0_INT_FRAME
931 pushl $0
932 CFI_ADJUST_CFA_OFFSET 4
933 pushl $do_coprocessor_segment_overrun
934 CFI_ADJUST_CFA_OFFSET 4
935 jmp error_code
936 CFI_ENDPROC
937END(coprocessor_segment_overrun)
938
939ENTRY(invalid_TSS)
940 RING0_EC_FRAME
941 pushl $do_invalid_TSS
942 CFI_ADJUST_CFA_OFFSET 4
943 jmp error_code
944 CFI_ENDPROC
945END(invalid_TSS)
946
947ENTRY(segment_not_present)
948 RING0_EC_FRAME
949 pushl $do_segment_not_present
950 CFI_ADJUST_CFA_OFFSET 4
951 jmp error_code
952 CFI_ENDPROC
953END(segment_not_present)
954
955ENTRY(stack_segment)
956 RING0_EC_FRAME
957 pushl $do_stack_segment
958 CFI_ADJUST_CFA_OFFSET 4
959 jmp error_code
960 CFI_ENDPROC
961END(stack_segment)
962
963KPROBE_ENTRY(general_protection)
964 RING0_EC_FRAME
965 pushl $do_general_protection
966 CFI_ADJUST_CFA_OFFSET 4
967 jmp error_code
968 CFI_ENDPROC
969KPROBE_END(general_protection)
970
971ENTRY(alignment_check)
972 RING0_EC_FRAME
973 pushl $do_alignment_check
974 CFI_ADJUST_CFA_OFFSET 4
975 jmp error_code
976 CFI_ENDPROC
977END(alignment_check)
978
979ENTRY(divide_error)
980 RING0_INT_FRAME
981 pushl $0 # no error code
982 CFI_ADJUST_CFA_OFFSET 4
983 pushl $do_divide_error
984 CFI_ADJUST_CFA_OFFSET 4
985 jmp error_code
986 CFI_ENDPROC
987END(divide_error)
988
989#ifdef CONFIG_X86_MCE
990ENTRY(machine_check)
991 RING0_INT_FRAME
992 pushl $0
993 CFI_ADJUST_CFA_OFFSET 4
994 pushl machine_check_vector
995 CFI_ADJUST_CFA_OFFSET 4
996 jmp error_code
997 CFI_ENDPROC
998END(machine_check)
999#endif
1000
1001ENTRY(spurious_interrupt_bug)
1002 RING0_INT_FRAME
1003 pushl $0
1004 CFI_ADJUST_CFA_OFFSET 4
1005 pushl $do_spurious_interrupt_bug
1006 CFI_ADJUST_CFA_OFFSET 4
1007 jmp error_code
1008 CFI_ENDPROC
1009END(spurious_interrupt_bug)
1010
1011ENTRY(kernel_thread_helper)
1012 pushl $0 # fake return address for unwinder
1013 CFI_STARTPROC
1014 movl %edx,%eax
1015 push %edx
1016 CFI_ADJUST_CFA_OFFSET 4
1017 call *%ebx
1018 push %eax
1019 CFI_ADJUST_CFA_OFFSET 4
1020 call do_exit
1021 CFI_ENDPROC
1022ENDPROC(kernel_thread_helper)
1023
1024#ifdef CONFIG_XEN
1025ENTRY(xen_hypervisor_callback)
1026 CFI_STARTPROC
1027 pushl $0
1028 CFI_ADJUST_CFA_OFFSET 4
1029 SAVE_ALL
1030 TRACE_IRQS_OFF
1031
1032 /* Check to see if we got the event in the critical
1033 region in xen_iret_direct, after we've reenabled
1034 events and checked for pending events. This simulates
1035 iret instruction's behaviour where it delivers a
1036 pending interrupt when enabling interrupts. */
1037 movl PT_EIP(%esp),%eax
1038 cmpl $xen_iret_start_crit,%eax
1039 jb 1f
1040 cmpl $xen_iret_end_crit,%eax
1041 jae 1f
1042
1043 call xen_iret_crit_fixup
1044
10451: mov %esp, %eax
1046 call xen_evtchn_do_upcall
1047 jmp ret_from_intr
1048 CFI_ENDPROC
1049ENDPROC(xen_hypervisor_callback)
1050
1051# Hypervisor uses this for application faults while it executes.
1052# We get here for two reasons:
1053# 1. Fault while reloading DS, ES, FS or GS
1054# 2. Fault while executing IRET
1055# Category 1 we fix up by reattempting the load, and zeroing the segment
1056# register if the load fails.
1057# Category 2 we fix up by jumping to do_iret_error. We cannot use the
1058# normal Linux return path in this case because if we use the IRET hypercall
1059# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1060# We distinguish between categories by maintaining a status value in EAX.
1061ENTRY(xen_failsafe_callback)
1062 CFI_STARTPROC
1063 pushl %eax
1064 CFI_ADJUST_CFA_OFFSET 4
1065 movl $1,%eax
10661: mov 4(%esp),%ds
10672: mov 8(%esp),%es
10683: mov 12(%esp),%fs
10694: mov 16(%esp),%gs
1070 testl %eax,%eax
1071 popl %eax
1072 CFI_ADJUST_CFA_OFFSET -4
1073 lea 16(%esp),%esp
1074 CFI_ADJUST_CFA_OFFSET -16
1075 jz 5f
1076 addl $16,%esp
1077 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
10785: pushl $0 # EAX == 0 => Category 1 (Bad segment)
1079 CFI_ADJUST_CFA_OFFSET 4
1080 SAVE_ALL
1081 jmp ret_from_exception
1082 CFI_ENDPROC
1083
1084.section .fixup,"ax"
10856: xorl %eax,%eax
1086 movl %eax,4(%esp)
1087 jmp 1b
10887: xorl %eax,%eax
1089 movl %eax,8(%esp)
1090 jmp 2b
10918: xorl %eax,%eax
1092 movl %eax,12(%esp)
1093 jmp 3b
10949: xorl %eax,%eax
1095 movl %eax,16(%esp)
1096 jmp 4b
1097.previous
1098.section __ex_table,"a"
1099 .align 4
1100 .long 1b,6b
1101 .long 2b,7b
1102 .long 3b,8b
1103 .long 4b,9b
1104.previous
1105ENDPROC(xen_failsafe_callback)
1106
1107#endif /* CONFIG_XEN */
1108
1109.section .rodata,"a"
1110#include "syscall_table_32.S"
1111
1112syscall_table_size=(.-sys_call_table)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
new file mode 100644
index 000000000000..1d232e5f5658
--- /dev/null
+++ b/arch/x86/kernel/entry_64.S
@@ -0,0 +1,1172 @@
1/*
2 * linux/arch/x86_64/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
7 */
8
9/*
10 * entry.S contains the system-call and fault low-level handling routines.
11 *
12 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call.
14 *
15 * Normal syscalls and interrupts don't save a full stack frame; this is
16 * only done for syscall tracing, signals or fork/exec et al.
17 *
18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all registers saved.
23 *
24 * Some macro usage:
25 * - CFI macros are used to generate dwarf2 unwind information for better
26 * backtraces. They don't change any code.
27 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
28 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
29 * There are unfortunately lots of special cases where some registers are
30 * not touched. The macro is a big mess that should be cleaned up.
31 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
32 * Gives a full stack frame.
33 * - ENTRY/END Define functions in the symbol table.
34 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
35 * frame that is otherwise undefined after a SYSCALL
36 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
37 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
38 */
39
40#include <linux/linkage.h>
41#include <asm/segment.h>
42#include <asm/cache.h>
43#include <asm/errno.h>
44#include <asm/dwarf2.h>
45#include <asm/calling.h>
46#include <asm/asm-offsets.h>
47#include <asm/msr.h>
48#include <asm/unistd.h>
49#include <asm/thread_info.h>
50#include <asm/hw_irq.h>
51#include <asm/page.h>
52#include <asm/irqflags.h>
53
54 .code64
55
56#ifndef CONFIG_PREEMPT
57#define retint_kernel retint_restore_args
58#endif
59
60
61.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
62#ifdef CONFIG_TRACE_IRQFLAGS
63 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
64 jnc 1f
65 TRACE_IRQS_ON
661:
67#endif
68.endm
69
70/*
71 * C code is not supposed to know about undefined top of stack. Every time
72 * a C function with a pt_regs argument is called from the SYSCALL based
73 * fast path FIXUP_TOP_OF_STACK is needed.
74 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
75 * manipulation.
76 */
77
78 /* %rsp:at FRAMEEND */
79 .macro FIXUP_TOP_OF_STACK tmp
80 movq %gs:pda_oldrsp,\tmp
81 movq \tmp,RSP(%rsp)
82 movq $__USER_DS,SS(%rsp)
83 movq $__USER_CS,CS(%rsp)
84 movq $-1,RCX(%rsp)
85 movq R11(%rsp),\tmp /* get eflags */
86 movq \tmp,EFLAGS(%rsp)
87 .endm
88
89 .macro RESTORE_TOP_OF_STACK tmp,offset=0
90 movq RSP-\offset(%rsp),\tmp
91 movq \tmp,%gs:pda_oldrsp
92 movq EFLAGS-\offset(%rsp),\tmp
93 movq \tmp,R11-\offset(%rsp)
94 .endm
95
96 .macro FAKE_STACK_FRAME child_rip
97 /* push in order ss, rsp, eflags, cs, rip */
98 xorl %eax, %eax
99 pushq %rax /* ss */
100 CFI_ADJUST_CFA_OFFSET 8
101 /*CFI_REL_OFFSET ss,0*/
102 pushq %rax /* rsp */
103 CFI_ADJUST_CFA_OFFSET 8
104 CFI_REL_OFFSET rsp,0
105 pushq $(1<<9) /* eflags - interrupts on */
106 CFI_ADJUST_CFA_OFFSET 8
107 /*CFI_REL_OFFSET rflags,0*/
108 pushq $__KERNEL_CS /* cs */
109 CFI_ADJUST_CFA_OFFSET 8
110 /*CFI_REL_OFFSET cs,0*/
111 pushq \child_rip /* rip */
112 CFI_ADJUST_CFA_OFFSET 8
113 CFI_REL_OFFSET rip,0
114 pushq %rax /* orig rax */
115 CFI_ADJUST_CFA_OFFSET 8
116 .endm
117
118 .macro UNFAKE_STACK_FRAME
119 addq $8*6, %rsp
120 CFI_ADJUST_CFA_OFFSET -(6*8)
121 .endm
122
123 .macro CFI_DEFAULT_STACK start=1
124 .if \start
125 CFI_STARTPROC simple
126 CFI_SIGNAL_FRAME
127 CFI_DEF_CFA rsp,SS+8
128 .else
129 CFI_DEF_CFA_OFFSET SS+8
130 .endif
131 CFI_REL_OFFSET r15,R15
132 CFI_REL_OFFSET r14,R14
133 CFI_REL_OFFSET r13,R13
134 CFI_REL_OFFSET r12,R12
135 CFI_REL_OFFSET rbp,RBP
136 CFI_REL_OFFSET rbx,RBX
137 CFI_REL_OFFSET r11,R11
138 CFI_REL_OFFSET r10,R10
139 CFI_REL_OFFSET r9,R9
140 CFI_REL_OFFSET r8,R8
141 CFI_REL_OFFSET rax,RAX
142 CFI_REL_OFFSET rcx,RCX
143 CFI_REL_OFFSET rdx,RDX
144 CFI_REL_OFFSET rsi,RSI
145 CFI_REL_OFFSET rdi,RDI
146 CFI_REL_OFFSET rip,RIP
147 /*CFI_REL_OFFSET cs,CS*/
148 /*CFI_REL_OFFSET rflags,EFLAGS*/
149 CFI_REL_OFFSET rsp,RSP
150 /*CFI_REL_OFFSET ss,SS*/
151 .endm
152/*
153 * A newly forked process directly context switches into this.
154 */
155/* rdi: prev */
156ENTRY(ret_from_fork)
157 CFI_DEFAULT_STACK
158 push kernel_eflags(%rip)
159 CFI_ADJUST_CFA_OFFSET 4
160 popf # reset kernel eflags
161 CFI_ADJUST_CFA_OFFSET -4
162 call schedule_tail
163 GET_THREAD_INFO(%rcx)
164 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
165 jnz rff_trace
166rff_action:
167 RESTORE_REST
168 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
169 je int_ret_from_sys_call
170 testl $_TIF_IA32,threadinfo_flags(%rcx)
171 jnz int_ret_from_sys_call
172 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
173 jmp ret_from_sys_call
174rff_trace:
175 movq %rsp,%rdi
176 call syscall_trace_leave
177 GET_THREAD_INFO(%rcx)
178 jmp rff_action
179 CFI_ENDPROC
180END(ret_from_fork)
181
182/*
183 * System call entry. Up to 6 arguments in registers are supported.
184 *
185 * SYSCALL does not save anything on the stack and does not change the
186 * stack pointer.
187 */
188
189/*
190 * Register setup:
191 * rax system call number
192 * rdi arg0
193 * rcx return address for syscall/sysret, C arg3
194 * rsi arg1
195 * rdx arg2
196 * r10 arg3 (--> moved to rcx for C)
197 * r8 arg4
198 * r9 arg5
199 * r11 eflags for syscall/sysret, temporary for C
200 * r12-r15,rbp,rbx saved by C code, not touched.
201 *
202 * Interrupts are off on entry.
203 * Only called from user space.
204 *
205 * XXX if we had a free scratch register we could save the RSP into the stack frame
206 * and report it properly in ps. Unfortunately we haven't.
207 *
208 * When the user can change the frame, always force IRET, because IRET
209 * deals with non-canonical addresses better. SYSRET has trouble
210 * with them due to bugs in both AMD and Intel CPUs.
211 */
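
Purely to illustrate the register convention documented above (this is not part of the file), a 64-bit user-space caller might issue a system call like this; write(2) is syscall 1 on x86-64, and the rcx/r11 clobbers follow from the comment:

static long raw_write(int fd, const void *buf, unsigned long count)
{
	long ret;

	asm volatile("syscall"
		     : "=a" (ret)		/* rax: return value      */
		     : "a" (1L),		/* rax: __NR_write        */
		       "D" ((long)fd),		/* rdi: arg0              */
		       "S" (buf),		/* rsi: arg1              */
		       "d" (count)		/* rdx: arg2              */
		     : "rcx", "r11", "memory");	/* clobbered by SYSCALL   */
	return ret;
}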
212
213ENTRY(system_call)
214 CFI_STARTPROC simple
215 CFI_SIGNAL_FRAME
216 CFI_DEF_CFA rsp,PDA_STACKOFFSET
217 CFI_REGISTER rip,rcx
218 /*CFI_REGISTER rflags,r11*/
219 swapgs
220 movq %rsp,%gs:pda_oldrsp
221 movq %gs:pda_kernelstack,%rsp
222 /*
223 * No need to follow this irqs off/on section - it's straight
224 * and short:
225 */
226 sti
227 SAVE_ARGS 8,1
228 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
229 movq %rcx,RIP-ARGOFFSET(%rsp)
230 CFI_REL_OFFSET rip,RIP-ARGOFFSET
231 GET_THREAD_INFO(%rcx)
232 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
233 jnz tracesys
234 cmpq $__NR_syscall_max,%rax
235 ja badsys
236 movq %r10,%rcx
237 call *sys_call_table(,%rax,8) # XXX: rip relative
238 movq %rax,RAX-ARGOFFSET(%rsp)
239/*
240 * Syscall return path ending with SYSRET (fast path)
241 * Has incomplete stack frame and undefined top of stack.
242 */
243ret_from_sys_call:
244 movl $_TIF_ALLWORK_MASK,%edi
245 /* edi: flagmask */
246sysret_check:
247 GET_THREAD_INFO(%rcx)
248 cli
249 TRACE_IRQS_OFF
250 movl threadinfo_flags(%rcx),%edx
251 andl %edi,%edx
252 jnz sysret_careful
253 CFI_REMEMBER_STATE
254 /*
255 * sysretq will re-enable interrupts:
256 */
257 TRACE_IRQS_ON
258 movq RIP-ARGOFFSET(%rsp),%rcx
259 CFI_REGISTER rip,rcx
260 RESTORE_ARGS 0,-ARG_SKIP,1
261 /*CFI_REGISTER rflags,r11*/
262 movq %gs:pda_oldrsp,%rsp
263 swapgs
264 sysretq
265
266 CFI_RESTORE_STATE
267 /* Handle reschedules */
268 /* edx: work, edi: workmask */
269sysret_careful:
270 bt $TIF_NEED_RESCHED,%edx
271 jnc sysret_signal
272 TRACE_IRQS_ON
273 sti
274 pushq %rdi
275 CFI_ADJUST_CFA_OFFSET 8
276 call schedule
277 popq %rdi
278 CFI_ADJUST_CFA_OFFSET -8
279 jmp sysret_check
280
281 /* Handle a signal */
282sysret_signal:
283 TRACE_IRQS_ON
284 sti
285 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
286 jz 1f
287
288 /* Really a signal */
289 /* edx: work flags (arg3) */
290 leaq do_notify_resume(%rip),%rax
291 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
292 xorl %esi,%esi # oldset -> arg2
293 call ptregscall_common
2941: movl $_TIF_NEED_RESCHED,%edi
295 /* Use IRET because user could have changed frame. This
296 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
297 cli
298 TRACE_IRQS_OFF
299 jmp int_with_check
300
301badsys:
302 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
303 jmp ret_from_sys_call
304
305 /* Do syscall tracing */
306tracesys:
307 SAVE_REST
308 movq $-ENOSYS,RAX(%rsp)
309 FIXUP_TOP_OF_STACK %rdi
310 movq %rsp,%rdi
311 call syscall_trace_enter
312 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
313 RESTORE_REST
314 cmpq $__NR_syscall_max,%rax
315 movq $-ENOSYS,%rcx
316 cmova %rcx,%rax
317 ja 1f
318 movq %r10,%rcx /* fixup for C */
319 call *sys_call_table(,%rax,8)
3201: movq %rax,RAX-ARGOFFSET(%rsp)
321 /* Use IRET because user could have changed frame */
322
323/*
324 * Syscall return path ending with IRET.
325 * Has correct top of stack, but partial stack frame.
326 */
327 .globl int_ret_from_sys_call
328int_ret_from_sys_call:
329 cli
330 TRACE_IRQS_OFF
331 testl $3,CS-ARGOFFSET(%rsp)
332 je retint_restore_args
333 movl $_TIF_ALLWORK_MASK,%edi
334 /* edi: mask to check */
335int_with_check:
336 GET_THREAD_INFO(%rcx)
337 movl threadinfo_flags(%rcx),%edx
338 andl %edi,%edx
339 jnz int_careful
340 andl $~TS_COMPAT,threadinfo_status(%rcx)
341 jmp retint_swapgs
342
343 /* Either reschedule or signal or syscall exit tracking needed. */
344 /* First do a reschedule test. */
345 /* edx: work, edi: workmask */
346int_careful:
347 bt $TIF_NEED_RESCHED,%edx
348 jnc int_very_careful
349 TRACE_IRQS_ON
350 sti
351 pushq %rdi
352 CFI_ADJUST_CFA_OFFSET 8
353 call schedule
354 popq %rdi
355 CFI_ADJUST_CFA_OFFSET -8
356 cli
357 TRACE_IRQS_OFF
358 jmp int_with_check
359
360 /* handle signals and tracing -- both require a full stack frame */
361int_very_careful:
362 TRACE_IRQS_ON
363 sti
364 SAVE_REST
365 /* Check for syscall exit trace */
366 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
367 jz int_signal
368 pushq %rdi
369 CFI_ADJUST_CFA_OFFSET 8
370 leaq 8(%rsp),%rdi # &ptregs -> arg1
371 call syscall_trace_leave
372 popq %rdi
373 CFI_ADJUST_CFA_OFFSET -8
374 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
375 jmp int_restore_rest
376
377int_signal:
378 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
379 jz 1f
380 movq %rsp,%rdi # &ptregs -> arg1
381 xorl %esi,%esi # oldset -> arg2
382 call do_notify_resume
3831: movl $_TIF_NEED_RESCHED,%edi
384int_restore_rest:
385 RESTORE_REST
386 cli
387 TRACE_IRQS_OFF
388 jmp int_with_check
389 CFI_ENDPROC
390END(system_call)
391
392/*
 393 * Certain special system calls that need to save a complete stack frame.
394 */
395
396 .macro PTREGSCALL label,func,arg
397 .globl \label
398\label:
399 leaq \func(%rip),%rax
400 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
401 jmp ptregscall_common
402END(\label)
403 .endm
404
405 CFI_STARTPROC
406
407 PTREGSCALL stub_clone, sys_clone, %r8
408 PTREGSCALL stub_fork, sys_fork, %rdi
409 PTREGSCALL stub_vfork, sys_vfork, %rdi
410 PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
411 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
412 PTREGSCALL stub_iopl, sys_iopl, %rsi
413
414ENTRY(ptregscall_common)
415 popq %r11
416 CFI_ADJUST_CFA_OFFSET -8
417 CFI_REGISTER rip, r11
418 SAVE_REST
419 movq %r11, %r15
420 CFI_REGISTER rip, r15
421 FIXUP_TOP_OF_STACK %r11
422 call *%rax
423 RESTORE_TOP_OF_STACK %r11
424 movq %r15, %r11
425 CFI_REGISTER rip, r11
426 RESTORE_REST
427 pushq %r11
428 CFI_ADJUST_CFA_OFFSET 8
429 CFI_REL_OFFSET rip, 0
430 ret
431 CFI_ENDPROC
432END(ptregscall_common)
433
434ENTRY(stub_execve)
435 CFI_STARTPROC
436 popq %r11
437 CFI_ADJUST_CFA_OFFSET -8
438 CFI_REGISTER rip, r11
439 SAVE_REST
440 FIXUP_TOP_OF_STACK %r11
441 call sys_execve
442 RESTORE_TOP_OF_STACK %r11
443 movq %rax,RAX(%rsp)
444 RESTORE_REST
445 jmp int_ret_from_sys_call
446 CFI_ENDPROC
447END(stub_execve)
448
449/*
450 * sigreturn is special because it needs to restore all registers on return.
451 * This cannot be done with SYSRET, so use the IRET return path instead.
452 */
453ENTRY(stub_rt_sigreturn)
454 CFI_STARTPROC
455 addq $8, %rsp
456 CFI_ADJUST_CFA_OFFSET -8
457 SAVE_REST
458 movq %rsp,%rdi
459 FIXUP_TOP_OF_STACK %r11
460 call sys_rt_sigreturn
461 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
462 RESTORE_REST
463 jmp int_ret_from_sys_call
464 CFI_ENDPROC
465END(stub_rt_sigreturn)
466
467/*
468 * initial frame state for interrupts and exceptions
469 */
470 .macro _frame ref
471 CFI_STARTPROC simple
472 CFI_SIGNAL_FRAME
473 CFI_DEF_CFA rsp,SS+8-\ref
474 /*CFI_REL_OFFSET ss,SS-\ref*/
475 CFI_REL_OFFSET rsp,RSP-\ref
476 /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
477 /*CFI_REL_OFFSET cs,CS-\ref*/
478 CFI_REL_OFFSET rip,RIP-\ref
479 .endm
480
481/* initial frame state for interrupts (and exceptions without error code) */
482#define INTR_FRAME _frame RIP
483/* initial frame state for exceptions with error code (and interrupts with
484 vector already pushed) */
485#define XCPT_FRAME _frame ORIG_RAX
486
487/*
488 * Interrupt entry/exit.
489 *
 490 * Interrupt entry points save only callee-clobbered registers on the fast path.
491 *
492 * Entry runs with interrupts off.
493 */
494
495/* 0(%rsp): interrupt number */
496 .macro interrupt func
497 cld
498 SAVE_ARGS
499 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
500 pushq %rbp
501 CFI_ADJUST_CFA_OFFSET 8
502 CFI_REL_OFFSET rbp, 0
503 movq %rsp,%rbp
504 CFI_DEF_CFA_REGISTER rbp
505 testl $3,CS(%rdi)
506 je 1f
507 swapgs
508 /* irqcount is used to check if a CPU is already on an interrupt
509 stack or not. While this is essentially redundant with preempt_count
510 it is a little cheaper to use a separate counter in the PDA
511 (short of moving irq_enter into assembly, which would be too
512 much work) */
5131: incl %gs:pda_irqcount
514 cmoveq %gs:pda_irqstackptr,%rsp
515 push %rbp # backlink for old unwinder
516 /*
517 * We entered an interrupt context - irqs are off:
518 */
519 TRACE_IRQS_OFF
520 call \func
521 .endm
522
523ENTRY(common_interrupt)
524 XCPT_FRAME
525 interrupt do_IRQ
526 /* 0(%rsp): oldrsp-ARGOFFSET */
527ret_from_intr:
528 cli
529 TRACE_IRQS_OFF
530 decl %gs:pda_irqcount
531 leaveq
532 CFI_DEF_CFA_REGISTER rsp
533 CFI_ADJUST_CFA_OFFSET -8
534exit_intr:
535 GET_THREAD_INFO(%rcx)
536 testl $3,CS-ARGOFFSET(%rsp)
537 je retint_kernel
538
539 /* Interrupt came from user space */
540 /*
541 * Has a correct top of stack, but a partial stack frame
542 * %rcx: thread info. Interrupts off.
543 */
544retint_with_reschedule:
545 movl $_TIF_WORK_MASK,%edi
546retint_check:
547 movl threadinfo_flags(%rcx),%edx
548 andl %edi,%edx
549 CFI_REMEMBER_STATE
550 jnz retint_careful
551retint_swapgs:
552 /*
553 * The iretq could re-enable interrupts:
554 */
555 cli
556 TRACE_IRQS_IRETQ
557 swapgs
558 jmp restore_args
559
560retint_restore_args:
561 cli
562 /*
563 * The iretq could re-enable interrupts:
564 */
565 TRACE_IRQS_IRETQ
566restore_args:
567 RESTORE_ARGS 0,8,0
568iret_label:
569 iretq
570
571 .section __ex_table,"a"
572 .quad iret_label,bad_iret
573 .previous
574 .section .fixup,"ax"
575 /* force a signal here? this matches i386 behaviour */
576 /* running with kernel gs */
577bad_iret:
578 movq $11,%rdi /* SIGSEGV */
579 TRACE_IRQS_ON
580 sti
581 jmp do_exit
582 .previous
583
584 /* edi: workmask, edx: work */
585retint_careful:
586 CFI_RESTORE_STATE
587 bt $TIF_NEED_RESCHED,%edx
588 jnc retint_signal
589 TRACE_IRQS_ON
590 sti
591 pushq %rdi
592 CFI_ADJUST_CFA_OFFSET 8
593 call schedule
594 popq %rdi
595 CFI_ADJUST_CFA_OFFSET -8
596 GET_THREAD_INFO(%rcx)
597 cli
598 TRACE_IRQS_OFF
599 jmp retint_check
600
601retint_signal:
602 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
603 jz retint_swapgs
604 TRACE_IRQS_ON
605 sti
606 SAVE_REST
607 movq $-1,ORIG_RAX(%rsp)
608 xorl %esi,%esi # oldset
609 movq %rsp,%rdi # &pt_regs
610 call do_notify_resume
611 RESTORE_REST
612 cli
613 TRACE_IRQS_OFF
614 movl $_TIF_NEED_RESCHED,%edi
615 GET_THREAD_INFO(%rcx)
616 jmp retint_check
617
618#ifdef CONFIG_PREEMPT
619 /* Returning to kernel space. Check if we need preemption */
620 /* rcx: threadinfo. interrupts off. */
621ENTRY(retint_kernel)
622 cmpl $0,threadinfo_preempt_count(%rcx)
623 jnz retint_restore_args
624 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
625 jnc retint_restore_args
626 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
627 jnc retint_restore_args
628 call preempt_schedule_irq
629 jmp exit_intr
630#endif
631
632 CFI_ENDPROC
633END(common_interrupt)
634
635/*
636 * APIC interrupts.
637 */
638 .macro apicinterrupt num,func
639 INTR_FRAME
640 pushq $~(\num)
641 CFI_ADJUST_CFA_OFFSET 8
642 interrupt \func
643 jmp ret_from_intr
644 CFI_ENDPROC
645 .endm
646
647ENTRY(thermal_interrupt)
648 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
649END(thermal_interrupt)
650
651ENTRY(threshold_interrupt)
652 apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
653END(threshold_interrupt)
654
655#ifdef CONFIG_SMP
656ENTRY(reschedule_interrupt)
657 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
658END(reschedule_interrupt)
659
660 .macro INVALIDATE_ENTRY num
661ENTRY(invalidate_interrupt\num)
662 apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
663END(invalidate_interrupt\num)
664 .endm
665
666 INVALIDATE_ENTRY 0
667 INVALIDATE_ENTRY 1
668 INVALIDATE_ENTRY 2
669 INVALIDATE_ENTRY 3
670 INVALIDATE_ENTRY 4
671 INVALIDATE_ENTRY 5
672 INVALIDATE_ENTRY 6
673 INVALIDATE_ENTRY 7
674
675ENTRY(call_function_interrupt)
676 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
677END(call_function_interrupt)
678ENTRY(irq_move_cleanup_interrupt)
679 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
680END(irq_move_cleanup_interrupt)
681#endif
682
683ENTRY(apic_timer_interrupt)
684 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
685END(apic_timer_interrupt)
686
687ENTRY(error_interrupt)
688 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
689END(error_interrupt)
690
691ENTRY(spurious_interrupt)
692 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
693END(spurious_interrupt)
694
695/*
696 * Exception entry points.
697 */
698 .macro zeroentry sym
699 INTR_FRAME
700 pushq $0 /* push error code/oldrax */
701 CFI_ADJUST_CFA_OFFSET 8
702 pushq %rax /* push real oldrax to the rdi slot */
703 CFI_ADJUST_CFA_OFFSET 8
704 CFI_REL_OFFSET rax,0
705 leaq \sym(%rip),%rax
706 jmp error_entry
707 CFI_ENDPROC
708 .endm
709
710 .macro errorentry sym
711 XCPT_FRAME
712 pushq %rax
713 CFI_ADJUST_CFA_OFFSET 8
714 CFI_REL_OFFSET rax,0
715 leaq \sym(%rip),%rax
716 jmp error_entry
717 CFI_ENDPROC
718 .endm
719
720 /* error code is on the stack already */
 721	/* handle NMI-like exceptions that can happen everywhere */
722 .macro paranoidentry sym, ist=0, irqtrace=1
723 SAVE_ALL
724 cld
725 movl $1,%ebx
726 movl $MSR_GS_BASE,%ecx
727 rdmsr
728 testl %edx,%edx
729 js 1f
730 swapgs
731 xorl %ebx,%ebx
7321:
733 .if \ist
734 movq %gs:pda_data_offset, %rbp
735 .endif
736 movq %rsp,%rdi
737 movq ORIG_RAX(%rsp),%rsi
738 movq $-1,ORIG_RAX(%rsp)
739 .if \ist
740 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
741 .endif
742 call \sym
743 .if \ist
744 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
745 .endif
746 cli
747 .if \irqtrace
748 TRACE_IRQS_OFF
749 .endif
750 .endm
751
752 /*
753 * "Paranoid" exit path from exception stack.
754 * Paranoid because this is used by NMIs and cannot take
755 * any kernel state for granted.
 756 * We don't do kernel preemption checks here, because only the
 757 * NMI case should be common, and NMIs do not enable IRQs and
 758 * cannot get reschedule ticks.
759 *
760 * "trace" is 0 for the NMI handler only, because irq-tracing
761 * is fundamentally NMI-unsafe. (we cannot change the soft and
762 * hard flags at once, atomically)
763 */
764 .macro paranoidexit trace=1
765 /* ebx: no swapgs flag */
766paranoid_exit\trace:
767 testl %ebx,%ebx /* swapgs needed? */
768 jnz paranoid_restore\trace
769 testl $3,CS(%rsp)
770 jnz paranoid_userspace\trace
771paranoid_swapgs\trace:
772 .if \trace
773 TRACE_IRQS_IRETQ 0
774 .endif
775 swapgs
776paranoid_restore\trace:
777 RESTORE_ALL 8
778 iretq
779paranoid_userspace\trace:
780 GET_THREAD_INFO(%rcx)
781 movl threadinfo_flags(%rcx),%ebx
782 andl $_TIF_WORK_MASK,%ebx
783 jz paranoid_swapgs\trace
784 movq %rsp,%rdi /* &pt_regs */
785 call sync_regs
786 movq %rax,%rsp /* switch stack for scheduling */
787 testl $_TIF_NEED_RESCHED,%ebx
788 jnz paranoid_schedule\trace
789 movl %ebx,%edx /* arg3: thread flags */
790 .if \trace
791 TRACE_IRQS_ON
792 .endif
793 sti
794 xorl %esi,%esi /* arg2: oldset */
795 movq %rsp,%rdi /* arg1: &pt_regs */
796 call do_notify_resume
797 cli
798 .if \trace
799 TRACE_IRQS_OFF
800 .endif
801 jmp paranoid_userspace\trace
802paranoid_schedule\trace:
803 .if \trace
804 TRACE_IRQS_ON
805 .endif
806 sti
807 call schedule
808 cli
809 .if \trace
810 TRACE_IRQS_OFF
811 .endif
812 jmp paranoid_userspace\trace
813 CFI_ENDPROC
814 .endm
815
816/*
817 * Exception entry point. This expects an error code/orig_rax on the stack
818 * and the exception handler in %rax.
819 */
820KPROBE_ENTRY(error_entry)
821 _frame RDI
822 CFI_REL_OFFSET rax,0
823 /* rdi slot contains rax, oldrax contains error code */
824 cld
825 subq $14*8,%rsp
826 CFI_ADJUST_CFA_OFFSET (14*8)
827 movq %rsi,13*8(%rsp)
828 CFI_REL_OFFSET rsi,RSI
829 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
830 CFI_REGISTER rax,rsi
831 movq %rdx,12*8(%rsp)
832 CFI_REL_OFFSET rdx,RDX
833 movq %rcx,11*8(%rsp)
834 CFI_REL_OFFSET rcx,RCX
835 movq %rsi,10*8(%rsp) /* store rax */
836 CFI_REL_OFFSET rax,RAX
837 movq %r8, 9*8(%rsp)
838 CFI_REL_OFFSET r8,R8
839 movq %r9, 8*8(%rsp)
840 CFI_REL_OFFSET r9,R9
841 movq %r10,7*8(%rsp)
842 CFI_REL_OFFSET r10,R10
843 movq %r11,6*8(%rsp)
844 CFI_REL_OFFSET r11,R11
845 movq %rbx,5*8(%rsp)
846 CFI_REL_OFFSET rbx,RBX
847 movq %rbp,4*8(%rsp)
848 CFI_REL_OFFSET rbp,RBP
849 movq %r12,3*8(%rsp)
850 CFI_REL_OFFSET r12,R12
851 movq %r13,2*8(%rsp)
852 CFI_REL_OFFSET r13,R13
853 movq %r14,1*8(%rsp)
854 CFI_REL_OFFSET r14,R14
855 movq %r15,(%rsp)
856 CFI_REL_OFFSET r15,R15
857 xorl %ebx,%ebx
858 testl $3,CS(%rsp)
859 je error_kernelspace
860error_swapgs:
861 swapgs
862error_sti:
863 movq %rdi,RDI(%rsp)
864 CFI_REL_OFFSET rdi,RDI
865 movq %rsp,%rdi
866 movq ORIG_RAX(%rsp),%rsi /* get error code */
867 movq $-1,ORIG_RAX(%rsp)
868 call *%rax
869 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
870error_exit:
871 movl %ebx,%eax
872 RESTORE_REST
873 cli
874 TRACE_IRQS_OFF
875 GET_THREAD_INFO(%rcx)
876 testl %eax,%eax
877 jne retint_kernel
878 movl threadinfo_flags(%rcx),%edx
879 movl $_TIF_WORK_MASK,%edi
880 andl %edi,%edx
881 jnz retint_careful
882 /*
883 * The iret might restore flags:
884 */
885 TRACE_IRQS_IRETQ
886 swapgs
887 RESTORE_ARGS 0,8,0
888 jmp iret_label
889 CFI_ENDPROC
890
891error_kernelspace:
892 incl %ebx
893 /* There are two places in the kernel that can potentially fault with
894 usergs. Handle them here. The exception handlers after
895 iret run with kernel gs again, so don't set the user space flag.
 896	   B-stepping K8s sometimes report a truncated RIP for IRET
 897	   exceptions returning to compat mode. Check for these here too. */
898 leaq iret_label(%rip),%rbp
899 cmpq %rbp,RIP(%rsp)
900 je error_swapgs
901 movl %ebp,%ebp /* zero extend */
902 cmpq %rbp,RIP(%rsp)
903 je error_swapgs
904 cmpq $gs_change,RIP(%rsp)
905 je error_swapgs
906 jmp error_sti
907KPROBE_END(error_entry)
908
909 /* Reload gs selector with exception handling */
910 /* edi: new selector */
911ENTRY(load_gs_index)
912 CFI_STARTPROC
913 pushf
914 CFI_ADJUST_CFA_OFFSET 8
915 cli
916 swapgs
917gs_change:
918 movl %edi,%gs
9192: mfence /* workaround */
920 swapgs
921 popf
922 CFI_ADJUST_CFA_OFFSET -8
923 ret
924 CFI_ENDPROC
925ENDPROC(load_gs_index)
926
927 .section __ex_table,"a"
928 .align 8
929 .quad gs_change,bad_gs
930 .previous
931 .section .fixup,"ax"
932 /* running with kernelgs */
933bad_gs:
934 swapgs /* switch back to user gs */
935 xorl %eax,%eax
936 movl %eax,%gs
937 jmp 2b
938 .previous
939
940/*
941 * Create a kernel thread.
942 *
943 * C extern interface:
944 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
945 *
946 * asm input arguments:
947 * rdi: fn, rsi: arg, rdx: flags
948 */
949ENTRY(kernel_thread)
950 CFI_STARTPROC
951 FAKE_STACK_FRAME $child_rip
952 SAVE_ALL
953
954 # rdi: flags, rsi: usp, rdx: will be &pt_regs
955 movq %rdx,%rdi
956 orq kernel_thread_flags(%rip),%rdi
957 movq $-1, %rsi
958 movq %rsp, %rdx
959
960 xorl %r8d,%r8d
961 xorl %r9d,%r9d
962
963 # clone now
964 call do_fork
965 movq %rax,RAX(%rsp)
966 xorl %edi,%edi
967
968 /*
 969 * It isn't worth checking for a reschedule here,
 970 * so internally to the x86_64 port you can rely on kernel_thread()
 971 * not rescheduling the child before returning; this avoids the need
 972 * for hacks, for example to fork off the per-CPU idle tasks.
973 * [Hopefully no generic code relies on the reschedule -AK]
974 */
975 RESTORE_ALL
976 UNFAKE_STACK_FRAME
977 ret
978 CFI_ENDPROC
979ENDPROC(kernel_thread)
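A usage sketch of the kernel_thread() C interface documented above (illustrative only; my_worker and spawn_worker are hypothetical names, not part of this tree).

/* Hypothetical thread function; the return value becomes the exit code. */
static int my_worker(void *arg)
{
	/* ... do work ... */
	return 0;
}

/* Hypothetical caller. */
static void spawn_worker(void)
{
	/* CLONE_FS | CLONE_FILES: share fs and file tables with the caller. */
	long pid = kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES);

	if (pid < 0)
		printk(KERN_ERR "kernel_thread failed: %ld\n", pid);
}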
980
981child_rip:
982 pushq $0 # fake return address
983 CFI_STARTPROC
984 /*
985 * Here we are in the child and the registers are set as they were
986 * at kernel_thread() invocation in the parent.
987 */
988 movq %rdi, %rax
989 movq %rsi, %rdi
990 call *%rax
991 # exit
992 xorl %edi, %edi
993 call do_exit
994 CFI_ENDPROC
995ENDPROC(child_rip)
996
997/*
998 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
999 *
1000 * C extern interface:
1001 * extern long execve(char *name, char **argv, char **envp)
1002 *
1003 * asm input arguments:
1004 * rdi: name, rsi: argv, rdx: envp
1005 *
 1006 * We want to fall back into:
1007 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
1008 *
1009 * do_sys_execve asm fallback arguments:
1010 * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
1011 */
1012ENTRY(kernel_execve)
1013 CFI_STARTPROC
1014 FAKE_STACK_FRAME $0
1015 SAVE_ALL
1016 call sys_execve
1017 movq %rax, RAX(%rsp)
1018 RESTORE_REST
1019 testq %rax,%rax
1020 je int_ret_from_sys_call
1021 RESTORE_ARGS
1022 UNFAKE_STACK_FRAME
1023 ret
1024 CFI_ENDPROC
1025ENDPROC(kernel_execve)
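A hedged sketch of how the kernel_execve() fallback described above is typically invoked from kernel context (illustrative only; run_init_sketch is a hypothetical name).

/* Hypothetical caller. */
static int run_init_sketch(void)
{
	char *argv[] = { "/sbin/init", NULL };
	char *envp[] = { "HOME=/", "TERM=linux", NULL };

	/*
	 * On success control continues in the new program via
	 * int_ret_from_sys_call and never comes back here; on failure
	 * a negative errno is returned.
	 */
	return kernel_execve("/sbin/init", argv, envp);
}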
1026
1027KPROBE_ENTRY(page_fault)
1028 errorentry do_page_fault
1029KPROBE_END(page_fault)
1030
1031ENTRY(coprocessor_error)
1032 zeroentry do_coprocessor_error
1033END(coprocessor_error)
1034
1035ENTRY(simd_coprocessor_error)
1036 zeroentry do_simd_coprocessor_error
1037END(simd_coprocessor_error)
1038
1039ENTRY(device_not_available)
1040 zeroentry math_state_restore
1041END(device_not_available)
1042
1043 /* runs on exception stack */
1044KPROBE_ENTRY(debug)
1045 INTR_FRAME
1046 pushq $0
1047 CFI_ADJUST_CFA_OFFSET 8
1048 paranoidentry do_debug, DEBUG_STACK
1049 paranoidexit
1050KPROBE_END(debug)
1051
1052 /* runs on exception stack */
1053KPROBE_ENTRY(nmi)
1054 INTR_FRAME
1055 pushq $-1
1056 CFI_ADJUST_CFA_OFFSET 8
1057 paranoidentry do_nmi, 0, 0
1058#ifdef CONFIG_TRACE_IRQFLAGS
1059 paranoidexit 0
1060#else
1061 jmp paranoid_exit1
1062 CFI_ENDPROC
1063#endif
1064KPROBE_END(nmi)
1065
1066KPROBE_ENTRY(int3)
1067 INTR_FRAME
1068 pushq $0
1069 CFI_ADJUST_CFA_OFFSET 8
1070 paranoidentry do_int3, DEBUG_STACK
1071 jmp paranoid_exit1
1072 CFI_ENDPROC
1073KPROBE_END(int3)
1074
1075ENTRY(overflow)
1076 zeroentry do_overflow
1077END(overflow)
1078
1079ENTRY(bounds)
1080 zeroentry do_bounds
1081END(bounds)
1082
1083ENTRY(invalid_op)
1084 zeroentry do_invalid_op
1085END(invalid_op)
1086
1087ENTRY(coprocessor_segment_overrun)
1088 zeroentry do_coprocessor_segment_overrun
1089END(coprocessor_segment_overrun)
1090
1091ENTRY(reserved)
1092 zeroentry do_reserved
1093END(reserved)
1094
1095 /* runs on exception stack */
1096ENTRY(double_fault)
1097 XCPT_FRAME
1098 paranoidentry do_double_fault
1099 jmp paranoid_exit1
1100 CFI_ENDPROC
1101END(double_fault)
1102
1103ENTRY(invalid_TSS)
1104 errorentry do_invalid_TSS
1105END(invalid_TSS)
1106
1107ENTRY(segment_not_present)
1108 errorentry do_segment_not_present
1109END(segment_not_present)
1110
1111 /* runs on exception stack */
1112ENTRY(stack_segment)
1113 XCPT_FRAME
1114 paranoidentry do_stack_segment
1115 jmp paranoid_exit1
1116 CFI_ENDPROC
1117END(stack_segment)
1118
1119KPROBE_ENTRY(general_protection)
1120 errorentry do_general_protection
1121KPROBE_END(general_protection)
1122
1123ENTRY(alignment_check)
1124 errorentry do_alignment_check
1125END(alignment_check)
1126
1127ENTRY(divide_error)
1128 zeroentry do_divide_error
1129END(divide_error)
1130
1131ENTRY(spurious_interrupt_bug)
1132 zeroentry do_spurious_interrupt_bug
1133END(spurious_interrupt_bug)
1134
1135#ifdef CONFIG_X86_MCE
1136 /* runs on exception stack */
1137ENTRY(machine_check)
1138 INTR_FRAME
1139 pushq $0
1140 CFI_ADJUST_CFA_OFFSET 8
1141 paranoidentry do_machine_check
1142 jmp paranoid_exit1
1143 CFI_ENDPROC
1144END(machine_check)
1145#endif
1146
1147/* Call softirq on interrupt stack. Interrupts are off. */
1148ENTRY(call_softirq)
1149 CFI_STARTPROC
1150 push %rbp
1151 CFI_ADJUST_CFA_OFFSET 8
1152 CFI_REL_OFFSET rbp,0
1153 mov %rsp,%rbp
1154 CFI_DEF_CFA_REGISTER rbp
1155 incl %gs:pda_irqcount
1156 cmove %gs:pda_irqstackptr,%rsp
1157 push %rbp # backlink for old unwinder
1158 call __do_softirq
1159 leaveq
1160 CFI_DEF_CFA_REGISTER rsp
1161 CFI_ADJUST_CFA_OFFSET -8
1162 decl %gs:pda_irqcount
1163 ret
1164 CFI_ENDPROC
1165ENDPROC(call_softirq)
1166
1167KPROBE_ENTRY(ignore_sysret)
1168 CFI_STARTPROC
1169 mov $-ENOSYS,%eax
1170 sysret
1171 CFI_ENDPROC
1172ENDPROC(ignore_sysret)
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
new file mode 100644
index 000000000000..47496a40e84f
--- /dev/null
+++ b/arch/x86/kernel/genapic_64.c
@@ -0,0 +1,66 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Generic APIC sub-arch probe layer.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/threads.h>
12#include <linux/cpumask.h>
13#include <linux/string.h>
14#include <linux/module.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18
19#include <asm/smp.h>
20#include <asm/ipi.h>
21#include <asm/genapic.h>
22
23#ifdef CONFIG_ACPI
24#include <acpi/acpi_bus.h>
25#endif
26
27/* which logical CPU number maps to which CPU (physical APIC ID) */
28u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
29 = { [0 ... NR_CPUS-1] = BAD_APICID };
30EXPORT_SYMBOL(x86_cpu_to_apicid);
31
32u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
33
34struct genapic __read_mostly *genapic = &apic_flat;
35
36/*
37 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
38 */
39void __init setup_apic_routing(void)
40{
41#ifdef CONFIG_ACPI
42 /*
43 * Quirk: some x86_64 machines can only use physical APIC mode
44 * regardless of how many processors are present (x86_64 ES7000
45 * is an example).
46 */
47 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
48 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
49 genapic = &apic_physflat;
50 else
51#endif
52
53 if (cpus_weight(cpu_possible_map) <= 8)
54 genapic = &apic_flat;
55 else
56 genapic = &apic_physflat;
57
58 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
59}
60
61/* Same for both flat and physical. */
62
63void send_IPI_self(int vector)
64{
65 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
66}
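How callers reach the selected sub-arch once setup_apic_routing() has run: a small sketch (illustrative only; example_resched_one is a hypothetical helper) dispatching through the genapic ops pointer declared above.

/* Hypothetical helper: send a reschedule IPI to one CPU. */
static void example_resched_one(int cpu)
{
	/* Everything goes through the ops struct picked at boot time. */
	genapic->send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}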
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
new file mode 100644
index 000000000000..ecb01eefdd27
--- /dev/null
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -0,0 +1,194 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Flat APIC subarch code.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/errno.h>
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <asm/smp.h>
19#include <asm/ipi.h>
20#include <asm/genapic.h>
21
22static cpumask_t flat_target_cpus(void)
23{
24 return cpu_online_map;
25}
26
27static cpumask_t flat_vector_allocation_domain(int cpu)
28{
29 /* Careful. Some cpus do not strictly honor the set of cpus
30 * specified in the interrupt destination when using lowest
31 * priority interrupt delivery mode.
32 *
 33	 * In particular, a hyperthreading CPU was observed to
 34	 * deliver interrupts to the wrong hyperthread when only one
 35	 * hyperthread was specified in the interrupt destination.
36 */
37 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
38 return domain;
39}
40
41/*
42 * Set up the logical destination ID.
43 *
44 * Intel recommends to set DFR, LDR and TPR before enabling
45 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
46 * document number 292116). So here it goes...
47 */
48static void flat_init_apic_ldr(void)
49{
50 unsigned long val;
51 unsigned long num, id;
52
53 num = smp_processor_id();
54 id = 1UL << num;
55 x86_cpu_to_log_apicid[num] = id;
56 apic_write(APIC_DFR, APIC_DFR_FLAT);
57 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
58 val |= SET_APIC_LOGICAL_ID(id);
59 apic_write(APIC_LDR, val);
60}
61
62static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
63{
64 unsigned long mask = cpus_addr(cpumask)[0];
65 unsigned long flags;
66
67 local_irq_save(flags);
68 __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
69 local_irq_restore(flags);
70}
71
72static void flat_send_IPI_allbutself(int vector)
73{
74#ifdef CONFIG_HOTPLUG_CPU
75 int hotplug = 1;
76#else
77 int hotplug = 0;
78#endif
79 if (hotplug || vector == NMI_VECTOR) {
80 cpumask_t allbutme = cpu_online_map;
81
82 cpu_clear(smp_processor_id(), allbutme);
83
84 if (!cpus_empty(allbutme))
85 flat_send_IPI_mask(allbutme, vector);
86 } else if (num_online_cpus() > 1) {
87 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
88 }
89}
90
91static void flat_send_IPI_all(int vector)
92{
93 if (vector == NMI_VECTOR)
94 flat_send_IPI_mask(cpu_online_map, vector);
95 else
96 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
97}
98
99static int flat_apic_id_registered(void)
100{
101 return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
102}
103
104static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
105{
106 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
107}
108
109static unsigned int phys_pkg_id(int index_msb)
110{
111 return hard_smp_processor_id() >> index_msb;
112}
113
114struct genapic apic_flat = {
115 .name = "flat",
116 .int_delivery_mode = dest_LowestPrio,
117 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
118 .target_cpus = flat_target_cpus,
119 .vector_allocation_domain = flat_vector_allocation_domain,
120 .apic_id_registered = flat_apic_id_registered,
121 .init_apic_ldr = flat_init_apic_ldr,
122 .send_IPI_all = flat_send_IPI_all,
123 .send_IPI_allbutself = flat_send_IPI_allbutself,
124 .send_IPI_mask = flat_send_IPI_mask,
125 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
126 .phys_pkg_id = phys_pkg_id,
127};
128
129/*
 130 * Physflat mode is used when there are more than 8 CPUs on an AMD system.
131 * We cannot use logical delivery in this case because the mask
132 * overflows, so use physical mode.
133 */
134
135static cpumask_t physflat_target_cpus(void)
136{
137 return cpu_online_map;
138}
139
140static cpumask_t physflat_vector_allocation_domain(int cpu)
141{
142 cpumask_t domain = CPU_MASK_NONE;
143 cpu_set(cpu, domain);
144 return domain;
145}
146
147
148static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
149{
150 send_IPI_mask_sequence(cpumask, vector);
151}
152
153static void physflat_send_IPI_allbutself(int vector)
154{
155 cpumask_t allbutme = cpu_online_map;
156
157 cpu_clear(smp_processor_id(), allbutme);
158 physflat_send_IPI_mask(allbutme, vector);
159}
160
161static void physflat_send_IPI_all(int vector)
162{
163 physflat_send_IPI_mask(cpu_online_map, vector);
164}
165
166static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
167{
168 int cpu;
169
170 /*
 171	 * We're using fixed IRQ delivery, so we can only return one phys APIC ID.
172 * May as well be the first.
173 */
174 cpu = first_cpu(cpumask);
175 if ((unsigned)cpu < NR_CPUS)
176 return x86_cpu_to_apicid[cpu];
177 else
178 return BAD_APICID;
179}
180
181struct genapic apic_physflat = {
182 .name = "physical flat",
183 .int_delivery_mode = dest_Fixed,
184 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
185 .target_cpus = physflat_target_cpus,
186 .vector_allocation_domain = physflat_vector_allocation_domain,
187 .apic_id_registered = flat_apic_id_registered,
188 .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
189 .send_IPI_all = physflat_send_IPI_all,
190 .send_IPI_allbutself = physflat_send_IPI_allbutself,
191 .send_IPI_mask = physflat_send_IPI_mask,
192 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
193 .phys_pkg_id = phys_pkg_id,
194};
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
new file mode 100644
index 000000000000..41e8aec4c61d
--- /dev/null
+++ b/arch/x86/kernel/geode_32.c
@@ -0,0 +1,155 @@
1/*
2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public License
7 * as published by the Free Software Foundation.
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/ioport.h>
13#include <linux/io.h>
14#include <asm/msr.h>
15#include <asm/geode.h>
16
17static struct {
18 char *name;
19 u32 msr;
20 int size;
21 u32 base;
22} lbars[] = {
23 { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
24 { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
25 { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
26 { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
27};
28
29static void __init init_lbars(void)
30{
31 u32 lo, hi;
32 int i;
33
34 for (i = 0; i < ARRAY_SIZE(lbars); i++) {
35 rdmsr(lbars[i].msr, lo, hi);
36 if (hi & 0x01)
37 lbars[i].base = lo & 0x0000ffff;
38
39 if (lbars[i].base == 0)
40 printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
41 lbars[i].name);
42 }
43}
44
45int geode_get_dev_base(unsigned int dev)
46{
47 BUG_ON(dev >= ARRAY_SIZE(lbars));
48 return lbars[dev].base;
49}
50EXPORT_SYMBOL_GPL(geode_get_dev_base);
51
52/* === GPIO API === */
53
54void geode_gpio_set(unsigned int gpio, unsigned int reg)
55{
56 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
57
58 if (!base)
59 return;
60
61 if (gpio < 16)
62 outl(1 << gpio, base + reg);
63 else
64 outl(1 << (gpio - 16), base + 0x80 + reg);
65}
66EXPORT_SYMBOL_GPL(geode_gpio_set);
67
68void geode_gpio_clear(unsigned int gpio, unsigned int reg)
69{
70 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
71
72 if (!base)
73 return;
74
75 if (gpio < 16)
76 outl(1 << (gpio + 16), base + reg);
77 else
78 outl(1 << gpio, base + 0x80 + reg);
79}
80EXPORT_SYMBOL_GPL(geode_gpio_clear);
81
82int geode_gpio_isset(unsigned int gpio, unsigned int reg)
83{
84 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
85
86 if (!base)
87 return 0;
88
89 if (gpio < 16)
90 return (inl(base + reg) & (1 << gpio)) ? 1 : 0;
91 else
92 return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0;
93}
94EXPORT_SYMBOL_GPL(geode_gpio_isset);
95
96void geode_gpio_set_irq(unsigned int group, unsigned int irq)
97{
98 u32 lo, hi;
99
100 if (group > 7 || irq > 15)
101 return;
102
103 rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
104
105 lo &= ~(0xF << (group * 4));
106 lo |= (irq & 0xF) << (group * 4);
107
108 wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
109}
110EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
111
112void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
113{
114 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
115 u32 offset, shift, val;
116
117 if (gpio >= 24)
118 offset = GPIO_MAP_W;
119 else if (gpio >= 16)
120 offset = GPIO_MAP_Z;
121 else if (gpio >= 8)
122 offset = GPIO_MAP_Y;
123 else
124 offset = GPIO_MAP_X;
125
126 shift = (gpio % 8) * 4;
127
128 val = inl(base + offset);
129
130 /* Clear whatever was there before */
131 val &= ~(0xF << shift);
132
133 /* And set the new value */
134
135 val |= ((pair & 7) << shift);
136
137 /* Set the PME bit if this is a PME event */
138
139 if (pme)
140 val |= (1 << (shift + 3));
141
142 outl(val, base + offset);
143}
144EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
145
146static int __init geode_southbridge_init(void)
147{
148 if (!is_geode())
149 return -ENODEV;
150
151 init_lbars();
152 return 0;
153}
154
155postcore_initcall(geode_southbridge_init);
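A hedged usage sketch of the GPIO API above (illustrative only; the LED pin number is made up, and the register offset macro is assumed to come from <asm/geode.h>).

/* Hypothetical helper: drive an LED wired to Geode GPIO pin 6. */
static void example_led(int on)
{
	if (on)
		geode_gpio_set(6, GPIO_OUTPUT_VAL);	/* drive pin 6 high */
	else
		geode_gpio_clear(6, GPIO_OUTPUT_VAL);	/* drive pin 6 low  */
}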
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
new file mode 100644
index 000000000000..6c34bdd22e26
--- /dev/null
+++ b/arch/x86/kernel/head64.c
@@ -0,0 +1,86 @@
1/*
2 * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 */
6
7#include <linux/init.h>
8#include <linux/linkage.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/string.h>
12#include <linux/percpu.h>
13
14#include <asm/processor.h>
15#include <asm/proto.h>
16#include <asm/smp.h>
17#include <asm/bootsetup.h>
18#include <asm/setup.h>
19#include <asm/desc.h>
20#include <asm/pgtable.h>
21#include <asm/tlbflush.h>
22#include <asm/sections.h>
23
24static void __init zap_identity_mappings(void)
25{
26 pgd_t *pgd = pgd_offset_k(0UL);
27 pgd_clear(pgd);
28 __flush_tlb();
29}
30
 31/* Don't add a printk() in here. printk() relies on the PDA, which is not
 32   initialized yet. */
33static void __init clear_bss(void)
34{
35 memset(__bss_start, 0,
36 (unsigned long) __bss_stop - (unsigned long) __bss_start);
37}
38
39#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
40#define OLD_CL_MAGIC_ADDR 0x20
41#define OLD_CL_MAGIC 0xA33F
42#define OLD_CL_OFFSET 0x22
43
44static void __init copy_bootdata(char *real_mode_data)
45{
46 unsigned long new_data;
47 char * command_line;
48
49 memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
50 new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
51 if (!new_data) {
52 if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) {
53 return;
54 }
55 new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET);
56 }
57 command_line = __va(new_data);
58 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
59}
60
61void __init x86_64_start_kernel(char * real_mode_data)
62{
63 int i;
64
65 /* clear bss before set_intr_gate with early_idt_handler */
66 clear_bss();
67
68 /* Make NULL pointers segfault */
69 zap_identity_mappings();
70
71 for (i = 0; i < IDT_ENTRIES; i++)
72 set_intr_gate(i, early_idt_handler);
73 asm volatile("lidt %0" :: "m" (idt_descr));
74
75 early_printk("Kernel alive\n");
76
77 for (i = 0; i < NR_CPUS; i++)
78 cpu_pda(i) = &boot_cpu_pda[i];
79
80 pda_init(0);
81 copy_bootdata(__va(real_mode_data));
82#ifdef CONFIG_SMP
83 cpu_set(0, cpu_online_map);
84#endif
85 start_kernel();
86}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
new file mode 100644
index 000000000000..9150ca9b5f80
--- /dev/null
+++ b/arch/x86/kernel/head_32.S
@@ -0,0 +1,578 @@
1/*
2 * linux/arch/i386/kernel/head.S -- the 32-bit startup code.
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Enhanced CPU detection and feature setting code by Mike Jagdis
7 * and Martin Mares, November 1997.
8 */
9
10.text
11#include <linux/threads.h>
12#include <linux/linkage.h>
13#include <asm/segment.h>
14#include <asm/page.h>
15#include <asm/pgtable.h>
16#include <asm/desc.h>
17#include <asm/cache.h>
18#include <asm/thread_info.h>
19#include <asm/asm-offsets.h>
20#include <asm/setup.h>
21
22/*
23 * References to members of the new_cpu_data structure.
24 */
25
26#define X86 new_cpu_data+CPUINFO_x86
27#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
28#define X86_MODEL new_cpu_data+CPUINFO_x86_model
29#define X86_MASK new_cpu_data+CPUINFO_x86_mask
30#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
31#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
32#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
33#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
34
35/*
36 * This is how much memory *in addition to the memory covered up to
37 * and including _end* we need mapped initially.
38 * We need:
39 * - one bit for each possible page, but only in low memory, which means
40 * 2^32/4096/8 = 128K worst case (4G/4G split.)
41 * - enough space to map all low memory, which means
42 * (2^32/4096) / 1024 pages (worst case, non PAE)
43 * (2^32/4096) / 512 + 4 pages (worst case for PAE)
44 * - a few pages for allocator use before the kernel pagetable has
45 * been set up
46 *
47 * Modulo rounding, each megabyte assigned here requires a kilobyte of
48 * memory, which is currently unreclaimed.
49 *
50 * This should be a multiple of a page.
51 */
52LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
53
54#if PTRS_PER_PMD > 1
55PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
56#else
57PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
58#endif
59BOOTBITMAP_SIZE = LOW_PAGES / 8
60ALLOCATOR_SLOP = 4
61
62INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
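A standalone arithmetic sketch (illustrative only, plain userspace C) plugging the non-PAE worst case from the comment above into the same formulas.

#include <stdio.h>

int main(void)
{
	unsigned long low_pages = 1UL << (32 - 12);	/* 4G of 4K pages          */
	unsigned long bitmap    = low_pages / 8;	/* boot bitmap: 128K       */
	unsigned long tables    = (low_pages / 1024 + 4) * 4096;	/* PTE pages + slop */

	/* Prints: bitmap=128K page tables=4112K */
	printf("bitmap=%luK page tables=%luK\n", bitmap >> 10, tables >> 10);
	return 0;
}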
63
64/*
65 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
66 * %esi points to the real-mode code as a 32-bit pointer.
67 * CS and DS must be 4 GB flat segments, but we don't depend on
68 * any particular GDT layout, because we load our own as soon as we
69 * can.
70 */
71.section .text.head,"ax",@progbits
72ENTRY(startup_32)
73
74/*
75 * Set segments to known values.
76 */
77 cld
78 lgdt boot_gdt_descr - __PAGE_OFFSET
79 movl $(__BOOT_DS),%eax
80 movl %eax,%ds
81 movl %eax,%es
82 movl %eax,%fs
83 movl %eax,%gs
84
85/*
86 * Clear BSS first so that there are no surprises...
87 * No need to cld as DF is already clear from cld above...
88 */
89 xorl %eax,%eax
90 movl $__bss_start - __PAGE_OFFSET,%edi
91 movl $__bss_stop - __PAGE_OFFSET,%ecx
92 subl %edi,%ecx
93 shrl $2,%ecx
94 rep ; stosl
95/*
96 * Copy bootup parameters out of the way.
97 * Note: %esi still has the pointer to the real-mode data.
 98 * With kexec as the boot loader, the parameter segment might be loaded beyond
 99 * the kernel image and might not even be addressable by the early boot page
 100 * tables (kexec-on-panic case). Hence copy out the parameters before
 101 * initializing page tables.
102 */
103 movl $(boot_params - __PAGE_OFFSET),%edi
104 movl $(PARAM_SIZE/4),%ecx
105 cld
106 rep
107 movsl
108 movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
109 andl %esi,%esi
110 jnz 2f # New command line protocol
111 cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
112 jne 1f
113 movzwl OLD_CL_OFFSET,%esi
114 addl $(OLD_CL_BASE_ADDR),%esi
1152:
116 movl $(boot_command_line - __PAGE_OFFSET),%edi
117 movl $(COMMAND_LINE_SIZE/4),%ecx
118 rep
119 movsl
1201:
121
122/*
123 * Initialize page tables. This creates a PDE and a set of page
124 * tables, which are located immediately beyond _end. The variable
125 * init_pg_tables_end is set up to point to the first "safe" location.
126 * Mappings are created both at virtual address 0 (identity mapping)
127 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
128 *
129 * Warning: don't use %esi or the stack in this code. However, %esp
130 * can be used as a GPR if you really need it...
131 */
132page_pde_offset = (__PAGE_OFFSET >> 20);
133
134 movl $(pg0 - __PAGE_OFFSET), %edi
135 movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
136 movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
13710:
138 leal 0x007(%edi),%ecx /* Create PDE entry */
139 movl %ecx,(%edx) /* Store identity PDE entry */
140 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
141 addl $4,%edx
142 movl $1024, %ecx
14311:
144 stosl
145 addl $0x1000,%eax
146 loop 11b
147 /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
148 /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
149 leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
150 cmpl %ebp,%eax
151 jb 10b
152 movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
153
154 xorl %ebx,%ebx /* This is the boot CPU (BSP) */
155 jmp 3f
156/*
157 * Non-boot CPU entry point; entered from trampoline.S
158 * We can't lgdt here, because lgdt itself uses a data segment, but
159 * we know the trampoline has already loaded the boot_gdt for us.
160 *
 161 * If CPU hotplug is not supported then this code can go in the init
 162 * section, which will be freed later.
163 */
164
165#ifndef CONFIG_HOTPLUG_CPU
166.section .init.text,"ax",@progbits
167#endif
168
169 /* Do an early initialization of the fixmap area */
170 movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
171 movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
172 addl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
173 movl %eax, 4092(%edx)
174
175#ifdef CONFIG_SMP
176ENTRY(startup_32_smp)
177 cld
178 movl $(__BOOT_DS),%eax
179 movl %eax,%ds
180 movl %eax,%es
181 movl %eax,%fs
182 movl %eax,%gs
183
184/*
185 * New page tables may be in 4Mbyte page mode and may
186 * be using the global pages.
187 *
188 * NOTE! If we are on a 486 we may have no cr4 at all!
189 * So we do not try to touch it unless we really have
190 * some bits in it to set. This won't work if the BSP
191 * implements cr4 but this AP does not -- very unlikely
192 * but be warned! The same applies to the pse feature
193 * if not equally supported. --macro
194 *
195 * NOTE! We have to correct for the fact that we're
196 * not yet offset PAGE_OFFSET..
197 */
198#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
199 movl cr4_bits,%edx
200 andl %edx,%edx
201 jz 6f
202 movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
203 orl %edx,%eax
204 movl %eax,%cr4
205
206 btl $5, %eax # check if PAE is enabled
207 jnc 6f
208
209 /* Check if extended functions are implemented */
210 movl $0x80000000, %eax
211 cpuid
212 cmpl $0x80000000, %eax
213 jbe 6f
214 mov $0x80000001, %eax
215 cpuid
216 /* Execute Disable bit supported? */
217 btl $20, %edx
218 jnc 6f
219
220 /* Setup EFER (Extended Feature Enable Register) */
221 movl $0xc0000080, %ecx
222 rdmsr
223
224 btsl $11, %eax
225 /* Make changes effective */
226 wrmsr
227
2286:
229 /* This is a secondary processor (AP) */
230 xorl %ebx,%ebx
231 incl %ebx
232
233#endif /* CONFIG_SMP */
2343:
235
236/*
237 * Enable paging
238 */
239 movl $swapper_pg_dir-__PAGE_OFFSET,%eax
240 movl %eax,%cr3 /* set the page table pointer.. */
241 movl %cr0,%eax
242 orl $0x80000000,%eax
243 movl %eax,%cr0 /* ..and set paging (PG) bit */
244 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
2451:
246 /* Set up the stack pointer */
247 lss stack_start,%esp
248
249/*
 250 * Initialize eflags. Some BIOSes leave bits like NT set. This would
251 * confuse the debugger if this code is traced.
252 * XXX - best to initialize before switching to protected mode.
253 */
254 pushl $0
255 popfl
256
257#ifdef CONFIG_SMP
258 andl %ebx,%ebx
259 jz 1f /* Initial CPU cleans BSS */
260 jmp checkCPUtype
2611:
262#endif /* CONFIG_SMP */
263
264/*
265 * start system 32-bit setup. We need to re-do some of the things done
266 * in 16-bit mode for the "real" operations.
267 */
268 call setup_idt
269
270checkCPUtype:
271
272 movl $-1,X86_CPUID # -1 for no CPUID initially
273
274/* check if it is 486 or 386. */
275/*
276 * XXX - this does a lot of unnecessary setup. Alignment checks don't
277 * apply at our cpl of 0 and the stack ought to be aligned already, and
278 * we don't need to preserve eflags.
279 */
280
281 movb $3,X86 # at least 386
282 pushfl # push EFLAGS
283 popl %eax # get EFLAGS
284 movl %eax,%ecx # save original EFLAGS
285 xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
286 pushl %eax # copy to EFLAGS
287 popfl # set EFLAGS
288 pushfl # get new EFLAGS
289 popl %eax # put it in eax
290 xorl %ecx,%eax # change in flags
291 pushl %ecx # restore original EFLAGS
292 popfl
293 testl $0x40000,%eax # check if AC bit changed
294 je is386
295
296 movb $4,X86 # at least 486
297 testl $0x200000,%eax # check if ID bit changed
298 je is486
299
300 /* get vendor info */
301 xorl %eax,%eax # call CPUID with 0 -> return vendor ID
302 cpuid
303 movl %eax,X86_CPUID # save CPUID level
304 movl %ebx,X86_VENDOR_ID # lo 4 chars
305 movl %edx,X86_VENDOR_ID+4 # next 4 chars
306 movl %ecx,X86_VENDOR_ID+8 # last 4 chars
307
308 orl %eax,%eax # do we have processor info as well?
309 je is486
310
311 movl $1,%eax # Use the CPUID instruction to get CPU type
312 cpuid
313 movb %al,%cl # save reg for future use
314 andb $0x0f,%ah # mask processor family
315 movb %ah,X86
316 andb $0xf0,%al # mask model
317 shrb $4,%al
318 movb %al,X86_MODEL
319 andb $0x0f,%cl # mask mask revision
320 movb %cl,X86_MASK
321 movl %edx,X86_CAPABILITY
322
323is486: movl $0x50022,%ecx # set AM, WP, NE and MP
324 jmp 2f
325
326is386: movl $2,%ecx # set MP
3272: movl %cr0,%eax
328 andl $0x80000011,%eax # Save PG,PE,ET
329 orl %ecx,%eax
330 movl %eax,%cr0
331
332 call check_x87
333 lgdt early_gdt_descr
334 lidt idt_descr
335 ljmp $(__KERNEL_CS),$1f
3361: movl $(__KERNEL_DS),%eax # reload all the segment registers
337 movl %eax,%ss # after changing gdt.
338 movl %eax,%fs # gets reset once there's real percpu
339
340 movl $(__USER_DS),%eax # DS/ES contains default USER segment
341 movl %eax,%ds
342 movl %eax,%es
343
344 xorl %eax,%eax # Clear GS and LDT
345 movl %eax,%gs
346 lldt %ax
347
348 cld # gcc2 wants the direction flag cleared at all times
349 pushl $0 # fake return address for unwinder
350#ifdef CONFIG_SMP
351 movb ready, %cl
352 movb $1, ready
353 cmpb $0,%cl # the first CPU calls start_kernel
354 je 1f
355 movl $(__KERNEL_PERCPU), %eax
356 movl %eax,%fs # set this cpu's percpu
357 jmp initialize_secondary # all other CPUs call initialize_secondary
3581:
359#endif /* CONFIG_SMP */
360 jmp start_kernel
361
362/*
363 * We depend on ET to be correct. This checks for 287/387.
364 */
365check_x87:
366 movb $0,X86_HARD_MATH
367 clts
368 fninit
369 fstsw %ax
370 cmpb $0,%al
371 je 1f
372 movl %cr0,%eax /* no coprocessor: have to set bits */
373 xorl $4,%eax /* set EM */
374 movl %eax,%cr0
375 ret
376 ALIGN
3771: movb $1,X86_HARD_MATH
378 .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
379 ret
380
381/*
382 * setup_idt
383 *
 384 * sets up an IDT with 256 entries pointing to
 385 * ignore_int, interrupt gates. It doesn't actually load the
 386 * IDT - that can be done only after paging has been enabled
387 * and the kernel moved to PAGE_OFFSET. Interrupts
388 * are enabled elsewhere, when we can be relatively
389 * sure everything is ok.
390 *
391 * Warning: %esi is live across this function.
392 */
393setup_idt:
394 lea ignore_int,%edx
395 movl $(__KERNEL_CS << 16),%eax
396 movw %dx,%ax /* selector = 0x0010 = cs */
397 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
398
399 lea idt_table,%edi
400 mov $256,%ecx
401rp_sidt:
402 movl %eax,(%edi)
403 movl %edx,4(%edi)
404 addl $8,%edi
405 dec %ecx
406 jne rp_sidt
407
408.macro set_early_handler handler,trapno
409 lea \handler,%edx
410 movl $(__KERNEL_CS << 16),%eax
411 movw %dx,%ax
412 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
413 lea idt_table,%edi
414 movl %eax,8*\trapno(%edi)
415 movl %edx,8*\trapno+4(%edi)
416.endm
417
418 set_early_handler handler=early_divide_err,trapno=0
419 set_early_handler handler=early_illegal_opcode,trapno=6
420 set_early_handler handler=early_protection_fault,trapno=13
421 set_early_handler handler=early_page_fault,trapno=14
422
423 ret
424
425early_divide_err:
426 xor %edx,%edx
427 pushl $0 /* fake errcode */
428 jmp early_fault
429
430early_illegal_opcode:
431 movl $6,%edx
432 pushl $0 /* fake errcode */
433 jmp early_fault
434
435early_protection_fault:
436 movl $13,%edx
437 jmp early_fault
438
439early_page_fault:
440 movl $14,%edx
441 jmp early_fault
442
443early_fault:
444 cld
445#ifdef CONFIG_PRINTK
446 movl $(__KERNEL_DS),%eax
447 movl %eax,%ds
448 movl %eax,%es
449 cmpl $2,early_recursion_flag
450 je hlt_loop
451 incl early_recursion_flag
452 movl %cr2,%eax
453 pushl %eax
454 pushl %edx /* trapno */
455 pushl $fault_msg
456#ifdef CONFIG_EARLY_PRINTK
457 call early_printk
458#else
459 call printk
460#endif
461#endif
462hlt_loop:
463 hlt
464 jmp hlt_loop
465
466/* This is the default interrupt "handler" :-) */
467 ALIGN
468ignore_int:
469 cld
470#ifdef CONFIG_PRINTK
471 pushl %eax
472 pushl %ecx
473 pushl %edx
474 pushl %es
475 pushl %ds
476 movl $(__KERNEL_DS),%eax
477 movl %eax,%ds
478 movl %eax,%es
479 cmpl $2,early_recursion_flag
480 je hlt_loop
481 incl early_recursion_flag
482 pushl 16(%esp)
483 pushl 24(%esp)
484 pushl 32(%esp)
485 pushl 40(%esp)
486 pushl $int_msg
487#ifdef CONFIG_EARLY_PRINTK
488 call early_printk
489#else
490 call printk
491#endif
492 addl $(5*4),%esp
493 popl %ds
494 popl %es
495 popl %edx
496 popl %ecx
497 popl %eax
498#endif
499 iret
500
501.section .text
502/*
503 * Real beginning of normal "text" segment
504 */
505ENTRY(stext)
506ENTRY(_stext)
507
508/*
509 * BSS section
510 */
511.section ".bss.page_aligned","wa"
512 .align PAGE_SIZE_asm
513ENTRY(swapper_pg_dir)
514 .fill 1024,4,0
515ENTRY(swapper_pg_pmd)
516 .fill 1024,4,0
517ENTRY(empty_zero_page)
518 .fill 4096,1,0
519
520/*
521 * This starts the data section.
522 */
523.data
524ENTRY(stack_start)
525 .long init_thread_union+THREAD_SIZE
526 .long __BOOT_DS
527
528ready: .byte 0
529
530early_recursion_flag:
531 .long 0
532
533int_msg:
534 .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
535
536fault_msg:
537 .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
538 .asciz "Stack: %p %p %p %p %p %p %p %p\n"
539
540#include "../../x86/xen/xen-head.S"
541
542/*
 543 * The IDT and GDT 'descriptors' are strange 48-bit objects
 544 * used only by the lidt and lgdt instructions. They are not
 545 * like usual segment descriptors - they consist of a 16-bit
 546 * limit and a 32-bit linear address value:
547 */
548
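The same 48-bit object expressed as a C structure, as a sketch only (the struct name is hypothetical; the kernel has its own definition for this).

/* Hypothetical illustration of the lidt/lgdt operand layout on 32-bit. */
struct gdt_idt_ptr {
	unsigned short limit;		/* 16-bit table size - 1  */
	unsigned long  base;		/* 32-bit linear address  */
} __attribute__((packed));		/* 6 bytes total          */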
549.globl boot_gdt_descr
550.globl idt_descr
551
552 ALIGN
553# early boot GDT descriptor (must use 1:1 address mapping)
554 .word 0 # 32 bit align gdt_desc.address
555boot_gdt_descr:
556 .word __BOOT_DS+7
557 .long boot_gdt - __PAGE_OFFSET
558
559 .word 0 # 32-bit align idt_desc.address
560idt_descr:
561 .word IDT_ENTRIES*8-1 # idt contains 256 entries
562 .long idt_table
563
564# boot GDT descriptor (later on used by CPU#0):
565 .word 0 # 32 bit align gdt_desc.address
566ENTRY(early_gdt_descr)
567 .word GDT_ENTRIES*8-1
568 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
569
570/*
571 * The boot_gdt must mirror the equivalent in setup.S and is
572 * used only for booting.
573 */
574 .align L1_CACHE_BYTES
575ENTRY(boot_gdt)
576 .fill GDT_ENTRY_BOOT_CS,8,0
577 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
578 .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
new file mode 100644
index 000000000000..b6167fe3330e
--- /dev/null
+++ b/arch/x86/kernel/head_64.S
@@ -0,0 +1,416 @@
1/*
2 * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
8 * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
9 */
10
11
12#include <linux/linkage.h>
13#include <linux/threads.h>
14#include <linux/init.h>
15#include <asm/desc.h>
16#include <asm/segment.h>
17#include <asm/pgtable.h>
18#include <asm/page.h>
19#include <asm/msr.h>
20#include <asm/cache.h>
21
 22/* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
 23 * because we need identity-mapped pages.
 24 *
 25 */
26
27 .text
28 .section .text.head
29 .code64
30 .globl startup_64
31startup_64:
32
33 /*
 34	 * At this point the CPU runs in 64-bit mode with CS.L = 1, CS.D = 0,
35 * and someone has loaded an identity mapped page table
36 * for us. These identity mapped page tables map all of the
37 * kernel pages and possibly all of memory.
38 *
39 * %esi holds a physical pointer to real_mode_data.
40 *
41 * We come here either directly from a 64bit bootloader, or from
42 * arch/x86_64/boot/compressed/head.S.
43 *
 44	 * We only come here initially at boot; nothing else comes here.
45 *
 46	 * Since we may be loaded at an address different from what we were
 47	 * compiled to run at, we first fix up the physical addresses in our page
48 * tables and then reload them.
49 */
50
51 /* Compute the delta between the address I am compiled to run at and the
52 * address I am actually running at.
53 */
54 leaq _text(%rip), %rbp
55 subq $_text - __START_KERNEL_map, %rbp
56
57 /* Is the address not 2M aligned? */
58 movq %rbp, %rax
59 andl $~LARGE_PAGE_MASK, %eax
60 testl %eax, %eax
61 jnz bad_address
62
63 /* Is the address too large? */
64 leaq _text(%rip), %rdx
65 movq $PGDIR_SIZE, %rax
66 cmpq %rax, %rdx
67 jae bad_address
68
69 /* Fixup the physical addresses in the page table
70 */
71 addq %rbp, init_level4_pgt + 0(%rip)
72 addq %rbp, init_level4_pgt + (258*8)(%rip)
73 addq %rbp, init_level4_pgt + (511*8)(%rip)
74
75 addq %rbp, level3_ident_pgt + 0(%rip)
76
77 addq %rbp, level3_kernel_pgt + (510*8)(%rip)
78 addq %rbp, level3_kernel_pgt + (511*8)(%rip)
79
80 addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
81
82 /* Add an Identity mapping if I am above 1G */
83 leaq _text(%rip), %rdi
84 andq $LARGE_PAGE_MASK, %rdi
85
86 movq %rdi, %rax
87 shrq $PUD_SHIFT, %rax
88 andq $(PTRS_PER_PUD - 1), %rax
89 jz ident_complete
90
91 leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
92 leaq level3_ident_pgt(%rip), %rbx
93 movq %rdx, 0(%rbx, %rax, 8)
94
95 movq %rdi, %rax
96 shrq $PMD_SHIFT, %rax
97 andq $(PTRS_PER_PMD - 1), %rax
98 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
99 leaq level2_spare_pgt(%rip), %rbx
100 movq %rdx, 0(%rbx, %rax, 8)
101ident_complete:
102
103 /* Fixup the kernel text+data virtual addresses
104 */
105 leaq level2_kernel_pgt(%rip), %rdi
106 leaq 4096(%rdi), %r8
107 /* See if it is a valid page table entry */
1081: testq $1, 0(%rdi)
109 jz 2f
110 addq %rbp, 0(%rdi)
111 /* Go to the next page */
1122: addq $8, %rdi
113 cmp %r8, %rdi
114 jne 1b
115
116 /* Fixup phys_base */
117 addq %rbp, phys_base(%rip)
118
119#ifdef CONFIG_SMP
120 addq %rbp, trampoline_level4_pgt + 0(%rip)
121 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
122#endif
123#ifdef CONFIG_ACPI_SLEEP
124 addq %rbp, wakeup_level4_pgt + 0(%rip)
125 addq %rbp, wakeup_level4_pgt + (511*8)(%rip)
126#endif
127
128 /* Due to ENTRY(), sometimes the empty space gets filled with
 129	 * zeros. Better to take a jmp than to rely on the empty space being
 130	 * filled with 0x90 (nop).
131 */
132 jmp secondary_startup_64
133ENTRY(secondary_startup_64)
134 /*
 135	 * At this point the CPU runs in 64-bit mode with CS.L = 1, CS.D = 0,
136 * and someone has loaded a mapped page table.
137 *
138 * %esi holds a physical pointer to real_mode_data.
139 *
140 * We come here either from startup_64 (using physical addresses)
141 * or from trampoline.S (using virtual addresses).
142 *
143 * Using virtual addresses from trampoline.S removes the need
144 * to have any identity mapped pages in the kernel page table
145 * after the boot processor executes this code.
146 */
147
148 /* Enable PAE mode and PGE */
149 xorq %rax, %rax
150 btsq $5, %rax
151 btsq $7, %rax
152 movq %rax, %cr4
153
154 /* Setup early boot stage 4 level pagetables. */
155 movq $(init_level4_pgt - __START_KERNEL_map), %rax
156 addq phys_base(%rip), %rax
157 movq %rax, %cr3
158
159 /* Ensure I am executing from virtual addresses */
160 movq $1f, %rax
161 jmp *%rax
1621:
163
164 /* Check if nx is implemented */
165 movl $0x80000001, %eax
166 cpuid
167 movl %edx,%edi
168
169 /* Setup EFER (Extended Feature Enable Register) */
170 movl $MSR_EFER, %ecx
171 rdmsr
172 btsl $_EFER_SCE, %eax /* Enable System Call */
173 btl $20,%edi /* No Execute supported? */
174 jnc 1f
175 btsl $_EFER_NX, %eax
1761: wrmsr /* Make changes effective */
177
178 /* Setup cr0 */
179#define CR0_PM 1 /* protected mode */
180#define CR0_MP (1<<1)
181#define CR0_ET (1<<4)
182#define CR0_NE (1<<5)
183#define CR0_WP (1<<16)
184#define CR0_AM (1<<18)
185#define CR0_PAGING (1<<31)
186 movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
187 /* Make changes effective */
188 movq %rax, %cr0
189
190 /* Setup a boot time stack */
191 movq init_rsp(%rip),%rsp
192
193 /* zero EFLAGS after setting rsp */
194 pushq $0
195 popfq
196
197 /*
198 * We must switch to a new descriptor in kernel space for the GDT
 199	 * because soon the kernel won't have access anymore to the userspace
 200	 * addresses we're currently running at. We have to do that here
201 * because in 32bit we couldn't load a 64bit linear address.
202 */
203 lgdt cpu_gdt_descr(%rip)
204
205 /* set up data segments. actually 0 would do too */
206 movl $__KERNEL_DS,%eax
207 movl %eax,%ds
208 movl %eax,%ss
209 movl %eax,%es
210
211 /*
212 * We don't really need to load %fs or %gs, but load them anyway
213 * to kill any stale realmode selectors. This allows execution
214 * under VT hardware.
215 */
216 movl %eax,%fs
217 movl %eax,%gs
218
219 /*
 220	 * Set up a dummy PDA. This is just for some early bootup code
 221	 * that does in_interrupt().
222 */
223 movl $MSR_GS_BASE,%ecx
224 movq $empty_zero_page,%rax
225 movq %rax,%rdx
226 shrq $32,%rdx
227 wrmsr
228
229 /* esi is pointer to real mode structure with interesting info.
230 pass it to C */
231 movl %esi, %edi
232
 233	/* Finally jump to run C code and to be on a real kernel address.
 234	 * Since we are running on identity-mapped space we have to jump
 235	 * to the full 64bit address; this is only possible with an indirect
 236	 * jump. In addition we need to ensure %cs is set, so we make this
 237	 * a far return.
238 */
239 movq initial_code(%rip),%rax
240 pushq $0 # fake return address to stop unwinder
241 pushq $__KERNEL_CS # set correct cs
242 pushq %rax # target address in negative space
243 lretq
244
245 /* SMP bootup changes these two */
246#ifndef CONFIG_HOTPLUG_CPU
247 .pushsection .init.data
248#endif
249 .align 8
250 .globl initial_code
251initial_code:
252 .quad x86_64_start_kernel
253#ifndef CONFIG_HOTPLUG_CPU
254 .popsection
255#endif
256 .globl init_rsp
257init_rsp:
258 .quad init_thread_union+THREAD_SIZE-8
259
260bad_address:
261 jmp bad_address
262
263ENTRY(early_idt_handler)
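	/*
	 * Assumes the vector pushed an error code: (%rsp) holds the error
	 * code and 8(%rsp) the faulting RIP, which is what the early_printk
	 * arguments below rely on.
	 */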
264 cmpl $2,early_recursion_flag(%rip)
265 jz 1f
266 incl early_recursion_flag(%rip)
267 xorl %eax,%eax
268 movq 8(%rsp),%rsi # get rip
269 movq (%rsp),%rdx
270 movq %cr2,%rcx
271 leaq early_idt_msg(%rip),%rdi
272 call early_printk
273 cmpl $2,early_recursion_flag(%rip)
274 jz 1f
275 call dump_stack
276#ifdef CONFIG_KALLSYMS
277 leaq early_idt_ripmsg(%rip),%rdi
278 movq 8(%rsp),%rsi # get rip again
279 call __print_symbol
280#endif
2811: hlt
282 jmp 1b
283early_recursion_flag:
284 .long 0
285
286early_idt_msg:
287 .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
288early_idt_ripmsg:
289 .asciz "RIP %s\n"
290
291.balign PAGE_SIZE
292
293#define NEXT_PAGE(name) \
294 .balign PAGE_SIZE; \
295ENTRY(name)
296
297/* Automate the creation of 1 to 1 mapping pmd entries */
298#define PMDS(START, PERM, COUNT) \
299 i = 0 ; \
300 .rept (COUNT) ; \
301 .quad (START) + (i << 21) + (PERM) ; \
302 i = i + 1 ; \
303 .endr
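/*
 * For example (illustrative only), PMDS(0, __PAGE_KERNEL_LARGE_EXEC, 3)
 * emits three .quad entries for 0x0, 0x200000 and 0x400000 OR'ed with the
 * permission bits, i.e. one 2MB PMD entry per iteration.
 */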
304
305 /*
306 * This default setting generates an ident mapping at address 0x100000
307 * and a mapping for the kernel that precisely maps virtual address
308 * 0xffffffff80000000 to physical address 0x000000. (always using
309 * 2Mbyte large pages provided by PAE mode)
310 */
311NEXT_PAGE(init_level4_pgt)
312 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
313 .fill 257,8,0
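	/*
	 * 1 + 257 entries precede this one, so it is PGD slot 258: the
	 * same ident mapping again, presumably for the kernel direct
	 * mapping region.
	 */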
314 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
315 .fill 252,8,0
316 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
317 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
318
319NEXT_PAGE(level3_ident_pgt)
320 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
321 .fill 511,8,0
322
323NEXT_PAGE(level3_kernel_pgt)
324 .fill 510,8,0
325 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
326 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
327 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
328
329NEXT_PAGE(level2_fixmap_pgt)
330 .fill 506,8,0
331 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
332 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
333 .fill 5,8,0
334
335NEXT_PAGE(level1_fixmap_pgt)
336 .fill 512,8,0
337
338NEXT_PAGE(level2_ident_pgt)
339 /* Since I easily can, map the first 1G.
340 * Don't set NX because code runs from these pages.
341 */
342 PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
343
344NEXT_PAGE(level2_kernel_pgt)
345 /* 40MB kernel mapping. The kernel code cannot be bigger than that.
 346	   When you change this, change KERNEL_TEXT_SIZE in page.h too. */
347 /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
348 PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE)
349 /* Module mapping starts here */
350 .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
351
352NEXT_PAGE(level2_spare_pgt)
353 .fill 512,8,0
354
355#undef PMDS
356#undef NEXT_PAGE
357
358 .data
359 .align 16
360 .globl cpu_gdt_descr
361cpu_gdt_descr:
362 .word gdt_end-cpu_gdt_table-1
363gdt:
364 .quad cpu_gdt_table
365#ifdef CONFIG_SMP
366 .rept NR_CPUS-1
367 .word 0
368 .quad 0
369 .endr
370#endif
371
372ENTRY(phys_base)
373 /* This must match the first entry in level2_kernel_pgt */
374 .quad 0x0000000000000000
375
376/* We need valid kernel segments for data and code in long mode too
377 * IRET will check the segment types kkeil 2000/10/28
378 * Also sysret mandates a special GDT layout
379 */
380
381 .section .data.page_aligned, "aw"
382 .align PAGE_SIZE
383
384/* The TLS descriptors are currently at a different place compared to i386.
385 Hopefully nobody expects them at a fixed place (Wine?) */
386
387ENTRY(cpu_gdt_table)
388 .quad 0x0000000000000000 /* NULL descriptor */
389 .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
390 .quad 0x00af9b000000ffff /* __KERNEL_CS */
391 .quad 0x00cf93000000ffff /* __KERNEL_DS */
392 .quad 0x00cffb000000ffff /* __USER32_CS */
393 .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
394 .quad 0x00affb000000ffff /* __USER_CS */
395 .quad 0x0 /* unused */
396 .quad 0,0 /* TSS */
397 .quad 0,0 /* LDT */
398 .quad 0,0,0 /* three TLS descriptors */
399 .quad 0x0000f40000000000 /* node/CPU stored in limit */
400gdt_end:
401 /* asm/segment.h:GDT_ENTRIES must match this */
402 /* This should be a multiple of the cache line size */
403 /* GDTs of other CPUs are now dynamically allocated */
404
405 /* zero the remaining page */
406 .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
407
408 .section .bss, "aw", @nobits
409 .align L1_CACHE_BYTES
410ENTRY(idt_table)
411 .skip 256 * 16
412
413 .section .bss.page_aligned, "aw", @nobits
414 .align PAGE_SIZE
415ENTRY(empty_zero_page)
416 .skip PAGE_SIZE
diff --git a/arch/x86/kernel/hpet_32.c b/arch/x86/kernel/hpet_32.c
new file mode 100644
index 000000000000..533d4932bc79
--- /dev/null
+++ b/arch/x86/kernel/hpet_32.c
@@ -0,0 +1,553 @@
1#include <linux/clocksource.h>
2#include <linux/clockchips.h>
3#include <linux/errno.h>
4#include <linux/hpet.h>
5#include <linux/init.h>
6#include <linux/sysdev.h>
7#include <linux/pm.h>
8#include <linux/delay.h>
9
10#include <asm/hpet.h>
11#include <asm/io.h>
12
13extern struct clock_event_device *global_clock_event;
14
15#define HPET_MASK CLOCKSOURCE_MASK(32)
16#define HPET_SHIFT 22
17
18/* FSEC = 10^-15 NSEC = 10^-9 */
19#define FSEC_PER_NSEC 1000000
20
21/*
22 * HPET address is set in acpi/boot.c, when an ACPI entry exists
23 */
24unsigned long hpet_address;
25static void __iomem * hpet_virt_address;
26
27static inline unsigned long hpet_readl(unsigned long a)
28{
29 return readl(hpet_virt_address + a);
30}
31
32static inline void hpet_writel(unsigned long d, unsigned long a)
33{
34 writel(d, hpet_virt_address + a);
35}
36
37/*
38 * HPET command line enable / disable
39 */
40static int boot_hpet_disable;
41
42static int __init hpet_setup(char* str)
43{
44 if (str) {
45 if (!strncmp("disable", str, 7))
46 boot_hpet_disable = 1;
47 }
48 return 1;
49}
50__setup("hpet=", hpet_setup);
51
52static inline int is_hpet_capable(void)
53{
54 return (!boot_hpet_disable && hpet_address);
55}
56
57/*
58 * HPET timer interrupt enable / disable
59 */
60static int hpet_legacy_int_enabled;
61
62/**
63 * is_hpet_enabled - check whether the hpet timer interrupt is enabled
64 */
65int is_hpet_enabled(void)
66{
67 return is_hpet_capable() && hpet_legacy_int_enabled;
68}
69
70/*
71 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
72 * timer 0 and timer 1 in case of RTC emulation.
73 */
74#ifdef CONFIG_HPET
75static void hpet_reserve_platform_timers(unsigned long id)
76{
77 struct hpet __iomem *hpet = hpet_virt_address;
78 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
79 unsigned int nrtimers, i;
80 struct hpet_data hd;
81
82 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
83
84 memset(&hd, 0, sizeof (hd));
85 hd.hd_phys_address = hpet_address;
86 hd.hd_address = hpet_virt_address;
87 hd.hd_nirqs = nrtimers;
88 hd.hd_flags = HPET_DATA_PLATFORM;
89 hpet_reserve_timer(&hd, 0);
90
91#ifdef CONFIG_HPET_EMULATE_RTC
92 hpet_reserve_timer(&hd, 1);
93#endif
94
95 hd.hd_irq[0] = HPET_LEGACY_8254;
96 hd.hd_irq[1] = HPET_LEGACY_RTC;
97
98 for (i = 2; i < nrtimers; timer++, i++)
99 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
100 Tn_INT_ROUTE_CNF_SHIFT;
101
102 hpet_alloc(&hd);
103
104}
105#else
106static void hpet_reserve_platform_timers(unsigned long id) { }
107#endif
108
109/*
110 * Common hpet info
111 */
112static unsigned long hpet_period;
113
114static void hpet_set_mode(enum clock_event_mode mode,
115 struct clock_event_device *evt);
116static int hpet_next_event(unsigned long delta,
117 struct clock_event_device *evt);
118
119/*
120 * The hpet clock event device
121 */
122static struct clock_event_device hpet_clockevent = {
123 .name = "hpet",
124 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
125 .set_mode = hpet_set_mode,
126 .set_next_event = hpet_next_event,
127 .shift = 32,
128 .irq = 0,
129};
130
131static void hpet_start_counter(void)
132{
133 unsigned long cfg = hpet_readl(HPET_CFG);
134
135 cfg &= ~HPET_CFG_ENABLE;
136 hpet_writel(cfg, HPET_CFG);
137 hpet_writel(0, HPET_COUNTER);
138 hpet_writel(0, HPET_COUNTER + 4);
139 cfg |= HPET_CFG_ENABLE;
140 hpet_writel(cfg, HPET_CFG);
141}
142
143static void hpet_enable_int(void)
144{
145 unsigned long cfg = hpet_readl(HPET_CFG);
146
147 cfg |= HPET_CFG_LEGACY;
148 hpet_writel(cfg, HPET_CFG);
149 hpet_legacy_int_enabled = 1;
150}
151
152static void hpet_set_mode(enum clock_event_mode mode,
153 struct clock_event_device *evt)
154{
155 unsigned long cfg, cmp, now;
156 uint64_t delta;
157
158 switch(mode) {
159 case CLOCK_EVT_MODE_PERIODIC:
160 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
161 delta >>= hpet_clockevent.shift;
162 now = hpet_readl(HPET_COUNTER);
163 cmp = now + (unsigned long) delta;
164 cfg = hpet_readl(HPET_T0_CFG);
165 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
166 HPET_TN_SETVAL | HPET_TN_32BIT;
167 hpet_writel(cfg, HPET_T0_CFG);
168 /*
169 * The first write after writing TN_SETVAL to the
170 * config register sets the counter value, the second
171 * write sets the period.
172 */
173 hpet_writel(cmp, HPET_T0_CMP);
174 udelay(1);
175 hpet_writel((unsigned long) delta, HPET_T0_CMP);
176 break;
177
178 case CLOCK_EVT_MODE_ONESHOT:
179 cfg = hpet_readl(HPET_T0_CFG);
180 cfg &= ~HPET_TN_PERIODIC;
181 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
182 hpet_writel(cfg, HPET_T0_CFG);
183 break;
184
185 case CLOCK_EVT_MODE_UNUSED:
186 case CLOCK_EVT_MODE_SHUTDOWN:
187 cfg = hpet_readl(HPET_T0_CFG);
188 cfg &= ~HPET_TN_ENABLE;
189 hpet_writel(cfg, HPET_T0_CFG);
190 break;
191
192 case CLOCK_EVT_MODE_RESUME:
193 hpet_enable_int();
194 break;
195 }
196}
197
198static int hpet_next_event(unsigned long delta,
199 struct clock_event_device *evt)
200{
201 unsigned long cnt;
202
203 cnt = hpet_readl(HPET_COUNTER);
204 cnt += delta;
205 hpet_writel(cnt, HPET_T0_CMP);
206
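	/*
	 * If the counter has already raced past the new comparator value,
	 * report -ETIME so the clockevents layer can retry.
	 */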
207 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0;
208}
209
210/*
211 * Clock source related code
212 */
213static cycle_t read_hpet(void)
214{
215 return (cycle_t)hpet_readl(HPET_COUNTER);
216}
217
218static struct clocksource clocksource_hpet = {
219 .name = "hpet",
220 .rating = 250,
221 .read = read_hpet,
222 .mask = HPET_MASK,
223 .shift = HPET_SHIFT,
224 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
225 .resume = hpet_start_counter,
226};
227
228/*
229 * Try to setup the HPET timer
230 */
231int __init hpet_enable(void)
232{
233 unsigned long id;
234 uint64_t hpet_freq;
235 u64 tmp, start, now;
236 cycle_t t1;
237
238 if (!is_hpet_capable())
239 return 0;
240
241 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
242
243 /*
244 * Read the period and check for a sane value:
245 */
246 hpet_period = hpet_readl(HPET_PERIOD);
247 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
248 goto out_nohpet;
249
250 /*
 251	 * The period is a femtoseconds value. We need to calculate the
252 * scaled math multiplication factor for nanosecond to hpet tick
253 * conversion.
254 */
255 hpet_freq = 1000000000000000ULL;
256 do_div(hpet_freq, hpet_period);
257 hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
258 NSEC_PER_SEC, 32);
259 /* Calculate the min / max delta */
260 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
261 &hpet_clockevent);
262 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
263 &hpet_clockevent);
264
265 /*
266 * Read the HPET ID register to retrieve the IRQ routing
267 * information and the number of channels
268 */
269 id = hpet_readl(HPET_ID);
270
271#ifdef CONFIG_HPET_EMULATE_RTC
272 /*
273 * The legacy routing mode needs at least two channels, tick timer
274 * and the rtc emulation channel.
275 */
276 if (!(id & HPET_ID_NUMBER))
277 goto out_nohpet;
278#endif
279
280 /* Start the counter */
281 hpet_start_counter();
282
283 /* Verify whether hpet counter works */
284 t1 = read_hpet();
285 rdtscll(start);
286
287 /*
288 * We don't know the TSC frequency yet, but waiting for
289 * 200000 TSC cycles is safe:
290 * 4 GHz == 50us
291 * 1 GHz == 200us
292 */
293 do {
294 rep_nop();
295 rdtscll(now);
296 } while ((now - start) < 200000UL);
297
298 if (t1 == read_hpet()) {
299 printk(KERN_WARNING
300 "HPET counter not counting. HPET disabled\n");
301 goto out_nohpet;
302 }
303
304 /* Initialize and register HPET clocksource
305 *
 306	 * hpet period is in femtoseconds per cycle,
 307	 * so we need to convert this to ns/cyc units
 308	 * approximated by mult/2^shift
309 *
310 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
311 * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
312 * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
313 * (fsec/cyc << shift)/1000000 = mult
314 * (hpet_period << shift)/FSEC_PER_NSEC = mult
315 */
316 tmp = (u64)hpet_period << HPET_SHIFT;
317 do_div(tmp, FSEC_PER_NSEC);
318 clocksource_hpet.mult = (u32)tmp;
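	/*
	 * Illustrative example only: a common 14.31818 MHz HPET has a
	 * period of roughly 69841279 fs, so mult ends up around
	 * (69841279 << 22) / 1000000 ~= 2.93e8, i.e. ~69.84 ns per
	 * HPET cycle once shifted back down by 2^22.
	 */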
319
320 clocksource_register(&clocksource_hpet);
321
322 if (id & HPET_ID_LEGSUP) {
323 hpet_enable_int();
324 hpet_reserve_platform_timers(id);
325 /*
326 * Start hpet with the boot cpu mask and make it
327 * global after the IO_APIC has been initialized.
328 */
329 hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
330 clockevents_register_device(&hpet_clockevent);
331 global_clock_event = &hpet_clockevent;
332 return 1;
333 }
334 return 0;
335
336out_nohpet:
337 iounmap(hpet_virt_address);
338 hpet_virt_address = NULL;
339 boot_hpet_disable = 1;
340 return 0;
341}
342
343
344#ifdef CONFIG_HPET_EMULATE_RTC
345
346/* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
347 * is enabled, we support RTC interrupt functionality in software.
348 * RTC has 3 kinds of interrupts:
349 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
350 * is updated
351 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
352 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
353 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
354 * (1) and (2) above are implemented using polling at a frequency of
355 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
356 * overhead. (DEFAULT_RTC_INT_FREQ)
357 * For (3), we use interrupts at 64Hz or user specified periodic
358 * frequency, whichever is higher.
359 */
360#include <linux/mc146818rtc.h>
361#include <linux/rtc.h>
362
363#define DEFAULT_RTC_INT_FREQ 64
364#define DEFAULT_RTC_SHIFT 6
365#define RTC_NUM_INTS 1
366
367static unsigned long hpet_rtc_flags;
368static unsigned long hpet_prev_update_sec;
369static struct rtc_time hpet_alarm_time;
370static unsigned long hpet_pie_count;
371static unsigned long hpet_t1_cmp;
372static unsigned long hpet_default_delta;
373static unsigned long hpet_pie_delta;
374static unsigned long hpet_pie_limit;
375
376/*
377 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
378 * is not supported by all HPET implementations for timer 1.
379 *
380 * hpet_rtc_timer_init() is called when the rtc is initialized.
381 */
382int hpet_rtc_timer_init(void)
383{
384 unsigned long cfg, cnt, delta, flags;
385
386 if (!is_hpet_enabled())
387 return 0;
388
389 if (!hpet_default_delta) {
390 uint64_t clc;
391
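		/*
		 * mult/2^shift converts nanoseconds to HPET ticks, so
		 * (mult * NSEC_PER_SEC) >> shift is ticks per second and
		 * the extra DEFAULT_RTC_SHIFT (6) divides that by 64.
		 */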
392 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
393 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
394 hpet_default_delta = (unsigned long) clc;
395 }
396
397 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
398 delta = hpet_default_delta;
399 else
400 delta = hpet_pie_delta;
401
402 local_irq_save(flags);
403
404 cnt = delta + hpet_readl(HPET_COUNTER);
405 hpet_writel(cnt, HPET_T1_CMP);
406 hpet_t1_cmp = cnt;
407
408 cfg = hpet_readl(HPET_T1_CFG);
409 cfg &= ~HPET_TN_PERIODIC;
410 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
411 hpet_writel(cfg, HPET_T1_CFG);
412
413 local_irq_restore(flags);
414
415 return 1;
416}
417
418/*
419 * The functions below are called from rtc driver.
420 * Return 0 if HPET is not being used.
421 * Otherwise do the necessary changes and return 1.
422 */
423int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
424{
425 if (!is_hpet_enabled())
426 return 0;
427
428 hpet_rtc_flags &= ~bit_mask;
429 return 1;
430}
431
432int hpet_set_rtc_irq_bit(unsigned long bit_mask)
433{
434 unsigned long oldbits = hpet_rtc_flags;
435
436 if (!is_hpet_enabled())
437 return 0;
438
439 hpet_rtc_flags |= bit_mask;
440
441 if (!oldbits)
442 hpet_rtc_timer_init();
443
444 return 1;
445}
446
447int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
448 unsigned char sec)
449{
450 if (!is_hpet_enabled())
451 return 0;
452
453 hpet_alarm_time.tm_hour = hrs;
454 hpet_alarm_time.tm_min = min;
455 hpet_alarm_time.tm_sec = sec;
456
457 return 1;
458}
459
460int hpet_set_periodic_freq(unsigned long freq)
461{
462 uint64_t clc;
463
464 if (!is_hpet_enabled())
465 return 0;
466
467 if (freq <= DEFAULT_RTC_INT_FREQ)
468 hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
469 else {
470 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
471 do_div(clc, freq);
472 clc >>= hpet_clockevent.shift;
473 hpet_pie_delta = (unsigned long) clc;
474 }
475 return 1;
476}
477
478int hpet_rtc_dropped_irq(void)
479{
480 return is_hpet_enabled();
481}
482
483static void hpet_rtc_timer_reinit(void)
484{
485 unsigned long cfg, delta;
486 int lost_ints = -1;
487
488 if (unlikely(!hpet_rtc_flags)) {
489 cfg = hpet_readl(HPET_T1_CFG);
490 cfg &= ~HPET_TN_ENABLE;
491 hpet_writel(cfg, HPET_T1_CFG);
492 return;
493 }
494
495 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
496 delta = hpet_default_delta;
497 else
498 delta = hpet_pie_delta;
499
500 /*
501 * Increment the comparator value until we are ahead of the
502 * current count.
503 */
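	/*
	 * lost_ints starts at -1 so that the mandatory first pass through
	 * this loop is not counted as a lost interrupt.
	 */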
504 do {
505 hpet_t1_cmp += delta;
506 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
507 lost_ints++;
508 } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
509
510 if (lost_ints) {
511 if (hpet_rtc_flags & RTC_PIE)
512 hpet_pie_count += lost_ints;
513 if (printk_ratelimit())
514 printk(KERN_WARNING "rtc: lost %d interrupts\n",
515 lost_ints);
516 }
517}
518
519irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
520{
521 struct rtc_time curr_time;
522 unsigned long rtc_int_flag = 0;
523
524 hpet_rtc_timer_reinit();
525
526 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
527 rtc_get_rtc_time(&curr_time);
528
529 if (hpet_rtc_flags & RTC_UIE &&
530 curr_time.tm_sec != hpet_prev_update_sec) {
531 rtc_int_flag = RTC_UF;
532 hpet_prev_update_sec = curr_time.tm_sec;
533 }
534
535 if (hpet_rtc_flags & RTC_PIE &&
536 ++hpet_pie_count >= hpet_pie_limit) {
537 rtc_int_flag |= RTC_PF;
538 hpet_pie_count = 0;
539 }
540
541 if (hpet_rtc_flags & RTC_PIE &&
542 (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
543 (curr_time.tm_min == hpet_alarm_time.tm_min) &&
544 (curr_time.tm_hour == hpet_alarm_time.tm_hour))
545 rtc_int_flag |= RTC_AF;
546
547 if (rtc_int_flag) {
548 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
549 rtc_interrupt(rtc_int_flag, dev_id);
550 }
551 return IRQ_HANDLED;
552}
553#endif
diff --git a/arch/x86/kernel/hpet_64.c b/arch/x86/kernel/hpet_64.c
new file mode 100644
index 000000000000..e2d1b912e154
--- /dev/null
+++ b/arch/x86/kernel/hpet_64.c
@@ -0,0 +1,493 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/mc146818rtc.h>
5#include <linux/time.h>
6#include <linux/clocksource.h>
7#include <linux/ioport.h>
8#include <linux/acpi.h>
9#include <linux/hpet.h>
10#include <asm/pgtable.h>
11#include <asm/vsyscall.h>
12#include <asm/timex.h>
13#include <asm/hpet.h>
14
15#define HPET_MASK 0xFFFFFFFF
16#define HPET_SHIFT 22
17
18/* FSEC = 10^-15 NSEC = 10^-9 */
19#define FSEC_PER_NSEC 1000000
20
21int nohpet __initdata;
22
23unsigned long hpet_address;
24unsigned long hpet_period; /* fsecs / HPET clock */
25unsigned long hpet_tick; /* HPET clocks / interrupt */
26
27int hpet_use_timer; /* Use counter of hpet for time keeping,
28 * otherwise PIT
29 */
30
31#ifdef CONFIG_HPET
32static __init int late_hpet_init(void)
33{
34 struct hpet_data hd;
35 unsigned int ntimer;
36
37 if (!hpet_address)
38 return 0;
39
40 memset(&hd, 0, sizeof(hd));
41
42 ntimer = hpet_readl(HPET_ID);
43 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
44 ntimer++;
45
46 /*
47 * Register with driver.
 48	 * Timer0 and Timer1 are used by the platform.
49 */
50 hd.hd_phys_address = hpet_address;
51 hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
52 hd.hd_nirqs = ntimer;
53 hd.hd_flags = HPET_DATA_PLATFORM;
54 hpet_reserve_timer(&hd, 0);
55#ifdef CONFIG_HPET_EMULATE_RTC
56 hpet_reserve_timer(&hd, 1);
57#endif
58 hd.hd_irq[0] = HPET_LEGACY_8254;
59 hd.hd_irq[1] = HPET_LEGACY_RTC;
60 if (ntimer > 2) {
61 struct hpet *hpet;
62 struct hpet_timer *timer;
63 int i;
64
65 hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
66 timer = &hpet->hpet_timers[2];
67 for (i = 2; i < ntimer; timer++, i++)
68 hd.hd_irq[i] = (timer->hpet_config &
69 Tn_INT_ROUTE_CNF_MASK) >>
70 Tn_INT_ROUTE_CNF_SHIFT;
71
72 }
73
74 hpet_alloc(&hd);
75 return 0;
76}
77fs_initcall(late_hpet_init);
78#endif
79
80int hpet_timer_stop_set_go(unsigned long tick)
81{
82 unsigned int cfg;
83
84/*
85 * Stop the timers and reset the main counter.
86 */
87
88 cfg = hpet_readl(HPET_CFG);
89 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
90 hpet_writel(cfg, HPET_CFG);
91 hpet_writel(0, HPET_COUNTER);
92 hpet_writel(0, HPET_COUNTER + 4);
93
94/*
95 * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
96 * and period also hpet_tick.
97 */
98 if (hpet_use_timer) {
99 hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
100 HPET_TN_32BIT, HPET_T0_CFG);
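		/*
		 * With HPET_TN_SETVAL set above, the first T0_CMP write
		 * programs the comparator and the second one the period.
		 */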
101 hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
102 hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
103 cfg |= HPET_CFG_LEGACY;
104 }
105/*
106 * Go!
107 */
108
109 cfg |= HPET_CFG_ENABLE;
110 hpet_writel(cfg, HPET_CFG);
111
112 return 0;
113}
114
115static cycle_t read_hpet(void)
116{
117 return (cycle_t)hpet_readl(HPET_COUNTER);
118}
119
120static cycle_t __vsyscall_fn vread_hpet(void)
121{
122 return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
123}
124
125struct clocksource clocksource_hpet = {
126 .name = "hpet",
127 .rating = 250,
128 .read = read_hpet,
129 .mask = (cycle_t)HPET_MASK,
130 .mult = 0, /* set below */
131 .shift = HPET_SHIFT,
132 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
133 .vread = vread_hpet,
134};
135
136int __init hpet_arch_init(void)
137{
138 unsigned int id;
139 u64 tmp;
140
141 if (!hpet_address)
142 return -1;
143 set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
144 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
145
146/*
147 * Read the period, compute tick and quotient.
148 */
149
150 id = hpet_readl(HPET_ID);
151
152 if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
153 return -1;
154
155 hpet_period = hpet_readl(HPET_PERIOD);
156 if (hpet_period < 100000 || hpet_period > 100000000)
157 return -1;
158
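	/*
	 * Rounded division: femtoseconds per kernel tick over
	 * femtoseconds per HPET cycle gives HPET cycles per interrupt.
	 */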
159 hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
160
161 hpet_use_timer = (id & HPET_ID_LEGSUP);
162
163 /*
 164	 * hpet period is in femtoseconds per cycle,
 165	 * so we need to convert this to ns/cyc units
 166	 * approximated by mult/2^shift
167 *
168 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
169 * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
170 * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
171 * (fsec/cyc << shift)/1000000 = mult
172 * (hpet_period << shift)/FSEC_PER_NSEC = mult
173 */
174 tmp = (u64)hpet_period << HPET_SHIFT;
175 do_div(tmp, FSEC_PER_NSEC);
176 clocksource_hpet.mult = (u32)tmp;
177 clocksource_register(&clocksource_hpet);
178
179 return hpet_timer_stop_set_go(hpet_tick);
180}
181
182int hpet_reenable(void)
183{
184 return hpet_timer_stop_set_go(hpet_tick);
185}
186
187/*
188 * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
189 * it to the HPET timer of known frequency.
190 */
191
192#define TICK_COUNT 100000000
193#define SMI_THRESHOLD 50000
194#define MAX_TRIES 5
195
196/*
197 * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
198 * occurs between the reads of the hpet & TSC.
199 */
200static void __init read_hpet_tsc(int *hpet, int *tsc)
201{
202 int tsc1, tsc2, hpet1, i;
203
204 for (i = 0; i < MAX_TRIES; i++) {
205 tsc1 = get_cycles_sync();
206 hpet1 = hpet_readl(HPET_COUNTER);
207 tsc2 = get_cycles_sync();
208 if ((tsc2 - tsc1) < SMI_THRESHOLD)
209 break;
210 }
211 *hpet = hpet1;
212 *tsc = tsc2;
213}
214
215unsigned int __init hpet_calibrate_tsc(void)
216{
217 int tsc_start, hpet_start;
218 int tsc_now, hpet_now;
219 unsigned long flags;
220
221 local_irq_save(flags);
222
223 read_hpet_tsc(&hpet_start, &tsc_start);
224
225 do {
226 local_irq_disable();
227 read_hpet_tsc(&hpet_now, &tsc_now);
228 local_irq_restore(flags);
229 } while ((tsc_now - tsc_start) < TICK_COUNT &&
230 (hpet_now - hpet_start) < TICK_COUNT);
231
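	/*
	 * TSC ticks * 10^9 divided by the elapsed time in picoseconds
	 * (HPET ticks * hpet_period fs / 1000) gives the TSC frequency
	 * in kHz.
	 */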
232 return (tsc_now - tsc_start) * 1000000000L
233 / ((hpet_now - hpet_start) * hpet_period / 1000);
234}
235
236#ifdef CONFIG_HPET_EMULATE_RTC
237/* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
238 * is enabled, we support RTC interrupt functionality in software.
239 * RTC has 3 kinds of interrupts:
240 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
241 * is updated
242 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
243 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
244 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
245 * (1) and (2) above are implemented using polling at a frequency of
246 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
247 * overhead. (DEFAULT_RTC_INT_FREQ)
248 * For (3), we use interrupts at 64Hz or user specified periodic
249 * frequency, whichever is higher.
250 */
251#include <linux/rtc.h>
252
253#define DEFAULT_RTC_INT_FREQ 64
254#define RTC_NUM_INTS 1
255
256static unsigned long UIE_on;
257static unsigned long prev_update_sec;
258
259static unsigned long AIE_on;
260static struct rtc_time alarm_time;
261
262static unsigned long PIE_on;
263static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
264static unsigned long PIE_count;
265
266static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
267static unsigned int hpet_t1_cmp; /* cached comparator register */
268
269int is_hpet_enabled(void)
270{
271 return hpet_address != 0;
272}
273
274/*
 275 * Timer 1 is used for RTC emulation; we do not use the periodic interrupt
 276 * feature, even if HPET supports periodic interrupts on Timer 1.
 277 * The reason is that setting up a periodic interrupt in HPET requires stopping
 278 * the main counter, and doing that every time someone disables or enables the
 279 * RTC would adversely affect the main kernel timer running on Timer 0.
 280 * So, for the time being, simulate the periodic interrupt in software.
 281 *
 282 * hpet_rtc_timer_init() is called for the first time, and during subsequent
 283 * interrupts reinit happens through hpet_rtc_timer_reinit().
284 */
285int hpet_rtc_timer_init(void)
286{
287 unsigned int cfg, cnt;
288 unsigned long flags;
289
290 if (!is_hpet_enabled())
291 return 0;
292 /*
 293	 * Set up counter 1 and enable the interrupts.
294 */
295 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
296 hpet_rtc_int_freq = PIE_freq;
297 else
298 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
299
300 local_irq_save(flags);
301
302 cnt = hpet_readl(HPET_COUNTER);
303 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
304 hpet_writel(cnt, HPET_T1_CMP);
305 hpet_t1_cmp = cnt;
306
307 cfg = hpet_readl(HPET_T1_CFG);
308 cfg &= ~HPET_TN_PERIODIC;
309 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
310 hpet_writel(cfg, HPET_T1_CFG);
311
312 local_irq_restore(flags);
313
314 return 1;
315}
316
317static void hpet_rtc_timer_reinit(void)
318{
319 unsigned int cfg, cnt, ticks_per_int, lost_ints;
320
321 if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
322 cfg = hpet_readl(HPET_T1_CFG);
323 cfg &= ~HPET_TN_ENABLE;
324 hpet_writel(cfg, HPET_T1_CFG);
325 return;
326 }
327
328 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
329 hpet_rtc_int_freq = PIE_freq;
330 else
331 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
332
 333	/* It is more accurate to use the comparator value than the current count. */
334 ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
335 hpet_t1_cmp += ticks_per_int;
336 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
337
338 /*
339 * If the interrupt handler was delayed too long, the write above tries
340 * to schedule the next interrupt in the past and the hardware would
341 * not interrupt until the counter had wrapped around.
342 * So we have to check that the comparator wasn't set to a past time.
343 */
344 cnt = hpet_readl(HPET_COUNTER);
345 if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
346 lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
347 /* Make sure that, even with the time needed to execute
 348		 * this code, the next scheduled interrupt has been pushed
 349		 * into the future: */
350 lost_ints++;
351
352 hpet_t1_cmp += lost_ints * ticks_per_int;
353 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
354
355 if (PIE_on)
356 PIE_count += lost_ints;
357
358 if (printk_ratelimit())
359 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
360 hpet_rtc_int_freq);
361 }
362}
363
364/*
365 * The functions below are called from rtc driver.
366 * Return 0 if HPET is not being used.
367 * Otherwise do the necessary changes and return 1.
368 */
369int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
370{
371 if (!is_hpet_enabled())
372 return 0;
373
374 if (bit_mask & RTC_UIE)
375 UIE_on = 0;
376 if (bit_mask & RTC_PIE)
377 PIE_on = 0;
378 if (bit_mask & RTC_AIE)
379 AIE_on = 0;
380
381 return 1;
382}
383
384int hpet_set_rtc_irq_bit(unsigned long bit_mask)
385{
386 int timer_init_reqd = 0;
387
388 if (!is_hpet_enabled())
389 return 0;
390
391 if (!(PIE_on | AIE_on | UIE_on))
392 timer_init_reqd = 1;
393
394 if (bit_mask & RTC_UIE) {
395 UIE_on = 1;
396 }
397 if (bit_mask & RTC_PIE) {
398 PIE_on = 1;
399 PIE_count = 0;
400 }
401 if (bit_mask & RTC_AIE) {
402 AIE_on = 1;
403 }
404
405 if (timer_init_reqd)
406 hpet_rtc_timer_init();
407
408 return 1;
409}
410
411int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
412{
413 if (!is_hpet_enabled())
414 return 0;
415
416 alarm_time.tm_hour = hrs;
417 alarm_time.tm_min = min;
418 alarm_time.tm_sec = sec;
419
420 return 1;
421}
422
423int hpet_set_periodic_freq(unsigned long freq)
424{
425 if (!is_hpet_enabled())
426 return 0;
427
428 PIE_freq = freq;
429 PIE_count = 0;
430
431 return 1;
432}
433
434int hpet_rtc_dropped_irq(void)
435{
436 if (!is_hpet_enabled())
437 return 0;
438
439 return 1;
440}
441
442irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
443{
444 struct rtc_time curr_time;
445 unsigned long rtc_int_flag = 0;
446 int call_rtc_interrupt = 0;
447
448 hpet_rtc_timer_reinit();
449
450 if (UIE_on | AIE_on) {
451 rtc_get_rtc_time(&curr_time);
452 }
453 if (UIE_on) {
454 if (curr_time.tm_sec != prev_update_sec) {
455 /* Set update int info, call real rtc int routine */
456 call_rtc_interrupt = 1;
457 rtc_int_flag = RTC_UF;
458 prev_update_sec = curr_time.tm_sec;
459 }
460 }
461 if (PIE_on) {
462 PIE_count++;
463 if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
464 /* Set periodic int info, call real rtc int routine */
465 call_rtc_interrupt = 1;
466 rtc_int_flag |= RTC_PF;
467 PIE_count = 0;
468 }
469 }
470 if (AIE_on) {
471 if ((curr_time.tm_sec == alarm_time.tm_sec) &&
472 (curr_time.tm_min == alarm_time.tm_min) &&
473 (curr_time.tm_hour == alarm_time.tm_hour)) {
474 /* Set alarm int info, call real rtc int routine */
475 call_rtc_interrupt = 1;
476 rtc_int_flag |= RTC_AF;
477 }
478 }
479 if (call_rtc_interrupt) {
480 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
481 rtc_interrupt(rtc_int_flag, dev_id);
482 }
483 return IRQ_HANDLED;
484}
485#endif
486
487static int __init nohpet_setup(char *s)
488{
489 nohpet = 1;
490 return 1;
491}
492
493__setup("nohpet", nohpet_setup);
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
new file mode 100644
index 000000000000..e3d4b73bfdb0
--- /dev/null
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -0,0 +1,30 @@
1#include <linux/module.h>
2#include <asm/checksum.h>
3#include <asm/desc.h>
4
5EXPORT_SYMBOL(__down_failed);
6EXPORT_SYMBOL(__down_failed_interruptible);
7EXPORT_SYMBOL(__down_failed_trylock);
8EXPORT_SYMBOL(__up_wakeup);
9/* Networking helper routines. */
10EXPORT_SYMBOL(csum_partial_copy_generic);
11
12EXPORT_SYMBOL(__get_user_1);
13EXPORT_SYMBOL(__get_user_2);
14EXPORT_SYMBOL(__get_user_4);
15
16EXPORT_SYMBOL(__put_user_1);
17EXPORT_SYMBOL(__put_user_2);
18EXPORT_SYMBOL(__put_user_4);
19EXPORT_SYMBOL(__put_user_8);
20
21EXPORT_SYMBOL(strstr);
22
23#ifdef CONFIG_SMP
24extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
25extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
26EXPORT_SYMBOL(__write_lock_failed);
27EXPORT_SYMBOL(__read_lock_failed);
28#endif
29
30EXPORT_SYMBOL(csum_partial);
diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c
new file mode 100644
index 000000000000..665847281ed2
--- /dev/null
+++ b/arch/x86/kernel/i387_32.c
@@ -0,0 +1,546 @@
1/*
2 * linux/arch/i386/kernel/i387.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * General FPU state handling cleanups
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 */
10
11#include <linux/sched.h>
12#include <linux/module.h>
13#include <asm/processor.h>
14#include <asm/i387.h>
15#include <asm/math_emu.h>
16#include <asm/sigcontext.h>
17#include <asm/user.h>
18#include <asm/ptrace.h>
19#include <asm/uaccess.h>
20
21#ifdef CONFIG_MATH_EMULATION
22#define HAVE_HWFP (boot_cpu_data.hard_math)
23#else
24#define HAVE_HWFP 1
25#endif
26
27static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
28
29void mxcsr_feature_mask_init(void)
30{
31 unsigned long mask = 0;
32 clts();
33 if (cpu_has_fxsr) {
34 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
35 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
36 mask = current->thread.i387.fxsave.mxcsr_mask;
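		/*
		 * A zero mxcsr_mask means the CPU does not report a mask;
		 * fall back to the architecturally defined default 0xffbf.
		 */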
37 if (mask == 0) mask = 0x0000ffbf;
38 }
39 mxcsr_feature_mask &= mask;
40 stts();
41}
42
43/*
 44 * The _current_ task is using the FPU for the first time,
 45 * so initialize it and set the mxcsr to its default
 46 * value at reset if we support XMM instructions, and then
 47 * remember that the current task has used the FPU.
48 */
49void init_fpu(struct task_struct *tsk)
50{
51 if (cpu_has_fxsr) {
52 memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
53 tsk->thread.i387.fxsave.cwd = 0x37f;
54 if (cpu_has_xmm)
55 tsk->thread.i387.fxsave.mxcsr = 0x1f80;
56 } else {
57 memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
58 tsk->thread.i387.fsave.cwd = 0xffff037fu;
59 tsk->thread.i387.fsave.swd = 0xffff0000u;
60 tsk->thread.i387.fsave.twd = 0xffffffffu;
61 tsk->thread.i387.fsave.fos = 0xffff0000u;
62 }
63 /* only the device not available exception or ptrace can call init_fpu */
64 set_stopped_child_used_math(tsk);
65}
66
67/*
68 * FPU lazy state save handling.
69 */
70
71void kernel_fpu_begin(void)
72{
73 struct thread_info *thread = current_thread_info();
74
75 preempt_disable();
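	/*
	 * If the current task's FPU state is live in the registers, save
	 * it; otherwise just clear TS so kernel FPU use does not fault.
	 */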
76 if (thread->status & TS_USEDFPU) {
77 __save_init_fpu(thread->task);
78 return;
79 }
80 clts();
81}
82EXPORT_SYMBOL_GPL(kernel_fpu_begin);
83
84/*
85 * FPU tag word conversions.
86 */
87
88static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
89{
90 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
91
92 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
93 tmp = ~twd;
94 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
95 /* and move the valid bits to the lower byte. */
96 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
97 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
98 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
99 return tmp;
100}
101
102static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
103{
104 struct _fpxreg *st = NULL;
105 unsigned long tos = (fxsave->swd >> 11) & 7;
106 unsigned long twd = (unsigned long) fxsave->twd;
107 unsigned long tag;
108 unsigned long ret = 0xffff0000u;
109 int i;
110
111#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
112
113 for ( i = 0 ; i < 8 ; i++ ) {
114 if ( twd & 0x1 ) {
115 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
116
117 switch ( st->exponent & 0x7fff ) {
118 case 0x7fff:
119 tag = 2; /* Special */
120 break;
121 case 0x0000:
122 if ( !st->significand[0] &&
123 !st->significand[1] &&
124 !st->significand[2] &&
125 !st->significand[3] ) {
126 tag = 1; /* Zero */
127 } else {
128 tag = 2; /* Special */
129 }
130 break;
131 default:
132 if ( st->significand[3] & 0x8000 ) {
133 tag = 0; /* Valid */
134 } else {
135 tag = 2; /* Special */
136 }
137 break;
138 }
139 } else {
140 tag = 3; /* Empty */
141 }
142 ret |= (tag << (2 * i));
143 twd = twd >> 1;
144 }
145 return ret;
146}
147
148/*
149 * FPU state interaction.
150 */
151
152unsigned short get_fpu_cwd( struct task_struct *tsk )
153{
154 if ( cpu_has_fxsr ) {
155 return tsk->thread.i387.fxsave.cwd;
156 } else {
157 return (unsigned short)tsk->thread.i387.fsave.cwd;
158 }
159}
160
161unsigned short get_fpu_swd( struct task_struct *tsk )
162{
163 if ( cpu_has_fxsr ) {
164 return tsk->thread.i387.fxsave.swd;
165 } else {
166 return (unsigned short)tsk->thread.i387.fsave.swd;
167 }
168}
169
170#if 0
171unsigned short get_fpu_twd( struct task_struct *tsk )
172{
173 if ( cpu_has_fxsr ) {
174 return tsk->thread.i387.fxsave.twd;
175 } else {
176 return (unsigned short)tsk->thread.i387.fsave.twd;
177 }
178}
179#endif /* 0 */
180
181unsigned short get_fpu_mxcsr( struct task_struct *tsk )
182{
183 if ( cpu_has_xmm ) {
184 return tsk->thread.i387.fxsave.mxcsr;
185 } else {
186 return 0x1f80;
187 }
188}
189
190#if 0
191
192void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
193{
194 if ( cpu_has_fxsr ) {
195 tsk->thread.i387.fxsave.cwd = cwd;
196 } else {
197 tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
198 }
199}
200
201void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
202{
203 if ( cpu_has_fxsr ) {
204 tsk->thread.i387.fxsave.swd = swd;
205 } else {
206 tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
207 }
208}
209
210void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
211{
212 if ( cpu_has_fxsr ) {
213 tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
214 } else {
215 tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
216 }
217}
218
219#endif /* 0 */
220
221/*
222 * FXSR floating point environment conversions.
223 */
224
225static int convert_fxsr_to_user( struct _fpstate __user *buf,
226 struct i387_fxsave_struct *fxsave )
227{
228 unsigned long env[7];
229 struct _fpreg __user *to;
230 struct _fpxreg *from;
231 int i;
232
233 env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
234 env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
235 env[2] = twd_fxsr_to_i387(fxsave);
236 env[3] = fxsave->fip;
237 env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
238 env[5] = fxsave->foo;
239 env[6] = fxsave->fos;
240
241 if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
242 return 1;
243
244 to = &buf->_st[0];
245 from = (struct _fpxreg *) &fxsave->st_space[0];
246 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
247 unsigned long __user *t = (unsigned long __user *)to;
248 unsigned long *f = (unsigned long *)from;
249
250 if (__put_user(*f, t) ||
251 __put_user(*(f + 1), t + 1) ||
252 __put_user(from->exponent, &to->exponent))
253 return 1;
254 }
255 return 0;
256}
257
258static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
259 struct _fpstate __user *buf )
260{
261 unsigned long env[7];
262 struct _fpxreg *to;
263 struct _fpreg __user *from;
264 int i;
265
266 if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
267 return 1;
268
269 fxsave->cwd = (unsigned short)(env[0] & 0xffff);
270 fxsave->swd = (unsigned short)(env[1] & 0xffff);
271 fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
272 fxsave->fip = env[3];
273 fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
274 fxsave->fcs = (env[4] & 0xffff);
275 fxsave->foo = env[5];
276 fxsave->fos = env[6];
277
278 to = (struct _fpxreg *) &fxsave->st_space[0];
279 from = &buf->_st[0];
280 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
281 unsigned long *t = (unsigned long *)to;
282 unsigned long __user *f = (unsigned long __user *)from;
283
284 if (__get_user(*t, f) ||
285 __get_user(*(t + 1), f + 1) ||
286 __get_user(to->exponent, &from->exponent))
287 return 1;
288 }
289 return 0;
290}
291
292/*
293 * Signal frame handlers.
294 */
295
296static inline int save_i387_fsave( struct _fpstate __user *buf )
297{
298 struct task_struct *tsk = current;
299
300 unlazy_fpu( tsk );
301 tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
302 if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
303 sizeof(struct i387_fsave_struct) ) )
304 return -1;
305 return 1;
306}
307
308static int save_i387_fxsave( struct _fpstate __user *buf )
309{
310 struct task_struct *tsk = current;
311 int err = 0;
312
313 unlazy_fpu( tsk );
314
315 if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
316 return -1;
317
318 err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
319 err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
320 if ( err )
321 return -1;
322
323 if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
324 sizeof(struct i387_fxsave_struct) ) )
325 return -1;
326 return 1;
327}
328
329int save_i387( struct _fpstate __user *buf )
330{
331 if ( !used_math() )
332 return 0;
333
334 /* This will cause a "finit" to be triggered by the next
335 * attempted FPU operation by the 'current' process.
336 */
337 clear_used_math();
338
339 if ( HAVE_HWFP ) {
340 if ( cpu_has_fxsr ) {
341 return save_i387_fxsave( buf );
342 } else {
343 return save_i387_fsave( buf );
344 }
345 } else {
346 return save_i387_soft( &current->thread.i387.soft, buf );
347 }
348}
349
350static inline int restore_i387_fsave( struct _fpstate __user *buf )
351{
352 struct task_struct *tsk = current;
353 clear_fpu( tsk );
354 return __copy_from_user( &tsk->thread.i387.fsave, buf,
355 sizeof(struct i387_fsave_struct) );
356}
357
358static int restore_i387_fxsave( struct _fpstate __user *buf )
359{
360 int err;
361 struct task_struct *tsk = current;
362 clear_fpu( tsk );
363 err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
364 sizeof(struct i387_fxsave_struct) );
365 /* mxcsr reserved bits must be masked to zero for security reasons */
366 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
367 return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
368}
369
370int restore_i387( struct _fpstate __user *buf )
371{
372 int err;
373
374 if ( HAVE_HWFP ) {
375 if ( cpu_has_fxsr ) {
376 err = restore_i387_fxsave( buf );
377 } else {
378 err = restore_i387_fsave( buf );
379 }
380 } else {
381 err = restore_i387_soft( &current->thread.i387.soft, buf );
382 }
383 set_used_math();
384 return err;
385}
386
387/*
388 * ptrace request handlers.
389 */
390
391static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
392 struct task_struct *tsk )
393{
394 return __copy_to_user( buf, &tsk->thread.i387.fsave,
395 sizeof(struct user_i387_struct) );
396}
397
398static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
399 struct task_struct *tsk )
400{
401 return convert_fxsr_to_user( (struct _fpstate __user *)buf,
402 &tsk->thread.i387.fxsave );
403}
404
405int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
406{
407 if ( HAVE_HWFP ) {
408 if ( cpu_has_fxsr ) {
409 return get_fpregs_fxsave( buf, tsk );
410 } else {
411 return get_fpregs_fsave( buf, tsk );
412 }
413 } else {
414 return save_i387_soft( &tsk->thread.i387.soft,
415 (struct _fpstate __user *)buf );
416 }
417}
418
419static inline int set_fpregs_fsave( struct task_struct *tsk,
420 struct user_i387_struct __user *buf )
421{
422 return __copy_from_user( &tsk->thread.i387.fsave, buf,
423 sizeof(struct user_i387_struct) );
424}
425
426static inline int set_fpregs_fxsave( struct task_struct *tsk,
427 struct user_i387_struct __user *buf )
428{
429 return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
430 (struct _fpstate __user *)buf );
431}
432
433int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
434{
435 if ( HAVE_HWFP ) {
436 if ( cpu_has_fxsr ) {
437 return set_fpregs_fxsave( tsk, buf );
438 } else {
439 return set_fpregs_fsave( tsk, buf );
440 }
441 } else {
442 return restore_i387_soft( &tsk->thread.i387.soft,
443 (struct _fpstate __user *)buf );
444 }
445}
446
447int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
448{
449 if ( cpu_has_fxsr ) {
450 if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
451 sizeof(struct user_fxsr_struct) ))
452 return -EFAULT;
453 return 0;
454 } else {
455 return -EIO;
456 }
457}
458
459int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
460{
461 int ret = 0;
462
463 if ( cpu_has_fxsr ) {
464 if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
465 sizeof(struct user_fxsr_struct) ))
466 ret = -EFAULT;
467 /* mxcsr reserved bits must be masked to zero for security reasons */
468 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
469 } else {
470 ret = -EIO;
471 }
472 return ret;
473}
474
475/*
476 * FPU state for core dumps.
477 */
478
479static inline void copy_fpu_fsave( struct task_struct *tsk,
480 struct user_i387_struct *fpu )
481{
482 memcpy( fpu, &tsk->thread.i387.fsave,
483 sizeof(struct user_i387_struct) );
484}
485
486static inline void copy_fpu_fxsave( struct task_struct *tsk,
487 struct user_i387_struct *fpu )
488{
489 unsigned short *to;
490 unsigned short *from;
491 int i;
492
493 memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
494
495 to = (unsigned short *)&fpu->st_space[0];
496 from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
497 for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
498 memcpy( to, from, 5 * sizeof(unsigned short) );
499 }
500}
501
502int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
503{
504 int fpvalid;
505 struct task_struct *tsk = current;
506
507 fpvalid = !!used_math();
508 if ( fpvalid ) {
509 unlazy_fpu( tsk );
510 if ( cpu_has_fxsr ) {
511 copy_fpu_fxsave( tsk, fpu );
512 } else {
513 copy_fpu_fsave( tsk, fpu );
514 }
515 }
516
517 return fpvalid;
518}
519EXPORT_SYMBOL(dump_fpu);
520
521int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
522{
523 int fpvalid = !!tsk_used_math(tsk);
524
525 if (fpvalid) {
526 if (tsk == current)
527 unlazy_fpu(tsk);
528 if (cpu_has_fxsr)
529 copy_fpu_fxsave(tsk, fpu);
530 else
531 copy_fpu_fsave(tsk, fpu);
532 }
533 return fpvalid;
534}
535
536int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
537{
538 int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
539
540 if (fpvalid) {
541 if (tsk == current)
542 unlazy_fpu(tsk);
543 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
544 }
545 return fpvalid;
546}
diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c
new file mode 100644
index 000000000000..1d58c13bc6bc
--- /dev/null
+++ b/arch/x86/kernel/i387_64.c
@@ -0,0 +1,151 @@
1/*
2 * linux/arch/x86_64/kernel/i387.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 * Copyright (C) 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * General FPU state handling cleanups
9 * Gareth Hughes <gareth@valinux.com>, May 2000
10 *
11 * x86-64 rework 2002 Andi Kleen.
12 * Does direct fxsave in and out of user space now for signal handlers.
 13 * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation;
 14 * the 64bit user space sees an FXSAVE frame directly.
15 */
16
17#include <linux/sched.h>
18#include <linux/init.h>
19#include <asm/processor.h>
20#include <asm/i387.h>
21#include <asm/sigcontext.h>
22#include <asm/user.h>
23#include <asm/ptrace.h>
24#include <asm/uaccess.h>
25
26unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff;
27
28void mxcsr_feature_mask_init(void)
29{
30 unsigned int mask;
31 clts();
32 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
33 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
34 mask = current->thread.i387.fxsave.mxcsr_mask;
35 if (mask == 0) mask = 0x0000ffbf;
36 mxcsr_feature_mask &= mask;
37 stts();
38}
39
40/*
41 * Called at bootup to set up the initial FPU state that is later cloned
42 * into all processes.
43 */
44void __cpuinit fpu_init(void)
45{
46 unsigned long oldcr0 = read_cr0();
47 extern void __bad_fxsave_alignment(void);
48
49 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
50 __bad_fxsave_alignment();
51 set_in_cr4(X86_CR4_OSFXSR);
52 set_in_cr4(X86_CR4_OSXMMEXCPT);
53
54 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
55
56 mxcsr_feature_mask_init();
57 /* clean state in init */
58 current_thread_info()->status = 0;
59 clear_used_math();
60}
61
62void init_fpu(struct task_struct *child)
63{
64 if (tsk_used_math(child)) {
65 if (child == current)
66 unlazy_fpu(child);
67 return;
68 }
69 memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
70 child->thread.i387.fxsave.cwd = 0x37f;
71 child->thread.i387.fxsave.mxcsr = 0x1f80;
72 /* only the device not available exception or ptrace can call init_fpu */
73 set_stopped_child_used_math(child);
74}
75
76/*
77 * Signal frame handlers.
78 */
79
80int save_i387(struct _fpstate __user *buf)
81{
82 struct task_struct *tsk = current;
83 int err = 0;
84
85 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
86 sizeof(tsk->thread.i387.fxsave));
87
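	/* fxsave requires a 16-byte aligned save area */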
88 if ((unsigned long)buf % 16)
89 printk("save_i387: bad fpstate %p\n",buf);
90
91 if (!used_math())
92 return 0;
93 clear_used_math(); /* trigger finit */
94 if (task_thread_info(tsk)->status & TS_USEDFPU) {
95 err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
96 if (err) return err;
97 stts();
98 } else {
99 if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
100 sizeof(struct i387_fxsave_struct)))
101 return -1;
102 }
103 return 1;
104}
105
106/*
107 * ptrace request handlers.
108 */
109
110int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
111{
112 init_fpu(tsk);
113 return __copy_to_user(buf, &tsk->thread.i387.fxsave,
114 sizeof(struct user_i387_struct)) ? -EFAULT : 0;
115}
116
117int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
118{
119 if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
120 sizeof(struct user_i387_struct)))
121 return -EFAULT;
122 return 0;
123}
124
125/*
126 * FPU state for core dumps.
127 */
128
129int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
130{
131 struct task_struct *tsk = current;
132
133 if (!used_math())
134 return 0;
135
136 unlazy_fpu(tsk);
137 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
138 return 1;
139}
140
141int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
142{
143 int fpvalid = !!tsk_used_math(tsk);
144
145 if (fpvalid) {
146 if (tsk == current)
147 unlazy_fpu(tsk);
148 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
 149	}
150 return fpvalid;
151}
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
new file mode 100644
index 000000000000..6f508e8d7c57
--- /dev/null
+++ b/arch/x86/kernel/i8237.c
@@ -0,0 +1,72 @@
1/*
2 * i8237.c: 8237A DMA controller suspend functions.
3 *
4 * Written by Pierre Ossman, 2005.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or (at
9 * your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/sysdev.h>
14
15#include <asm/dma.h>
16
17/*
18 * This module just handles suspend/resume issues with the
19 * 8237A DMA controller (used for ISA and LPC).
20 * Allocation is handled in kernel/dma.c and normal usage is
21 * in asm/dma.h.
22 */
23
24static int i8237A_resume(struct sys_device *dev)
25{
26 unsigned long flags;
27 int i;
28
29 flags = claim_dma_lock();
30
31 dma_outb(DMA1_RESET_REG, 0);
32 dma_outb(DMA2_RESET_REG, 0);
33
34 for (i = 0;i < 8;i++) {
35 set_dma_addr(i, 0x000000);
 36		/* DMA count is a bit weird: set_dma_count() programs length-1, so don't use 0 */
37 set_dma_count(i, 1);
38 }
39
40 /* Enable cascade DMA or channel 0-3 won't work */
41 enable_dma(4);
42
43 release_dma_lock(flags);
44
45 return 0;
46}
47
48static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
49{
50 return 0;
51}
52
53static struct sysdev_class i8237_sysdev_class = {
54 set_kset_name("i8237"),
55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume,
57};
58
59static struct sys_device device_i8237A = {
60 .id = 0,
61 .cls = &i8237_sysdev_class,
62};
63
64static int __init i8237A_init_sysfs(void)
65{
66 int error = sysdev_class_register(&i8237_sysdev_class);
67 if (!error)
68 error = sysdev_register(&device_i8237A);
69 return error;
70}
71
72device_initcall(i8237A_init_sysfs);
diff --git a/arch/x86/kernel/i8253_32.c b/arch/x86/kernel/i8253_32.c
new file mode 100644
index 000000000000..6d839f2f1b1a
--- /dev/null
+++ b/arch/x86/kernel/i8253_32.c
@@ -0,0 +1,206 @@
1/*
2 * i8253.c 8253/PIT functions
3 *
4 */
5#include <linux/clockchips.h>
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <linux/jiffies.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11
12#include <asm/smp.h>
13#include <asm/delay.h>
14#include <asm/i8253.h>
15#include <asm/io.h>
16#include <asm/timer.h>
17
18DEFINE_SPINLOCK(i8253_lock);
19EXPORT_SYMBOL(i8253_lock);
20
21/*
 22 * HPET replaces the PIT when enabled, so we need to know which of
 23 * the two timers is used.
24 */
25struct clock_event_device *global_clock_event;
26
27/*
28 * Initialize the PIT timer.
29 *
30 * This is also called after resume to bring the PIT into operation again.
31 */
32static void init_pit_timer(enum clock_event_mode mode,
33 struct clock_event_device *evt)
34{
35 unsigned long flags;
36
37 spin_lock_irqsave(&i8253_lock, flags);
38
39 switch(mode) {
40 case CLOCK_EVT_MODE_PERIODIC:
41 /* binary, mode 2, LSB/MSB, ch 0 */
42 outb_p(0x34, PIT_MODE);
43 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
44 outb(LATCH >> 8 , PIT_CH0); /* MSB */
45 break;
46
47 case CLOCK_EVT_MODE_SHUTDOWN:
48 case CLOCK_EVT_MODE_UNUSED:
49 if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
50 evt->mode == CLOCK_EVT_MODE_ONESHOT) {
51 outb_p(0x30, PIT_MODE);
52 outb_p(0, PIT_CH0);
53 outb_p(0, PIT_CH0);
54 }
55 break;
56
57 case CLOCK_EVT_MODE_ONESHOT:
58 /* One shot setup */
59 outb_p(0x38, PIT_MODE);
60 break;
61
62 case CLOCK_EVT_MODE_RESUME:
63 /* Nothing to do here */
64 break;
65 }
66 spin_unlock_irqrestore(&i8253_lock, flags);
67}
68
69/*
70 * Program the next event in oneshot mode
71 *
72 * Delta is given in PIT ticks
73 */
74static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
75{
76 unsigned long flags;
77
78 spin_lock_irqsave(&i8253_lock, flags);
79 outb_p(delta & 0xff , PIT_CH0); /* LSB */
80 outb(delta >> 8 , PIT_CH0); /* MSB */
81 spin_unlock_irqrestore(&i8253_lock, flags);
82
83 return 0;
84}
85
86/*
87 * On UP the PIT can serve all of the possible timer functions. On SMP systems
88 * it can be solely used for the global tick.
89 *
 90 * The profiling and update capabilities are switched off once the local apic is
91 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
92 * !using_apic_timer decisions in do_timer_interrupt_hook()
93 */
94struct clock_event_device pit_clockevent = {
95 .name = "pit",
96 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
97 .set_mode = init_pit_timer,
98 .set_next_event = pit_next_event,
99 .shift = 32,
100 .irq = 0,
101};
102
103/*
104 * Initialize the conversion factor and the min/max deltas of the clock event
105 * structure and register the clock event source with the framework.
106 */
107void __init setup_pit_timer(void)
108{
109 /*
110 * Start pit with the boot cpu mask and make it global after the
111 * IO_APIC has been initialized.
112 */
113 pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
114 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
115 pit_clockevent.max_delta_ns =
116 clockevent_delta2ns(0x7FFF, &pit_clockevent);
117 pit_clockevent.min_delta_ns =
118 clockevent_delta2ns(0xF, &pit_clockevent);
119 clockevents_register_device(&pit_clockevent);
120 global_clock_event = &pit_clockevent;
121}
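
/*
 * A rough sketch of the fixed point conversion used above, assuming the
 * generic clockevents helpers: with shift = 32,
 *
 *   mult        = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32)
 *               ~ (CLOCK_TICK_RATE << 32) / NSEC_PER_SEC
 *   delta2ns(d) ~ (d << 32) / mult = d * NSEC_PER_SEC / CLOCK_TICK_RATE
 *
 * so 0x7FFF and 0xF are the largest and smallest PIT tick deltas we are
 * willing to program, and delta2ns() expresses them in nanoseconds for
 * the framework.
 */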
122
123/*
124 * Since the PIT overflows every tick, it's not very useful
125 * to just read by itself. So use jiffies to emulate a free
126 * running counter:
127 */
128static cycle_t pit_read(void)
129{
130 unsigned long flags;
131 int count;
132 u32 jifs;
133 static int old_count;
134 static u32 old_jifs;
135
136 spin_lock_irqsave(&i8253_lock, flags);
137 /*
138 * Although our caller may have the read side of xtime_lock,
139 * this is now a seqlock, and we are cheating in this routine
140 * by having side effects on state that we cannot undo if
141 * there is a collision on the seqlock and our caller has to
142 * retry. (Namely, old_jifs and old_count.) So we must treat
143 * jiffies as volatile despite the lock. We read jiffies
144 * before latching the timer count to guarantee that although
145 * the jiffies value might be older than the count (that is,
146 * the counter may underflow between the last point where
147 * jiffies was incremented and the point where we latch the
148 * count), it cannot be newer.
149 */
150 jifs = jiffies;
151 outb_p(0x00, PIT_MODE); /* latch the count ASAP */
152 count = inb_p(PIT_CH0); /* read the latched count */
153 count |= inb_p(PIT_CH0) << 8;
154
155 /* VIA686a test code... reset the latch if count > max + 1 */
156 if (count > LATCH) {
157 outb_p(0x34, PIT_MODE);
158 outb_p(LATCH & 0xff, PIT_CH0);
159 outb(LATCH >> 8, PIT_CH0);
160 count = LATCH - 1;
161 }
162
163 /*
164 * It's possible for count to appear to go the wrong way for a
165 * couple of reasons:
166 *
167 * 1. The timer counter underflows, but we haven't handled the
168 * resulting interrupt and incremented jiffies yet.
169 * 2. Hardware problem with the timer: it does not give us continuous time;
170 * the counter does small "jumps" upwards on some Pentium systems
171 * (see c't 95/10 page 335 for the Neptun bug).
172 *
173 * Previous attempts to handle these cases intelligently were
174 * buggy, so we just do the simple thing now.
175 */
176 if (count > old_count && jifs == old_jifs) {
177 count = old_count;
178 }
179 old_count = count;
180 old_jifs = jifs;
181
182 spin_unlock_irqrestore(&i8253_lock, flags);
183
184 count = (LATCH - 1) - count;
185
186 return (cycle_t)(jifs * LATCH) + count;
187}
188
189static struct clocksource clocksource_pit = {
190 .name = "pit",
191 .rating = 110,
192 .read = pit_read,
193 .mask = CLOCKSOURCE_MASK(32),
194 .mult = 0,
195 .shift = 20,
196};
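
/*
 * Sketch of how the clocksource fields are used, assuming the generic
 * clocksource conventions: ns elapsed = (cycles * mult) >> shift, so
 * with shift = 20 the init code below picks
 *
 *   mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20)
 *        ~ (NSEC_PER_SEC << 20) / CLOCK_TICK_RATE
 *
 * The low rating (110) keeps the PIT as a fallback only; better
 * clocksources (TSC, HPET) register with higher ratings and win when
 * they are usable.
 */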
197
198static int __init init_pit_clocksource(void)
199{
200 if (num_possible_cpus() > 1) /* PIT does not scale! */
201 return 0;
202
203 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
204 return clocksource_register(&clocksource_pit);
205}
206arch_initcall(init_pit_clocksource);
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
new file mode 100644
index 000000000000..0499cbe9871a
--- /dev/null
+++ b/arch/x86/kernel/i8259_32.c
@@ -0,0 +1,420 @@
1#include <linux/errno.h>
2#include <linux/signal.h>
3#include <linux/sched.h>
4#include <linux/ioport.h>
5#include <linux/interrupt.h>
6#include <linux/slab.h>
7#include <linux/random.h>
8#include <linux/init.h>
9#include <linux/kernel_stat.h>
10#include <linux/sysdev.h>
11#include <linux/bitops.h>
12
13#include <asm/8253pit.h>
14#include <asm/atomic.h>
15#include <asm/system.h>
16#include <asm/io.h>
17#include <asm/timer.h>
18#include <asm/pgtable.h>
19#include <asm/delay.h>
20#include <asm/desc.h>
21#include <asm/apic.h>
22#include <asm/arch_hooks.h>
23#include <asm/i8259.h>
24
25#include <io_ports.h>
26
27/*
28 * This is the 'legacy' 8259A Programmable Interrupt Controller,
29 * present in the majority of PC/AT boxes.
30 * plus some generic x86-specific things, to the extent that generic
31 * specifics make any sense at all.
32 * This file should become arch/i386/kernel/irq.c when the old irq.c
33 * moves to arch-independent land.
34 */
35
36static int i8259A_auto_eoi;
37DEFINE_SPINLOCK(i8259A_lock);
38static void mask_and_ack_8259A(unsigned int);
39
40static struct irq_chip i8259A_chip = {
41 .name = "XT-PIC",
42 .mask = disable_8259A_irq,
43 .disable = disable_8259A_irq,
44 .unmask = enable_8259A_irq,
45 .mask_ack = mask_and_ack_8259A,
46};
47
48/*
49 * 8259A PIC functions to handle ISA devices:
50 */
51
52/*
53 * This contains the irq mask for both 8259A irq controllers.
54 */
55unsigned int cached_irq_mask = 0xffff;
56
57/*
58 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
59 * boards the timer interrupt is not really connected to any IO-APIC pin,
60 * it's fed to the master 8259A's IR0 line only.
61 *
62 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
63 * this 'mixed mode' IRQ handling costs nothing because it's only used
64 * at IRQ setup time.
65 */
66unsigned long io_apic_irqs;
67
68void disable_8259A_irq(unsigned int irq)
69{
70 unsigned int mask = 1 << irq;
71 unsigned long flags;
72
73 spin_lock_irqsave(&i8259A_lock, flags);
74 cached_irq_mask |= mask;
75 if (irq & 8)
76 outb(cached_slave_mask, PIC_SLAVE_IMR);
77 else
78 outb(cached_master_mask, PIC_MASTER_IMR);
79 spin_unlock_irqrestore(&i8259A_lock, flags);
80}
81
82void enable_8259A_irq(unsigned int irq)
83{
84 unsigned int mask = ~(1 << irq);
85 unsigned long flags;
86
87 spin_lock_irqsave(&i8259A_lock, flags);
88 cached_irq_mask &= mask;
89 if (irq & 8)
90 outb(cached_slave_mask, PIC_SLAVE_IMR);
91 else
92 outb(cached_master_mask, PIC_MASTER_IMR);
93 spin_unlock_irqrestore(&i8259A_lock, flags);
94}
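
/*
 * Note on the cached_*_mask accessors used above: assuming the usual
 * definitions in asm/i8259.h, cached_master_mask and cached_slave_mask
 * are just the low and high byte of cached_irq_mask, i.e. roughly
 *
 *   #define cached_master_mask (((unsigned char *)&cached_irq_mask)[0])
 *   #define cached_slave_mask  (((unsigned char *)&cached_irq_mask)[1])
 *
 * so updating the 16 bit mask and writing a single byte to the right
 * IMR keeps the hardware and the cache in sync.
 */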
95
96int i8259A_irq_pending(unsigned int irq)
97{
98 unsigned int mask = 1<<irq;
99 unsigned long flags;
100 int ret;
101
102 spin_lock_irqsave(&i8259A_lock, flags);
103 if (irq < 8)
104 ret = inb(PIC_MASTER_CMD) & mask;
105 else
106 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
107 spin_unlock_irqrestore(&i8259A_lock, flags);
108
109 return ret;
110}
111
112void make_8259A_irq(unsigned int irq)
113{
114 disable_irq_nosync(irq);
115 io_apic_irqs &= ~(1<<irq);
116 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
117 "XT");
118 enable_irq(irq);
119}
120
121/*
122 * This function is assumed to be called rarely. Switching between
123 * 8259A registers is slow.
124 * This has to be protected by the irq controller spinlock
125 * before being called.
126 */
127static inline int i8259A_irq_real(unsigned int irq)
128{
129 int value;
130 int irqmask = 1<<irq;
131
132 if (irq < 8) {
133 outb(0x0B,PIC_MASTER_CMD); /* ISR register */
134 value = inb(PIC_MASTER_CMD) & irqmask;
135 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
136 return value;
137 }
138 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
139 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
140 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
141 return value;
142}
143
144/*
145 * Careful! The 8259A is a fragile beast, it pretty
146 * much _has_ to be done exactly like this (mask it
147 * first, _then_ send the EOI, and the order of EOI
148 * to the two 8259s is important!
149 */
150static void mask_and_ack_8259A(unsigned int irq)
151{
152 unsigned int irqmask = 1 << irq;
153 unsigned long flags;
154
155 spin_lock_irqsave(&i8259A_lock, flags);
156 /*
157 * Lightweight spurious IRQ detection. We do not want
158 * to overdo spurious IRQ handling - it's usually a sign
159 * of hardware problems, so we only do the checks we can
160 * do without slowing down good hardware unnecessarily.
161 *
162 * Note that IRQ7 and IRQ15 (the two spurious IRQs
163 * usually resulting from the 8259A-1|2 PICs) occur
164 * even if the IRQ is masked in the 8259A. Thus we
165 * can check spurious 8259A IRQs without doing the
166 * quite slow i8259A_irq_real() call for every IRQ.
167 * This does not cover 100% of spurious interrupts,
168 * but should be enough to warn the user that there
169 * is something bad going on ...
170 */
171 if (cached_irq_mask & irqmask)
172 goto spurious_8259A_irq;
173 cached_irq_mask |= irqmask;
174
175handle_real_irq:
176 if (irq & 8) {
177 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
178 outb(cached_slave_mask, PIC_SLAVE_IMR);
179 outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
180 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
181 } else {
182 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
183 outb(cached_master_mask, PIC_MASTER_IMR);
184 outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI' to master */
185 }
186 spin_unlock_irqrestore(&i8259A_lock, flags);
187 return;
188
189spurious_8259A_irq:
190 /*
191 * this is the slow path - should happen rarely.
192 */
193 if (i8259A_irq_real(irq))
194 /*
195 * oops, the IRQ _is_ in service according to the
196 * 8259A - not spurious, go handle it.
197 */
198 goto handle_real_irq;
199
200 {
201 static int spurious_irq_mask;
202 /*
203 * At this point we can be sure the IRQ is spurious,
204 * let's ACK and report it. [once per IRQ]
205 */
206 if (!(spurious_irq_mask & irqmask)) {
207 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
208 spurious_irq_mask |= irqmask;
209 }
210 atomic_inc(&irq_err_count);
211 /*
212 * Theoretically we do not have to handle this IRQ,
213 * but in Linux this does not cause problems and is
214 * simpler for us.
215 */
216 goto handle_real_irq;
217 }
218}
219
220static char irq_trigger[2];
221/**
222 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
223 */
224static void restore_ELCR(char *trigger)
225{
226 outb(trigger[0], 0x4d0);
227 outb(trigger[1], 0x4d1);
228}
229
230static void save_ELCR(char *trigger)
231{
232 /* IRQ 0,1,2,8,13 are marked as reserved */
233 trigger[0] = inb(0x4d0) & 0xF8;
234 trigger[1] = inb(0x4d1) & 0xDE;
235}
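
/*
 * ELCR layout, for reference: port 0x4d0 covers IRQ0-7 and port 0x4d1
 * covers IRQ8-15; a set bit means the IRQ is level triggered, a clear
 * bit means edge triggered. The masks above (0xF8 and 0xDE) drop the
 * reserved, always-edge lines: IRQ0-2 in the first byte and IRQ8/IRQ13
 * (bits 0 and 5) in the second.
 */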
236
237static int i8259A_resume(struct sys_device *dev)
238{
239 init_8259A(i8259A_auto_eoi);
240 restore_ELCR(irq_trigger);
241 return 0;
242}
243
244static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
245{
246 save_ELCR(irq_trigger);
247 return 0;
248}
249
250static int i8259A_shutdown(struct sys_device *dev)
251{
252 /* Put the i8259A into a quiescent state that
253 * the kernel initialization code can get it
254 * out of.
255 */
256 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
257 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
258 return 0;
259}
260
261static struct sysdev_class i8259_sysdev_class = {
262 set_kset_name("i8259"),
263 .suspend = i8259A_suspend,
264 .resume = i8259A_resume,
265 .shutdown = i8259A_shutdown,
266};
267
268static struct sys_device device_i8259A = {
269 .id = 0,
270 .cls = &i8259_sysdev_class,
271};
272
273static int __init i8259A_init_sysfs(void)
274{
275 int error = sysdev_class_register(&i8259_sysdev_class);
276 if (!error)
277 error = sysdev_register(&device_i8259A);
278 return error;
279}
280
281device_initcall(i8259A_init_sysfs);
282
283void init_8259A(int auto_eoi)
284{
285 unsigned long flags;
286
287 i8259A_auto_eoi = auto_eoi;
288
289 spin_lock_irqsave(&i8259A_lock, flags);
290
291 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
292 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
293
294 /*
295 * outb_p - this has to work on a wide range of PC hardware.
296 */
297 outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
298 outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
299 outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
300 if (auto_eoi) /* master does Auto EOI */
301 outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
302 else /* master expects normal EOI */
303 outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
304
305 outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
306 outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
307 outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
308 outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
309 if (auto_eoi)
310 /*
311 * In AEOI mode we just have to mask the interrupt
312 * when acking.
313 */
314 i8259A_chip.mask_ack = disable_8259A_irq;
315 else
316 i8259A_chip.mask_ack = mask_and_ack_8259A;
317
318 udelay(100); /* wait for 8259A to initialize */
319
320 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
321 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
322
323 spin_unlock_irqrestore(&i8259A_lock, flags);
324}
325
326/*
327 * Note that on a 486, we don't want to do a SIGFPE on an irq13
328 * as the irq is unreliable, and exception 16 works correctly
329 * (ie as explained in the intel literature). On a 386, you
330 * can't use exception 16 due to bad IBM design, so we have to
331 * rely on the less exact irq13.
332 *
333 * Careful: not only is IRQ13 unreliable, it also
334 * leads to races. IBM designers who came up with it should
335 * be shot.
336 */
337
338
339static irqreturn_t math_error_irq(int cpl, void *dev_id)
340{
341 extern void math_error(void __user *);
342 outb(0,0xF0);
343 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
344 return IRQ_NONE;
345 math_error((void __user *)get_irq_regs()->eip);
346 return IRQ_HANDLED;
347}
348
349/*
350 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
351 * so allow interrupt sharing.
352 */
353static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL };
354
355void __init init_ISA_irqs (void)
356{
357 int i;
358
359#ifdef CONFIG_X86_LOCAL_APIC
360 init_bsp_APIC();
361#endif
362 init_8259A(0);
363
364 for (i = 0; i < NR_IRQS; i++) {
365 irq_desc[i].status = IRQ_DISABLED;
366 irq_desc[i].action = NULL;
367 irq_desc[i].depth = 1;
368
369 if (i < 16) {
370 /*
371 * 16 old-style INTA-cycle interrupts:
372 */
373 set_irq_chip_and_handler_name(i, &i8259A_chip,
374 handle_level_irq, "XT");
375 } else {
376 /*
377 * 'high' PCI IRQs filled in on demand
378 */
379 irq_desc[i].chip = &no_irq_chip;
380 }
381 }
382}
383
384/* Overridden in paravirt.c */
385void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
386
387void __init native_init_IRQ(void)
388{
389 int i;
390
391 /* all the set up before the call gates are initialised */
392 pre_intr_init_hook();
393
394 /*
395 * Cover the whole vector space, no vector can escape
396 * us. (some of these will be overridden and become
397 * 'special' SMP interrupts)
398 */
399 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
400 int vector = FIRST_EXTERNAL_VECTOR + i;
401 if (i >= NR_IRQS)
402 break;
403 if (vector != SYSCALL_VECTOR)
404 set_intr_gate(vector, interrupt[i]);
405 }
406
407 /* setup after call gates are initialised (usually add in
408 * the architecture specific gates)
409 */
410 intr_init_hook();
411
412 /*
413 * External FPU? Set up irq13 if so, for
414 * original braindamaged IBM FERR coupling.
415 */
416 if (boot_cpu_data.hard_math && !cpu_has_fpu)
417 setup_irq(FPU_IRQ, &fpu_irq);
418
419 irq_ctx_init(smp_processor_id());
420}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
new file mode 100644
index 000000000000..948cae646099
--- /dev/null
+++ b/arch/x86/kernel/i8259_64.c
@@ -0,0 +1,544 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14
15#include <asm/acpi.h>
16#include <asm/atomic.h>
17#include <asm/system.h>
18#include <asm/io.h>
19#include <asm/hw_irq.h>
20#include <asm/pgtable.h>
21#include <asm/delay.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24
25/*
26 * Common place to define all x86 IRQ vectors
27 *
28 * This builds up the IRQ handler stubs using some ugly macros in irq.h
29 *
30 * These macros create the low-level assembly IRQ routines that save
31 * register context and call do_IRQ(). do_IRQ() then does all the
32 * operations that are needed to keep the AT (or SMP IOAPIC)
33 * interrupt-controller happy.
34 */
35
36#define BI(x,y) \
37 BUILD_IRQ(x##y)
38
39#define BUILD_16_IRQS(x) \
40 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
41 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
42 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
43 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
44
45/*
46 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
47 * (these are usually mapped to vectors 0x30-0x3f)
48 */
49
50/*
51 * The IO-APIC gives us many more interrupt sources. Most of these
52 * are unused but an SMP system is supposed to have enough memory ...
53 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
54 * across the spectrum, so we really want to be prepared to get all
55 * of these. Plus, more powerful systems might have more than 64
56 * IO-APIC registers.
57 *
58 * (these are usually mapped into the 0x30-0xff vector range)
59 */
60 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
61BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
62BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
63BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
64
65#undef BUILD_16_IRQS
66#undef BI
67
68
69#define IRQ(x,y) \
70 IRQ##x##y##_interrupt
71
72#define IRQLIST_16(x) \
73 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
74 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
75 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
76 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
77
78/* for the irq vectors */
79static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
80 IRQLIST_16(0x2), IRQLIST_16(0x3),
81 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
82 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
83 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
84};
85
86#undef IRQ
87#undef IRQLIST_16
88
89/*
90 * This is the 'legacy' 8259A Programmable Interrupt Controller,
91 * present in the majority of PC/AT boxes.
92 * plus some generic x86-specific things, to the extent that generic
93 * specifics make any sense at all.
94 * This file should become arch/i386/kernel/irq.c when the old irq.c
95 * moves to arch-independent land.
96 */
97
98static int i8259A_auto_eoi;
99DEFINE_SPINLOCK(i8259A_lock);
100static void mask_and_ack_8259A(unsigned int);
101
102static struct irq_chip i8259A_chip = {
103 .name = "XT-PIC",
104 .mask = disable_8259A_irq,
105 .disable = disable_8259A_irq,
106 .unmask = enable_8259A_irq,
107 .mask_ack = mask_and_ack_8259A,
108};
109
110/*
111 * 8259A PIC functions to handle ISA devices:
112 */
113
114/*
115 * This contains the irq mask for both 8259A irq controllers.
116 */
117static unsigned int cached_irq_mask = 0xffff;
118
119#define __byte(x,y) (((unsigned char *)&(y))[x])
120#define cached_21 (__byte(0,cached_irq_mask))
121#define cached_A1 (__byte(1,cached_irq_mask))
122
123/*
124 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
125 * boards the timer interrupt is not really connected to any IO-APIC pin,
126 * it's fed to the master 8259A's IR0 line only.
127 *
128 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
129 * this 'mixed mode' IRQ handling costs nothing because it's only used
130 * at IRQ setup time.
131 */
132unsigned long io_apic_irqs;
133
134void disable_8259A_irq(unsigned int irq)
135{
136 unsigned int mask = 1 << irq;
137 unsigned long flags;
138
139 spin_lock_irqsave(&i8259A_lock, flags);
140 cached_irq_mask |= mask;
141 if (irq & 8)
142 outb(cached_A1,0xA1);
143 else
144 outb(cached_21,0x21);
145 spin_unlock_irqrestore(&i8259A_lock, flags);
146}
147
148void enable_8259A_irq(unsigned int irq)
149{
150 unsigned int mask = ~(1 << irq);
151 unsigned long flags;
152
153 spin_lock_irqsave(&i8259A_lock, flags);
154 cached_irq_mask &= mask;
155 if (irq & 8)
156 outb(cached_A1,0xA1);
157 else
158 outb(cached_21,0x21);
159 spin_unlock_irqrestore(&i8259A_lock, flags);
160}
161
162int i8259A_irq_pending(unsigned int irq)
163{
164 unsigned int mask = 1<<irq;
165 unsigned long flags;
166 int ret;
167
168 spin_lock_irqsave(&i8259A_lock, flags);
169 if (irq < 8)
170 ret = inb(0x20) & mask;
171 else
172 ret = inb(0xA0) & (mask >> 8);
173 spin_unlock_irqrestore(&i8259A_lock, flags);
174
175 return ret;
176}
177
178void make_8259A_irq(unsigned int irq)
179{
180 disable_irq_nosync(irq);
181 io_apic_irqs &= ~(1<<irq);
182 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
183 "XT");
184 enable_irq(irq);
185}
186
187/*
188 * This function is assumed to be called rarely. Switching between
189 * 8259A registers is slow.
190 * This has to be protected by the irq controller spinlock
191 * before being called.
192 */
193static inline int i8259A_irq_real(unsigned int irq)
194{
195 int value;
196 int irqmask = 1<<irq;
197
198 if (irq < 8) {
199 outb(0x0B,0x20); /* ISR register */
200 value = inb(0x20) & irqmask;
201 outb(0x0A,0x20); /* back to the IRR register */
202 return value;
203 }
204 outb(0x0B,0xA0); /* ISR register */
205 value = inb(0xA0) & (irqmask >> 8);
206 outb(0x0A,0xA0); /* back to the IRR register */
207 return value;
208}
209
210/*
211 * Careful! The 8259A is a fragile beast, it pretty
212 * much _has_ to be done exactly like this (mask it
213 * first, _then_ send the EOI, and the order of EOI
214 * to the two 8259s is important!
215 */
216static void mask_and_ack_8259A(unsigned int irq)
217{
218 unsigned int irqmask = 1 << irq;
219 unsigned long flags;
220
221 spin_lock_irqsave(&i8259A_lock, flags);
222 /*
223 * Lightweight spurious IRQ detection. We do not want
224 * to overdo spurious IRQ handling - it's usually a sign
225 * of hardware problems, so we only do the checks we can
226 * do without slowing down good hardware unnecessarily.
227 *
228 * Note that IRQ7 and IRQ15 (the two spurious IRQs
229 * usually resulting from the 8259A-1|2 PICs) occur
230 * even if the IRQ is masked in the 8259A. Thus we
231 * can check spurious 8259A IRQs without doing the
232 * quite slow i8259A_irq_real() call for every IRQ.
233 * This does not cover 100% of spurious interrupts,
234 * but should be enough to warn the user that there
235 * is something bad going on ...
236 */
237 if (cached_irq_mask & irqmask)
238 goto spurious_8259A_irq;
239 cached_irq_mask |= irqmask;
240
241handle_real_irq:
242 if (irq & 8) {
243 inb(0xA1); /* DUMMY - (do we need this?) */
244 outb(cached_A1,0xA1);
245 outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
246 outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */
247 } else {
248 inb(0x21); /* DUMMY - (do we need this?) */
249 outb(cached_21,0x21);
250 outb(0x60+irq,0x20); /* 'Specific EOI' to master */
251 }
252 spin_unlock_irqrestore(&i8259A_lock, flags);
253 return;
254
255spurious_8259A_irq:
256 /*
257 * this is the slow path - should happen rarely.
258 */
259 if (i8259A_irq_real(irq))
260 /*
261 * oops, the IRQ _is_ in service according to the
262 * 8259A - not spurious, go handle it.
263 */
264 goto handle_real_irq;
265
266 {
267 static int spurious_irq_mask;
268 /*
269 * At this point we can be sure the IRQ is spurious,
270 * let's ACK and report it. [once per IRQ]
271 */
272 if (!(spurious_irq_mask & irqmask)) {
273 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
274 spurious_irq_mask |= irqmask;
275 }
276 atomic_inc(&irq_err_count);
277 /*
278 * Theoretically we do not have to handle this IRQ,
279 * but in Linux this does not cause problems and is
280 * simpler for us.
281 */
282 goto handle_real_irq;
283 }
284}
285
286void init_8259A(int auto_eoi)
287{
288 unsigned long flags;
289
290 i8259A_auto_eoi = auto_eoi;
291
292 spin_lock_irqsave(&i8259A_lock, flags);
293
294 outb(0xff, 0x21); /* mask all of 8259A-1 */
295 outb(0xff, 0xA1); /* mask all of 8259A-2 */
296
297 /*
298 * outb_p - this has to work on a wide range of PC hardware.
299 */
300 outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
301 outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
302 outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
303 if (auto_eoi)
304 outb_p(0x03, 0x21); /* master does Auto EOI */
305 else
306 outb_p(0x01, 0x21); /* master expects normal EOI */
307
308 outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
309 outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
310 outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
311 outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
312 is to be investigated) */
313
314 if (auto_eoi)
315 /*
316 * in AEOI mode we just have to mask the interrupt
317 * when acking.
318 */
319 i8259A_chip.mask_ack = disable_8259A_irq;
320 else
321 i8259A_chip.mask_ack = mask_and_ack_8259A;
322
323 udelay(100); /* wait for 8259A to initialize */
324
325 outb(cached_21, 0x21); /* restore master IRQ mask */
326 outb(cached_A1, 0xA1); /* restore slave IRQ mask */
327
328 spin_unlock_irqrestore(&i8259A_lock, flags);
329}
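
/*
 * For reference, the raw ICW values used above (the 32 bit variant
 * spells most of these with symbolic names): ICW1 0x11 = edge
 * triggered, cascade mode, ICW4 needed; ICW3 is 0x04 on the master
 * (a bitmask: slave attached on IR2) but 0x02 on the slave (its
 * cascade identity); ICW4 0x01 = 8086 mode, 0x03 = 8086 mode plus
 * automatic EOI.
 */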
330
331static char irq_trigger[2];
332/**
333 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
334 */
335static void restore_ELCR(char *trigger)
336{
337 outb(trigger[0], 0x4d0);
338 outb(trigger[1], 0x4d1);
339}
340
341static void save_ELCR(char *trigger)
342{
343 /* IRQ 0,1,2,8,13 are marked as reserved */
344 trigger[0] = inb(0x4d0) & 0xF8;
345 trigger[1] = inb(0x4d1) & 0xDE;
346}
347
348static int i8259A_resume(struct sys_device *dev)
349{
350 init_8259A(i8259A_auto_eoi);
351 restore_ELCR(irq_trigger);
352 return 0;
353}
354
355static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
356{
357 save_ELCR(irq_trigger);
358 return 0;
359}
360
361static int i8259A_shutdown(struct sys_device *dev)
362{
363 /* Put the i8259A into a quiescent state that
364 * the kernel initialization code can get it
365 * out of.
366 */
367 outb(0xff, 0x21); /* mask all of 8259A-1 */
368 outb(0xff, 0xA1); /* mask all of 8259A-2 */
369 return 0;
370}
371
372static struct sysdev_class i8259_sysdev_class = {
373 set_kset_name("i8259"),
374 .suspend = i8259A_suspend,
375 .resume = i8259A_resume,
376 .shutdown = i8259A_shutdown,
377};
378
379static struct sys_device device_i8259A = {
380 .id = 0,
381 .cls = &i8259_sysdev_class,
382};
383
384static int __init i8259A_init_sysfs(void)
385{
386 int error = sysdev_class_register(&i8259_sysdev_class);
387 if (!error)
388 error = sysdev_register(&device_i8259A);
389 return error;
390}
391
392device_initcall(i8259A_init_sysfs);
393
394/*
395 * IRQ2 is cascade interrupt to second interrupt controller
396 */
397
398static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
399DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
400 [0 ... IRQ0_VECTOR - 1] = -1,
401 [IRQ0_VECTOR] = 0,
402 [IRQ1_VECTOR] = 1,
403 [IRQ2_VECTOR] = 2,
404 [IRQ3_VECTOR] = 3,
405 [IRQ4_VECTOR] = 4,
406 [IRQ5_VECTOR] = 5,
407 [IRQ6_VECTOR] = 6,
408 [IRQ7_VECTOR] = 7,
409 [IRQ8_VECTOR] = 8,
410 [IRQ9_VECTOR] = 9,
411 [IRQ10_VECTOR] = 10,
412 [IRQ11_VECTOR] = 11,
413 [IRQ12_VECTOR] = 12,
414 [IRQ13_VECTOR] = 13,
415 [IRQ14_VECTOR] = 14,
416 [IRQ15_VECTOR] = 15,
417 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
418};
419
420void __init init_ISA_irqs (void)
421{
422 int i;
423
424 init_bsp_APIC();
425 init_8259A(0);
426
427 for (i = 0; i < NR_IRQS; i++) {
428 irq_desc[i].status = IRQ_DISABLED;
429 irq_desc[i].action = NULL;
430 irq_desc[i].depth = 1;
431
432 if (i < 16) {
433 /*
434 * 16 old-style INTA-cycle interrupts:
435 */
436 set_irq_chip_and_handler_name(i, &i8259A_chip,
437 handle_level_irq, "XT");
438 } else {
439 /*
440 * 'high' PCI IRQs filled in on demand
441 */
442 irq_desc[i].chip = &no_irq_chip;
443 }
444 }
445}
446
447static void setup_timer_hardware(void)
448{
449 outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
450 udelay(10);
451 outb_p(LATCH & 0xff , 0x40); /* LSB */
452 udelay(10);
453 outb(LATCH >> 8 , 0x40); /* MSB */
454}
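
/*
 * 0x43 and 0x40 are the PIT mode/command and channel 0 data ports
 * (PIT_MODE and PIT_CH0 in the 32 bit code); 0x34 reprograms channel 0
 * as a binary rate generator (mode 2) that reloads every LATCH input
 * cycles, i.e. fires roughly HZ times per second.
 */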
455
456static int timer_resume(struct sys_device *dev)
457{
458 setup_timer_hardware();
459 return 0;
460}
461
462void i8254_timer_resume(void)
463{
464 setup_timer_hardware();
465}
466
467static struct sysdev_class timer_sysclass = {
468 set_kset_name("timer_pit"),
469 .resume = timer_resume,
470};
471
472static struct sys_device device_timer = {
473 .id = 0,
474 .cls = &timer_sysclass,
475};
476
477static int __init init_timer_sysfs(void)
478{
479 int error = sysdev_class_register(&timer_sysclass);
480 if (!error)
481 error = sysdev_register(&device_timer);
482 return error;
483}
484
485device_initcall(init_timer_sysfs);
486
487void __init init_IRQ(void)
488{
489 int i;
490
491 init_ISA_irqs();
492 /*
493 * Cover the whole vector space, no vector can escape
494 * us. (some of these will be overridden and become
495 * 'special' SMP interrupts)
496 */
497 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
498 int vector = FIRST_EXTERNAL_VECTOR + i;
499 if (vector != IA32_SYSCALL_VECTOR)
500 set_intr_gate(vector, interrupt[i]);
501 }
502
503#ifdef CONFIG_SMP
504 /*
505 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
506 * IPI, driven by wakeup.
507 */
508 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
509
510 /* IPIs for invalidation */
511 set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
512 set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
513 set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
514 set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
515 set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
516 set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
517 set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
518 set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
519
520 /* IPI for generic function call */
521 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
522
523 /* Low priority IPI to cleanup after moving an irq */
524 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
525#endif
526 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
527 set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
528
529 /* self generated IPI for local APIC timer */
530 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
531
532 /* IPI vectors for APIC spurious and error interrupts */
533 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
534 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
535
536 /*
537 * Set the clock to HZ Hz, we already have a valid
538 * vector now:
539 */
540 setup_timer_hardware();
541
542 if (!acpi_ioapic)
543 setup_irq(2, &irq2);
544}
diff --git a/arch/x86/kernel/init_task_32.c b/arch/x86/kernel/init_task_32.c
new file mode 100644
index 000000000000..d26fc063a760
--- /dev/null
+++ b/arch/x86/kernel/init_task_32.c
@@ -0,0 +1,46 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm);
18
19EXPORT_SYMBOL(init_mm);
20
21/*
22 * Initial thread structure.
23 *
24 * We need to make sure that this is THREAD_SIZE aligned due to the
25 * way process stacks are handled. This is done by having a special
26 * "init_task" linker map entry..
27 */
28union thread_union init_thread_union
29 __attribute__((__section__(".data.init_task"))) =
30 { INIT_THREAD_INFO(init_task) };
31
32/*
33 * Initial task structure.
34 *
35 * All other task structs will be allocated on slabs in fork.c
36 */
37struct task_struct init_task = INIT_TASK(init_task);
38
39EXPORT_SYMBOL(init_task);
40
41/*
42 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
43 * no more per-task TSS's.
44 */
45DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
46
diff --git a/arch/x86/kernel/init_task_64.c b/arch/x86/kernel/init_task_64.c
new file mode 100644
index 000000000000..4ff33d4f8551
--- /dev/null
+++ b/arch/x86/kernel/init_task_64.c
@@ -0,0 +1,54 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm);
18
19EXPORT_SYMBOL(init_mm);
20
21/*
22 * Initial thread structure.
23 *
24 * We need to make sure that this is 8192-byte aligned due to the
25 * way process stacks are handled. This is done by having a special
26 * "init_task" linker map entry..
27 */
28union thread_union init_thread_union
29 __attribute__((__section__(".data.init_task"))) =
30 { INIT_THREAD_INFO(init_task) };
31
32/*
33 * Initial task structure.
34 *
35 * All other task structs will be allocated on slabs in fork.c
36 */
37struct task_struct init_task = INIT_TASK(init_task);
38
39EXPORT_SYMBOL(init_task);
40/*
41 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
42 * no more per-task TSS's. The TSS size is kept cacheline-aligned
43 * so they are allowed to end up in the .data.cacheline_aligned
44 * section. Since TSS's are completely CPU-local, we want them
45 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
46 */
47DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
48
49/* Copies of the original ist values from the tss are only accessed during
50 * debugging, no special alignment required.
51 */
52DEFINE_PER_CPU(struct orig_ist, orig_ist);
53
54#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
new file mode 100644
index 000000000000..e2f4a1c68547
--- /dev/null
+++ b/arch/x86/kernel/io_apic_32.c
@@ -0,0 +1,2847 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/interrupt.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/sched.h>
28#include <linux/mc146818rtc.h>
29#include <linux/compiler.h>
30#include <linux/acpi.h>
31#include <linux/module.h>
32#include <linux/sysdev.h>
33#include <linux/pci.h>
34#include <linux/msi.h>
35#include <linux/htirq.h>
36#include <linux/freezer.h>
37#include <linux/kthread.h>
38
39#include <asm/io.h>
40#include <asm/smp.h>
41#include <asm/desc.h>
42#include <asm/timer.h>
43#include <asm/i8259.h>
44#include <asm/nmi.h>
45#include <asm/msidef.h>
46#include <asm/hypertransport.h>
47
48#include <mach_apic.h>
49#include <mach_apicdef.h>
50
51#include "io_ports.h"
52
53int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count;
55
56/* Where, if anywhere, the i8259 is connected in ExtINT mode */
57static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
58
59static DEFINE_SPINLOCK(ioapic_lock);
60static DEFINE_SPINLOCK(vector_lock);
61
62int timer_over_8254 __initdata = 1;
63
64/*
65 * Is the SiS APIC rmw bug present ?
66 * -1 = don't know, 0 = no, 1 = yes
67 */
68int sis_apic_bug = -1;
69
70/*
71 * # of IRQ routing registers
72 */
73int nr_ioapic_registers[MAX_IO_APICS];
74
75static int disable_timer_pin_1 __initdata;
76
77/*
78 * Rough estimate of how many shared IRQs there are; can
79 * be changed anytime.
80 */
81#define MAX_PLUS_SHARED_IRQS NR_IRQS
82#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
83
84/*
85 * This is performance-critical; we want to do it in O(1)
86 *
87 * the indexing order of this array favors 1:1 mappings
88 * between pins and IRQs.
89 */
90
91static struct irq_pin_list {
92 int apic, pin, next;
93} irq_2_pin[PIN_MAP_SIZE];
94
95struct io_apic {
96 unsigned int index;
97 unsigned int unused[3];
98 unsigned int data;
99};
100
101static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
102{
103 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
104 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
105}
106
107static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
108{
109 struct io_apic __iomem *io_apic = io_apic_base(apic);
110 writel(reg, &io_apic->index);
111 return readl(&io_apic->data);
112}
113
114static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
115{
116 struct io_apic __iomem *io_apic = io_apic_base(apic);
117 writel(reg, &io_apic->index);
118 writel(value, &io_apic->data);
119}
120
121/*
122 * Re-write a value: to be used for read-modify-write
123 * cycles where the read already set up the index register.
124 *
125 * Older SiS APICs require that we rewrite the index register
126 */
127static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
128{
129 volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
130 if (sis_apic_bug)
131 writel(reg, &io_apic->index);
132 writel(value, &io_apic->data);
133}
134
135union entry_union {
136 struct { u32 w1, w2; };
137 struct IO_APIC_route_entry entry;
138};
139
140static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
141{
142 union entry_union eu;
143 unsigned long flags;
144 spin_lock_irqsave(&ioapic_lock, flags);
145 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
146 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
147 spin_unlock_irqrestore(&ioapic_lock, flags);
148 return eu.entry;
149}
150
151/*
152 * When we write a new IO APIC routing entry, we need to write the high
153 * word first! If the mask bit in the low word is clear, we will enable
154 * the interrupt, and we need to make sure the entry is fully populated
155 * before that happens.
156 */
157static void
158__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
159{
160 union entry_union eu;
161 eu.entry = e;
162 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
163 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
164}
165
166static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
167{
168 unsigned long flags;
169 spin_lock_irqsave(&ioapic_lock, flags);
170 __ioapic_write_entry(apic, pin, e);
171 spin_unlock_irqrestore(&ioapic_lock, flags);
172}
173
174/*
175 * When we mask an IO APIC routing entry, we need to write the low
176 * word first, in order to set the mask bit before we change the
177 * high bits!
178 */
179static void ioapic_mask_entry(int apic, int pin)
180{
181 unsigned long flags;
182 union entry_union eu = { .entry.mask = 1 };
183
184 spin_lock_irqsave(&ioapic_lock, flags);
185 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
186 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
187 spin_unlock_irqrestore(&ioapic_lock, flags);
188}
189
190/*
191 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
192 * shared ISA-space IRQs, so we have to support them. We are super
193 * fast in the common case, and fast for shared ISA-space IRQs.
194 */
195static void add_pin_to_irq(unsigned int irq, int apic, int pin)
196{
197 static int first_free_entry = NR_IRQS;
198 struct irq_pin_list *entry = irq_2_pin + irq;
199
200 while (entry->next)
201 entry = irq_2_pin + entry->next;
202
203 if (entry->pin != -1) {
204 entry->next = first_free_entry;
205 entry = irq_2_pin + entry->next;
206 if (++first_free_entry >= PIN_MAP_SIZE)
207 panic("io_apic.c: whoops");
208 }
209 entry->apic = apic;
210 entry->pin = pin;
211}
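
/*
 * A note on the data structure used above: irq_2_pin[] is a statically
 * sized pool that doubles as a set of singly linked lists. Entries
 * 0..NR_IRQS-1 are the list heads (one per IRQ); when an IRQ is shared
 * by several IO-APIC pins, extra nodes are carved out of the tail of
 * the array (first_free_entry) and chained through the 'next' index.
 */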
212
213/*
214 * Reroute an IRQ to a different pin.
215 */
216static void __init replace_pin_at_irq(unsigned int irq,
217 int oldapic, int oldpin,
218 int newapic, int newpin)
219{
220 struct irq_pin_list *entry = irq_2_pin + irq;
221
222 while (1) {
223 if (entry->apic == oldapic && entry->pin == oldpin) {
224 entry->apic = newapic;
225 entry->pin = newpin;
226 }
227 if (!entry->next)
228 break;
229 entry = irq_2_pin + entry->next;
230 }
231}
232
233static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
234{
235 struct irq_pin_list *entry = irq_2_pin + irq;
236 unsigned int pin, reg;
237
238 for (;;) {
239 pin = entry->pin;
240 if (pin == -1)
241 break;
242 reg = io_apic_read(entry->apic, 0x10 + pin*2);
243 reg &= ~disable;
244 reg |= enable;
245 io_apic_modify(entry->apic, 0x10 + pin*2, reg);
246 if (!entry->next)
247 break;
248 entry = irq_2_pin + entry->next;
249 }
250}
251
252/* mask = 1 */
253static void __mask_IO_APIC_irq (unsigned int irq)
254{
255 __modify_IO_APIC_irq(irq, 0x00010000, 0);
256}
257
258/* mask = 0 */
259static void __unmask_IO_APIC_irq (unsigned int irq)
260{
261 __modify_IO_APIC_irq(irq, 0, 0x00010000);
262}
263
264/* mask = 1, trigger = 0 */
265static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
266{
267 __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
268}
269
270/* mask = 0, trigger = 1 */
271static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
272{
273 __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
274}
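
/*
 * The magic constants above live in the low dword of an IO-APIC
 * redirection entry (register 0x10 + 2*pin): bit 16 (0x00010000) is
 * the mask bit and bit 15 (0x00008000) selects level (1) versus edge
 * (0) trigger mode, which is what the mask/unmask and edge/level
 * helpers flip.
 */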
275
276static void mask_IO_APIC_irq (unsigned int irq)
277{
278 unsigned long flags;
279
280 spin_lock_irqsave(&ioapic_lock, flags);
281 __mask_IO_APIC_irq(irq);
282 spin_unlock_irqrestore(&ioapic_lock, flags);
283}
284
285static void unmask_IO_APIC_irq (unsigned int irq)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&ioapic_lock, flags);
290 __unmask_IO_APIC_irq(irq);
291 spin_unlock_irqrestore(&ioapic_lock, flags);
292}
293
294static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
295{
296 struct IO_APIC_route_entry entry;
297
298 /* Check delivery_mode to be sure we're not clearing an SMI pin */
299 entry = ioapic_read_entry(apic, pin);
300 if (entry.delivery_mode == dest_SMI)
301 return;
302
303 /*
304 * Disable it in the IO-APIC irq-routing table:
305 */
306 ioapic_mask_entry(apic, pin);
307}
308
309static void clear_IO_APIC (void)
310{
311 int apic, pin;
312
313 for (apic = 0; apic < nr_ioapics; apic++)
314 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
315 clear_IO_APIC_pin(apic, pin);
316}
317
318#ifdef CONFIG_SMP
319static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
320{
321 unsigned long flags;
322 int pin;
323 struct irq_pin_list *entry = irq_2_pin + irq;
324 unsigned int apicid_value;
325 cpumask_t tmp;
326
327 cpus_and(tmp, cpumask, cpu_online_map);
328 if (cpus_empty(tmp))
329 tmp = TARGET_CPUS;
330
331 cpus_and(cpumask, tmp, CPU_MASK_ALL);
332
333 apicid_value = cpu_mask_to_apicid(cpumask);
334 /* Prepare to do the io_apic_write */
335 apicid_value = apicid_value << 24;
336 spin_lock_irqsave(&ioapic_lock, flags);
337 for (;;) {
338 pin = entry->pin;
339 if (pin == -1)
340 break;
341 io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
342 if (!entry->next)
343 break;
344 entry = irq_2_pin + entry->next;
345 }
346 irq_desc[irq].affinity = cpumask;
347 spin_unlock_irqrestore(&ioapic_lock, flags);
348}
349
350#if defined(CONFIG_IRQBALANCE)
351# include <asm/processor.h> /* kernel_thread() */
352# include <linux/kernel_stat.h> /* kstat */
353# include <linux/slab.h> /* kmalloc() */
354# include <linux/timer.h> /* time_after() */
355
356#define IRQBALANCE_CHECK_ARCH -999
357#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
358#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
359#define BALANCED_IRQ_MORE_DELTA (HZ/10)
360#define BALANCED_IRQ_LESS_DELTA (HZ)
361
362static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
363static int physical_balance __read_mostly;
364static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
365
366static struct irq_cpu_info {
367 unsigned long * last_irq;
368 unsigned long * irq_delta;
369 unsigned long irq;
370} irq_cpu_data[NR_CPUS];
371
372#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
373#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
374#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
375
376#define IDLE_ENOUGH(cpu,now) \
377 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
378
379#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
380
381#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
382
383static cpumask_t balance_irq_affinity[NR_IRQS] = {
384 [0 ... NR_IRQS-1] = CPU_MASK_ALL
385};
386
387void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
388{
389 balance_irq_affinity[irq] = mask;
390}
391
392static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
393 unsigned long now, int direction)
394{
395 int search_idle = 1;
396 int cpu = curr_cpu;
397
398 goto inside;
399
400 do {
401 if (unlikely(cpu == curr_cpu))
402 search_idle = 0;
403inside:
404 if (direction == 1) {
405 cpu++;
406 if (cpu >= NR_CPUS)
407 cpu = 0;
408 } else {
409 cpu--;
410 if (cpu == -1)
411 cpu = NR_CPUS-1;
412 }
413 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
414 (search_idle && !IDLE_ENOUGH(cpu,now)));
415
416 return cpu;
417}
418
419static inline void balance_irq(int cpu, int irq)
420{
421 unsigned long now = jiffies;
422 cpumask_t allowed_mask;
423 unsigned int new_cpu;
424
425 if (irqbalance_disabled)
426 return;
427
428 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
429 new_cpu = move(cpu, allowed_mask, now, 1);
430 if (cpu != new_cpu) {
431 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
432 }
433}
434
435static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
436{
437 int i, j;
438
439 for_each_online_cpu(i) {
440 for (j = 0; j < NR_IRQS; j++) {
441 if (!irq_desc[j].action)
442 continue;
443 /* Is it a significant load ? */
444 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
445 useful_load_threshold)
446 continue;
447 balance_irq(i, j);
448 }
449 }
450 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
451 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
452 return;
453}
454
455static void do_irq_balance(void)
456{
457 int i, j;
458 unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
459 unsigned long move_this_load = 0;
460 int max_loaded = 0, min_loaded = 0;
461 int load;
462 unsigned long useful_load_threshold = balanced_irq_interval + 10;
463 int selected_irq;
464 int tmp_loaded, first_attempt = 1;
465 unsigned long tmp_cpu_irq;
466 unsigned long imbalance = 0;
467 cpumask_t allowed_mask, target_cpu_mask, tmp;
468
469 for_each_possible_cpu(i) {
470 int package_index;
471 CPU_IRQ(i) = 0;
472 if (!cpu_online(i))
473 continue;
474 package_index = CPU_TO_PACKAGEINDEX(i);
475 for (j = 0; j < NR_IRQS; j++) {
476 unsigned long value_now, delta;
477 /* Is this an active IRQ or balancing disabled ? */
478 if (!irq_desc[j].action || irq_balancing_disabled(j))
479 continue;
480 if ( package_index == i )
481 IRQ_DELTA(package_index,j) = 0;
482 /* Determine the total count per processor per IRQ */
483 value_now = (unsigned long) kstat_cpu(i).irqs[j];
484
485 /* Determine the activity per processor per IRQ */
486 delta = value_now - LAST_CPU_IRQ(i,j);
487
488 /* Update last_cpu_irq[][] for the next time */
489 LAST_CPU_IRQ(i,j) = value_now;
490
491 /* Ignore IRQs whose rate is less than the clock */
492 if (delta < useful_load_threshold)
493 continue;
494 /* update the load for the processor or package total */
495 IRQ_DELTA(package_index,j) += delta;
496
497 /* Keep track of the higher numbered sibling as well */
498 if (i != package_index)
499 CPU_IRQ(i) += delta;
500 /*
501 * We have sibling A and sibling B in the package
502 *
503 * cpu_irq[A] = load for cpu A + load for cpu B
504 * cpu_irq[B] = load for cpu B
505 */
506 CPU_IRQ(package_index) += delta;
507 }
508 }
509 /* Find the least loaded processor package */
510 for_each_online_cpu(i) {
511 if (i != CPU_TO_PACKAGEINDEX(i))
512 continue;
513 if (min_cpu_irq > CPU_IRQ(i)) {
514 min_cpu_irq = CPU_IRQ(i);
515 min_loaded = i;
516 }
517 }
518 max_cpu_irq = ULONG_MAX;
519
520tryanothercpu:
521 /* Look for heaviest loaded processor.
522 * We may come back to get the next heaviest loaded processor.
523 * Skip processors with trivial loads.
524 */
525 tmp_cpu_irq = 0;
526 tmp_loaded = -1;
527 for_each_online_cpu(i) {
528 if (i != CPU_TO_PACKAGEINDEX(i))
529 continue;
530 if (max_cpu_irq <= CPU_IRQ(i))
531 continue;
532 if (tmp_cpu_irq < CPU_IRQ(i)) {
533 tmp_cpu_irq = CPU_IRQ(i);
534 tmp_loaded = i;
535 }
536 }
537
538 if (tmp_loaded == -1) {
539 /* In the case of a small number of heavy interrupt sources,
540 * some of the cpus end up loaded too much. We use Ingo's original
541 * approach and rotate them around.
542 */
543 if (!first_attempt && imbalance >= useful_load_threshold) {
544 rotate_irqs_among_cpus(useful_load_threshold);
545 return;
546 }
547 goto not_worth_the_effort;
548 }
549
550 first_attempt = 0; /* heaviest search */
551 max_cpu_irq = tmp_cpu_irq; /* load */
552 max_loaded = tmp_loaded; /* processor */
553 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
554
555 /* If the imbalance is less than roughly 10% of the max load, we
556 * hit diminishing returns - quit.
557 */
558 if (imbalance < (max_cpu_irq >> 3))
559 goto not_worth_the_effort;
560
561tryanotherirq:
562 /* if we select an IRQ to move that can't go where we want, then
563 * see if there is another one to try.
564 */
565 move_this_load = 0;
566 selected_irq = -1;
567 for (j = 0; j < NR_IRQS; j++) {
568 /* Is this an active IRQ? */
569 if (!irq_desc[j].action)
570 continue;
571 if (imbalance <= IRQ_DELTA(max_loaded,j))
572 continue;
573 /* Try to find the IRQ that is closest to the imbalance
574 * without going over.
575 */
576 if (move_this_load < IRQ_DELTA(max_loaded,j)) {
577 move_this_load = IRQ_DELTA(max_loaded,j);
578 selected_irq = j;
579 }
580 }
581 if (selected_irq == -1) {
582 goto tryanothercpu;
583 }
584
585 imbalance = move_this_load;
586
587 /* For the physical_balance case, we accumulated both load
588 * values in one of the siblings' cpu_irq[],
589 * to use the same code for physical and logical processors
590 * as much as possible.
591 *
592 * NOTE: the cpu_irq[] array holds the sum of the load for
593 * sibling A and sibling B in the slot for the lowest numbered
594 * sibling (A), _AND_ the load for sibling B in the slot for
595 * the higher numbered sibling.
596 *
597 * We seek the least loaded sibling by making the comparison
598 * (A+B)/2 vs B
599 */
600 load = CPU_IRQ(min_loaded) >> 1;
601 for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
602 if (load > CPU_IRQ(j)) {
603 /* This won't change cpu_sibling_map[min_loaded] */
604 load = CPU_IRQ(j);
605 min_loaded = j;
606 }
607 }
608
609 cpus_and(allowed_mask,
610 cpu_online_map,
611 balance_irq_affinity[selected_irq]);
612 target_cpu_mask = cpumask_of_cpu(min_loaded);
613 cpus_and(tmp, target_cpu_mask, allowed_mask);
614
615 if (!cpus_empty(tmp)) {
616 /* mark for change destination */
617 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
618
619 /* Since we made a change, come back sooner to
620 * check for more variation.
621 */
622 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
623 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
624 return;
625 }
626 goto tryanotherirq;
627
628not_worth_the_effort:
629 /*
630 * if we did not find an IRQ to move, then adjust the time interval
631 * upward
632 */
633 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
634 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
635 return;
636}
637
638static int balanced_irq(void *unused)
639{
640 int i;
641 unsigned long prev_balance_time = jiffies;
642 long time_remaining = balanced_irq_interval;
643
644 /* push everything to CPU 0 to give us a starting point. */
645 for (i = 0 ; i < NR_IRQS ; i++) {
646 irq_desc[i].pending_mask = cpumask_of_cpu(0);
647 set_pending_irq(i, cpumask_of_cpu(0));
648 }
649
650 set_freezable();
651 for ( ; ; ) {
652 time_remaining = schedule_timeout_interruptible(time_remaining);
653 try_to_freeze();
654 if (time_after(jiffies,
655 prev_balance_time+balanced_irq_interval)) {
656 preempt_disable();
657 do_irq_balance();
658 prev_balance_time = jiffies;
659 time_remaining = balanced_irq_interval;
660 preempt_enable();
661 }
662 }
663 return 0;
664}
665
666static int __init balanced_irq_init(void)
667{
668 int i;
669 struct cpuinfo_x86 *c;
670 cpumask_t tmp;
671
672 cpus_shift_right(tmp, cpu_online_map, 2);
673 c = &boot_cpu_data;
674 /* When not overwritten by the command line ask subarchitecture. */
675 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
676 irqbalance_disabled = NO_BALANCE_IRQ;
677 if (irqbalance_disabled)
678 return 0;
679
680 /* disable irqbalance completely if there is only one processor online */
681 if (num_online_cpus() < 2) {
682 irqbalance_disabled = 1;
683 return 0;
684 }
685 /*
686 * Enable physical balance only if more than 1 physical processor
687 * is present
688 */
689 if (smp_num_siblings > 1 && !cpus_empty(tmp))
690 physical_balance = 1;
691
692 for_each_online_cpu(i) {
693 irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
694 irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
695 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
696 printk(KERN_ERR "balanced_irq_init: out of memory");
697 goto failed;
698 }
699 memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
700 memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
701 }
702
703 printk(KERN_INFO "Starting balanced_irq\n");
704 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
705 return 0;
706 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
707failed:
708 for_each_possible_cpu(i) {
709 kfree(irq_cpu_data[i].irq_delta);
710 irq_cpu_data[i].irq_delta = NULL;
711 kfree(irq_cpu_data[i].last_irq);
712 irq_cpu_data[i].last_irq = NULL;
713 }
714 return 0;
715}
716
717int __devinit irqbalance_disable(char *str)
718{
719 irqbalance_disabled = 1;
720 return 1;
721}
722
723__setup("noirqbalance", irqbalance_disable);
724
725late_initcall(balanced_irq_init);
726#endif /* CONFIG_IRQBALANCE */
727#endif /* CONFIG_SMP */
728
729#ifndef CONFIG_SMP
730void fastcall send_IPI_self(int vector)
731{
732 unsigned int cfg;
733
734 /*
735 * Wait for idle.
736 */
737 apic_wait_icr_idle();
738 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
739 /*
740 * Send the IPI. The write to APIC_ICR fires this off.
741 */
742 apic_write_around(APIC_ICR, cfg);
743}
744#endif /* !CONFIG_SMP */
745
746
747/*
748 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
749 * specific CPU-side IRQs.
750 */
751
752#define MAX_PIRQS 8
753static int pirq_entries [MAX_PIRQS];
754static int pirqs_enabled;
755int skip_ioapic_setup;
756
757static int __init ioapic_pirq_setup(char *str)
758{
759 int i, max;
760 int ints[MAX_PIRQS+1];
761
762 get_options(str, ARRAY_SIZE(ints), ints);
763
764 for (i = 0; i < MAX_PIRQS; i++)
765 pirq_entries[i] = -1;
766
767 pirqs_enabled = 1;
768 apic_printk(APIC_VERBOSE, KERN_INFO
769 "PIRQ redirection, working around broken MP-BIOS.\n");
770 max = MAX_PIRQS;
771 if (ints[0] < MAX_PIRQS)
772 max = ints[0];
773
774 for (i = 0; i < max; i++) {
775 apic_printk(APIC_VERBOSE, KERN_DEBUG
776 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
777 /*
778 * PIRQs are mapped upside down, usually.
779 */
780 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
781 }
782 return 1;
783}
784
785__setup("pirq=", ioapic_pirq_setup);
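/*
 * Usage sketch (illustrative example): booting with e.g. "pirq=15,11,10"
 * feeds three IRQ numbers to the parser above.  The loop stores them in
 * reverse slot order (pirq_entries[MAX_PIRQS-i-1]) because PIRQ lines are
 * usually numbered upside down; pin_2_irq() below consults this table for
 * IO-APIC pins 16-23.
 */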
786
787/*
788 * Find the IRQ entry number of a certain pin.
789 */
790static int find_irq_entry(int apic, int pin, int type)
791{
792 int i;
793
794 for (i = 0; i < mp_irq_entries; i++)
795 if (mp_irqs[i].mpc_irqtype == type &&
796 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
797 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
798 mp_irqs[i].mpc_dstirq == pin)
799 return i;
800
801 return -1;
802}
803
804/*
805 * Find the pin to which IRQ[irq] (ISA) is connected
806 */
807static int __init find_isa_irq_pin(int irq, int type)
808{
809 int i;
810
811 for (i = 0; i < mp_irq_entries; i++) {
812 int lbus = mp_irqs[i].mpc_srcbus;
813
814 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
815 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
816 mp_bus_id_to_type[lbus] == MP_BUS_MCA
817 ) &&
818 (mp_irqs[i].mpc_irqtype == type) &&
819 (mp_irqs[i].mpc_srcbusirq == irq))
820
821 return mp_irqs[i].mpc_dstirq;
822 }
823 return -1;
824}
825
826static int __init find_isa_irq_apic(int irq, int type)
827{
828 int i;
829
830 for (i = 0; i < mp_irq_entries; i++) {
831 int lbus = mp_irqs[i].mpc_srcbus;
832
833 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
834 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
835 mp_bus_id_to_type[lbus] == MP_BUS_MCA
836 ) &&
837 (mp_irqs[i].mpc_irqtype == type) &&
838 (mp_irqs[i].mpc_srcbusirq == irq))
839 break;
840 }
841 if (i < mp_irq_entries) {
842 int apic;
843 for(apic = 0; apic < nr_ioapics; apic++) {
844 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
845 return apic;
846 }
847 }
848
849 return -1;
850}
851
852/*
853 * Find a specific PCI IRQ entry.
854 * Not an __init, possibly needed by modules
855 */
856static int pin_2_irq(int idx, int apic, int pin);
857
858int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
859{
860 int apic, i, best_guess = -1;
861
862 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
863 "slot:%d, pin:%d.\n", bus, slot, pin);
864 if (mp_bus_id_to_pci_bus[bus] == -1) {
865 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
866 return -1;
867 }
868 for (i = 0; i < mp_irq_entries; i++) {
869 int lbus = mp_irqs[i].mpc_srcbus;
870
871 for (apic = 0; apic < nr_ioapics; apic++)
872 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
873 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
874 break;
875
876 if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
877 !mp_irqs[i].mpc_irqtype &&
878 (bus == lbus) &&
879 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
880 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
881
882 if (!(apic || IO_APIC_IRQ(irq)))
883 continue;
884
885 if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
886 return irq;
887 /*
888 * Use the first all-but-pin matching entry as a
889 * best-guess fuzzy result for broken mptables.
890 */
891 if (best_guess < 0)
892 best_guess = irq;
893 }
894 }
895 return best_guess;
896}
897EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
898
899/*
900 * This function is currently only a helper for the i386 SMP boot process, where
901 * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
902 * so the mask in all cases should simply be TARGET_CPUS.
903 */
904#ifdef CONFIG_SMP
905void __init setup_ioapic_dest(void)
906{
907 int pin, ioapic, irq, irq_entry;
908
909 if (skip_ioapic_setup == 1)
910 return;
911
912 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
913 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
914 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
915 if (irq_entry == -1)
916 continue;
917 irq = pin_2_irq(irq_entry, ioapic, pin);
918 set_ioapic_affinity_irq(irq, TARGET_CPUS);
919 }
920
921 }
922}
923#endif
924
925/*
926 * EISA Edge/Level control register, ELCR
927 */
928static int EISA_ELCR(unsigned int irq)
929{
930 if (irq < 16) {
931 unsigned int port = 0x4d0 + (irq >> 3);
932 return (inb(port) >> (irq & 7)) & 1;
933 }
934 apic_printk(APIC_VERBOSE, KERN_INFO
935 "Broken MPtable reports ISA irq %d\n", irq);
936 return 0;
937}
938
939/* EISA interrupts are always polarity zero and can be edge or level
940 * trigger depending on the ELCR value. If an interrupt is listed as
941 * EISA conforming in the MP table, that means its trigger type must
942 * be read in from the ELCR */
943
944#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
945#define default_EISA_polarity(idx) (0)
946
947/* ISA interrupts are always polarity zero edge triggered,
948 * when listed as conforming in the MP table. */
949
950#define default_ISA_trigger(idx) (0)
951#define default_ISA_polarity(idx) (0)
952
953/* PCI interrupts are always polarity one level triggered,
954 * when listed as conforming in the MP table. */
955
956#define default_PCI_trigger(idx) (1)
957#define default_PCI_polarity(idx) (1)
958
959/* MCA interrupts are always polarity zero level triggered,
960 * when listed as conforming in the MP table. */
961
962#define default_MCA_trigger(idx) (1)
963#define default_MCA_polarity(idx) (0)
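/*
 * Summary of the conforming defaults above (trigger: 0 = edge, 1 = level;
 * polarity: 0 = active high, 1 = active low):
 *
 *	ISA	edge		active high
 *	EISA	from ELCR	active high
 *	PCI	level		active low
 *	MCA	level		active high
 */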
964
965static int __init MPBIOS_polarity(int idx)
966{
967 int bus = mp_irqs[idx].mpc_srcbus;
968 int polarity;
969
970 /*
971 * Determine IRQ line polarity (high active or low active):
972 */
973 switch (mp_irqs[idx].mpc_irqflag & 3)
974 {
975 case 0: /* conforms, ie. bus-type dependent polarity */
976 {
977 switch (mp_bus_id_to_type[bus])
978 {
979 case MP_BUS_ISA: /* ISA pin */
980 {
981 polarity = default_ISA_polarity(idx);
982 break;
983 }
984 case MP_BUS_EISA: /* EISA pin */
985 {
986 polarity = default_EISA_polarity(idx);
987 break;
988 }
989 case MP_BUS_PCI: /* PCI pin */
990 {
991 polarity = default_PCI_polarity(idx);
992 break;
993 }
994 case MP_BUS_MCA: /* MCA pin */
995 {
996 polarity = default_MCA_polarity(idx);
997 break;
998 }
999 default:
1000 {
1001 printk(KERN_WARNING "broken BIOS!!\n");
1002 polarity = 1;
1003 break;
1004 }
1005 }
1006 break;
1007 }
1008 case 1: /* high active */
1009 {
1010 polarity = 0;
1011 break;
1012 }
1013 case 2: /* reserved */
1014 {
1015 printk(KERN_WARNING "broken BIOS!!\n");
1016 polarity = 1;
1017 break;
1018 }
1019 case 3: /* low active */
1020 {
1021 polarity = 1;
1022 break;
1023 }
1024 default: /* invalid */
1025 {
1026 printk(KERN_WARNING "broken BIOS!!\n");
1027 polarity = 1;
1028 break;
1029 }
1030 }
1031 return polarity;
1032}
1033
1034static int MPBIOS_trigger(int idx)
1035{
1036 int bus = mp_irqs[idx].mpc_srcbus;
1037 int trigger;
1038
1039 /*
1040 * Determine IRQ trigger mode (edge or level sensitive):
1041 */
1042 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
1043 {
1044 case 0: /* conforms, ie. bus-type dependent */
1045 {
1046 switch (mp_bus_id_to_type[bus])
1047 {
1048 case MP_BUS_ISA: /* ISA pin */
1049 {
1050 trigger = default_ISA_trigger(idx);
1051 break;
1052 }
1053 case MP_BUS_EISA: /* EISA pin */
1054 {
1055 trigger = default_EISA_trigger(idx);
1056 break;
1057 }
1058 case MP_BUS_PCI: /* PCI pin */
1059 {
1060 trigger = default_PCI_trigger(idx);
1061 break;
1062 }
1063 case MP_BUS_MCA: /* MCA pin */
1064 {
1065 trigger = default_MCA_trigger(idx);
1066 break;
1067 }
1068 default:
1069 {
1070 printk(KERN_WARNING "broken BIOS!!\n");
1071 trigger = 1;
1072 break;
1073 }
1074 }
1075 break;
1076 }
1077 case 1: /* edge */
1078 {
1079 trigger = 0;
1080 break;
1081 }
1082 case 2: /* reserved */
1083 {
1084 printk(KERN_WARNING "broken BIOS!!\n");
1085 trigger = 1;
1086 break;
1087 }
1088 case 3: /* level */
1089 {
1090 trigger = 1;
1091 break;
1092 }
1093 default: /* invalid */
1094 {
1095 printk(KERN_WARNING "broken BIOS!!\n");
1096 trigger = 0;
1097 break;
1098 }
1099 }
1100 return trigger;
1101}
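/*
 * The switches above decode the low four bits of the MP table's irqflag
 * word: bits 1:0 give the polarity and bits 3:2 the trigger mode, where 0
 * in either field means "conforms to the bus specification" and falls back
 * to the per-bus defaults defined earlier.
 */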
1102
1103static inline int irq_polarity(int idx)
1104{
1105 return MPBIOS_polarity(idx);
1106}
1107
1108static inline int irq_trigger(int idx)
1109{
1110 return MPBIOS_trigger(idx);
1111}
1112
1113static int pin_2_irq(int idx, int apic, int pin)
1114{
1115 int irq, i;
1116 int bus = mp_irqs[idx].mpc_srcbus;
1117
1118 /*
1119	 * Debugging check: we are in big trouble if this message pops up!
1120 */
1121 if (mp_irqs[idx].mpc_dstirq != pin)
1122 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1123
1124 switch (mp_bus_id_to_type[bus])
1125 {
1126 case MP_BUS_ISA: /* ISA pin */
1127 case MP_BUS_EISA:
1128 case MP_BUS_MCA:
1129 {
1130 irq = mp_irqs[idx].mpc_srcbusirq;
1131 break;
1132 }
1133 case MP_BUS_PCI: /* PCI pin */
1134 {
1135 /*
1136 * PCI IRQs are mapped in order
1137 */
1138 i = irq = 0;
1139 while (i < apic)
1140 irq += nr_ioapic_registers[i++];
1141 irq += pin;
1142
1143 /*
1144 * For MPS mode, so far only needed by ES7000 platform
1145 */
1146 if (ioapic_renumber_irq)
1147 irq = ioapic_renumber_irq(apic, irq);
1148
1149 break;
1150 }
1151 default:
1152 {
1153 printk(KERN_ERR "unknown bus type %d.\n",bus);
1154 irq = 0;
1155 break;
1156 }
1157 }
1158
1159 /*
1160 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1161 */
1162 if ((pin >= 16) && (pin <= 23)) {
1163 if (pirq_entries[pin-16] != -1) {
1164 if (!pirq_entries[pin-16]) {
1165 apic_printk(APIC_VERBOSE, KERN_DEBUG
1166 "disabling PIRQ%d\n", pin-16);
1167 } else {
1168 irq = pirq_entries[pin-16];
1169 apic_printk(APIC_VERBOSE, KERN_DEBUG
1170 "using PIRQ%d -> IRQ %d\n",
1171 pin-16, irq);
1172 }
1173 }
1174 }
1175 return irq;
1176}
1177
1178static inline int IO_APIC_irq_trigger(int irq)
1179{
1180 int apic, idx, pin;
1181
1182 for (apic = 0; apic < nr_ioapics; apic++) {
1183 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1184 idx = find_irq_entry(apic,pin,mp_INT);
1185 if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
1186 return irq_trigger(idx);
1187 }
1188 }
1189 /*
1190 * nonexistent IRQs are edge default
1191 */
1192 return 0;
1193}
1194
1195/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1196static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1197
1198static int __assign_irq_vector(int irq)
1199{
1200 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
1201 int vector, offset, i;
1202
1203 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
1204
1205 if (irq_vector[irq] > 0)
1206 return irq_vector[irq];
1207
1208 vector = current_vector;
1209 offset = current_offset;
1210next:
1211 vector += 8;
1212 if (vector >= FIRST_SYSTEM_VECTOR) {
1213 offset = (offset + 1) % 8;
1214 vector = FIRST_DEVICE_VECTOR + offset;
1215 }
1216 if (vector == current_vector)
1217 return -ENOSPC;
1218 if (vector == SYSCALL_VECTOR)
1219 goto next;
1220 for (i = 0; i < NR_IRQ_VECTORS; i++)
1221 if (irq_vector[i] == vector)
1222 goto next;
1223
1224 current_vector = vector;
1225 current_offset = offset;
1226 irq_vector[irq] = vector;
1227
1228 return vector;
1229}
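/*
 * Illustrative walk-through (exact values come from the platform headers):
 * IRQ0 is statically given FIRST_DEVICE_VECTOR by the irq_vector[]
 * initializer above; each later allocation steps 8 past the previous one,
 * so device vectors get spread across local-APIC priority levels
 * (level = vector >> 4) instead of piling up in one.  Once the walk hits
 * FIRST_SYSTEM_VECTOR it wraps back to FIRST_DEVICE_VECTOR with the offset
 * bumped by one, and SYSCALL_VECTOR (int 0x80) is always skipped.
 */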
1230
1231static int assign_irq_vector(int irq)
1232{
1233 unsigned long flags;
1234 int vector;
1235
1236 spin_lock_irqsave(&vector_lock, flags);
1237 vector = __assign_irq_vector(irq);
1238 spin_unlock_irqrestore(&vector_lock, flags);
1239
1240 return vector;
1241}
1242static struct irq_chip ioapic_chip;
1243
1244#define IOAPIC_AUTO -1
1245#define IOAPIC_EDGE 0
1246#define IOAPIC_LEVEL 1
1247
1248static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1249{
1250 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1251 trigger == IOAPIC_LEVEL) {
1252 irq_desc[irq].status |= IRQ_LEVEL;
1253 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1254 handle_fasteoi_irq, "fasteoi");
1255 } else {
1256 irq_desc[irq].status &= ~IRQ_LEVEL;
1257 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1258 handle_edge_irq, "edge");
1259 }
1260 set_intr_gate(vector, interrupt[irq]);
1261}
1262
1263static void __init setup_IO_APIC_irqs(void)
1264{
1265 struct IO_APIC_route_entry entry;
1266 int apic, pin, idx, irq, first_notcon = 1, vector;
1267 unsigned long flags;
1268
1269 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1270
1271 for (apic = 0; apic < nr_ioapics; apic++) {
1272 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1273
1274 /*
1275 * add it to the IO-APIC irq-routing table:
1276 */
1277 memset(&entry,0,sizeof(entry));
1278
1279 entry.delivery_mode = INT_DELIVERY_MODE;
1280 entry.dest_mode = INT_DEST_MODE;
1281 entry.mask = 0; /* enable IRQ */
1282 entry.dest.logical.logical_dest =
1283 cpu_mask_to_apicid(TARGET_CPUS);
1284
1285 idx = find_irq_entry(apic,pin,mp_INT);
1286 if (idx == -1) {
1287 if (first_notcon) {
1288 apic_printk(APIC_VERBOSE, KERN_DEBUG
1289 " IO-APIC (apicid-pin) %d-%d",
1290 mp_ioapics[apic].mpc_apicid,
1291 pin);
1292 first_notcon = 0;
1293 } else
1294 apic_printk(APIC_VERBOSE, ", %d-%d",
1295 mp_ioapics[apic].mpc_apicid, pin);
1296 continue;
1297 }
1298
1299 entry.trigger = irq_trigger(idx);
1300 entry.polarity = irq_polarity(idx);
1301
1302 if (irq_trigger(idx)) {
1303 entry.trigger = 1;
1304 entry.mask = 1;
1305 }
1306
1307 irq = pin_2_irq(idx, apic, pin);
1308 /*
1309 * skip adding the timer int on secondary nodes, which causes
1310 * a small but painful rift in the time-space continuum
1311 */
1312 if (multi_timer_check(apic, irq))
1313 continue;
1314 else
1315 add_pin_to_irq(irq, apic, pin);
1316
1317 if (!apic && !IO_APIC_IRQ(irq))
1318 continue;
1319
1320 if (IO_APIC_IRQ(irq)) {
1321 vector = assign_irq_vector(irq);
1322 entry.vector = vector;
1323 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
1324
1325 if (!apic && (irq < 16))
1326 disable_8259A_irq(irq);
1327 }
1328 spin_lock_irqsave(&ioapic_lock, flags);
1329 __ioapic_write_entry(apic, pin, entry);
1330 spin_unlock_irqrestore(&ioapic_lock, flags);
1331 }
1332 }
1333
1334 if (!first_notcon)
1335 apic_printk(APIC_VERBOSE, " not connected.\n");
1336}
1337
1338/*
1339 * Set up the 8259A-master output pin:
1340 */
1341static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
1342{
1343 struct IO_APIC_route_entry entry;
1344
1345 memset(&entry,0,sizeof(entry));
1346
1347 disable_8259A_irq(0);
1348
1349 /* mask LVT0 */
1350 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1351
1352 /*
1353 * We use logical delivery to get the timer IRQ
1354 * to the first CPU.
1355 */
1356 entry.dest_mode = INT_DEST_MODE;
1357 entry.mask = 0; /* unmask IRQ now */
1358 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1359 entry.delivery_mode = INT_DELIVERY_MODE;
1360 entry.polarity = 0;
1361 entry.trigger = 0;
1362 entry.vector = vector;
1363
1364 /*
1365 * The timer IRQ doesn't have to know that behind the
1366 * scene we have a 8259A-master in AEOI mode ...
1367 */
1368 irq_desc[0].chip = &ioapic_chip;
1369 set_irq_handler(0, handle_edge_irq);
1370
1371 /*
1372 * Add it to the IO-APIC irq-routing table:
1373 */
1374 ioapic_write_entry(apic, pin, entry);
1375
1376 enable_8259A_irq(0);
1377}
1378
1379void __init print_IO_APIC(void)
1380{
1381 int apic, i;
1382 union IO_APIC_reg_00 reg_00;
1383 union IO_APIC_reg_01 reg_01;
1384 union IO_APIC_reg_02 reg_02;
1385 union IO_APIC_reg_03 reg_03;
1386 unsigned long flags;
1387
1388 if (apic_verbosity == APIC_QUIET)
1389 return;
1390
1391 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1392 for (i = 0; i < nr_ioapics; i++)
1393 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1394 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
1395
1396 /*
1397 * We are a bit conservative about what we expect. We have to
1398 * know about every hardware change ASAP.
1399 */
1400 printk(KERN_INFO "testing the IO APIC.......................\n");
1401
1402 for (apic = 0; apic < nr_ioapics; apic++) {
1403
1404 spin_lock_irqsave(&ioapic_lock, flags);
1405 reg_00.raw = io_apic_read(apic, 0);
1406 reg_01.raw = io_apic_read(apic, 1);
1407 if (reg_01.bits.version >= 0x10)
1408 reg_02.raw = io_apic_read(apic, 2);
1409 if (reg_01.bits.version >= 0x20)
1410 reg_03.raw = io_apic_read(apic, 3);
1411 spin_unlock_irqrestore(&ioapic_lock, flags);
1412
1413 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
1414 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1415 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1416 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1417 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1418
1419 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1420 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1421
1422 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1423 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1424
1425 /*
1426		 * Some Intel chipsets with an IO APIC version of 0x1? don't have reg_02;
1427		 * reading it simply returns the previously read register value, so
1428		 * ignore it if reg_02 == reg_01.
1429 */
1430 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1431 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1432 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1433 }
1434
1435 /*
1436		 * Some Intel chipsets with an IO APIC version of 0x2? don't have reg_02
1437		 * or reg_03; reading them simply returns the previously read register
1438		 * value, so ignore reg_03 if it equals reg_02 or reg_01.
1439 */
1440 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1441 reg_03.raw != reg_01.raw) {
1442 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1443 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1444 }
1445
1446 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1447
1448 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
1449 " Stat Dest Deli Vect: \n");
1450
1451 for (i = 0; i <= reg_01.bits.entries; i++) {
1452 struct IO_APIC_route_entry entry;
1453
1454 entry = ioapic_read_entry(apic, i);
1455
1456 printk(KERN_DEBUG " %02x %03X %02X ",
1457 i,
1458 entry.dest.logical.logical_dest,
1459 entry.dest.physical.physical_dest
1460 );
1461
1462 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
1463 entry.mask,
1464 entry.trigger,
1465 entry.irr,
1466 entry.polarity,
1467 entry.delivery_status,
1468 entry.dest_mode,
1469 entry.delivery_mode,
1470 entry.vector
1471 );
1472 }
1473 }
1474 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1475 for (i = 0; i < NR_IRQS; i++) {
1476 struct irq_pin_list *entry = irq_2_pin + i;
1477 if (entry->pin < 0)
1478 continue;
1479 printk(KERN_DEBUG "IRQ%d ", i);
1480 for (;;) {
1481 printk("-> %d:%d", entry->apic, entry->pin);
1482 if (!entry->next)
1483 break;
1484 entry = irq_2_pin + entry->next;
1485 }
1486 printk("\n");
1487 }
1488
1489 printk(KERN_INFO ".................................... done.\n");
1490
1491 return;
1492}
1493
1494#if 0
1495
1496static void print_APIC_bitfield (int base)
1497{
1498 unsigned int v;
1499 int i, j;
1500
1501 if (apic_verbosity == APIC_QUIET)
1502 return;
1503
1504 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1505 for (i = 0; i < 8; i++) {
1506 v = apic_read(base + i*0x10);
1507 for (j = 0; j < 32; j++) {
1508 if (v & (1<<j))
1509 printk("1");
1510 else
1511 printk("0");
1512 }
1513 printk("\n");
1514 }
1515}
1516
1517void /*__init*/ print_local_APIC(void * dummy)
1518{
1519 unsigned int v, ver, maxlvt;
1520
1521 if (apic_verbosity == APIC_QUIET)
1522 return;
1523
1524 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1525 smp_processor_id(), hard_smp_processor_id());
1526 v = apic_read(APIC_ID);
1527 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
1528 v = apic_read(APIC_LVR);
1529 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1530 ver = GET_APIC_VERSION(v);
1531 maxlvt = lapic_get_maxlvt();
1532
1533 v = apic_read(APIC_TASKPRI);
1534 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1535
1536 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1537 v = apic_read(APIC_ARBPRI);
1538 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1539 v & APIC_ARBPRI_MASK);
1540 v = apic_read(APIC_PROCPRI);
1541 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1542 }
1543
1544 v = apic_read(APIC_EOI);
1545 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1546 v = apic_read(APIC_RRR);
1547 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1548 v = apic_read(APIC_LDR);
1549 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1550 v = apic_read(APIC_DFR);
1551 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1552 v = apic_read(APIC_SPIV);
1553 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1554
1555 printk(KERN_DEBUG "... APIC ISR field:\n");
1556 print_APIC_bitfield(APIC_ISR);
1557 printk(KERN_DEBUG "... APIC TMR field:\n");
1558 print_APIC_bitfield(APIC_TMR);
1559 printk(KERN_DEBUG "... APIC IRR field:\n");
1560 print_APIC_bitfield(APIC_IRR);
1561
1562 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1563 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1564 apic_write(APIC_ESR, 0);
1565 v = apic_read(APIC_ESR);
1566 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1567 }
1568
1569 v = apic_read(APIC_ICR);
1570 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1571 v = apic_read(APIC_ICR2);
1572 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1573
1574 v = apic_read(APIC_LVTT);
1575 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1576
1577 if (maxlvt > 3) { /* PC is LVT#4. */
1578 v = apic_read(APIC_LVTPC);
1579 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1580 }
1581 v = apic_read(APIC_LVT0);
1582 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1583 v = apic_read(APIC_LVT1);
1584 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1585
1586 if (maxlvt > 2) { /* ERR is LVT#3. */
1587 v = apic_read(APIC_LVTERR);
1588 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1589 }
1590
1591 v = apic_read(APIC_TMICT);
1592 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1593 v = apic_read(APIC_TMCCT);
1594 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1595 v = apic_read(APIC_TDCR);
1596 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1597 printk("\n");
1598}
1599
1600void print_all_local_APICs (void)
1601{
1602 on_each_cpu(print_local_APIC, NULL, 1, 1);
1603}
1604
1605void /*__init*/ print_PIC(void)
1606{
1607 unsigned int v;
1608 unsigned long flags;
1609
1610 if (apic_verbosity == APIC_QUIET)
1611 return;
1612
1613 printk(KERN_DEBUG "\nprinting PIC contents\n");
1614
1615 spin_lock_irqsave(&i8259A_lock, flags);
1616
1617 v = inb(0xa1) << 8 | inb(0x21);
1618 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1619
1620 v = inb(0xa0) << 8 | inb(0x20);
1621 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1622
1623 outb(0x0b,0xa0);
1624 outb(0x0b,0x20);
1625 v = inb(0xa0) << 8 | inb(0x20);
1626 outb(0x0a,0xa0);
1627 outb(0x0a,0x20);
1628
1629 spin_unlock_irqrestore(&i8259A_lock, flags);
1630
1631 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1632
1633 v = inb(0x4d1) << 8 | inb(0x4d0);
1634 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1635}
1636
1637#endif /* 0 */
1638
1639static void __init enable_IO_APIC(void)
1640{
1641 union IO_APIC_reg_01 reg_01;
1642 int i8259_apic, i8259_pin;
1643 int i, apic;
1644 unsigned long flags;
1645
1646 for (i = 0; i < PIN_MAP_SIZE; i++) {
1647 irq_2_pin[i].pin = -1;
1648 irq_2_pin[i].next = 0;
1649 }
1650 if (!pirqs_enabled)
1651 for (i = 0; i < MAX_PIRQS; i++)
1652 pirq_entries[i] = -1;
1653
1654 /*
1655 * The number of IO-APIC IRQ registers (== #pins):
1656 */
1657 for (apic = 0; apic < nr_ioapics; apic++) {
1658 spin_lock_irqsave(&ioapic_lock, flags);
1659 reg_01.raw = io_apic_read(apic, 1);
1660 spin_unlock_irqrestore(&ioapic_lock, flags);
1661 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1662 }
1663 for(apic = 0; apic < nr_ioapics; apic++) {
1664 int pin;
1665 /* See if any of the pins is in ExtINT mode */
1666 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1667 struct IO_APIC_route_entry entry;
1668 entry = ioapic_read_entry(apic, pin);
1669
1670
1671 /* If the interrupt line is enabled and in ExtInt mode
1672 * I have found the pin where the i8259 is connected.
1673 */
1674 if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
1675 ioapic_i8259.apic = apic;
1676 ioapic_i8259.pin = pin;
1677 goto found_i8259;
1678 }
1679 }
1680 }
1681 found_i8259:
1682	/* Look to see if the MP table has reported the ExtINT */
1683	/* If we could not find the appropriate pin by looking at the ioapic,
1684	 * the i8259 is probably not connected to the ioapic, but give the
1685	 * mptable a chance anyway.
1686 */
1687 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1688 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1689 /* Trust the MP table if nothing is setup in the hardware */
1690 if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
1691 printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
1692 ioapic_i8259.pin = i8259_pin;
1693 ioapic_i8259.apic = i8259_apic;
1694 }
1695 /* Complain if the MP table and the hardware disagree */
1696 if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
1697 (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
1698 {
1699 printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
1700 }
1701
1702 /*
1703 * Do not trust the IO-APIC being empty at bootup
1704 */
1705 clear_IO_APIC();
1706}
1707
1708/*
1709 * Not an __init, needed by the reboot code
1710 */
1711void disable_IO_APIC(void)
1712{
1713 /*
1714 * Clear the IO-APIC before rebooting:
1715 */
1716 clear_IO_APIC();
1717
1718 /*
1719 * If the i8259 is routed through an IOAPIC
1720 * Put that IOAPIC in virtual wire mode
1721 * so legacy interrupts can be delivered.
1722 */
1723 if (ioapic_i8259.pin != -1) {
1724 struct IO_APIC_route_entry entry;
1725
1726 memset(&entry, 0, sizeof(entry));
1727 entry.mask = 0; /* Enabled */
1728 entry.trigger = 0; /* Edge */
1729 entry.irr = 0;
1730 entry.polarity = 0; /* High */
1731 entry.delivery_status = 0;
1732 entry.dest_mode = 0; /* Physical */
1733 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1734 entry.vector = 0;
1735 entry.dest.physical.physical_dest =
1736 GET_APIC_ID(apic_read(APIC_ID));
1737
1738 /*
1739 * Add it to the IO-APIC irq-routing table:
1740 */
1741 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1742 }
1743 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1744}
1745
1746/*
1747 * function to set the IO-APIC physical IDs based on the
1748 * values stored in the MPC table.
1749 *
1750 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1751 */
1752
1753#ifndef CONFIG_X86_NUMAQ
1754static void __init setup_ioapic_ids_from_mpc(void)
1755{
1756 union IO_APIC_reg_00 reg_00;
1757 physid_mask_t phys_id_present_map;
1758 int apic;
1759 int i;
1760 unsigned char old_id;
1761 unsigned long flags;
1762
1763 /*
1764 * Don't check I/O APIC IDs for xAPIC systems. They have
1765 * no meaning without the serial APIC bus.
1766 */
1767 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1768 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1769 return;
1770 /*
1771 * This is broken; anything with a real cpu count has to
1772 * circumvent this idiocy regardless.
1773 */
1774 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1775
1776 /*
1777 * Set the IOAPIC ID to the value stored in the MPC table.
1778 */
1779 for (apic = 0; apic < nr_ioapics; apic++) {
1780
1781 /* Read the register 0 value */
1782 spin_lock_irqsave(&ioapic_lock, flags);
1783 reg_00.raw = io_apic_read(apic, 0);
1784 spin_unlock_irqrestore(&ioapic_lock, flags);
1785
1786 old_id = mp_ioapics[apic].mpc_apicid;
1787
1788 if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
1789 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1790 apic, mp_ioapics[apic].mpc_apicid);
1791 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1792 reg_00.bits.ID);
1793 mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
1794 }
1795
1796 /*
1797 * Sanity check, is the ID really free? Every APIC in a
1798 * system must have a unique ID or we get lots of nice
1799 * 'stuck on smp_invalidate_needed IPI wait' messages.
1800 */
1801 if (check_apicid_used(phys_id_present_map,
1802 mp_ioapics[apic].mpc_apicid)) {
1803 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1804 apic, mp_ioapics[apic].mpc_apicid);
1805 for (i = 0; i < get_physical_broadcast(); i++)
1806 if (!physid_isset(i, phys_id_present_map))
1807 break;
1808 if (i >= get_physical_broadcast())
1809 panic("Max APIC ID exceeded!\n");
1810 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1811 i);
1812 physid_set(i, phys_id_present_map);
1813 mp_ioapics[apic].mpc_apicid = i;
1814 } else {
1815 physid_mask_t tmp;
1816 tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
1817 apic_printk(APIC_VERBOSE, "Setting %d in the "
1818 "phys_id_present_map\n",
1819 mp_ioapics[apic].mpc_apicid);
1820 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1821 }
1822
1823
1824 /*
1825 * We need to adjust the IRQ routing table
1826 * if the ID changed.
1827 */
1828 if (old_id != mp_ioapics[apic].mpc_apicid)
1829 for (i = 0; i < mp_irq_entries; i++)
1830 if (mp_irqs[i].mpc_dstapic == old_id)
1831 mp_irqs[i].mpc_dstapic
1832 = mp_ioapics[apic].mpc_apicid;
1833
1834 /*
1835 * Read the right value from the MPC table and
1836 * write it into the ID register.
1837 */
1838 apic_printk(APIC_VERBOSE, KERN_INFO
1839 "...changing IO-APIC physical APIC ID to %d ...",
1840 mp_ioapics[apic].mpc_apicid);
1841
1842 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
1843 spin_lock_irqsave(&ioapic_lock, flags);
1844 io_apic_write(apic, 0, reg_00.raw);
1845 spin_unlock_irqrestore(&ioapic_lock, flags);
1846
1847 /*
1848 * Sanity check
1849 */
1850 spin_lock_irqsave(&ioapic_lock, flags);
1851 reg_00.raw = io_apic_read(apic, 0);
1852 spin_unlock_irqrestore(&ioapic_lock, flags);
1853 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
1854 printk("could not set ID!\n");
1855 else
1856 apic_printk(APIC_VERBOSE, " ok.\n");
1857 }
1858}
1859#else
1860static void __init setup_ioapic_ids_from_mpc(void) { }
1861#endif
1862
1863int no_timer_check __initdata;
1864
1865static int __init notimercheck(char *s)
1866{
1867 no_timer_check = 1;
1868 return 1;
1869}
1870__setup("no_timer_check", notimercheck);
1871
1872/*
1873 * There is a nasty bug in some older SMP boards: their mptable lies
1874 * about the timer IRQ. We do the following to work around the situation:
1875 *
1876 * - timer IRQ defaults to IO-APIC IRQ
1877 * - if this function detects that timer IRQs are defunct, then we fall
1878 * back to ISA timer IRQs
1879 */
1880static int __init timer_irq_works(void)
1881{
1882 unsigned long t1 = jiffies;
1883
1884 if (no_timer_check)
1885 return 1;
1886
1887 local_irq_enable();
1888 /* Let ten ticks pass... */
1889 mdelay((10 * 1000) / HZ);
1890
1891 /*
1892 * Expect a few ticks at least, to be sure some possible
1893 * glue logic does not lock up after one or two first
1894 * ticks in a non-ExtINT mode. Also the local APIC
1895 * might have cached one ExtINT interrupt. Finally, at
1896 * least one tick may be lost due to delays.
1897 */
1898 if (jiffies - t1 > 4)
1899 return 1;
1900
1901 return 0;
1902}
1903
1904/*
1905 * In the SMP+IOAPIC case it might happen that there are an unspecified
1906 * number of pending IRQ events unhandled. These cases are very rare,
1907 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1908 * better to do it this way as thus we do not have to be aware of
1909 * 'pending' interrupts in the IRQ path, except at this point.
1910 */
1911/*
1912 * Edge triggered needs to resend any interrupt
1913 * that was delayed but this is now handled in the device
1914 * independent code.
1915 */
1916
1917/*
1918 * Startup quirk:
1919 *
1920 * Starting up an edge-triggered IO-APIC interrupt is
1921 * nasty - we need to make sure that we get the edge.
1922 * If it is already asserted for some reason, we need to
1923 * return 1 to indicate that it was pending.
1924 *
1925 * This is not complete - we should be able to fake
1926 * an edge even if it isn't on the 8259A...
1927 *
1928 * (We do this for level-triggered IRQs too - it cannot hurt.)
1929 */
1930static unsigned int startup_ioapic_irq(unsigned int irq)
1931{
1932 int was_pending = 0;
1933 unsigned long flags;
1934
1935 spin_lock_irqsave(&ioapic_lock, flags);
1936 if (irq < 16) {
1937 disable_8259A_irq(irq);
1938 if (i8259A_irq_pending(irq))
1939 was_pending = 1;
1940 }
1941 __unmask_IO_APIC_irq(irq);
1942 spin_unlock_irqrestore(&ioapic_lock, flags);
1943
1944 return was_pending;
1945}
1946
1947static void ack_ioapic_irq(unsigned int irq)
1948{
1949 move_native_irq(irq);
1950 ack_APIC_irq();
1951}
1952
1953static void ack_ioapic_quirk_irq(unsigned int irq)
1954{
1955 unsigned long v;
1956 int i;
1957
1958 move_native_irq(irq);
1959/*
1960 * It appears there is an erratum which affects at least version 0x11
1961 * of I/O APIC (that's the 82093AA and cores integrated into various
1962 * chipsets). Under certain conditions a level-triggered interrupt is
1963 * erroneously delivered as an edge-triggered one, but the respective IRR
1964 * bit gets set nevertheless. As a result the I/O unit expects an EOI
1965 * message but it will never arrive and further interrupts are blocked
1966 * from the source. The exact reason is so far unknown, but the
1967 * phenomenon was observed when two consecutive interrupt requests
1968 * from a given source get delivered to the same CPU and the source is
1969 * temporarily disabled in between.
1970 *
1971 * A workaround is to simulate an EOI message manually. We achieve it
1972 * by setting the trigger mode to edge and then to level when the edge
1973 * trigger mode gets detected in the TMR of a local APIC for a
1974 * level-triggered interrupt. We mask the source for the time of the
1975 * operation to prevent an edge-triggered interrupt escaping meanwhile.
1976 * The idea is from Manfred Spraul. --macro
1977 */
1978 i = irq_vector[irq];
1979
1980 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
1981
1982 ack_APIC_irq();
1983
1984 if (!(v & (1 << (i & 0x1f)))) {
1985 atomic_inc(&irq_mis_count);
1986 spin_lock(&ioapic_lock);
1987 __mask_and_edge_IO_APIC_irq(irq);
1988 __unmask_and_level_IO_APIC_irq(irq);
1989 spin_unlock(&ioapic_lock);
1990 }
1991}
1992
1993static int ioapic_retrigger_irq(unsigned int irq)
1994{
1995 send_IPI_self(irq_vector[irq]);
1996
1997 return 1;
1998}
1999
2000static struct irq_chip ioapic_chip __read_mostly = {
2001 .name = "IO-APIC",
2002 .startup = startup_ioapic_irq,
2003 .mask = mask_IO_APIC_irq,
2004 .unmask = unmask_IO_APIC_irq,
2005 .ack = ack_ioapic_irq,
2006 .eoi = ack_ioapic_quirk_irq,
2007#ifdef CONFIG_SMP
2008 .set_affinity = set_ioapic_affinity_irq,
2009#endif
2010 .retrigger = ioapic_retrigger_irq,
2011};
2012
2013
2014static inline void init_IO_APIC_traps(void)
2015{
2016 int irq;
2017
2018 /*
2019 * NOTE! The local APIC isn't very good at handling
2020 * multiple interrupts at the same interrupt level.
2021 * As the interrupt level is determined by taking the
2022 * vector number and shifting that right by 4, we
2023 * want to spread these out a bit so that they don't
2024 * all fall in the same interrupt level.
2025 *
2026 * Also, we've got to be careful not to trash gate
2027 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2028 */
2029 for (irq = 0; irq < NR_IRQS ; irq++) {
2030 int tmp = irq;
2031 if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
2032 /*
2033 * Hmm.. We don't have an entry for this,
2034 * so default to an old-fashioned 8259
2035 * interrupt if we can..
2036 */
2037 if (irq < 16)
2038 make_8259A_irq(irq);
2039 else
2040 /* Strange. Oh, well.. */
2041 irq_desc[irq].chip = &no_irq_chip;
2042 }
2043 }
2044}
2045
2046/*
2047 * The local APIC irq-chip implementation:
2048 */
2049
2050static void ack_apic(unsigned int irq)
2051{
2052 ack_APIC_irq();
2053}
2054
2055static void mask_lapic_irq (unsigned int irq)
2056{
2057 unsigned long v;
2058
2059 v = apic_read(APIC_LVT0);
2060 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
2061}
2062
2063static void unmask_lapic_irq (unsigned int irq)
2064{
2065 unsigned long v;
2066
2067 v = apic_read(APIC_LVT0);
2068 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2069}
2070
2071static struct irq_chip lapic_chip __read_mostly = {
2072 .name = "local-APIC-edge",
2073 .mask = mask_lapic_irq,
2074 .unmask = unmask_lapic_irq,
2075 .eoi = ack_apic,
2076};
2077
2078static void setup_nmi (void)
2079{
2080 /*
2081 * Dirty trick to enable the NMI watchdog ...
2082 * We put the 8259A master into AEOI mode and
2083 * unmask LVT0 on all local APICs as NMI.
2084 *
2085 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2086 * is from Maciej W. Rozycki - so we do not have to EOI from
2087 * the NMI handler or the timer interrupt.
2088 */
2089 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2090
2091 on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
2092
2093 apic_printk(APIC_VERBOSE, " done.\n");
2094}
2095
2096/*
2097 * This looks a bit hackish, but it's about the only way of sending
2098 * a few INTA cycles to 8259As and any associated glue logic. ICR does
2099 * not support the ExtINT mode, unfortunately. We need to send these
2100 * cycles as some i82489DX-based boards have glue logic that keeps the
2101 * 8259A interrupt line asserted until INTA. --macro
2102 */
2103static inline void unlock_ExtINT_logic(void)
2104{
2105 int apic, pin, i;
2106 struct IO_APIC_route_entry entry0, entry1;
2107 unsigned char save_control, save_freq_select;
2108
2109 pin = find_isa_irq_pin(8, mp_INT);
2110 if (pin == -1) {
2111 WARN_ON_ONCE(1);
2112 return;
2113 }
2114 apic = find_isa_irq_apic(8, mp_INT);
2115 if (apic == -1) {
2116 WARN_ON_ONCE(1);
2117 return;
2118 }
2119
2120 entry0 = ioapic_read_entry(apic, pin);
2121 clear_IO_APIC_pin(apic, pin);
2122
2123 memset(&entry1, 0, sizeof(entry1));
2124
2125 entry1.dest_mode = 0; /* physical delivery */
2126 entry1.mask = 0; /* unmask IRQ now */
2127 entry1.dest.physical.physical_dest = hard_smp_processor_id();
2128 entry1.delivery_mode = dest_ExtINT;
2129 entry1.polarity = entry0.polarity;
2130 entry1.trigger = 0;
2131 entry1.vector = 0;
2132
2133 ioapic_write_entry(apic, pin, entry1);
2134
2135 save_control = CMOS_READ(RTC_CONTROL);
2136 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2137 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
2138 RTC_FREQ_SELECT);
2139 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
2140
2141 i = 100;
2142 while (i-- > 0) {
2143 mdelay(10);
2144 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
2145 i -= 10;
2146 }
2147
2148 CMOS_WRITE(save_control, RTC_CONTROL);
2149 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2150 clear_IO_APIC_pin(apic, pin);
2151
2152 ioapic_write_entry(apic, pin, entry0);
2153}
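/*
 * Note: the CMOS writes above program the RTC periodic interrupt (rate
 * select 6, roughly 1 kHz on the MC146818) and enable RTC_PIE, so the loop
 * sees RTC_PF being set repeatedly; those periodic interrupts provide the
 * INTA cycles mentioned in the comment before the function.  Both the RTC
 * state and the IO-APIC entry are restored afterwards.
 */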
2154
2155int timer_uses_ioapic_pin_0;
2156
2157/*
2158 * This code may look a bit paranoid, but it's supposed to cooperate with
2159 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2160 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2161 * fanatically on his truly buggy board.
2162 */
2163static inline void __init check_timer(void)
2164{
2165 int apic1, pin1, apic2, pin2;
2166 int vector;
2167
2168 /*
2169 * get/set the timer IRQ vector:
2170 */
2171 disable_8259A_irq(0);
2172 vector = assign_irq_vector(0);
2173 set_intr_gate(vector, interrupt[0]);
2174
2175 /*
2176 * Subtle, code in do_timer_interrupt() expects an AEOI
2177 * mode for the 8259A whenever interrupts are routed
2178 * through I/O APICs. Also IRQ0 has to be enabled in
2179 * the 8259A which implies the virtual wire has to be
2180 * disabled in the local APIC.
2181 */
2182 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2183 init_8259A(1);
2184 timer_ack = 1;
2185 if (timer_over_8254 > 0)
2186 enable_8259A_irq(0);
2187
2188 pin1 = find_isa_irq_pin(0, mp_INT);
2189 apic1 = find_isa_irq_apic(0, mp_INT);
2190 pin2 = ioapic_i8259.pin;
2191 apic2 = ioapic_i8259.apic;
2192
2193 if (pin1 == 0)
2194 timer_uses_ioapic_pin_0 = 1;
2195
2196 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
2197 vector, apic1, pin1, apic2, pin2);
2198
2199 if (pin1 != -1) {
2200 /*
2201 * Ok, does IRQ0 through the IOAPIC work?
2202 */
2203 unmask_IO_APIC_irq(0);
2204 if (timer_irq_works()) {
2205 if (nmi_watchdog == NMI_IO_APIC) {
2206 disable_8259A_irq(0);
2207 setup_nmi();
2208 enable_8259A_irq(0);
2209 }
2210 if (disable_timer_pin_1 > 0)
2211 clear_IO_APIC_pin(0, pin1);
2212 return;
2213 }
2214 clear_IO_APIC_pin(apic1, pin1);
2215 printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
2216 "IO-APIC\n");
2217 }
2218
2219 printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
2220 if (pin2 != -1) {
2221 printk("\n..... (found pin %d) ...", pin2);
2222 /*
2223 * legacy devices should be connected to IO APIC #0
2224 */
2225 setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
2226 if (timer_irq_works()) {
2227 printk("works.\n");
2228 if (pin1 != -1)
2229 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2230 else
2231 add_pin_to_irq(0, apic2, pin2);
2232 if (nmi_watchdog == NMI_IO_APIC) {
2233 setup_nmi();
2234 }
2235 return;
2236 }
2237 /*
2238 * Cleanup, just in case ...
2239 */
2240 clear_IO_APIC_pin(apic2, pin2);
2241 }
2242 printk(" failed.\n");
2243
2244 if (nmi_watchdog == NMI_IO_APIC) {
2245 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
2246 nmi_watchdog = 0;
2247 }
2248
2249 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2250
2251 disable_8259A_irq(0);
2252 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2253 "fasteoi");
2254 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2255 enable_8259A_irq(0);
2256
2257 if (timer_irq_works()) {
2258 printk(" works.\n");
2259 return;
2260 }
2261 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2262 printk(" failed.\n");
2263
2264 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
2265
2266 timer_ack = 0;
2267 init_8259A(0);
2268 make_8259A_irq(0);
2269 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
2270
2271 unlock_ExtINT_logic();
2272
2273 if (timer_irq_works()) {
2274 printk(" works.\n");
2275 return;
2276 }
2277 printk(" failed :(.\n");
2278 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2279 "report. Then try booting with the 'noapic' option");
2280}
2281
2282/*
2283 *
2284 * IRQs that are handled by the PIC in the MPS IOAPIC case.
2285 * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
2286 * Linux doesn't really care, as it's not actually used
2287 * for any interrupt handling anyway.
2288 */
2289#define PIC_IRQS (1 << PIC_CASCADE_IR)
2290
2291void __init setup_IO_APIC(void)
2292{
2293 enable_IO_APIC();
2294
2295 if (acpi_ioapic)
2296 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
2297 else
2298 io_apic_irqs = ~PIC_IRQS;
2299
2300 printk("ENABLING IO-APIC IRQs\n");
2301
2302 /*
2303 * Set up IO-APIC IRQ routing.
2304 */
2305 if (!acpi_ioapic)
2306 setup_ioapic_ids_from_mpc();
2307 sync_Arb_IDs();
2308 setup_IO_APIC_irqs();
2309 init_IO_APIC_traps();
2310 check_timer();
2311 if (!acpi_ioapic)
2312 print_IO_APIC();
2313}
2314
2315static int __init setup_disable_8254_timer(char *s)
2316{
2317 timer_over_8254 = -1;
2318 return 1;
2319}
2320static int __init setup_enable_8254_timer(char *s)
2321{
2322 timer_over_8254 = 2;
2323 return 1;
2324}
2325
2326__setup("disable_8254_timer", setup_disable_8254_timer);
2327__setup("enable_8254_timer", setup_enable_8254_timer);
2328
2329/*
2330 * Called after all the initialization is done. If we didn't find any
2331 * APIC bugs, then we can allow the modify fast path.
2332 */
2333
2334static int __init io_apic_bug_finalize(void)
2335{
2336 if(sis_apic_bug == -1)
2337 sis_apic_bug = 0;
2338 return 0;
2339}
2340
2341late_initcall(io_apic_bug_finalize);
2342
2343struct sysfs_ioapic_data {
2344 struct sys_device dev;
2345 struct IO_APIC_route_entry entry[0];
2346};
2347static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
2348
2349static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
2350{
2351 struct IO_APIC_route_entry *entry;
2352 struct sysfs_ioapic_data *data;
2353 int i;
2354
2355 data = container_of(dev, struct sysfs_ioapic_data, dev);
2356 entry = data->entry;
2357 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2358 entry[i] = ioapic_read_entry(dev->id, i);
2359
2360 return 0;
2361}
2362
2363static int ioapic_resume(struct sys_device *dev)
2364{
2365 struct IO_APIC_route_entry *entry;
2366 struct sysfs_ioapic_data *data;
2367 unsigned long flags;
2368 union IO_APIC_reg_00 reg_00;
2369 int i;
2370
2371 data = container_of(dev, struct sysfs_ioapic_data, dev);
2372 entry = data->entry;
2373
2374 spin_lock_irqsave(&ioapic_lock, flags);
2375 reg_00.raw = io_apic_read(dev->id, 0);
2376 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
2377 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
2378 io_apic_write(dev->id, 0, reg_00.raw);
2379 }
2380 spin_unlock_irqrestore(&ioapic_lock, flags);
2381 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2382 ioapic_write_entry(dev->id, i, entry[i]);
2383
2384 return 0;
2385}
2386
2387static struct sysdev_class ioapic_sysdev_class = {
2388 set_kset_name("ioapic"),
2389 .suspend = ioapic_suspend,
2390 .resume = ioapic_resume,
2391};
2392
2393static int __init ioapic_init_sysfs(void)
2394{
2395 struct sys_device * dev;
2396 int i, size, error = 0;
2397
2398 error = sysdev_class_register(&ioapic_sysdev_class);
2399 if (error)
2400 return error;
2401
2402 for (i = 0; i < nr_ioapics; i++ ) {
2403 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
2404 * sizeof(struct IO_APIC_route_entry);
2405 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
2406 if (!mp_ioapic_data[i]) {
2407 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2408 continue;
2409 }
2410 memset(mp_ioapic_data[i], 0, size);
2411 dev = &mp_ioapic_data[i]->dev;
2412 dev->id = i;
2413 dev->cls = &ioapic_sysdev_class;
2414 error = sysdev_register(dev);
2415 if (error) {
2416 kfree(mp_ioapic_data[i]);
2417 mp_ioapic_data[i] = NULL;
2418 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2419 continue;
2420 }
2421 }
2422
2423 return 0;
2424}
2425
2426device_initcall(ioapic_init_sysfs);
2427
2428/*
2429 * Dynamic irq allocation and deallocation
2430 */
2431int create_irq(void)
2432{
2433 /* Allocate an unused irq */
2434 int irq, new, vector = 0;
2435 unsigned long flags;
2436
2437 irq = -ENOSPC;
2438 spin_lock_irqsave(&vector_lock, flags);
2439 for (new = (NR_IRQS - 1); new >= 0; new--) {
2440 if (platform_legacy_irq(new))
2441 continue;
2442 if (irq_vector[new] != 0)
2443 continue;
2444 vector = __assign_irq_vector(new);
2445 if (likely(vector > 0))
2446 irq = new;
2447 break;
2448 }
2449 spin_unlock_irqrestore(&vector_lock, flags);
2450
2451 if (irq >= 0) {
2452 set_intr_gate(vector, interrupt[irq]);
2453 dynamic_irq_init(irq);
2454 }
2455 return irq;
2456}
2457
2458void destroy_irq(unsigned int irq)
2459{
2460 unsigned long flags;
2461
2462 dynamic_irq_cleanup(irq);
2463
2464 spin_lock_irqsave(&vector_lock, flags);
2465 irq_vector[irq] = 0;
2466 spin_unlock_irqrestore(&vector_lock, flags);
2467}
2468
2469/*
2470 * MSI message composition
2471 */
2472#ifdef CONFIG_PCI_MSI
2473static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2474{
2475 int vector;
2476 unsigned dest;
2477
2478 vector = assign_irq_vector(irq);
2479 if (vector >= 0) {
2480 dest = cpu_mask_to_apicid(TARGET_CPUS);
2481
2482 msg->address_hi = MSI_ADDR_BASE_HI;
2483 msg->address_lo =
2484 MSI_ADDR_BASE_LO |
2485 ((INT_DEST_MODE == 0) ?
2486 MSI_ADDR_DEST_MODE_PHYSICAL:
2487 MSI_ADDR_DEST_MODE_LOGICAL) |
2488 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2489 MSI_ADDR_REDIRECTION_CPU:
2490 MSI_ADDR_REDIRECTION_LOWPRI) |
2491 MSI_ADDR_DEST_ID(dest);
2492
2493 msg->data =
2494 MSI_DATA_TRIGGER_EDGE |
2495 MSI_DATA_LEVEL_ASSERT |
2496 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2497 MSI_DATA_DELIVERY_FIXED:
2498 MSI_DATA_DELIVERY_LOWPRI) |
2499 MSI_DATA_VECTOR(vector);
2500 }
2501 return vector;
2502}
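/*
 * Rough shape of the composed message (per the MSI definitions in
 * <asm/msidef.h>): address_lo selects the 0xFEExxxxx interrupt-delivery
 * window with the destination APIC ID in bits 19:12, while msg->data
 * carries the vector in its low byte plus the edge-trigger, assert-level
 * and delivery-mode bits - much the same information an IO-APIC RTE would
 * hold, only written by the device as a DMA store.
 */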
2503
2504#ifdef CONFIG_SMP
2505static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2506{
2507 struct msi_msg msg;
2508 unsigned int dest;
2509 cpumask_t tmp;
2510 int vector;
2511
2512 cpus_and(tmp, mask, cpu_online_map);
2513 if (cpus_empty(tmp))
2514 tmp = TARGET_CPUS;
2515
2516 vector = assign_irq_vector(irq);
2517 if (vector < 0)
2518 return;
2519
2520 dest = cpu_mask_to_apicid(mask);
2521
2522 read_msi_msg(irq, &msg);
2523
2524 msg.data &= ~MSI_DATA_VECTOR_MASK;
2525 msg.data |= MSI_DATA_VECTOR(vector);
2526 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
2527 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2528
2529 write_msi_msg(irq, &msg);
2530 irq_desc[irq].affinity = mask;
2531}
2532#endif /* CONFIG_SMP */
2533
2534/*
2535 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
2536 * which implement the MSI or MSI-X Capability Structure.
2537 */
2538static struct irq_chip msi_chip = {
2539 .name = "PCI-MSI",
2540 .unmask = unmask_msi_irq,
2541 .mask = mask_msi_irq,
2542 .ack = ack_ioapic_irq,
2543#ifdef CONFIG_SMP
2544 .set_affinity = set_msi_irq_affinity,
2545#endif
2546 .retrigger = ioapic_retrigger_irq,
2547};
2548
2549int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2550{
2551 struct msi_msg msg;
2552 int irq, ret;
2553 irq = create_irq();
2554 if (irq < 0)
2555 return irq;
2556
2557 ret = msi_compose_msg(dev, irq, &msg);
2558 if (ret < 0) {
2559 destroy_irq(irq);
2560 return ret;
2561 }
2562
2563 set_irq_msi(irq, desc);
2564 write_msi_msg(irq, &msg);
2565
2566 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2567 "edge");
2568
2569 return 0;
2570}
2571
2572void arch_teardown_msi_irq(unsigned int irq)
2573{
2574 destroy_irq(irq);
2575}
2576
2577#endif /* CONFIG_PCI_MSI */
2578
2579/*
2580 * Hypertransport interrupt support
2581 */
2582#ifdef CONFIG_HT_IRQ
2583
2584#ifdef CONFIG_SMP
2585
2586static void target_ht_irq(unsigned int irq, unsigned int dest)
2587{
2588 struct ht_irq_msg msg;
2589 fetch_ht_irq_msg(irq, &msg);
2590
2591 msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2592 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2593
2594 msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2595 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2596
2597 write_ht_irq_msg(irq, &msg);
2598}
2599
2600static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2601{
2602 unsigned int dest;
2603 cpumask_t tmp;
2604
2605 cpus_and(tmp, mask, cpu_online_map);
2606 if (cpus_empty(tmp))
2607 tmp = TARGET_CPUS;
2608
2609 cpus_and(mask, tmp, CPU_MASK_ALL);
2610
2611 dest = cpu_mask_to_apicid(mask);
2612
2613 target_ht_irq(irq, dest);
2614 irq_desc[irq].affinity = mask;
2615}
2616#endif
2617
2618static struct irq_chip ht_irq_chip = {
2619 .name = "PCI-HT",
2620 .mask = mask_ht_irq,
2621 .unmask = unmask_ht_irq,
2622 .ack = ack_ioapic_irq,
2623#ifdef CONFIG_SMP
2624 .set_affinity = set_ht_irq_affinity,
2625#endif
2626 .retrigger = ioapic_retrigger_irq,
2627};
2628
2629int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2630{
2631 int vector;
2632
2633 vector = assign_irq_vector(irq);
2634 if (vector >= 0) {
2635 struct ht_irq_msg msg;
2636 unsigned dest;
2637 cpumask_t tmp;
2638
2639 cpus_clear(tmp);
2640 cpu_set(vector >> 8, tmp);
2641 dest = cpu_mask_to_apicid(tmp);
2642
2643 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2644
2645 msg.address_lo =
2646 HT_IRQ_LOW_BASE |
2647 HT_IRQ_LOW_DEST_ID(dest) |
2648 HT_IRQ_LOW_VECTOR(vector) |
2649 ((INT_DEST_MODE == 0) ?
2650 HT_IRQ_LOW_DM_PHYSICAL :
2651 HT_IRQ_LOW_DM_LOGICAL) |
2652 HT_IRQ_LOW_RQEOI_EDGE |
2653 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2654 HT_IRQ_LOW_MT_FIXED :
2655 HT_IRQ_LOW_MT_ARBITRATED) |
2656 HT_IRQ_LOW_IRQ_MASKED;
2657
2658 write_ht_irq_msg(irq, &msg);
2659
2660 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2661 handle_edge_irq, "edge");
2662 }
2663 return vector;
2664}
2665#endif /* CONFIG_HT_IRQ */
2666
2667/* --------------------------------------------------------------------------
2668 ACPI-based IOAPIC Configuration
2669 -------------------------------------------------------------------------- */
2670
2671#ifdef CONFIG_ACPI
2672
2673int __init io_apic_get_unique_id (int ioapic, int apic_id)
2674{
2675 union IO_APIC_reg_00 reg_00;
2676 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
2677 physid_mask_t tmp;
2678 unsigned long flags;
2679 int i = 0;
2680
2681 /*
2682 * The P4 platform supports up to 256 APIC IDs on two separate APIC
2683 * buses (one for LAPICs, one for IOAPICs), where predecessors only
2684 * supports up to 16 on one shared APIC bus.
2685 *
2686 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
2687 * advantage of new APIC bus architecture.
2688 */
2689
2690 if (physids_empty(apic_id_map))
2691 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
2692
2693 spin_lock_irqsave(&ioapic_lock, flags);
2694 reg_00.raw = io_apic_read(ioapic, 0);
2695 spin_unlock_irqrestore(&ioapic_lock, flags);
2696
2697 if (apic_id >= get_physical_broadcast()) {
2698 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
2699 "%d\n", ioapic, apic_id, reg_00.bits.ID);
2700 apic_id = reg_00.bits.ID;
2701 }
2702
2703 /*
2704 * Every APIC in a system must have a unique ID or we get lots of nice
2705 * 'stuck on smp_invalidate_needed IPI wait' messages.
2706 */
2707 if (check_apicid_used(apic_id_map, apic_id)) {
2708
2709 for (i = 0; i < get_physical_broadcast(); i++) {
2710 if (!check_apicid_used(apic_id_map, i))
2711 break;
2712 }
2713
2714 if (i == get_physical_broadcast())
2715 panic("Max apic_id exceeded!\n");
2716
2717 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
2718 "trying %d\n", ioapic, apic_id, i);
2719
2720 apic_id = i;
2721 }
2722
2723 tmp = apicid_to_cpu_present(apic_id);
2724 physids_or(apic_id_map, apic_id_map, tmp);
2725
2726 if (reg_00.bits.ID != apic_id) {
2727 reg_00.bits.ID = apic_id;
2728
2729 spin_lock_irqsave(&ioapic_lock, flags);
2730 io_apic_write(ioapic, 0, reg_00.raw);
2731 reg_00.raw = io_apic_read(ioapic, 0);
2732 spin_unlock_irqrestore(&ioapic_lock, flags);
2733
2734 /* Sanity check */
2735 if (reg_00.bits.ID != apic_id) {
2736 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
2737 return -1;
2738 }
2739 }
2740
2741 apic_printk(APIC_VERBOSE, KERN_INFO
2742 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
2743
2744 return apic_id;
2745}
2746
2747
2748int __init io_apic_get_version (int ioapic)
2749{
2750 union IO_APIC_reg_01 reg_01;
2751 unsigned long flags;
2752
2753 spin_lock_irqsave(&ioapic_lock, flags);
2754 reg_01.raw = io_apic_read(ioapic, 1);
2755 spin_unlock_irqrestore(&ioapic_lock, flags);
2756
2757 return reg_01.bits.version;
2758}
2759
2760
2761int __init io_apic_get_redir_entries (int ioapic)
2762{
2763 union IO_APIC_reg_01 reg_01;
2764 unsigned long flags;
2765
2766 spin_lock_irqsave(&ioapic_lock, flags);
2767 reg_01.raw = io_apic_read(ioapic, 1);
2768 spin_unlock_irqrestore(&ioapic_lock, flags);
2769
2770 return reg_01.bits.entries;
2771}
2772
2773
2774int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
2775{
2776 struct IO_APIC_route_entry entry;
2777 unsigned long flags;
2778
2779 if (!IO_APIC_IRQ(irq)) {
2780 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2781 ioapic);
2782 return -EINVAL;
2783 }
2784
2785 /*
2786 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
2787 * Note that we mask (disable) IRQs now -- these get enabled when the
2788 * corresponding device driver registers for this IRQ.
2789 */
2790
2791 memset(&entry,0,sizeof(entry));
2792
2793 entry.delivery_mode = INT_DELIVERY_MODE;
2794 entry.dest_mode = INT_DEST_MODE;
2795 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2796 entry.trigger = edge_level;
2797 entry.polarity = active_high_low;
2798 entry.mask = 1;
2799
2800 /*
2801 * IRQs < 16 are already in the irq_2_pin[] map
2802 */
2803 if (irq >= 16)
2804 add_pin_to_irq(irq, ioapic, pin);
2805
2806 entry.vector = assign_irq_vector(irq);
2807
2808 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2809 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2810 mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
2811 edge_level, active_high_low);
2812
2813 ioapic_register_intr(irq, entry.vector, edge_level);
2814
2815 if (!ioapic && (irq < 16))
2816 disable_8259A_irq(irq);
2817
2818 spin_lock_irqsave(&ioapic_lock, flags);
2819 __ioapic_write_entry(ioapic, pin, entry);
2820 spin_unlock_irqrestore(&ioapic_lock, flags);
2821
2822 return 0;
2823}
2824
2825#endif /* CONFIG_ACPI */
2826
2827static int __init parse_disable_timer_pin_1(char *arg)
2828{
2829 disable_timer_pin_1 = 1;
2830 return 0;
2831}
2832early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
2833
2834static int __init parse_enable_timer_pin_1(char *arg)
2835{
2836 disable_timer_pin_1 = -1;
2837 return 0;
2838}
2839early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
2840
2841static int __init parse_noapic(char *arg)
2842{
2843 /* disable IO-APIC */
2844 disable_ioapic_setup();
2845 return 0;
2846}
2847early_param("noapic", parse_noapic);
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
new file mode 100644
index 000000000000..966fa1062491
--- /dev/null
+++ b/arch/x86/kernel/io_apic_64.c
@@ -0,0 +1,2202 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/interrupt.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/sched.h>
28#include <linux/pci.h>
29#include <linux/mc146818rtc.h>
30#include <linux/acpi.h>
31#include <linux/sysdev.h>
32#include <linux/msi.h>
33#include <linux/htirq.h>
34#ifdef CONFIG_ACPI
35#include <acpi/acpi_bus.h>
36#endif
37
38#include <asm/idle.h>
39#include <asm/io.h>
40#include <asm/smp.h>
41#include <asm/desc.h>
42#include <asm/proto.h>
43#include <asm/mach_apic.h>
44#include <asm/acpi.h>
45#include <asm/dma.h>
46#include <asm/nmi.h>
47#include <asm/msidef.h>
48#include <asm/hypertransport.h>
49
50struct irq_cfg {
51 cpumask_t domain;
52 cpumask_t old_domain;
53 unsigned move_cleanup_count;
54 u8 vector;
55 u8 move_in_progress : 1;
56};
57
58/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
59struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
60 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
61 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
62 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
63 [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
64 [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
65 [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
66 [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
67 [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
68 [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
69 [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
70 [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
71 [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
72 [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
73 [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
74 [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
75 [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
76};
77
78static int assign_irq_vector(int irq, cpumask_t mask);
79
80#define __apicdebuginit __init
81
82int sis_apic_bug; /* not actually supported, dummy for compile */
83
84static int no_timer_check;
85
86static int disable_timer_pin_1 __initdata;
87
88int timer_over_8254 __initdata = 1;
89
 90/* Where, if anywhere, the i8259 is connected in external int mode */
91static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
92
93static DEFINE_SPINLOCK(ioapic_lock);
94DEFINE_SPINLOCK(vector_lock);
95
96/*
97 * # of IRQ routing registers
98 */
99int nr_ioapic_registers[MAX_IO_APICS];
100
101/*
 102 * Rough estimate of how many shared IRQs there are; this can
 103 * be changed at any time.
104 */
105#define MAX_PLUS_SHARED_IRQS NR_IRQS
106#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
107
108/*
 109 * This is performance-critical; we want to do it in O(1)
110 *
111 * the indexing order of this array favors 1:1 mappings
112 * between pins and IRQs.
113 */
114
115static struct irq_pin_list {
116 short apic, pin, next;
117} irq_2_pin[PIN_MAP_SIZE];
118
119struct io_apic {
120 unsigned int index;
121 unsigned int unused[3];
122 unsigned int data;
123};
124
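/*
 * Each IO-APIC's register window is reached through a fixmap slot; the
 * offset of the MMIO address within its page is added back below so the
 * mapping also works when the IO-APIC is not page aligned.
 */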
125static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
126{
127 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
128 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
129}
130
131static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
132{
133 struct io_apic __iomem *io_apic = io_apic_base(apic);
134 writel(reg, &io_apic->index);
135 return readl(&io_apic->data);
136}
137
138static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
139{
140 struct io_apic __iomem *io_apic = io_apic_base(apic);
141 writel(reg, &io_apic->index);
142 writel(value, &io_apic->data);
143}
144
145/*
146 * Re-write a value: to be used for read-modify-write
147 * cycles where the read already set up the index register.
148 */
149static inline void io_apic_modify(unsigned int apic, unsigned int value)
150{
151 struct io_apic __iomem *io_apic = io_apic_base(apic);
152 writel(value, &io_apic->data);
153}
154
155static int io_apic_level_ack_pending(unsigned int irq)
156{
157 struct irq_pin_list *entry;
158 unsigned long flags;
159 int pending = 0;
160
161 spin_lock_irqsave(&ioapic_lock, flags);
162 entry = irq_2_pin + irq;
163 for (;;) {
164 unsigned int reg;
165 int pin;
166
167 pin = entry->pin;
168 if (pin == -1)
169 break;
170 reg = io_apic_read(entry->apic, 0x10 + pin*2);
171 /* Is the remote IRR bit set? */
172 pending |= (reg >> 14) & 1;
173 if (!entry->next)
174 break;
175 entry = irq_2_pin + entry->next;
176 }
177 spin_unlock_irqrestore(&ioapic_lock, flags);
178 return pending;
179}
180
181/*
182 * Synchronize the IO-APIC and the CPU by doing
183 * a dummy read from the IO-APIC
184 */
185static inline void io_apic_sync(unsigned int apic)
186{
187 struct io_apic __iomem *io_apic = io_apic_base(apic);
188 readl(&io_apic->data);
189}
190
191#define __DO_ACTION(R, ACTION, FINAL) \
192 \
193{ \
194 int pin; \
195 struct irq_pin_list *entry = irq_2_pin + irq; \
196 \
197 BUG_ON(irq >= NR_IRQS); \
198 for (;;) { \
199 unsigned int reg; \
200 pin = entry->pin; \
201 if (pin == -1) \
202 break; \
203 reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
204 reg ACTION; \
205 io_apic_modify(entry->apic, reg); \
206 FINAL; \
207 if (!entry->next) \
208 break; \
209 entry = irq_2_pin + entry->next; \
210 } \
211}
212
213union entry_union {
214 struct { u32 w1, w2; };
215 struct IO_APIC_route_entry entry;
216};
217
218static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
219{
220 union entry_union eu;
221 unsigned long flags;
222 spin_lock_irqsave(&ioapic_lock, flags);
223 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
224 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
225 spin_unlock_irqrestore(&ioapic_lock, flags);
226 return eu.entry;
227}
228
229/*
230 * When we write a new IO APIC routing entry, we need to write the high
231 * word first! If the mask bit in the low word is clear, we will enable
232 * the interrupt, and we need to make sure the entry is fully populated
233 * before that happens.
234 */
235static void
236__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
237{
238 union entry_union eu;
239 eu.entry = e;
240 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
241 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
242}
243
244static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
245{
246 unsigned long flags;
247 spin_lock_irqsave(&ioapic_lock, flags);
248 __ioapic_write_entry(apic, pin, e);
249 spin_unlock_irqrestore(&ioapic_lock, flags);
250}
251
252/*
253 * When we mask an IO APIC routing entry, we need to write the low
254 * word first, in order to set the mask bit before we change the
255 * high bits!
256 */
257static void ioapic_mask_entry(int apic, int pin)
258{
259 unsigned long flags;
260 union entry_union eu = { .entry.mask = 1 };
261
262 spin_lock_irqsave(&ioapic_lock, flags);
263 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
264 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
265 spin_unlock_irqrestore(&ioapic_lock, flags);
266}
267
268#ifdef CONFIG_SMP
269static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
270{
271 int apic, pin;
272 struct irq_pin_list *entry = irq_2_pin + irq;
273
274 BUG_ON(irq >= NR_IRQS);
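	/*
	 * Walk every (apic, pin) this IRQ is routed through and rewrite the
	 * destination (high RTE word) and the vector (low byte of the low
	 * RTE word) in place.
	 */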
275 for (;;) {
276 unsigned int reg;
277 apic = entry->apic;
278 pin = entry->pin;
279 if (pin == -1)
280 break;
281 io_apic_write(apic, 0x11 + pin*2, dest);
282 reg = io_apic_read(apic, 0x10 + pin*2);
283 reg &= ~0x000000ff;
284 reg |= vector;
285 io_apic_modify(apic, reg);
286 if (!entry->next)
287 break;
288 entry = irq_2_pin + entry->next;
289 }
290}
291
292static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
293{
294 struct irq_cfg *cfg = irq_cfg + irq;
295 unsigned long flags;
296 unsigned int dest;
297 cpumask_t tmp;
298
299 cpus_and(tmp, mask, cpu_online_map);
300 if (cpus_empty(tmp))
301 return;
302
303 if (assign_irq_vector(irq, mask))
304 return;
305
306 cpus_and(tmp, cfg->domain, mask);
307 dest = cpu_mask_to_apicid(tmp);
308
309 /*
310 * Only the high 8 bits are valid.
311 */
312 dest = SET_APIC_LOGICAL_ID(dest);
313
314 spin_lock_irqsave(&ioapic_lock, flags);
315 __target_IO_APIC_irq(irq, dest, cfg->vector);
316 irq_desc[irq].affinity = mask;
317 spin_unlock_irqrestore(&ioapic_lock, flags);
318}
319#endif
320
321/*
322 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
323 * shared ISA-space IRQs, so we have to support them. We are super
324 * fast in the common case, and fast for shared ISA-space IRQs.
325 */
326static void add_pin_to_irq(unsigned int irq, int apic, int pin)
327{
328 static int first_free_entry = NR_IRQS;
329 struct irq_pin_list *entry = irq_2_pin + irq;
330
331 BUG_ON(irq >= NR_IRQS);
332 while (entry->next)
333 entry = irq_2_pin + entry->next;
334
335 if (entry->pin != -1) {
336 entry->next = first_free_entry;
337 entry = irq_2_pin + entry->next;
338 if (++first_free_entry >= PIN_MAP_SIZE)
339 panic("io_apic.c: ran out of irq_2_pin entries!");
340 }
341 entry->apic = apic;
342 entry->pin = pin;
343}
344
345
346#define DO_ACTION(name,R,ACTION, FINAL) \
347 \
348 static void name##_IO_APIC_irq (unsigned int irq) \
349 __DO_ACTION(R, ACTION, FINAL)
350
351DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
352 /* mask = 1 */
353DO_ACTION( __unmask, 0, &= 0xfffeffff, )
354 /* mask = 0 */
355
356static void mask_IO_APIC_irq (unsigned int irq)
357{
358 unsigned long flags;
359
360 spin_lock_irqsave(&ioapic_lock, flags);
361 __mask_IO_APIC_irq(irq);
362 spin_unlock_irqrestore(&ioapic_lock, flags);
363}
364
365static void unmask_IO_APIC_irq (unsigned int irq)
366{
367 unsigned long flags;
368
369 spin_lock_irqsave(&ioapic_lock, flags);
370 __unmask_IO_APIC_irq(irq);
371 spin_unlock_irqrestore(&ioapic_lock, flags);
372}
373
374static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
375{
376 struct IO_APIC_route_entry entry;
377
378 /* Check delivery_mode to be sure we're not clearing an SMI pin */
379 entry = ioapic_read_entry(apic, pin);
380 if (entry.delivery_mode == dest_SMI)
381 return;
382 /*
383 * Disable it in the IO-APIC irq-routing table:
384 */
385 ioapic_mask_entry(apic, pin);
386}
387
388static void clear_IO_APIC (void)
389{
390 int apic, pin;
391
392 for (apic = 0; apic < nr_ioapics; apic++)
393 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
394 clear_IO_APIC_pin(apic, pin);
395}
396
397int skip_ioapic_setup;
398int ioapic_force;
399
400static int __init parse_noapic(char *str)
401{
402 disable_ioapic_setup();
403 return 0;
404}
405early_param("noapic", parse_noapic);
406
407/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
408static int __init disable_timer_pin_setup(char *arg)
409{
410 disable_timer_pin_1 = 1;
411 return 1;
412}
413__setup("disable_timer_pin_1", disable_timer_pin_setup);
414
415static int __init setup_disable_8254_timer(char *s)
416{
417 timer_over_8254 = -1;
418 return 1;
419}
420static int __init setup_enable_8254_timer(char *s)
421{
422 timer_over_8254 = 2;
423 return 1;
424}
425
426__setup("disable_8254_timer", setup_disable_8254_timer);
427__setup("enable_8254_timer", setup_enable_8254_timer);
428
429
430/*
431 * Find the IRQ entry number of a certain pin.
432 */
433static int find_irq_entry(int apic, int pin, int type)
434{
435 int i;
436
437 for (i = 0; i < mp_irq_entries; i++)
438 if (mp_irqs[i].mpc_irqtype == type &&
439 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
440 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
441 mp_irqs[i].mpc_dstirq == pin)
442 return i;
443
444 return -1;
445}
446
447/*
448 * Find the pin to which IRQ[irq] (ISA) is connected
449 */
450static int __init find_isa_irq_pin(int irq, int type)
451{
452 int i;
453
454 for (i = 0; i < mp_irq_entries; i++) {
455 int lbus = mp_irqs[i].mpc_srcbus;
456
457 if (test_bit(lbus, mp_bus_not_pci) &&
458 (mp_irqs[i].mpc_irqtype == type) &&
459 (mp_irqs[i].mpc_srcbusirq == irq))
460
461 return mp_irqs[i].mpc_dstirq;
462 }
463 return -1;
464}
465
466static int __init find_isa_irq_apic(int irq, int type)
467{
468 int i;
469
470 for (i = 0; i < mp_irq_entries; i++) {
471 int lbus = mp_irqs[i].mpc_srcbus;
472
473 if (test_bit(lbus, mp_bus_not_pci) &&
474 (mp_irqs[i].mpc_irqtype == type) &&
475 (mp_irqs[i].mpc_srcbusirq == irq))
476 break;
477 }
478 if (i < mp_irq_entries) {
479 int apic;
480 for(apic = 0; apic < nr_ioapics; apic++) {
481 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
482 return apic;
483 }
484 }
485
486 return -1;
487}
488
489/*
490 * Find a specific PCI IRQ entry.
491 * Not an __init, possibly needed by modules
492 */
493static int pin_2_irq(int idx, int apic, int pin);
494
495int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
496{
497 int apic, i, best_guess = -1;
498
499 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
500 bus, slot, pin);
501 if (mp_bus_id_to_pci_bus[bus] == -1) {
502 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
503 return -1;
504 }
505 for (i = 0; i < mp_irq_entries; i++) {
506 int lbus = mp_irqs[i].mpc_srcbus;
507
508 for (apic = 0; apic < nr_ioapics; apic++)
509 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
510 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
511 break;
512
513 if (!test_bit(lbus, mp_bus_not_pci) &&
514 !mp_irqs[i].mpc_irqtype &&
515 (bus == lbus) &&
516 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
517 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
518
519 if (!(apic || IO_APIC_IRQ(irq)))
520 continue;
521
522 if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
523 return irq;
524 /*
525 * Use the first all-but-pin matching entry as a
526 * best-guess fuzzy result for broken mptables.
527 */
528 if (best_guess < 0)
529 best_guess = irq;
530 }
531 }
532 BUG_ON(best_guess >= NR_IRQS);
533 return best_guess;
534}
535
536/* ISA interrupts are always polarity zero edge triggered,
537 * when listed as conforming in the MP table. */
538
539#define default_ISA_trigger(idx) (0)
540#define default_ISA_polarity(idx) (0)
541
542/* PCI interrupts are always polarity one level triggered,
543 * when listed as conforming in the MP table. */
544
545#define default_PCI_trigger(idx) (1)
546#define default_PCI_polarity(idx) (1)
547
548static int __init MPBIOS_polarity(int idx)
549{
550 int bus = mp_irqs[idx].mpc_srcbus;
551 int polarity;
552
553 /*
554 * Determine IRQ line polarity (high active or low active):
555 */
556 switch (mp_irqs[idx].mpc_irqflag & 3)
557 {
558 case 0: /* conforms, ie. bus-type dependent polarity */
559 if (test_bit(bus, mp_bus_not_pci))
560 polarity = default_ISA_polarity(idx);
561 else
562 polarity = default_PCI_polarity(idx);
563 break;
564 case 1: /* high active */
565 {
566 polarity = 0;
567 break;
568 }
569 case 2: /* reserved */
570 {
571 printk(KERN_WARNING "broken BIOS!!\n");
572 polarity = 1;
573 break;
574 }
575 case 3: /* low active */
576 {
577 polarity = 1;
578 break;
579 }
580 default: /* invalid */
581 {
582 printk(KERN_WARNING "broken BIOS!!\n");
583 polarity = 1;
584 break;
585 }
586 }
587 return polarity;
588}
589
590static int MPBIOS_trigger(int idx)
591{
592 int bus = mp_irqs[idx].mpc_srcbus;
593 int trigger;
594
595 /*
596 * Determine IRQ trigger mode (edge or level sensitive):
597 */
598 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
599 {
600 case 0: /* conforms, ie. bus-type dependent */
601 if (test_bit(bus, mp_bus_not_pci))
602 trigger = default_ISA_trigger(idx);
603 else
604 trigger = default_PCI_trigger(idx);
605 break;
606 case 1: /* edge */
607 {
608 trigger = 0;
609 break;
610 }
611 case 2: /* reserved */
612 {
613 printk(KERN_WARNING "broken BIOS!!\n");
614 trigger = 1;
615 break;
616 }
617 case 3: /* level */
618 {
619 trigger = 1;
620 break;
621 }
622 default: /* invalid */
623 {
624 printk(KERN_WARNING "broken BIOS!!\n");
625 trigger = 0;
626 break;
627 }
628 }
629 return trigger;
630}
631
632static inline int irq_polarity(int idx)
633{
634 return MPBIOS_polarity(idx);
635}
636
637static inline int irq_trigger(int idx)
638{
639 return MPBIOS_trigger(idx);
640}
641
642static int pin_2_irq(int idx, int apic, int pin)
643{
644 int irq, i;
645 int bus = mp_irqs[idx].mpc_srcbus;
646
647 /*
648 * Debugging check, we are in big trouble if this message pops up!
649 */
650 if (mp_irqs[idx].mpc_dstirq != pin)
651 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
652
653 if (test_bit(bus, mp_bus_not_pci)) {
654 irq = mp_irqs[idx].mpc_srcbusirq;
655 } else {
656 /*
657 * PCI IRQs are mapped in order
658 */
659 i = irq = 0;
660 while (i < apic)
661 irq += nr_ioapic_registers[i++];
662 irq += pin;
663 }
664 BUG_ON(irq >= NR_IRQS);
665 return irq;
666}
667
668static int __assign_irq_vector(int irq, cpumask_t mask)
669{
670 /*
671 * NOTE! The local APIC isn't very good at handling
672 * multiple interrupts at the same interrupt level.
673 * As the interrupt level is determined by taking the
674 * vector number and shifting that right by 4, we
675 * want to spread these out a bit so that they don't
676 * all fall in the same interrupt level.
677 *
678 * Also, we've got to be careful not to trash gate
679 * 0x80, because int 0x80 is hm, kind of importantish. ;)
680 */
681 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
682 unsigned int old_vector;
683 int cpu;
684 struct irq_cfg *cfg;
685
686 BUG_ON((unsigned)irq >= NR_IRQS);
687 cfg = &irq_cfg[irq];
688
689 /* Only try and allocate irqs on cpus that are present */
690 cpus_and(mask, mask, cpu_online_map);
691
692 if ((cfg->move_in_progress) || cfg->move_cleanup_count)
693 return -EBUSY;
694
695 old_vector = cfg->vector;
696 if (old_vector) {
697 cpumask_t tmp;
698 cpus_and(tmp, cfg->domain, mask);
699 if (!cpus_empty(tmp))
700 return 0;
701 }
702
703 for_each_cpu_mask(cpu, mask) {
704 cpumask_t domain, new_mask;
705 int new_cpu;
706 int vector, offset;
707
708 domain = vector_allocation_domain(cpu);
709 cpus_and(new_mask, domain, cpu_online_map);
710
711 vector = current_vector;
712 offset = current_offset;
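		/*
		 * Search this cpu's allocation domain for a free vector.
		 * Allocations advance in steps of 8 to spread vectors across
		 * priority levels (a level is vector >> 4, i.e. 16 vectors
		 * wide); once the search passes FIRST_SYSTEM_VECTOR it wraps
		 * back to FIRST_DEVICE_VECTOR with the offset rotated, so
		 * repeated passes probe every device vector.
		 */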
713next:
714 vector += 8;
715 if (vector >= FIRST_SYSTEM_VECTOR) {
716 /* If we run out of vectors on large boxen, must share them. */
717 offset = (offset + 1) % 8;
718 vector = FIRST_DEVICE_VECTOR + offset;
719 }
720 if (unlikely(current_vector == vector))
721 continue;
722 if (vector == IA32_SYSCALL_VECTOR)
723 goto next;
724 for_each_cpu_mask(new_cpu, new_mask)
725 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
726 goto next;
727 /* Found one! */
728 current_vector = vector;
729 current_offset = offset;
730 if (old_vector) {
731 cfg->move_in_progress = 1;
732 cfg->old_domain = cfg->domain;
733 }
734 for_each_cpu_mask(new_cpu, new_mask)
735 per_cpu(vector_irq, new_cpu)[vector] = irq;
736 cfg->vector = vector;
737 cfg->domain = domain;
738 return 0;
739 }
740 return -ENOSPC;
741}
742
743static int assign_irq_vector(int irq, cpumask_t mask)
744{
745 int err;
746 unsigned long flags;
747
748 spin_lock_irqsave(&vector_lock, flags);
749 err = __assign_irq_vector(irq, mask);
750 spin_unlock_irqrestore(&vector_lock, flags);
751 return err;
752}
753
754static void __clear_irq_vector(int irq)
755{
756 struct irq_cfg *cfg;
757 cpumask_t mask;
758 int cpu, vector;
759
760 BUG_ON((unsigned)irq >= NR_IRQS);
761 cfg = &irq_cfg[irq];
762 BUG_ON(!cfg->vector);
763
764 vector = cfg->vector;
765 cpus_and(mask, cfg->domain, cpu_online_map);
766 for_each_cpu_mask(cpu, mask)
767 per_cpu(vector_irq, cpu)[vector] = -1;
768
769 cfg->vector = 0;
770 cfg->domain = CPU_MASK_NONE;
771}
772
773void __setup_vector_irq(int cpu)
774{
775 /* Initialize vector_irq on a new cpu */
776 /* This function must be called with vector_lock held */
777 int irq, vector;
778
779 /* Mark the inuse vectors */
780 for (irq = 0; irq < NR_IRQS; ++irq) {
781 if (!cpu_isset(cpu, irq_cfg[irq].domain))
782 continue;
783 vector = irq_cfg[irq].vector;
784 per_cpu(vector_irq, cpu)[vector] = irq;
785 }
786 /* Mark the free vectors */
787 for (vector = 0; vector < NR_VECTORS; ++vector) {
788 irq = per_cpu(vector_irq, cpu)[vector];
789 if (irq < 0)
790 continue;
791 if (!cpu_isset(cpu, irq_cfg[irq].domain))
792 per_cpu(vector_irq, cpu)[vector] = -1;
793 }
794}
795
796
797static struct irq_chip ioapic_chip;
798
799static void ioapic_register_intr(int irq, unsigned long trigger)
800{
801 if (trigger) {
802 irq_desc[irq].status |= IRQ_LEVEL;
803 set_irq_chip_and_handler_name(irq, &ioapic_chip,
804 handle_fasteoi_irq, "fasteoi");
805 } else {
806 irq_desc[irq].status &= ~IRQ_LEVEL;
807 set_irq_chip_and_handler_name(irq, &ioapic_chip,
808 handle_edge_irq, "edge");
809 }
810}
811
812static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
813 int trigger, int polarity)
814{
815 struct irq_cfg *cfg = irq_cfg + irq;
816 struct IO_APIC_route_entry entry;
817 cpumask_t mask;
818
819 if (!IO_APIC_IRQ(irq))
820 return;
821
822 mask = TARGET_CPUS;
823 if (assign_irq_vector(irq, mask))
824 return;
825
826 cpus_and(mask, cfg->domain, mask);
827
828 apic_printk(APIC_VERBOSE,KERN_DEBUG
829 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
830 "IRQ %d Mode:%i Active:%i)\n",
831 apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
832 irq, trigger, polarity);
833
834 /*
835 * add it to the IO-APIC irq-routing table:
836 */
837 memset(&entry,0,sizeof(entry));
838
839 entry.delivery_mode = INT_DELIVERY_MODE;
840 entry.dest_mode = INT_DEST_MODE;
841 entry.dest = cpu_mask_to_apicid(mask);
842 entry.mask = 0; /* enable IRQ */
843 entry.trigger = trigger;
844 entry.polarity = polarity;
845 entry.vector = cfg->vector;
846
847 /* Mask level triggered irqs.
848 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
849 */
850 if (trigger)
851 entry.mask = 1;
852
853 ioapic_register_intr(irq, trigger);
854 if (irq < 16)
855 disable_8259A_irq(irq);
856
857 ioapic_write_entry(apic, pin, entry);
858}
859
860static void __init setup_IO_APIC_irqs(void)
861{
862 int apic, pin, idx, irq, first_notcon = 1;
863
864 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
865
866 for (apic = 0; apic < nr_ioapics; apic++) {
867 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
868
869 idx = find_irq_entry(apic,pin,mp_INT);
870 if (idx == -1) {
871 if (first_notcon) {
872 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
873 first_notcon = 0;
874 } else
875 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
876 continue;
877 }
878
879 irq = pin_2_irq(idx, apic, pin);
880 add_pin_to_irq(irq, apic, pin);
881
882 setup_IO_APIC_irq(apic, pin, irq,
883 irq_trigger(idx), irq_polarity(idx));
884 }
885 }
886
887 if (!first_notcon)
888 apic_printk(APIC_VERBOSE," not connected.\n");
889}
890
891/*
892 * Set up the 8259A-master output pin as broadcast to all
893 * CPUs.
894 */
895static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
896{
897 struct IO_APIC_route_entry entry;
898 unsigned long flags;
899
900 memset(&entry,0,sizeof(entry));
901
902 disable_8259A_irq(0);
903
904 /* mask LVT0 */
905 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
906
907 /*
908 * We use logical delivery to get the timer IRQ
909 * to the first CPU.
910 */
911 entry.dest_mode = INT_DEST_MODE;
912 entry.mask = 0; /* unmask IRQ now */
913 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
914 entry.delivery_mode = INT_DELIVERY_MODE;
915 entry.polarity = 0;
916 entry.trigger = 0;
917 entry.vector = vector;
918
919 /*
 920 * The timer IRQ doesn't have to know that behind the
 921 * scenes we have an 8259A master in AEOI mode ...
922 */
923 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
924
925 /*
926 * Add it to the IO-APIC irq-routing table:
927 */
928 spin_lock_irqsave(&ioapic_lock, flags);
929 io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
930 io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
931 spin_unlock_irqrestore(&ioapic_lock, flags);
932
933 enable_8259A_irq(0);
934}
935
936void __apicdebuginit print_IO_APIC(void)
937{
938 int apic, i;
939 union IO_APIC_reg_00 reg_00;
940 union IO_APIC_reg_01 reg_01;
941 union IO_APIC_reg_02 reg_02;
942 unsigned long flags;
943
944 if (apic_verbosity == APIC_QUIET)
945 return;
946
947 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
948 for (i = 0; i < nr_ioapics; i++)
949 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
950 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
951
952 /*
953 * We are a bit conservative about what we expect. We have to
954 * know about every hardware change ASAP.
955 */
956 printk(KERN_INFO "testing the IO APIC.......................\n");
957
958 for (apic = 0; apic < nr_ioapics; apic++) {
959
960 spin_lock_irqsave(&ioapic_lock, flags);
961 reg_00.raw = io_apic_read(apic, 0);
962 reg_01.raw = io_apic_read(apic, 1);
963 if (reg_01.bits.version >= 0x10)
964 reg_02.raw = io_apic_read(apic, 2);
965 spin_unlock_irqrestore(&ioapic_lock, flags);
966
967 printk("\n");
968 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
969 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
970 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
971
972 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
973 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
974
975 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
976 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
977
978 if (reg_01.bits.version >= 0x10) {
979 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
980 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
981 }
982
983 printk(KERN_DEBUG ".... IRQ redirection table:\n");
984
985 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
986 " Stat Dmod Deli Vect: \n");
987
988 for (i = 0; i <= reg_01.bits.entries; i++) {
989 struct IO_APIC_route_entry entry;
990
991 entry = ioapic_read_entry(apic, i);
992
993 printk(KERN_DEBUG " %02x %03X ",
994 i,
995 entry.dest
996 );
997
998 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
999 entry.mask,
1000 entry.trigger,
1001 entry.irr,
1002 entry.polarity,
1003 entry.delivery_status,
1004 entry.dest_mode,
1005 entry.delivery_mode,
1006 entry.vector
1007 );
1008 }
1009 }
1010 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1011 for (i = 0; i < NR_IRQS; i++) {
1012 struct irq_pin_list *entry = irq_2_pin + i;
1013 if (entry->pin < 0)
1014 continue;
1015 printk(KERN_DEBUG "IRQ%d ", i);
1016 for (;;) {
1017 printk("-> %d:%d", entry->apic, entry->pin);
1018 if (!entry->next)
1019 break;
1020 entry = irq_2_pin + entry->next;
1021 }
1022 printk("\n");
1023 }
1024
1025 printk(KERN_INFO ".................................... done.\n");
1026
1027 return;
1028}
1029
1030#if 0
1031
1032static __apicdebuginit void print_APIC_bitfield (int base)
1033{
1034 unsigned int v;
1035 int i, j;
1036
1037 if (apic_verbosity == APIC_QUIET)
1038 return;
1039
1040 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1041 for (i = 0; i < 8; i++) {
1042 v = apic_read(base + i*0x10);
1043 for (j = 0; j < 32; j++) {
1044 if (v & (1<<j))
1045 printk("1");
1046 else
1047 printk("0");
1048 }
1049 printk("\n");
1050 }
1051}
1052
1053void __apicdebuginit print_local_APIC(void * dummy)
1054{
1055 unsigned int v, ver, maxlvt;
1056
1057 if (apic_verbosity == APIC_QUIET)
1058 return;
1059
1060 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1061 smp_processor_id(), hard_smp_processor_id());
1062 v = apic_read(APIC_ID);
1063 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
1064 v = apic_read(APIC_LVR);
1065 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1066 ver = GET_APIC_VERSION(v);
1067 maxlvt = get_maxlvt();
1068
1069 v = apic_read(APIC_TASKPRI);
1070 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1071
1072 v = apic_read(APIC_ARBPRI);
1073 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1074 v & APIC_ARBPRI_MASK);
1075 v = apic_read(APIC_PROCPRI);
1076 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1077
1078 v = apic_read(APIC_EOI);
1079 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1080 v = apic_read(APIC_RRR);
1081 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1082 v = apic_read(APIC_LDR);
1083 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1084 v = apic_read(APIC_DFR);
1085 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1086 v = apic_read(APIC_SPIV);
1087 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1088
1089 printk(KERN_DEBUG "... APIC ISR field:\n");
1090 print_APIC_bitfield(APIC_ISR);
1091 printk(KERN_DEBUG "... APIC TMR field:\n");
1092 print_APIC_bitfield(APIC_TMR);
1093 printk(KERN_DEBUG "... APIC IRR field:\n");
1094 print_APIC_bitfield(APIC_IRR);
1095
1096 v = apic_read(APIC_ESR);
1097 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1098
1099 v = apic_read(APIC_ICR);
1100 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1101 v = apic_read(APIC_ICR2);
1102 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1103
1104 v = apic_read(APIC_LVTT);
1105 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1106
1107 if (maxlvt > 3) { /* PC is LVT#4. */
1108 v = apic_read(APIC_LVTPC);
1109 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1110 }
1111 v = apic_read(APIC_LVT0);
1112 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1113 v = apic_read(APIC_LVT1);
1114 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1115
1116 if (maxlvt > 2) { /* ERR is LVT#3. */
1117 v = apic_read(APIC_LVTERR);
1118 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1119 }
1120
1121 v = apic_read(APIC_TMICT);
1122 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1123 v = apic_read(APIC_TMCCT);
1124 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1125 v = apic_read(APIC_TDCR);
1126 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1127 printk("\n");
1128}
1129
1130void print_all_local_APICs (void)
1131{
1132 on_each_cpu(print_local_APIC, NULL, 1, 1);
1133}
1134
1135void __apicdebuginit print_PIC(void)
1136{
1137 unsigned int v;
1138 unsigned long flags;
1139
1140 if (apic_verbosity == APIC_QUIET)
1141 return;
1142
1143 printk(KERN_DEBUG "\nprinting PIC contents\n");
1144
1145 spin_lock_irqsave(&i8259A_lock, flags);
1146
1147 v = inb(0xa1) << 8 | inb(0x21);
1148 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1149
1150 v = inb(0xa0) << 8 | inb(0x20);
1151 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1152
1153 outb(0x0b,0xa0);
1154 outb(0x0b,0x20);
1155 v = inb(0xa0) << 8 | inb(0x20);
1156 outb(0x0a,0xa0);
1157 outb(0x0a,0x20);
1158
1159 spin_unlock_irqrestore(&i8259A_lock, flags);
1160
1161 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1162
1163 v = inb(0x4d1) << 8 | inb(0x4d0);
1164 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1165}
1166
1167#endif /* 0 */
1168
1169static void __init enable_IO_APIC(void)
1170{
1171 union IO_APIC_reg_01 reg_01;
1172 int i8259_apic, i8259_pin;
1173 int i, apic;
1174 unsigned long flags;
1175
1176 for (i = 0; i < PIN_MAP_SIZE; i++) {
1177 irq_2_pin[i].pin = -1;
1178 irq_2_pin[i].next = 0;
1179 }
1180
1181 /*
1182 * The number of IO-APIC IRQ registers (== #pins):
1183 */
1184 for (apic = 0; apic < nr_ioapics; apic++) {
1185 spin_lock_irqsave(&ioapic_lock, flags);
1186 reg_01.raw = io_apic_read(apic, 1);
1187 spin_unlock_irqrestore(&ioapic_lock, flags);
1188 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1189 }
1190 for(apic = 0; apic < nr_ioapics; apic++) {
1191 int pin;
1192 /* See if any of the pins is in ExtINT mode */
1193 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1194 struct IO_APIC_route_entry entry;
1195 entry = ioapic_read_entry(apic, pin);
1196
 1197 /* If the interrupt line is enabled and in ExtInt mode,
 1198 * we have found the pin where the i8259 is connected.
1199 */
1200 if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
1201 ioapic_i8259.apic = apic;
1202 ioapic_i8259.pin = pin;
1203 goto found_i8259;
1204 }
1205 }
1206 }
1207 found_i8259:
 1208 /* Look to see if the MP table has reported the ExtINT */
1209 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1210 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
 1211 /* Trust the MP table if nothing is set up in the hardware */
1212 if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
1213 printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
1214 ioapic_i8259.pin = i8259_pin;
1215 ioapic_i8259.apic = i8259_apic;
1216 }
1217 /* Complain if the MP table and the hardware disagree */
1218 if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
1219 (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
1220 {
1221 printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
1222 }
1223
1224 /*
 1225 * Do not trust that the IO-APIC is empty at bootup
1226 */
1227 clear_IO_APIC();
1228}
1229
1230/*
1231 * Not an __init, needed by the reboot code
1232 */
1233void disable_IO_APIC(void)
1234{
1235 /*
1236 * Clear the IO-APIC before rebooting:
1237 */
1238 clear_IO_APIC();
1239
1240 /*
 1241 * If the i8259 is routed through an IOAPIC,
 1242 * put that IOAPIC in virtual wire mode
1243 * so legacy interrupts can be delivered.
1244 */
1245 if (ioapic_i8259.pin != -1) {
1246 struct IO_APIC_route_entry entry;
1247
1248 memset(&entry, 0, sizeof(entry));
1249 entry.mask = 0; /* Enabled */
1250 entry.trigger = 0; /* Edge */
1251 entry.irr = 0;
1252 entry.polarity = 0; /* High */
1253 entry.delivery_status = 0;
1254 entry.dest_mode = 0; /* Physical */
1255 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1256 entry.vector = 0;
1257 entry.dest = GET_APIC_ID(apic_read(APIC_ID));
1258
1259 /*
1260 * Add it to the IO-APIC irq-routing table:
1261 */
1262 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1263 }
1264
1265 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1266}
1267
1268/*
1269 * There is a nasty bug in some older SMP boards, their mptable lies
1270 * about the timer IRQ. We do the following to work around the situation:
1271 *
1272 * - timer IRQ defaults to IO-APIC IRQ
1273 * - if this function detects that timer IRQs are defunct, then we fall
1274 * back to ISA timer IRQs
1275 */
1276static int __init timer_irq_works(void)
1277{
1278 unsigned long t1 = jiffies;
1279
1280 local_irq_enable();
1281 /* Let ten ticks pass... */
1282 mdelay((10 * 1000) / HZ);
1283
1284 /*
 1285 * Expect a few ticks at least, to be sure some possible
 1286 * glue logic does not lock up after the first one or two
 1287 * ticks in a non-ExtINT mode. Also the local APIC
1288 * might have cached one ExtINT interrupt. Finally, at
1289 * least one tick may be lost due to delays.
1290 */
1291
1292 /* jiffies wrap? */
1293 if (jiffies - t1 > 4)
1294 return 1;
1295 return 0;
1296}
1297
1298/*
 1299 * In the SMP+IOAPIC case it might happen that an unspecified number
 1300 * of pending IRQ events are left unhandled. These cases are very rare,
1301 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1302 * better to do it this way as thus we do not have to be aware of
1303 * 'pending' interrupts in the IRQ path, except at this point.
1304 */
1305/*
 1306 * Edge-triggered interrupts need to resend any interrupt
 1307 * that was delayed, but this is now handled in the
 1308 * device-independent code.
1309 */
1310
1311/*
 1312 * Starting up an edge-triggered IO-APIC interrupt is
 1313 * nasty - we need to make sure that we get the edge.
 1314 * If it is already asserted for some reason, we need
 1315 * to return 1 to indicate that it was pending.
1316 *
1317 * This is not complete - we should be able to fake
1318 * an edge even if it isn't on the 8259A...
1319 */
1320
1321static unsigned int startup_ioapic_irq(unsigned int irq)
1322{
1323 int was_pending = 0;
1324 unsigned long flags;
1325
1326 spin_lock_irqsave(&ioapic_lock, flags);
1327 if (irq < 16) {
1328 disable_8259A_irq(irq);
1329 if (i8259A_irq_pending(irq))
1330 was_pending = 1;
1331 }
1332 __unmask_IO_APIC_irq(irq);
1333 spin_unlock_irqrestore(&ioapic_lock, flags);
1334
1335 return was_pending;
1336}
1337
1338static int ioapic_retrigger_irq(unsigned int irq)
1339{
1340 struct irq_cfg *cfg = &irq_cfg[irq];
1341 cpumask_t mask;
1342 unsigned long flags;
1343
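	/*
	 * Re-raise the interrupt by sending its vector as an IPI to one
	 * cpu in the irq's domain.
	 */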
1344 spin_lock_irqsave(&vector_lock, flags);
1345 cpus_clear(mask);
1346 cpu_set(first_cpu(cfg->domain), mask);
1347
1348 send_IPI_mask(mask, cfg->vector);
1349 spin_unlock_irqrestore(&vector_lock, flags);
1350
1351 return 1;
1352}
1353
1354/*
1355 * Level and edge triggered IO-APIC interrupts need different handling,
1356 * so we use two separate IRQ descriptors. Edge triggered IRQs can be
1357 * handled with the level-triggered descriptor, but that one has slightly
1358 * more overhead. Level-triggered interrupts cannot be handled with the
1359 * edge-triggered handler, without risking IRQ storms and other ugly
1360 * races.
1361 */
1362
1363#ifdef CONFIG_SMP
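/*
 * After an irq has been migrated to a new vector/domain, the cpus in the
 * old domain receive IRQ_MOVE_CLEANUP_VECTOR and release their stale
 * vector_irq[] slots here.
 */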
1364asmlinkage void smp_irq_move_cleanup_interrupt(void)
1365{
1366 unsigned vector, me;
1367 ack_APIC_irq();
1368 exit_idle();
1369 irq_enter();
1370
1371 me = smp_processor_id();
1372 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
1373 unsigned int irq;
1374 struct irq_desc *desc;
1375 struct irq_cfg *cfg;
1376 irq = __get_cpu_var(vector_irq)[vector];
1377 if (irq >= NR_IRQS)
1378 continue;
1379
1380 desc = irq_desc + irq;
1381 cfg = irq_cfg + irq;
1382 spin_lock(&desc->lock);
1383 if (!cfg->move_cleanup_count)
1384 goto unlock;
1385
1386 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
1387 goto unlock;
1388
1389 __get_cpu_var(vector_irq)[vector] = -1;
1390 cfg->move_cleanup_count--;
1391unlock:
1392 spin_unlock(&desc->lock);
1393 }
1394
1395 irq_exit();
1396}
1397
1398static void irq_complete_move(unsigned int irq)
1399{
1400 struct irq_cfg *cfg = irq_cfg + irq;
1401 unsigned vector, me;
1402
1403 if (likely(!cfg->move_in_progress))
1404 return;
1405
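	/*
	 * The interrupt entry stubs store the complemented vector number in
	 * orig_rax; recover it here to see whether this irq has already
	 * fired on its new vector.
	 */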
1406 vector = ~get_irq_regs()->orig_rax;
1407 me = smp_processor_id();
1408 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1409 cpumask_t cleanup_mask;
1410
1411 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
1412 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
1413 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
1414 cfg->move_in_progress = 0;
1415 }
1416}
1417#else
1418static inline void irq_complete_move(unsigned int irq) {}
1419#endif
1420
1421static void ack_apic_edge(unsigned int irq)
1422{
1423 irq_complete_move(irq);
1424 move_native_irq(irq);
1425 ack_APIC_irq();
1426}
1427
1428static void ack_apic_level(unsigned int irq)
1429{
1430 int do_unmask_irq = 0;
1431
1432 irq_complete_move(irq);
1433#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
1434 /* If we are moving the irq we need to mask it */
1435 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
1436 do_unmask_irq = 1;
1437 mask_IO_APIC_irq(irq);
1438 }
1439#endif
1440
1441 /*
1442 * We must acknowledge the irq before we move it or the acknowledge will
1443 * not propagate properly.
1444 */
1445 ack_APIC_irq();
1446
 1447 /* Now we can move and re-enable the irq */
1448 if (unlikely(do_unmask_irq)) {
1449 /* Only migrate the irq if the ack has been received.
1450 *
1451 * On rare occasions the broadcast level triggered ack gets
1452 * delayed going to ioapics, and if we reprogram the
1453 * vector while Remote IRR is still set the irq will never
1454 * fire again.
1455 *
1456 * To prevent this scenario we read the Remote IRR bit
1457 * of the ioapic. This has two effects.
1458 * - On any sane system the read of the ioapic will
1459 * flush writes (and acks) going to the ioapic from
1460 * this cpu.
1461 * - We get to see if the ACK has actually been delivered.
1462 *
 1463 * Based on failed experiments with reprogramming the
 1464 * ioapic entry from outside of irq context - starting
 1465 * with masking the ioapic entry and then polling until
 1466 * Remote IRR was clear before reprogramming the
 1467 * ioapic - I don't trust the Remote IRR bit to be
 1468 * completely accurate.
1469 *
1470 * However there appears to be no other way to plug
1471 * this race, so if the Remote IRR bit is not
1472 * accurate and is causing problems then it is a hardware bug
1473 * and you can go talk to the chipset vendor about it.
1474 */
1475 if (!io_apic_level_ack_pending(irq))
1476 move_masked_irq(irq);
1477 unmask_IO_APIC_irq(irq);
1478 }
1479}
1480
1481static struct irq_chip ioapic_chip __read_mostly = {
1482 .name = "IO-APIC",
1483 .startup = startup_ioapic_irq,
1484 .mask = mask_IO_APIC_irq,
1485 .unmask = unmask_IO_APIC_irq,
1486 .ack = ack_apic_edge,
1487 .eoi = ack_apic_level,
1488#ifdef CONFIG_SMP
1489 .set_affinity = set_ioapic_affinity_irq,
1490#endif
1491 .retrigger = ioapic_retrigger_irq,
1492};
1493
1494static inline void init_IO_APIC_traps(void)
1495{
1496 int irq;
1497
1498 /*
1499 * NOTE! The local APIC isn't very good at handling
1500 * multiple interrupts at the same interrupt level.
1501 * As the interrupt level is determined by taking the
1502 * vector number and shifting that right by 4, we
1503 * want to spread these out a bit so that they don't
1504 * all fall in the same interrupt level.
1505 *
1506 * Also, we've got to be careful not to trash gate
1507 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1508 */
1509 for (irq = 0; irq < NR_IRQS ; irq++) {
1510 int tmp = irq;
1511 if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) {
1512 /*
1513 * Hmm.. We don't have an entry for this,
1514 * so default to an old-fashioned 8259
1515 * interrupt if we can..
1516 */
1517 if (irq < 16)
1518 make_8259A_irq(irq);
1519 else
1520 /* Strange. Oh, well.. */
1521 irq_desc[irq].chip = &no_irq_chip;
1522 }
1523 }
1524}
1525
1526static void enable_lapic_irq (unsigned int irq)
1527{
1528 unsigned long v;
1529
1530 v = apic_read(APIC_LVT0);
1531 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
1532}
1533
1534static void disable_lapic_irq (unsigned int irq)
1535{
1536 unsigned long v;
1537
1538 v = apic_read(APIC_LVT0);
1539 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
1540}
1541
1542static void ack_lapic_irq (unsigned int irq)
1543{
1544 ack_APIC_irq();
1545}
1546
1547static void end_lapic_irq (unsigned int i) { /* nothing */ }
1548
1549static struct hw_interrupt_type lapic_irq_type __read_mostly = {
1550 .name = "local-APIC",
1551 .typename = "local-APIC-edge",
1552 .startup = NULL, /* startup_irq() not used for IRQ0 */
1553 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1554 .enable = enable_lapic_irq,
1555 .disable = disable_lapic_irq,
1556 .ack = ack_lapic_irq,
1557 .end = end_lapic_irq,
1558};
1559
1560static void setup_nmi (void)
1561{
1562 /*
1563 * Dirty trick to enable the NMI watchdog ...
1564 * We put the 8259A master into AEOI mode and
 1565 * unmask LVT0 as NMI on all local APICs.
1566 *
1567 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
1568 * is from Maciej W. Rozycki - so we do not have to EOI from
1569 * the NMI handler or the timer interrupt.
1570 */
1571 printk(KERN_INFO "activating NMI Watchdog ...");
1572
1573 enable_NMI_through_LVT0(NULL);
1574
1575 printk(" done.\n");
1576}
1577
1578/*
 1579 * This looks a bit hackish but it's about the only way of sending
1580 * a few INTA cycles to 8259As and any associated glue logic. ICR does
1581 * not support the ExtINT mode, unfortunately. We need to send these
1582 * cycles as some i82489DX-based boards have glue logic that keeps the
1583 * 8259A interrupt line asserted until INTA. --macro
1584 */
1585static inline void unlock_ExtINT_logic(void)
1586{
1587 int apic, pin, i;
1588 struct IO_APIC_route_entry entry0, entry1;
1589 unsigned char save_control, save_freq_select;
1590 unsigned long flags;
1591
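	/*
	 * The RTC (ISA IRQ8) pin is borrowed for this: its routing entry is
	 * saved, temporarily replaced with an ExtINT entry aimed at this
	 * CPU, and the RTC periodic interrupt is used to generate the INTA
	 * cycles before everything is restored.
	 */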
1592 pin = find_isa_irq_pin(8, mp_INT);
1593 apic = find_isa_irq_apic(8, mp_INT);
1594 if (pin == -1)
1595 return;
1596
1597 spin_lock_irqsave(&ioapic_lock, flags);
1598 *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
1599 *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
1600 spin_unlock_irqrestore(&ioapic_lock, flags);
1601 clear_IO_APIC_pin(apic, pin);
1602
1603 memset(&entry1, 0, sizeof(entry1));
1604
1605 entry1.dest_mode = 0; /* physical delivery */
1606 entry1.mask = 0; /* unmask IRQ now */
1607 entry1.dest = hard_smp_processor_id();
1608 entry1.delivery_mode = dest_ExtINT;
1609 entry1.polarity = entry0.polarity;
1610 entry1.trigger = 0;
1611 entry1.vector = 0;
1612
1613 spin_lock_irqsave(&ioapic_lock, flags);
1614 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
1615 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
1616 spin_unlock_irqrestore(&ioapic_lock, flags);
1617
1618 save_control = CMOS_READ(RTC_CONTROL);
1619 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
1620 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
1621 RTC_FREQ_SELECT);
1622 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
1623
1624 i = 100;
1625 while (i-- > 0) {
1626 mdelay(10);
1627 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
1628 i -= 10;
1629 }
1630
1631 CMOS_WRITE(save_control, RTC_CONTROL);
1632 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
1633 clear_IO_APIC_pin(apic, pin);
1634
1635 spin_lock_irqsave(&ioapic_lock, flags);
1636 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
1637 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
1638 spin_unlock_irqrestore(&ioapic_lock, flags);
1639}
1640
1641/*
1642 * This code may look a bit paranoid, but it's supposed to cooperate with
1643 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1644 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1645 * fanatically on his truly buggy board.
1646 *
1647 * FIXME: really need to revamp this for modern platforms only.
1648 */
1649static inline void check_timer(void)
1650{
1651 struct irq_cfg *cfg = irq_cfg + 0;
1652 int apic1, pin1, apic2, pin2;
1653
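	/*
	 * What follows is a fallback ladder: try IRQ0 through the IO-APIC
	 * pin the MP table reports for the timer, then through the pin the
	 * i8259 is cascaded into, then as a local APIC "virtual wire" IRQ,
	 * and finally as a plain ExtINT IRQ before giving up.
	 */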
1654 /*
1655 * get/set the timer IRQ vector:
1656 */
1657 disable_8259A_irq(0);
1658 assign_irq_vector(0, TARGET_CPUS);
1659
1660 /*
 1661 * Subtle: code in do_timer_interrupt() expects an AEOI
1662 * mode for the 8259A whenever interrupts are routed
1663 * through I/O APICs. Also IRQ0 has to be enabled in
1664 * the 8259A which implies the virtual wire has to be
1665 * disabled in the local APIC.
1666 */
1667 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1668 init_8259A(1);
1669 if (timer_over_8254 > 0)
1670 enable_8259A_irq(0);
1671
1672 pin1 = find_isa_irq_pin(0, mp_INT);
1673 apic1 = find_isa_irq_apic(0, mp_INT);
1674 pin2 = ioapic_i8259.pin;
1675 apic2 = ioapic_i8259.apic;
1676
1677 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
1678 cfg->vector, apic1, pin1, apic2, pin2);
1679
1680 if (pin1 != -1) {
1681 /*
1682 * Ok, does IRQ0 through the IOAPIC work?
1683 */
1684 unmask_IO_APIC_irq(0);
1685 if (!no_timer_check && timer_irq_works()) {
1686 nmi_watchdog_default();
1687 if (nmi_watchdog == NMI_IO_APIC) {
1688 disable_8259A_irq(0);
1689 setup_nmi();
1690 enable_8259A_irq(0);
1691 }
1692 if (disable_timer_pin_1 > 0)
1693 clear_IO_APIC_pin(0, pin1);
1694 return;
1695 }
1696 clear_IO_APIC_pin(apic1, pin1);
1697 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
1698 "connected to IO-APIC\n");
1699 }
1700
1701 apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
1702 "through the 8259A ... ");
1703 if (pin2 != -1) {
1704 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
1705 apic2, pin2);
1706 /*
1707 * legacy devices should be connected to IO APIC #0
1708 */
1709 setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
1710 if (timer_irq_works()) {
1711 apic_printk(APIC_VERBOSE," works.\n");
1712 nmi_watchdog_default();
1713 if (nmi_watchdog == NMI_IO_APIC) {
1714 setup_nmi();
1715 }
1716 return;
1717 }
1718 /*
1719 * Cleanup, just in case ...
1720 */
1721 clear_IO_APIC_pin(apic2, pin2);
1722 }
1723 apic_printk(APIC_VERBOSE," failed.\n");
1724
1725 if (nmi_watchdog == NMI_IO_APIC) {
1726 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
1727 nmi_watchdog = 0;
1728 }
1729
1730 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
1731
1732 disable_8259A_irq(0);
1733 irq_desc[0].chip = &lapic_irq_type;
1734 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1735 enable_8259A_irq(0);
1736
1737 if (timer_irq_works()) {
1738 apic_printk(APIC_VERBOSE," works.\n");
1739 return;
1740 }
1741 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1742 apic_printk(APIC_VERBOSE," failed.\n");
1743
1744 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
1745
1746 init_8259A(0);
1747 make_8259A_irq(0);
1748 apic_write(APIC_LVT0, APIC_DM_EXTINT);
1749
1750 unlock_ExtINT_logic();
1751
1752 if (timer_irq_works()) {
1753 apic_printk(APIC_VERBOSE," works.\n");
1754 return;
1755 }
1756 apic_printk(APIC_VERBOSE," failed :(.\n");
1757 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
1758}
1759
1760static int __init notimercheck(char *s)
1761{
1762 no_timer_check = 1;
1763 return 1;
1764}
1765__setup("no_timer_check", notimercheck);
1766
1767/*
1768 *
 1769 * IRQs that are handled by the PIC in the MPS IOAPIC case.
 1770 * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
1771 * Linux doesn't really care, as it's not actually used
1772 * for any interrupt handling anyway.
1773 */
1774#define PIC_IRQS (1<<2)
1775
1776void __init setup_IO_APIC(void)
1777{
1778 enable_IO_APIC();
1779
1780 if (acpi_ioapic)
1781 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
1782 else
1783 io_apic_irqs = ~PIC_IRQS;
1784
1785 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1786
1787 sync_Arb_IDs();
1788 setup_IO_APIC_irqs();
1789 init_IO_APIC_traps();
1790 check_timer();
1791 if (!acpi_ioapic)
1792 print_IO_APIC();
1793}
1794
1795struct sysfs_ioapic_data {
1796 struct sys_device dev;
1797 struct IO_APIC_route_entry entry[0];
1798};
1799static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
1800
1801static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
1802{
1803 struct IO_APIC_route_entry *entry;
1804 struct sysfs_ioapic_data *data;
1805 int i;
1806
1807 data = container_of(dev, struct sysfs_ioapic_data, dev);
1808 entry = data->entry;
1809 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
1810 *entry = ioapic_read_entry(dev->id, i);
1811
1812 return 0;
1813}
1814
1815static int ioapic_resume(struct sys_device *dev)
1816{
1817 struct IO_APIC_route_entry *entry;
1818 struct sysfs_ioapic_data *data;
1819 unsigned long flags;
1820 union IO_APIC_reg_00 reg_00;
1821 int i;
1822
1823 data = container_of(dev, struct sysfs_ioapic_data, dev);
1824 entry = data->entry;
1825
1826 spin_lock_irqsave(&ioapic_lock, flags);
1827 reg_00.raw = io_apic_read(dev->id, 0);
1828 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
1829 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
1830 io_apic_write(dev->id, 0, reg_00.raw);
1831 }
1832 spin_unlock_irqrestore(&ioapic_lock, flags);
1833 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
1834 ioapic_write_entry(dev->id, i, entry[i]);
1835
1836 return 0;
1837}
1838
1839static struct sysdev_class ioapic_sysdev_class = {
1840 set_kset_name("ioapic"),
1841 .suspend = ioapic_suspend,
1842 .resume = ioapic_resume,
1843};
1844
1845static int __init ioapic_init_sysfs(void)
1846{
1847 struct sys_device * dev;
1848 int i, size, error = 0;
1849
1850 error = sysdev_class_register(&ioapic_sysdev_class);
1851 if (error)
1852 return error;
1853
1854 for (i = 0; i < nr_ioapics; i++ ) {
1855 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
1856 * sizeof(struct IO_APIC_route_entry);
1857 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
1858 if (!mp_ioapic_data[i]) {
1859 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1860 continue;
1861 }
1862 memset(mp_ioapic_data[i], 0, size);
1863 dev = &mp_ioapic_data[i]->dev;
1864 dev->id = i;
1865 dev->cls = &ioapic_sysdev_class;
1866 error = sysdev_register(dev);
1867 if (error) {
1868 kfree(mp_ioapic_data[i]);
1869 mp_ioapic_data[i] = NULL;
1870 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1871 continue;
1872 }
1873 }
1874
1875 return 0;
1876}
1877
1878device_initcall(ioapic_init_sysfs);
1879
1880/*
 1881 * Dynamic irq allocation and deallocation
1882 */
1883int create_irq(void)
1884{
1885 /* Allocate an unused irq */
1886 int irq;
1887 int new;
1888 unsigned long flags;
1889
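	/*
	 * Scan downwards from the top of the irq space so dynamically
	 * created irqs stay clear of the legacy ISA range, and claim the
	 * first entry that has no vector assigned yet.
	 */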
1890 irq = -ENOSPC;
1891 spin_lock_irqsave(&vector_lock, flags);
1892 for (new = (NR_IRQS - 1); new >= 0; new--) {
1893 if (platform_legacy_irq(new))
1894 continue;
1895 if (irq_cfg[new].vector != 0)
1896 continue;
1897 if (__assign_irq_vector(new, TARGET_CPUS) == 0)
1898 irq = new;
1899 break;
1900 }
1901 spin_unlock_irqrestore(&vector_lock, flags);
1902
1903 if (irq >= 0) {
1904 dynamic_irq_init(irq);
1905 }
1906 return irq;
1907}
1908
1909void destroy_irq(unsigned int irq)
1910{
1911 unsigned long flags;
1912
1913 dynamic_irq_cleanup(irq);
1914
1915 spin_lock_irqsave(&vector_lock, flags);
1916 __clear_irq_vector(irq);
1917 spin_unlock_irqrestore(&vector_lock, flags);
1918}
1919
1920/*
 1921 * MSI message composition
1922 */
1923#ifdef CONFIG_PCI_MSI
1924static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
1925{
1926 struct irq_cfg *cfg = irq_cfg + irq;
1927 int err;
1928 unsigned dest;
1929 cpumask_t tmp;
1930
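	/*
	 * Allocate a vector for this irq and encode the resulting vector and
	 * destination APIC id into the MSI address/data pair, mirroring what
	 * an IO-APIC routing entry would carry for the same interrupt.
	 */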
1931 tmp = TARGET_CPUS;
1932 err = assign_irq_vector(irq, tmp);
1933 if (!err) {
1934 cpus_and(tmp, cfg->domain, tmp);
1935 dest = cpu_mask_to_apicid(tmp);
1936
1937 msg->address_hi = MSI_ADDR_BASE_HI;
1938 msg->address_lo =
1939 MSI_ADDR_BASE_LO |
1940 ((INT_DEST_MODE == 0) ?
1941 MSI_ADDR_DEST_MODE_PHYSICAL:
1942 MSI_ADDR_DEST_MODE_LOGICAL) |
1943 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
1944 MSI_ADDR_REDIRECTION_CPU:
1945 MSI_ADDR_REDIRECTION_LOWPRI) |
1946 MSI_ADDR_DEST_ID(dest);
1947
1948 msg->data =
1949 MSI_DATA_TRIGGER_EDGE |
1950 MSI_DATA_LEVEL_ASSERT |
1951 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
1952 MSI_DATA_DELIVERY_FIXED:
1953 MSI_DATA_DELIVERY_LOWPRI) |
1954 MSI_DATA_VECTOR(cfg->vector);
1955 }
1956 return err;
1957}
1958
1959#ifdef CONFIG_SMP
1960static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
1961{
1962 struct irq_cfg *cfg = irq_cfg + irq;
1963 struct msi_msg msg;
1964 unsigned int dest;
1965 cpumask_t tmp;
1966
1967 cpus_and(tmp, mask, cpu_online_map);
1968 if (cpus_empty(tmp))
1969 return;
1970
1971 if (assign_irq_vector(irq, mask))
1972 return;
1973
1974 cpus_and(tmp, cfg->domain, mask);
1975 dest = cpu_mask_to_apicid(tmp);
1976
1977 read_msi_msg(irq, &msg);
1978
1979 msg.data &= ~MSI_DATA_VECTOR_MASK;
1980 msg.data |= MSI_DATA_VECTOR(cfg->vector);
1981 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
1982 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
1983
1984 write_msi_msg(irq, &msg);
1985 irq_desc[irq].affinity = mask;
1986}
1987#endif /* CONFIG_SMP */
1988
1989/*
1990 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
1991 * which implement the MSI or MSI-X Capability Structure.
1992 */
1993static struct irq_chip msi_chip = {
1994 .name = "PCI-MSI",
1995 .unmask = unmask_msi_irq,
1996 .mask = mask_msi_irq,
1997 .ack = ack_apic_edge,
1998#ifdef CONFIG_SMP
1999 .set_affinity = set_msi_irq_affinity,
2000#endif
2001 .retrigger = ioapic_retrigger_irq,
2002};
2003
2004int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2005{
2006 struct msi_msg msg;
2007 int irq, ret;
2008 irq = create_irq();
2009 if (irq < 0)
2010 return irq;
2011
2012 ret = msi_compose_msg(dev, irq, &msg);
2013 if (ret < 0) {
2014 destroy_irq(irq);
2015 return ret;
2016 }
2017
2018 set_irq_msi(irq, desc);
2019 write_msi_msg(irq, &msg);
2020
2021 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
2022
2023 return 0;
2024}
2025
2026void arch_teardown_msi_irq(unsigned int irq)
2027{
2028 destroy_irq(irq);
2029}
2030
2031#endif /* CONFIG_PCI_MSI */
2032
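The arch_setup_msi_irq()/arch_teardown_msi_irq() hooks above are not called by drivers directly; they are reached through the generic PCI MSI layer. As a rough sketch of the driver-side calling pattern (the device handler and all "my_dev" names are hypothetical examples, not part of this file), a PCI driver of this era would do something like:

	#include <linux/pci.h>
	#include <linux/interrupt.h>

	/* Hypothetical handler; name and body are illustrative only. */
	static irqreturn_t my_dev_interrupt(int irq, void *dev_id)
	{
		/* acknowledge the device here */
		return IRQ_HANDLED;
	}

	static int my_dev_setup_irq(struct pci_dev *pdev, void *dev_priv)
	{
		int err;

		err = pci_enable_msi(pdev);	/* ends up in arch_setup_msi_irq() */
		if (err)
			return err;		/* caller may fall back to legacy INTx */

		err = request_irq(pdev->irq, my_dev_interrupt, 0, "my_dev", dev_priv);
		if (err)
			pci_disable_msi(pdev);	/* ends up in arch_teardown_msi_irq() */
		return err;
	}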
2033/*
2034 * Hypertransport interrupt support
2035 */
2036#ifdef CONFIG_HT_IRQ
2037
2038#ifdef CONFIG_SMP
2039
2040static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
2041{
2042 struct ht_irq_msg msg;
2043 fetch_ht_irq_msg(irq, &msg);
2044
2045 msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
2046 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2047
2048 msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
2049 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2050
2051 write_ht_irq_msg(irq, &msg);
2052}
2053
2054static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2055{
2056 struct irq_cfg *cfg = irq_cfg + irq;
2057 unsigned int dest;
2058 cpumask_t tmp;
2059
2060 cpus_and(tmp, mask, cpu_online_map);
2061 if (cpus_empty(tmp))
2062 return;
2063
2064 if (assign_irq_vector(irq, mask))
2065 return;
2066
2067 cpus_and(tmp, cfg->domain, mask);
2068 dest = cpu_mask_to_apicid(tmp);
2069
2070 target_ht_irq(irq, dest, cfg->vector);
2071 irq_desc[irq].affinity = mask;
2072}
2073#endif
2074
2075static struct irq_chip ht_irq_chip = {
2076 .name = "PCI-HT",
2077 .mask = mask_ht_irq,
2078 .unmask = unmask_ht_irq,
2079 .ack = ack_apic_edge,
2080#ifdef CONFIG_SMP
2081 .set_affinity = set_ht_irq_affinity,
2082#endif
2083 .retrigger = ioapic_retrigger_irq,
2084};
2085
2086int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2087{
2088 struct irq_cfg *cfg = irq_cfg + irq;
2089 int err;
2090 cpumask_t tmp;
2091
2092 tmp = TARGET_CPUS;
2093 err = assign_irq_vector(irq, tmp);
2094 if (!err) {
2095 struct ht_irq_msg msg;
2096 unsigned dest;
2097
2098 cpus_and(tmp, cfg->domain, tmp);
2099 dest = cpu_mask_to_apicid(tmp);
2100
2101 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2102
2103 msg.address_lo =
2104 HT_IRQ_LOW_BASE |
2105 HT_IRQ_LOW_DEST_ID(dest) |
2106 HT_IRQ_LOW_VECTOR(cfg->vector) |
2107 ((INT_DEST_MODE == 0) ?
2108 HT_IRQ_LOW_DM_PHYSICAL :
2109 HT_IRQ_LOW_DM_LOGICAL) |
2110 HT_IRQ_LOW_RQEOI_EDGE |
2111 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2112 HT_IRQ_LOW_MT_FIXED :
2113 HT_IRQ_LOW_MT_ARBITRATED) |
2114 HT_IRQ_LOW_IRQ_MASKED;
2115
2116 write_ht_irq_msg(irq, &msg);
2117
2118 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2119 handle_edge_irq, "edge");
2120 }
2121 return err;
2122}
2123#endif /* CONFIG_HT_IRQ */
2124
2125/* --------------------------------------------------------------------------
2126 ACPI-based IOAPIC Configuration
2127 -------------------------------------------------------------------------- */
2128
2129#ifdef CONFIG_ACPI
2130
2131#define IO_APIC_MAX_ID 0xFE
2132
2133int __init io_apic_get_redir_entries (int ioapic)
2134{
2135 union IO_APIC_reg_01 reg_01;
2136 unsigned long flags;
2137
2138 spin_lock_irqsave(&ioapic_lock, flags);
2139 reg_01.raw = io_apic_read(ioapic, 1);
2140 spin_unlock_irqrestore(&ioapic_lock, flags);
2141
2142 return reg_01.bits.entries;
2143}
2144
2145
2146int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
2147{
2148 if (!IO_APIC_IRQ(irq)) {
2149 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2150 ioapic);
2151 return -EINVAL;
2152 }
2153
2154 /*
2155 * IRQs < 16 are already in the irq_2_pin[] map
2156 */
2157 if (irq >= 16)
2158 add_pin_to_irq(irq, ioapic, pin);
2159
2160 setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
2161
2162 return 0;
2163}
2164
2165#endif /* CONFIG_ACPI */
2166
2167
2168/*
2169 * This function is currently only a helper for the i386 SMP boot process, where
2170 * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
2171 * so the mask in all cases should simply be TARGET_CPUS.
2172 */
2173#ifdef CONFIG_SMP
2174void __init setup_ioapic_dest(void)
2175{
2176 int pin, ioapic, irq, irq_entry;
2177
2178 if (skip_ioapic_setup == 1)
2179 return;
2180
2181 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
2182 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
2183 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
2184 if (irq_entry == -1)
2185 continue;
2186 irq = pin_2_irq(irq_entry, ioapic, pin);
2187
2188			/* setup_IO_APIC_irqs() could fail to get a vector for some devices
2189			 * when there are too many devices, because at that time only the boot
2190			 * CPU is online.
2191 */
2192 if (!irq_cfg[irq].vector)
2193 setup_IO_APIC_irq(ioapic, pin, irq,
2194 irq_trigger(irq_entry),
2195 irq_polarity(irq_entry));
2196 else
2197 set_ioapic_affinity_irq(irq, TARGET_CPUS);
2198 }
2199
2200 }
2201}
2202#endif
diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport_32.c
new file mode 100644
index 000000000000..3d310a946d76
--- /dev/null
+++ b/arch/x86/kernel/ioport_32.c
@@ -0,0 +1,153 @@
1/*
2 * linux/arch/i386/kernel/ioport.c
3 *
4 * This contains the io-permission bitmap code - written by obz, with changes
5 * by Linus.
6 */
7
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/capability.h>
11#include <linux/errno.h>
12#include <linux/types.h>
13#include <linux/ioport.h>
14#include <linux/smp.h>
15#include <linux/stddef.h>
16#include <linux/slab.h>
17#include <linux/thread_info.h>
18#include <linux/syscalls.h>
19
20/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
21static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
22{
23 unsigned long mask;
24 unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
25 unsigned int low_index = base & (BITS_PER_LONG-1);
26 int length = low_index + extent;
27
28 if (low_index != 0) {
29 mask = (~0UL << low_index);
30 if (length < BITS_PER_LONG)
31 mask &= ~(~0UL << length);
32 if (new_value)
33 *bitmap_base++ |= mask;
34 else
35 *bitmap_base++ &= ~mask;
36 length -= BITS_PER_LONG;
37 }
38
39 mask = (new_value ? ~0UL : 0UL);
40 while (length >= BITS_PER_LONG) {
41 *bitmap_base++ = mask;
42 length -= BITS_PER_LONG;
43 }
44
45 if (length > 0) {
46 mask = ~(~0UL << length);
47 if (new_value)
48 *bitmap_base++ |= mask;
49 else
50 *bitmap_base++ &= ~mask;
51 }
52}
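The partial-word masking above is easiest to follow with a concrete case. Below is a small stand-alone sketch (illustrative only, not kernel code) that reproduces the same arithmetic for setting 4 bits starting at bit 30, which spans two 32-bit words as on i386 where BITS_PER_LONG is 32:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t bitmap[2] = { 0, 0 };
		unsigned int base = 30, extent = 4;	/* bits 30..33 */
		unsigned int low = base % 32;		/* 30 */
		int length = low + extent;		/* 34 */
		uint32_t mask;

		mask = (uint32_t)~0u << low;		/* bits 30 and 31 of word 0 */
		bitmap[0] |= mask;
		length -= 32;				/* 2 bits remain for word 1 */

		mask = ~((uint32_t)~0u << length);	/* bits 0 and 1 of word 1 */
		bitmap[1] |= mask;

		printf("%08x %08x\n", bitmap[0], bitmap[1]);	/* prints: c0000000 00000003 */
		return 0;
	}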
53
54
55/*
56 * this changes the io permissions bitmap in the current task.
57 */
58asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
59{
60 unsigned long i, max_long, bytes, bytes_updated;
61 struct thread_struct * t = &current->thread;
62 struct tss_struct * tss;
63 unsigned long *bitmap;
64
65 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
66 return -EINVAL;
67 if (turn_on && !capable(CAP_SYS_RAWIO))
68 return -EPERM;
69
70 /*
71 * If it's the first ioperm() call in this thread's lifetime, set the
72	 * IO bitmap up. ioperm() is much less timing critical than clone();
73 * this is why we delay this operation until now:
74 */
75 if (!t->io_bitmap_ptr) {
76 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
77 if (!bitmap)
78 return -ENOMEM;
79
80 memset(bitmap, 0xff, IO_BITMAP_BYTES);
81 t->io_bitmap_ptr = bitmap;
82 set_thread_flag(TIF_IO_BITMAP);
83 }
84
85 /*
86 * do it in the per-thread copy and in the TSS ...
87 *
88 * Disable preemption via get_cpu() - we must not switch away
89 * because the ->io_bitmap_max value must match the bitmap
90 * contents:
91 */
92 tss = &per_cpu(init_tss, get_cpu());
93
94 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
95
96 /*
97 * Search for a (possibly new) maximum. This is simple and stupid,
98 * to keep it obviously correct:
99 */
100 max_long = 0;
101 for (i = 0; i < IO_BITMAP_LONGS; i++)
102 if (t->io_bitmap_ptr[i] != ~0UL)
103 max_long = i;
104
105 bytes = (max_long + 1) * sizeof(long);
106 bytes_updated = max(bytes, t->io_bitmap_max);
107
108 t->io_bitmap_max = bytes;
109
110 /*
111 * Sets the lazy trigger so that the next I/O operation will
112 * reload the correct bitmap.
113 * Reset the owner so that a process switch will not set
114 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
115 */
116 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
117 tss->io_bitmap_owner = NULL;
118
119 put_cpu();
120
121 return 0;
122}
123
124/*
125 * sys_iopl has to be used when you want to access the IO ports
126 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
127 * you'd need 8kB of bitmaps/process, which is a bit excessive.
128 *
129 * Here we just change the eflags value on the stack: we allow
130 * only the super-user to do it. This depends on the stack-layout
131 * on system-call entry - see also fork() and the signal handling
132 * code.
133 */
134
135asmlinkage long sys_iopl(unsigned long unused)
136{
137 volatile struct pt_regs * regs = (struct pt_regs *) &unused;
138 unsigned int level = regs->ebx;
139 unsigned int old = (regs->eflags >> 12) & 3;
140 struct thread_struct *t = &current->thread;
141
142 if (level > 3)
143 return -EINVAL;
144 /* Trying to gain more privileges? */
145 if (level > old) {
146 if (!capable(CAP_SYS_RAWIO))
147 return -EPERM;
148 }
149 t->iopl = level << 12;
150 regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
151 set_iopl_mask(t->iopl);
152 return 0;
153}
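From user space, sys_ioperm() above is normally reached through the glibc ioperm(3) wrapper. A minimal illustrative example follows (the parallel-port base address 0x378 is just a conventional example; turning bits on requires CAP_SYS_RAWIO):

	#include <stdio.h>
	#include <sys/io.h>

	int main(void)
	{
		if (ioperm(0x378, 3, 1)) {	/* ports 0x378..0x37a, turn_on = 1 */
			perror("ioperm");
			return 1;
		}
		outb(0x55, 0x378);		/* write a byte to the data register */
		ioperm(0x378, 3, 0);		/* drop the permission again */
		return 0;
	}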
diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c
new file mode 100644
index 000000000000..653efa30b0f4
--- /dev/null
+++ b/arch/x86/kernel/ioport_64.c
@@ -0,0 +1,119 @@
1/*
2 * linux/arch/x86_64/kernel/ioport.c
3 *
4 * This contains the io-permission bitmap code - written by obz, with changes
5 * by Linus.
6 */
7
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/capability.h>
11#include <linux/errno.h>
12#include <linux/types.h>
13#include <linux/ioport.h>
14#include <linux/smp.h>
15#include <linux/stddef.h>
16#include <linux/slab.h>
17#include <linux/thread_info.h>
18#include <linux/syscalls.h>
19
20/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
21static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
22{
23 int i;
24 if (new_value)
25 for (i = base; i < base + extent; i++)
26 __set_bit(i, bitmap);
27 else
28 for (i = base; i < base + extent; i++)
29 clear_bit(i, bitmap);
30}
31
32/*
33 * this changes the io permissions bitmap in the current task.
34 */
35asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
36{
37 unsigned int i, max_long, bytes, bytes_updated;
38 struct thread_struct * t = &current->thread;
39 struct tss_struct * tss;
40 unsigned long *bitmap;
41
42 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
43 return -EINVAL;
44 if (turn_on && !capable(CAP_SYS_RAWIO))
45 return -EPERM;
46
47 /*
48 * If it's the first ioperm() call in this thread's lifetime, set the
49	 * IO bitmap up. ioperm() is much less timing critical than clone();
50 * this is why we delay this operation until now:
51 */
52 if (!t->io_bitmap_ptr) {
53 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
54 if (!bitmap)
55 return -ENOMEM;
56
57 memset(bitmap, 0xff, IO_BITMAP_BYTES);
58 t->io_bitmap_ptr = bitmap;
59 set_thread_flag(TIF_IO_BITMAP);
60 }
61
62 /*
63 * do it in the per-thread copy and in the TSS ...
64 *
65 * Disable preemption via get_cpu() - we must not switch away
66 * because the ->io_bitmap_max value must match the bitmap
67 * contents:
68 */
69 tss = &per_cpu(init_tss, get_cpu());
70
71 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
72
73 /*
74 * Search for a (possibly new) maximum. This is simple and stupid,
75 * to keep it obviously correct:
76 */
77 max_long = 0;
78 for (i = 0; i < IO_BITMAP_LONGS; i++)
79 if (t->io_bitmap_ptr[i] != ~0UL)
80 max_long = i;
81
82 bytes = (max_long + 1) * sizeof(long);
83 bytes_updated = max(bytes, t->io_bitmap_max);
84
85 t->io_bitmap_max = bytes;
86
87 /* Update the TSS: */
88 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
89
90 put_cpu();
91
92 return 0;
93}
94
95/*
96 * sys_iopl has to be used when you want to access the IO ports
97 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
98 * you'd need 8kB of bitmaps/process, which is a bit excessive.
99 *
100 * Here we just change the eflags value on the stack: we allow
101 * only the super-user to do it. This depends on the stack-layout
102 * on system-call entry - see also fork() and the signal handling
103 * code.
104 */
105
106asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
107{
108 unsigned int old = (regs->eflags >> 12) & 3;
109
110 if (level > 3)
111 return -EINVAL;
112 /* Trying to gain more privileges? */
113 if (level > old) {
114 if (!capable(CAP_SYS_RAWIO))
115 return -EPERM;
116 }
117 regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
118 return 0;
119}
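As the comment above notes, iopl() is the user-space route when ports beyond the ioperm() bitmap range are needed, since it raises the I/O privilege level for every port at once. A minimal illustrative sketch using the glibc iopl(3) wrapper (requires CAP_SYS_RAWIO):

	#include <stdio.h>
	#include <sys/io.h>

	int main(void)
	{
		if (iopl(3)) {			/* grant access to all 65536 ports */
			perror("iopl");
			return 1;
		}
		/* inb()/outb() may now touch any port directly. */
		iopl(0);			/* drop the elevated level again */
		return 0;
	}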
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
new file mode 100644
index 000000000000..4f681bcdb1fc
--- /dev/null
+++ b/arch/x86/kernel/irq_32.c
@@ -0,0 +1,341 @@
1/*
2 * linux/arch/i386/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the lowest level x86-specific interrupt
7 * entry, irq-stacks and irq statistics code. All the remaining
8 * irq logic is done by the generic kernel/irq/ code and
9 * by the x86-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.)
11 */
12
13#include <linux/module.h>
14#include <linux/seq_file.h>
15#include <linux/interrupt.h>
16#include <linux/kernel_stat.h>
17#include <linux/notifier.h>
18#include <linux/cpu.h>
19#include <linux/delay.h>
20
21#include <asm/apic.h>
22#include <asm/uaccess.h>
23
24DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
25EXPORT_PER_CPU_SYMBOL(irq_stat);
26
27DEFINE_PER_CPU(struct pt_regs *, irq_regs);
28EXPORT_PER_CPU_SYMBOL(irq_regs);
29
30/*
31 * 'what should we do if we get a hw irq event on an illegal vector'.
32 * Each architecture has to answer this itself.
33 */
34void ack_bad_irq(unsigned int irq)
35{
36 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
37
38#ifdef CONFIG_X86_LOCAL_APIC
39 /*
40 * Currently unexpected vectors happen only on SMP and APIC.
41 * We _must_ ack these because every local APIC has only N
42 * irq slots per priority level, and a 'hanging, unacked' IRQ
43 * holds up an irq slot - in excessive cases (when multiple
44 * unexpected vectors occur) that might lock up the APIC
45 * completely.
46 * But only ack when the APIC is enabled -AK
47 */
48 if (cpu_has_apic)
49 ack_APIC_irq();
50#endif
51}
52
53#ifdef CONFIG_4KSTACKS
54/*
55 * per-CPU IRQ handling contexts (thread information and stack)
56 */
57union irq_ctx {
58 struct thread_info tinfo;
59 u32 stack[THREAD_SIZE/sizeof(u32)];
60};
61
62static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
63static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
64#endif
65
66/*
67 * do_IRQ handles all normal device IRQ's (the special
68 * SMP cross-CPU interrupts have their own specific
69 * handlers).
70 */
71fastcall unsigned int do_IRQ(struct pt_regs *regs)
72{
73 struct pt_regs *old_regs;
74 /* high bit used in ret_from_ code */
75 int irq = ~regs->orig_eax;
76 struct irq_desc *desc = irq_desc + irq;
77#ifdef CONFIG_4KSTACKS
78 union irq_ctx *curctx, *irqctx;
79 u32 *isp;
80#endif
81
82 if (unlikely((unsigned)irq >= NR_IRQS)) {
83 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
84 __FUNCTION__, irq);
85 BUG();
86 }
87
88 old_regs = set_irq_regs(regs);
89 irq_enter();
90#ifdef CONFIG_DEBUG_STACKOVERFLOW
91 /* Debugging check for stack overflow: is there less than 1KB free? */
92 {
93 long esp;
94
95 __asm__ __volatile__("andl %%esp,%0" :
96 "=r" (esp) : "0" (THREAD_SIZE - 1));
97 if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
98 printk("do_IRQ: stack overflow: %ld\n",
99 esp - sizeof(struct thread_info));
100 dump_stack();
101 }
102 }
103#endif
104
105#ifdef CONFIG_4KSTACKS
106
107 curctx = (union irq_ctx *) current_thread_info();
108 irqctx = hardirq_ctx[smp_processor_id()];
109
110 /*
111 * this is where we switch to the IRQ stack. However, if we are
112 * already using the IRQ stack (because we interrupted a hardirq
113 * handler) we can't do that and just have to keep using the
114 * current stack (which is the irq stack already after all)
115 */
116 if (curctx != irqctx) {
117 int arg1, arg2, ebx;
118
119 /* build the stack frame on the IRQ stack */
120 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
121 irqctx->tinfo.task = curctx->tinfo.task;
122 irqctx->tinfo.previous_esp = current_stack_pointer;
123
124 /*
125 * Copy the softirq bits in preempt_count so that the
126 * softirq checks work in the hardirq context.
127 */
128 irqctx->tinfo.preempt_count =
129 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
130 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
131
132 asm volatile(
133 " xchgl %%ebx,%%esp \n"
134 " call *%%edi \n"
135 " movl %%ebx,%%esp \n"
136 : "=a" (arg1), "=d" (arg2), "=b" (ebx)
137 : "0" (irq), "1" (desc), "2" (isp),
138 "D" (desc->handle_irq)
139 : "memory", "cc"
140 );
141 } else
142#endif
143 desc->handle_irq(irq, desc);
144
145 irq_exit();
146 set_irq_regs(old_regs);
147 return 1;
148}
149
150#ifdef CONFIG_4KSTACKS
151
152static char softirq_stack[NR_CPUS * THREAD_SIZE]
153 __attribute__((__section__(".bss.page_aligned")));
154
155static char hardirq_stack[NR_CPUS * THREAD_SIZE]
156 __attribute__((__section__(".bss.page_aligned")));
157
158/*
159 * allocate per-cpu stacks for hardirq and for softirq processing
160 */
161void irq_ctx_init(int cpu)
162{
163 union irq_ctx *irqctx;
164
165 if (hardirq_ctx[cpu])
166 return;
167
168 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
169 irqctx->tinfo.task = NULL;
170 irqctx->tinfo.exec_domain = NULL;
171 irqctx->tinfo.cpu = cpu;
172 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
173 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
174
175 hardirq_ctx[cpu] = irqctx;
176
177 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
178 irqctx->tinfo.task = NULL;
179 irqctx->tinfo.exec_domain = NULL;
180 irqctx->tinfo.cpu = cpu;
181 irqctx->tinfo.preempt_count = 0;
182 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
183
184 softirq_ctx[cpu] = irqctx;
185
186 printk("CPU %u irqstacks, hard=%p soft=%p\n",
187 cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
188}
189
190void irq_ctx_exit(int cpu)
191{
192 hardirq_ctx[cpu] = NULL;
193}
194
195extern asmlinkage void __do_softirq(void);
196
197asmlinkage void do_softirq(void)
198{
199 unsigned long flags;
200 struct thread_info *curctx;
201 union irq_ctx *irqctx;
202 u32 *isp;
203
204 if (in_interrupt())
205 return;
206
207 local_irq_save(flags);
208
209 if (local_softirq_pending()) {
210 curctx = current_thread_info();
211 irqctx = softirq_ctx[smp_processor_id()];
212 irqctx->tinfo.task = curctx->task;
213 irqctx->tinfo.previous_esp = current_stack_pointer;
214
215 /* build the stack frame on the softirq stack */
216 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
217
218 asm volatile(
219 " xchgl %%ebx,%%esp \n"
220 " call __do_softirq \n"
221 " movl %%ebx,%%esp \n"
222 : "=b"(isp)
223 : "0"(isp)
224 : "memory", "cc", "edx", "ecx", "eax"
225 );
226 /*
227		 * Shouldn't happen; we returned above if in_interrupt():
228 */
229 WARN_ON_ONCE(softirq_count());
230 }
231
232 local_irq_restore(flags);
233}
234#endif
235
236/*
237 * Interrupt statistics:
238 */
239
240atomic_t irq_err_count;
241
242/*
243 * /proc/interrupts printing:
244 */
245
246int show_interrupts(struct seq_file *p, void *v)
247{
248 int i = *(loff_t *) v, j;
249 struct irqaction * action;
250 unsigned long flags;
251
252 if (i == 0) {
253 seq_printf(p, " ");
254 for_each_online_cpu(j)
255 seq_printf(p, "CPU%-8d",j);
256 seq_putc(p, '\n');
257 }
258
259 if (i < NR_IRQS) {
260 spin_lock_irqsave(&irq_desc[i].lock, flags);
261 action = irq_desc[i].action;
262 if (!action)
263 goto skip;
264 seq_printf(p, "%3d: ",i);
265#ifndef CONFIG_SMP
266 seq_printf(p, "%10u ", kstat_irqs(i));
267#else
268 for_each_online_cpu(j)
269 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
270#endif
271 seq_printf(p, " %8s", irq_desc[i].chip->name);
272 seq_printf(p, "-%-8s", irq_desc[i].name);
273 seq_printf(p, " %s", action->name);
274
275 for (action=action->next; action; action = action->next)
276 seq_printf(p, ", %s", action->name);
277
278 seq_putc(p, '\n');
279skip:
280 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
281 } else if (i == NR_IRQS) {
282 seq_printf(p, "NMI: ");
283 for_each_online_cpu(j)
284 seq_printf(p, "%10u ", nmi_count(j));
285 seq_putc(p, '\n');
286#ifdef CONFIG_X86_LOCAL_APIC
287 seq_printf(p, "LOC: ");
288 for_each_online_cpu(j)
289 seq_printf(p, "%10u ",
290 per_cpu(irq_stat,j).apic_timer_irqs);
291 seq_putc(p, '\n');
292#endif
293 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
294#if defined(CONFIG_X86_IO_APIC)
295 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
296#endif
297 }
298 return 0;
299}
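The seq_printf() calls above produce the familiar /proc/interrupts layout: one row per IRQ with a count column per online CPU, then the chip name, flow-handler name and the attached actions, followed by the NMI/LOC/ERR summary rows. For orientation, the output on a two-CPU machine looks roughly like this (counts and device names are invented for illustration):

	           CPU0       CPU1
	  0:     123456          0   IO-APIC-edge      timer
	 14:       5012        311   IO-APIC-edge      ide0
	NMI:          0          0
	LOC:     123001     122998
	ERR:          0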
300
301#ifdef CONFIG_HOTPLUG_CPU
302#include <mach_apic.h>
303
304void fixup_irqs(cpumask_t map)
305{
306 unsigned int irq;
307 static int warned;
308
309 for (irq = 0; irq < NR_IRQS; irq++) {
310 cpumask_t mask;
311 if (irq == 2)
312 continue;
313
314 cpus_and(mask, irq_desc[irq].affinity, map);
315 if (any_online_cpu(mask) == NR_CPUS) {
316 printk("Breaking affinity for irq %i\n", irq);
317 mask = map;
318 }
319 if (irq_desc[irq].chip->set_affinity)
320 irq_desc[irq].chip->set_affinity(irq, mask);
321 else if (irq_desc[irq].action && !(warned++))
322 printk("Cannot set affinity for irq %i\n", irq);
323 }
324
325#if 0
326 barrier();
327 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
328 [note the nop - the interrupt-enable boundary on x86 is two
329 instructions from sti] - to flush out pending hardirqs and
330 IPIs. After this point nothing is supposed to reach this CPU." */
331 __asm__ __volatile__("sti; nop; cli");
332 barrier();
333#else
334 /* That doesn't seem sufficient. Give it 1ms. */
335 local_irq_enable();
336 mdelay(1);
337 local_irq_disable();
338#endif
339}
340#endif
341
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
new file mode 100644
index 000000000000..bd11e42b22bf
--- /dev/null
+++ b/arch/x86/kernel/irq_64.c
@@ -0,0 +1,212 @@
1/*
2 * linux/arch/x86_64/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the lowest level x86_64-specific interrupt
7 * entry and irq statistics code. All the remaining irq logic is
8 * done by the generic kernel/irq/ code and in the
9 * x86_64-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.)
11 */
12
13#include <linux/kernel_stat.h>
14#include <linux/interrupt.h>
15#include <linux/seq_file.h>
16#include <linux/module.h>
17#include <linux/delay.h>
18#include <asm/uaccess.h>
19#include <asm/io_apic.h>
20#include <asm/idle.h>
21#include <asm/smp.h>
22
23atomic_t irq_err_count;
24
25#ifdef CONFIG_DEBUG_STACKOVERFLOW
26/*
27 * Probabilistic stack overflow check:
28 *
29 * Only check the stack in process context, because everything else
30 * runs on the big interrupt stacks. Checking reliably is too expensive,
31 * so we just check from interrupts.
32 */
33static inline void stack_overflow_check(struct pt_regs *regs)
34{
35 u64 curbase = (u64)task_stack_page(current);
36 static unsigned long warned = -60*HZ;
37
38 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
39 regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
40 time_after(jiffies, warned + 60*HZ)) {
41 printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
42 current->comm, curbase, regs->rsp);
43 show_stack(NULL,NULL);
44 warned = jiffies;
45 }
46}
47#endif
48
49/*
50 * Generic, controller-independent functions:
51 */
52
53int show_interrupts(struct seq_file *p, void *v)
54{
55 int i = *(loff_t *) v, j;
56 struct irqaction * action;
57 unsigned long flags;
58
59 if (i == 0) {
60 seq_printf(p, " ");
61 for_each_online_cpu(j)
62 seq_printf(p, "CPU%-8d",j);
63 seq_putc(p, '\n');
64 }
65
66 if (i < NR_IRQS) {
67 spin_lock_irqsave(&irq_desc[i].lock, flags);
68 action = irq_desc[i].action;
69 if (!action)
70 goto skip;
71 seq_printf(p, "%3d: ",i);
72#ifndef CONFIG_SMP
73 seq_printf(p, "%10u ", kstat_irqs(i));
74#else
75 for_each_online_cpu(j)
76 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
77#endif
78 seq_printf(p, " %8s", irq_desc[i].chip->name);
79 seq_printf(p, "-%-8s", irq_desc[i].name);
80
81 seq_printf(p, " %s", action->name);
82 for (action=action->next; action; action = action->next)
83 seq_printf(p, ", %s", action->name);
84 seq_putc(p, '\n');
85skip:
86 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
87 } else if (i == NR_IRQS) {
88 seq_printf(p, "NMI: ");
89 for_each_online_cpu(j)
90 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
91 seq_putc(p, '\n');
92 seq_printf(p, "LOC: ");
93 for_each_online_cpu(j)
94 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
95 seq_putc(p, '\n');
96 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
97 }
98 return 0;
99}
100
101/*
102 * do_IRQ handles all normal device IRQ's (the special
103 * SMP cross-CPU interrupts have their own specific
104 * handlers).
105 */
106asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
107{
108 struct pt_regs *old_regs = set_irq_regs(regs);
109
110 /* high bit used in ret_from_ code */
111 unsigned vector = ~regs->orig_rax;
112 unsigned irq;
113
114 exit_idle();
115 irq_enter();
116 irq = __get_cpu_var(vector_irq)[vector];
117
118#ifdef CONFIG_DEBUG_STACKOVERFLOW
119 stack_overflow_check(regs);
120#endif
121
122 if (likely(irq < NR_IRQS))
123 generic_handle_irq(irq);
124 else {
125 if (!disable_apic)
126 ack_APIC_irq();
127
128 if (printk_ratelimit())
129 printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
130 __func__, smp_processor_id(), vector);
131 }
132
133 irq_exit();
134
135 set_irq_regs(old_regs);
136 return 1;
137}
138
139#ifdef CONFIG_HOTPLUG_CPU
140void fixup_irqs(cpumask_t map)
141{
142 unsigned int irq;
143 static int warned;
144
145 for (irq = 0; irq < NR_IRQS; irq++) {
146 cpumask_t mask;
147 int break_affinity = 0;
148 int set_affinity = 1;
149
150 if (irq == 2)
151 continue;
152
153		/* interrupts are disabled at this point */
154 spin_lock(&irq_desc[irq].lock);
155
156 if (!irq_has_action(irq) ||
157 cpus_equal(irq_desc[irq].affinity, map)) {
158 spin_unlock(&irq_desc[irq].lock);
159 continue;
160 }
161
162 cpus_and(mask, irq_desc[irq].affinity, map);
163 if (cpus_empty(mask)) {
164 break_affinity = 1;
165 mask = map;
166 }
167
168 if (irq_desc[irq].chip->mask)
169 irq_desc[irq].chip->mask(irq);
170
171 if (irq_desc[irq].chip->set_affinity)
172 irq_desc[irq].chip->set_affinity(irq, mask);
173 else if (!(warned++))
174 set_affinity = 0;
175
176 if (irq_desc[irq].chip->unmask)
177 irq_desc[irq].chip->unmask(irq);
178
179 spin_unlock(&irq_desc[irq].lock);
180
181 if (break_affinity && set_affinity)
182 printk("Broke affinity for irq %i\n", irq);
183 else if (!set_affinity)
184 printk("Cannot set affinity for irq %i\n", irq);
185 }
186
187 /* That doesn't seem sufficient. Give it 1ms. */
188 local_irq_enable();
189 mdelay(1);
190 local_irq_disable();
191}
192#endif
193
194extern void call_softirq(void);
195
196asmlinkage void do_softirq(void)
197{
198 __u32 pending;
199 unsigned long flags;
200
201 if (in_interrupt())
202 return;
203
204 local_irq_save(flags);
205 pending = local_softirq_pending();
206 /* Switch to interrupt stack */
207 if (pending) {
208 call_softirq();
209 WARN_ON_ONCE(softirq_count());
210 }
211 local_irq_restore(flags);
212}
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
new file mode 100644
index 000000000000..7377ccb21335
--- /dev/null
+++ b/arch/x86/kernel/k8.c
@@ -0,0 +1,123 @@
1/*
2 * Shared support code for AMD K8 northbridges and derivatives.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */
5#include <linux/gfp.h>
6#include <linux/types.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <asm/k8.h>
12
13int num_k8_northbridges;
14EXPORT_SYMBOL(num_k8_northbridges);
15
16static u32 *flush_words;
17
18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
21 {}
22};
23EXPORT_SYMBOL(k8_nb_ids);
24
25struct pci_dev **k8_northbridges;
26EXPORT_SYMBOL(k8_northbridges);
27
28static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
29{
30 do {
31 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
32 if (!dev)
33 break;
34 } while (!pci_match_id(&k8_nb_ids[0], dev));
35 return dev;
36}
37
38int cache_k8_northbridges(void)
39{
40 int i;
41 struct pci_dev *dev;
42
43 if (num_k8_northbridges)
44 return 0;
45
46 dev = NULL;
47 while ((dev = next_k8_northbridge(dev)) != NULL)
48 num_k8_northbridges++;
49
50 k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
51 GFP_KERNEL);
52 if (!k8_northbridges)
53 return -ENOMEM;
54
55 if (!num_k8_northbridges) {
56 k8_northbridges[0] = NULL;
57 return 0;
58 }
59
60 flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
61 if (!flush_words) {
62 kfree(k8_northbridges);
63 return -ENOMEM;
64 }
65
66 dev = NULL;
67 i = 0;
68 while ((dev = next_k8_northbridge(dev)) != NULL) {
69 k8_northbridges[i] = dev;
70 pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
71 }
72 k8_northbridges[i] = NULL;
73 return 0;
74}
75EXPORT_SYMBOL_GPL(cache_k8_northbridges);
76
77/* Ignores subdevice/subvendor but as far as I can figure out
78   they're useless anyway */
79int __init early_is_k8_nb(u32 device)
80{
81 struct pci_device_id *id;
82 u32 vendor = device & 0xffff;
83 device >>= 16;
84 for (id = k8_nb_ids; id->vendor; id++)
85 if (vendor == id->vendor && device == id->device)
86 return 1;
87 return 0;
88}
89
90void k8_flush_garts(void)
91{
92 int flushed, i;
93 unsigned long flags;
94 static DEFINE_SPINLOCK(gart_lock);
95
96 /* Avoid races between AGP and IOMMU. In theory it's not needed
97 but I'm not sure if the hardware won't lose flush requests
98	   when another is pending. This whole thing is so expensive anyway
99 that it doesn't matter to serialize more. -AK */
100 spin_lock_irqsave(&gart_lock, flags);
101 flushed = 0;
102 for (i = 0; i < num_k8_northbridges; i++) {
103 pci_write_config_dword(k8_northbridges[i], 0x9c,
104 flush_words[i]|1);
105 flushed++;
106 }
107 for (i = 0; i < num_k8_northbridges; i++) {
108 u32 w;
109			/* Make sure the hardware actually executed the flush */
110 for (;;) {
111 pci_read_config_dword(k8_northbridges[i],
112 0x9c, &w);
113 if (!(w & 1))
114 break;
115 cpu_relax();
116 }
117 }
118 spin_unlock_irqrestore(&gart_lock, flags);
119 if (!flushed)
120 printk("nothing to flush?\n");
121}
122EXPORT_SYMBOL_GPL(k8_flush_garts);
123
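Users of this file, such as the GART IOMMU and AGP code, are expected to call cache_k8_northbridges() once and then work with the cached array. A hedged sketch of that calling pattern follows (my_gart_init() and its body are hypothetical; only the cache_k8_northbridges()/k8_northbridges/num_k8_northbridges/k8_flush_garts() interfaces come from the code above):

	#include <linux/pci.h>
	#include <linux/errno.h>
	#include <asm/k8.h>

	static int __init my_gart_init(void)
	{
		int i;

		if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
			return -ENODEV;			/* no K8 northbridge found */

		for (i = 0; i < num_k8_northbridges; i++) {
			struct pci_dev *nb = k8_northbridges[i];
			/* program per-northbridge GART registers on nb here */
		}

		k8_flush_garts();			/* make the new mappings visible */
		return 0;
	}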
diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c
new file mode 100644
index 000000000000..448a50b1324c
--- /dev/null
+++ b/arch/x86/kernel/kprobes_32.c
@@ -0,0 +1,751 @@
1/*
2 * Kernel Probes (KProbes)
3 * arch/i386/kernel/kprobes.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation ( includes contributions from
23 * Rusty Russell).
24 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
25 * interface to access function arguments.
26 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
27 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
28 * <prasanna@in.ibm.com> added function-return probes.
29 */
30
31#include <linux/kprobes.h>
32#include <linux/ptrace.h>
33#include <linux/preempt.h>
34#include <linux/kdebug.h>
35#include <asm/cacheflush.h>
36#include <asm/desc.h>
37#include <asm/uaccess.h>
38#include <asm/alternative.h>
39
40void jprobe_return_end(void);
41
42DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
43DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
44
45/* insert a jmp code */
46static __always_inline void set_jmp_op(void *from, void *to)
47{
48 struct __arch_jmp_op {
49 char op;
50 long raddr;
51 } __attribute__((packed)) *jop;
52 jop = (struct __arch_jmp_op *)from;
53 jop->raddr = (long)(to) - ((long)(from) + 5);
54 jop->op = RELATIVEJUMP_INSTRUCTION;
55}
56
57/*
58 * returns non-zero if opcodes can be boosted.
59 */
60static __always_inline int can_boost(kprobe_opcode_t *opcodes)
61{
62#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
63 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
64 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
65 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
66 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
67 << (row % 32))
68 /*
69	 * Undefined/reserved opcodes, conditional jumps, Opcode Extension
70	 * Groups, and some special opcodes cannot be boosted.
71 */
72 static const unsigned long twobyte_is_boostable[256 / 32] = {
73 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
74 /* ------------------------------- */
75 W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
76 W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
77 W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
78 W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
79 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
80 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
81 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
82 W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
83 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
84 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
85 W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
86 W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
87 W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
88 W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
89 W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
90 W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */
91 /* ------------------------------- */
92 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
93 };
94#undef W
95 kprobe_opcode_t opcode;
96 kprobe_opcode_t *orig_opcodes = opcodes;
97retry:
98 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
99 return 0;
100 opcode = *(opcodes++);
101
102 /* 2nd-byte opcode */
103 if (opcode == 0x0f) {
104 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
105 return 0;
106 return test_bit(*opcodes, twobyte_is_boostable);
107 }
108
109 switch (opcode & 0xf0) {
110 case 0x60:
111 if (0x63 < opcode && opcode < 0x67)
112 goto retry; /* prefixes */
113 /* can't boost Address-size override and bound */
114 return (opcode != 0x62 && opcode != 0x67);
115 case 0x70:
116 return 0; /* can't boost conditional jump */
117 case 0xc0:
118		/* can't boost software interrupts */
119 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
120 case 0xd0:
121 /* can boost AA* and XLAT */
122 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
123 case 0xe0:
124 /* can boost in/out and absolute jmps */
125 return ((opcode & 0x04) || opcode == 0xea);
126 case 0xf0:
127 if ((opcode & 0x0c) == 0 && opcode != 0xf1)
128 goto retry; /* lock/rep(ne) prefix */
129		/* flag-clearing and flag-setting instructions can be boosted */
130 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
131 default:
132 if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
133 goto retry; /* prefixes */
134 /* can't boost CS override and call */
135 return (opcode != 0x2e && opcode != 0x9a);
136 }
137}
138
139/*
140 * returns non-zero if opcode modifies the interrupt flag.
141 */
142static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
143{
144 switch (opcode) {
145 case 0xfa: /* cli */
146 case 0xfb: /* sti */
147 case 0xcf: /* iret/iretd */
148 case 0x9d: /* popf/popfd */
149 return 1;
150 }
151 return 0;
152}
153
154int __kprobes arch_prepare_kprobe(struct kprobe *p)
155{
156 /* insn: must be on special executable page on i386. */
157 p->ainsn.insn = get_insn_slot();
158 if (!p->ainsn.insn)
159 return -ENOMEM;
160
161 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
162 p->opcode = *p->addr;
163 if (can_boost(p->addr)) {
164 p->ainsn.boostable = 0;
165 } else {
166 p->ainsn.boostable = -1;
167 }
168 return 0;
169}
170
171void __kprobes arch_arm_kprobe(struct kprobe *p)
172{
173 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
174}
175
176void __kprobes arch_disarm_kprobe(struct kprobe *p)
177{
178 text_poke(p->addr, &p->opcode, 1);
179}
180
181void __kprobes arch_remove_kprobe(struct kprobe *p)
182{
183 mutex_lock(&kprobe_mutex);
184 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
185 mutex_unlock(&kprobe_mutex);
186}
187
188static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
189{
190 kcb->prev_kprobe.kp = kprobe_running();
191 kcb->prev_kprobe.status = kcb->kprobe_status;
192 kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
193 kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
194}
195
196static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
197{
198 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
199 kcb->kprobe_status = kcb->prev_kprobe.status;
200 kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
201 kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
202}
203
204static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
205 struct kprobe_ctlblk *kcb)
206{
207 __get_cpu_var(current_kprobe) = p;
208 kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags
209 = (regs->eflags & (TF_MASK | IF_MASK));
210 if (is_IF_modifier(p->opcode))
211 kcb->kprobe_saved_eflags &= ~IF_MASK;
212}
213
214static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
215{
216 regs->eflags |= TF_MASK;
217 regs->eflags &= ~IF_MASK;
218	/* single step inline if the instruction is an int3 */
219 if (p->opcode == BREAKPOINT_INSTRUCTION)
220 regs->eip = (unsigned long)p->addr;
221 else
222 regs->eip = (unsigned long)p->ainsn.insn;
223}
224
225/* Called with kretprobe_lock held */
226void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
227 struct pt_regs *regs)
228{
229 unsigned long *sara = (unsigned long *)&regs->esp;
230
231 ri->ret_addr = (kprobe_opcode_t *) *sara;
232
233 /* Replace the return addr with trampoline addr */
234 *sara = (unsigned long) &kretprobe_trampoline;
235}
236
237/*
238 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
239 * remain disabled throughout this function.
240 */
241static int __kprobes kprobe_handler(struct pt_regs *regs)
242{
243 struct kprobe *p;
244 int ret = 0;
245 kprobe_opcode_t *addr;
246 struct kprobe_ctlblk *kcb;
247
248 addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
249
250 /*
251 * We don't want to be preempted for the entire
252 * duration of kprobe processing
253 */
254 preempt_disable();
255 kcb = get_kprobe_ctlblk();
256
257 /* Check we're not actually recursing */
258 if (kprobe_running()) {
259 p = get_kprobe(addr);
260 if (p) {
261 if (kcb->kprobe_status == KPROBE_HIT_SS &&
262 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
263 regs->eflags &= ~TF_MASK;
264 regs->eflags |= kcb->kprobe_saved_eflags;
265 goto no_kprobe;
266 }
267 /* We have reentered the kprobe_handler(), since
268 * another probe was hit while within the handler.
269			 * Here we save the original kprobes variables and
270 * just single step on the instruction of the new probe
271 * without calling any user handlers.
272 */
273 save_previous_kprobe(kcb);
274 set_current_kprobe(p, regs, kcb);
275 kprobes_inc_nmissed_count(p);
276 prepare_singlestep(p, regs);
277 kcb->kprobe_status = KPROBE_REENTER;
278 return 1;
279 } else {
280 if (*addr != BREAKPOINT_INSTRUCTION) {
281 /* The breakpoint instruction was removed by
282				 * another cpu right after we hit it; no further
283				 * handling of this interrupt is appropriate.
284 */
285 regs->eip -= sizeof(kprobe_opcode_t);
286 ret = 1;
287 goto no_kprobe;
288 }
289 p = __get_cpu_var(current_kprobe);
290 if (p->break_handler && p->break_handler(p, regs)) {
291 goto ss_probe;
292 }
293 }
294 goto no_kprobe;
295 }
296
297 p = get_kprobe(addr);
298 if (!p) {
299 if (*addr != BREAKPOINT_INSTRUCTION) {
300 /*
301 * The breakpoint instruction was removed right
302 * after we hit it. Another cpu has removed
303 * either a probepoint or a debugger breakpoint
304 * at this address. In either case, no further
305 * handling of this interrupt is appropriate.
306 * Back up over the (now missing) int3 and run
307 * the original instruction.
308 */
309 regs->eip -= sizeof(kprobe_opcode_t);
310 ret = 1;
311 }
312 /* Not one of ours: let kernel handle it */
313 goto no_kprobe;
314 }
315
316 set_current_kprobe(p, regs, kcb);
317 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
318
319 if (p->pre_handler && p->pre_handler(p, regs))
320 /* handler has already set things up, so skip ss setup */
321 return 1;
322
323ss_probe:
324#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
325 if (p->ainsn.boostable == 1 && !p->post_handler){
326 /* Boost up -- we can execute copied instructions directly */
327 reset_current_kprobe();
328 regs->eip = (unsigned long)p->ainsn.insn;
329 preempt_enable_no_resched();
330 return 1;
331 }
332#endif
333 prepare_singlestep(p, regs);
334 kcb->kprobe_status = KPROBE_HIT_SS;
335 return 1;
336
337no_kprobe:
338 preempt_enable_no_resched();
339 return ret;
340}
341
342/*
343 * For function-return probes, init_kprobes() establishes a probepoint
344 * here. When a retprobed function returns, this probe is hit and
345 * trampoline_probe_handler() runs, calling the kretprobe's handler.
346 */
347 void __kprobes kretprobe_trampoline_holder(void)
348 {
349 asm volatile ( ".global kretprobe_trampoline\n"
350 "kretprobe_trampoline: \n"
351 " pushf\n"
352 /* skip cs, eip, orig_eax */
353 " subl $12, %esp\n"
354 " pushl %fs\n"
355 " pushl %ds\n"
356 " pushl %es\n"
357 " pushl %eax\n"
358 " pushl %ebp\n"
359 " pushl %edi\n"
360 " pushl %esi\n"
361 " pushl %edx\n"
362 " pushl %ecx\n"
363 " pushl %ebx\n"
364 " movl %esp, %eax\n"
365 " call trampoline_handler\n"
366 /* move eflags to cs */
367 " movl 52(%esp), %edx\n"
368 " movl %edx, 48(%esp)\n"
369 /* save true return address on eflags */
370 " movl %eax, 52(%esp)\n"
371 " popl %ebx\n"
372 " popl %ecx\n"
373 " popl %edx\n"
374 " popl %esi\n"
375 " popl %edi\n"
376 " popl %ebp\n"
377 " popl %eax\n"
378 /* skip eip, orig_eax, es, ds, fs */
379 " addl $20, %esp\n"
380 " popf\n"
381 " ret\n");
382}
383
384/*
385 * Called from kretprobe_trampoline
386 */
387fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
388{
389 struct kretprobe_instance *ri = NULL;
390 struct hlist_head *head, empty_rp;
391 struct hlist_node *node, *tmp;
392 unsigned long flags, orig_ret_address = 0;
393 unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
394
395 INIT_HLIST_HEAD(&empty_rp);
396 spin_lock_irqsave(&kretprobe_lock, flags);
397 head = kretprobe_inst_table_head(current);
398 /* fixup registers */
399 regs->xcs = __KERNEL_CS | get_kernel_rpl();
400 regs->eip = trampoline_address;
401 regs->orig_eax = 0xffffffff;
402
403 /*
404 * It is possible to have multiple instances associated with a given
405	 * task, either because multiple functions in the call path
406	 * have a return probe installed on them, and/or more than one
407	 * return probe was registered for a target function.
408 *
409 * We can handle this because:
410 * - instances are always inserted at the head of the list
411 * - when multiple return probes are registered for the same
412 * function, the first instance's ret_addr will point to the
413 * real return address, and all the rest will point to
414 * kretprobe_trampoline
415 */
416 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
417 if (ri->task != current)
418 /* another task is sharing our hash bucket */
419 continue;
420
421 if (ri->rp && ri->rp->handler){
422 __get_cpu_var(current_kprobe) = &ri->rp->kp;
423 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
424 ri->rp->handler(ri, regs);
425 __get_cpu_var(current_kprobe) = NULL;
426 }
427
428 orig_ret_address = (unsigned long)ri->ret_addr;
429 recycle_rp_inst(ri, &empty_rp);
430
431 if (orig_ret_address != trampoline_address)
432 /*
433 * This is the real return address. Any other
434 * instances associated with this task are for
435 * other calls deeper on the call stack
436 */
437 break;
438 }
439
440 kretprobe_assert(ri, orig_ret_address, trampoline_address);
441 spin_unlock_irqrestore(&kretprobe_lock, flags);
442
443 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
444 hlist_del(&ri->hlist);
445 kfree(ri);
446 }
447 return (void*)orig_ret_address;
448}
449
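The trampoline plumbing above stays hidden behind the generic kprobes API; a return probe is installed with register_kretprobe(). A minimal illustrative module fragment follows (the probed symbol "do_fork", the handler body and the maxactive value are example choices only, not anything mandated by this file):

	#include <linux/module.h>
	#include <linux/kprobes.h>

	static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
	{
		printk(KERN_INFO "probed function returned, eax=%lx\n", regs->eax);
		return 0;
	}

	static struct kretprobe my_kretprobe = {
		.handler	= my_ret_handler,
		.maxactive	= 20,		/* concurrent instances to track */
	};

	static int __init my_probe_init(void)
	{
		my_kretprobe.kp.symbol_name = "do_fork";
		return register_kretprobe(&my_kretprobe);
	}

	static void __exit my_probe_exit(void)
	{
		unregister_kretprobe(&my_kretprobe);
	}

	module_init(my_probe_init);
	module_exit(my_probe_exit);
	MODULE_LICENSE("GPL");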
450/*
451 * Called after single-stepping. p->addr is the address of the
452 * instruction whose first byte has been replaced by the "int 3"
453 * instruction. To avoid the SMP problems that can occur when we
454 * temporarily put back the original opcode to single-step, we
455 * single-stepped a copy of the instruction. The address of this
456 * copy is p->ainsn.insn.
457 *
458 * This function prepares to return from the post-single-step
459 * interrupt. We have to fix up the stack as follows:
460 *
461 * 0) Except in the case of absolute or indirect jump or call instructions,
462 * the new eip is relative to the copied instruction. We need to make
463 * it relative to the original instruction.
464 *
465 * 1) If the single-stepped instruction was pushfl, then the TF and IF
466 * flags are set in the just-pushed eflags, and may need to be cleared.
467 *
468 * 2) If the single-stepped instruction was a call, the return address
469 * that is atop the stack is the address following the copied instruction.
470 * We need to make it the address following the original instruction.
471 *
472 * This function also checks instruction size for preparing direct execution.
473 */
474static void __kprobes resume_execution(struct kprobe *p,
475 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
476{
477 unsigned long *tos = (unsigned long *)&regs->esp;
478 unsigned long copy_eip = (unsigned long)p->ainsn.insn;
479 unsigned long orig_eip = (unsigned long)p->addr;
480
481 regs->eflags &= ~TF_MASK;
482 switch (p->ainsn.insn[0]) {
483 case 0x9c: /* pushfl */
484 *tos &= ~(TF_MASK | IF_MASK);
485 *tos |= kcb->kprobe_old_eflags;
486 break;
487 case 0xc2: /* iret/ret/lret */
488 case 0xc3:
489 case 0xca:
490 case 0xcb:
491 case 0xcf:
492 case 0xea: /* jmp absolute -- eip is correct */
493 /* eip is already adjusted, no more changes required */
494 p->ainsn.boostable = 1;
495 goto no_change;
496 case 0xe8: /* call relative - Fix return addr */
497 *tos = orig_eip + (*tos - copy_eip);
498 break;
499 case 0x9a: /* call absolute -- same as call absolute, indirect */
500 *tos = orig_eip + (*tos - copy_eip);
501 goto no_change;
502 case 0xff:
503 if ((p->ainsn.insn[1] & 0x30) == 0x10) {
504 /*
505 * call absolute, indirect
506 * Fix return addr; eip is correct.
507 * But this is not boostable
508 */
509 *tos = orig_eip + (*tos - copy_eip);
510 goto no_change;
511 } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
512 ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
513 /* eip is correct. And this is boostable */
514 p->ainsn.boostable = 1;
515 goto no_change;
516 }
517 default:
518 break;
519 }
520
521 if (p->ainsn.boostable == 0) {
522 if ((regs->eip > copy_eip) &&
523 (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
524 /*
525			 * These instructions can be executed directly,
526			 * provided we jump back to the correct address.
527 */
528 set_jmp_op((void *)regs->eip,
529 (void *)orig_eip + (regs->eip - copy_eip));
530 p->ainsn.boostable = 1;
531 } else {
532 p->ainsn.boostable = -1;
533 }
534 }
535
536 regs->eip = orig_eip + (regs->eip - copy_eip);
537
538no_change:
539 return;
540}
541
542/*
543 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
544 * remain disabled throughout this function.
545 */
546static int __kprobes post_kprobe_handler(struct pt_regs *regs)
547{
548 struct kprobe *cur = kprobe_running();
549 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
550
551 if (!cur)
552 return 0;
553
554 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
555 kcb->kprobe_status = KPROBE_HIT_SSDONE;
556 cur->post_handler(cur, regs, 0);
557 }
558
559 resume_execution(cur, regs, kcb);
560 regs->eflags |= kcb->kprobe_saved_eflags;
561
562	/* Restore the original saved kprobes variables and continue. */
563 if (kcb->kprobe_status == KPROBE_REENTER) {
564 restore_previous_kprobe(kcb);
565 goto out;
566 }
567 reset_current_kprobe();
568out:
569 preempt_enable_no_resched();
570
571 /*
572 * if somebody else is singlestepping across a probe point, eflags
573 * will have TF set, in which case, continue the remaining processing
574 * of do_debug, as if this is not a probe hit.
575 */
576 if (regs->eflags & TF_MASK)
577 return 0;
578
579 return 1;
580}
581
582static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
583{
584 struct kprobe *cur = kprobe_running();
585 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
586
587 switch(kcb->kprobe_status) {
588 case KPROBE_HIT_SS:
589 case KPROBE_REENTER:
590 /*
591 * We are here because the instruction being single
592		 * stepped caused a page fault. We reset the current
593		 * kprobe, point eip back to the probe address, and
594		 * allow the page fault handler to continue as a
595		 * normal page fault.
596 */
597 regs->eip = (unsigned long)cur->addr;
598 regs->eflags |= kcb->kprobe_old_eflags;
599 if (kcb->kprobe_status == KPROBE_REENTER)
600 restore_previous_kprobe(kcb);
601 else
602 reset_current_kprobe();
603 preempt_enable_no_resched();
604 break;
605 case KPROBE_HIT_ACTIVE:
606 case KPROBE_HIT_SSDONE:
607 /*
608		 * We increment the nmissed count for accounting;
609		 * we can also use the npre/npostfault counts for accounting
610		 * these specific fault cases.
611 */
612 kprobes_inc_nmissed_count(cur);
613
614 /*
615 * We come here because instructions in the pre/post
616		 * handler caused the page fault; this could happen
617		 * if the handler tries to access user space via
618		 * copy_from_user(), get_user(), etc. Let the
619 * user-specified handler try to fix it first.
620 */
621 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
622 return 1;
623
624 /*
625 * In case the user-specified fault handler returned
626 * zero, try to fix up.
627 */
628 if (fixup_exception(regs))
629 return 1;
630
631 /*
632 * fixup_exception() could not handle it,
633 * Let do_page_fault() fix it.
634 */
635 break;
636 default:
637 break;
638 }
639 return 0;
640}
641
642/*
643 * Wrapper routine for handling exceptions.
644 */
645int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
646 unsigned long val, void *data)
647{
648 struct die_args *args = (struct die_args *)data;
649 int ret = NOTIFY_DONE;
650
651 if (args->regs && user_mode_vm(args->regs))
652 return ret;
653
654 switch (val) {
655 case DIE_INT3:
656 if (kprobe_handler(args->regs))
657 ret = NOTIFY_STOP;
658 break;
659 case DIE_DEBUG:
660 if (post_kprobe_handler(args->regs))
661 ret = NOTIFY_STOP;
662 break;
663 case DIE_GPF:
664 case DIE_PAGE_FAULT:
665 /* kprobe_running() needs smp_processor_id() */
666 preempt_disable();
667 if (kprobe_running() &&
668 kprobe_fault_handler(args->regs, args->trapnr))
669 ret = NOTIFY_STOP;
670 preempt_enable();
671 break;
672 default:
673 break;
674 }
675 return ret;
676}
677
678int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
679{
680 struct jprobe *jp = container_of(p, struct jprobe, kp);
681 unsigned long addr;
682 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
683
684 kcb->jprobe_saved_regs = *regs;
685 kcb->jprobe_saved_esp = &regs->esp;
686 addr = (unsigned long)(kcb->jprobe_saved_esp);
687
688 /*
689 * TBD: As Linus pointed out, gcc assumes that the callee
690 * owns the argument space and could overwrite it, e.g.
691 * tailcall optimization. So, to be absolutely safe
692 * we also save and restore enough stack bytes to cover
693 * the argument area.
694 */
695 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
696 MIN_STACK_SIZE(addr));
697 regs->eflags &= ~IF_MASK;
698 regs->eip = (unsigned long)(jp->entry);
699 return 1;
700}
701
702void __kprobes jprobe_return(void)
703{
704 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
705
706 asm volatile (" xchgl %%ebx,%%esp \n"
707 " int3 \n"
708 " .globl jprobe_return_end \n"
709 " jprobe_return_end: \n"
710 " nop \n"::"b"
711 (kcb->jprobe_saved_esp):"memory");
712}
713
714int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
715{
716 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
717 u8 *addr = (u8 *) (regs->eip - 1);
718 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
719 struct jprobe *jp = container_of(p, struct jprobe, kp);
720
721 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
722 if (&regs->esp != kcb->jprobe_saved_esp) {
723 struct pt_regs *saved_regs =
724 container_of(kcb->jprobe_saved_esp,
725 struct pt_regs, esp);
726 printk("current esp %p does not match saved esp %p\n",
727 &regs->esp, kcb->jprobe_saved_esp);
728 printk("Saved registers for jprobe %p\n", jp);
729 show_registers(saved_regs);
730 printk("Current registers\n");
731 show_registers(regs);
732 BUG();
733 }
734 *regs = kcb->jprobe_saved_regs;
735 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
736 MIN_STACK_SIZE(stack_addr));
737 preempt_enable_no_resched();
738 return 1;
739 }
740 return 0;
741}
742
743int __kprobes arch_trampoline_kprobe(struct kprobe *p)
744{
745 return 0;
746}
747
748int __init arch_init_kprobes(void)
749{
750 return 0;
751}
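None of the arch_* hooks in this file are called by probe authors directly; they are driven by the generic kprobes core when a probe is registered. For orientation, a minimal illustrative entry-probe module of this era looks like the sketch below (the symbol "do_exit" and the handler body are examples only):

	#include <linux/module.h>
	#include <linux/kprobes.h>

	static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
	{
		printk(KERN_INFO "kprobe hit at %p, eip=%lx\n", p->addr, regs->eip);
		return 0;			/* let the probed instruction run */
	}

	static struct kprobe my_kprobe = {
		.symbol_name	= "do_exit",
		.pre_handler	= my_pre_handler,
	};

	static int __init my_kprobe_init(void)
	{
		return register_kprobe(&my_kprobe);	/* arms the int3 breakpoint */
	}

	static void __exit my_kprobe_exit(void)
	{
		unregister_kprobe(&my_kprobe);
	}

	module_init(my_kprobe_init);
	module_exit(my_kprobe_exit);
	MODULE_LICENSE("GPL");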
diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c
new file mode 100644
index 000000000000..a30e004682e2
--- /dev/null
+++ b/arch/x86/kernel/kprobes_64.c
@@ -0,0 +1,749 @@
1/*
2 * Kernel Probes (KProbes)
3 * arch/x86_64/kernel/kprobes.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation ( includes contributions from
23 * Rusty Russell).
24 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
25 * interface to access function arguments.
26 * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
27 * <prasanna@in.ibm.com> adapted for x86_64
28 * 2005-Mar Roland McGrath <roland@redhat.com>
29 * Fixed to handle %rip-relative addressing mode correctly.
30 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
31 * Added function return probes functionality
32 */
33
34#include <linux/kprobes.h>
35#include <linux/ptrace.h>
36#include <linux/string.h>
37#include <linux/slab.h>
38#include <linux/preempt.h>
39#include <linux/module.h>
40#include <linux/kdebug.h>
41
42#include <asm/pgtable.h>
43#include <asm/uaccess.h>
44#include <asm/alternative.h>
45
46void jprobe_return_end(void);
47static void __kprobes arch_copy_kprobe(struct kprobe *p);
48
49DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
50DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
51
52/*
53 * returns non-zero if opcode modifies the interrupt flag.
54 */
55static __always_inline int is_IF_modifier(kprobe_opcode_t *insn)
56{
57 switch (*insn) {
58 case 0xfa: /* cli */
59 case 0xfb: /* sti */
60 case 0xcf: /* iret/iretd */
61 case 0x9d: /* popf/popfd */
62 return 1;
63 }
64
65 if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
66 return 1;
67 return 0;
68}
69
70int __kprobes arch_prepare_kprobe(struct kprobe *p)
71{
72 /* insn: must be on special executable page on x86_64. */
73 p->ainsn.insn = get_insn_slot();
74 if (!p->ainsn.insn) {
75 return -ENOMEM;
76 }
77 arch_copy_kprobe(p);
78 return 0;
79}
80
81/*
82 * Determine if the instruction uses the %rip-relative addressing mode.
83 * If it does, return the address of the 32-bit displacement word.
84 * If not, return null.
85 */
86static s32 __kprobes *is_riprel(u8 *insn)
87{
88#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
89 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
90 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
91 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
92 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
93 << (row % 64))
94 static const u64 onebyte_has_modrm[256 / 64] = {
95 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
96 /* ------------------------------- */
97 W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
98 W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
99 W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
100 W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
101 W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
102 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
103 W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
104 W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
105 W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
106 W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
107 W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
108 W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
109 W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
110 W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
111 W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
112 W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
113 /* ------------------------------- */
114 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
115 };
116 static const u64 twobyte_has_modrm[256 / 64] = {
117 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
118 /* ------------------------------- */
119 W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
120 W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
121 W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
122 W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
123 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
124 W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
125 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
126 W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
127 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
128 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
129 W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
130 W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
131 W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
132 W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
133 W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
134 W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
135 /* ------------------------------- */
136 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
137 };
138#undef W
139 int need_modrm;
140
141 /* Skip legacy instruction prefixes. */
142 while (1) {
143 switch (*insn) {
144 case 0x66:
145 case 0x67:
146 case 0x2e:
147 case 0x3e:
148 case 0x26:
149 case 0x64:
150 case 0x65:
151 case 0x36:
152 case 0xf0:
153 case 0xf3:
154 case 0xf2:
155 ++insn;
156 continue;
157 }
158 break;
159 }
160
161 /* Skip REX instruction prefix. */
162 if ((*insn & 0xf0) == 0x40)
163 ++insn;
164
165 if (*insn == 0x0f) { /* Two-byte opcode. */
166 ++insn;
167 need_modrm = test_bit(*insn, twobyte_has_modrm);
168 } else { /* One-byte opcode. */
169 need_modrm = test_bit(*insn, onebyte_has_modrm);
170 }
171
172 if (need_modrm) {
173 u8 modrm = *++insn;
174 if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
175 /* Displacement follows ModRM byte. */
176 return (s32 *) ++insn;
177 }
178 }
179
180 /* No %rip-relative addressing mode here. */
181 return NULL;
182}
183
184static void __kprobes arch_copy_kprobe(struct kprobe *p)
185{
186 s32 *ripdisp;
187 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
188 ripdisp = is_riprel(p->ainsn.insn);
189 if (ripdisp) {
190 /*
191 * The copied instruction uses the %rip-relative
192 * addressing mode. Adjust the displacement for the
193 * difference between the original location of this
194 * instruction and the location of the copy that will
195 * actually be run. The tricky bit here is making sure
196 * that the sign extension happens correctly in this
197 * calculation, since we need a signed 32-bit result to
198 * be sign-extended to 64 bits when it's added to the
199 * %rip value and yield the same 64-bit result that the
200 * sign-extension of the original signed 32-bit
201 * displacement would have given.
202 */
203 s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
204 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
205 *ripdisp = disp;
206 }
207 p->opcode = *p->addr;
208}
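/*
 * Worked illustration of the fixup above (an editorial sketch, not part of
 * the file): suppose the probed instruction is "mov 0x1fd6(%rip),%rax",
 * encoded as 48 8b 05 d6 1f 00 00.  is_riprel() skips the REX prefix 0x48,
 * finds opcode 0x8b with ModRM 0x05 and returns a pointer to the d6 1f 00 00
 * displacement.  The helper below repeats the arithmetic arch_copy_kprobe()
 * applies to that displacement: the copy must still reference the same
 * absolute target, so its displacement grows by (original - copy).
 */
static inline s32 riprel_new_disp(u8 *orig_insn, u8 *copy_insn, s32 old_disp)
{
	s64 disp = orig_insn + old_disp - copy_insn;

	BUG_ON((s64)(s32)disp != disp);	/* must still fit in a signed 32-bit value */
	return (s32)disp;
}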
209
210void __kprobes arch_arm_kprobe(struct kprobe *p)
211{
212 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
213}
214
215void __kprobes arch_disarm_kprobe(struct kprobe *p)
216{
217 text_poke(p->addr, &p->opcode, 1);
218}
219
220void __kprobes arch_remove_kprobe(struct kprobe *p)
221{
222 mutex_lock(&kprobe_mutex);
223 free_insn_slot(p->ainsn.insn, 0);
224 mutex_unlock(&kprobe_mutex);
225}
226
227static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
228{
229 kcb->prev_kprobe.kp = kprobe_running();
230 kcb->prev_kprobe.status = kcb->kprobe_status;
231 kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
232 kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
233}
234
235static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
236{
237 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
238 kcb->kprobe_status = kcb->prev_kprobe.status;
239 kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
240 kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
241}
242
243static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
244 struct kprobe_ctlblk *kcb)
245{
246 __get_cpu_var(current_kprobe) = p;
247 kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
248 = (regs->eflags & (TF_MASK | IF_MASK));
249 if (is_IF_modifier(p->ainsn.insn))
250 kcb->kprobe_saved_rflags &= ~IF_MASK;
251}
252
253static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
254{
255 regs->eflags |= TF_MASK;
256 regs->eflags &= ~IF_MASK;
257	/* single-step inline if the instruction is an int3 */
258 if (p->opcode == BREAKPOINT_INSTRUCTION)
259 regs->rip = (unsigned long)p->addr;
260 else
261 regs->rip = (unsigned long)p->ainsn.insn;
262}
263
264/* Called with kretprobe_lock held */
265void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
266 struct pt_regs *regs)
267{
268 unsigned long *sara = (unsigned long *)regs->rsp;
269
270 ri->ret_addr = (kprobe_opcode_t *) *sara;
271 /* Replace the return addr with trampoline addr */
272 *sara = (unsigned long) &kretprobe_trampoline;
273}
274
275int __kprobes kprobe_handler(struct pt_regs *regs)
276{
277 struct kprobe *p;
278 int ret = 0;
279 kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
280 struct kprobe_ctlblk *kcb;
281
282 /*
283 * We don't want to be preempted for the entire
284 * duration of kprobe processing
285 */
286 preempt_disable();
287 kcb = get_kprobe_ctlblk();
288
289 /* Check we're not actually recursing */
290 if (kprobe_running()) {
291 p = get_kprobe(addr);
292 if (p) {
293 if (kcb->kprobe_status == KPROBE_HIT_SS &&
294 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
295 regs->eflags &= ~TF_MASK;
296 regs->eflags |= kcb->kprobe_saved_rflags;
297 goto no_kprobe;
298 } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
299 /* TODO: Provide re-entrancy from
300 * post_kprobes_handler() and avoid exception
301 * stack corruption while single-stepping on
302 * the instruction of the new probe.
303 */
304 arch_disarm_kprobe(p);
305 regs->rip = (unsigned long)p->addr;
306 reset_current_kprobe();
307 ret = 1;
308 } else {
309 /* We have reentered the kprobe_handler(), since
310 * another probe was hit while within the
311				 * handler. Here we save the original kprobe
312				 * variables and just single-step on the instruction
313				 * of the new probe without calling any user
314 * handlers.
315 */
316 save_previous_kprobe(kcb);
317 set_current_kprobe(p, regs, kcb);
318 kprobes_inc_nmissed_count(p);
319 prepare_singlestep(p, regs);
320 kcb->kprobe_status = KPROBE_REENTER;
321 return 1;
322 }
323 } else {
324 if (*addr != BREAKPOINT_INSTRUCTION) {
325 /* The breakpoint instruction was removed by
326 * another cpu right after we hit, no further
327 * handling of this interrupt is appropriate
328 */
329 regs->rip = (unsigned long)addr;
330 ret = 1;
331 goto no_kprobe;
332 }
333 p = __get_cpu_var(current_kprobe);
334 if (p->break_handler && p->break_handler(p, regs)) {
335 goto ss_probe;
336 }
337 }
338 goto no_kprobe;
339 }
340
341 p = get_kprobe(addr);
342 if (!p) {
343 if (*addr != BREAKPOINT_INSTRUCTION) {
344 /*
345 * The breakpoint instruction was removed right
346 * after we hit it. Another cpu has removed
347 * either a probepoint or a debugger breakpoint
348 * at this address. In either case, no further
349 * handling of this interrupt is appropriate.
350 * Back up over the (now missing) int3 and run
351 * the original instruction.
352 */
353 regs->rip = (unsigned long)addr;
354 ret = 1;
355 }
356 /* Not one of ours: let kernel handle it */
357 goto no_kprobe;
358 }
359
360 set_current_kprobe(p, regs, kcb);
361 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
362
363 if (p->pre_handler && p->pre_handler(p, regs))
364 /* handler has already set things up, so skip ss setup */
365 return 1;
366
367ss_probe:
368 prepare_singlestep(p, regs);
369 kcb->kprobe_status = KPROBE_HIT_SS;
370 return 1;
371
372no_kprobe:
373 preempt_enable_no_resched();
374 return ret;
375}
376
377/*
378 * For function-return probes, init_kprobes() establishes a probepoint
379 * here. When a retprobed function returns, this probe is hit and
380 * trampoline_probe_handler() runs, calling the kretprobe's handler.
381 */
382void kretprobe_trampoline_holder(void)
383{
384	asm volatile (".global kretprobe_trampoline\n"
385		      "kretprobe_trampoline: \n"
386		      "nop\n");
387}
388
389/*
390 * Called when we hit the probe point at kretprobe_trampoline
391 */
392int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
393{
394 struct kretprobe_instance *ri = NULL;
395 struct hlist_head *head, empty_rp;
396 struct hlist_node *node, *tmp;
397 unsigned long flags, orig_ret_address = 0;
398	unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
399
400 INIT_HLIST_HEAD(&empty_rp);
401 spin_lock_irqsave(&kretprobe_lock, flags);
402 head = kretprobe_inst_table_head(current);
403
404 /*
405 * It is possible to have multiple instances associated with a given
406	 * task either because multiple functions in the call path
407	 * have a return probe installed on them, and/or more than one
408	 * return probe was registered for a target function.
409 *
410 * We can handle this because:
411 * - instances are always inserted at the head of the list
412 * - when multiple return probes are registered for the same
413 * function, the first instance's ret_addr will point to the
414 * real return address, and all the rest will point to
415 * kretprobe_trampoline
416 */
417 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
418 if (ri->task != current)
419 /* another task is sharing our hash bucket */
420 continue;
421
422 if (ri->rp && ri->rp->handler)
423 ri->rp->handler(ri, regs);
424
425 orig_ret_address = (unsigned long)ri->ret_addr;
426 recycle_rp_inst(ri, &empty_rp);
427
428 if (orig_ret_address != trampoline_address)
429 /*
430 * This is the real return address. Any other
431 * instances associated with this task are for
432 * other calls deeper on the call stack
433 */
434 break;
435 }
436
437 kretprobe_assert(ri, orig_ret_address, trampoline_address);
438 regs->rip = orig_ret_address;
439
440 reset_current_kprobe();
441 spin_unlock_irqrestore(&kretprobe_lock, flags);
442 preempt_enable_no_resched();
443
444 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
445 hlist_del(&ri->hlist);
446 kfree(ri);
447 }
448 /*
449 * By returning a non-zero value, we are telling
450 * kprobe_handler() that we don't want the post_handler
451 * to run (and have re-enabled preemption)
452 */
453 return 1;
454}
455
456/*
457 * Called after single-stepping. p->addr is the address of the
458 * instruction whose first byte has been replaced by the "int 3"
459 * instruction. To avoid the SMP problems that can occur when we
460 * temporarily put back the original opcode to single-step, we
461 * single-stepped a copy of the instruction. The address of this
462 * copy is p->ainsn.insn.
463 *
464 * This function prepares to return from the post-single-step
465 * interrupt. We have to fix up the stack as follows:
466 *
467 * 0) Except in the case of absolute or indirect jump or call instructions,
468 * the new rip is relative to the copied instruction. We need to make
469 * it relative to the original instruction.
470 *
471 * 1) If the single-stepped instruction was pushfl, then the TF and IF
472 * flags are set in the just-pushed eflags, and may need to be cleared.
473 *
474 * 2) If the single-stepped instruction was a call, the return address
475 * that is atop the stack is the address following the copied instruction.
476 * We need to make it the address following the original instruction.
477 */
478static void __kprobes resume_execution(struct kprobe *p,
479 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
480{
481 unsigned long *tos = (unsigned long *)regs->rsp;
482 unsigned long next_rip = 0;
483 unsigned long copy_rip = (unsigned long)p->ainsn.insn;
484 unsigned long orig_rip = (unsigned long)p->addr;
485 kprobe_opcode_t *insn = p->ainsn.insn;
486
487	/* skip the REX prefix */
488 if (*insn >= 0x40 && *insn <= 0x4f)
489 insn++;
490
491 switch (*insn) {
492 case 0x9c: /* pushfl */
493 *tos &= ~(TF_MASK | IF_MASK);
494 *tos |= kcb->kprobe_old_rflags;
495 break;
496 case 0xc3: /* ret/lret */
497 case 0xcb:
498 case 0xc2:
499 case 0xca:
500 regs->eflags &= ~TF_MASK;
501 /* rip is already adjusted, no more changes required*/
502 return;
503 case 0xe8: /* call relative - Fix return addr */
504 *tos = orig_rip + (*tos - copy_rip);
505 break;
506 case 0xff:
507 if ((insn[1] & 0x30) == 0x10) {
508 /* call absolute, indirect */
509 /* Fix return addr; rip is correct. */
510 next_rip = regs->rip;
511 *tos = orig_rip + (*tos - copy_rip);
512 } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
513 ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
514 /* rip is correct. */
515 next_rip = regs->rip;
516 }
517 break;
518 case 0xea: /* jmp absolute -- rip is correct */
519 next_rip = regs->rip;
520 break;
521 default:
522 break;
523 }
524
525 regs->eflags &= ~TF_MASK;
526 if (next_rip) {
527 regs->rip = next_rip;
528 } else {
529 regs->rip = orig_rip + (regs->rip - copy_rip);
530 }
531}
532
533int __kprobes post_kprobe_handler(struct pt_regs *regs)
534{
535 struct kprobe *cur = kprobe_running();
536 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
537
538 if (!cur)
539 return 0;
540
541 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
542 kcb->kprobe_status = KPROBE_HIT_SSDONE;
543 cur->post_handler(cur, regs, 0);
544 }
545
546 resume_execution(cur, regs, kcb);
547 regs->eflags |= kcb->kprobe_saved_rflags;
548
549 /* Restore the original saved kprobes variables and continue. */
550 if (kcb->kprobe_status == KPROBE_REENTER) {
551 restore_previous_kprobe(kcb);
552 goto out;
553 }
554 reset_current_kprobe();
555out:
556 preempt_enable_no_resched();
557
558 /*
559 * if somebody else is singlestepping across a probe point, eflags
560 * will have TF set, in which case, continue the remaining processing
561 * of do_debug, as if this is not a probe hit.
562 */
563 if (regs->eflags & TF_MASK)
564 return 0;
565
566 return 1;
567}
568
569int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
570{
571 struct kprobe *cur = kprobe_running();
572 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
573 const struct exception_table_entry *fixup;
574
575 switch(kcb->kprobe_status) {
576 case KPROBE_HIT_SS:
577 case KPROBE_REENTER:
578 /*
579 * We are here because the instruction being single
580 * stepped caused a page fault. We reset the current
581 * kprobe and the rip points back to the probe address
582 * and allow the page fault handler to continue as a
583 * normal page fault.
584 */
585 regs->rip = (unsigned long)cur->addr;
586 regs->eflags |= kcb->kprobe_old_rflags;
587 if (kcb->kprobe_status == KPROBE_REENTER)
588 restore_previous_kprobe(kcb);
589 else
590 reset_current_kprobe();
591 preempt_enable_no_resched();
592 break;
593 case KPROBE_HIT_ACTIVE:
594 case KPROBE_HIT_SSDONE:
595 /*
596		 * We increment the nmissed count for accounting;
597		 * we can also use the npre/npostfault counts to account
598		 * for these specific fault cases.
599 */
600 kprobes_inc_nmissed_count(cur);
601
602 /*
603 * We come here because instructions in the pre/post
604 * handler caused the page_fault, this could happen
605 * if handler tries to access user space by
606 * copy_from_user(), get_user() etc. Let the
607 * user-specified handler try to fix it first.
608 */
609 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
610 return 1;
611
612 /*
613 * In case the user-specified fault handler returned
614 * zero, try to fix up.
615 */
616 fixup = search_exception_tables(regs->rip);
617 if (fixup) {
618 regs->rip = fixup->fixup;
619 return 1;
620 }
621
622 /*
623 * fixup() could not handle it,
624 * Let do_page_fault() fix it.
625 */
626 break;
627 default:
628 break;
629 }
630 return 0;
631}
632
633/*
634 * Wrapper routine for handling exceptions.
635 */
636int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
637 unsigned long val, void *data)
638{
639 struct die_args *args = (struct die_args *)data;
640 int ret = NOTIFY_DONE;
641
642 if (args->regs && user_mode(args->regs))
643 return ret;
644
645 switch (val) {
646 case DIE_INT3:
647 if (kprobe_handler(args->regs))
648 ret = NOTIFY_STOP;
649 break;
650 case DIE_DEBUG:
651 if (post_kprobe_handler(args->regs))
652 ret = NOTIFY_STOP;
653 break;
654 case DIE_GPF:
655 case DIE_PAGE_FAULT:
656 /* kprobe_running() needs smp_processor_id() */
657 preempt_disable();
658 if (kprobe_running() &&
659 kprobe_fault_handler(args->regs, args->trapnr))
660 ret = NOTIFY_STOP;
661 preempt_enable();
662 break;
663 default:
664 break;
665 }
666 return ret;
667}
668
669int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
670{
671 struct jprobe *jp = container_of(p, struct jprobe, kp);
672 unsigned long addr;
673 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
674
675 kcb->jprobe_saved_regs = *regs;
676 kcb->jprobe_saved_rsp = (long *) regs->rsp;
677 addr = (unsigned long)(kcb->jprobe_saved_rsp);
678 /*
679 * As Linus pointed out, gcc assumes that the callee
680 * owns the argument space and could overwrite it, e.g.
681 * tailcall optimization. So, to be absolutely safe
682 * we also save and restore enough stack bytes to cover
683 * the argument area.
684 */
685 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
686 MIN_STACK_SIZE(addr));
687 regs->eflags &= ~IF_MASK;
688 regs->rip = (unsigned long)(jp->entry);
689 return 1;
690}
691
692void __kprobes jprobe_return(void)
693{
694 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
695
696 asm volatile (" xchg %%rbx,%%rsp \n"
697 " int3 \n"
698 " .globl jprobe_return_end \n"
699 " jprobe_return_end: \n"
700 " nop \n"::"b"
701 (kcb->jprobe_saved_rsp):"memory");
702}
703
704int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
705{
706 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
707 u8 *addr = (u8 *) (regs->rip - 1);
708 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
709 struct jprobe *jp = container_of(p, struct jprobe, kp);
710
711 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
712 if ((long *)regs->rsp != kcb->jprobe_saved_rsp) {
713 struct pt_regs *saved_regs =
714 container_of(kcb->jprobe_saved_rsp,
715 struct pt_regs, rsp);
716 printk("current rsp %p does not match saved rsp %p\n",
717 (long *)regs->rsp, kcb->jprobe_saved_rsp);
718 printk("Saved registers for jprobe %p\n", jp);
719 show_registers(saved_regs);
720 printk("Current registers\n");
721 show_registers(regs);
722 BUG();
723 }
724 *regs = kcb->jprobe_saved_regs;
725 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
726 MIN_STACK_SIZE(stack_addr));
727 preempt_enable_no_resched();
728 return 1;
729 }
730 return 0;
731}
732
733static struct kprobe trampoline_p = {
734 .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
735 .pre_handler = trampoline_probe_handler
736};
737
738int __init arch_init_kprobes(void)
739{
740 return register_kprobe(&trampoline_p);
741}
742
743int __kprobes arch_trampoline_kprobe(struct kprobe *p)
744{
745 if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
746 return 1;
747
748 return 0;
749}
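/*
 * A minimal module-side sketch of the return-probe interface served by the
 * trampoline above.  Illustrative only, not part of the patch: "sys_open" is
 * an arbitrary example symbol.  The handler is invoked from
 * trampoline_probe_handler() when the probed function returns; on x86-64 the
 * return value is in regs->rax.
 */
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	printk(KERN_INFO "%s returned %ld\n", ri->rp->kp.symbol_name,
	       (long)regs->rax);
	return 0;
}

static struct kretprobe my_kretprobe = {
	.handler	= my_ret_handler,
	.maxactive	= 20,	/* instances pre-allocated for concurrent calls */
	.kp.symbol_name	= "sys_open",
};

static int __init kretprobe_example_init(void)
{
	return register_kretprobe(&my_kretprobe);
}

static void __exit kretprobe_example_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(kretprobe_example_init);
module_exit(kretprobe_example_exit);
MODULE_LICENSE("GPL");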
diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c
new file mode 100644
index 000000000000..e0b2d17f4f10
--- /dev/null
+++ b/arch/x86/kernel/ldt_32.c
@@ -0,0 +1,250 @@
1/*
2 * linux/arch/i386/kernel/ldt.c
3 *
4 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
5 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6 */
7
8#include <linux/errno.h>
9#include <linux/sched.h>
10#include <linux/string.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <asm/ldt.h>
19#include <asm/desc.h>
20#include <asm/mmu_context.h>
21
22#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
23static void flush_ldt(void *null)
24{
25 if (current->active_mm)
26 load_LDT(&current->active_mm->context);
27}
28#endif
29
30static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
31{
32 void *oldldt;
33 void *newldt;
34 int oldsize;
35
36 if (mincount <= pc->size)
37 return 0;
38 oldsize = pc->size;
39 mincount = (mincount+511)&(~511);
40 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
41 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
42 else
43 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
44
45 if (!newldt)
46 return -ENOMEM;
47
48 if (oldsize)
49 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
50 oldldt = pc->ldt;
51 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
52 pc->ldt = newldt;
53 wmb();
54 pc->size = mincount;
55 wmb();
56
57 if (reload) {
58#ifdef CONFIG_SMP
59 cpumask_t mask;
60 preempt_disable();
61 load_LDT(pc);
62 mask = cpumask_of_cpu(smp_processor_id());
63 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
64 smp_call_function(flush_ldt, NULL, 1, 1);
65 preempt_enable();
66#else
67 load_LDT(pc);
68#endif
69 }
70 if (oldsize) {
71 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
72 vfree(oldldt);
73 else
74 kfree(oldldt);
75 }
76 return 0;
77}
78
79static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
80{
81 int err = alloc_ldt(new, old->size, 0);
82 if (err < 0)
83 return err;
84 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
85 return 0;
86}
87
88/*
89 * we do not have to muck with descriptors here, that is
90 * done in switch_mm() as needed.
91 */
92int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
93{
94 struct mm_struct * old_mm;
95 int retval = 0;
96
97 init_MUTEX(&mm->context.sem);
98 mm->context.size = 0;
99 old_mm = current->mm;
100 if (old_mm && old_mm->context.size > 0) {
101 down(&old_mm->context.sem);
102 retval = copy_ldt(&mm->context, &old_mm->context);
103 up(&old_mm->context.sem);
104 }
105 return retval;
106}
107
108/*
109 * No need to lock the MM as we are the last user
110 */
111void destroy_context(struct mm_struct *mm)
112{
113 if (mm->context.size) {
114 if (mm == current->active_mm)
115 clear_LDT();
116 if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
117 vfree(mm->context.ldt);
118 else
119 kfree(mm->context.ldt);
120 mm->context.size = 0;
121 }
122}
123
124static int read_ldt(void __user * ptr, unsigned long bytecount)
125{
126 int err;
127 unsigned long size;
128 struct mm_struct * mm = current->mm;
129
130 if (!mm->context.size)
131 return 0;
132 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
133 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
134
135 down(&mm->context.sem);
136 size = mm->context.size*LDT_ENTRY_SIZE;
137 if (size > bytecount)
138 size = bytecount;
139
140 err = 0;
141 if (copy_to_user(ptr, mm->context.ldt, size))
142 err = -EFAULT;
143 up(&mm->context.sem);
144 if (err < 0)
145 goto error_return;
146 if (size != bytecount) {
147 /* zero-fill the rest */
148 if (clear_user(ptr+size, bytecount-size) != 0) {
149 err = -EFAULT;
150 goto error_return;
151 }
152 }
153 return bytecount;
154error_return:
155 return err;
156}
157
158static int read_default_ldt(void __user * ptr, unsigned long bytecount)
159{
160 int err;
161 unsigned long size;
162
163 err = 0;
164 size = 5*sizeof(struct desc_struct);
165 if (size > bytecount)
166 size = bytecount;
167
168 err = size;
169 if (clear_user(ptr, size))
170 err = -EFAULT;
171
172 return err;
173}
174
175static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
176{
177 struct mm_struct * mm = current->mm;
178 __u32 entry_1, entry_2;
179 int error;
180 struct user_desc ldt_info;
181
182 error = -EINVAL;
183 if (bytecount != sizeof(ldt_info))
184 goto out;
185 error = -EFAULT;
186 if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
187 goto out;
188
189 error = -EINVAL;
190 if (ldt_info.entry_number >= LDT_ENTRIES)
191 goto out;
192 if (ldt_info.contents == 3) {
193 if (oldmode)
194 goto out;
195 if (ldt_info.seg_not_present == 0)
196 goto out;
197 }
198
199 down(&mm->context.sem);
200 if (ldt_info.entry_number >= mm->context.size) {
201 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
202 if (error < 0)
203 goto out_unlock;
204 }
205
206 /* Allow LDTs to be cleared by the user. */
207 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
208 if (oldmode || LDT_empty(&ldt_info)) {
209 entry_1 = 0;
210 entry_2 = 0;
211 goto install;
212 }
213 }
214
215 entry_1 = LDT_entry_a(&ldt_info);
216 entry_2 = LDT_entry_b(&ldt_info);
217 if (oldmode)
218 entry_2 &= ~(1 << 20);
219
220 /* Install the new entry ... */
221install:
222 write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2);
223 error = 0;
224
225out_unlock:
226 up(&mm->context.sem);
227out:
228 return error;
229}
230
231asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
232{
233 int ret = -ENOSYS;
234
235 switch (func) {
236 case 0:
237 ret = read_ldt(ptr, bytecount);
238 break;
239 case 1:
240 ret = write_ldt(ptr, bytecount, 1);
241 break;
242 case 2:
243 ret = read_default_ldt(ptr, bytecount);
244 break;
245 case 0x11:
246 ret = write_ldt(ptr, bytecount, 0);
247 break;
248 }
249 return ret;
250}
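/*
 * Hypothetical user-space sketch (not part of the file) exercising the func
 * codes dispatched by sys_modify_ldt() above: 0 reads the current LDT, 2
 * reads the default LDT, and 1/0x11 install a struct user_desc entry (0x11 is
 * the newer variant; 1 is the legacy one that clears the "useable" bit).
 */
#include <asm/ldt.h>		/* struct user_desc, LDT_ENTRIES, LDT_ENTRY_SIZE */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	static char buf[LDT_ENTRIES * LDT_ENTRY_SIZE];
	long n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));	/* func 0: read_ldt */

	if (n < 0)
		perror("modify_ldt");
	else
		printf("LDT: %ld bytes (%ld entries)\n", n, n / LDT_ENTRY_SIZE);
	return 0;
}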
diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c
new file mode 100644
index 000000000000..bc9ffd5c19cc
--- /dev/null
+++ b/arch/x86/kernel/ldt_64.c
@@ -0,0 +1,252 @@
1/*
2 * linux/arch/x86_64/kernel/ldt.c
3 *
4 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
5 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6 * Copyright (C) 2002 Andi Kleen
7 *
8 * This handles calls from both 32bit and 64bit mode.
9 */
10
11#include <linux/errno.h>
12#include <linux/sched.h>
13#include <linux/string.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
16#include <linux/vmalloc.h>
17#include <linux/slab.h>
18
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <asm/ldt.h>
22#include <asm/desc.h>
23#include <asm/proto.h>
24
25#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
26static void flush_ldt(void *null)
27{
28 if (current->active_mm)
29 load_LDT(&current->active_mm->context);
30}
31#endif
32
33static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
34{
35 void *oldldt;
36 void *newldt;
37 unsigned oldsize;
38
39 if (mincount <= (unsigned)pc->size)
40 return 0;
41 oldsize = pc->size;
42 mincount = (mincount+511)&(~511);
43 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
44 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
45 else
46 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
47
48 if (!newldt)
49 return -ENOMEM;
50
51 if (oldsize)
52 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
53 oldldt = pc->ldt;
54 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
55 wmb();
56 pc->ldt = newldt;
57 wmb();
58 pc->size = mincount;
59 wmb();
60 if (reload) {
61#ifdef CONFIG_SMP
62 cpumask_t mask;
63
64 preempt_disable();
65 mask = cpumask_of_cpu(smp_processor_id());
66 load_LDT(pc);
67 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
68 smp_call_function(flush_ldt, NULL, 1, 1);
69 preempt_enable();
70#else
71 load_LDT(pc);
72#endif
73 }
74 if (oldsize) {
75 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
76 vfree(oldldt);
77 else
78 kfree(oldldt);
79 }
80 return 0;
81}
82
83static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
84{
85 int err = alloc_ldt(new, old->size, 0);
86 if (err < 0)
87 return err;
88 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
89 return 0;
90}
91
92/*
93 * we do not have to muck with descriptors here, that is
94 * done in switch_mm() as needed.
95 */
96int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
97{
98 struct mm_struct * old_mm;
99 int retval = 0;
100
101 init_MUTEX(&mm->context.sem);
102 mm->context.size = 0;
103 old_mm = current->mm;
104 if (old_mm && old_mm->context.size > 0) {
105 down(&old_mm->context.sem);
106 retval = copy_ldt(&mm->context, &old_mm->context);
107 up(&old_mm->context.sem);
108 }
109 return retval;
110}
111
112/*
113 *
114 * Don't touch the LDT register - we're already in the next thread.
115 */
116void destroy_context(struct mm_struct *mm)
117{
118 if (mm->context.size) {
119 if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
120 vfree(mm->context.ldt);
121 else
122 kfree(mm->context.ldt);
123 mm->context.size = 0;
124 }
125}
126
127static int read_ldt(void __user * ptr, unsigned long bytecount)
128{
129 int err;
130 unsigned long size;
131 struct mm_struct * mm = current->mm;
132
133 if (!mm->context.size)
134 return 0;
135 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
136 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
137
138 down(&mm->context.sem);
139 size = mm->context.size*LDT_ENTRY_SIZE;
140 if (size > bytecount)
141 size = bytecount;
142
143 err = 0;
144 if (copy_to_user(ptr, mm->context.ldt, size))
145 err = -EFAULT;
146 up(&mm->context.sem);
147 if (err < 0)
148 goto error_return;
149 if (size != bytecount) {
150 /* zero-fill the rest */
151 if (clear_user(ptr+size, bytecount-size) != 0) {
152 err = -EFAULT;
153 goto error_return;
154 }
155 }
156 return bytecount;
157error_return:
158 return err;
159}
160
161static int read_default_ldt(void __user * ptr, unsigned long bytecount)
162{
163 /* Arbitrary number */
164 /* x86-64 default LDT is all zeros */
165 if (bytecount > 128)
166 bytecount = 128;
167 if (clear_user(ptr, bytecount))
168 return -EFAULT;
169 return bytecount;
170}
171
172static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
173{
174 struct task_struct *me = current;
175 struct mm_struct * mm = me->mm;
176 __u32 entry_1, entry_2, *lp;
177 int error;
178 struct user_desc ldt_info;
179
180 error = -EINVAL;
181
182 if (bytecount != sizeof(ldt_info))
183 goto out;
184 error = -EFAULT;
185 if (copy_from_user(&ldt_info, ptr, bytecount))
186 goto out;
187
188 error = -EINVAL;
189 if (ldt_info.entry_number >= LDT_ENTRIES)
190 goto out;
191 if (ldt_info.contents == 3) {
192 if (oldmode)
193 goto out;
194 if (ldt_info.seg_not_present == 0)
195 goto out;
196 }
197
198 down(&mm->context.sem);
199 if (ldt_info.entry_number >= (unsigned)mm->context.size) {
200 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
201 if (error < 0)
202 goto out_unlock;
203 }
204
205 lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
206
207 /* Allow LDTs to be cleared by the user. */
208 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
209 if (oldmode || LDT_empty(&ldt_info)) {
210 entry_1 = 0;
211 entry_2 = 0;
212 goto install;
213 }
214 }
215
216 entry_1 = LDT_entry_a(&ldt_info);
217 entry_2 = LDT_entry_b(&ldt_info);
218 if (oldmode)
219 entry_2 &= ~(1 << 20);
220
221 /* Install the new entry ... */
222install:
223 *lp = entry_1;
224 *(lp+1) = entry_2;
225 error = 0;
226
227out_unlock:
228 up(&mm->context.sem);
229out:
230 return error;
231}
232
233asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
234{
235 int ret = -ENOSYS;
236
237 switch (func) {
238 case 0:
239 ret = read_ldt(ptr, bytecount);
240 break;
241 case 1:
242 ret = write_ldt(ptr, bytecount, 1);
243 break;
244 case 2:
245 ret = read_default_ldt(ptr, bytecount);
246 break;
247 case 0x11:
248 ret = write_ldt(ptr, bytecount, 0);
249 break;
250 }
251 return ret;
252}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
new file mode 100644
index 000000000000..91966bafb3dc
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -0,0 +1,171 @@
1/*
2 * machine_kexec.c - handle transition of Linux booting another kernel
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/kexec.h>
11#include <linux/delay.h>
12#include <linux/init.h>
13#include <asm/pgtable.h>
14#include <asm/pgalloc.h>
15#include <asm/tlbflush.h>
16#include <asm/mmu_context.h>
17#include <asm/io.h>
18#include <asm/apic.h>
19#include <asm/cpufeature.h>
20#include <asm/desc.h>
21#include <asm/system.h>
22
23#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
24static u32 kexec_pgd[1024] PAGE_ALIGNED;
25#ifdef CONFIG_X86_PAE
26static u32 kexec_pmd0[1024] PAGE_ALIGNED;
27static u32 kexec_pmd1[1024] PAGE_ALIGNED;
28#endif
29static u32 kexec_pte0[1024] PAGE_ALIGNED;
30static u32 kexec_pte1[1024] PAGE_ALIGNED;
31
32static void set_idt(void *newidt, __u16 limit)
33{
34 struct Xgt_desc_struct curidt;
35
36	/* ia32 supports unaligned loads & stores */
37 curidt.size = limit;
38 curidt.address = (unsigned long)newidt;
39
40 load_idt(&curidt);
41};
42
43
44static void set_gdt(void *newgdt, __u16 limit)
45{
46 struct Xgt_desc_struct curgdt;
47
48 /* ia32 supports unaligned loads & stores */
49 curgdt.size = limit;
50 curgdt.address = (unsigned long)newgdt;
51
52 load_gdt(&curgdt);
53};
54
55static void load_segments(void)
56{
57#define __STR(X) #X
58#define STR(X) __STR(X)
59
60 __asm__ __volatile__ (
61 "\tljmp $"STR(__KERNEL_CS)",$1f\n"
62 "\t1:\n"
63 "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
64 "\tmovl %%eax,%%ds\n"
65 "\tmovl %%eax,%%es\n"
66 "\tmovl %%eax,%%fs\n"
67 "\tmovl %%eax,%%gs\n"
68 "\tmovl %%eax,%%ss\n"
69 ::: "eax", "memory");
70#undef STR
71#undef __STR
72}
73
74/*
75 * An architecture hook called to validate the
76 * proposed image and prepare the control pages
77 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE
78 * have been allocated, but the segments have not yet
79 * been copied into the kernel.
80 *
81 * Do whatever setup is needed on the image and the
82 * reboot code buffer to allow us to avoid allocations
83 * later.
84 *
85 * Currently nothing.
86 */
87int machine_kexec_prepare(struct kimage *image)
88{
89 return 0;
90}
91
92/*
93 * Undo anything leftover by machine_kexec_prepare
94 * when an image is freed.
95 */
96void machine_kexec_cleanup(struct kimage *image)
97{
98}
99
100/*
101 * Do not allocate memory (or fail in any way) in machine_kexec().
102 * We are past the point of no return, committed to rebooting now.
103 */
104NORET_TYPE void machine_kexec(struct kimage *image)
105{
106 unsigned long page_list[PAGES_NR];
107 void *control_page;
108
109 /* Interrupts aren't acceptable while we reboot */
110 local_irq_disable();
111
112 control_page = page_address(image->control_code_page);
113 memcpy(control_page, relocate_kernel, PAGE_SIZE);
114
115 page_list[PA_CONTROL_PAGE] = __pa(control_page);
116 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
117 page_list[PA_PGD] = __pa(kexec_pgd);
118 page_list[VA_PGD] = (unsigned long)kexec_pgd;
119#ifdef CONFIG_X86_PAE
120 page_list[PA_PMD_0] = __pa(kexec_pmd0);
121 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
122 page_list[PA_PMD_1] = __pa(kexec_pmd1);
123 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
124#endif
125 page_list[PA_PTE_0] = __pa(kexec_pte0);
126 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
127 page_list[PA_PTE_1] = __pa(kexec_pte1);
128 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
129
130 /* The segment registers are funny things, they have both a
131 * visible and an invisible part. Whenever the visible part is
132	 * set to a specific selector, the invisible part is loaded
133	 * from a table in memory. At no other time is the
134 * descriptor table in memory accessed.
135 *
136 * I take advantage of this here by force loading the
137 * segments, before I zap the gdt with an invalid value.
138 */
139 load_segments();
140 /* The gdt & idt are now invalid.
141 * If you want to load them you must set up your own idt & gdt.
142 */
143 set_gdt(phys_to_virt(0),0);
144 set_idt(phys_to_virt(0),0);
145
146 /* now call it */
147 relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
148 image->start, cpu_has_pae);
149}
150
151/* crashkernel=size@addr specifies the location to reserve for
152 * a crash kernel. By reserving this memory we guarantee
153 * that linux never sets it up as a DMA target.
154 * Useful for holding code to do something appropriate
155 * after a kernel panic.
156 */
157static int __init parse_crashkernel(char *arg)
158{
159 unsigned long size, base;
160 size = memparse(arg, &arg);
161 if (*arg == '@') {
162 base = memparse(arg+1, &arg);
163 /* FIXME: Do I want a sanity check
164 * to validate the memory range?
165 */
166 crashk_res.start = base;
167 crashk_res.end = base + size - 1;
168 }
169 return 0;
170}
171early_param("crashkernel", parse_crashkernel);
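/*
 * Illustrative example (not part of the file): booting with the parameter
 * below is parsed by parse_crashkernel() above into crashk_res.start =
 * 0x1000000 and crashk_res.end = 0x4ffffff, i.e. 64 MB reserved at the 16 MB
 * physical mark and kept away from normal use so a kdump kernel can be
 * loaded there later.
 */
	crashkernel=64M@16M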
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
new file mode 100644
index 000000000000..c3a554703672
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -0,0 +1,259 @@
1/*
2 * machine_kexec.c - handle transition of Linux booting another kernel
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/kexec.h>
11#include <linux/string.h>
12#include <linux/reboot.h>
13#include <asm/pgtable.h>
14#include <asm/tlbflush.h>
15#include <asm/mmu_context.h>
16#include <asm/io.h>
17
18#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
19static u64 kexec_pgd[512] PAGE_ALIGNED;
20static u64 kexec_pud0[512] PAGE_ALIGNED;
21static u64 kexec_pmd0[512] PAGE_ALIGNED;
22static u64 kexec_pte0[512] PAGE_ALIGNED;
23static u64 kexec_pud1[512] PAGE_ALIGNED;
24static u64 kexec_pmd1[512] PAGE_ALIGNED;
25static u64 kexec_pte1[512] PAGE_ALIGNED;
26
27static void init_level2_page(pmd_t *level2p, unsigned long addr)
28{
29 unsigned long end_addr;
30
31 addr &= PAGE_MASK;
32 end_addr = addr + PUD_SIZE;
33 while (addr < end_addr) {
34 set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
35 addr += PMD_SIZE;
36 }
37}
38
39static int init_level3_page(struct kimage *image, pud_t *level3p,
40 unsigned long addr, unsigned long last_addr)
41{
42 unsigned long end_addr;
43 int result;
44
45 result = 0;
46 addr &= PAGE_MASK;
47 end_addr = addr + PGDIR_SIZE;
48 while ((addr < last_addr) && (addr < end_addr)) {
49 struct page *page;
50 pmd_t *level2p;
51
52 page = kimage_alloc_control_pages(image, 0);
53 if (!page) {
54 result = -ENOMEM;
55 goto out;
56 }
57 level2p = (pmd_t *)page_address(page);
58 init_level2_page(level2p, addr);
59 set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
60 addr += PUD_SIZE;
61 }
62 /* clear the unused entries */
63 while (addr < end_addr) {
64 pud_clear(level3p++);
65 addr += PUD_SIZE;
66 }
67out:
68 return result;
69}
70
71
72static int init_level4_page(struct kimage *image, pgd_t *level4p,
73 unsigned long addr, unsigned long last_addr)
74{
75 unsigned long end_addr;
76 int result;
77
78 result = 0;
79 addr &= PAGE_MASK;
80 end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
81 while ((addr < last_addr) && (addr < end_addr)) {
82 struct page *page;
83 pud_t *level3p;
84
85 page = kimage_alloc_control_pages(image, 0);
86 if (!page) {
87 result = -ENOMEM;
88 goto out;
89 }
90 level3p = (pud_t *)page_address(page);
91 result = init_level3_page(image, level3p, addr, last_addr);
92 if (result) {
93 goto out;
94 }
95 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
96 addr += PGDIR_SIZE;
97 }
98 /* clear the unused entries */
99 while (addr < end_addr) {
100 pgd_clear(level4p++);
101 addr += PGDIR_SIZE;
102 }
103out:
104 return result;
105}
106
107
108static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
109{
110 pgd_t *level4p;
111 level4p = (pgd_t *)__va(start_pgtable);
112 return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
113}
114
115static void set_idt(void *newidt, u16 limit)
116{
117 struct desc_ptr curidt;
118
119	/* x86-64 supports unaligned loads & stores */
120 curidt.size = limit;
121 curidt.address = (unsigned long)newidt;
122
123 __asm__ __volatile__ (
124 "lidtq %0\n"
125 : : "m" (curidt)
126 );
127};
128
129
130static void set_gdt(void *newgdt, u16 limit)
131{
132 struct desc_ptr curgdt;
133
134 /* x86-64 supports unaligned loads & stores */
135 curgdt.size = limit;
136 curgdt.address = (unsigned long)newgdt;
137
138 __asm__ __volatile__ (
139 "lgdtq %0\n"
140 : : "m" (curgdt)
141 );
142};
143
144static void load_segments(void)
145{
146 __asm__ __volatile__ (
147 "\tmovl %0,%%ds\n"
148 "\tmovl %0,%%es\n"
149 "\tmovl %0,%%ss\n"
150 "\tmovl %0,%%fs\n"
151 "\tmovl %0,%%gs\n"
152 : : "a" (__KERNEL_DS) : "memory"
153 );
154}
155
156int machine_kexec_prepare(struct kimage *image)
157{
158 unsigned long start_pgtable;
159 int result;
160
161 /* Calculate the offsets */
162 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
163
164 /* Setup the identity mapped 64bit page table */
165 result = init_pgtable(image, start_pgtable);
166 if (result)
167 return result;
168
169 return 0;
170}
171
172void machine_kexec_cleanup(struct kimage *image)
173{
174 return;
175}
176
177/*
178 * Do not allocate memory (or fail in any way) in machine_kexec().
179 * We are past the point of no return, committed to rebooting now.
180 */
181NORET_TYPE void machine_kexec(struct kimage *image)
182{
183 unsigned long page_list[PAGES_NR];
184 void *control_page;
185
186 /* Interrupts aren't acceptable while we reboot */
187 local_irq_disable();
188
189 control_page = page_address(image->control_code_page) + PAGE_SIZE;
190 memcpy(control_page, relocate_kernel, PAGE_SIZE);
191
192 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
193 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
194 page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
195 page_list[VA_PGD] = (unsigned long)kexec_pgd;
196 page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
197 page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
198 page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
199 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
200 page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
201 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
202 page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
203 page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
204 page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
205 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
206 page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
207 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
208
209 page_list[PA_TABLE_PAGE] =
210 (unsigned long)__pa(page_address(image->control_code_page));
211
212 /* The segment registers are funny things, they have both a
213 * visible and an invisible part. Whenever the visible part is
214	 * set to a specific selector, the invisible part is loaded
215	 * from a table in memory. At no other time is the
216 * descriptor table in memory accessed.
217 *
218 * I take advantage of this here by force loading the
219 * segments, before I zap the gdt with an invalid value.
220 */
221 load_segments();
222 /* The gdt & idt are now invalid.
223 * If you want to load them you must set up your own idt & gdt.
224 */
225 set_gdt(phys_to_virt(0),0);
226 set_idt(phys_to_virt(0),0);
227
228 /* now call it */
229 relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
230 image->start);
231}
232
233/* crashkernel=size@addr specifies the location to reserve for
234 * a crash kernel. By reserving this memory we guarantee
235 * that linux never sets it up as a DMA target.
236 * Useful for holding code to do something appropriate
237 * after a kernel panic.
238 */
239static int __init setup_crashkernel(char *arg)
240{
241 unsigned long size, base;
242 char *p;
243 if (!arg)
244 return -EINVAL;
245 size = memparse(arg, &p);
246 if (arg == p)
247 return -EINVAL;
248 if (*p == '@') {
249 base = memparse(p+1, &p);
250 /* FIXME: Do I want a sanity check to validate the
251 * memory range? Yes you do, but it's too early for
252 * e820 -AK */
253 crashk_res.start = base;
254 crashk_res.end = base + size - 1;
255 }
256 return 0;
257}
258early_param("crashkernel", setup_crashkernel);
259
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
new file mode 100644
index 000000000000..b83672b89527
--- /dev/null
+++ b/arch/x86/kernel/mca_32.c
@@ -0,0 +1,470 @@
1/*
2 * linux/arch/i386/kernel/mca.c
3 * Written by Martin Kolinek, February 1996
4 *
5 * Changes:
6 *
7 * Chris Beauregard July 28th, 1996
8 * - Fixed up integrated SCSI detection
9 *
10 * Chris Beauregard August 3rd, 1996
11 * - Made mca_info local
12 * - Made integrated registers accessible through standard function calls
13 * - Added name field
14 * - More sanity checking
15 *
16 * Chris Beauregard August 9th, 1996
17 * - Rewrote /proc/mca
18 *
19 * Chris Beauregard January 7th, 1997
20 * - Added basic NMI-processing
21 * - Added more information to mca_info structure
22 *
23 * David Weinehall October 12th, 1998
24 * - Made a lot of cleaning up in the source
25 * - Added use of save_flags / restore_flags
26 * - Added the 'driver_loaded' flag in MCA_adapter
27 * - Added an alternative implementation of ZP Gu's mca_find_unused_adapter
28 *
29 * David Weinehall March 24th, 1999
30 * - Fixed the output of 'Driver Installed' in /proc/mca/pos
31 * - Made the Integrated Video & SCSI show up even if they have id 0000
32 *
33 * Alexander Viro November 9th, 1999
34 * - Switched to regular procfs methods
35 *
36 * Alfred Arnold & David Weinehall August 23rd, 2000
37 * - Added support for Planar POS-registers
38 */
39
40#include <linux/module.h>
41#include <linux/types.h>
42#include <linux/errno.h>
43#include <linux/kernel.h>
44#include <linux/mca.h>
45#include <linux/kprobes.h>
46#include <asm/system.h>
47#include <asm/io.h>
48#include <linux/proc_fs.h>
49#include <linux/mman.h>
50#include <linux/mm.h>
51#include <linux/pagemap.h>
52#include <linux/ioport.h>
53#include <asm/uaccess.h>
54#include <linux/init.h>
55#include <asm/arch_hooks.h>
56
57static unsigned char which_scsi = 0;
58
59int MCA_bus = 0;
60EXPORT_SYMBOL(MCA_bus);
61
62/*
63 * Motherboard register spinlock. Untested on SMP at the moment, but
64 * are there any MCA SMP boxes?
65 *
66 * Yes - Alan
67 */
68static DEFINE_SPINLOCK(mca_lock);
69
70/* Build the status info for the adapter */
71
72static void mca_configure_adapter_status(struct mca_device *mca_dev) {
73 mca_dev->status = MCA_ADAPTER_NONE;
74
75 mca_dev->pos_id = mca_dev->pos[0]
76 + (mca_dev->pos[1] << 8);
77
78 if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
79
80 /* id = 0x0000 usually indicates hardware failure,
81		 * however, ZP Gu (zpg@castle.net) reports that his 9556
82		 * has 0x0000 as id and everything still works. There
83		 * also seems to be an adapter with id = 0x0000; the
84 * NCR Parallel Bus Memory Card. Until this is confirmed,
85 * however, this code will stay.
86 */
87
88 mca_dev->status = MCA_ADAPTER_ERROR;
89
90 return;
91 } else if(mca_dev->pos_id != 0xffff) {
92
93 /* 0xffff usually indicates that there's no adapter,
94 * however, some integrated adapters may have 0xffff as
95 * their id and still be valid. Examples are on-board
96 * VGA of the 55sx, the integrated SCSI of the 56 & 57,
97 * and possibly also the 95 ULTIMEDIA.
98 */
99
100 mca_dev->status = MCA_ADAPTER_NORMAL;
101 }
102
103 if((mca_dev->pos_id == 0xffff ||
104 mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
105 int j;
106
107 for(j = 2; j < 8; j++) {
108 if(mca_dev->pos[j] != 0xff) {
109 mca_dev->status = MCA_ADAPTER_NORMAL;
110 break;
111 }
112 }
113 }
114
115 if(!(mca_dev->pos[2] & MCA_ENABLED)) {
116
117 /* enabled bit is in POS 2 */
118
119 mca_dev->status = MCA_ADAPTER_DISABLED;
120 }
121} /* mca_configure_adapter_status */
122
123/*--------------------------------------------------------------------*/
124
125static struct resource mca_standard_resources[] = {
126 { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
127 { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
128 { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
129 { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
130 { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
131 { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
132 { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
133};
134
135#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
136
137/**
138 * mca_read_and_store_pos - read the POS registers into a memory buffer
139 * @pos: a char pointer to 8 bytes, contains the POS register value on
140 * successful return
141 *
142 * Returns 1 if a card actually exists (i.e. the pos isn't
143 * all 0xff) or 0 otherwise
144 */
145static int mca_read_and_store_pos(unsigned char *pos) {
146 int j;
147 int found = 0;
148
149 for(j=0; j<8; j++) {
150 if((pos[j] = inb_p(MCA_POS_REG(j))) != 0xff) {
151 /* 0xff all across means no device. 0x00 means
152 * something's broken, but a device is
153 * probably there. However, if you get 0x00
154 * from a motherboard register it won't matter
155 * what we find. For the record, on the
156 * 57SLC, the integrated SCSI adapter has
157 * 0xffff for the adapter ID, but nonzero for
158 * other registers. */
159
160 found = 1;
161 }
162 }
163 return found;
164}
165
166static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
167{
168 unsigned char byte;
169 unsigned long flags;
170
171 if(reg < 0 || reg >= 8)
172 return 0;
173
174 spin_lock_irqsave(&mca_lock, flags);
175 if(mca_dev->pos_register) {
176 /* Disable adapter setup, enable motherboard setup */
177
178 outb_p(0, MCA_ADAPTER_SETUP_REG);
179 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
180
181 byte = inb_p(MCA_POS_REG(reg));
182 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
183 } else {
184
185 /* Make sure motherboard setup is off */
186
187 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
188
189 /* Read the appropriate register */
190
191 outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
192 byte = inb_p(MCA_POS_REG(reg));
193 outb_p(0, MCA_ADAPTER_SETUP_REG);
194 }
195 spin_unlock_irqrestore(&mca_lock, flags);
196
197 mca_dev->pos[reg] = byte;
198
199 return byte;
200}
201
202static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
203 unsigned char byte)
204{
205 unsigned long flags;
206
207 if(reg < 0 || reg >= 8)
208 return;
209
210 spin_lock_irqsave(&mca_lock, flags);
211
212 /* Make sure motherboard setup is off */
213
214 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
215
216 /* Read in the appropriate register */
217
218 outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
219 outb_p(byte, MCA_POS_REG(reg));
220 outb_p(0, MCA_ADAPTER_SETUP_REG);
221
222 spin_unlock_irqrestore(&mca_lock, flags);
223
224 /* Update the global register list, while we have the byte */
225
226 mca_dev->pos[reg] = byte;
227
228}
229
230/* for the primary MCA bus, we have identity transforms */
231static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq)
232{
233 return irq;
234}
235
236static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port)
237{
238 return port;
239}
240
241static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem)
242{
243 return mem;
244}
245
246
247static int __init mca_init(void)
248{
249 unsigned int i, j;
250 struct mca_device *mca_dev;
251 unsigned char pos[8];
252 short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
253 struct mca_bus *bus;
254
255 /* WARNING: Be careful when making changes here. Putting an adapter
256 * and the motherboard simultaneously into setup mode may result in
257	 * damage to chips (according to The Indispensable PC Hardware Book
258 * by Hans-Peter Messmer). Also, we disable system interrupts (so
259 * that we are not disturbed in the middle of this).
260 */
261
262 /* Make sure the MCA bus is present */
263
264 if (mca_system_init()) {
265 printk(KERN_ERR "MCA bus system initialisation failed\n");
266 return -ENODEV;
267 }
268
269 if (!MCA_bus)
270 return -ENODEV;
271
272 printk(KERN_INFO "Micro Channel bus detected.\n");
273
274 /* All MCA systems have at least a primary bus */
275 bus = mca_attach_bus(MCA_PRIMARY_BUS);
276 if (!bus)
277 goto out_nomem;
278 bus->default_dma_mask = 0xffffffffLL;
279 bus->f.mca_write_pos = mca_pc_write_pos;
280 bus->f.mca_read_pos = mca_pc_read_pos;
281 bus->f.mca_transform_irq = mca_dummy_transform_irq;
282 bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
283 bus->f.mca_transform_memory = mca_dummy_transform_memory;
284
285 /* get the motherboard device */
286 mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
287 if(unlikely(!mca_dev))
288 goto out_nomem;
289
290 /*
291 * We do not expect many MCA interrupts during initialization,
292 * but let us be safe:
293 */
294 spin_lock_irq(&mca_lock);
295
296 /* Make sure adapter setup is off */
297
298 outb_p(0, MCA_ADAPTER_SETUP_REG);
299
300 /* Read motherboard POS registers */
301
302 mca_dev->pos_register = 0x7f;
303 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
304 mca_dev->name[0] = 0;
305 mca_read_and_store_pos(mca_dev->pos);
306 mca_configure_adapter_status(mca_dev);
307 /* fake POS and slot for a motherboard */
308 mca_dev->pos_id = MCA_MOTHERBOARD_POS;
309 mca_dev->slot = MCA_MOTHERBOARD;
310 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
311
312 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
313 if(unlikely(!mca_dev))
314 goto out_unlock_nomem;
315
316 /* Put motherboard into video setup mode, read integrated video
317 * POS registers, and turn motherboard setup off.
318 */
319
320 mca_dev->pos_register = 0xdf;
321 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
322 mca_dev->name[0] = 0;
323 mca_read_and_store_pos(mca_dev->pos);
324 mca_configure_adapter_status(mca_dev);
325 /* fake POS and slot for the integrated video */
326 mca_dev->pos_id = MCA_INTEGVIDEO_POS;
327 mca_dev->slot = MCA_INTEGVIDEO;
328 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
329
330 /* Put motherboard into scsi setup mode, read integrated scsi
331 * POS registers, and turn motherboard setup off.
332 *
333 * It seems there are two possible SCSI registers. Martin says that
334	 * for the 56 and 57, 0xf7 is the one, but it fails on the 76.
335 * Alfredo (apena@vnet.ibm.com) says
336 * 0xfd works on his machine. We'll try both of them. I figure it's
337 * a good bet that only one could be valid at a time. This could
338 * screw up though if one is used for something else on the other
339 * machine.
340 */
341
342 for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
343 outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
344 if(mca_read_and_store_pos(pos))
345 break;
346 }
347 if(which_scsi) {
348 /* found a scsi card */
349 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
350 if(unlikely(!mca_dev))
351 goto out_unlock_nomem;
352
353 for(j = 0; j < 8; j++)
354 mca_dev->pos[j] = pos[j];
355
356 mca_configure_adapter_status(mca_dev);
357 /* fake POS and slot for integrated SCSI controller */
358 mca_dev->pos_id = MCA_INTEGSCSI_POS;
359 mca_dev->slot = MCA_INTEGSCSI;
360 mca_dev->pos_register = which_scsi;
361 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
362 }
363
364 /* Turn off motherboard setup */
365
366 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
367
368 /* Now loop over MCA slots: put each adapter into setup mode, and
369 * read its POS registers. Then turn adapter setup off.
370 */
371
372 for(i=0; i<MCA_MAX_SLOT_NR; i++) {
373 outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
374 if(!mca_read_and_store_pos(pos))
375 continue;
376
377 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
378 if(unlikely(!mca_dev))
379 goto out_unlock_nomem;
380
381 for(j=0; j<8; j++)
382 mca_dev->pos[j]=pos[j];
383
384 mca_dev->driver_loaded = 0;
385 mca_dev->slot = i;
386 mca_dev->pos_register = 0;
387 mca_configure_adapter_status(mca_dev);
388 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
389 }
390 outb_p(0, MCA_ADAPTER_SETUP_REG);
391
392 /* Re-enable interrupts */
393 spin_unlock_irq(&mca_lock);
394
395 for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
396 request_resource(&ioport_resource, mca_standard_resources + i);
397
398 mca_do_proc_init();
399
400 return 0;
401
402 out_unlock_nomem:
403 spin_unlock_irq(&mca_lock);
404 out_nomem:
405 printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
406 return -ENOMEM;
407}
408
409subsys_initcall(mca_init);
410
411/*--------------------------------------------------------------------*/
412
413static __kprobes void
414mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
415{
416 int slot = mca_dev->slot;
417
418 if(slot == MCA_INTEGSCSI) {
419 printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
420 mca_dev->name);
421 } else if(slot == MCA_INTEGVIDEO) {
422 printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
423 mca_dev->name);
424 } else if(slot == MCA_MOTHERBOARD) {
425 printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
426 mca_dev->name);
427 }
428
429 /* More info available in POS 6 and 7? */
430
431 if(check_flag) {
432 unsigned char pos6, pos7;
433
434 pos6 = mca_device_read_pos(mca_dev, 6);
435 pos7 = mca_device_read_pos(mca_dev, 7);
436
437 printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
438 }
439
440} /* mca_handle_nmi_slot */
441
442/*--------------------------------------------------------------------*/
443
444static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
445{
446 struct mca_device *mca_dev = to_mca_device(dev);
447 unsigned char pos5;
448
449 pos5 = mca_device_read_pos(mca_dev, 5);
450
451 if(!(pos5 & 0x80)) {
452 /* Bit 7 of POS 5 is reset when this adapter has a hardware
453 * error. Bit 6 is reset if there's error information
454 * available in POS 6 and 7.
455 */
456 mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
457 return 1;
458 }
459 return 0;
460}
461
462void __kprobes mca_handle_nmi(void)
463{
464 /* First try - scan the various adapters and see if a specific
465 * adapter was responsible for the error.
466 */
467 bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
468
469 mca_nmi_hook();
470} /* mca_handle_nmi */
diff --git a/arch/x86/kernel/mce_64.c b/arch/x86/kernel/mce_64.c
new file mode 100644
index 000000000000..a66d607f5b92
--- /dev/null
+++ b/arch/x86/kernel/mce_64.c
@@ -0,0 +1,875 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
18#include <linux/capability.h>
19#include <linux/cpu.h>
20#include <linux/percpu.h>
21#include <linux/poll.h>
22#include <linux/thread_info.h>
23#include <linux/ctype.h>
24#include <linux/kmod.h>
25#include <linux/kdebug.h>
26#include <asm/processor.h>
27#include <asm/msr.h>
28#include <asm/mce.h>
29#include <asm/uaccess.h>
30#include <asm/smp.h>
31#include <asm/idle.h>
32
33#define MISC_MCELOG_MINOR 227
34#define NR_BANKS 6
35
36atomic_t mce_entry;
37
38static int mce_dont_init;
39
40/*
41 * Tolerant levels:
42 * 0: always panic on uncorrected errors, log corrected errors
43 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
44 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
45 * 3: never panic or SIGBUS, log all errors (for testing only)
46 */
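/* Note: the level is set at boot via the mce= option (parsed in
 * mcheck_enable() below) and can be changed at runtime through the
 * sysfs "tolerant" attribute created by the ACCESSOR() macros below. */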
47static int tolerant = 1;
48static int banks;
49static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
50static unsigned long notify_user;
51static int rip_msr;
52static int mce_bootlog = 1;
53static atomic_t mce_events;
54
55static char trigger[128];
56static char *trigger_argv[2] = { trigger, NULL };
57
58static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
59
60/*
61 * Lockless MCE logging infrastructure.
62 * This avoids deadlocks on printk locks without having to break locks. It
63 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
64 */
65
66struct mce_log mcelog = {
67 MCE_LOG_SIGNATURE,
68 MCE_LOG_LEN,
69};
70
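/*
 * Writer protocol, in short: claim a free slot by advancing mcelog.next
 * with cmpxchg(), copy the record in, and only then set its 'finished'
 * flag, so readers never see a partially written entry.
 */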
71void mce_log(struct mce *mce)
72{
73 unsigned next, entry;
74 atomic_inc(&mce_events);
75 mce->finished = 0;
76 wmb();
77 for (;;) {
78 entry = rcu_dereference(mcelog.next);
79 /* The rmb forces the compiler to reload next in each
80 iteration */
81 rmb();
82 for (;;) {
83 /* When the buffer fills up, discard new entries. Assume
84 that the earlier errors are the more interesting ones. */
85 if (entry >= MCE_LOG_LEN) {
86 set_bit(MCE_OVERFLOW, &mcelog.flags);
87 return;
88 }
89 /* Old left over entry. Skip. */
90 if (mcelog.entry[entry].finished) {
91 entry++;
92 continue;
93 }
94 break;
95 }
96 smp_rmb();
97 next = entry + 1;
98 if (cmpxchg(&mcelog.next, entry, next) == entry)
99 break;
100 }
101 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
102 wmb();
103 mcelog.entry[entry].finished = 1;
104 wmb();
105
106 set_bit(0, &notify_user);
107}
108
109static void print_mce(struct mce *m)
110{
111 printk(KERN_EMERG "\n"
112 KERN_EMERG "HARDWARE ERROR\n"
113 KERN_EMERG
114 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
115 m->cpu, m->mcgstatus, m->bank, m->status);
116 if (m->rip) {
117 printk(KERN_EMERG
118 "RIP%s %02x:<%016Lx> ",
119 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
120 m->cs, m->rip);
121 if (m->cs == __KERNEL_CS)
122 print_symbol("{%s}", m->rip);
123 printk("\n");
124 }
125 printk(KERN_EMERG "TSC %Lx ", m->tsc);
126 if (m->addr)
127 printk("ADDR %Lx ", m->addr);
128 if (m->misc)
129 printk("MISC %Lx ", m->misc);
130 printk("\n");
131 printk(KERN_EMERG "This is not a software problem!\n");
132 printk(KERN_EMERG
133 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
134}
135
136static void mce_panic(char *msg, struct mce *backup, unsigned long start)
137{
138 int i;
139
140 oops_begin();
141 for (i = 0; i < MCE_LOG_LEN; i++) {
142 unsigned long tsc = mcelog.entry[i].tsc;
143 if (time_before(tsc, start))
144 continue;
145 print_mce(&mcelog.entry[i]);
146 if (backup && mcelog.entry[i].tsc == backup->tsc)
147 backup = NULL;
148 }
149 if (backup)
150 print_mce(backup);
151 panic(msg);
152}
153
154static int mce_available(struct cpuinfo_x86 *c)
155{
156 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
157}
158
159static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
160{
161 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
162 m->rip = regs->rip;
163 m->cs = regs->cs;
164 } else {
165 m->rip = 0;
166 m->cs = 0;
167 }
168 if (rip_msr) {
169 /* Assume the RIP in the MSR is exact. Is this true? */
170 m->mcgstatus |= MCG_STATUS_EIPV;
171 rdmsrl(rip_msr, m->rip);
172 m->cs = 0;
173 }
174}
175
176/*
177 * The actual machine check handler
178 */
179
180void do_machine_check(struct pt_regs * regs, long error_code)
181{
182 struct mce m, panicm;
183 u64 mcestart = 0;
184 int i;
185 int panicm_found = 0;
186 /*
187 * If no_way_out gets set, there is no safe way to recover from this
188 * MCE. If tolerant is cranked up, we'll try anyway.
189 */
190 int no_way_out = 0;
191 /*
192 * If kill_it gets set, there might be a way to recover from this
193 * error.
194 */
195 int kill_it = 0;
196
197 atomic_inc(&mce_entry);
198
199 if (regs)
200 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
201 if (!banks)
202 goto out2;
203
204 memset(&m, 0, sizeof(struct mce));
205 m.cpu = smp_processor_id();
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 /* if the restart IP is not valid, we're done for */
208 if (!(m.mcgstatus & MCG_STATUS_RIPV))
209 no_way_out = 1;
210
211 rdtscll(mcestart);
212 barrier();
213
214 for (i = 0; i < banks; i++) {
215 if (!bank[i])
216 continue;
217
218 m.misc = 0;
219 m.addr = 0;
220 m.bank = i;
221 m.tsc = 0;
222
223 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
224 if ((m.status & MCI_STATUS_VAL) == 0)
225 continue;
226
227 if (m.status & MCI_STATUS_EN) {
228 /* if PCC was set, there's no way out */
229 no_way_out |= !!(m.status & MCI_STATUS_PCC);
230 /*
231 * If this error was uncorrectable and there was
232 * an overflow, we're in trouble. If no overflow,
233 * we might get away with just killing a task.
234 */
235 if (m.status & MCI_STATUS_UC) {
236 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
237 no_way_out = 1;
238 kill_it = 1;
239 }
240 }
241
242 if (m.status & MCI_STATUS_MISCV)
243 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
244 if (m.status & MCI_STATUS_ADDRV)
245 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
246
247 mce_get_rip(&m, regs);
248 if (error_code >= 0)
249 rdtscll(m.tsc);
250 if (error_code != -2)
251 mce_log(&m);
252
253 /* Did this bank cause the exception? */
254 /* Assume that the bank with uncorrectable errors did it,
255 and that there is only a single one. */
256 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
257 panicm = m;
258 panicm_found = 1;
259 }
260
261 add_taint(TAINT_MACHINE_CHECK);
262 }
263
264 /* Never do anything final in the polling timer */
265 if (!regs)
266 goto out;
267
268 /* If we didn't find an uncorrectable error, pick
269 the last one (shouldn't happen, just being safe). */
270 if (!panicm_found)
271 panicm = m;
272
273 /*
274 * If we have decided that we just CAN'T continue, and the user
275 * has not set tolerant to an insane level, give up and die.
276 */
277 if (no_way_out && tolerant < 3)
278 mce_panic("Machine check", &panicm, mcestart);
279
280 /*
281 * If the error seems to be unrecoverable, something should be
282 * done. Try to kill as little as possible. If we can kill just
283 * one task, do that. If the user has set the tolerance very
284 * high, don't try to do anything at all.
285 */
286 if (kill_it && tolerant < 3) {
287 int user_space = 0;
288
289 /*
290 * If the EIPV bit is set, it means the saved IP is the
291 * instruction which caused the MCE.
292 */
293 if (m.mcgstatus & MCG_STATUS_EIPV)
294 user_space = panicm.rip && (panicm.cs & 3);
295
296 /*
297 * If we know that the error was in user space, send a
298 * SIGBUS. Otherwise, panic if tolerance is low.
299 *
300 * do_exit() takes an awful lot of locks and has a slight
301 * risk of deadlocking.
302 */
303 if (user_space) {
304 do_exit(SIGBUS);
305 } else if (panic_on_oops || tolerant < 2) {
306 mce_panic("Uncorrected machine check",
307 &panicm, mcestart);
308 }
309 }
310
311 /* notify userspace ASAP */
312 set_thread_flag(TIF_MCE_NOTIFY);
313
314 out:
315 /* the last thing we do is clear state */
316 for (i = 0; i < banks; i++)
317 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
318 wrmsrl(MSR_IA32_MCG_STATUS, 0);
319 out2:
320 atomic_dec(&mce_entry);
321}
322
323#ifdef CONFIG_X86_MCE_INTEL
324/***
325 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
326 * @cpu: The CPU on which the event occurred.
327 * @status: Event status information
328 *
329 * This function should be called by the thermal interrupt after the
330 * event has been processed and the decision was made to log the event
331 * further.
332 *
333 * The status parameter will be saved to the 'status' field of 'struct mce'
334 * and historically has been the register value of the
335 * MSR_IA32_THERMAL_STATUS (Intel) msr.
336 */
337void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
338{
339 struct mce m;
340
341 memset(&m, 0, sizeof(m));
342 m.cpu = cpu;
343 m.bank = MCE_THERMAL_BANK;
344 m.status = status;
345 rdtscll(m.tsc);
346 mce_log(&m);
347}
348#endif /* CONFIG_X86_MCE_INTEL */
349
350/*
351 * Periodic polling timer for "silent" machine check errors. If the
352 * poller finds an MCE, poll 2x faster. When the poller finds no more
353 * errors, poll 2x slower (up to check_interval seconds).
354 */
355
356static int check_interval = 5 * 60; /* 5 minutes */
357static int next_interval; /* in jiffies */
358static void mcheck_timer(struct work_struct *work);
359static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
360
361static void mcheck_check_cpu(void *info)
362{
363 if (mce_available(&current_cpu_data))
364 do_machine_check(NULL, 0);
365}
366
367static void mcheck_timer(struct work_struct *work)
368{
369 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
370
371 /*
372 * Alert userspace if needed. If we logged an MCE, reduce the
373 * polling interval, otherwise increase the polling interval.
374 */
375 if (mce_notify_user()) {
376 next_interval = max(next_interval/2, HZ/100);
377 } else {
378 next_interval = min(next_interval*2,
379 (int)round_jiffies_relative(check_interval*HZ));
380 }
381
382 schedule_delayed_work(&mcheck_work, next_interval);
383}
384
385/*
386 * This is only called from process context. This is where we do
387 * anything we need to alert userspace about new MCEs. This is called
388 * directly from the poller and also from entry.S and idle, thanks to
389 * TIF_MCE_NOTIFY.
390 */
391int mce_notify_user(void)
392{
393 clear_thread_flag(TIF_MCE_NOTIFY);
394 if (test_and_clear_bit(0, &notify_user)) {
395 static unsigned long last_print;
396 unsigned long now = jiffies;
397
398 wake_up_interruptible(&mce_wait);
399 if (trigger[0])
400 call_usermodehelper(trigger, trigger_argv, NULL,
401 UMH_NO_WAIT);
402
403 if (time_after_eq(now, last_print + (check_interval*HZ))) {
404 last_print = now;
405 printk(KERN_INFO "Machine check events logged\n");
406 }
407
408 return 1;
409 }
410 return 0;
411}
412
413/* see if the idle task needs to notify userspace */
414static int
415mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
416{
417 /* IDLE_END should be safe - interrupts are back on */
418 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
419 mce_notify_user();
420
421 return NOTIFY_OK;
422}
423
424static struct notifier_block mce_idle_notifier = {
425 .notifier_call = mce_idle_callback,
426};
427
428static __init int periodic_mcheck_init(void)
429{
430 next_interval = check_interval * HZ;
431 if (next_interval)
432 schedule_delayed_work(&mcheck_work,
433 round_jiffies_relative(next_interval));
434 idle_notifier_register(&mce_idle_notifier);
435 return 0;
436}
437__initcall(periodic_mcheck_init);
438
439
440/*
441 * Initialize Machine Checks for a CPU.
442 */
443static void mce_init(void *dummy)
444{
445 u64 cap;
446 int i;
447
448 rdmsrl(MSR_IA32_MCG_CAP, cap);
449 banks = cap & 0xff;
450 if (banks > NR_BANKS) {
451 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
452 banks = NR_BANKS;
453 }
454 /* Use accurate RIP reporting if available. */
455 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
456 rip_msr = MSR_IA32_MCG_EIP;
457
458 /* Log the machine checks left over from the previous reset.
459 This also clears all registers */
460 do_machine_check(NULL, mce_bootlog ? -1 : -2);
461
462 set_in_cr4(X86_CR4_MCE);
463
464 if (cap & MCG_CTL_P)
465 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
466
467 for (i = 0; i < banks; i++) {
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
470 }
471}
472
473/* Add per CPU specific workarounds here */
474static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
475{
476 /* This should be disabled by the BIOS, but isn't always */
477 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
478 /* disable GART TBL walk error reporting, which trips off
479 incorrectly with the IOMMU & 3ware & Cerberus. */
480 clear_bit(10, &bank[4]);
481 /* Lots of broken BIOS around that don't clear them
482 by default and leave crap in there. Don't log. */
483 mce_bootlog = 0;
484 }
485
486}
487
488static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
489{
490 switch (c->x86_vendor) {
491 case X86_VENDOR_INTEL:
492 mce_intel_feature_init(c);
493 break;
494 case X86_VENDOR_AMD:
495 mce_amd_feature_init(c);
496 break;
497 default:
498 break;
499 }
500}
501
502/*
503 * Called for each booted CPU to set up machine checks.
504 * Must be called with preempt off.
505 */
506void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
507{
508 static cpumask_t mce_cpus = CPU_MASK_NONE;
509
510 mce_cpu_quirks(c);
511
512 if (mce_dont_init ||
513 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
514 !mce_available(c))
515 return;
516
517 mce_init(NULL);
518 mce_cpu_features(c);
519}
520
521/*
522 * Character device to read and clear the MCE log.
523 */
524
525static DEFINE_SPINLOCK(mce_state_lock);
526static int open_count; /* #times opened */
527static int open_exclu; /* already open exclusive? */
528
529static int mce_open(struct inode *inode, struct file *file)
530{
531 spin_lock(&mce_state_lock);
532
533 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
534 spin_unlock(&mce_state_lock);
535 return -EBUSY;
536 }
537
538 if (file->f_flags & O_EXCL)
539 open_exclu = 1;
540 open_count++;
541
542 spin_unlock(&mce_state_lock);
543
544 return nonseekable_open(inode, file);
545}
546
547static int mce_release(struct inode *inode, struct file *file)
548{
549 spin_lock(&mce_state_lock);
550
551 open_count--;
552 open_exclu = 0;
553
554 spin_unlock(&mce_state_lock);
555
556 return 0;
557}
558
559static void collect_tscs(void *data)
560{
561 unsigned long *cpu_tsc = (unsigned long *)data;
562 rdtscll(cpu_tsc[smp_processor_id()]);
563}
564
565static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
566{
567 unsigned long *cpu_tsc;
568 static DECLARE_MUTEX(mce_read_sem);
569 unsigned next;
570 char __user *buf = ubuf;
571 int i, err;
572
573 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
574 if (!cpu_tsc)
575 return -ENOMEM;
576
577 down(&mce_read_sem);
578 next = rcu_dereference(mcelog.next);
579
580 /* Only supports full reads right now */
581 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
582 up(&mce_read_sem);
583 kfree(cpu_tsc);
584 return -EINVAL;
585 }
586
587 err = 0;
588 for (i = 0; i < next; i++) {
589 unsigned long start = jiffies;
590 while (!mcelog.entry[i].finished) {
591 if (time_after_eq(jiffies, start + 2)) {
592 memset(mcelog.entry + i,0, sizeof(struct mce));
593 goto timeout;
594 }
595 cpu_relax();
596 }
597 smp_rmb();
598 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
599 buf += sizeof(struct mce);
600 timeout:
601 ;
602 }
603
604 memset(mcelog.entry, 0, next * sizeof(struct mce));
605 mcelog.next = 0;
606
607 synchronize_sched();
608
609 /* Collect entries that were still getting written before the synchronize. */
610
611 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
612 for (i = next; i < MCE_LOG_LEN; i++) {
613 if (mcelog.entry[i].finished &&
614 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
615 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
616 smp_rmb();
617 buf += sizeof(struct mce);
618 memset(&mcelog.entry[i], 0, sizeof(struct mce));
619 }
620 }
621 up(&mce_read_sem);
622 kfree(cpu_tsc);
623 return err ? -EFAULT : buf - ubuf;
624}
625
626static unsigned int mce_poll(struct file *file, poll_table *wait)
627{
628 poll_wait(file, &mce_wait, wait);
629 if (rcu_dereference(mcelog.next))
630 return POLLIN | POLLRDNORM;
631 return 0;
632}
633
634static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
635{
636 int __user *p = (int __user *)arg;
637 if (!capable(CAP_SYS_ADMIN))
638 return -EPERM;
639 switch (cmd) {
640 case MCE_GET_RECORD_LEN:
641 return put_user(sizeof(struct mce), p);
642 case MCE_GET_LOG_LEN:
643 return put_user(MCE_LOG_LEN, p);
644 case MCE_GETCLEAR_FLAGS: {
645 unsigned flags;
646 do {
647 flags = mcelog.flags;
648 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
649 return put_user(flags, p);
650 }
651 default:
652 return -ENOTTY;
653 }
654}
655
656static const struct file_operations mce_chrdev_ops = {
657 .open = mce_open,
658 .release = mce_release,
659 .read = mce_read,
660 .poll = mce_poll,
661 .ioctl = mce_ioctl,
662};
663
664static struct miscdevice mce_log_device = {
665 MISC_MCELOG_MINOR,
666 "mcelog",
667 &mce_chrdev_ops,
668};
669
670static unsigned long old_cr4 __initdata;
671
672void __init stop_mce(void)
673{
674 old_cr4 = read_cr4();
675 clear_in_cr4(X86_CR4_MCE);
676}
677
678void __init restart_mce(void)
679{
680 if (old_cr4 & X86_CR4_MCE)
681 set_in_cr4(X86_CR4_MCE);
682}
683
684/*
685 * Old style boot options parsing. Only for compatibility.
686 */
687
688static int __init mcheck_disable(char *str)
689{
690 mce_dont_init = 1;
691 return 1;
692}
693
694/* mce=off disables machine check. Note you can reenable it later
695 using sysfs.
696 mce=TOLERANCELEVEL (number, see above)
697 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
698 mce=nobootlog Don't log MCEs from before booting. */
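/* Illustrative examples: "mce=2" sets tolerant to 2, "mce=nobootlog"
   suppresses logging of pre-boot MCEs, and "mce=off" skips MCE init. */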
699static int __init mcheck_enable(char *str)
700{
701 if (*str == '=')
702 str++;
703 if (!strcmp(str, "off"))
704 mce_dont_init = 1;
705 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
706 mce_bootlog = str[0] == 'b';
707 else if (isdigit(str[0]))
708 get_option(&str, &tolerant);
709 else
710 printk("mce= argument %s ignored. Please use /sys", str);
711 return 1;
712}
713
714__setup("nomce", mcheck_disable);
715__setup("mce", mcheck_enable);
716
717/*
718 * Sysfs support
719 */
720
721/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
722 Only one CPU is active at this time, the others get re-added later using
723 CPU hotplug. */
724static int mce_resume(struct sys_device *dev)
725{
726 mce_init(NULL);
727 return 0;
728}
729
730/* Reinit MCEs after user configuration changes */
731static void mce_restart(void)
732{
733 if (next_interval)
734 cancel_delayed_work(&mcheck_work);
735 /* Timer race is harmless here */
736 on_each_cpu(mce_init, NULL, 1, 1);
737 next_interval = check_interval * HZ;
738 if (next_interval)
739 schedule_delayed_work(&mcheck_work,
740 round_jiffies_relative(next_interval));
741}
742
743static struct sysdev_class mce_sysclass = {
744 .resume = mce_resume,
745 set_kset_name("machinecheck"),
746};
747
748DEFINE_PER_CPU(struct sys_device, device_mce);
749
750/* Why are there no generic functions for this? */
751#define ACCESSOR(name, var, start) \
752 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
753 return sprintf(buf, "%lx\n", (unsigned long)var); \
754 } \
755 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
756 char *end; \
757 unsigned long new = simple_strtoul(buf, &end, 0); \
758 if (end == buf) return -EINVAL; \
759 var = new; \
760 start; \
761 return end-buf; \
762 } \
763 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
764
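/*
 * For illustration, ACCESSOR(bank0ctl, bank[0], mce_restart()) expands to a
 * show_bank0ctl()/set_bank0ctl() pair plus a 0644 "bank0ctl" sysdev attribute;
 * writing a new value stores it in bank[0] and then calls mce_restart().
 */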
765/* TBD should generate these dynamically based on number of available banks */
766ACCESSOR(bank0ctl,bank[0],mce_restart())
767ACCESSOR(bank1ctl,bank[1],mce_restart())
768ACCESSOR(bank2ctl,bank[2],mce_restart())
769ACCESSOR(bank3ctl,bank[3],mce_restart())
770ACCESSOR(bank4ctl,bank[4],mce_restart())
771ACCESSOR(bank5ctl,bank[5],mce_restart())
772
773static ssize_t show_trigger(struct sys_device *s, char *buf)
774{
775 strcpy(buf, trigger);
776 strcat(buf, "\n");
777 return strlen(trigger) + 1;
778}
779
780static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
781{
782 char *p;
783 int len;
784 strncpy(trigger, buf, sizeof(trigger));
785 trigger[sizeof(trigger)-1] = 0;
786 len = strlen(trigger);
787 p = strchr(trigger, '\n');
788 if (p) *p = 0;
789 return len;
790}
791
792static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
793ACCESSOR(tolerant,tolerant,)
794ACCESSOR(check_interval,check_interval,mce_restart())
795static struct sysdev_attribute *mce_attributes[] = {
796 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
797 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
798 &attr_tolerant, &attr_check_interval, &attr_trigger,
799 NULL
800};
801
802/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
803static __cpuinit int mce_create_device(unsigned int cpu)
804{
805 int err;
806 int i;
807 if (!mce_available(&cpu_data[cpu]))
808 return -EIO;
809
810 per_cpu(device_mce,cpu).id = cpu;
811 per_cpu(device_mce,cpu).cls = &mce_sysclass;
812
813 err = sysdev_register(&per_cpu(device_mce,cpu));
814
815 if (!err) {
816 for (i = 0; mce_attributes[i]; i++)
817 sysdev_create_file(&per_cpu(device_mce,cpu),
818 mce_attributes[i]);
819 }
820 return err;
821}
822
823static void mce_remove_device(unsigned int cpu)
824{
825 int i;
826
827 for (i = 0; mce_attributes[i]; i++)
828 sysdev_remove_file(&per_cpu(device_mce,cpu),
829 mce_attributes[i]);
830 sysdev_unregister(&per_cpu(device_mce,cpu));
831 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
832}
833
834/* Get notified when a cpu comes on/off. Be hotplug friendly. */
835static int
836mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
837{
838 unsigned int cpu = (unsigned long)hcpu;
839
840 switch (action) {
841 case CPU_ONLINE:
842 case CPU_ONLINE_FROZEN:
843 mce_create_device(cpu);
844 break;
845 case CPU_DEAD:
846 case CPU_DEAD_FROZEN:
847 mce_remove_device(cpu);
848 break;
849 }
850 return NOTIFY_OK;
851}
852
853static struct notifier_block mce_cpu_notifier = {
854 .notifier_call = mce_cpu_callback,
855};
856
857static __init int mce_init_device(void)
858{
859 int err;
860 int i = 0;
861
862 if (!mce_available(&boot_cpu_data))
863 return -EIO;
864 err = sysdev_class_register(&mce_sysclass);
865
866 for_each_online_cpu(i) {
867 mce_create_device(i);
868 }
869
870 register_hotcpu_notifier(&mce_cpu_notifier);
871 misc_register(&mce_log_device);
872 return err;
873}
874
875device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/mce_amd_64.c b/arch/x86/kernel/mce_amd_64.c
new file mode 100644
index 000000000000..2f8a7f18b0fe
--- /dev/null
+++ b/arch/x86/kernel/mce_amd_64.c
@@ -0,0 +1,689 @@
1/*
2 * (c) 2005, 2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Written by Jacob Shin - AMD, Inc.
8 *
9 * Support : jacob.shin@amd.com
10 *
11 * April 2006
12 * - added support for AMD Family 0x10 processors
13 *
14 * All MC4_MISCi registers are shared between the cores of a multi-core package
15 */
16
17#include <linux/cpu.h>
18#include <linux/errno.h>
19#include <linux/init.h>
20#include <linux/interrupt.h>
21#include <linux/kobject.h>
22#include <linux/notifier.h>
23#include <linux/sched.h>
24#include <linux/smp.h>
25#include <linux/sysdev.h>
26#include <linux/sysfs.h>
27#include <asm/apic.h>
28#include <asm/mce.h>
29#include <asm/msr.h>
30#include <asm/percpu.h>
31#include <asm/idle.h>
32
33#define PFX "mce_threshold: "
34#define VERSION "version 1.1.1"
35#define NR_BANKS 6
36#define NR_BLOCKS 9
37#define THRESHOLD_MAX 0xFFF
38#define INT_TYPE_APIC 0x00020000
39#define MASK_VALID_HI 0x80000000
40#define MASK_CNTP_HI 0x40000000
41#define MASK_LOCKED_HI 0x20000000
42#define MASK_LVTOFF_HI 0x00F00000
43#define MASK_COUNT_EN_HI 0x00080000
44#define MASK_INT_TYPE_HI 0x00060000
45#define MASK_OVERFLOW_HI 0x00010000
46#define MASK_ERR_COUNT_HI 0x00000FFF
47#define MASK_BLKPTR_LO 0xFF000000
48#define MCG_XBLK_ADDR 0xC0000400
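/*
 * Block 0 of a bank is its MCi_MISC MSR; if the BLKPTR field in the high
 * word is non-zero, it is an offset from MCG_XBLK_ADDR to a run of extra
 * threshold blocks, which are then walked as consecutive MSRs below.
 */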
49
50struct threshold_block {
51 unsigned int block;
52 unsigned int bank;
53 unsigned int cpu;
54 u32 address;
55 u16 interrupt_enable;
56 u16 threshold_limit;
57 struct kobject kobj;
58 struct list_head miscj;
59};
60
61/* defaults used early on boot */
62static struct threshold_block threshold_defaults = {
63 .interrupt_enable = 0,
64 .threshold_limit = THRESHOLD_MAX,
65};
66
67struct threshold_bank {
68 struct kobject kobj;
69 struct threshold_block *blocks;
70 cpumask_t cpus;
71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73
74#ifdef CONFIG_SMP
75static unsigned char shared_bank[NR_BANKS] = {
76 0, 0, 0, 0, 1
77};
78#endif
79
80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
81
82/*
83 * CPU Initialization
84 */
85
86/* must be called with correct cpu affinity */
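/* The hardware error counter counts up and fires on overflow, so it is
 * seeded with THRESHOLD_MAX - threshold_limit; show_error_count() below
 * applies the inverse to report the number of errors seen so far. */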
87static void threshold_restart_bank(struct threshold_block *b,
88 int reset, u16 old_limit)
89{
90 u32 mci_misc_hi, mci_misc_lo;
91
92 rdmsr(b->address, mci_misc_lo, mci_misc_hi);
93
94 if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
95 reset = 1; /* limit cannot be lower than err count */
96
97 if (reset) { /* reset err count and overflow bit */
98 mci_misc_hi =
99 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
100 (THRESHOLD_MAX - b->threshold_limit);
101 } else if (old_limit) { /* change limit w/o reset */
102 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
103 (old_limit - b->threshold_limit);
104 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
105 (new_count & THRESHOLD_MAX);
106 }
107
108 b->interrupt_enable ?
109 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
110 (mci_misc_hi &= ~MASK_INT_TYPE_HI);
111
112 mci_misc_hi |= MASK_COUNT_EN_HI;
113 wrmsr(b->address, mci_misc_lo, mci_misc_hi);
114}
115
116/* cpu init entry point, called from mce.c with preempt off */
117void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
118{
119 unsigned int bank, block;
120 unsigned int cpu = smp_processor_id();
121 u32 low = 0, high = 0, address = 0;
122
123 for (bank = 0; bank < NR_BANKS; ++bank) {
124 for (block = 0; block < NR_BLOCKS; ++block) {
125 if (block == 0)
126 address = MSR_IA32_MC0_MISC + bank * 4;
127 else if (block == 1) {
128 address = (low & MASK_BLKPTR_LO) >> 21;
129 if (!address)
130 break;
131 address += MCG_XBLK_ADDR;
132 }
133 else
134 ++address;
135
136 if (rdmsr_safe(address, &low, &high))
137 break;
138
139 if (!(high & MASK_VALID_HI)) {
140 if (block)
141 continue;
142 else
143 break;
144 }
145
146 if (!(high & MASK_CNTP_HI) ||
147 (high & MASK_LOCKED_HI))
148 continue;
149
150 if (!block)
151 per_cpu(bank_map, cpu) |= (1 << bank);
152#ifdef CONFIG_SMP
153 if (shared_bank[bank] && c->cpu_core_id)
154 break;
155#endif
156 high &= ~MASK_LVTOFF_HI;
157 high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
158 wrmsr(address, low, high);
159
160 setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
161 THRESHOLD_APIC_VECTOR,
162 K8_APIC_EXT_INT_MSG_FIX, 0);
163
164 threshold_defaults.address = address;
165 threshold_restart_bank(&threshold_defaults, 0, 0);
166 }
167 }
168}
169
170/*
171 * APIC Interrupt Handler
172 */
173
174/*
175 * The threshold interrupt handler services THRESHOLD_APIC_VECTOR.
176 * The interrupt goes off when error_count reaches threshold_limit.
177 * The handler simply logs an mcelog entry with a software-defined bank number.
178 */
179asmlinkage void mce_threshold_interrupt(void)
180{
181 unsigned int bank, block;
182 struct mce m;
183 u32 low = 0, high = 0, address = 0;
184
185 ack_APIC_irq();
186 exit_idle();
187 irq_enter();
188
189 memset(&m, 0, sizeof(m));
190 rdtscll(m.tsc);
191 m.cpu = smp_processor_id();
192
193 /* assume first bank caused it */
194 for (bank = 0; bank < NR_BANKS; ++bank) {
195 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
196 continue;
197 for (block = 0; block < NR_BLOCKS; ++block) {
198 if (block == 0)
199 address = MSR_IA32_MC0_MISC + bank * 4;
200 else if (block == 1) {
201 address = (low & MASK_BLKPTR_LO) >> 21;
202 if (!address)
203 break;
204 address += MCG_XBLK_ADDR;
205 }
206 else
207 ++address;
208
209 if (rdmsr_safe(address, &low, &high))
210 break;
211
212 if (!(high & MASK_VALID_HI)) {
213 if (block)
214 continue;
215 else
216 break;
217 }
218
219 if (!(high & MASK_CNTP_HI) ||
220 (high & MASK_LOCKED_HI))
221 continue;
222
223 /* Log the machine check that caused the threshold
224 event. */
225 do_machine_check(NULL, 0);
226
227 if (high & MASK_OVERFLOW_HI) {
228 rdmsrl(address, m.misc);
229 rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
230 m.status);
231 m.bank = K8_MCE_THRESHOLD_BASE
232 + bank * NR_BLOCKS
233 + block;
234 mce_log(&m);
235 goto out;
236 }
237 }
238 }
239out:
240 irq_exit();
241}
242
243/*
244 * Sysfs Interface
245 */
246
247struct threshold_attr {
248 struct attribute attr;
249 ssize_t(*show) (struct threshold_block *, char *);
250 ssize_t(*store) (struct threshold_block *, const char *, size_t count);
251};
252
253static cpumask_t affinity_set(unsigned int cpu)
254{
255 cpumask_t oldmask = current->cpus_allowed;
256 cpumask_t newmask = CPU_MASK_NONE;
257 cpu_set(cpu, newmask);
258 set_cpus_allowed(current, newmask);
259 return oldmask;
260}
261
262static void affinity_restore(cpumask_t oldmask)
263{
264 set_cpus_allowed(current, oldmask);
265}
266
267#define SHOW_FIELDS(name) \
268static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
269{ \
270 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
271}
272SHOW_FIELDS(interrupt_enable)
273SHOW_FIELDS(threshold_limit)
274
275static ssize_t store_interrupt_enable(struct threshold_block *b,
276 const char *buf, size_t count)
277{
278 char *end;
279 cpumask_t oldmask;
280 unsigned long new = simple_strtoul(buf, &end, 0);
281 if (end == buf)
282 return -EINVAL;
283 b->interrupt_enable = !!new;
284
285 oldmask = affinity_set(b->cpu);
286 threshold_restart_bank(b, 0, 0);
287 affinity_restore(oldmask);
288
289 return end - buf;
290}
291
292static ssize_t store_threshold_limit(struct threshold_block *b,
293 const char *buf, size_t count)
294{
295 char *end;
296 cpumask_t oldmask;
297 u16 old;
298 unsigned long new = simple_strtoul(buf, &end, 0);
299 if (end == buf)
300 return -EINVAL;
301 if (new > THRESHOLD_MAX)
302 new = THRESHOLD_MAX;
303 if (new < 1)
304 new = 1;
305 old = b->threshold_limit;
306 b->threshold_limit = new;
307
308 oldmask = affinity_set(b->cpu);
309 threshold_restart_bank(b, 0, old);
310 affinity_restore(oldmask);
311
312 return end - buf;
313}
314
315static ssize_t show_error_count(struct threshold_block *b, char *buf)
316{
317 u32 high, low;
318 cpumask_t oldmask;
319 oldmask = affinity_set(b->cpu);
320 rdmsr(b->address, low, high);
321 affinity_restore(oldmask);
322 return sprintf(buf, "%x\n",
323 (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
324}
325
326static ssize_t store_error_count(struct threshold_block *b,
327 const char *buf, size_t count)
328{
329 cpumask_t oldmask;
330 oldmask = affinity_set(b->cpu);
331 threshold_restart_bank(b, 1, 0);
332 affinity_restore(oldmask);
333 return 1;
334}
335
336#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \
337 .attr = {.name = __stringify(_name), .mode = _mode }, \
338 .show = _show, \
339 .store = _store, \
340};
341
342#define RW_ATTR(name) \
343static struct threshold_attr name = \
344 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
345
346RW_ATTR(interrupt_enable);
347RW_ATTR(threshold_limit);
348RW_ATTR(error_count);
349
350static struct attribute *default_attrs[] = {
351 &interrupt_enable.attr,
352 &threshold_limit.attr,
353 &error_count.attr,
354 NULL
355};
356
357#define to_block(k) container_of(k, struct threshold_block, kobj)
358#define to_attr(a) container_of(a, struct threshold_attr, attr)
359
360static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
361{
362 struct threshold_block *b = to_block(kobj);
363 struct threshold_attr *a = to_attr(attr);
364 ssize_t ret;
365 ret = a->show ? a->show(b, buf) : -EIO;
366 return ret;
367}
368
369static ssize_t store(struct kobject *kobj, struct attribute *attr,
370 const char *buf, size_t count)
371{
372 struct threshold_block *b = to_block(kobj);
373 struct threshold_attr *a = to_attr(attr);
374 ssize_t ret;
375 ret = a->store ? a->store(b, buf, count) : -EIO;
376 return ret;
377}
378
379static struct sysfs_ops threshold_ops = {
380 .show = show,
381 .store = store,
382};
383
384static struct kobj_type threshold_ktype = {
385 .sysfs_ops = &threshold_ops,
386 .default_attrs = default_attrs,
387};
388
389static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
390 unsigned int bank,
391 unsigned int block,
392 u32 address)
393{
394 int err;
395 u32 low, high;
396 struct threshold_block *b = NULL;
397
398 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
399 return 0;
400
401 if (rdmsr_safe(address, &low, &high))
402 return 0;
403
404 if (!(high & MASK_VALID_HI)) {
405 if (block)
406 goto recurse;
407 else
408 return 0;
409 }
410
411 if (!(high & MASK_CNTP_HI) ||
412 (high & MASK_LOCKED_HI))
413 goto recurse;
414
415 b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
416 if (!b)
417 return -ENOMEM;
418
419 b->block = block;
420 b->bank = bank;
421 b->cpu = cpu;
422 b->address = address;
423 b->interrupt_enable = 0;
424 b->threshold_limit = THRESHOLD_MAX;
425
426 INIT_LIST_HEAD(&b->miscj);
427
428 if (per_cpu(threshold_banks, cpu)[bank]->blocks)
429 list_add(&b->miscj,
430 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
431 else
432 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
433
434 kobject_set_name(&b->kobj, "misc%i", block);
435 b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj;
436 b->kobj.ktype = &threshold_ktype;
437 err = kobject_register(&b->kobj);
438 if (err)
439 goto out_free;
440recurse:
441 if (!block) {
442 address = (low & MASK_BLKPTR_LO) >> 21;
443 if (!address)
444 return 0;
445 address += MCG_XBLK_ADDR;
446 } else
447 ++address;
448
449 err = allocate_threshold_blocks(cpu, bank, ++block, address);
450 if (err)
451 goto out_free;
452
453 return err;
454
455out_free:
456 if (b) {
457 kobject_unregister(&b->kobj);
458 kfree(b);
459 }
460 return err;
461}
462
463/* symlinks sibling shared banks to first core. first core owns dir/files. */
464static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
465{
466 int i, err = 0;
467 struct threshold_bank *b = NULL;
468 cpumask_t oldmask = CPU_MASK_NONE;
469 char name[32];
470
471 sprintf(name, "threshold_bank%i", bank);
472
473#ifdef CONFIG_SMP
474 if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */
475 i = first_cpu(cpu_core_map[cpu]);
476
477 /* first core not up yet */
478 if (cpu_data[i].cpu_core_id)
479 goto out;
480
481 /* already linked */
482 if (per_cpu(threshold_banks, cpu)[bank])
483 goto out;
484
485 b = per_cpu(threshold_banks, i)[bank];
486
487 if (!b)
488 goto out;
489
490 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
491 &b->kobj, name);
492 if (err)
493 goto out;
494
495 b->cpus = cpu_core_map[cpu];
496 per_cpu(threshold_banks, cpu)[bank] = b;
497 goto out;
498 }
499#endif
500
501 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
502 if (!b) {
503 err = -ENOMEM;
504 goto out;
505 }
506
507 kobject_set_name(&b->kobj, "threshold_bank%i", bank);
508 b->kobj.parent = &per_cpu(device_mce, cpu).kobj;
509#ifndef CONFIG_SMP
510 b->cpus = CPU_MASK_ALL;
511#else
512 b->cpus = cpu_core_map[cpu];
513#endif
514 err = kobject_register(&b->kobj);
515 if (err)
516 goto out_free;
517
518 per_cpu(threshold_banks, cpu)[bank] = b;
519
520 oldmask = affinity_set(cpu);
521 err = allocate_threshold_blocks(cpu, bank, 0,
522 MSR_IA32_MC0_MISC + bank * 4);
523 affinity_restore(oldmask);
524
525 if (err)
526 goto out_free;
527
528 for_each_cpu_mask(i, b->cpus) {
529 if (i == cpu)
530 continue;
531
532 err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
533 &b->kobj, name);
534 if (err)
535 goto out;
536
537 per_cpu(threshold_banks, i)[bank] = b;
538 }
539
540 goto out;
541
542out_free:
543 per_cpu(threshold_banks, cpu)[bank] = NULL;
544 kfree(b);
545out:
546 return err;
547}
548
549/* create dir/files for all valid threshold banks */
550static __cpuinit int threshold_create_device(unsigned int cpu)
551{
552 unsigned int bank;
553 int err = 0;
554
555 for (bank = 0; bank < NR_BANKS; ++bank) {
556 if (!(per_cpu(bank_map, cpu) & 1 << bank))
557 continue;
558 err = threshold_create_bank(cpu, bank);
559 if (err)
560 goto out;
561 }
562out:
563 return err;
564}
565
566/*
567 * Let's be hotplug friendly.
568 * In case of multi-core processors, the first core always takes ownership
569 * of the shared sysfs dir/files, and the rest of the cores are symlinked to it.
570 */
571
572static void deallocate_threshold_block(unsigned int cpu,
573 unsigned int bank)
574{
575 struct threshold_block *pos = NULL;
576 struct threshold_block *tmp = NULL;
577 struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
578
579 if (!head)
580 return;
581
582 list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
583 kobject_unregister(&pos->kobj);
584 list_del(&pos->miscj);
585 kfree(pos);
586 }
587
588 kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
589 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
590}
591
592static void threshold_remove_bank(unsigned int cpu, int bank)
593{
594 int i = 0;
595 struct threshold_bank *b;
596 char name[32];
597
598 b = per_cpu(threshold_banks, cpu)[bank];
599
600 if (!b)
601 return;
602
603 if (!b->blocks)
604 goto free_out;
605
606 sprintf(name, "threshold_bank%i", bank);
607
608#ifdef CONFIG_SMP
609 /* sibling symlink */
610 if (shared_bank[bank] && b->blocks->cpu != cpu) {
611 sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
612 per_cpu(threshold_banks, cpu)[bank] = NULL;
613 return;
614 }
615#endif
616
617 /* remove all sibling symlinks before unregistering */
618 for_each_cpu_mask(i, b->cpus) {
619 if (i == cpu)
620 continue;
621
622 sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
623 per_cpu(threshold_banks, i)[bank] = NULL;
624 }
625
626 deallocate_threshold_block(cpu, bank);
627
628free_out:
629 kobject_unregister(&b->kobj);
630 kfree(b);
631 per_cpu(threshold_banks, cpu)[bank] = NULL;
632}
633
634static void threshold_remove_device(unsigned int cpu)
635{
636 unsigned int bank;
637
638 for (bank = 0; bank < NR_BANKS; ++bank) {
639 if (!(per_cpu(bank_map, cpu) & 1 << bank))
640 continue;
641 threshold_remove_bank(cpu, bank);
642 }
643}
644
645/* get notified when a cpu comes on/off */
646static int threshold_cpu_callback(struct notifier_block *nfb,
647 unsigned long action, void *hcpu)
648{
649 /* cpu was unsigned int to begin with */
650 unsigned int cpu = (unsigned long)hcpu;
651
652 if (cpu >= NR_CPUS)
653 goto out;
654
655 switch (action) {
656 case CPU_ONLINE:
657 case CPU_ONLINE_FROZEN:
658 threshold_create_device(cpu);
659 break;
660 case CPU_DEAD:
661 case CPU_DEAD_FROZEN:
662 threshold_remove_device(cpu);
663 break;
664 default:
665 break;
666 }
667 out:
668 return NOTIFY_OK;
669}
670
671static struct notifier_block threshold_cpu_notifier = {
672 .notifier_call = threshold_cpu_callback,
673};
674
675static __init int threshold_init_device(void)
676{
677 unsigned lcpu = 0;
678
679 /* to hit CPUs online before the notifier is up */
680 for_each_online_cpu(lcpu) {
681 int err = threshold_create_device(lcpu);
682 if (err)
683 return err;
684 }
685 register_hotcpu_notifier(&threshold_cpu_notifier);
686 return 0;
687}
688
689device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/mce_intel_64.c b/arch/x86/kernel/mce_intel_64.c
new file mode 100644
index 000000000000..6551505d8a2c
--- /dev/null
+++ b/arch/x86/kernel/mce_intel_64.c
@@ -0,0 +1,89 @@
1/*
2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 */
5
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <linux/percpu.h>
9#include <asm/processor.h>
10#include <asm/msr.h>
11#include <asm/mce.h>
12#include <asm/hw_irq.h>
13#include <asm/idle.h>
14#include <asm/therm_throt.h>
15
16asmlinkage void smp_thermal_interrupt(void)
17{
18 __u64 msr_val;
19
20 ack_APIC_irq();
21
22 exit_idle();
23 irq_enter();
24
25 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
26 if (therm_throt_process(msr_val & 1))
27 mce_log_therm_throt_event(smp_processor_id(), msr_val);
28
29 irq_exit();
30}
31
32static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
33{
34 u32 l, h;
35 int tm2 = 0;
36 unsigned int cpu = smp_processor_id();
37
38 if (!cpu_has(c, X86_FEATURE_ACPI))
39 return;
40
41 if (!cpu_has(c, X86_FEATURE_ACC))
42 return;
43
44 /* first check if TM1 is already enabled by the BIOS, in which
45 * case there might be some SMM goo which handles it, so we can't even
46 * put a handler since it might be delivered via SMI already.
47 */
48 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
49 h = apic_read(APIC_LVTTHMR);
50 if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
51 printk(KERN_DEBUG
52 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
53 return;
54 }
55
56 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
57 tm2 = 1;
58
59 if (h & APIC_VECTOR_MASK) {
60 printk(KERN_DEBUG
61 "CPU%d: Thermal LVT vector (%#x) already "
62 "installed\n", cpu, (h & APIC_VECTOR_MASK));
63 return;
64 }
65
66 h = THERMAL_APIC_VECTOR;
67 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
68 apic_write(APIC_LVTTHMR, h);
69
70 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
71 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
72
73 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
74 wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
75
76 l = apic_read(APIC_LVTTHMR);
77 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
78 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
79 cpu, tm2 ? "TM2" : "TM1");
80
81 /* enable thermal throttle processing */
82 atomic_set(&therm_throt_en, 1);
83 return;
84}
85
86void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
87{
88 intel_init_thermal(c);
89}
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
new file mode 100644
index 000000000000..09cf78110358
--- /dev/null
+++ b/arch/x86/kernel/microcode.c
@@ -0,0 +1,850 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows upgrading microcode on Intel processors
8 * belonging to the IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
12 * Order Number 245472 or free download from:
13 *
14 * http://developer.intel.com/design/pentium4/manuals/245472.htm
15 *
16 * For more information, go to http://www.urbanmyth.org/microcode
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 *
23 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
24 * Initial release.
25 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
26 * Added read() support + cleanups.
27 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
28 * Added 'device trimming' support. open(O_WRONLY) zeroes
29 * and frees the saved copy of applied microcode.
30 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
31 * Made to use devfs (/dev/cpu/microcode) + cleanups.
32 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
33 * Added misc device support (now uses both devfs and misc).
34 * Added MICROCODE_IOCFREE ioctl to clear memory.
35 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
36 * Messages for error cases (non Intel & no suitable microcode).
37 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
38 * Removed ->release(). Removed exclusive open and status bitmap.
39 * Added microcode_rwsem to serialize read()/write()/ioctl().
40 * Removed global kernel lock usage.
41 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
42 * Write 0 to 0x8B msr and then cpuid before reading revision,
43 * so that it works even if there were no update done by the
44 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
45 * to be 0 on my machine which is why it worked even when I
46 * disabled update by the BIOS)
47 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
48 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
49 * Tigran Aivazian <tigran@veritas.com>
50 * Intel Pentium 4 processor support and bugfixes.
51 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
52 * Bugfix for HT (Hyper-Threading) enabled processors
53 * whereby processor resources are shared by all logical processors
54 * in a single CPU package.
55 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
56 * Tigran Aivazian <tigran@veritas.com>,
57 * Serialize updates as required on HT processors due to speculative
58 * nature of implementation.
59 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
60 * Fix the panic when writing zero-length microcode chunk.
61 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
62 * Jun Nakajima <jun.nakajima@intel.com>
63 * Support for the microcode updates in the new format.
64 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
65 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
66 * because we no longer hold a copy of applied microcode
67 * in kernel memory.
68 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
69 * Fix sigmatch() macro to handle old CPUs with pf == 0.
70 * Thanks to Stuart Swales for pointing out this bug.
71 */
72
73//#define DEBUG /* pr_debug */
74#include <linux/capability.h>
75#include <linux/kernel.h>
76#include <linux/init.h>
77#include <linux/sched.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94
95MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
96MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
97MODULE_LICENSE("GPL");
98
99#define MICROCODE_VERSION "1.14a"
100
101#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
102#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
103#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
104#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
105#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
106#define DWSIZE (sizeof (u32))
107#define get_totalsize(mc) \
108 (((microcode_t *)mc)->hdr.totalsize ? \
109 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
110#define get_datasize(mc) \
111 (((microcode_t *)mc)->hdr.datasize ? \
112 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
113
114#define sigmatch(s1, s2, p1, p2) \
115 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
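/* sigmatch(): the signatures must be identical and the platform-flag masks
   must share a bit; the (0,0) case keeps old CPUs that report pf == 0 working
   (see changelog entry 1.14 above). */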
116
117#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
118
119/* serialize access to the physical write to MSR 0x79 */
120static DEFINE_SPINLOCK(microcode_update_lock);
121
122/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
123static DEFINE_MUTEX(microcode_mutex);
124
125static struct ucode_cpu_info {
126 int valid;
127 unsigned int sig;
128 unsigned int pf;
129 unsigned int rev;
130 microcode_t *mc;
131} ucode_cpu_info[NR_CPUS];
132
133static void collect_cpu_info(int cpu_num)
134{
135 struct cpuinfo_x86 *c = cpu_data + cpu_num;
136 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
137 unsigned int val[2];
138
139 /* We should bind the task to the CPU */
140 BUG_ON(raw_smp_processor_id() != cpu_num);
141 uci->pf = uci->rev = 0;
142 uci->mc = NULL;
143 uci->valid = 1;
144
145 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
146 cpu_has(c, X86_FEATURE_IA64)) {
147 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
148 "processor\n", cpu_num);
149 uci->valid = 0;
150 return;
151 }
152
153 uci->sig = cpuid_eax(0x00000001);
154
155 if ((c->x86_model >= 5) || (c->x86 > 6)) {
156 /* get processor flags from MSR 0x17 */
157 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
158 uci->pf = 1 << ((val[1] >> 18) & 7);
159 }
160
161 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
162 /* see notes above for revision 1.07. Apparent chip bug */
163 sync_core();
164 /* get the current revision from MSR 0x8B */
165 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
166 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
167 uci->sig, uci->pf, uci->rev);
168}
169
170static inline int microcode_update_match(int cpu_num,
171 microcode_header_t *mc_header, int sig, int pf)
172{
173 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
174
175 if (!sigmatch(sig, uci->sig, pf, uci->pf)
176 || mc_header->rev <= uci->rev)
177 return 0;
178 return 1;
179}
180
181static int microcode_sanity_check(void *mc)
182{
183 microcode_header_t *mc_header = mc;
184 struct extended_sigtable *ext_header = NULL;
185 struct extended_signature *ext_sig;
186 unsigned long total_size, data_size, ext_table_size;
187 int sum, orig_sum, ext_sigcount = 0, i;
188
189 total_size = get_totalsize(mc_header);
190 data_size = get_datasize(mc_header);
191 if (data_size + MC_HEADER_SIZE > total_size) {
192 printk(KERN_ERR "microcode: error! "
193 "Bad data size in microcode data file\n");
194 return -EINVAL;
195 }
196
197 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
198 printk(KERN_ERR "microcode: error! "
199 "Unknown microcode update format\n");
200 return -EINVAL;
201 }
202 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
203 if (ext_table_size) {
204 if ((ext_table_size < EXT_HEADER_SIZE)
205 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
206 printk(KERN_ERR "microcode: error! "
207 "Small exttable size in microcode data file\n");
208 return -EINVAL;
209 }
210 ext_header = mc + MC_HEADER_SIZE + data_size;
211 if (ext_table_size != exttable_size(ext_header)) {
212 printk(KERN_ERR "microcode: error! "
213 "Bad exttable size in microcode data file\n");
214 return -EFAULT;
215 }
216 ext_sigcount = ext_header->count;
217 }
218
219 /* check extended table checksum */
220 if (ext_table_size) {
221 int ext_table_sum = 0;
222 int *ext_tablep = (int *)ext_header;
223
224 i = ext_table_size / DWSIZE;
225 while (i--)
226 ext_table_sum += ext_tablep[i];
227 if (ext_table_sum) {
228 printk(KERN_WARNING "microcode: aborting, "
229 "bad extended signature table checksum\n");
230 return -EINVAL;
231 }
232 }
233
234 /* calculate the checksum */
235 orig_sum = 0;
236 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
237 while (i--)
238 orig_sum += ((int *)mc)[i];
239 if (orig_sum) {
240 printk(KERN_ERR "microcode: aborting, bad checksum\n");
241 return -EINVAL;
242 }
243 if (!ext_table_size)
244 return 0;
245 /* check extended signature checksum */
246 for (i = 0; i < ext_sigcount; i++) {
247 ext_sig = (struct extended_signature *)((void *)ext_header
248 + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
249 sum = orig_sum
250 - (mc_header->sig + mc_header->pf + mc_header->cksum)
251 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
252 if (sum) {
253 printk(KERN_ERR "microcode: aborting, bad checksum\n");
254 return -EINVAL;
255 }
256 }
257 return 0;
258}
259
260/*
261 * return 0 - no update found
262 * return 1 - found update
263 * return < 0 - error
264 */
265static int get_maching_microcode(void *mc, int cpu)
266{
267 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
268 microcode_header_t *mc_header = mc;
269 struct extended_sigtable *ext_header;
270 unsigned long total_size = get_totalsize(mc_header);
271 int ext_sigcount, i;
272 struct extended_signature *ext_sig;
273 void *new_mc;
274
275 if (microcode_update_match(cpu, mc_header,
276 mc_header->sig, mc_header->pf))
277 goto find;
278
279 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
280 return 0;
281
282 ext_header = (struct extended_sigtable *)(mc +
283 get_datasize(mc_header) + MC_HEADER_SIZE);
284 ext_sigcount = ext_header->count;
285 ext_sig = (struct extended_signature *)((void *)ext_header
286 + EXT_HEADER_SIZE);
287 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf))
290 goto find;
291 ext_sig++;
292 }
293 return 0;
294find:
295 pr_debug("microcode: CPU %d found a matching microcode update with"
296 " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
297 new_mc = vmalloc(total_size);
298 if (!new_mc) {
299 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
300 return -ENOMEM;
301 }
302
303 /* free previous update file */
304 vfree(uci->mc);
305
306 memcpy(new_mc, mc, total_size);
307 uci->mc = new_mc;
308 return 1;
309}
310
311static void apply_microcode(int cpu)
312{
313 unsigned long flags;
314 unsigned int val[2];
315 int cpu_num = raw_smp_processor_id();
316 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
317
318 /* We should bind the task to the CPU */
319 BUG_ON(cpu_num != cpu);
320
321 if (uci->mc == NULL)
322 return;
323
324 /* serialize access to the physical write to MSR 0x79 */
325 spin_lock_irqsave(&microcode_update_lock, flags);
326
327 /* write microcode via MSR 0x79 */
328 wrmsr(MSR_IA32_UCODE_WRITE,
329 (unsigned long) uci->mc->bits,
330 (unsigned long) uci->mc->bits >> 16 >> 16);
331 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
332
333 /* see notes above for revision 1.07. Apparent chip bug */
334 sync_core();
335
336 /* get the current revision from MSR 0x8B */
337 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
338
339 spin_unlock_irqrestore(&microcode_update_lock, flags);
340 if (val[1] != uci->mc->hdr.rev) {
341		printk(KERN_ERR "microcode: CPU%d update from revision "
342 "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
343 return;
344 }
345 pr_debug("microcode: CPU%d updated from revision "
346		"0x%x to 0x%x, date = %08x\n",
347 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
348 uci->rev = val[1];
349}
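The two value arguments passed to wrmsr() above are simply the low and high 32 bits of the linear address of the update data. A minimal sketch of the idiom (wrmsr_ptr is a hypothetical helper, not part of the driver):

/*
 * Illustration only: WRMSR takes its value as an edx:eax pair, so the
 * address is split into 32-bit halves. The ">> 16 >> 16" form used in
 * apply_microcode() yields 0 on 32-bit kernels, where the address is
 * only 32 bits wide, without relying on an undefined shift by 32.
 */
static inline void wrmsr_ptr(unsigned int msr, const void *ptr)
{
	unsigned long val = (unsigned long)ptr;

	wrmsr(msr, (unsigned int)val, (unsigned int)(val >> 16 >> 16));
}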
350
351#ifdef CONFIG_MICROCODE_OLD_INTERFACE
352static void __user *user_buffer; /* user area microcode data buffer */
353static unsigned int user_buffer_size;	/* its size */
354
355static long get_next_ucode(void **mc, long offset)
356{
357 microcode_header_t mc_header;
358 unsigned long total_size;
359
360 /* No more data */
361 if (offset >= user_buffer_size)
362 return 0;
363 if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
364		printk(KERN_ERR "microcode: error! Cannot read user data\n");
365 return -EFAULT;
366 }
367 total_size = get_totalsize(&mc_header);
368 if (offset + total_size > user_buffer_size) {
369 printk(KERN_ERR "microcode: error! Bad total size in microcode "
370 "data file\n");
371 return -EINVAL;
372 }
373 *mc = vmalloc(total_size);
374 if (!*mc)
375 return -ENOMEM;
376 if (copy_from_user(*mc, user_buffer + offset, total_size)) {
377		printk(KERN_ERR "microcode: error! Cannot read user data\n");
378 vfree(*mc);
379 return -EFAULT;
380 }
381 return offset + total_size;
382}
383
384static int do_microcode_update (void)
385{
386 long cursor = 0;
387 int error = 0;
388 void *new_mc = NULL;
389 int cpu;
390 cpumask_t old;
391
392 old = current->cpus_allowed;
393
394 while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
395 error = microcode_sanity_check(new_mc);
396 if (error)
397 goto out;
398 /*
399		 * It's possible the data file has multiple matching ucode,
400		 * so keep searching until the latest version is found
401 */
402 for_each_online_cpu(cpu) {
403 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
404
405 if (!uci->valid)
406 continue;
407 set_cpus_allowed(current, cpumask_of_cpu(cpu));
408 error = get_maching_microcode(new_mc, cpu);
409 if (error < 0)
410 goto out;
411 if (error == 1)
412 apply_microcode(cpu);
413 }
414 vfree(new_mc);
415 }
416out:
417 if (cursor > 0)
418 vfree(new_mc);
419 if (cursor < 0)
420 error = cursor;
421 set_cpus_allowed(current, old);
422 return error;
423}
424
425static int microcode_open (struct inode *unused1, struct file *unused2)
426{
427 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
428}
429
430static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
431{
432 ssize_t ret;
433
434 if ((len >> PAGE_SHIFT) > num_physpages) {
435 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
436 return -EINVAL;
437 }
438
439 lock_cpu_hotplug();
440 mutex_lock(&microcode_mutex);
441
442 user_buffer = (void __user *) buf;
443 user_buffer_size = (int) len;
444
445 ret = do_microcode_update();
446 if (!ret)
447 ret = (ssize_t)len;
448
449 mutex_unlock(&microcode_mutex);
450 unlock_cpu_hotplug();
451
452 return ret;
453}
454
455static const struct file_operations microcode_fops = {
456 .owner = THIS_MODULE,
457 .write = microcode_write,
458 .open = microcode_open,
459};
460
461static struct miscdevice microcode_dev = {
462 .minor = MICROCODE_MINOR,
463 .name = "microcode",
464 .fops = &microcode_fops,
465};
466
467static int __init microcode_dev_init (void)
468{
469 int error;
470
471 error = misc_register(&microcode_dev);
472 if (error) {
473 printk(KERN_ERR
474 "microcode: can't misc_register on minor=%d\n",
475 MICROCODE_MINOR);
476 return error;
477 }
478
479 return 0;
480}
481
482static void microcode_dev_exit (void)
483{
484 misc_deregister(&microcode_dev);
485}
486
487MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
488#else
489#define microcode_dev_init() 0
490#define microcode_dev_exit() do { } while(0)
491#endif
492
493static long get_next_ucode_from_buffer(void **mc, void *buf,
494 unsigned long size, long offset)
495{
496 microcode_header_t *mc_header;
497 unsigned long total_size;
498
499 /* No more data */
500 if (offset >= size)
501 return 0;
502 mc_header = (microcode_header_t *)(buf + offset);
503 total_size = get_totalsize(mc_header);
504
505 if (offset + total_size > size) {
506 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
507 return -EINVAL;
508 }
509
510 *mc = vmalloc(total_size);
511 if (!*mc) {
512		printk(KERN_ERR "microcode: error! Cannot allocate memory\n");
513 return -ENOMEM;
514 }
515 memcpy(*mc, buf + offset, total_size);
516 return offset + total_size;
517}
518
519/* fake device for request_firmware */
520static struct platform_device *microcode_pdev;
521
522static int cpu_request_microcode(int cpu)
523{
524 char name[30];
525 struct cpuinfo_x86 *c = cpu_data + cpu;
526 const struct firmware *firmware;
527 void *buf;
528 unsigned long size;
529 long offset = 0;
530 int error;
531 void *mc;
532
533 /* We should bind the task to the CPU */
534 BUG_ON(cpu != raw_smp_processor_id());
535 sprintf(name,"intel-ucode/%02x-%02x-%02x",
536 c->x86, c->x86_model, c->x86_mask);
537 error = request_firmware(&firmware, name, &microcode_pdev->dev);
538 if (error) {
539 pr_debug("ucode data file %s load failed\n", name);
540 return error;
541 }
542 buf = (void *)firmware->data;
543 size = firmware->size;
544 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
545 > 0) {
546 error = microcode_sanity_check(mc);
547 if (error)
548 break;
549 error = get_maching_microcode(mc, cpu);
550 if (error < 0)
551 break;
552 /*
553		 * It's possible the data file has multiple matching ucode,
554		 * so keep searching until the latest version is found
555 */
556 if (error == 1) {
557 apply_microcode(cpu);
558 error = 0;
559 }
560 vfree(mc);
561 }
562 if (offset > 0)
563 vfree(mc);
564 if (offset < 0)
565 error = offset;
566 release_firmware(firmware);
567
568 return error;
569}
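As an illustration of the firmware name built above (the numbers are invented for the example): a CPU reporting family 6, model 15, stepping 2 asks the firmware loader for "intel-ucode/06-0f-02".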
570
571static int apply_microcode_check_cpu(int cpu)
572{
573 struct cpuinfo_x86 *c = cpu_data + cpu;
574 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
575 cpumask_t old;
576 unsigned int val[2];
577 int err = 0;
578
579 /* Check if the microcode is available */
580 if (!uci->mc)
581 return 0;
582
583 old = current->cpus_allowed;
584 set_cpus_allowed(current, cpumask_of_cpu(cpu));
585
586 /* Check if the microcode we have in memory matches the CPU */
587 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
588 cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
589 err = -EINVAL;
590
591 if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
592 /* get processor flags from MSR 0x17 */
593 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
594 if (uci->pf != (1 << ((val[1] >> 18) & 7)))
595 err = -EINVAL;
596 }
597
598 if (!err) {
599 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
600 /* see notes above for revision 1.07. Apparent chip bug */
601 sync_core();
602 /* get the current revision from MSR 0x8B */
603 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
604 if (uci->rev != val[1])
605 err = -EINVAL;
606 }
607
608 if (!err)
609 apply_microcode(cpu);
610 else
611 printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
612 " sig=0x%x, pf=0x%x, rev=0x%x\n",
613 cpu, uci->sig, uci->pf, uci->rev);
614
615 set_cpus_allowed(current, old);
616 return err;
617}
618
619static void microcode_init_cpu(int cpu, int resume)
620{
621 cpumask_t old;
622 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
623
624 old = current->cpus_allowed;
625
626 set_cpus_allowed(current, cpumask_of_cpu(cpu));
627 mutex_lock(&microcode_mutex);
628 collect_cpu_info(cpu);
629 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
630 cpu_request_microcode(cpu);
631 mutex_unlock(&microcode_mutex);
632 set_cpus_allowed(current, old);
633}
634
635static void microcode_fini_cpu(int cpu)
636{
637 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
638
639 mutex_lock(&microcode_mutex);
640 uci->valid = 0;
641 vfree(uci->mc);
642 uci->mc = NULL;
643 mutex_unlock(&microcode_mutex);
644}
645
646static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
647{
648 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
649 char *end;
650 unsigned long val = simple_strtoul(buf, &end, 0);
651 int err = 0;
652 int cpu = dev->id;
653
654 if (end == buf)
655 return -EINVAL;
656 if (val == 1) {
657 cpumask_t old;
658
659 old = current->cpus_allowed;
660
661 lock_cpu_hotplug();
662 set_cpus_allowed(current, cpumask_of_cpu(cpu));
663
664 mutex_lock(&microcode_mutex);
665 if (uci->valid)
666 err = cpu_request_microcode(cpu);
667 mutex_unlock(&microcode_mutex);
668 unlock_cpu_hotplug();
669 set_cpus_allowed(current, old);
670 }
671 if (err)
672 return err;
673 return sz;
674}
675
676static ssize_t version_show(struct sys_device *dev, char *buf)
677{
678 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
679
680 return sprintf(buf, "0x%x\n", uci->rev);
681}
682
683static ssize_t pf_show(struct sys_device *dev, char *buf)
684{
685 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
686
687 return sprintf(buf, "0x%x\n", uci->pf);
688}
689
690static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
691static SYSDEV_ATTR(version, 0400, version_show, NULL);
692static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
693
694static struct attribute *mc_default_attrs[] = {
695 &attr_reload.attr,
696 &attr_version.attr,
697 &attr_processor_flags.attr,
698 NULL
699};
700
701static struct attribute_group mc_attr_group = {
702 .attrs = mc_default_attrs,
703 .name = "microcode",
704};
705
706static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
707{
708 int err, cpu = sys_dev->id;
709 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
710
711 if (!cpu_online(cpu))
712 return 0;
713
714	pr_debug("Microcode: CPU %d added\n", cpu);
715 memset(uci, 0, sizeof(*uci));
716
717 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
718 if (err)
719 return err;
720
721 microcode_init_cpu(cpu, resume);
722
723 return 0;
724}
725
726static int mc_sysdev_add(struct sys_device *sys_dev)
727{
728 return __mc_sysdev_add(sys_dev, 0);
729}
730
731static int mc_sysdev_remove(struct sys_device *sys_dev)
732{
733 int cpu = sys_dev->id;
734
735 if (!cpu_online(cpu))
736 return 0;
737
738	pr_debug("Microcode: CPU %d removed\n", cpu);
739 microcode_fini_cpu(cpu);
740 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
741 return 0;
742}
743
744static int mc_sysdev_resume(struct sys_device *dev)
745{
746 int cpu = dev->id;
747
748 if (!cpu_online(cpu))
749 return 0;
750	pr_debug("Microcode: CPU %d resumed\n", cpu);
751 /* only CPU 0 will apply ucode here */
752 apply_microcode(0);
753 return 0;
754}
755
756static struct sysdev_driver mc_sysdev_driver = {
757 .add = mc_sysdev_add,
758 .remove = mc_sysdev_remove,
759 .resume = mc_sysdev_resume,
760};
761
762static __cpuinit int
763mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
764{
765 unsigned int cpu = (unsigned long)hcpu;
766 struct sys_device *sys_dev;
767
768 sys_dev = get_cpu_sysdev(cpu);
769 switch (action) {
770 case CPU_UP_CANCELED_FROZEN:
771 /* The CPU refused to come up during a system resume */
772 microcode_fini_cpu(cpu);
773 break;
774 case CPU_ONLINE:
775 case CPU_DOWN_FAILED:
776 mc_sysdev_add(sys_dev);
777 break;
778 case CPU_ONLINE_FROZEN:
779 /* System-wide resume is in progress, try to apply microcode */
780 if (apply_microcode_check_cpu(cpu)) {
781 /* The application of microcode failed */
782 microcode_fini_cpu(cpu);
783 __mc_sysdev_add(sys_dev, 1);
784 break;
785 }
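		/* fall through: the sysfs group was removed on suspend, re-create it */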
786 case CPU_DOWN_FAILED_FROZEN:
787 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
788 printk(KERN_ERR "Microcode: Failed to create the sysfs "
789 "group for CPU%d\n", cpu);
790 break;
791 case CPU_DOWN_PREPARE:
792 mc_sysdev_remove(sys_dev);
793 break;
794 case CPU_DOWN_PREPARE_FROZEN:
795 /* Suspend is in progress, only remove the interface */
796 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
797 break;
798 }
799 return NOTIFY_OK;
800}
801
802static struct notifier_block __cpuinitdata mc_cpu_notifier = {
803 .notifier_call = mc_cpu_callback,
804};
805
806static int __init microcode_init (void)
807{
808 int error;
809
810 error = microcode_dev_init();
811 if (error)
812 return error;
813 microcode_pdev = platform_device_register_simple("microcode", -1,
814 NULL, 0);
815 if (IS_ERR(microcode_pdev)) {
816 microcode_dev_exit();
817 return PTR_ERR(microcode_pdev);
818 }
819
820 lock_cpu_hotplug();
821 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
822 unlock_cpu_hotplug();
823 if (error) {
824 microcode_dev_exit();
825 platform_device_unregister(microcode_pdev);
826 return error;
827 }
828
829 register_hotcpu_notifier(&mc_cpu_notifier);
830
831 printk(KERN_INFO
832 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
833 return 0;
834}
835
836static void __exit microcode_exit (void)
837{
838 microcode_dev_exit();
839
840 unregister_hotcpu_notifier(&mc_cpu_notifier);
841
842 lock_cpu_hotplug();
843 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
844 unlock_cpu_hotplug();
845
846 platform_device_unregister(microcode_pdev);
847}
848
849module_init(microcode_init)
850module_exit(microcode_exit)
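A rough user-space sketch of how the legacy CONFIG_MICROCODE_OLD_INTERFACE device above is driven: the whole update file, already in the binary Intel format checked by microcode_sanity_check(), is handed to the character device in one write(). The /dev/cpu/microcode path is the conventional node for MICROCODE_MINOR; the input file name here is made up.

/* Illustration only -- not part of this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("ucode.bin", "rb");	/* hypothetical input file */
	int fd = open("/dev/cpu/microcode", O_WRONLY);
	char *buf;
	long len;

	if (!f || fd < 0)
		return 1;
	fseek(f, 0, SEEK_END);
	len = ftell(f);
	rewind(f);
	buf = malloc(len);
	if (!buf || fread(buf, 1, len, f) != (size_t)len)
		return 1;
	if (write(fd, buf, len) != len)	/* one write, as microcode_write() expects */
		perror("microcode write");
	free(buf);
	close(fd);
	fclose(f);
	return 0;
}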
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
new file mode 100644
index 000000000000..3db0a5442eb1
--- /dev/null
+++ b/arch/x86/kernel/module_32.c
@@ -0,0 +1,152 @@
1/* Kernel module help for i386.
2 Copyright (C) 2001 Rusty Russell.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18#include <linux/moduleloader.h>
19#include <linux/elf.h>
20#include <linux/vmalloc.h>
21#include <linux/fs.h>
22#include <linux/string.h>
23#include <linux/kernel.h>
24#include <linux/bug.h>
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(fmt...)
30#endif
31
32void *module_alloc(unsigned long size)
33{
34 if (size == 0)
35 return NULL;
36 return vmalloc_exec(size);
37}
38
39
40/* Free memory returned from module_alloc */
41void module_free(struct module *mod, void *module_region)
42{
43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */
46}
47
48/* We don't need anything special. */
49int module_frob_arch_sections(Elf_Ehdr *hdr,
50 Elf_Shdr *sechdrs,
51 char *secstrings,
52 struct module *mod)
53{
54 return 0;
55}
56
57int apply_relocate(Elf32_Shdr *sechdrs,
58 const char *strtab,
59 unsigned int symindex,
60 unsigned int relsec,
61 struct module *me)
62{
63 unsigned int i;
64 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
65 Elf32_Sym *sym;
66 uint32_t *location;
67
68 DEBUGP("Applying relocate section %u to %u\n", relsec,
69 sechdrs[relsec].sh_info);
70 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
71 /* This is where to make the change */
72 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
73 + rel[i].r_offset;
74 /* This is the symbol it is referring to. Note that all
75 undefined symbols have been resolved. */
76 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
77 + ELF32_R_SYM(rel[i].r_info);
78
79 switch (ELF32_R_TYPE(rel[i].r_info)) {
80 case R_386_32:
81 /* We add the value into the location given */
82 *location += sym->st_value;
83 break;
84 case R_386_PC32:
85			/* Add the value, subtract its position */
86 *location += sym->st_value - (uint32_t)location;
87 break;
88 default:
89 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
90 me->name, ELF32_R_TYPE(rel[i].r_info));
91 return -ENOEXEC;
92 }
93 }
94 return 0;
95}
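For reference, the two i386 relocation forms handled above compute the following (S is the symbol value, A the addend already stored at the target, P the address being patched):

/*
 *   R_386_32   : *P = A + S        (absolute)
 *   R_386_PC32 : *P = A + S - P    (PC-relative, e.g. call/jmp targets)
 */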
96
97int apply_relocate_add(Elf32_Shdr *sechdrs,
98 const char *strtab,
99 unsigned int symindex,
100 unsigned int relsec,
101 struct module *me)
102{
103 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
104 me->name);
105 return -ENOEXEC;
106}
107
108int module_finalize(const Elf_Ehdr *hdr,
109 const Elf_Shdr *sechdrs,
110 struct module *me)
111{
112 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122			locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s;
125 }
126
127 if (alt) {
128 /* patch .altinstructions */
129 void *aseg = (void *)alt->sh_addr;
130 apply_alternatives(aseg, aseg + alt->sh_size);
131 }
132 if (locks && text) {
133 void *lseg = (void *)locks->sh_addr;
134 void *tseg = (void *)text->sh_addr;
135 alternatives_smp_module_add(me, me->name,
136 lseg, lseg + locks->sh_size,
137 tseg, tseg + text->sh_size);
138 }
139
140 if (para) {
141 void *pseg = (void *)para->sh_addr;
142 apply_paravirt(pseg, pseg + para->sh_size);
143 }
144
145 return module_bug_finalize(hdr, sechdrs, me);
146}
147
148void module_arch_cleanup(struct module *mod)
149{
150 alternatives_smp_module_del(mod);
151 module_bug_cleanup(mod);
152}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
new file mode 100644
index 000000000000..a888e67f5874
--- /dev/null
+++ b/arch/x86/kernel/module_64.c
@@ -0,0 +1,185 @@
1/* Kernel module help for x86-64
2 Copyright (C) 2001 Rusty Russell.
3 Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/
19#include <linux/moduleloader.h>
20#include <linux/elf.h>
21#include <linux/vmalloc.h>
22#include <linux/fs.h>
23#include <linux/string.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26#include <linux/bug.h>
27
28#include <asm/system.h>
29#include <asm/page.h>
30#include <asm/pgtable.h>
31
32#define DEBUGP(fmt...)
33
34#ifndef CONFIG_UML
35void module_free(struct module *mod, void *module_region)
36{
37 vfree(module_region);
38 /* FIXME: If module_region == mod->init_region, trim exception
39 table entries. */
40}
41
42void *module_alloc(unsigned long size)
43{
44 struct vm_struct *area;
45
46 if (!size)
47 return NULL;
48 size = PAGE_ALIGN(size);
49 if (size > MODULES_LEN)
50 return NULL;
51
52 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
53 if (!area)
54 return NULL;
55
56 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
57}
58#endif
59
60/* We don't need anything special. */
61int module_frob_arch_sections(Elf_Ehdr *hdr,
62 Elf_Shdr *sechdrs,
63 char *secstrings,
64 struct module *mod)
65{
66 return 0;
67}
68
69int apply_relocate_add(Elf64_Shdr *sechdrs,
70 const char *strtab,
71 unsigned int symindex,
72 unsigned int relsec,
73 struct module *me)
74{
75 unsigned int i;
76 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
77 Elf64_Sym *sym;
78 void *loc;
79 u64 val;
80
81 DEBUGP("Applying relocate section %u to %u\n", relsec,
82 sechdrs[relsec].sh_info);
83 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
84 /* This is where to make the change */
85 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
86 + rel[i].r_offset;
87
88 /* This is the symbol it is referring to. Note that all
89 undefined symbols have been resolved. */
90 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
91 + ELF64_R_SYM(rel[i].r_info);
92
93 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
94 (int)ELF64_R_TYPE(rel[i].r_info),
95 sym->st_value, rel[i].r_addend, (u64)loc);
96
97 val = sym->st_value + rel[i].r_addend;
98
99 switch (ELF64_R_TYPE(rel[i].r_info)) {
100 case R_X86_64_NONE:
101 break;
102 case R_X86_64_64:
103 *(u64 *)loc = val;
104 break;
105 case R_X86_64_32:
106 *(u32 *)loc = val;
107 if (val != *(u32 *)loc)
108 goto overflow;
109 break;
110 case R_X86_64_32S:
111 *(s32 *)loc = val;
112 if ((s64)val != *(s32 *)loc)
113 goto overflow;
114 break;
115 case R_X86_64_PC32:
116 val -= (u64)loc;
117 *(u32 *)loc = val;
118#if 0
119 if ((s64)val != *(s32 *)loc)
120 goto overflow;
121#endif
122 break;
123 default:
124 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
125 me->name, ELF64_R_TYPE(rel[i].r_info));
126 return -ENOEXEC;
127 }
128 }
129 return 0;
130
131overflow:
132 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
133 (int)ELF64_R_TYPE(rel[i].r_info), val);
134 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
135 me->name);
136 return -ENOEXEC;
137}
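For reference, the x86-64 relocation semantics assumed by the cases above (S = st_value, A = r_addend, P = loc):

/*
 *   R_X86_64_NONE : no-op
 *   R_X86_64_64   : word64  S + A
 *   R_X86_64_32   : word32  S + A      (result must zero-extend back)
 *   R_X86_64_32S  : word32  S + A      (result must sign-extend back)
 *   R_X86_64_PC32 : word32  S + A - P
 *
 * The 32/32S overflow checks are why modules have to be built with
 * -mcmodel=kernel: referenced symbols must stay within the +/-2GB range
 * that the truncated 32-bit fields can express.
 */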
138
139int apply_relocate(Elf_Shdr *sechdrs,
140 const char *strtab,
141 unsigned int symindex,
142 unsigned int relsec,
143 struct module *me)
144{
145	printk(KERN_ERR "module %s: non-add relocation not supported\n", me->name);
146 return -ENOSYS;
147}
148
149int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs,
151 struct module *me)
152{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
157 if (!strcmp(".text", secstrings + s->sh_name))
158 text = s;
159 if (!strcmp(".altinstructions", secstrings + s->sh_name))
160 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162			locks = s;
163 }
164
165 if (alt) {
166 /* patch .altinstructions */
167 void *aseg = (void *)alt->sh_addr;
168 apply_alternatives(aseg, aseg + alt->sh_size);
169 }
170 if (locks && text) {
171 void *lseg = (void *)locks->sh_addr;
172 void *tseg = (void *)text->sh_addr;
173 alternatives_smp_module_add(me, me->name,
174 lseg, lseg + locks->sh_size,
175 tseg, tseg + text->sh_size);
176 }
177
178 return module_bug_finalize(hdr, sechdrs, me);
179}
180
181void module_arch_cleanup(struct module *mod)
182{
183 alternatives_smp_module_del(mod);
184 module_bug_cleanup(mod);
185}
diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c
new file mode 100644
index 000000000000..13abb4ebfb79
--- /dev/null
+++ b/arch/x86/kernel/mpparse_32.c
@@ -0,0 +1,1132 @@
1/*
2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines.
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes
9 * Erich Boleyn : MP v1.4 and additional changes.
10 * Alan Cox : Added EBDA scanning
11 * Ingo Molnar : various cleanups and rewrites
12 * Maciej W. Rozycki: Bits for default MP configurations
13 * Paul Diefenbaugh: Added full ACPI support
14 */
15
16#include <linux/mm.h>
17#include <linux/init.h>
18#include <linux/acpi.h>
19#include <linux/delay.h>
20#include <linux/bootmem.h>
21#include <linux/kernel_stat.h>
22#include <linux/mc146818rtc.h>
23#include <linux/bitops.h>
24
25#include <asm/smp.h>
26#include <asm/acpi.h>
27#include <asm/mtrr.h>
28#include <asm/mpspec.h>
29#include <asm/io_apic.h>
30
31#include <mach_apic.h>
32#include <mach_apicdef.h>
33#include <mach_mpparse.h>
34#include <bios_ebda.h>
35
36/* Have we found an MP table */
37int smp_found_config;
38unsigned int __cpuinitdata maxcpus = NR_CPUS;
39
40/*
41 * Various Linux-internal data structures created from the
42 * MP-table.
43 */
44int apic_version [MAX_APICS];
45int mp_bus_id_to_type [MAX_MP_BUSSES];
46int mp_bus_id_to_node [MAX_MP_BUSSES];
47int mp_bus_id_to_local [MAX_MP_BUSSES];
48int quad_local_to_mp_bus_id [NR_CPUS/4][4];
49int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
50static int mp_current_pci_id;
51
52/* I/O APIC entries */
53struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
54
55/* # of MP IRQ source entries */
56struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
57
58/* MP IRQ source entries */
59int mp_irq_entries;
60
61int nr_ioapics;
62
63int pic_mode;
64unsigned long mp_lapic_addr;
65
66unsigned int def_to_bigsmp = 0;
67
68/* Processor that is doing the boot up */
69unsigned int boot_cpu_physical_apicid = -1U;
70/* Internal processor count */
71unsigned int __cpuinitdata num_processors;
72
73/* Bitmask of physically existing CPUs */
74physid_mask_t phys_cpu_present_map;
75
76u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
77
78/*
79 * Intel MP BIOS table parsing routines:
80 */
81
82
83/*
84 * Checksum an MP configuration block.
85 */
86
87static int __init mpf_checksum(unsigned char *mp, int len)
88{
89 int sum = 0;
90
91 while (len--)
92 sum += *mp++;
93
94 return sum & 0xFF;
95}
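The convention this relies on: every valid MP structure carries a checksum byte chosen so that all of its bytes sum to zero modulo 256, so any non-zero return here means corruption. A hypothetical builder-side helper, purely for illustration:

/*
 * Illustration only: pick the checksum byte for a structure whose
 * checksum field is still zero, so that mpf_checksum() over the whole
 * thing returns 0 afterwards.
 */
static unsigned char mpf_make_checksum(unsigned char *mp, int len)
{
	int sum = 0;

	while (len--)
		sum += *mp++;
	return (unsigned char)(0x100 - (sum & 0xFF));
}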
96
97/*
98 * Have to match translation table entries to main table entries by counter
99 * hence the mpc_record variable .... can't see a less disgusting way of
100 * doing this ....
101 */
102
103static int mpc_record;
104static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
105
106static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
107{
108 int ver, apicid;
109 physid_mask_t phys_cpu;
110
111 if (!(m->mpc_cpuflag & CPU_ENABLED))
112 return;
113
114 apicid = mpc_apic_id(m, translation_table[mpc_record]);
115
116 if (m->mpc_featureflag&(1<<0))
117 Dprintk(" Floating point unit present.\n");
118 if (m->mpc_featureflag&(1<<7))
119		Dprintk(" Machine Check Exception supported.\n");
120 if (m->mpc_featureflag&(1<<8))
121 Dprintk(" 64 bit compare & exchange supported.\n");
122 if (m->mpc_featureflag&(1<<9))
123 Dprintk(" Internal APIC present.\n");
124 if (m->mpc_featureflag&(1<<11))
125 Dprintk(" SEP present.\n");
126 if (m->mpc_featureflag&(1<<12))
127 Dprintk(" MTRR present.\n");
128 if (m->mpc_featureflag&(1<<13))
129 Dprintk(" PGE present.\n");
130 if (m->mpc_featureflag&(1<<14))
131 Dprintk(" MCA present.\n");
132 if (m->mpc_featureflag&(1<<15))
133 Dprintk(" CMOV present.\n");
134 if (m->mpc_featureflag&(1<<16))
135 Dprintk(" PAT present.\n");
136 if (m->mpc_featureflag&(1<<17))
137 Dprintk(" PSE present.\n");
138 if (m->mpc_featureflag&(1<<18))
139 Dprintk(" PSN present.\n");
140 if (m->mpc_featureflag&(1<<19))
141 Dprintk(" Cache Line Flush Instruction present.\n");
142 /* 20 Reserved */
143 if (m->mpc_featureflag&(1<<21))
144 Dprintk(" Debug Trace and EMON Store present.\n");
145 if (m->mpc_featureflag&(1<<22))
146 Dprintk(" ACPI Thermal Throttle Registers present.\n");
147 if (m->mpc_featureflag&(1<<23))
148 Dprintk(" MMX present.\n");
149 if (m->mpc_featureflag&(1<<24))
150 Dprintk(" FXSR present.\n");
151 if (m->mpc_featureflag&(1<<25))
152 Dprintk(" XMM present.\n");
153 if (m->mpc_featureflag&(1<<26))
154 Dprintk(" Willamette New Instructions present.\n");
155 if (m->mpc_featureflag&(1<<27))
156 Dprintk(" Self Snoop present.\n");
157 if (m->mpc_featureflag&(1<<28))
158 Dprintk(" HT present.\n");
159 if (m->mpc_featureflag&(1<<29))
160 Dprintk(" Thermal Monitor present.\n");
161 /* 30, 31 Reserved */
162
163
164 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
165 Dprintk(" Bootup CPU\n");
166 boot_cpu_physical_apicid = m->mpc_apicid;
167 }
168
169 ver = m->mpc_apicver;
170
171 /*
172 * Validate version
173 */
174 if (ver == 0x0) {
175 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
176 "fixing up to 0x10. (tell your hw vendor)\n",
177 m->mpc_apicid);
178 ver = 0x10;
179 }
180 apic_version[m->mpc_apicid] = ver;
181
182 phys_cpu = apicid_to_cpu_present(apicid);
183 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
184
185 if (num_processors >= NR_CPUS) {
186 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
187 " Processor ignored.\n", NR_CPUS);
188 return;
189 }
190
191 if (num_processors >= maxcpus) {
192 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
193 " Processor ignored.\n", maxcpus);
194 return;
195 }
196
197 cpu_set(num_processors, cpu_possible_map);
198 num_processors++;
199
200 /*
201 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
202 * but we need to work other dependencies like SMP_SUSPEND etc
203 * before this can be done without some confusion.
204 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
205 * - Ashok Raj <ashok.raj@intel.com>
206 */
207 if (num_processors > 8) {
208 switch (boot_cpu_data.x86_vendor) {
209 case X86_VENDOR_INTEL:
210 if (!APIC_XAPIC(ver)) {
211 def_to_bigsmp = 0;
212 break;
213 }
214 /* If P4 and above fall through */
215 case X86_VENDOR_AMD:
216 def_to_bigsmp = 1;
217 }
218 }
219 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
220}
221
222static void __init MP_bus_info (struct mpc_config_bus *m)
223{
224 char str[7];
225
226 memcpy(str, m->mpc_bustype, 6);
227 str[6] = 0;
228
229 mpc_oem_bus_info(m, str, translation_table[mpc_record]);
230
231#if MAX_MP_BUSSES < 256
232 if (m->mpc_busid >= MAX_MP_BUSSES) {
233 printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
234			"is too large, max. supported is %d\n",
235 m->mpc_busid, str, MAX_MP_BUSSES - 1);
236 return;
237 }
238#endif
239
240 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
241 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
242 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
243 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
244 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
245 mpc_oem_pci_bus(m, translation_table[mpc_record]);
246 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
247 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
248 mp_current_pci_id++;
249 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
250 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
251 } else {
252 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
253 }
254}
255
256static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
257{
258 if (!(m->mpc_flags & MPC_APIC_USABLE))
259 return;
260
261 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
262 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
263 if (nr_ioapics >= MAX_IO_APICS) {
264 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
265 MAX_IO_APICS, nr_ioapics);
266		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
267 }
268 if (!m->mpc_apicaddr) {
269 printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
270 " found in MP table, skipping!\n");
271 return;
272 }
273 mp_ioapics[nr_ioapics] = *m;
274 nr_ioapics++;
275}
276
277static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
278{
279 mp_irqs [mp_irq_entries] = *m;
280 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
281 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
282 m->mpc_irqtype, m->mpc_irqflag & 3,
283 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
284 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
285 if (++mp_irq_entries == MAX_IRQ_SOURCES)
286 panic("Max # of irq sources exceeded!!\n");
287}
288
289static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
290{
291 Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
292 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
293 m->mpc_irqtype, m->mpc_irqflag & 3,
294 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
295 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
296}
297
298#ifdef CONFIG_X86_NUMAQ
299static void __init MP_translation_info (struct mpc_config_translation *m)
300{
301 printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
302
303 if (mpc_record >= MAX_MPC_ENTRY)
304 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
305 else
306 translation_table[mpc_record] = m; /* stash this for later */
307 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
308 node_set_online(m->trans_quad);
309}
310
311/*
312 * Read/parse the MPC oem tables
313 */
314
315static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
316 unsigned short oemsize)
317{
318 int count = sizeof (*oemtable); /* the header size */
319 unsigned char *oemptr = ((unsigned char *)oemtable)+count;
320
321 mpc_record = 0;
322 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
323 if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
324 {
325 printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
326 oemtable->oem_signature[0],
327 oemtable->oem_signature[1],
328 oemtable->oem_signature[2],
329 oemtable->oem_signature[3]);
330 return;
331 }
332 if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
333 {
334 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
335 return;
336 }
337 while (count < oemtable->oem_length) {
338 switch (*oemptr) {
339 case MP_TRANSLATION:
340 {
341 struct mpc_config_translation *m=
342 (struct mpc_config_translation *)oemptr;
343 MP_translation_info(m);
344 oemptr += sizeof(*m);
345 count += sizeof(*m);
346 ++mpc_record;
347 break;
348 }
349 default:
350 {
351 printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
352 return;
353 }
354 }
355 }
356}
357
358static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
359 char *productid)
360{
361 if (strncmp(oem, "IBM NUMA", 8))
362		printk(KERN_WARNING "Warning! May not be a NUMA-Q system!\n");
363 if (mpc->mpc_oemptr)
364 smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
365 mpc->mpc_oemsize);
366}
367#endif /* CONFIG_X86_NUMAQ */
368
369/*
370 * Read/parse the MPC
371 */
372
373static int __init smp_read_mpc(struct mp_config_table *mpc)
374{
375 char str[16];
376 char oem[10];
377 int count=sizeof(*mpc);
378 unsigned char *mpt=((unsigned char *)mpc)+count;
379
380 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
381 printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
382 *(u32 *)mpc->mpc_signature);
383 return 0;
384 }
385 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
386 printk(KERN_ERR "SMP mptable: checksum error!\n");
387 return 0;
388 }
389 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
390 printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
391 mpc->mpc_spec);
392 return 0;
393 }
394 if (!mpc->mpc_lapic) {
395 printk(KERN_ERR "SMP mptable: null local APIC address!\n");
396 return 0;
397 }
398 memcpy(oem,mpc->mpc_oem,8);
399 oem[8]=0;
400 printk(KERN_INFO "OEM ID: %s ",oem);
401
402 memcpy(str,mpc->mpc_productid,12);
403 str[12]=0;
404 printk("Product ID: %s ",str);
405
406 mps_oem_check(mpc, oem, str);
407
408 printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
409
410 /*
411 * Save the local APIC address (it might be non-default) -- but only
412 * if we're not using ACPI.
413 */
414 if (!acpi_lapic)
415 mp_lapic_addr = mpc->mpc_lapic;
416
417 /*
418 * Now process the configuration blocks.
419 */
420 mpc_record = 0;
421 while (count < mpc->mpc_length) {
422 switch(*mpt) {
423 case MP_PROCESSOR:
424 {
425 struct mpc_config_processor *m=
426 (struct mpc_config_processor *)mpt;
427 /* ACPI may have already provided this data */
428 if (!acpi_lapic)
429 MP_processor_info(m);
430 mpt += sizeof(*m);
431 count += sizeof(*m);
432 break;
433 }
434 case MP_BUS:
435 {
436 struct mpc_config_bus *m=
437 (struct mpc_config_bus *)mpt;
438 MP_bus_info(m);
439 mpt += sizeof(*m);
440 count += sizeof(*m);
441 break;
442 }
443 case MP_IOAPIC:
444 {
445 struct mpc_config_ioapic *m=
446 (struct mpc_config_ioapic *)mpt;
447 MP_ioapic_info(m);
448 mpt+=sizeof(*m);
449 count+=sizeof(*m);
450 break;
451 }
452 case MP_INTSRC:
453 {
454 struct mpc_config_intsrc *m=
455 (struct mpc_config_intsrc *)mpt;
456
457 MP_intsrc_info(m);
458 mpt+=sizeof(*m);
459 count+=sizeof(*m);
460 break;
461 }
462 case MP_LINTSRC:
463 {
464 struct mpc_config_lintsrc *m=
465 (struct mpc_config_lintsrc *)mpt;
466 MP_lintsrc_info(m);
467 mpt+=sizeof(*m);
468 count+=sizeof(*m);
469 break;
470 }
471 default:
472 {
473 count = mpc->mpc_length;
474 break;
475 }
476 }
477 ++mpc_record;
478 }
479 setup_apic_routing();
480 if (!num_processors)
481 printk(KERN_ERR "SMP mptable: no processors registered!\n");
482 return num_processors;
483}
484
485static int __init ELCR_trigger(unsigned int irq)
486{
487 unsigned int port;
488
489 port = 0x4d0 + (irq >> 3);
490 return (inb(port) >> (irq & 7)) & 1;
491}
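A quick worked example of the port arithmetic above: the ELCR is a pair of I/O ports, 0x4d0 for IRQs 0-7 and 0x4d1 for IRQs 8-15, one bit per IRQ. So IRQ 10 maps to bit 2 of port 0x4d1, and a set bit means the line is level-triggered.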
492
493static void __init construct_default_ioirq_mptable(int mpc_default_type)
494{
495 struct mpc_config_intsrc intsrc;
496 int i;
497 int ELCR_fallback = 0;
498
499 intsrc.mpc_type = MP_INTSRC;
500 intsrc.mpc_irqflag = 0; /* conforming */
501 intsrc.mpc_srcbus = 0;
502 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
503
504 intsrc.mpc_irqtype = mp_INT;
505
506 /*
507 * If true, we have an ISA/PCI system with no IRQ entries
508 * in the MP table. To prevent the PCI interrupts from being set up
509 * incorrectly, we try to use the ELCR. The sanity check to see if
510 * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
511 * never be level sensitive, so we simply see if the ELCR agrees.
512 * If it does, we assume it's valid.
513 */
514 if (mpc_default_type == 5) {
515 printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
516
517 if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
518 printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
519 else {
520 printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
521 ELCR_fallback = 1;
522 }
523 }
524
525 for (i = 0; i < 16; i++) {
526 switch (mpc_default_type) {
527 case 2:
528 if (i == 0 || i == 13)
529 continue; /* IRQ0 & IRQ13 not connected */
530 /* fall through */
531 default:
532 if (i == 2)
533 continue; /* IRQ2 is never connected */
534 }
535
536 if (ELCR_fallback) {
537 /*
538 * If the ELCR indicates a level-sensitive interrupt, we
539 * copy that information over to the MP table in the
540 * irqflag field (level sensitive, active high polarity).
541 */
542 if (ELCR_trigger(i))
543 intsrc.mpc_irqflag = 13;
544 else
545 intsrc.mpc_irqflag = 0;
546 }
547
548 intsrc.mpc_srcbusirq = i;
549 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
550 MP_intsrc_info(&intsrc);
551 }
552
553 intsrc.mpc_irqtype = mp_ExtINT;
554 intsrc.mpc_srcbusirq = 0;
555 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
556 MP_intsrc_info(&intsrc);
557}
558
559static inline void __init construct_default_ISA_mptable(int mpc_default_type)
560{
561 struct mpc_config_processor processor;
562 struct mpc_config_bus bus;
563 struct mpc_config_ioapic ioapic;
564 struct mpc_config_lintsrc lintsrc;
565 int linttypes[2] = { mp_ExtINT, mp_NMI };
566 int i;
567
568 /*
569 * local APIC has default address
570 */
571 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
572
573 /*
574 * 2 CPUs, numbered 0 & 1.
575 */
576 processor.mpc_type = MP_PROCESSOR;
577 /* Either an integrated APIC or a discrete 82489DX. */
578 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
579 processor.mpc_cpuflag = CPU_ENABLED;
580 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
581 (boot_cpu_data.x86_model << 4) |
582 boot_cpu_data.x86_mask;
583 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
584 processor.mpc_reserved[0] = 0;
585 processor.mpc_reserved[1] = 0;
586 for (i = 0; i < 2; i++) {
587 processor.mpc_apicid = i;
588 MP_processor_info(&processor);
589 }
590
591 bus.mpc_type = MP_BUS;
592 bus.mpc_busid = 0;
593 switch (mpc_default_type) {
594 default:
595 printk("???\n");
596 printk(KERN_ERR "Unknown standard configuration %d\n",
597 mpc_default_type);
598 /* fall through */
599 case 1:
600 case 5:
601 memcpy(bus.mpc_bustype, "ISA ", 6);
602 break;
603 case 2:
604 case 6:
605 case 3:
606 memcpy(bus.mpc_bustype, "EISA ", 6);
607 break;
608 case 4:
609 case 7:
610 memcpy(bus.mpc_bustype, "MCA ", 6);
611 }
612 MP_bus_info(&bus);
613 if (mpc_default_type > 4) {
614 bus.mpc_busid = 1;
615 memcpy(bus.mpc_bustype, "PCI ", 6);
616 MP_bus_info(&bus);
617 }
618
619 ioapic.mpc_type = MP_IOAPIC;
620 ioapic.mpc_apicid = 2;
621 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
622 ioapic.mpc_flags = MPC_APIC_USABLE;
623 ioapic.mpc_apicaddr = 0xFEC00000;
624 MP_ioapic_info(&ioapic);
625
626 /*
627 * We set up most of the low 16 IO-APIC pins according to MPS rules.
628 */
629 construct_default_ioirq_mptable(mpc_default_type);
630
631 lintsrc.mpc_type = MP_LINTSRC;
632 lintsrc.mpc_irqflag = 0; /* conforming */
633 lintsrc.mpc_srcbusid = 0;
634 lintsrc.mpc_srcbusirq = 0;
635 lintsrc.mpc_destapic = MP_APIC_ALL;
636 for (i = 0; i < 2; i++) {
637 lintsrc.mpc_irqtype = linttypes[i];
638 lintsrc.mpc_destapiclint = i;
639 MP_lintsrc_info(&lintsrc);
640 }
641}
642
643static struct intel_mp_floating *mpf_found;
644
645/*
646 * Scan the memory blocks for an SMP configuration block.
647 */
648void __init get_smp_config (void)
649{
650 struct intel_mp_floating *mpf = mpf_found;
651
652 /*
653 * ACPI supports both logical (e.g. Hyper-Threading) and physical
654 * processors, where MPS only supports physical.
655 */
656 if (acpi_lapic && acpi_ioapic) {
657 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
658 return;
659 }
660 else if (acpi_lapic)
661 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
662
663 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
664 if (mpf->mpf_feature2 & (1<<7)) {
665 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
666 pic_mode = 1;
667 } else {
668 printk(KERN_INFO " Virtual Wire compatibility mode.\n");
669 pic_mode = 0;
670 }
671
672 /*
673 * Now see if we need to read further.
674 */
675 if (mpf->mpf_feature1 != 0) {
676
677 printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
678 construct_default_ISA_mptable(mpf->mpf_feature1);
679
680 } else if (mpf->mpf_physptr) {
681
682 /*
683 * Read the physical hardware table. Anything here will
684 * override the defaults.
685 */
686 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
687 smp_found_config = 0;
688 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
689 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
690 return;
691 }
692 /*
693 * If there are no explicit MP IRQ entries, then we are
694 * broken. We set up most of the low 16 IO-APIC pins to
695 * ISA defaults and hope it will work.
696 */
697 if (!mp_irq_entries) {
698 struct mpc_config_bus bus;
699
700 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
701
702 bus.mpc_type = MP_BUS;
703 bus.mpc_busid = 0;
704 memcpy(bus.mpc_bustype, "ISA ", 6);
705 MP_bus_info(&bus);
706
707 construct_default_ioirq_mptable(0);
708 }
709
710 } else
711 BUG();
712
713 printk(KERN_INFO "Processors: %d\n", num_processors);
714 /*
715 * Only use the first configuration found.
716 */
717}
718
719static int __init smp_scan_config (unsigned long base, unsigned long length)
720{
721 unsigned long *bp = phys_to_virt(base);
722 struct intel_mp_floating *mpf;
723
724	Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
725 if (sizeof(*mpf) != 16)
726		printk(KERN_ERR "Error: MPF size\n");
727
728 while (length > 0) {
729 mpf = (struct intel_mp_floating *)bp;
730 if ((*bp == SMP_MAGIC_IDENT) &&
731 (mpf->mpf_length == 1) &&
732 !mpf_checksum((unsigned char *)bp, 16) &&
733 ((mpf->mpf_specification == 1)
734 || (mpf->mpf_specification == 4)) ) {
735
736 smp_found_config = 1;
737 printk(KERN_INFO "found SMP MP-table at %08lx\n",
738 virt_to_phys(mpf));
739 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
740 if (mpf->mpf_physptr) {
741 /*
742				 * We cannot access the MPC table to compute
743				 * its size yet, as only the first few
744				 * megabytes of memory are mapped at this point.
745				 * The PC-9800 places its MPC table at the very
746				 * end of physical memory, so blindly reserving
747				 * PAGE_SIZE from mpf->mpf_physptr would trigger
748				 * a BUG() in reserve_bootmem.
749 */
750 unsigned long size = PAGE_SIZE;
751 unsigned long end = max_low_pfn * PAGE_SIZE;
752 if (mpf->mpf_physptr + size > end)
753 size = end - mpf->mpf_physptr;
754 reserve_bootmem(mpf->mpf_physptr, size);
755 }
756
757 mpf_found = mpf;
758 return 1;
759 }
760 bp += 4;
761 length -= 16;
762 }
763 return 0;
764}
765
766void __init find_smp_config (void)
767{
768 unsigned int address;
769
770 /*
771 * FIXME: Linux assumes you have 640K of base ram..
772 * this continues the error...
773 *
774 * 1) Scan the bottom 1K for a signature
775 * 2) Scan the top 1K of base RAM
776 * 3) Scan the 64K of bios
777 */
778 if (smp_scan_config(0x0,0x400) ||
779 smp_scan_config(639*0x400,0x400) ||
780 smp_scan_config(0xF0000,0x10000))
781 return;
782 /*
783 * If it is an SMP machine we should know now, unless the
784 * configuration is in an EISA/MCA bus machine with an
785 * extended bios data area.
786 *
787	 * There is a real-mode segmented pointer pointing to the
788	 * 4K EBDA area at 0x40E; calculate and scan it here.
789 *
790 * NOTE! There are Linux loaders that will corrupt the EBDA
791 * area, and as such this kind of SMP config may be less
792 * trustworthy, simply because the SMP table may have been
793 * stomped on during early boot. These loaders are buggy and
794 * should be fixed.
795 *
796 * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
797 */
798
799 address = get_bios_ebda();
800 if (address)
801 smp_scan_config(address, 0x400);
802}
803
804int es7000_plat;
805
806/* --------------------------------------------------------------------------
807 ACPI-based MP Configuration
808 -------------------------------------------------------------------------- */
809
810#ifdef CONFIG_ACPI
811
812void __init mp_register_lapic_address(u64 address)
813{
814 mp_lapic_addr = (unsigned long) address;
815
816 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
817
818 if (boot_cpu_physical_apicid == -1U)
819 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
820
821 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
822}
823
824void __cpuinit mp_register_lapic (u8 id, u8 enabled)
825{
826 struct mpc_config_processor processor;
827 int boot_cpu = 0;
828
829 if (MAX_APICS - id <= 0) {
830 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
831 id, MAX_APICS);
832 return;
833 }
834
835 if (id == boot_cpu_physical_apicid)
836 boot_cpu = 1;
837
838 processor.mpc_type = MP_PROCESSOR;
839 processor.mpc_apicid = id;
840 processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
841 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
842 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
843 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
844 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
845 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
846 processor.mpc_reserved[0] = 0;
847 processor.mpc_reserved[1] = 0;
848
849 MP_processor_info(&processor);
850}
851
852#ifdef CONFIG_X86_IO_APIC
853
854#define MP_ISA_BUS 0
855#define MP_MAX_IOAPIC_PIN 127
856
857static struct mp_ioapic_routing {
858 int apic_id;
859 int gsi_base;
860 int gsi_end;
861 u32 pin_programmed[4];
862} mp_ioapic_routing[MAX_IO_APICS];
863
864static int mp_find_ioapic (int gsi)
865{
866 int i = 0;
867
868 /* Find the IOAPIC that manages this GSI. */
869 for (i = 0; i < nr_ioapics; i++) {
870 if ((gsi >= mp_ioapic_routing[i].gsi_base)
871 && (gsi <= mp_ioapic_routing[i].gsi_end))
872 return i;
873 }
874
875 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
876
877 return -1;
878}
879
880void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
881{
882 int idx = 0;
883 int tmpid;
884
885 if (nr_ioapics >= MAX_IO_APICS) {
886 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
887 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
888 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
889 }
890 if (!address) {
891 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
892 " found in MADT table, skipping!\n");
893 return;
894 }
895
896 idx = nr_ioapics++;
897
898 mp_ioapics[idx].mpc_type = MP_IOAPIC;
899 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
900 mp_ioapics[idx].mpc_apicaddr = address;
901
902 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
903 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
904 && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
905 tmpid = io_apic_get_unique_id(idx, id);
906 else
907 tmpid = id;
908 if (tmpid == -1) {
909 nr_ioapics--;
910 return;
911 }
912 mp_ioapics[idx].mpc_apicid = tmpid;
913 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
914
915 /*
916 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
917 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
918 */
919 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
920 mp_ioapic_routing[idx].gsi_base = gsi_base;
921 mp_ioapic_routing[idx].gsi_end = gsi_base +
922 io_apic_get_redir_entries(idx);
923
924 printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
925 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
926 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
927 mp_ioapic_routing[idx].gsi_base,
928 mp_ioapic_routing[idx].gsi_end);
929}
930
931void __init
932mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
933{
934 struct mpc_config_intsrc intsrc;
935 int ioapic = -1;
936 int pin = -1;
937
938 /*
939 * Convert 'gsi' to 'ioapic.pin'.
940 */
941 ioapic = mp_find_ioapic(gsi);
942 if (ioapic < 0)
943 return;
944 pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
945
946 /*
947 * TBD: This check is for faulty timer entries, where the override
948 * erroneously sets the trigger to level, resulting in a HUGE
949 * increase of timer interrupts!
950 */
951 if ((bus_irq == 0) && (trigger == 3))
952 trigger = 1;
953
954 intsrc.mpc_type = MP_INTSRC;
955 intsrc.mpc_irqtype = mp_INT;
956 intsrc.mpc_irqflag = (trigger << 2) | polarity;
957 intsrc.mpc_srcbus = MP_ISA_BUS;
958 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
959 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
960 intsrc.mpc_dstirq = pin; /* INTIN# */
961
962 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
963 intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
964 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
965 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
966
967 mp_irqs[mp_irq_entries] = intsrc;
968 if (++mp_irq_entries == MAX_IRQ_SOURCES)
969 panic("Max # of irq sources exceeded!\n");
970}
971
972void __init mp_config_acpi_legacy_irqs (void)
973{
974 struct mpc_config_intsrc intsrc;
975 int i = 0;
976 int ioapic = -1;
977
978 /*
979 * Fabricate the legacy ISA bus (bus #31).
980 */
981 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
982 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
983
984 /*
985 * Older generations of ES7000 have no legacy identity mappings
986 */
987 if (es7000_plat == 1)
988 return;
989
990 /*
991 * Locate the IOAPIC that manages the ISA IRQs (0-15).
992 */
993 ioapic = mp_find_ioapic(0);
994 if (ioapic < 0)
995 return;
996
997 intsrc.mpc_type = MP_INTSRC;
998 intsrc.mpc_irqflag = 0; /* Conforming */
999 intsrc.mpc_srcbus = MP_ISA_BUS;
1000 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
1001
1002 /*
1003	 * Use the default configuration for IRQs 0-15, unless
1004	 * overridden by (MADT) interrupt source override entries.
1005 */
1006 for (i = 0; i < 16; i++) {
1007 int idx;
1008
1009 for (idx = 0; idx < mp_irq_entries; idx++) {
1010 struct mpc_config_intsrc *irq = mp_irqs + idx;
1011
1012 /* Do we already have a mapping for this ISA IRQ? */
1013 if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
1014 break;
1015
1016 /* Do we already have a mapping for this IOAPIC pin */
1017 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
1018 (irq->mpc_dstirq == i))
1019 break;
1020 }
1021
1022 if (idx != mp_irq_entries) {
1023 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
1024 continue; /* IRQ already used */
1025 }
1026
1027 intsrc.mpc_irqtype = mp_INT;
1028 intsrc.mpc_srcbusirq = i; /* Identity mapped */
1029 intsrc.mpc_dstirq = i;
1030
1031 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
1032 "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
1033 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
1034 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
1035 intsrc.mpc_dstirq);
1036
1037 mp_irqs[mp_irq_entries] = intsrc;
1038 if (++mp_irq_entries == MAX_IRQ_SOURCES)
1039 panic("Max # of irq sources exceeded!\n");
1040 }
1041}
1042
1043#define MAX_GSI_NUM 4096
1044
1045int mp_register_gsi(u32 gsi, int triggering, int polarity)
1046{
1047 int ioapic = -1;
1048 int ioapic_pin = 0;
1049 int idx, bit = 0;
1050 static int pci_irq = 16;
1051 /*
1052	 * Mapping between Global System Interrupts, which
1053 * represent all possible interrupts, and IRQs
1054 * assigned to actual devices.
1055 */
1056 static int gsi_to_irq[MAX_GSI_NUM];
1057
1058 /* Don't set up the ACPI SCI because it's already set up */
1059 if (acpi_gbl_FADT.sci_interrupt == gsi)
1060 return gsi;
1061
1062 ioapic = mp_find_ioapic(gsi);
1063 if (ioapic < 0) {
1064 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
1065 return gsi;
1066 }
1067
1068 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
1069
1070 if (ioapic_renumber_irq)
1071 gsi = ioapic_renumber_irq(ioapic, gsi);
1072
1073 /*
1074 * Avoid pin reprogramming. PRTs typically include entries
1075 * with redundant pin->gsi mappings (but unique PCI devices);
1076 * we only program the IOAPIC on the first.
1077 */
1078 bit = ioapic_pin % 32;
1079 idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
1080 if (idx > 3) {
1081 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1082 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
1083 ioapic_pin);
1084 return gsi;
1085 }
1086 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
1087 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
1088 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1089 return gsi_to_irq[gsi];
1090 }
1091
1092 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
1093
1094 if (triggering == ACPI_LEVEL_SENSITIVE) {
1095 /*
1096 * For PCI devices assign IRQs in order, avoiding gaps
1097 * due to unused I/O APIC pins.
1098 */
1099 int irq = gsi;
1100 if (gsi < MAX_GSI_NUM) {
1101 /*
1102 * Retain the VIA chipset work-around (gsi > 15), but
1103 * avoid a problem where the 8254 timer (IRQ0) is setup
1104 * via an override (so it's not on pin 0 of the ioapic),
1105 * and at the same time, the pin 0 interrupt is a PCI
1106 * type. The gsi > 15 test could cause these two pins
1107 * to be shared as IRQ0, and they are not shareable.
1108 * So test for this condition, and if necessary, avoid
1109 * the pin collision.
1110 */
1111 if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
1112 gsi = pci_irq++;
1113 /*
1114 * Don't assign IRQ used by ACPI SCI
1115 */
1116 if (gsi == acpi_gbl_FADT.sci_interrupt)
1117 gsi = pci_irq++;
1118 gsi_to_irq[irq] = gsi;
1119 } else {
1120 printk(KERN_ERR "GSI %u is too high\n", gsi);
1121 return gsi;
1122 }
1123 }
1124
1125 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1126 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1127 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1128 return gsi;
1129}
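The pin_programmed bookkeeping used above packs one bit per I/O APIC pin into four 32-bit words. A hypothetical helper showing the same indexing (illustration only, not part of this file):

/* word index = pin / 32, bit = pin % 32, matching the idx/bit math in
 * mp_register_gsi(). */
static inline int ioapic_pin_seen(struct mp_ioapic_routing *rt, int pin)
{
	return (rt->pin_programmed[pin / 32] >> (pin % 32)) & 1;
}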
1130
1131#endif /* CONFIG_X86_IO_APIC */
1132#endif /* CONFIG_ACPI */
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c
new file mode 100644
index 000000000000..8bf0ca03ac8e
--- /dev/null
+++ b/arch/x86/kernel/mpparse_64.c
@@ -0,0 +1,852 @@
1/*
2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines.
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes
9 * Erich Boleyn : MP v1.4 and additional changes.
10 * Alan Cox : Added EBDA scanning
11 * Ingo Molnar : various cleanups and rewrites
12 * Maciej W. Rozycki: Bits for default MP configurations
13 * Paul Diefenbaugh: Added full ACPI support
14 */
15
16#include <linux/mm.h>
17#include <linux/init.h>
18#include <linux/delay.h>
19#include <linux/bootmem.h>
20#include <linux/kernel_stat.h>
21#include <linux/mc146818rtc.h>
22#include <linux/acpi.h>
23#include <linux/module.h>
24
25#include <asm/smp.h>
26#include <asm/mtrr.h>
27#include <asm/mpspec.h>
28#include <asm/pgalloc.h>
29#include <asm/io_apic.h>
30#include <asm/proto.h>
31#include <asm/acpi.h>
32
33/* Have we found an MP table */
34int smp_found_config;
35
36/*
37 * Various Linux-internal data structures created from the
38 * MP-table.
39 */
40DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
41int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
42
43static int mp_current_pci_id = 0;
44/* I/O APIC entries */
45struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
46
47/* # of MP IRQ source entries */
48struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
49
50/* MP IRQ source entries */
51int mp_irq_entries;
52
53int nr_ioapics;
54unsigned long mp_lapic_addr = 0;
55
56
57
58/* Processor that is doing the boot up */
59unsigned int boot_cpu_id = -1U;
60/* Internal processor count */
61unsigned int num_processors __cpuinitdata = 0;
62
63unsigned disabled_cpus __cpuinitdata;
64
65/* Bitmask of physically existing CPUs */
66physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
67
68u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
69
70
71/*
72 * Intel MP BIOS table parsing routines:
73 */
74
75/*
76 * Checksum an MP configuration block.
77 */
78
79static int __init mpf_checksum(unsigned char *mp, int len)
80{
81 int sum = 0;
82
83 while (len--)
84 sum += *mp++;
85
86 return sum & 0xFF;
87}
88
89static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
90{
91 int cpu;
92 cpumask_t tmp_map;
93 char *bootup_cpu = "";
94
95 if (!(m->mpc_cpuflag & CPU_ENABLED)) {
96 disabled_cpus++;
97 return;
98 }
99 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
100 bootup_cpu = " (Bootup-CPU)";
101 boot_cpu_id = m->mpc_apicid;
102 }
103
104 printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
105
106 if (num_processors >= NR_CPUS) {
107 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
108 " Processor ignored.\n", NR_CPUS);
109 return;
110 }
111
112 num_processors++;
113 cpus_complement(tmp_map, cpu_present_map);
114 cpu = first_cpu(tmp_map);
115
116 physid_set(m->mpc_apicid, phys_cpu_present_map);
117 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
118 /*
119 * bios_cpu_apicid is required to have processors listed
120		 * in the same order as logical cpu numbers. Hence the first
121		 * entry is the BSP, and so on.
122 */
123 cpu = 0;
124 }
125 bios_cpu_apicid[cpu] = m->mpc_apicid;
126 x86_cpu_to_apicid[cpu] = m->mpc_apicid;
127
128 cpu_set(cpu, cpu_possible_map);
129 cpu_set(cpu, cpu_present_map);
130}
131
132static void __init MP_bus_info (struct mpc_config_bus *m)
133{
134 char str[7];
135
136 memcpy(str, m->mpc_bustype, 6);
137 str[6] = 0;
138 Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
139
140 if (strncmp(str, "ISA", 3) == 0) {
141 set_bit(m->mpc_busid, mp_bus_not_pci);
142 } else if (strncmp(str, "PCI", 3) == 0) {
143 clear_bit(m->mpc_busid, mp_bus_not_pci);
144 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
145 mp_current_pci_id++;
146 } else {
147 printk(KERN_ERR "Unknown bustype %s\n", str);
148 }
149}
150
151static int bad_ioapic(unsigned long address)
152{
153 if (nr_ioapics >= MAX_IO_APICS) {
154 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
155 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
156 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
157 }
158 if (!address) {
159 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
160 " found in table, skipping!\n");
161 return 1;
162 }
163 return 0;
164}
165
166static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
167{
168 if (!(m->mpc_flags & MPC_APIC_USABLE))
169 return;
170
171 printk("I/O APIC #%d at 0x%X.\n",
172 m->mpc_apicid, m->mpc_apicaddr);
173
174 if (bad_ioapic(m->mpc_apicaddr))
175 return;
176
177 mp_ioapics[nr_ioapics] = *m;
178 nr_ioapics++;
179}
180
181static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
182{
183 mp_irqs [mp_irq_entries] = *m;
184 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
185 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
186 m->mpc_irqtype, m->mpc_irqflag & 3,
187 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
188 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
189 if (++mp_irq_entries >= MAX_IRQ_SOURCES)
190 panic("Max # of irq sources exceeded!!\n");
191}
192
193static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
194{
195 Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
196 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
197 m->mpc_irqtype, m->mpc_irqflag & 3,
198 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
199 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
200}
201
202/*
203 * Read/parse the MPC
204 */
205
206static int __init smp_read_mpc(struct mp_config_table *mpc)
207{
208 char str[16];
209 int count=sizeof(*mpc);
210 unsigned char *mpt=((unsigned char *)mpc)+count;
211
212 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
213 printk("MPTABLE: bad signature [%c%c%c%c]!\n",
214 mpc->mpc_signature[0],
215 mpc->mpc_signature[1],
216 mpc->mpc_signature[2],
217 mpc->mpc_signature[3]);
218 return 0;
219 }
220 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
221 printk("MPTABLE: checksum error!\n");
222 return 0;
223 }
224 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
225 printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
226 mpc->mpc_spec);
227 return 0;
228 }
229 if (!mpc->mpc_lapic) {
230 printk(KERN_ERR "MPTABLE: null local APIC address!\n");
231 return 0;
232 }
233 memcpy(str,mpc->mpc_oem,8);
234 str[8] = 0;
235 printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
236
237 memcpy(str,mpc->mpc_productid,12);
238 str[12] = 0;
239 printk("MPTABLE: Product ID: %s ",str);
240
241 printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
242
243 /* save the local APIC address, it might be non-default */
244 if (!acpi_lapic)
245 mp_lapic_addr = mpc->mpc_lapic;
246
247 /*
248 * Now process the configuration blocks.
249 */
250 while (count < mpc->mpc_length) {
251 switch(*mpt) {
252 case MP_PROCESSOR:
253 {
254 struct mpc_config_processor *m=
255 (struct mpc_config_processor *)mpt;
256 if (!acpi_lapic)
257 MP_processor_info(m);
258 mpt += sizeof(*m);
259 count += sizeof(*m);
260 break;
261 }
262 case MP_BUS:
263 {
264 struct mpc_config_bus *m=
265 (struct mpc_config_bus *)mpt;
266 MP_bus_info(m);
267 mpt += sizeof(*m);
268 count += sizeof(*m);
269 break;
270 }
271 case MP_IOAPIC:
272 {
273 struct mpc_config_ioapic *m=
274 (struct mpc_config_ioapic *)mpt;
275 MP_ioapic_info(m);
276 mpt += sizeof(*m);
277 count += sizeof(*m);
278 break;
279 }
280 case MP_INTSRC:
281 {
282 struct mpc_config_intsrc *m=
283 (struct mpc_config_intsrc *)mpt;
284
285 MP_intsrc_info(m);
286 mpt += sizeof(*m);
287 count += sizeof(*m);
288 break;
289 }
290 case MP_LINTSRC:
291 {
292 struct mpc_config_lintsrc *m=
293 (struct mpc_config_lintsrc *)mpt;
294 MP_lintsrc_info(m);
295 mpt += sizeof(*m);
296 count += sizeof(*m);
297 break;
298 }
299 }
300 }
301 setup_apic_routing();
302 if (!num_processors)
303 printk(KERN_ERR "MPTABLE: no processors registered!\n");
304 return num_processors;
305}
306
307static int __init ELCR_trigger(unsigned int irq)
308{
309 unsigned int port;
310
311 port = 0x4d0 + (irq >> 3);
312 return (inb(port) >> (irq & 7)) & 1;
313}
314
315static void __init construct_default_ioirq_mptable(int mpc_default_type)
316{
317 struct mpc_config_intsrc intsrc;
318 int i;
319 int ELCR_fallback = 0;
320
321 intsrc.mpc_type = MP_INTSRC;
322 intsrc.mpc_irqflag = 0; /* conforming */
323 intsrc.mpc_srcbus = 0;
324 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
325
326 intsrc.mpc_irqtype = mp_INT;
327
328 /*
329 * If true, we have an ISA/PCI system with no IRQ entries
330 * in the MP table. To prevent the PCI interrupts from being set up
331 * incorrectly, we try to use the ELCR. The sanity check to see if
332 * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
333 * never be level sensitive, so we simply see if the ELCR agrees.
334 * If it does, we assume it's valid.
335 */
336 if (mpc_default_type == 5) {
337 printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
338
339 if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
340 printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
341 else {
342 printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
343 ELCR_fallback = 1;
344 }
345 }
346
347 for (i = 0; i < 16; i++) {
348 switch (mpc_default_type) {
349 case 2:
350 if (i == 0 || i == 13)
351 continue; /* IRQ0 & IRQ13 not connected */
352 /* fall through */
353 default:
354 if (i == 2)
355 continue; /* IRQ2 is never connected */
356 }
357
358 if (ELCR_fallback) {
359 /*
360 * If the ELCR indicates a level-sensitive interrupt, we
361 * copy that information over to the MP table in the
362 * irqflag field (level sensitive, active high polarity).
363 */
364 if (ELCR_trigger(i))
365 intsrc.mpc_irqflag = 13;
366 else
367 intsrc.mpc_irqflag = 0;
368 }
369
370 intsrc.mpc_srcbusirq = i;
371 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
372 MP_intsrc_info(&intsrc);
373 }
374
375 intsrc.mpc_irqtype = mp_ExtINT;
376 intsrc.mpc_srcbusirq = 0;
377 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
378 MP_intsrc_info(&intsrc);
379}
380
381static inline void __init construct_default_ISA_mptable(int mpc_default_type)
382{
383 struct mpc_config_processor processor;
384 struct mpc_config_bus bus;
385 struct mpc_config_ioapic ioapic;
386 struct mpc_config_lintsrc lintsrc;
387 int linttypes[2] = { mp_ExtINT, mp_NMI };
388 int i;
389
390 /*
391 * local APIC has default address
392 */
393 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
394
395 /*
396 * 2 CPUs, numbered 0 & 1.
397 */
398 processor.mpc_type = MP_PROCESSOR;
399 processor.mpc_apicver = 0;
400 processor.mpc_cpuflag = CPU_ENABLED;
401 processor.mpc_cpufeature = 0;
402 processor.mpc_featureflag = 0;
403 processor.mpc_reserved[0] = 0;
404 processor.mpc_reserved[1] = 0;
405 for (i = 0; i < 2; i++) {
406 processor.mpc_apicid = i;
407 MP_processor_info(&processor);
408 }
409
410 bus.mpc_type = MP_BUS;
411 bus.mpc_busid = 0;
412 switch (mpc_default_type) {
413 default:
414 printk(KERN_ERR "???\nUnknown standard configuration %d\n",
415 mpc_default_type);
416 /* fall through */
417 case 1:
418 case 5:
419 memcpy(bus.mpc_bustype, "ISA ", 6);
420 break;
421 }
422 MP_bus_info(&bus);
423 if (mpc_default_type > 4) {
424 bus.mpc_busid = 1;
425 memcpy(bus.mpc_bustype, "PCI ", 6);
426 MP_bus_info(&bus);
427 }
428
429 ioapic.mpc_type = MP_IOAPIC;
430 ioapic.mpc_apicid = 2;
431 ioapic.mpc_apicver = 0;
432 ioapic.mpc_flags = MPC_APIC_USABLE;
433 ioapic.mpc_apicaddr = 0xFEC00000;
434 MP_ioapic_info(&ioapic);
435
436 /*
437 * We set up most of the low 16 IO-APIC pins according to MPS rules.
438 */
439 construct_default_ioirq_mptable(mpc_default_type);
440
441 lintsrc.mpc_type = MP_LINTSRC;
442 lintsrc.mpc_irqflag = 0; /* conforming */
443 lintsrc.mpc_srcbusid = 0;
444 lintsrc.mpc_srcbusirq = 0;
445 lintsrc.mpc_destapic = MP_APIC_ALL;
446 for (i = 0; i < 2; i++) {
447 lintsrc.mpc_irqtype = linttypes[i];
448 lintsrc.mpc_destapiclint = i;
449 MP_lintsrc_info(&lintsrc);
450 }
451}
452
453static struct intel_mp_floating *mpf_found;
454
455/*
456 * Scan the memory blocks for an SMP configuration block.
457 */
458void __init get_smp_config (void)
459{
460 struct intel_mp_floating *mpf = mpf_found;
461
462 /*
463 * ACPI supports both logical (e.g. Hyper-Threading) and physical
464 * processors, where MPS only supports physical.
465 */
466 if (acpi_lapic && acpi_ioapic) {
467 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
468 return;
469 }
470 else if (acpi_lapic)
471 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
472
473 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
474
475 /*
476 * Now see if we need to read further.
477 */
478 if (mpf->mpf_feature1 != 0) {
479
480 printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
481 construct_default_ISA_mptable(mpf->mpf_feature1);
482
483 } else if (mpf->mpf_physptr) {
484
485 /*
486 * Read the physical hardware table. Anything here will
487 * override the defaults.
488 */
489 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
490 smp_found_config = 0;
491 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
492 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
493 return;
494 }
495 /*
496 * If there are no explicit MP IRQ entries, then we are
497 * broken. We set up most of the low 16 IO-APIC pins to
498 * ISA defaults and hope it will work.
499 */
500 if (!mp_irq_entries) {
501 struct mpc_config_bus bus;
502
503 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
504
505 bus.mpc_type = MP_BUS;
506 bus.mpc_busid = 0;
507 memcpy(bus.mpc_bustype, "ISA ", 6);
508 MP_bus_info(&bus);
509
510 construct_default_ioirq_mptable(0);
511 }
512
513 } else
514 BUG();
515
516 printk(KERN_INFO "Processors: %d\n", num_processors);
517 /*
518 * Only use the first configuration found.
519 */
520}
521
522static int __init smp_scan_config (unsigned long base, unsigned long length)
523{
524 extern void __bad_mpf_size(void);
525 unsigned int *bp = phys_to_virt(base);
526 struct intel_mp_floating *mpf;
527
528 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
529 if (sizeof(*mpf) != 16)
530 __bad_mpf_size();
531
532 while (length > 0) {
533 mpf = (struct intel_mp_floating *)bp;
534 if ((*bp == SMP_MAGIC_IDENT) &&
535 (mpf->mpf_length == 1) &&
536 !mpf_checksum((unsigned char *)bp, 16) &&
537 ((mpf->mpf_specification == 1)
538 || (mpf->mpf_specification == 4)) ) {
539
540 smp_found_config = 1;
541 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
542 if (mpf->mpf_physptr)
543 reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
544 mpf_found = mpf;
545 return 1;
546 }
547 bp += 4;
548 length -= 16;
549 }
550 return 0;
551}
552
553void __init find_smp_config(void)
554{
555 unsigned int address;
556
557 /*
558 * FIXME: Linux assumes you have 640K of base ram..
559 * this continues the error...
560 *
561 * 1) Scan the bottom 1K for a signature
562 * 2) Scan the top 1K of base RAM
563 * 3) Scan the 64K of bios
564 */
565 if (smp_scan_config(0x0,0x400) ||
566 smp_scan_config(639*0x400,0x400) ||
567 smp_scan_config(0xF0000,0x10000))
568 return;
569 /*
570 * If it is an SMP machine we should know now.
571 *
572 * there is a real-mode segmented pointer pointing to the
573 * 4K EBDA area at 0x40E, calculate and scan it here.
574 *
575 * NOTE! There are Linux loaders that will corrupt the EBDA
576 * area, and as such this kind of SMP config may be less
577 * trustworthy, simply because the SMP table may have been
578 * stomped on during early boot. These loaders are buggy and
579 * should be fixed.
580 */
581
582 address = *(unsigned short *)phys_to_virt(0x40E);
583 address <<= 4;
584 if (smp_scan_config(address, 0x1000))
585 return;
586
587 /* If we have come this far, we did not find an MP table */
588 printk(KERN_INFO "No mptable found.\n");
589}
590
591/* --------------------------------------------------------------------------
592 ACPI-based MP Configuration
593 -------------------------------------------------------------------------- */
594
595#ifdef CONFIG_ACPI
596
597void __init mp_register_lapic_address(u64 address)
598{
599 mp_lapic_addr = (unsigned long) address;
600 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
601 if (boot_cpu_id == -1U)
602 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
603}
604
605void __cpuinit mp_register_lapic (u8 id, u8 enabled)
606{
607 struct mpc_config_processor processor;
608 int boot_cpu = 0;
609
610 if (id == boot_cpu_id)
611 boot_cpu = 1;
612
613 processor.mpc_type = MP_PROCESSOR;
614 processor.mpc_apicid = id;
615 processor.mpc_apicver = 0;
616 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
617 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
618 processor.mpc_cpufeature = 0;
619 processor.mpc_featureflag = 0;
620 processor.mpc_reserved[0] = 0;
621 processor.mpc_reserved[1] = 0;
622
623 MP_processor_info(&processor);
624}
625
626#define MP_ISA_BUS 0
627#define MP_MAX_IOAPIC_PIN 127
628
629static struct mp_ioapic_routing {
630 int apic_id;
631 int gsi_start;
632 int gsi_end;
633 u32 pin_programmed[4];
634} mp_ioapic_routing[MAX_IO_APICS];
635
636static int mp_find_ioapic(int gsi)
637{
638 int i = 0;
639
640 /* Find the IOAPIC that manages this GSI. */
641 for (i = 0; i < nr_ioapics; i++) {
642 if ((gsi >= mp_ioapic_routing[i].gsi_start)
643 && (gsi <= mp_ioapic_routing[i].gsi_end))
644 return i;
645 }
646
647 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
648 return -1;
649}
650
651static u8 uniq_ioapic_id(u8 id)
652{
653 int i;
654 DECLARE_BITMAP(used, 256);
655 bitmap_zero(used, 256);
656 for (i = 0; i < nr_ioapics; i++) {
657 struct mpc_config_ioapic *ia = &mp_ioapics[i];
658 __set_bit(ia->mpc_apicid, used);
659 }
660 if (!test_bit(id, used))
661 return id;
662 return find_first_zero_bit(used, 256);
663}
664
665void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
666{
667 int idx = 0;
668
669 if (bad_ioapic(address))
670 return;
671
672 idx = nr_ioapics;
673
674 mp_ioapics[idx].mpc_type = MP_IOAPIC;
675 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
676 mp_ioapics[idx].mpc_apicaddr = address;
677
678 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
679 mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
680 mp_ioapics[idx].mpc_apicver = 0;
681
682 /*
683 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
684 * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
685 */
686 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
687 mp_ioapic_routing[idx].gsi_start = gsi_base;
688 mp_ioapic_routing[idx].gsi_end = gsi_base +
689 io_apic_get_redir_entries(idx);
690
691 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
692 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
693 mp_ioapics[idx].mpc_apicaddr,
694 mp_ioapic_routing[idx].gsi_start,
695 mp_ioapic_routing[idx].gsi_end);
696
697 nr_ioapics++;
698}
699
700void __init
701mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
702{
703 struct mpc_config_intsrc intsrc;
704 int ioapic = -1;
705 int pin = -1;
706
707 /*
708 * Convert 'gsi' to 'ioapic.pin'.
709 */
710 ioapic = mp_find_ioapic(gsi);
711 if (ioapic < 0)
712 return;
713 pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
714
715 /*
716 * TBD: This check is for faulty timer entries, where the override
717 * erroneously sets the trigger to level, resulting in a HUGE
718 * increase of timer interrupts!
719 */
720 if ((bus_irq == 0) && (trigger == 3))
721 trigger = 1;
722
723 intsrc.mpc_type = MP_INTSRC;
724 intsrc.mpc_irqtype = mp_INT;
725 intsrc.mpc_irqflag = (trigger << 2) | polarity;
726 intsrc.mpc_srcbus = MP_ISA_BUS;
727 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
728 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
729 intsrc.mpc_dstirq = pin; /* INTIN# */
730
731 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
732 intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
733 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
734 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
735
736 mp_irqs[mp_irq_entries] = intsrc;
737 if (++mp_irq_entries == MAX_IRQ_SOURCES)
738 panic("Max # of irq sources exceeded!\n");
739}
740
741void __init mp_config_acpi_legacy_irqs(void)
742{
743 struct mpc_config_intsrc intsrc;
744 int i = 0;
745 int ioapic = -1;
746
747 /*
748	 * Fabricate the legacy ISA bus (bus MP_ISA_BUS).
749 */
750 set_bit(MP_ISA_BUS, mp_bus_not_pci);
751
752 /*
753 * Locate the IOAPIC that manages the ISA IRQs (0-15).
754 */
755 ioapic = mp_find_ioapic(0);
756 if (ioapic < 0)
757 return;
758
759 intsrc.mpc_type = MP_INTSRC;
760 intsrc.mpc_irqflag = 0; /* Conforming */
761 intsrc.mpc_srcbus = MP_ISA_BUS;
762 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
763
764 /*
765	 * Use the default configuration for IRQs 0-15, unless
766	 * overridden by (MADT) interrupt source override entries.
767 */
768 for (i = 0; i < 16; i++) {
769 int idx;
770
771 for (idx = 0; idx < mp_irq_entries; idx++) {
772 struct mpc_config_intsrc *irq = mp_irqs + idx;
773
774 /* Do we already have a mapping for this ISA IRQ? */
775 if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
776 break;
777
778 /* Do we already have a mapping for this IOAPIC pin */
779 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
780 (irq->mpc_dstirq == i))
781 break;
782 }
783
784 if (idx != mp_irq_entries) {
785 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
786 continue; /* IRQ already used */
787 }
788
789 intsrc.mpc_irqtype = mp_INT;
790 intsrc.mpc_srcbusirq = i; /* Identity mapped */
791 intsrc.mpc_dstirq = i;
792
793 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
794 "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
795 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
796 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
797 intsrc.mpc_dstirq);
798
799 mp_irqs[mp_irq_entries] = intsrc;
800 if (++mp_irq_entries == MAX_IRQ_SOURCES)
801 panic("Max # of irq sources exceeded!\n");
802 }
803}
804
805int mp_register_gsi(u32 gsi, int triggering, int polarity)
806{
807 int ioapic = -1;
808 int ioapic_pin = 0;
809 int idx, bit = 0;
810
811 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
812 return gsi;
813
814 /* Don't set up the ACPI SCI because it's already set up */
815 if (acpi_gbl_FADT.sci_interrupt == gsi)
816 return gsi;
817
818 ioapic = mp_find_ioapic(gsi);
819 if (ioapic < 0) {
820 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
821 return gsi;
822 }
823
824 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
825
826 /*
827 * Avoid pin reprogramming. PRTs typically include entries
828 * with redundant pin->gsi mappings (but unique PCI devices);
829 * we only program the IOAPIC on the first.
830 */
831 bit = ioapic_pin % 32;
832 idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
833 if (idx > 3) {
834 printk(KERN_ERR "Invalid reference to IOAPIC pin "
835 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
836 ioapic_pin);
837 return gsi;
838 }
839 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
840 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
841 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
842 return gsi;
843 }
844
845 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
846
847 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
848 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
849 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
850 return gsi;
851}
852#endif /*CONFIG_ACPI*/
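mp_register_gsi() above avoids reprogramming an I/O APIC pin by keeping one bit per pin in pin_programmed[4]: word = pin / 32, bit = pin % 32, covering pins 0-127. A minimal standalone sketch of just that bookkeeping, with names borrowed from the routing structure above:

#include <stdio.h>
#include <stdint.h>

static uint32_t pin_programmed[4];	/* 4 x 32 bits: pins 0..127 */

/* 0: pin claimed now, 1: already programmed, -1: out of range. */
static int claim_pin(int pin)
{
	int idx = pin / 32;		/* which 32-bit word */
	int bit = pin % 32;		/* which bit inside that word */

	if (idx > 3)			/* beyond pin 127 */
		return -1;
	if (pin_programmed[idx] & (1u << bit))
		return 1;		/* redundant PRT entry: skip programming */
	pin_programmed[idx] |= 1u << bit;
	return 0;
}

int main(void)
{
	printf("%d\n", claim_pin(19));	/* 0: first reference, program the pin */
	printf("%d\n", claim_pin(19));	/* 1: duplicate mapping, left alone */
	return 0;
}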
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
new file mode 100644
index 000000000000..0c1069b8d638
--- /dev/null
+++ b/arch/x86/kernel/msr.c
@@ -0,0 +1,224 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * msr.c
15 *
16 * x86 MSR access device
17 *
18 * This device is accessed by lseek() to the appropriate register number
19 * and then read/write in chunks of 8 bytes. A larger size means multiple
20 * reads or writes of the same register.
21 *
22 * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
25
26#include <linux/module.h>
27
28#include <linux/types.h>
29#include <linux/errno.h>
30#include <linux/fcntl.h>
31#include <linux/init.h>
32#include <linux/poll.h>
33#include <linux/smp.h>
34#include <linux/smp_lock.h>
35#include <linux/major.h>
36#include <linux/fs.h>
37#include <linux/device.h>
38#include <linux/cpu.h>
39#include <linux/notifier.h>
40
41#include <asm/processor.h>
42#include <asm/msr.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static struct class *msr_class;
47
48static loff_t msr_seek(struct file *file, loff_t offset, int orig)
49{
50 loff_t ret = -EINVAL;
51
52 lock_kernel();
53 switch (orig) {
54 case 0:
55 file->f_pos = offset;
56 ret = file->f_pos;
57 break;
58 case 1:
59 file->f_pos += offset;
60 ret = file->f_pos;
61 }
62 unlock_kernel();
63 return ret;
64}
65
66static ssize_t msr_read(struct file *file, char __user * buf,
67 size_t count, loff_t * ppos)
68{
69 u32 __user *tmp = (u32 __user *) buf;
70 u32 data[2];
71 u32 reg = *ppos;
72 int cpu = iminor(file->f_path.dentry->d_inode);
73 int err;
74
75 if (count % 8)
76 return -EINVAL; /* Invalid chunk size */
77
78 for (; count; count -= 8) {
79 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
80 if (err)
81 return -EIO;
82 if (copy_to_user(tmp, &data, 8))
83 return -EFAULT;
84 tmp += 2;
85 }
86
87 return ((char __user *)tmp) - buf;
88}
89
90static ssize_t msr_write(struct file *file, const char __user *buf,
91 size_t count, loff_t *ppos)
92{
93 const u32 __user *tmp = (const u32 __user *)buf;
94 u32 data[2];
95 u32 reg = *ppos;
96 int cpu = iminor(file->f_path.dentry->d_inode);
97 int err;
98
99 if (count % 8)
100 return -EINVAL; /* Invalid chunk size */
101
102 for (; count; count -= 8) {
103 if (copy_from_user(&data, tmp, 8))
104 return -EFAULT;
105 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
106 if (err)
107 return -EIO;
108 tmp += 2;
109 }
110
111 return ((char __user *)tmp) - buf;
112}
113
114static int msr_open(struct inode *inode, struct file *file)
115{
116 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
117 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
118
119 if (cpu >= NR_CPUS || !cpu_online(cpu))
120 return -ENXIO; /* No such CPU */
121 if (!cpu_has(c, X86_FEATURE_MSR))
122 return -EIO; /* MSR not supported */
123
124 return 0;
125}
126
127/*
128 * File operations we support
129 */
130static const struct file_operations msr_fops = {
131 .owner = THIS_MODULE,
132 .llseek = msr_seek,
133 .read = msr_read,
134 .write = msr_write,
135 .open = msr_open,
136};
137
138static int msr_device_create(int i)
139{
140 int err = 0;
141 struct device *dev;
142
143 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i);
144 if (IS_ERR(dev))
145 err = PTR_ERR(dev);
146 return err;
147}
148
149static int msr_class_cpu_callback(struct notifier_block *nfb,
150 unsigned long action, void *hcpu)
151{
152 unsigned int cpu = (unsigned long)hcpu;
153
154 switch (action) {
155 case CPU_ONLINE:
156 case CPU_ONLINE_FROZEN:
157 msr_device_create(cpu);
158 break;
159 case CPU_DEAD:
160 case CPU_DEAD_FROZEN:
161 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
162 break;
163 }
164 return NOTIFY_OK;
165}
166
167static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
168{
169 .notifier_call = msr_class_cpu_callback,
170};
171
172static int __init msr_init(void)
173{
174 int i, err = 0;
175 i = 0;
176
177 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
178 printk(KERN_ERR "msr: unable to get major %d for msr\n",
179 MSR_MAJOR);
180 err = -EBUSY;
181 goto out;
182 }
183 msr_class = class_create(THIS_MODULE, "msr");
184 if (IS_ERR(msr_class)) {
185 err = PTR_ERR(msr_class);
186 goto out_chrdev;
187 }
188 for_each_online_cpu(i) {
189 err = msr_device_create(i);
190 if (err != 0)
191 goto out_class;
192 }
193 register_hotcpu_notifier(&msr_class_cpu_notifier);
194
195 err = 0;
196 goto out;
197
198out_class:
199 i = 0;
200 for_each_online_cpu(i)
201 device_destroy(msr_class, MKDEV(MSR_MAJOR, i));
202 class_destroy(msr_class);
203out_chrdev:
204 unregister_chrdev(MSR_MAJOR, "cpu/msr");
205out:
206 return err;
207}
208
209static void __exit msr_exit(void)
210{
211 int cpu = 0;
212 for_each_online_cpu(cpu)
213 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
214 class_destroy(msr_class);
215 unregister_chrdev(MSR_MAJOR, "cpu/msr");
216 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
217}
218
219module_init(msr_init);
220module_exit(msr_exit)
221
222MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
223MODULE_DESCRIPTION("x86 generic MSR driver");
224MODULE_LICENSE("GPL");
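As the header comment says, the device is driven by seeking to the MSR index and then reading or writing multiples of 8 bytes, with the minor number selecting the CPU. A rough user-space illustration of that access pattern follows; it assumes the /dev/cpu/0/msr node exists and root privileges, and MSR 0x10 (the time-stamp counter) is used purely as an example index.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);	/* minor 0 -> CPU 0 */

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	/* seek to the MSR index, then read exactly 8 bytes */
	if (lseek(fd, 0x10, SEEK_SET) == (off_t)-1 ||
	    read(fd, &val, sizeof(val)) != sizeof(val)) {
		perror("read MSR 0x10");
		close(fd);
		return 1;
	}
	printf("MSR 0x10 = %#llx\n", (unsigned long long)val);
	close(fd);
	return 0;
}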
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
new file mode 100644
index 000000000000..c7227e2180f8
--- /dev/null
+++ b/arch/x86/kernel/nmi_32.c
@@ -0,0 +1,468 @@
1/*
2 * linux/arch/i386/nmi.c
3 *
4 * NMI watchdog support on APIC systems
5 *
6 * Started by Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
12 * Pavel Machek and
13 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
14 */
15
16#include <linux/delay.h>
17#include <linux/interrupt.h>
18#include <linux/module.h>
19#include <linux/nmi.h>
20#include <linux/sysdev.h>
21#include <linux/sysctl.h>
22#include <linux/percpu.h>
23#include <linux/kprobes.h>
24#include <linux/cpumask.h>
25#include <linux/kernel_stat.h>
26#include <linux/kdebug.h>
27
28#include <asm/smp.h>
29#include <asm/nmi.h>
30
31#include "mach_traps.h"
32
33int unknown_nmi_panic;
34int nmi_watchdog_enabled;
35
36static cpumask_t backtrace_mask = CPU_MASK_NONE;
37
38/* nmi_active:
39 * >0: the lapic NMI watchdog is active, but can be disabled
40 * <0: the lapic NMI watchdog has not been set up, and cannot
41 * be enabled
42 * 0: the lapic NMI watchdog is disabled, but can be enabled
43 */
44atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
45
46unsigned int nmi_watchdog = NMI_DEFAULT;
47static unsigned int nmi_hz = HZ;
48
49static DEFINE_PER_CPU(short, wd_enabled);
50
51/* local prototypes */
52static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
53
54static int endflag __initdata = 0;
55
56#ifdef CONFIG_SMP
57/* The performance counters used by NMI_LOCAL_APIC don't trigger when
58 * the CPU is idle. To make sure the NMI watchdog really ticks on all
59 * CPUs during the test make them busy.
60 */
61static __init void nmi_cpu_busy(void *data)
62{
63 local_irq_enable_in_hardirq();
64 /* Intentionally don't use cpu_relax here. This is
65 to make sure that the performance counter really ticks,
66 even if there is a simulator or similar that catches the
67 pause instruction. On a real HT machine this is fine because
68 all other CPUs are busy with "useless" delay loops and don't
69	   care if they get somewhat fewer cycles. */
70 while (endflag == 0)
71 mb();
72}
73#endif
74
75static int __init check_nmi_watchdog(void)
76{
77 unsigned int *prev_nmi_count;
78 int cpu;
79
80 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
81 return 0;
82
83 if (!atomic_read(&nmi_active))
84 return 0;
85
86 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
87 if (!prev_nmi_count)
88 return -1;
89
90 printk(KERN_INFO "Testing NMI watchdog ... ");
91
92 if (nmi_watchdog == NMI_LOCAL_APIC)
93 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
94
95 for_each_possible_cpu(cpu)
96 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
97 local_irq_enable();
98 mdelay((20*1000)/nmi_hz); // wait 20 ticks
99
100 for_each_possible_cpu(cpu) {
101#ifdef CONFIG_SMP
102 /* Check cpu_callin_map here because that is set
103 after the timer is started. */
104 if (!cpu_isset(cpu, cpu_callin_map))
105 continue;
106#endif
107 if (!per_cpu(wd_enabled, cpu))
108 continue;
109 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
110 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
111 cpu,
112 prev_nmi_count[cpu],
113 nmi_count(cpu));
114 per_cpu(wd_enabled, cpu) = 0;
115 atomic_dec(&nmi_active);
116 }
117 }
118 endflag = 1;
119 if (!atomic_read(&nmi_active)) {
120 kfree(prev_nmi_count);
121 atomic_set(&nmi_active, -1);
122 return -1;
123 }
124 printk("OK.\n");
125
126 /* now that we know it works we can reduce NMI frequency to
127 something more reasonable; makes a difference in some configs */
128 if (nmi_watchdog == NMI_LOCAL_APIC)
129 nmi_hz = lapic_adjust_nmi_hz(1);
130
131 kfree(prev_nmi_count);
132 return 0;
133}
134/* This needs to happen later in boot so counters are working */
135late_initcall(check_nmi_watchdog);
136
137static int __init setup_nmi_watchdog(char *str)
138{
139 int nmi;
140
141 get_option(&str, &nmi);
142
143 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
144 return 0;
145
146 nmi_watchdog = nmi;
147 return 1;
148}
149
150__setup("nmi_watchdog=", setup_nmi_watchdog);
151
152
153/* Suspend/resume support */
154
155#ifdef CONFIG_PM
156
157static int nmi_pm_active; /* nmi_active before suspend */
158
159static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
160{
161 /* only CPU0 goes here, other CPUs should be offline */
162 nmi_pm_active = atomic_read(&nmi_active);
163 stop_apic_nmi_watchdog(NULL);
164 BUG_ON(atomic_read(&nmi_active) != 0);
165 return 0;
166}
167
168static int lapic_nmi_resume(struct sys_device *dev)
169{
170 /* only CPU0 goes here, other CPUs should be offline */
171 if (nmi_pm_active > 0) {
172 setup_apic_nmi_watchdog(NULL);
173 touch_nmi_watchdog();
174 }
175 return 0;
176}
177
178
179static struct sysdev_class nmi_sysclass = {
180 set_kset_name("lapic_nmi"),
181 .resume = lapic_nmi_resume,
182 .suspend = lapic_nmi_suspend,
183};
184
185static struct sys_device device_lapic_nmi = {
186 .id = 0,
187 .cls = &nmi_sysclass,
188};
189
190static int __init init_lapic_nmi_sysfs(void)
191{
192 int error;
193
194 /* should really be a BUG_ON but b/c this is an
195 * init call, it just doesn't work. -dcz
196 */
197 if (nmi_watchdog != NMI_LOCAL_APIC)
198 return 0;
199
200 if (atomic_read(&nmi_active) < 0)
201 return 0;
202
203 error = sysdev_class_register(&nmi_sysclass);
204 if (!error)
205 error = sysdev_register(&device_lapic_nmi);
206 return error;
207}
208/* must come after the local APIC's device_initcall() */
209late_initcall(init_lapic_nmi_sysfs);
210
211#endif /* CONFIG_PM */
212
213static void __acpi_nmi_enable(void *__unused)
214{
215 apic_write_around(APIC_LVT0, APIC_DM_NMI);
216}
217
218/*
219 * Enable timer based NMIs on all CPUs:
220 */
221void acpi_nmi_enable(void)
222{
223 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
224 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
225}
226
227static void __acpi_nmi_disable(void *__unused)
228{
229 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
230}
231
232/*
233 * Disable timer based NMIs on all CPUs:
234 */
235void acpi_nmi_disable(void)
236{
237 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
238 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
239}
240
241void setup_apic_nmi_watchdog (void *unused)
242{
243 if (__get_cpu_var(wd_enabled))
244 return;
245
246 /* cheap hack to support suspend/resume */
247 /* if cpu0 is not active neither should the other cpus */
248 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
249 return;
250
251 switch (nmi_watchdog) {
252 case NMI_LOCAL_APIC:
253		__get_cpu_var(wd_enabled) = 1; /* enable it first to avoid a race with the handler */
254 if (lapic_watchdog_init(nmi_hz) < 0) {
255 __get_cpu_var(wd_enabled) = 0;
256 return;
257 }
258 /* FALL THROUGH */
259 case NMI_IO_APIC:
260 __get_cpu_var(wd_enabled) = 1;
261 atomic_inc(&nmi_active);
262 }
263}
264
265void stop_apic_nmi_watchdog(void *unused)
266{
267 /* only support LOCAL and IO APICs for now */
268 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
269 (nmi_watchdog != NMI_IO_APIC))
270 return;
271 if (__get_cpu_var(wd_enabled) == 0)
272 return;
273 if (nmi_watchdog == NMI_LOCAL_APIC)
274 lapic_watchdog_stop();
275 __get_cpu_var(wd_enabled) = 0;
276 atomic_dec(&nmi_active);
277}
278
279/*
280 * the best way to detect whether a CPU has a 'hard lockup' problem
281 * is to check its local APIC timer IRQ counts. If they are not
282 * changing then that CPU has some problem.
283 *
284 * as these watchdog NMI IRQs are generated on every CPU, we only
285 * have to check the current processor.
286 *
287 * since NMIs don't listen to _any_ locks, we have to be extremely
288 * careful not to rely on unsafe variables. The printk might lock
289 * up though, so we have to break up any console locks first ...
290 * [when there will be more tty-related locks, break them up
291 * here too!]
292 */
293
294static unsigned int
295 last_irq_sums [NR_CPUS],
296 alert_counter [NR_CPUS];
297
298void touch_nmi_watchdog(void)
299{
300 if (nmi_watchdog > 0) {
301 unsigned cpu;
302
303 /*
304 * Just reset the alert counters, (other CPUs might be
305 * spinning on locks we hold):
306 */
307 for_each_present_cpu(cpu) {
308 if (alert_counter[cpu])
309 alert_counter[cpu] = 0;
310 }
311 }
312
313 /*
314 * Tickle the softlockup detector too:
315 */
316 touch_softlockup_watchdog();
317}
318EXPORT_SYMBOL(touch_nmi_watchdog);
319
320extern void die_nmi(struct pt_regs *, const char *msg);
321
322__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
323{
324
325 /*
326 * Since current_thread_info()-> is always on the stack, and we
327 * always switch the stack NMI-atomically, it's safe to use
328 * smp_processor_id().
329 */
330 unsigned int sum;
331 int touched = 0;
332 int cpu = smp_processor_id();
333 int rc=0;
334
335 /* check for other users first */
336 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
337 == NOTIFY_STOP) {
338 rc = 1;
339 touched = 1;
340 }
341
342 if (cpu_isset(cpu, backtrace_mask)) {
343 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
344
345 spin_lock(&lock);
346 printk("NMI backtrace for cpu %d\n", cpu);
347 dump_stack();
348 spin_unlock(&lock);
349 cpu_clear(cpu, backtrace_mask);
350 }
351
352 /*
353 * Take the local apic timer and PIT/HPET into account. We don't
354 * know which one is active, when we have highres/dyntick on
355 */
356 sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0];
357
358	/* if none of the timers is firing, this cpu isn't doing much */
359 if (!touched && last_irq_sums[cpu] == sum) {
360 /*
361 * Ayiee, looks like this CPU is stuck ...
362 * wait a few IRQs (5 seconds) before doing the oops ...
363 */
364 alert_counter[cpu]++;
365 if (alert_counter[cpu] == 5*nmi_hz)
366 /*
367 * die_nmi will return ONLY if NOTIFY_STOP happens..
368 */
369 die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
370 } else {
371 last_irq_sums[cpu] = sum;
372 alert_counter[cpu] = 0;
373 }
374 /* see if the nmi watchdog went off */
375 if (!__get_cpu_var(wd_enabled))
376 return rc;
377 switch (nmi_watchdog) {
378 case NMI_LOCAL_APIC:
379 rc |= lapic_wd_event(nmi_hz);
380 break;
381 case NMI_IO_APIC:
382 /* don't know how to accurately check for this.
383 * just assume it was a watchdog timer interrupt
384 * This matches the old behaviour.
385 */
386 rc = 1;
387 break;
388 }
389 return rc;
390}
391
392int do_nmi_callback(struct pt_regs * regs, int cpu)
393{
394#ifdef CONFIG_SYSCTL
395 if (unknown_nmi_panic)
396 return unknown_nmi_panic_callback(regs, cpu);
397#endif
398 return 0;
399}
400
401#ifdef CONFIG_SYSCTL
402
403static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
404{
405 unsigned char reason = get_nmi_reason();
406 char buf[64];
407
408 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
409 die_nmi(regs, buf);
410 return 0;
411}
412
413/*
414 * proc handler for /proc/sys/kernel/nmi
415 */
416int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
417 void __user *buffer, size_t *length, loff_t *ppos)
418{
419 int old_state;
420
421 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
422 old_state = nmi_watchdog_enabled;
423 proc_dointvec(table, write, file, buffer, length, ppos);
424 if (!!old_state == !!nmi_watchdog_enabled)
425 return 0;
426
427 if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
428 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
429 return -EIO;
430 }
431
432 if (nmi_watchdog == NMI_DEFAULT) {
433 if (lapic_watchdog_ok())
434 nmi_watchdog = NMI_LOCAL_APIC;
435 else
436 nmi_watchdog = NMI_IO_APIC;
437 }
438
439 if (nmi_watchdog == NMI_LOCAL_APIC) {
440 if (nmi_watchdog_enabled)
441 enable_lapic_nmi_watchdog();
442 else
443 disable_lapic_nmi_watchdog();
444 } else {
445 printk( KERN_WARNING
446 "NMI watchdog doesn't know what hardware to touch\n");
447 return -EIO;
448 }
449 return 0;
450}
451
452#endif
453
454void __trigger_all_cpu_backtrace(void)
455{
456 int i;
457
458 backtrace_mask = cpu_online_map;
459 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
460 for (i = 0; i < 10 * 1000; i++) {
461 if (cpus_empty(backtrace_mask))
462 break;
463 mdelay(1);
464 }
465}
466
467EXPORT_SYMBOL(nmi_active);
468EXPORT_SYMBOL(nmi_watchdog);
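The lockup test in nmi_watchdog_tick() above reduces to: sample the CPU's timer-interrupt count on every watchdog NMI and, if it has not advanced for 5*nmi_hz consecutive samples, call die_nmi(). The following standalone sketch simulates only that counting logic; the interrupt sum is fabricated here, whereas in the kernel it is apic_timer_irqs plus irqs[0].

#include <stdio.h>

#define NMI_HZ 10				/* simulated watchdog rate */

static unsigned int last_irq_sum;
static unsigned int alert_counter;

/* One simulated watchdog tick: returns 1 if a lockup would be declared. */
static int watchdog_tick(unsigned int irq_sum)
{
	if (irq_sum == last_irq_sum) {
		/* timer interrupts stopped advancing on this CPU */
		if (++alert_counter == 5 * NMI_HZ)
			return 1;		/* ~5 seconds stuck -> die_nmi() */
	} else {
		last_irq_sum = irq_sum;
		alert_counter = 0;
	}
	return 0;
}

int main(void)
{
	unsigned int sum = 0;
	int tick;

	for (tick = 0; tick < 100; tick++) {
		if (tick < 20)
			sum++;			/* timer still firing: counter resets */
		if (watchdog_tick(sum)) {
			printf("lockup detected at tick %d\n", tick);
			break;
		}
	}
	return 0;
}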
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
new file mode 100644
index 000000000000..0ec6d2ddb931
--- /dev/null
+++ b/arch/x86/kernel/nmi_64.c
@@ -0,0 +1,483 @@
1/*
2 * linux/arch/x86_64/nmi.c
3 *
4 * NMI watchdog support on APIC systems
5 *
6 * Started by Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Pavel Machek and
12 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
13 */
14
15#include <linux/nmi.h>
16#include <linux/mm.h>
17#include <linux/delay.h>
18#include <linux/interrupt.h>
19#include <linux/module.h>
20#include <linux/sysdev.h>
21#include <linux/sysctl.h>
22#include <linux/kprobes.h>
23#include <linux/cpumask.h>
24#include <linux/kdebug.h>
25
26#include <asm/smp.h>
27#include <asm/nmi.h>
28#include <asm/proto.h>
29#include <asm/mce.h>
30
31int unknown_nmi_panic;
32int nmi_watchdog_enabled;
33int panic_on_unrecovered_nmi;
34
35static cpumask_t backtrace_mask = CPU_MASK_NONE;
36
37/* nmi_active:
38 * >0: the lapic NMI watchdog is active, but can be disabled
39 * <0: the lapic NMI watchdog has not been set up, and cannot
40 * be enabled
41 * 0: the lapic NMI watchdog is disabled, but can be enabled
42 */
43atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
44int panic_on_timeout;
45
46unsigned int nmi_watchdog = NMI_DEFAULT;
47static unsigned int nmi_hz = HZ;
48
49static DEFINE_PER_CPU(short, wd_enabled);
50
51/* local prototypes */
52static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
53
54/* Run after command line and cpu_init init, but before all other checks */
55void nmi_watchdog_default(void)
56{
57 if (nmi_watchdog != NMI_DEFAULT)
58 return;
59 nmi_watchdog = NMI_NONE;
60}
61
62static int endflag __initdata = 0;
63
64#ifdef CONFIG_SMP
65/* The performance counters used by NMI_LOCAL_APIC don't trigger when
66 * the CPU is idle. To make sure the NMI watchdog really ticks on all
67 * CPUs during the test make them busy.
68 */
69static __init void nmi_cpu_busy(void *data)
70{
71 local_irq_enable_in_hardirq();
72 /* Intentionally don't use cpu_relax here. This is
73 to make sure that the performance counter really ticks,
74 even if there is a simulator or similar that catches the
75 pause instruction. On a real HT machine this is fine because
76 all other CPUs are busy with "useless" delay loops and don't
77	   care if they get somewhat fewer cycles. */
78 while (endflag == 0)
79 mb();
80}
81#endif
82
83int __init check_nmi_watchdog (void)
84{
85 int *counts;
86 int cpu;
87
88 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
89 return 0;
90
91 if (!atomic_read(&nmi_active))
92 return 0;
93
94 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
95 if (!counts)
96 return -1;
97
98 printk(KERN_INFO "testing NMI watchdog ... ");
99
100#ifdef CONFIG_SMP
101 if (nmi_watchdog == NMI_LOCAL_APIC)
102 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
103#endif
104
105 for (cpu = 0; cpu < NR_CPUS; cpu++)
106 counts[cpu] = cpu_pda(cpu)->__nmi_count;
107 local_irq_enable();
108 mdelay((20*1000)/nmi_hz); // wait 20 ticks
109
110 for_each_online_cpu(cpu) {
111 if (!per_cpu(wd_enabled, cpu))
112 continue;
113 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
114 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
115 cpu,
116 counts[cpu],
117 cpu_pda(cpu)->__nmi_count);
118 per_cpu(wd_enabled, cpu) = 0;
119 atomic_dec(&nmi_active);
120 }
121 }
122 if (!atomic_read(&nmi_active)) {
123 kfree(counts);
124 atomic_set(&nmi_active, -1);
125 endflag = 1;
126 return -1;
127 }
128 endflag = 1;
129 printk("OK.\n");
130
131 /* now that we know it works we can reduce NMI frequency to
132 something more reasonable; makes a difference in some configs */
133 if (nmi_watchdog == NMI_LOCAL_APIC)
134 nmi_hz = lapic_adjust_nmi_hz(1);
135
136 kfree(counts);
137 return 0;
138}
139
140int __init setup_nmi_watchdog(char *str)
141{
142 int nmi;
143
144 if (!strncmp(str,"panic",5)) {
145 panic_on_timeout = 1;
146 str = strchr(str, ',');
147 if (!str)
148 return 1;
149 ++str;
150 }
151
152 get_option(&str, &nmi);
153
154 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
155 return 0;
156
157 nmi_watchdog = nmi;
158 return 1;
159}
160
161__setup("nmi_watchdog=", setup_nmi_watchdog);
162
163
164static void __acpi_nmi_disable(void *__unused)
165{
166 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
167}
168
169/*
170 * Disable timer based NMIs on all CPUs:
171 */
172void acpi_nmi_disable(void)
173{
174 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
175 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
176}
177
178static void __acpi_nmi_enable(void *__unused)
179{
180 apic_write(APIC_LVT0, APIC_DM_NMI);
181}
182
183/*
184 * Enable timer based NMIs on all CPUs:
185 */
186void acpi_nmi_enable(void)
187{
188 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
189 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
190}
191#ifdef CONFIG_PM
192
193static int nmi_pm_active; /* nmi_active before suspend */
194
195static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
196{
197 /* only CPU0 goes here, other CPUs should be offline */
198 nmi_pm_active = atomic_read(&nmi_active);
199 stop_apic_nmi_watchdog(NULL);
200 BUG_ON(atomic_read(&nmi_active) != 0);
201 return 0;
202}
203
204static int lapic_nmi_resume(struct sys_device *dev)
205{
206 /* only CPU0 goes here, other CPUs should be offline */
207 if (nmi_pm_active > 0) {
208 setup_apic_nmi_watchdog(NULL);
209 touch_nmi_watchdog();
210 }
211 return 0;
212}
213
214static struct sysdev_class nmi_sysclass = {
215 set_kset_name("lapic_nmi"),
216 .resume = lapic_nmi_resume,
217 .suspend = lapic_nmi_suspend,
218};
219
220static struct sys_device device_lapic_nmi = {
221 .id = 0,
222 .cls = &nmi_sysclass,
223};
224
225static int __init init_lapic_nmi_sysfs(void)
226{
227 int error;
228
229 /* should really be a BUG_ON but b/c this is an
230 * init call, it just doesn't work. -dcz
231 */
232 if (nmi_watchdog != NMI_LOCAL_APIC)
233 return 0;
234
235 if ( atomic_read(&nmi_active) < 0 )
236 return 0;
237
238 error = sysdev_class_register(&nmi_sysclass);
239 if (!error)
240 error = sysdev_register(&device_lapic_nmi);
241 return error;
242}
243/* must come after the local APIC's device_initcall() */
244late_initcall(init_lapic_nmi_sysfs);
245
246#endif /* CONFIG_PM */
247
248void setup_apic_nmi_watchdog(void *unused)
249{
250 if (__get_cpu_var(wd_enabled) == 1)
251 return;
252
253 /* cheap hack to support suspend/resume */
254 /* if cpu0 is not active neither should the other cpus */
255 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
256 return;
257
258 switch (nmi_watchdog) {
259 case NMI_LOCAL_APIC:
260 __get_cpu_var(wd_enabled) = 1;
261 if (lapic_watchdog_init(nmi_hz) < 0) {
262 __get_cpu_var(wd_enabled) = 0;
263 return;
264 }
265 /* FALL THROUGH */
266 case NMI_IO_APIC:
267 __get_cpu_var(wd_enabled) = 1;
268 atomic_inc(&nmi_active);
269 }
270}
271
272void stop_apic_nmi_watchdog(void *unused)
273{
274 /* only support LOCAL and IO APICs for now */
275 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
276 (nmi_watchdog != NMI_IO_APIC))
277 return;
278 if (__get_cpu_var(wd_enabled) == 0)
279 return;
280 if (nmi_watchdog == NMI_LOCAL_APIC)
281 lapic_watchdog_stop();
282 __get_cpu_var(wd_enabled) = 0;
283 atomic_dec(&nmi_active);
284}
285
286/*
287 * the best way to detect whether a CPU has a 'hard lockup' problem
288 * is to check its local APIC timer IRQ counts. If they are not
289 * changing then that CPU has some problem.
290 *
291 * as these watchdog NMI IRQs are generated on every CPU, we only
292 * have to check the current processor.
293 */
294
295static DEFINE_PER_CPU(unsigned, last_irq_sum);
296static DEFINE_PER_CPU(local_t, alert_counter);
297static DEFINE_PER_CPU(int, nmi_touch);
298
299void touch_nmi_watchdog(void)
300{
301 if (nmi_watchdog > 0) {
302 unsigned cpu;
303
304 /*
305 * Tell other CPUs to reset their alert counters. We cannot
306 * do it ourselves because the alert count increase is not
307 * atomic.
308 */
309 for_each_present_cpu(cpu) {
310 if (per_cpu(nmi_touch, cpu) != 1)
311 per_cpu(nmi_touch, cpu) = 1;
312 }
313 }
314
315 touch_softlockup_watchdog();
316}
317
318int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
319{
320 int sum;
321 int touched = 0;
322 int cpu = smp_processor_id();
323 int rc = 0;
324
325 /* check for other users first */
326 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
327 == NOTIFY_STOP) {
328 rc = 1;
329 touched = 1;
330 }
331
332 sum = read_pda(apic_timer_irqs);
333 if (__get_cpu_var(nmi_touch)) {
334 __get_cpu_var(nmi_touch) = 0;
335 touched = 1;
336 }
337
338 if (cpu_isset(cpu, backtrace_mask)) {
339 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
340
341 spin_lock(&lock);
342 printk("NMI backtrace for cpu %d\n", cpu);
343 dump_stack();
344 spin_unlock(&lock);
345 cpu_clear(cpu, backtrace_mask);
346 }
347
348#ifdef CONFIG_X86_MCE
349 /* Could check oops_in_progress here too, but it's safer
350	   not to */
351 if (atomic_read(&mce_entry) > 0)
352 touched = 1;
353#endif
354 /* if the apic timer isn't firing, this cpu isn't doing much */
355 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
356 /*
357 * Ayiee, looks like this CPU is stuck ...
358 * wait a few IRQs (5 seconds) before doing the oops ...
359 */
360 local_inc(&__get_cpu_var(alert_counter));
361 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
362 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
363 panic_on_timeout);
364 } else {
365 __get_cpu_var(last_irq_sum) = sum;
366 local_set(&__get_cpu_var(alert_counter), 0);
367 }
368
369 /* see if the nmi watchdog went off */
370 if (!__get_cpu_var(wd_enabled))
371 return rc;
372 switch (nmi_watchdog) {
373 case NMI_LOCAL_APIC:
374 rc |= lapic_wd_event(nmi_hz);
375 break;
376 case NMI_IO_APIC:
377 /* don't know how to accurately check for this.
378 * just assume it was a watchdog timer interrupt
379 * This matches the old behaviour.
380 */
381 rc = 1;
382 break;
383 }
384 return rc;
385}
386
387static unsigned ignore_nmis;
388
389asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
390{
391 nmi_enter();
392 add_pda(__nmi_count,1);
393 if (!ignore_nmis)
394 default_do_nmi(regs);
395 nmi_exit();
396}
397
398int do_nmi_callback(struct pt_regs * regs, int cpu)
399{
400#ifdef CONFIG_SYSCTL
401 if (unknown_nmi_panic)
402 return unknown_nmi_panic_callback(regs, cpu);
403#endif
404 return 0;
405}
406
407void stop_nmi(void)
408{
409 acpi_nmi_disable();
410 ignore_nmis++;
411}
412
413void restart_nmi(void)
414{
415 ignore_nmis--;
416 acpi_nmi_enable();
417}
418
419#ifdef CONFIG_SYSCTL
420
421static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
422{
423 unsigned char reason = get_nmi_reason();
424 char buf[64];
425
426 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
427 die_nmi(buf, regs, 1); /* Always panic here */
428 return 0;
429}
430
431/*
432 * proc handler for /proc/sys/kernel/nmi
433 */
434int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
435 void __user *buffer, size_t *length, loff_t *ppos)
436{
437 int old_state;
438
439 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
440 old_state = nmi_watchdog_enabled;
441 proc_dointvec(table, write, file, buffer, length, ppos);
442 if (!!old_state == !!nmi_watchdog_enabled)
443 return 0;
444
445 if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
446 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
447 return -EIO;
448 }
449
450 /* if nmi_watchdog is not set yet, then set it */
451 nmi_watchdog_default();
452
453 if (nmi_watchdog == NMI_LOCAL_APIC) {
454 if (nmi_watchdog_enabled)
455 enable_lapic_nmi_watchdog();
456 else
457 disable_lapic_nmi_watchdog();
458 } else {
459 printk( KERN_WARNING
460 "NMI watchdog doesn't know what hardware to touch\n");
461 return -EIO;
462 }
463 return 0;
464}
465
466#endif
467
468void __trigger_all_cpu_backtrace(void)
469{
470 int i;
471
472 backtrace_mask = cpu_online_map;
473 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
474 for (i = 0; i < 10 * 1000; i++) {
475 if (cpus_empty(backtrace_mask))
476 break;
477 mdelay(1);
478 }
479}
480
481EXPORT_SYMBOL(nmi_active);
482EXPORT_SYMBOL(nmi_watchdog);
483EXPORT_SYMBOL(touch_nmi_watchdog);
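proc_nmi_enabled() above backs a sysctl, so the watchdog can be toggled at run time by writing 0 or 1 to the file named in its comment. A small sketch of doing that from C; the /proc/sys/kernel/nmi path is taken from that comment (an assumption about the final sysctl name), and the write only succeeds when a usable local-APIC watchdog is available.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* path taken from the "proc handler for /proc/sys/kernel/nmi" comment */
	int fd = open("/proc/sys/kernel/nmi", O_WRONLY);

	if (fd < 0) {
		perror("open /proc/sys/kernel/nmi");
		return 1;
	}
	if (write(fd, "1\n", 2) != 2)	/* "1" enables, "0" disables */
		perror("write");
	close(fd);
	return 0;
}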
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
new file mode 100644
index 000000000000..9000d82c6dc0
--- /dev/null
+++ b/arch/x86/kernel/numaq_32.c
@@ -0,0 +1,89 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <gone@us.ibm.com>
24 */
25
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/module.h>
30#include <linux/nodemask.h>
31#include <asm/numaq.h>
32#include <asm/topology.h>
33#include <asm/processor.h>
34
35#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
36
37/*
38 * Function: smp_dump_qct()
39 *
40 * Description: gets memory layout from the quad config table. This
41 * function also updates node_online_map with the nodes (quads) present.
42 */
43static void __init smp_dump_qct(void)
44{
45 int node;
46 struct eachquadmem *eq;
47 struct sys_cfg_data *scd =
48 (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
49
50 nodes_clear(node_online_map);
51 for_each_node(node) {
52 if (scd->quads_present31_0 & (1 << node)) {
53 node_set_online(node);
54 eq = &scd->eq[node];
55 /* Convert to pages */
56 node_start_pfn[node] = MB_TO_PAGES(
57 eq->hi_shrd_mem_start - eq->priv_mem_size);
58 node_end_pfn[node] = MB_TO_PAGES(
59 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
60
61 memory_present(node,
62 node_start_pfn[node], node_end_pfn[node]);
63 node_remap_size[node] = node_memmap_size_bytes(node,
64 node_start_pfn[node],
65 node_end_pfn[node]);
66 }
67 }
68}
69
70/*
71 * Unlike Summit, we don't really care to let the NUMA-Q
72 * fall back to flat mode. Don't compile for NUMA-Q
73 * unless you really need it!
74 */
75int __init get_memcfg_numaq(void)
76{
77 smp_dump_qct();
78 return 1;
79}
80
81static int __init numaq_tsc_disable(void)
82{
83 if (num_online_nodes() > 1) {
84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
85 tsc_disable = 1;
86 }
87 return 0;
88}
89arch_initcall(numaq_tsc_disable);
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
new file mode 100644
index 000000000000..739cfb207dd7
--- /dev/null
+++ b/arch/x86/kernel/paravirt_32.c
@@ -0,0 +1,392 @@
1/* Paravirtualization interfaces
2 Copyright (C) 2006 Rusty Russell IBM Corporation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include <linux/errno.h>
19#include <linux/module.h>
20#include <linux/efi.h>
21#include <linux/bcd.h>
22#include <linux/highmem.h>
23
24#include <asm/bug.h>
25#include <asm/paravirt.h>
26#include <asm/desc.h>
27#include <asm/setup.h>
28#include <asm/arch_hooks.h>
29#include <asm/time.h>
30#include <asm/irq.h>
31#include <asm/delay.h>
32#include <asm/fixmap.h>
33#include <asm/apic.h>
34#include <asm/tlbflush.h>
35#include <asm/timer.h>
36
37/* nop stub */
38void _paravirt_nop(void)
39{
40}
41
42static void __init default_banner(void)
43{
44 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
45 paravirt_ops.name);
46}
47
48char *memory_setup(void)
49{
50 return paravirt_ops.memory_setup();
51}
52
53/* Simple instruction patching code. */
54#define DEF_NATIVE(name, code) \
55 extern const char start_##name[], end_##name[]; \
56 asm("start_" #name ": " code "; end_" #name ":")
57
58DEF_NATIVE(irq_disable, "cli");
59DEF_NATIVE(irq_enable, "sti");
60DEF_NATIVE(restore_fl, "push %eax; popf");
61DEF_NATIVE(save_fl, "pushf; pop %eax");
62DEF_NATIVE(iret, "iret");
63DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
64DEF_NATIVE(read_cr2, "mov %cr2, %eax");
65DEF_NATIVE(write_cr3, "mov %eax, %cr3");
66DEF_NATIVE(read_cr3, "mov %cr3, %eax");
67DEF_NATIVE(clts, "clts");
68DEF_NATIVE(read_tsc, "rdtsc");
69
70DEF_NATIVE(ud2a, "ud2a");
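/*
 * Each DEF_NATIVE() above emits the raw native instruction bytes for
 * one operation between a start_<name>/end_<name> label pair; for
 * example DEF_NATIVE(irq_disable, "cli") produces the single byte
 * 0xfa.  native_patch() below copies those bytes over the matching
 * paravirt call site when running on bare hardware.
 */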
71
72static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
73 unsigned long addr, unsigned len)
74{
75 const unsigned char *start, *end;
76 unsigned ret;
77
78 switch(type) {
79#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
80 SITE(irq_disable);
81 SITE(irq_enable);
82 SITE(restore_fl);
83 SITE(save_fl);
84 SITE(iret);
85 SITE(irq_enable_sysexit);
86 SITE(read_cr2);
87 SITE(read_cr3);
88 SITE(write_cr3);
89 SITE(clts);
90 SITE(read_tsc);
91#undef SITE
92
93 patch_site:
94 ret = paravirt_patch_insns(ibuf, len, start, end);
95 break;
96
97 case PARAVIRT_PATCH(make_pgd):
98 case PARAVIRT_PATCH(make_pte):
99 case PARAVIRT_PATCH(pgd_val):
100 case PARAVIRT_PATCH(pte_val):
101#ifdef CONFIG_X86_PAE
102 case PARAVIRT_PATCH(make_pmd):
103 case PARAVIRT_PATCH(pmd_val):
104#endif
105 /* These functions end up returning exactly what
106 they're passed, in the same registers. */
107 ret = paravirt_patch_nop();
108 break;
109
110 default:
111 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
112 break;
113 }
114
115 return ret;
116}
117
118unsigned paravirt_patch_nop(void)
119{
120 return 0;
121}
122
123unsigned paravirt_patch_ignore(unsigned len)
124{
125 return len;
126}
127
128struct branch {
129 unsigned char opcode;
130 u32 delta;
131} __attribute__((packed));
132
133unsigned paravirt_patch_call(void *insnbuf,
134 const void *target, u16 tgt_clobbers,
135 unsigned long addr, u16 site_clobbers,
136 unsigned len)
137{
138 struct branch *b = insnbuf;
139 unsigned long delta = (unsigned long)target - (addr+5);
140
141 if (tgt_clobbers & ~site_clobbers)
142 return len; /* target would clobber too much for this site */
143 if (len < 5)
144 return len; /* call too long for patch site */
145
146 b->opcode = 0xe8; /* call */
147 b->delta = delta;
148 BUILD_BUG_ON(sizeof(*b) != 5);
149
150 return 5;
151}
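/*
 * The five bytes written above form a standard ia32 rel32 call:
 * opcode 0xe8 followed by (target - address of the next instruction)
 * as a little-endian 32-bit displacement.  For example, a call site
 * at 0x1000 redirected to a target at 0x2000 is patched to
 * e8 fb 0f 00 00.
 */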
152
153unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
154 unsigned long addr, unsigned len)
155{
156 struct branch *b = insnbuf;
157 unsigned long delta = (unsigned long)target - (addr+5);
158
159 if (len < 5)
160 return len; /* call too long for patch site */
161
162 b->opcode = 0xe9; /* jmp */
163 b->delta = delta;
164
165 return 5;
166}
167
168unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
169 unsigned long addr, unsigned len)
170{
171 void *opfunc = *((void **)&paravirt_ops + type);
172 unsigned ret;
173
174 if (opfunc == NULL)
175 /* If there's no function, patch it with a ud2a (BUG) */
176 ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a);
177 else if (opfunc == paravirt_nop)
178 /* If the operation is a nop, then nop the callsite */
179 ret = paravirt_patch_nop();
180 else if (type == PARAVIRT_PATCH(iret) ||
181 type == PARAVIRT_PATCH(irq_enable_sysexit))
182 /* If operation requires a jmp, then jmp */
183 ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len);
184 else
185 /* Otherwise call the function; assume target could
186 clobber any caller-save reg */
187 ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
188 addr, clobbers, len);
189
190 return ret;
191}
192
193unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
194 const char *start, const char *end)
195{
196 unsigned insn_len = end - start;
197
198 if (insn_len > len || start == NULL)
199 insn_len = len;
200 else
201 memcpy(insnbuf, start, insn_len);
202
203 return insn_len;
204}
205
206void init_IRQ(void)
207{
208 paravirt_ops.init_IRQ();
209}
210
211static void native_flush_tlb(void)
212{
213 __native_flush_tlb();
214}
215
216/*
217 * Global pages have to be flushed a bit differently. Not a real
218 * performance problem because this does not happen often.
219 */
220static void native_flush_tlb_global(void)
221{
222 __native_flush_tlb_global();
223}
224
225static void native_flush_tlb_single(unsigned long addr)
226{
227 __native_flush_tlb_single(addr);
228}
229
230/* These are in entry.S */
231extern void native_iret(void);
232extern void native_irq_enable_sysexit(void);
233
234static int __init print_banner(void)
235{
236 paravirt_ops.banner();
237 return 0;
238}
239core_initcall(print_banner);
240
241static struct resource reserve_ioports = {
242 .start = 0,
243 .end = IO_SPACE_LIMIT,
244 .name = "paravirt-ioport",
245 .flags = IORESOURCE_IO | IORESOURCE_BUSY,
246};
247
248static struct resource reserve_iomem = {
249 .start = 0,
250 .end = -1,
251 .name = "paravirt-iomem",
252 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
253};
254
255/*
256 * Reserve the whole legacy IO space to prevent any legacy drivers
257 * from wasting time probing for their hardware. This is a fairly
258 * brute-force approach to disabling all non-virtual drivers.
259 *
260 * Note that this must be called very early to have any effect.
261 */
262int paravirt_disable_iospace(void)
263{
264 int ret;
265
266 ret = request_resource(&ioport_resource, &reserve_ioports);
267 if (ret == 0) {
268 ret = request_resource(&iomem_resource, &reserve_iomem);
269 if (ret)
270 release_resource(&reserve_ioports);
271 }
272
273 return ret;
274}
275
276struct paravirt_ops paravirt_ops = {
277 .name = "bare hardware",
278 .paravirt_enabled = 0,
279 .kernel_rpl = 0,
280 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
281
282 .patch = native_patch,
283 .banner = default_banner,
284 .arch_setup = paravirt_nop,
285 .memory_setup = machine_specific_memory_setup,
286 .get_wallclock = native_get_wallclock,
287 .set_wallclock = native_set_wallclock,
288 .time_init = hpet_time_init,
289 .init_IRQ = native_init_IRQ,
290
291 .cpuid = native_cpuid,
292 .get_debugreg = native_get_debugreg,
293 .set_debugreg = native_set_debugreg,
294 .clts = native_clts,
295 .read_cr0 = native_read_cr0,
296 .write_cr0 = native_write_cr0,
297 .read_cr2 = native_read_cr2,
298 .write_cr2 = native_write_cr2,
299 .read_cr3 = native_read_cr3,
300 .write_cr3 = native_write_cr3,
301 .read_cr4 = native_read_cr4,
302 .read_cr4_safe = native_read_cr4_safe,
303 .write_cr4 = native_write_cr4,
304 .save_fl = native_save_fl,
305 .restore_fl = native_restore_fl,
306 .irq_disable = native_irq_disable,
307 .irq_enable = native_irq_enable,
308 .safe_halt = native_safe_halt,
309 .halt = native_halt,
310 .wbinvd = native_wbinvd,
311 .read_msr = native_read_msr_safe,
312 .write_msr = native_write_msr_safe,
313 .read_tsc = native_read_tsc,
314 .read_pmc = native_read_pmc,
315 .sched_clock = native_sched_clock,
316 .get_cpu_khz = native_calculate_cpu_khz,
317 .load_tr_desc = native_load_tr_desc,
318 .set_ldt = native_set_ldt,
319 .load_gdt = native_load_gdt,
320 .load_idt = native_load_idt,
321 .store_gdt = native_store_gdt,
322 .store_idt = native_store_idt,
323 .store_tr = native_store_tr,
324 .load_tls = native_load_tls,
325 .write_ldt_entry = write_dt_entry,
326 .write_gdt_entry = write_dt_entry,
327 .write_idt_entry = write_dt_entry,
328 .load_esp0 = native_load_esp0,
329
330 .set_iopl_mask = native_set_iopl_mask,
331 .io_delay = native_io_delay,
332
333#ifdef CONFIG_X86_LOCAL_APIC
334 .apic_write = native_apic_write,
335 .apic_write_atomic = native_apic_write_atomic,
336 .apic_read = native_apic_read,
337 .setup_boot_clock = setup_boot_APIC_clock,
338 .setup_secondary_clock = setup_secondary_APIC_clock,
339 .startup_ipi_hook = paravirt_nop,
340#endif
341 .set_lazy_mode = paravirt_nop,
342
343 .pagetable_setup_start = native_pagetable_setup_start,
344 .pagetable_setup_done = native_pagetable_setup_done,
345
346 .flush_tlb_user = native_flush_tlb,
347 .flush_tlb_kernel = native_flush_tlb_global,
348 .flush_tlb_single = native_flush_tlb_single,
349 .flush_tlb_others = native_flush_tlb_others,
350
351 .alloc_pt = paravirt_nop,
352 .alloc_pd = paravirt_nop,
353 .alloc_pd_clone = paravirt_nop,
354 .release_pt = paravirt_nop,
355 .release_pd = paravirt_nop,
356
357 .set_pte = native_set_pte,
358 .set_pte_at = native_set_pte_at,
359 .set_pmd = native_set_pmd,
360 .pte_update = paravirt_nop,
361 .pte_update_defer = paravirt_nop,
362
363#ifdef CONFIG_HIGHPTE
364 .kmap_atomic_pte = kmap_atomic,
365#endif
366
367#ifdef CONFIG_X86_PAE
368 .set_pte_atomic = native_set_pte_atomic,
369 .set_pte_present = native_set_pte_present,
370 .set_pud = native_set_pud,
371 .pte_clear = native_pte_clear,
372 .pmd_clear = native_pmd_clear,
373
374 .pmd_val = native_pmd_val,
375 .make_pmd = native_make_pmd,
376#endif
377
378 .pte_val = native_pte_val,
379 .pgd_val = native_pgd_val,
380
381 .make_pte = native_make_pte,
382 .make_pgd = native_make_pgd,
383
384 .irq_enable_sysexit = native_irq_enable_sysexit,
385 .iret = native_iret,
386
387 .dup_mmap = paravirt_nop,
388 .exit_mmap = paravirt_nop,
389 .activate_mm = paravirt_nop,
390};
391
392EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
new file mode 100644
index 000000000000..71da01e73f03
--- /dev/null
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -0,0 +1,1578 @@
1/*
2 * Derived from arch/powerpc/kernel/iommu.c
3 *
4 * Copyright IBM Corporation, 2006-2007
5 * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us>
6 *
7 * Author: Jon Mason <jdmason@kudzu.us>
8 * Author: Muli Ben-Yehuda <muli@il.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25#include <linux/kernel.h>
26#include <linux/init.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/mm.h>
30#include <linux/spinlock.h>
31#include <linux/string.h>
32#include <linux/dma-mapping.h>
33#include <linux/init.h>
34#include <linux/bitops.h>
35#include <linux/pci_ids.h>
36#include <linux/pci.h>
37#include <linux/delay.h>
38#include <asm/iommu.h>
39#include <asm/calgary.h>
40#include <asm/tce.h>
41#include <asm/pci-direct.h>
42#include <asm/system.h>
43#include <asm/dma.h>
44#include <asm/rio.h>
45
46#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
47int use_calgary __read_mostly = 1;
48#else
49int use_calgary __read_mostly = 0;
50#endif /* CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT */
51
52#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
53#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
54
55/* register offsets inside the host bridge space */
56#define CALGARY_CONFIG_REG 0x0108
57#define PHB_CSR_OFFSET 0x0110 /* Channel Status */
58#define PHB_PLSSR_OFFSET 0x0120
59#define PHB_CONFIG_RW_OFFSET 0x0160
60#define PHB_IOBASE_BAR_LOW 0x0170
61#define PHB_IOBASE_BAR_HIGH 0x0180
62#define PHB_MEM_1_LOW 0x0190
63#define PHB_MEM_1_HIGH 0x01A0
64#define PHB_IO_ADDR_SIZE 0x01B0
65#define PHB_MEM_1_SIZE 0x01C0
66#define PHB_MEM_ST_OFFSET 0x01D0
67#define PHB_AER_OFFSET 0x0200
68#define PHB_CONFIG_0_HIGH 0x0220
69#define PHB_CONFIG_0_LOW 0x0230
70#define PHB_CONFIG_0_END 0x0240
71#define PHB_MEM_2_LOW 0x02B0
72#define PHB_MEM_2_HIGH 0x02C0
73#define PHB_MEM_2_SIZE_HIGH 0x02D0
74#define PHB_MEM_2_SIZE_LOW 0x02E0
75#define PHB_DOSHOLE_OFFSET 0x08E0
76
77/* CalIOC2 specific */
78#define PHB_SAVIOR_L2 0x0DB0
79#define PHB_PAGE_MIG_CTRL 0x0DA8
80#define PHB_PAGE_MIG_DEBUG 0x0DA0
81#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
82
83/* PHB_CONFIG_RW */
84#define PHB_TCE_ENABLE 0x20000000
85#define PHB_SLOT_DISABLE 0x1C000000
86#define PHB_DAC_DISABLE 0x01000000
87#define PHB_MEM2_ENABLE 0x00400000
88#define PHB_MCSR_ENABLE 0x00100000
89/* TAR (Table Address Register) */
90#define TAR_SW_BITS 0x0000ffffffff800fUL
91#define TAR_VALID 0x0000000000000008UL
92/* CSR (Channel/DMA Status Register) */
93#define CSR_AGENT_MASK 0xffe0ffff
94/* CCR (Calgary Configuration Register) */
95#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
96/* PMCR/PMDR (Page Migration Control/Debug Registers) */
97#define PMR_SOFTSTOP 0x80000000
98#define PMR_SOFTSTOPFAULT 0x40000000
99#define PMR_HARDSTOP 0x20000000
100
101#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
102#define MAX_NUM_CHASSIS 8 /* max number of chassis */
103/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
104#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
105#define PHBS_PER_CALGARY 4
106
107/* register offsets in Calgary's internal register space */
108static const unsigned long tar_offsets[] = {
109 0x0580 /* TAR0 */,
110 0x0588 /* TAR1 */,
111 0x0590 /* TAR2 */,
112 0x0598 /* TAR3 */
113};
114
115static const unsigned long split_queue_offsets[] = {
116 0x4870 /* SPLIT QUEUE 0 */,
117 0x5870 /* SPLIT QUEUE 1 */,
118 0x6870 /* SPLIT QUEUE 2 */,
119 0x7870 /* SPLIT QUEUE 3 */
120};
121
122static const unsigned long phb_offsets[] = {
123 0x8000 /* PHB0 */,
124 0x9000 /* PHB1 */,
125 0xA000 /* PHB2 */,
126 0xB000 /* PHB3 */
127};
128
129/* PHB debug registers */
130
131static const unsigned long phb_debug_offsets[] = {
132 0x4000 /* PHB 0 DEBUG */,
133 0x5000 /* PHB 1 DEBUG */,
134 0x6000 /* PHB 2 DEBUG */,
135 0x7000 /* PHB 3 DEBUG */
136};
137
138/*
139 * STUFF register for each debug PHB,
140 * byte 1 = start bus number, byte 2 = end bus number
141 */
142
143#define PHB_DEBUG_STUFF_OFFSET 0x0020
144
145#define EMERGENCY_PAGES 32 /* = 128KB */
146
147unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
148static int translate_empty_slots __read_mostly = 0;
149static int calgary_detected __read_mostly = 0;
150
151static struct rio_table_hdr *rio_table_hdr __initdata;
152static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
153static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata;
154
155struct calgary_bus_info {
156 void *tce_space;
157 unsigned char translation_disabled;
158 signed char phbid;
159 void __iomem *bbar;
160};
161
162static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
163static void calgary_tce_cache_blast(struct iommu_table *tbl);
164static void calgary_dump_error_regs(struct iommu_table *tbl);
165static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
166static void calioc2_tce_cache_blast(struct iommu_table *tbl);
167static void calioc2_dump_error_regs(struct iommu_table *tbl);
168
169static struct cal_chipset_ops calgary_chip_ops = {
170 .handle_quirks = calgary_handle_quirks,
171 .tce_cache_blast = calgary_tce_cache_blast,
172 .dump_error_regs = calgary_dump_error_regs
173};
174
175static struct cal_chipset_ops calioc2_chip_ops = {
176 .handle_quirks = calioc2_handle_quirks,
177 .tce_cache_blast = calioc2_tce_cache_blast,
178 .dump_error_regs = calioc2_dump_error_regs
179};
180
181static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
182
183/* enable this to stress test the chip's TCE cache */
184#ifdef CONFIG_IOMMU_DEBUG
185int debugging __read_mostly = 1;
186
187static inline unsigned long verify_bit_range(unsigned long* bitmap,
188 int expected, unsigned long start, unsigned long end)
189{
190 unsigned long idx = start;
191
192 BUG_ON(start >= end);
193
194 while (idx < end) {
195 if (!!test_bit(idx, bitmap) != expected)
196 return idx;
197 ++idx;
198 }
199
200 /* all bits have the expected value */
201 return ~0UL;
202}
203#else /* debugging is disabled */
204int debugging __read_mostly = 0;
205
206static inline unsigned long verify_bit_range(unsigned long* bitmap,
207 int expected, unsigned long start, unsigned long end)
208{
209 return ~0UL;
210}
211
212#endif /* CONFIG_IOMMU_DEBUG */
213
214static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
215{
216 unsigned int npages;
217
218 npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
219 npages >>= PAGE_SHIFT;
220
221 return npages;
222}
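/*
 * Example of the page count above with 4 KB pages: a buffer at
 * dma 0x1ff0 with dmalen 0x20 ends at 0x2010, so
 * PAGE_ALIGN(0x2010) == 0x3000 minus (0x1ff0 & PAGE_MASK) == 0x1000
 * gives 0x2000, i.e. npages == 2, since the buffer straddles a page
 * boundary.
 */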
223
224static inline int translate_phb(struct pci_dev* dev)
225{
226 int disabled = bus_info[dev->bus->number].translation_disabled;
227 return !disabled;
228}
229
230static void iommu_range_reserve(struct iommu_table *tbl,
231 unsigned long start_addr, unsigned int npages)
232{
233 unsigned long index;
234 unsigned long end;
235 unsigned long badbit;
236 unsigned long flags;
237
238 index = start_addr >> PAGE_SHIFT;
239
240 /* bail out if we're asked to reserve a region we don't cover */
241 if (index >= tbl->it_size)
242 return;
243
244 end = index + npages;
245 if (end > tbl->it_size) /* don't go off the table */
246 end = tbl->it_size;
247
248 spin_lock_irqsave(&tbl->it_lock, flags);
249
250 badbit = verify_bit_range(tbl->it_map, 0, index, end);
251 if (badbit != ~0UL) {
252 if (printk_ratelimit())
253 printk(KERN_ERR "Calgary: entry already allocated at "
254 "0x%lx tbl %p dma 0x%lx npages %u\n",
255 badbit, tbl, start_addr, npages);
256 }
257
258 set_bit_string(tbl->it_map, index, npages);
259
260 spin_unlock_irqrestore(&tbl->it_lock, flags);
261}
262
263static unsigned long iommu_range_alloc(struct iommu_table *tbl,
264 unsigned int npages)
265{
266 unsigned long flags;
267 unsigned long offset;
268
269 BUG_ON(npages == 0);
270
271 spin_lock_irqsave(&tbl->it_lock, flags);
272
273 offset = find_next_zero_string(tbl->it_map, tbl->it_hint,
274 tbl->it_size, npages);
275 if (offset == ~0UL) {
276 tbl->chip_ops->tce_cache_blast(tbl);
277 offset = find_next_zero_string(tbl->it_map, 0,
278 tbl->it_size, npages);
279 if (offset == ~0UL) {
280 printk(KERN_WARNING "Calgary: IOMMU full.\n");
281 spin_unlock_irqrestore(&tbl->it_lock, flags);
282 if (panic_on_overflow)
283 panic("Calgary: fix the allocator.\n");
284 else
285 return bad_dma_address;
286 }
287 }
288
289 set_bit_string(tbl->it_map, offset, npages);
290 tbl->it_hint = offset + npages;
291 BUG_ON(tbl->it_hint > tbl->it_size);
292
293 spin_unlock_irqrestore(&tbl->it_lock, flags);
294
295 return offset;
296}
297
298static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr,
299 unsigned int npages, int direction)
300{
301 unsigned long entry;
302 dma_addr_t ret = bad_dma_address;
303
304 entry = iommu_range_alloc(tbl, npages);
305
306 if (unlikely(entry == bad_dma_address))
307 goto error;
308
309 /* set the return dma address */
310 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
311
312 /* put the TCEs in the HW table */
313 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
314 direction);
315
316 return ret;
317
318error:
319 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
320 "iommu %p\n", npages, tbl);
321 return bad_dma_address;
322}
323
324static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
325 unsigned int npages)
326{
327 unsigned long entry;
328 unsigned long badbit;
329 unsigned long badend;
330 unsigned long flags;
331
332 /* were we called with bad_dma_address? */
333 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
334 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
335 printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
336 "address 0x%Lx\n", dma_addr);
337 WARN_ON(1);
338 return;
339 }
340
341 entry = dma_addr >> PAGE_SHIFT;
342
343 BUG_ON(entry + npages > tbl->it_size);
344
345 tce_free(tbl, entry, npages);
346
347 spin_lock_irqsave(&tbl->it_lock, flags);
348
349 badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
350 if (badbit != ~0UL) {
351 if (printk_ratelimit())
352 printk(KERN_ERR "Calgary: bit is off at 0x%lx "
353 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
354 badbit, tbl, dma_addr, entry, npages);
355 }
356
357 __clear_bit_string(tbl->it_map, entry, npages);
358
359 spin_unlock_irqrestore(&tbl->it_lock, flags);
360}
361
362static inline struct iommu_table *find_iommu_table(struct device *dev)
363{
364 struct pci_dev *pdev;
365 struct pci_bus *pbus;
366 struct iommu_table *tbl;
367
368 pdev = to_pci_dev(dev);
369
370 pbus = pdev->bus;
371
372 /* is the device behind a bridge? Look for the root bus */
373 while (pbus->parent)
374 pbus = pbus->parent;
375
376 tbl = pci_iommu(pbus);
377
378 BUG_ON(tbl && (tbl->it_busno != pbus->number));
379
380 return tbl;
381}
382
383static void calgary_unmap_sg(struct device *dev,
384 struct scatterlist *sglist, int nelems, int direction)
385{
386 struct iommu_table *tbl = find_iommu_table(dev);
387
388 if (!translate_phb(to_pci_dev(dev)))
389 return;
390
391 while (nelems--) {
392 unsigned int npages;
393 dma_addr_t dma = sglist->dma_address;
394 unsigned int dmalen = sglist->dma_length;
395
396 if (dmalen == 0)
397 break;
398
399 npages = num_dma_pages(dma, dmalen);
400 iommu_free(tbl, dma, npages);
401 sglist++;
402 }
403}
404
405static int calgary_nontranslate_map_sg(struct device* dev,
406 struct scatterlist *sg, int nelems, int direction)
407{
408 int i;
409
410 for (i = 0; i < nelems; i++ ) {
411 struct scatterlist *s = &sg[i];
412 BUG_ON(!s->page);
413 s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
414 s->dma_length = s->length;
415 }
416 return nelems;
417}
418
419static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
420 int nelems, int direction)
421{
422 struct iommu_table *tbl = find_iommu_table(dev);
423 unsigned long vaddr;
424 unsigned int npages;
425 unsigned long entry;
426 int i;
427
428 if (!translate_phb(to_pci_dev(dev)))
429 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
430
431 for (i = 0; i < nelems; i++ ) {
432 struct scatterlist *s = &sg[i];
433 BUG_ON(!s->page);
434
435 vaddr = (unsigned long)page_address(s->page) + s->offset;
436 npages = num_dma_pages(vaddr, s->length);
437
438 entry = iommu_range_alloc(tbl, npages);
439 if (entry == bad_dma_address) {
440 /* makes sure unmap knows to stop */
441 s->dma_length = 0;
442 goto error;
443 }
444
445 s->dma_address = (entry << PAGE_SHIFT) | s->offset;
446
447 /* insert into HW table */
448 tce_build(tbl, entry, npages, vaddr & PAGE_MASK,
449 direction);
450
451 s->dma_length = s->length;
452 }
453
454 return nelems;
455error:
456 calgary_unmap_sg(dev, sg, nelems, direction);
457 for (i = 0; i < nelems; i++) {
458 sg[i].dma_address = bad_dma_address;
459 sg[i].dma_length = 0;
460 }
461 return 0;
462}
463
464static dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
465 size_t size, int direction)
466{
467 dma_addr_t dma_handle = bad_dma_address;
468 unsigned long uaddr;
469 unsigned int npages;
470 struct iommu_table *tbl = find_iommu_table(dev);
471
472 uaddr = (unsigned long)vaddr;
473 npages = num_dma_pages(uaddr, size);
474
475 if (translate_phb(to_pci_dev(dev)))
476 dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
477 else
478 dma_handle = virt_to_bus(vaddr);
479
480 return dma_handle;
481}
482
483static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
484 size_t size, int direction)
485{
486 struct iommu_table *tbl = find_iommu_table(dev);
487 unsigned int npages;
488
489 if (!translate_phb(to_pci_dev(dev)))
490 return;
491
492 npages = num_dma_pages(dma_handle, size);
493 iommu_free(tbl, dma_handle, npages);
494}
495
496static void* calgary_alloc_coherent(struct device *dev, size_t size,
497 dma_addr_t *dma_handle, gfp_t flag)
498{
499 void *ret = NULL;
500 dma_addr_t mapping;
501 unsigned int npages, order;
502 struct iommu_table *tbl = find_iommu_table(dev);
503
504 size = PAGE_ALIGN(size); /* size rounded up to full pages */
505 npages = size >> PAGE_SHIFT;
506 order = get_order(size);
507
508 /* alloc enough pages (and possibly more) */
509 ret = (void *)__get_free_pages(flag, order);
510 if (!ret)
511 goto error;
512 memset(ret, 0, size);
513
514 if (translate_phb(to_pci_dev(dev))) {
515 /* set up tces to cover the allocated range */
516 mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
517 if (mapping == bad_dma_address)
518 goto free;
519
520 *dma_handle = mapping;
521 } else /* non translated slot */
522 *dma_handle = virt_to_bus(ret);
523
524 return ret;
525
526free:
527 free_pages((unsigned long)ret, get_order(size));
528 ret = NULL;
529error:
530 return ret;
531}
532
533static const struct dma_mapping_ops calgary_dma_ops = {
534 .alloc_coherent = calgary_alloc_coherent,
535 .map_single = calgary_map_single,
536 .unmap_single = calgary_unmap_single,
537 .map_sg = calgary_map_sg,
538 .unmap_sg = calgary_unmap_sg,
539};
540
541static inline void __iomem * busno_to_bbar(unsigned char num)
542{
543 return bus_info[num].bbar;
544}
545
546static inline int busno_to_phbid(unsigned char num)
547{
548 return bus_info[num].phbid;
549}
550
551static inline unsigned long split_queue_offset(unsigned char num)
552{
553 size_t idx = busno_to_phbid(num);
554
555 return split_queue_offsets[idx];
556}
557
558static inline unsigned long tar_offset(unsigned char num)
559{
560 size_t idx = busno_to_phbid(num);
561
562 return tar_offsets[idx];
563}
564
565static inline unsigned long phb_offset(unsigned char num)
566{
567 size_t idx = busno_to_phbid(num);
568
569 return phb_offsets[idx];
570}
571
572static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
573{
574 unsigned long target = ((unsigned long)bar) | offset;
575 return (void __iomem*)target;
576}
577
578static inline int is_calioc2(unsigned short device)
579{
580 return (device == PCI_DEVICE_ID_IBM_CALIOC2);
581}
582
583static inline int is_calgary(unsigned short device)
584{
585 return (device == PCI_DEVICE_ID_IBM_CALGARY);
586}
587
588static inline int is_cal_pci_dev(unsigned short device)
589{
590 return (is_calgary(device) || is_calioc2(device));
591}
592
593static void calgary_tce_cache_blast(struct iommu_table *tbl)
594{
595 u64 val;
596 u32 aer;
597 int i = 0;
598 void __iomem *bbar = tbl->bbar;
599 void __iomem *target;
600
601 /* disable arbitration on the bus */
602 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
603 aer = readl(target);
604 writel(0, target);
605
606 /* read plssr to ensure it got there */
607 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
608 val = readl(target);
609
610 /* poll split queues until all DMA activity is done */
611 target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
612 do {
613 val = readq(target);
614 i++;
615 } while ((val & 0xff) != 0xff && i < 100);
616 if (i == 100)
617 printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
618 "continuing anyway\n");
619
620 /* invalidate TCE cache */
621 target = calgary_reg(bbar, tar_offset(tbl->it_busno));
622 writeq(tbl->tar_val, target);
623
624 /* enable arbitration */
625 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
626 writel(aer, target);
627 (void)readl(target); /* flush */
628}
629
630static void calioc2_tce_cache_blast(struct iommu_table *tbl)
631{
632 void __iomem *bbar = tbl->bbar;
633 void __iomem *target;
634 u64 val64;
635 u32 val;
636 int i = 0;
637 int count = 1;
638 unsigned char bus = tbl->it_busno;
639
640begin:
641 printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
642 "sequence - count %d\n", bus, count);
643
644 /* 1. using the Page Migration Control reg set SoftStop */
645 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
646 val = be32_to_cpu(readl(target));
647 printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
648 val |= PMR_SOFTSTOP;
649 printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
650 writel(cpu_to_be32(val), target);
651
652 /* 2. poll split queues until all DMA activity is done */
653 printk(KERN_DEBUG "2a. starting to poll split queues\n");
654 target = calgary_reg(bbar, split_queue_offset(bus));
655 do {
656 val64 = readq(target);
657 i++;
658 } while ((val64 & 0xff) != 0xff && i < 100);
659 if (i == 100)
660 printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
661 "continuing anyway\n");
662
663 /* 3. poll Page Migration DEBUG for SoftStopFault */
664 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
665 val = be32_to_cpu(readl(target));
666 printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
667
668 /* 4. if SoftStopFault - goto (1) */
669 if (val & PMR_SOFTSTOPFAULT) {
670 if (++count < 100)
671 goto begin;
672 else {
673 printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
674 "aborting TCE cache flush sequence!\n");
675 return; /* pray for the best */
676 }
677 }
678
679 /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
680 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
681 printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
682 val = be32_to_cpu(readl(target));
683 printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
684 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
685 val = be32_to_cpu(readl(target));
686 printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
687
688 /* 6. invalidate TCE cache */
689 printk(KERN_DEBUG "6. invalidating TCE cache\n");
690 target = calgary_reg(bbar, tar_offset(bus));
691 writeq(tbl->tar_val, target);
692
693 /* 7. Re-read PMCR */
694 printk(KERN_DEBUG "7a. Re-reading PMCR\n");
695 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
696 val = be32_to_cpu(readl(target));
697 printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
698
699 /* 8. Remove HardStop */
700 printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
701 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
702 val = 0;
703 printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
704 writel(cpu_to_be32(val), target);
705 val = be32_to_cpu(readl(target));
706 printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
707}
708
709static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
710 u64 limit)
711{
712 unsigned int numpages;
713
714 limit = limit | 0xfffff;
715 limit++;
716
717 numpages = ((limit - start) >> PAGE_SHIFT);
718 iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
719}
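/*
 * The OR with 0xfffff and the increment above round the inclusive
 * limit up to the next 1 MB boundary, e.g. a limit of 0x12345678
 * becomes 0x12400000 before the page count is computed.
 */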
720
721static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
722{
723 void __iomem *target;
724 u64 low, high, sizelow;
725 u64 start, limit;
726 struct iommu_table *tbl = pci_iommu(dev->bus);
727 unsigned char busnum = dev->bus->number;
728 void __iomem *bbar = tbl->bbar;
729
730 /* peripheral MEM_1 region */
731 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
732 low = be32_to_cpu(readl(target));
733 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
734 high = be32_to_cpu(readl(target));
735 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
736 sizelow = be32_to_cpu(readl(target));
737
738 start = (high << 32) | low;
739 limit = sizelow;
740
741 calgary_reserve_mem_region(dev, start, limit);
742}
743
744static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
745{
746 void __iomem *target;
747 u32 val32;
748 u64 low, high, sizelow, sizehigh;
749 u64 start, limit;
750 struct iommu_table *tbl = pci_iommu(dev->bus);
751 unsigned char busnum = dev->bus->number;
752 void __iomem *bbar = tbl->bbar;
753
754 /* is it enabled? */
755 target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
756 val32 = be32_to_cpu(readl(target));
757 if (!(val32 & PHB_MEM2_ENABLE))
758 return;
759
760 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
761 low = be32_to_cpu(readl(target));
762 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
763 high = be32_to_cpu(readl(target));
764 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
765 sizelow = be32_to_cpu(readl(target));
766 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
767 sizehigh = be32_to_cpu(readl(target));
768
769 start = (high << 32) | low;
770 limit = (sizehigh << 32) | sizelow;
771
772 calgary_reserve_mem_region(dev, start, limit);
773}
774
775/*
776 * some regions of the IO address space do not get translated, so we
777 * must not give devices IO addresses in those regions. The regions
778 * are the 640KB-1MB region and the two PCI peripheral memory holes.
779 * Reserve all of them in the IOMMU bitmap to avoid giving them out
780 * later.
781 */
782static void __init calgary_reserve_regions(struct pci_dev *dev)
783{
784 unsigned int npages;
785 u64 start;
786 struct iommu_table *tbl = pci_iommu(dev->bus);
787
788 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
789 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
790
791 /* avoid the BIOS/VGA first 640KB-1MB region */
792 /* for CalIOC2 - avoid the entire first MB */
793 if (is_calgary(dev->device)) {
794 start = (640 * 1024);
795 npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
796 } else { /* calioc2 */
797 start = 0;
798 npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
799 }
800 iommu_range_reserve(tbl, start, npages);
801
802 /* reserve the two PCI peripheral memory regions in IO space */
803 calgary_reserve_peripheral_mem_1(dev);
804 calgary_reserve_peripheral_mem_2(dev);
805}
806
807static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
808{
809 u64 val64;
810 u64 table_phys;
811 void __iomem *target;
812 int ret;
813 struct iommu_table *tbl;
814
815 /* build TCE tables for each PHB */
816 ret = build_tce_table(dev, bbar);
817 if (ret)
818 return ret;
819
820 tbl = pci_iommu(dev->bus);
821 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
822 tce_free(tbl, 0, tbl->it_size);
823
824 if (is_calgary(dev->device))
825 tbl->chip_ops = &calgary_chip_ops;
826 else if (is_calioc2(dev->device))
827 tbl->chip_ops = &calioc2_chip_ops;
828 else
829 BUG();
830
831 calgary_reserve_regions(dev);
832
833 /* set TARs for each PHB */
834 target = calgary_reg(bbar, tar_offset(dev->bus->number));
835 val64 = be64_to_cpu(readq(target));
836
837 /* zero out all TAR bits under sw control */
838 val64 &= ~TAR_SW_BITS;
839 table_phys = (u64)__pa(tbl->it_base);
840
841 val64 |= table_phys;
842
843 BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
844 val64 |= (u64) specified_table_size;
845
846 tbl->tar_val = cpu_to_be64(val64);
847
848 writeq(tbl->tar_val, target);
849 readq(target); /* flush */
850
851 return 0;
852}
853
854static void __init calgary_free_bus(struct pci_dev *dev)
855{
856 u64 val64;
857 struct iommu_table *tbl = pci_iommu(dev->bus);
858 void __iomem *target;
859 unsigned int bitmapsz;
860
861 target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
862 val64 = be64_to_cpu(readq(target));
863 val64 &= ~TAR_SW_BITS;
864 writeq(cpu_to_be64(val64), target);
865 readq(target); /* flush */
866
867 bitmapsz = tbl->it_size / BITS_PER_BYTE;
868 free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
869 tbl->it_map = NULL;
870
871 kfree(tbl);
872
873 set_pci_iommu(dev->bus, NULL);
874
875 /* Can't free bootmem allocated memory after system is up :-( */
876 bus_info[dev->bus->number].tce_space = NULL;
877}
878
879static void calgary_dump_error_regs(struct iommu_table *tbl)
880{
881 void __iomem *bbar = tbl->bbar;
882 void __iomem *target;
883 u32 csr, plssr;
884
885 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
886 csr = be32_to_cpu(readl(target));
887
888 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
889 plssr = be32_to_cpu(readl(target));
890
891 /* If no error, the agent ID in the CSR is not valid */
892 printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
893 "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
894}
895
896static void calioc2_dump_error_regs(struct iommu_table *tbl)
897{
898 void __iomem *bbar = tbl->bbar;
899 u32 csr, csmr, plssr, mck, rcstat;
900 void __iomem *target;
901 unsigned long phboff = phb_offset(tbl->it_busno);
902 unsigned long erroff;
903 u32 errregs[7];
904 int i;
905
906 /* dump CSR */
907 target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
908 csr = be32_to_cpu(readl(target));
909 /* dump PLSSR */
910 target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
911 plssr = be32_to_cpu(readl(target));
912 /* dump CSMR */
913 target = calgary_reg(bbar, phboff | 0x290);
914 csmr = be32_to_cpu(readl(target));
915 /* dump mck */
916 target = calgary_reg(bbar, phboff | 0x800);
917 mck = be32_to_cpu(readl(target));
918
919 printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
920 tbl->it_busno);
921
922 printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
923 csr, plssr, csmr, mck);
924
925 /* dump rest of error regs */
926 printk(KERN_EMERG "Calgary: ");
927 for (i = 0; i < ARRAY_SIZE(errregs); i++) {
928 /* err regs are at 0x810 - 0x870 */
929 erroff = (0x810 + (i * 0x10));
930 target = calgary_reg(bbar, phboff | erroff);
931 errregs[i] = be32_to_cpu(readl(target));
932 printk("0x%08x@0x%lx ", errregs[i], erroff);
933 }
934 printk("\n");
935
936 /* root complex status */
937 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
938 rcstat = be32_to_cpu(readl(target));
939 printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
940 PHB_ROOT_COMPLEX_STATUS);
941}
942
943static void calgary_watchdog(unsigned long data)
944{
945 struct pci_dev *dev = (struct pci_dev *)data;
946 struct iommu_table *tbl = pci_iommu(dev->bus);
947 void __iomem *bbar = tbl->bbar;
948 u32 val32;
949 void __iomem *target;
950
951 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
952 val32 = be32_to_cpu(readl(target));
953
954 /* If no error, the agent ID in the CSR is not valid */
955 if (val32 & CSR_AGENT_MASK) {
956 tbl->chip_ops->dump_error_regs(tbl);
957
958 /* reset error */
959 writel(0, target);
960
961 /* Disable bus that caused the error */
962 target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
963 PHB_CONFIG_RW_OFFSET);
964 val32 = be32_to_cpu(readl(target));
965 val32 |= PHB_SLOT_DISABLE;
966 writel(cpu_to_be32(val32), target);
967 readl(target); /* flush */
968 } else {
969 /* Reset the timer */
970 mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
971 }
972}
973
974static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
975 unsigned char busnum, unsigned long timeout)
976{
977 u64 val64;
978 void __iomem *target;
979 unsigned int phb_shift = ~0; /* silence gcc */
980 u64 mask;
981
982 switch (busno_to_phbid(busnum)) {
983 case 0: phb_shift = (63 - 19);
984 break;
985 case 1: phb_shift = (63 - 23);
986 break;
987 case 2: phb_shift = (63 - 27);
988 break;
989 case 3: phb_shift = (63 - 35);
990 break;
991 default:
992 BUG_ON(busno_to_phbid(busnum));
993 }
994
995 target = calgary_reg(bbar, CALGARY_CONFIG_REG);
996 val64 = be64_to_cpu(readq(target));
997
998 /* zero out this PHB's timer bits */
999 mask = ~(0xFUL << phb_shift);
1000 val64 &= mask;
1001 val64 |= (timeout << phb_shift);
1002 writeq(cpu_to_be64(val64), target);
1003 readq(target); /* flush */
1004}
1005
1006static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1007{
1008 unsigned char busnum = dev->bus->number;
1009 void __iomem *bbar = tbl->bbar;
1010 void __iomem *target;
1011 u32 val;
1012
1013 /*
1014 * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
1015 */
1016 target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
1017 val = cpu_to_be32(readl(target));
1018 val |= 0x00800000;
1019 writel(cpu_to_be32(val), target);
1020}
1021
1022static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1023{
1024 unsigned char busnum = dev->bus->number;
1025
1026 /*
1027 * Give split completion a longer timeout on bus 1 for aic94xx
1028 * http://bugzilla.kernel.org/show_bug.cgi?id=7180
1029 */
1030 if (is_calgary(dev->device) && (busnum == 1))
1031 calgary_set_split_completion_timeout(tbl->bbar, busnum,
1032 CCR_2SEC_TIMEOUT);
1033}
1034
1035static void __init calgary_enable_translation(struct pci_dev *dev)
1036{
1037 u32 val32;
1038 unsigned char busnum;
1039 void __iomem *target;
1040 void __iomem *bbar;
1041 struct iommu_table *tbl;
1042
1043 busnum = dev->bus->number;
1044 tbl = pci_iommu(dev->bus);
1045 bbar = tbl->bbar;
1046
1047 /* enable TCE in PHB Config Register */
1048 target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
1049 val32 = be32_to_cpu(readl(target));
1050 val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
1051
1052 printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
1053 (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
1054 "Calgary" : "CalIOC2", busnum);
1055 printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
1056 "bus.\n");
1057
1058 writel(cpu_to_be32(val32), target);
1059 readl(target); /* flush */
1060
1061 init_timer(&tbl->watchdog_timer);
1062 tbl->watchdog_timer.function = &calgary_watchdog;
1063 tbl->watchdog_timer.data = (unsigned long)dev;
1064 mod_timer(&tbl->watchdog_timer, jiffies);
1065}
1066
1067static void __init calgary_disable_translation(struct pci_dev *dev)
1068{
1069 u32 val32;
1070 unsigned char busnum;
1071 void __iomem *target;
1072 void __iomem *bbar;
1073 struct iommu_table *tbl;
1074
1075 busnum = dev->bus->number;
1076 tbl = pci_iommu(dev->bus);
1077 bbar = tbl->bbar;
1078
1079 /* disable TCE in PHB Config Register */
1080 target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
1081 val32 = be32_to_cpu(readl(target));
1082 val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
1083
1084 printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
1085 writel(cpu_to_be32(val32), target);
1086 readl(target); /* flush */
1087
1088 del_timer_sync(&tbl->watchdog_timer);
1089}
1090
1091static void __init calgary_init_one_nontranslated(struct pci_dev *dev)
1092{
1093 pci_dev_get(dev);
1094 set_pci_iommu(dev->bus, NULL);
1095
1096 /* is the device behind a bridge? */
1097 if (dev->bus->parent)
1098 dev->bus->parent->self = dev;
1099 else
1100 dev->bus->self = dev;
1101}
1102
1103static int __init calgary_init_one(struct pci_dev *dev)
1104{
1105 void __iomem *bbar;
1106 struct iommu_table *tbl;
1107 int ret;
1108
1109 BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
1110
1111 bbar = busno_to_bbar(dev->bus->number);
1112 ret = calgary_setup_tar(dev, bbar);
1113 if (ret)
1114 goto done;
1115
1116 pci_dev_get(dev);
1117
1118 if (dev->bus->parent) {
1119 if (dev->bus->parent->self)
1120 printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
1121 "bus->parent->self!\n", dev);
1122 dev->bus->parent->self = dev;
1123 } else
1124 dev->bus->self = dev;
1125
1126 tbl = pci_iommu(dev->bus);
1127 tbl->chip_ops->handle_quirks(tbl, dev);
1128
1129 calgary_enable_translation(dev);
1130
1131 return 0;
1132
1133done:
1134 return ret;
1135}
1136
1137static int __init calgary_locate_bbars(void)
1138{
1139 int ret;
1140 int rioidx, phb, bus;
1141 void __iomem *bbar;
1142 void __iomem *target;
1143 unsigned long offset;
1144 u8 start_bus, end_bus;
1145 u32 val;
1146
1147 ret = -ENODATA;
1148 for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
1149 struct rio_detail *rio = rio_devs[rioidx];
1150
1151 if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
1152 continue;
1153
1154 /* map entire 1MB of Calgary config space */
1155 bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
1156 if (!bbar)
1157 goto error;
1158
1159 for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
1160 offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
1161 target = calgary_reg(bbar, offset);
1162
1163 val = be32_to_cpu(readl(target));
1164
1165 start_bus = (u8)((val & 0x00FF0000) >> 16);
1166 end_bus = (u8)((val & 0x0000FF00) >> 8);
1167
1168 if (end_bus) {
1169 for (bus = start_bus; bus <= end_bus; bus++) {
1170 bus_info[bus].bbar = bbar;
1171 bus_info[bus].phbid = phb;
1172 }
1173 } else {
1174 bus_info[start_bus].bbar = bbar;
1175 bus_info[start_bus].phbid = phb;
1176 }
1177 }
1178 }
1179
1180 return 0;
1181
1182error:
1183 /* scan bus_info and iounmap any bbars we previously ioremap'd */
1184 for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
1185 if (bus_info[bus].bbar)
1186 iounmap(bus_info[bus].bbar);
1187
1188 return ret;
1189}
1190
1191static int __init calgary_init(void)
1192{
1193 int ret;
1194 struct pci_dev *dev = NULL;
1195 void *tce_space;
1196
1197 ret = calgary_locate_bbars();
1198 if (ret)
1199 return ret;
1200
1201 do {
1202 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1203 if (!dev)
1204 break;
1205 if (!is_cal_pci_dev(dev->device))
1206 continue;
1207 if (!translate_phb(dev)) {
1208			calgary_init_one_nontranslated(dev);
1209 continue;
1210 }
1211 tce_space = bus_info[dev->bus->number].tce_space;
1212 if (!tce_space && !translate_empty_slots)
1213 continue;
1214
1215 ret = calgary_init_one(dev);
1216 if (ret)
1217 goto error;
1218 } while (1);
1219
1220 return ret;
1221
1222error:
1223 do {
1224 dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM,
1225 PCI_ANY_ID, dev);
1226 if (!dev)
1227 break;
1228 if (!is_cal_pci_dev(dev->device))
1229 continue;
1230 if (!translate_phb(dev)) {
1231 pci_dev_put(dev);
1232 continue;
1233 }
1234 if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
1235 continue;
1236
1237 calgary_disable_translation(dev);
1238 calgary_free_bus(dev);
1239 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1240 } while (1);
1241
1242 return ret;
1243}
1244
1245static inline int __init determine_tce_table_size(u64 ram)
1246{
1247 int ret;
1248
1249 if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
1250 return specified_table_size;
1251
1252 /*
1253 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
1254 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
1255 * larger table size has twice as many entries, so shift the
1256 * max ram address by 13 to divide by 8K and then look at the
1257 * order of the result to choose between 0-7.
1258 */
1259 ret = get_order(ram >> 13);
1260 if (ret > TCE_TABLE_SIZE_8M)
1261 ret = TCE_TABLE_SIZE_8M;
1262
1263 return ret;
1264}
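/*
 * Worked example of the heuristic above: with 4 GB of RAM,
 * ram >> 13 is 512 KB and get_order(512 KB) is 7 (for 4 KB pages),
 * so TCE_TABLE_SIZE_8M is chosen; a 64 MB machine gets
 * get_order(8 KB) == 1, i.e. the 128 KB table.
 */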
1265
1266static int __init build_detail_arrays(void)
1267{
1268 unsigned long ptr;
1269 int i, scal_detail_size, rio_detail_size;
1270
1271 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
1272 printk(KERN_WARNING
1273 "Calgary: MAX_NUMNODES too low! Defined as %d, "
1274 "but system has %d nodes.\n",
1275 MAX_NUMNODES, rio_table_hdr->num_scal_dev);
1276 return -ENODEV;
1277 }
1278
1279 switch (rio_table_hdr->version){
1280 case 2:
1281 scal_detail_size = 11;
1282 rio_detail_size = 13;
1283 break;
1284 case 3:
1285 scal_detail_size = 12;
1286 rio_detail_size = 15;
1287 break;
1288 default:
1289 printk(KERN_WARNING
1290 "Calgary: Invalid Rio Grande Table Version: %d\n",
1291 rio_table_hdr->version);
1292 return -EPROTO;
1293 }
1294
1295 ptr = ((unsigned long)rio_table_hdr) + 3;
1296 for (i = 0; i < rio_table_hdr->num_scal_dev;
1297 i++, ptr += scal_detail_size)
1298 scal_devs[i] = (struct scal_detail *)ptr;
1299
1300 for (i = 0; i < rio_table_hdr->num_rio_dev;
1301 i++, ptr += rio_detail_size)
1302 rio_devs[i] = (struct rio_detail *)ptr;
1303
1304 return 0;
1305}
1306
1307static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1308{
1309 int dev;
1310 u32 val;
1311
1312 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
1313 /*
1314		 * FIXME: properly scan for devices across the
1315 * PCI-to-PCI bridge on every CalIOC2 port.
1316 */
1317 return 1;
1318 }
1319
1320 for (dev = 1; dev < 8; dev++) {
1321 val = read_pci_config(bus, dev, 0, 0);
1322 if (val != 0xffffffff)
1323 break;
1324 }
1325 return (val != 0xffffffff);
1326}
1327
1328void __init detect_calgary(void)
1329{
1330 int bus;
1331 void *tbl;
1332 int calgary_found = 0;
1333 unsigned long ptr;
1334 unsigned int offset, prev_offset;
1335 int ret;
1336
1337 /*
1338 * if the user specified iommu=off or iommu=soft or we found
1339 * another HW IOMMU already, bail out.
1340 */
1341 if (swiotlb || no_iommu || iommu_detected)
1342 return;
1343
1344 if (!use_calgary)
1345 return;
1346
1347 if (!early_pci_allowed())
1348 return;
1349
1350 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
1351
1352 ptr = (unsigned long)phys_to_virt(get_bios_ebda());
1353
1354 rio_table_hdr = NULL;
1355 prev_offset = 0;
1356 offset = 0x180;
1357 /*
1358 * The next offset is stored in the 1st word.
1359	 * Keep parsing only while the offset keeps increasing:
1360 */
1361 while (offset > prev_offset) {
1362 /* The block id is stored in the 2nd word */
1363 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
1364 /* set the pointer past the offset & block id */
1365 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
1366 break;
1367 }
1368 prev_offset = offset;
1369 offset = *((unsigned short *)(ptr + offset));
1370 }
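	/*
	 * In the scan above, each EBDA record begins with a 16-bit offset
	 * to the next record followed by a 16-bit block id; 0x4752
	 * presumably encodes the bytes 'R','G' (Rio Grande) when read as
	 * a little-endian word.
	 */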
1371 if (!rio_table_hdr) {
1372 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
1373 "in EBDA - bailing!\n");
1374 return;
1375 }
1376
1377 ret = build_detail_arrays();
1378 if (ret) {
1379 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
1380 return;
1381 }
1382
1383 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
1384
1385 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1386 struct calgary_bus_info *info = &bus_info[bus];
1387 unsigned short pci_device;
1388 u32 val;
1389
1390 val = read_pci_config(bus, 0, 0, 0);
1391 pci_device = (val & 0xFFFF0000) >> 16;
1392
1393 if (!is_cal_pci_dev(pci_device))
1394 continue;
1395
1396 if (info->translation_disabled)
1397 continue;
1398
1399 if (calgary_bus_has_devices(bus, pci_device) ||
1400 translate_empty_slots) {
1401 tbl = alloc_tce_table();
1402 if (!tbl)
1403 goto cleanup;
1404 info->tce_space = tbl;
1405 calgary_found = 1;
1406 }
1407 }
1408
1409 printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
1410 calgary_found ? "found" : "not found");
1411
1412 if (calgary_found) {
1413 iommu_detected = 1;
1414 calgary_detected = 1;
1415 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
1416 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1417 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1418 debugging ? "enabled" : "disabled");
1419 }
1420 return;
1421
1422cleanup:
1423 for (--bus; bus >= 0; --bus) {
1424 struct calgary_bus_info *info = &bus_info[bus];
1425
1426 if (info->tce_space)
1427 free_tce_table(info->tce_space);
1428 }
1429}
1430
1431int __init calgary_iommu_init(void)
1432{
1433 int ret;
1434
1435 if (no_iommu || swiotlb)
1436 return -ENODEV;
1437
1438 if (!calgary_detected)
1439 return -ENODEV;
1440
1441 /* ok, we're trying to use Calgary - let's roll */
1442 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1443
1444 ret = calgary_init();
1445 if (ret) {
1446 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1447 "falling back to no_iommu\n", ret);
1448 if (end_pfn > MAX_DMA32_PFN)
1449 printk(KERN_ERR "WARNING more than 4GB of memory, "
1450 "32bit PCI may malfunction.\n");
1451 return ret;
1452 }
1453
1454 force_iommu = 1;
1455 bad_dma_address = 0x0;
1456 dma_ops = &calgary_dma_ops;
1457
1458 return 0;
1459}
1460
1461static int __init calgary_parse_options(char *p)
1462{
1463 unsigned int bridge;
1464 size_t len;
1465 char* endp;
1466
1467 while (*p) {
1468 if (!strncmp(p, "64k", 3))
1469 specified_table_size = TCE_TABLE_SIZE_64K;
1470 else if (!strncmp(p, "128k", 4))
1471 specified_table_size = TCE_TABLE_SIZE_128K;
1472 else if (!strncmp(p, "256k", 4))
1473 specified_table_size = TCE_TABLE_SIZE_256K;
1474 else if (!strncmp(p, "512k", 4))
1475 specified_table_size = TCE_TABLE_SIZE_512K;
1476 else if (!strncmp(p, "1M", 2))
1477 specified_table_size = TCE_TABLE_SIZE_1M;
1478 else if (!strncmp(p, "2M", 2))
1479 specified_table_size = TCE_TABLE_SIZE_2M;
1480 else if (!strncmp(p, "4M", 2))
1481 specified_table_size = TCE_TABLE_SIZE_4M;
1482 else if (!strncmp(p, "8M", 2))
1483 specified_table_size = TCE_TABLE_SIZE_8M;
1484
1485 len = strlen("translate_empty_slots");
1486 if (!strncmp(p, "translate_empty_slots", len))
1487 translate_empty_slots = 1;
1488
1489 len = strlen("disable");
1490 if (!strncmp(p, "disable", len)) {
1491 p += len;
1492 if (*p == '=')
1493 ++p;
1494 if (*p == '\0')
1495 break;
1496 bridge = simple_strtol(p, &endp, 0);
1497 if (p == endp)
1498 break;
1499
1500 if (bridge < MAX_PHB_BUS_NUM) {
1501 printk(KERN_INFO "Calgary: disabling "
1502 "translation for PHB %#x\n", bridge);
1503 bus_info[bridge].translation_disabled = 1;
1504 }
1505 }
1506
1507 p = strpbrk(p, ",");
1508 if (!p)
1509 break;
1510
1511 p++; /* skip ',' */
1512 }
1513 return 1;
1514}
1515__setup("calgary=", calgary_parse_options);
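/*
 * Example of a boot parameter accepted by the parser above:
 *	calgary=1M,translate_empty_slots,disable=3
 * which selects a 1 MB TCE table, enables translation for empty
 * slots and disables translation on PHB 3.
 */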
1516
1517static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
1518{
1519 struct iommu_table *tbl;
1520 unsigned int npages;
1521 int i;
1522
1523 tbl = pci_iommu(dev->bus);
1524
1525 for (i = 0; i < 4; i++) {
1526 struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i];
1527
1528 /* Don't give out TCEs that map MEM resources */
1529 if (!(r->flags & IORESOURCE_MEM))
1530 continue;
1531
1532 /* 0-based? we reserve the whole 1st MB anyway */
1533 if (!r->start)
1534 continue;
1535
1536 /* cover the whole region */
1537 npages = (r->end - r->start) >> PAGE_SHIFT;
1538 npages++;
1539
1540 iommu_range_reserve(tbl, r->start, npages);
1541 }
1542}
1543
1544static int __init calgary_fixup_tce_spaces(void)
1545{
1546 struct pci_dev *dev = NULL;
1547 void *tce_space;
1548
1549 if (no_iommu || swiotlb || !calgary_detected)
1550 return -ENODEV;
1551
1552 printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
1553
1554 do {
1555 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1556 if (!dev)
1557 break;
1558 if (!is_cal_pci_dev(dev->device))
1559 continue;
1560 if (!translate_phb(dev))
1561 continue;
1562
1563 tce_space = bus_info[dev->bus->number].tce_space;
1564 if (!tce_space)
1565 continue;
1566
1567 calgary_fixup_one_tce_space(dev);
1568
1569 } while (1);
1570
1571 return 0;
1572}
1573
1574/*
1575 * We need to be called after pcibios_assign_resources (fs_initcall level)
1576 * and before device_initcall.
1577 */
1578rootfs_initcall(calgary_fixup_tce_spaces);
diff --git a/arch/x86/kernel/pci-dma_32.c b/arch/x86/kernel/pci-dma_32.c
new file mode 100644
index 000000000000..048f09b62553
--- /dev/null
+++ b/arch/x86/kernel/pci-dma_32.c
@@ -0,0 +1,177 @@
1/*
2 * Dynamic DMA mapping support.
3 *
4 * On i386 there is no hardware dynamic DMA address translation,
5 * so consistent alloc/free are merely page allocation/freeing.
6 * The rest of the dynamic DMA mapping interface is implemented
7 * in asm/pci.h.
8 */
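/*
 * A minimal usage sketch of the interface implemented below (the
 * device pointer, size and flags are illustrative only):
 *
 *	void *cpu_addr;
 *	dma_addr_t bus_addr;
 *
 *	cpu_addr = dma_alloc_coherent(&pdev->dev, 4096, &bus_addr, GFP_KERNEL);
 *	...
 *	dma_free_coherent(&pdev->dev, 4096, cpu_addr, bus_addr);
 */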
9
10#include <linux/types.h>
11#include <linux/mm.h>
12#include <linux/string.h>
13#include <linux/pci.h>
14#include <linux/module.h>
15#include <linux/pci.h>
16#include <asm/io.h>
17
18struct dma_coherent_mem {
19 void *virt_base;
20 u32 device_base;
21 int size;
22 int flags;
23 unsigned long *bitmap;
24};
25
26void *dma_alloc_coherent(struct device *dev, size_t size,
27 dma_addr_t *dma_handle, gfp_t gfp)
28{
29 void *ret;
30 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
31 int order = get_order(size);
32 /* ignore region specifiers */
33 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
34
35 if (mem) {
36 int page = bitmap_find_free_region(mem->bitmap, mem->size,
37 order);
38 if (page >= 0) {
39 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
40 ret = mem->virt_base + (page << PAGE_SHIFT);
41 memset(ret, 0, size);
42 return ret;
43 }
44 if (mem->flags & DMA_MEMORY_EXCLUSIVE)
45 return NULL;
46 }
47
48 if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
49 gfp |= GFP_DMA;
50
51 ret = (void *)__get_free_pages(gfp, order);
52
53 if (ret != NULL) {
54 memset(ret, 0, size);
55 *dma_handle = virt_to_phys(ret);
56 }
57 return ret;
58}
59EXPORT_SYMBOL(dma_alloc_coherent);
60
61void dma_free_coherent(struct device *dev, size_t size,
62 void *vaddr, dma_addr_t dma_handle)
63{
64 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
65 int order = get_order(size);
66
67 if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
68 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
69
70 bitmap_release_region(mem->bitmap, page, order);
71 } else
72 free_pages((unsigned long)vaddr, order);
73}
74EXPORT_SYMBOL(dma_free_coherent);
75
76int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
77 dma_addr_t device_addr, size_t size, int flags)
78{
79 void __iomem *mem_base = NULL;
80 int pages = size >> PAGE_SHIFT;
81 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
82
83 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
84 goto out;
85 if (!size)
86 goto out;
87 if (dev->dma_mem)
88 goto out;
89
90 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
91
92 mem_base = ioremap(bus_addr, size);
93 if (!mem_base)
94 goto out;
95
96 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
97 if (!dev->dma_mem)
98 goto out;
99 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
100 if (!dev->dma_mem->bitmap)
101 goto free1_out;
102
103 dev->dma_mem->virt_base = mem_base;
104 dev->dma_mem->device_base = device_addr;
105 dev->dma_mem->size = pages;
106 dev->dma_mem->flags = flags;
107
108 if (flags & DMA_MEMORY_MAP)
109 return DMA_MEMORY_MAP;
110
111 return DMA_MEMORY_IO;
112
113 free1_out:
114 kfree(dev->dma_mem);
115 out:
116 if (mem_base)
117 iounmap(mem_base);
118 return 0;
119}
120EXPORT_SYMBOL(dma_declare_coherent_memory);
121
122void dma_release_declared_memory(struct device *dev)
123{
124 struct dma_coherent_mem *mem = dev->dma_mem;
125
126 if(!mem)
127 return;
128 dev->dma_mem = NULL;
129 iounmap(mem->virt_base);
130 kfree(mem->bitmap);
131 kfree(mem);
132}
133EXPORT_SYMBOL(dma_release_declared_memory);
134
135void *dma_mark_declared_memory_occupied(struct device *dev,
136 dma_addr_t device_addr, size_t size)
137{
138 struct dma_coherent_mem *mem = dev->dma_mem;
139 int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
140 int pos, err;
141
142 if (!mem)
143 return ERR_PTR(-EINVAL);
144
145 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
146 err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
147 if (err != 0)
148 return ERR_PTR(err);
149 return mem->virt_base + (pos << PAGE_SHIFT);
150}
151EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
152
153#ifdef CONFIG_PCI
154/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
155
156int forbid_dac;
157EXPORT_SYMBOL(forbid_dac);
158
159static __devinit void via_no_dac(struct pci_dev *dev)
160{
161 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
162 printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
163 forbid_dac = 1;
164 }
165}
166DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
167
168static int check_iommu(char *s)
169{
170 if (!strcmp(s, "usedac")) {
171 forbid_dac = -1;
172 return 1;
173 }
174 return 0;
175}
176__setup("iommu=", check_iommu);
177#endif
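The interface implemented above is what drivers call for coherent (consistent) buffers. A minimal driver-side sketch, assuming a hypothetical PCI device and a one-page descriptor ring (not code from this patch):

/* Hypothetical use of dma_alloc_coherent()/dma_free_coherent() from a driver. */
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/pci.h>

static int example_setup_ring(struct pci_dev *pdev)
{
	dma_addr_t ring_dma;
	void *ring;

	/* One page the CPU and the device can share without explicit syncing. */
	ring = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &ring_dma, GFP_KERNEL);
	if (!ring)
		return -ENOMEM;

	/* ... program ring_dma into the device and use 'ring' from the CPU ... */

	dma_free_coherent(&pdev->dev, PAGE_SIZE, ring, ring_dma);
	return 0;
}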
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
new file mode 100644
index 000000000000..29711445c818
--- /dev/null
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -0,0 +1,346 @@
1/*
2 * Dynamic DMA mapping support.
3 */
4
5#include <linux/types.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/pci.h>
9#include <linux/module.h>
10#include <asm/io.h>
11#include <asm/iommu.h>
12#include <asm/calgary.h>
13
14int iommu_merge __read_mostly = 0;
15EXPORT_SYMBOL(iommu_merge);
16
17dma_addr_t bad_dma_address __read_mostly;
18EXPORT_SYMBOL(bad_dma_address);
19
20/* This tells the BIO block layer to assume merging. Default to off
21 because we cannot guarantee merging later. */
22int iommu_bio_merge __read_mostly = 0;
23EXPORT_SYMBOL(iommu_bio_merge);
24
25static int iommu_sac_force __read_mostly = 0;
26
27int no_iommu __read_mostly;
28#ifdef CONFIG_IOMMU_DEBUG
29int panic_on_overflow __read_mostly = 1;
30int force_iommu __read_mostly = 1;
31#else
32int panic_on_overflow __read_mostly = 0;
33int force_iommu __read_mostly= 0;
34#endif
35
36/* Set this to 1 if there is a HW IOMMU in the system */
37int iommu_detected __read_mostly = 0;
38
 39/* Dummy device used for NULL arguments (normally ISA). A smaller DMA
 40 mask would probably be better, but this is bug-to-bug compatible
 41 with i386. */
42struct device fallback_dev = {
43 .bus_id = "fallback device",
44 .coherent_dma_mask = DMA_32BIT_MASK,
45 .dma_mask = &fallback_dev.coherent_dma_mask,
46};
47
48/* Allocate DMA memory on node near device */
49noinline static void *
50dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
51{
52 struct page *page;
53 int node;
54#ifdef CONFIG_PCI
55 if (dev->bus == &pci_bus_type)
56 node = pcibus_to_node(to_pci_dev(dev)->bus);
57 else
58#endif
59 node = numa_node_id();
60
61 if (node < first_node(node_online_map))
62 node = first_node(node_online_map);
63
64 page = alloc_pages_node(node, gfp, order);
65 return page ? page_address(page) : NULL;
66}
67
68/*
69 * Allocate memory for a coherent mapping.
70 */
71void *
72dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
73 gfp_t gfp)
74{
75 void *memory;
76 unsigned long dma_mask = 0;
77 u64 bus;
78
79 if (!dev)
80 dev = &fallback_dev;
81 dma_mask = dev->coherent_dma_mask;
82 if (dma_mask == 0)
83 dma_mask = DMA_32BIT_MASK;
84
85 /* Device not DMA able */
86 if (dev->dma_mask == NULL)
87 return NULL;
88
89 /* Don't invoke OOM killer */
90 gfp |= __GFP_NORETRY;
91
92 /* Kludge to make it bug-to-bug compatible with i386. i386
93 uses the normal dma_mask for alloc_coherent. */
94 dma_mask &= *dev->dma_mask;
95
96 /* Why <=? Even when the mask is smaller than 4GB it is often
97 larger than 16MB and in this case we have a chance of
98 finding fitting memory in the next higher zone first. If
99 not retry with true GFP_DMA. -AK */
100 if (dma_mask <= DMA_32BIT_MASK)
101 gfp |= GFP_DMA32;
102
103 again:
104 memory = dma_alloc_pages(dev, gfp, get_order(size));
105 if (memory == NULL)
106 return NULL;
107
108 {
109 int high, mmu;
110 bus = virt_to_bus(memory);
111 high = (bus + size) >= dma_mask;
112 mmu = high;
113 if (force_iommu && !(gfp & GFP_DMA))
114 mmu = 1;
115 else if (high) {
116 free_pages((unsigned long)memory,
117 get_order(size));
118
119 /* Don't use the 16MB ZONE_DMA unless absolutely
120 needed. It's better to use remapping first. */
121 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
122 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
123 goto again;
124 }
125
126 /* Let low level make its own zone decisions */
127 gfp &= ~(GFP_DMA32|GFP_DMA);
128
129 if (dma_ops->alloc_coherent)
130 return dma_ops->alloc_coherent(dev, size,
131 dma_handle, gfp);
132 return NULL;
133 }
134
135 memset(memory, 0, size);
136 if (!mmu) {
137 *dma_handle = virt_to_bus(memory);
138 return memory;
139 }
140 }
141
142 if (dma_ops->alloc_coherent) {
143 free_pages((unsigned long)memory, get_order(size));
144 gfp &= ~(GFP_DMA|GFP_DMA32);
145 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
146 }
147
148 if (dma_ops->map_simple) {
149 *dma_handle = dma_ops->map_simple(dev, memory,
150 size,
151 PCI_DMA_BIDIRECTIONAL);
152 if (*dma_handle != bad_dma_address)
153 return memory;
154 }
155
156 if (panic_on_overflow)
157 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
158 free_pages((unsigned long)memory, get_order(size));
159 return NULL;
160}
161EXPORT_SYMBOL(dma_alloc_coherent);
162
163/*
164 * Unmap coherent memory.
165 * The caller must ensure that the device has finished accessing the mapping.
166 */
167void dma_free_coherent(struct device *dev, size_t size,
168 void *vaddr, dma_addr_t bus)
169{
170 if (dma_ops->unmap_single)
171 dma_ops->unmap_single(dev, bus, size, 0);
172 free_pages((unsigned long)vaddr, get_order(size));
173}
174EXPORT_SYMBOL(dma_free_coherent);
175
176static int forbid_dac __read_mostly;
177
178int dma_supported(struct device *dev, u64 mask)
179{
180#ifdef CONFIG_PCI
181 if (mask > 0xffffffff && forbid_dac > 0) {
182
183
184
185 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id);
186 return 0;
187 }
188#endif
189
190 if (dma_ops->dma_supported)
191 return dma_ops->dma_supported(dev, mask);
192
193 /* Copied from i386. Doesn't make much sense, because it will
194 only work for pci_alloc_coherent.
195 The caller just has to use GFP_DMA in this case. */
196 if (mask < DMA_24BIT_MASK)
197 return 0;
198
199 /* Tell the device to use SAC when IOMMU force is on. This
200 allows the driver to use cheaper accesses in some cases.
201
202 Problem with this is that if we overflow the IOMMU area and
203 return DAC as fallback address the device may not handle it
204 correctly.
205
206 As a special case some controllers have a 39bit address
207 mode that is as efficient as 32bit (aic79xx). Don't force
208 SAC for these. Assume all masks <= 40 bits are of this
209 type. Normally this doesn't make any difference, but gives
210 more gentle handling of IOMMU overflow. */
211 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
212 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
213 return 0;
214 }
215
216 return 1;
217}
218EXPORT_SYMBOL(dma_supported);
219
220int dma_set_mask(struct device *dev, u64 mask)
221{
222 if (!dev->dma_mask || !dma_supported(dev, mask))
223 return -EIO;
224 *dev->dma_mask = mask;
225 return 0;
226}
227EXPORT_SYMBOL(dma_set_mask);
228
229/*
230 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
231 * documentation.
232 */
233__init int iommu_setup(char *p)
234{
235 iommu_merge = 1;
236
237 if (!p)
238 return -EINVAL;
239
240 while (*p) {
241 if (!strncmp(p,"off",3))
242 no_iommu = 1;
243 /* gart_parse_options has more force support */
244 if (!strncmp(p,"force",5))
245 force_iommu = 1;
246 if (!strncmp(p,"noforce",7)) {
247 iommu_merge = 0;
248 force_iommu = 0;
249 }
250
251 if (!strncmp(p, "biomerge",8)) {
252 iommu_bio_merge = 4096;
253 iommu_merge = 1;
254 force_iommu = 1;
255 }
256 if (!strncmp(p, "panic",5))
257 panic_on_overflow = 1;
258 if (!strncmp(p, "nopanic",7))
259 panic_on_overflow = 0;
260 if (!strncmp(p, "merge",5)) {
261 iommu_merge = 1;
262 force_iommu = 1;
263 }
264 if (!strncmp(p, "nomerge",7))
265 iommu_merge = 0;
266 if (!strncmp(p, "forcesac",8))
267 iommu_sac_force = 1;
268 if (!strncmp(p, "allowdac", 8))
269 forbid_dac = 0;
270 if (!strncmp(p, "nodac", 5))
271 forbid_dac = -1;
272
273#ifdef CONFIG_SWIOTLB
274 if (!strncmp(p, "soft",4))
275 swiotlb = 1;
276#endif
277
278#ifdef CONFIG_IOMMU
279 gart_parse_options(p);
280#endif
281
282#ifdef CONFIG_CALGARY_IOMMU
283 if (!strncmp(p, "calgary", 7))
284 use_calgary = 1;
285#endif /* CONFIG_CALGARY_IOMMU */
286
287 p += strcspn(p, ",");
288 if (*p == ',')
289 ++p;
290 }
291 return 0;
292}
293early_param("iommu", iommu_setup);
294
295void __init pci_iommu_alloc(void)
296{
297 /*
298 * The order of these functions is important for
299 * fall-back/fail-over reasons
300 */
301#ifdef CONFIG_IOMMU
302 iommu_hole_init();
303#endif
304
305#ifdef CONFIG_CALGARY_IOMMU
306 detect_calgary();
307#endif
308
309#ifdef CONFIG_SWIOTLB
310 pci_swiotlb_init();
311#endif
312}
313
314static int __init pci_iommu_init(void)
315{
316#ifdef CONFIG_CALGARY_IOMMU
317 calgary_iommu_init();
318#endif
319
320#ifdef CONFIG_IOMMU
321 gart_iommu_init();
322#endif
323
324 no_iommu_init();
325 return 0;
326}
327
328void pci_iommu_shutdown(void)
329{
330 gart_iommu_shutdown();
331}
332
333#ifdef CONFIG_PCI
334/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
335
336static __devinit void via_no_dac(struct pci_dev *dev)
337{
338 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
339 printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
340 forbid_dac = 1;
341 }
342}
343DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
344#endif
345/* Must execute after PCI subsystem */
346fs_initcall(pci_iommu_init);
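dma_supported() and dma_set_mask() above back the usual mask negotiation done in driver probe routines. A hedged sketch of that pattern using the PCI wrappers of this era (hypothetical device; try 64-bit DAC first, then fall back to 32-bit SAC):

/* Hypothetical DMA mask negotiation as seen from a PCI driver. */
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/pci.h>

static int example_set_masks(struct pci_dev *pdev)
{
	/* Prefer 64-bit (DAC) addressing when dma_supported() allows it... */
	if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK) &&
	    !pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK))
		return 0;

	/* ...otherwise fall back to 32-bit (SAC) addressing. */
	if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK) &&
	    !pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK))
		return 0;

	return -EIO;
}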
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
new file mode 100644
index 000000000000..4918c575d582
--- /dev/null
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -0,0 +1,740 @@
1/*
2 * Dynamic DMA mapping support for AMD Hammer.
3 *
4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
 5 * This allows PCI devices that only support 32bit addresses to be used on systems
6 * with more than 4GB.
7 *
8 * See Documentation/DMA-mapping.txt for the interface specification.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */
12
13#include <linux/types.h>
14#include <linux/ctype.h>
15#include <linux/agp_backend.h>
16#include <linux/init.h>
17#include <linux/mm.h>
18#include <linux/string.h>
19#include <linux/spinlock.h>
20#include <linux/pci.h>
21#include <linux/module.h>
22#include <linux/topology.h>
23#include <linux/interrupt.h>
24#include <linux/bitops.h>
25#include <linux/kdebug.h>
26#include <asm/atomic.h>
27#include <asm/io.h>
28#include <asm/mtrr.h>
29#include <asm/pgtable.h>
30#include <asm/proto.h>
31#include <asm/iommu.h>
32#include <asm/cacheflush.h>
33#include <asm/swiotlb.h>
34#include <asm/dma.h>
35#include <asm/k8.h>
36
37unsigned long iommu_bus_base; /* GART remapping area (physical) */
38static unsigned long iommu_size; /* size of remapping area bytes */
39static unsigned long iommu_pages; /* .. and in pages */
40
41u32 *iommu_gatt_base; /* Remapping table */
42
 43/* If this is disabled the IOMMU will use an optimized flushing strategy
 44 of only flushing when a mapping is reused. With it true the GART is flushed
 45 for every mapping. Problem is that doing the lazy flush seems to trigger
 46 bugs with some popular PCI cards, in particular 3ware (but it has also
 47 been seen with Qlogic at least). */
48int iommu_fullflush = 1;
49
50/* Allocation bitmap for the remapping area */
51static DEFINE_SPINLOCK(iommu_bitmap_lock);
52static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
53
54static u32 gart_unmapped_entry;
55
56#define GPTE_VALID 1
57#define GPTE_COHERENT 2
58#define GPTE_ENCODE(x) \
59 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
60#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
61
62#define to_pages(addr,size) \
63 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
64
65#define EMERGENCY_PAGES 32 /* = 128KB */
66
67#ifdef CONFIG_AGP
68#define AGPEXTERN extern
69#else
70#define AGPEXTERN
71#endif
72
73/* backdoor interface to AGP driver */
74AGPEXTERN int agp_memory_reserved;
75AGPEXTERN __u32 *agp_gatt_table;
76
77static unsigned long next_bit; /* protected by iommu_bitmap_lock */
78static int need_flush; /* global flush state. set for each gart wrap */
79
80static unsigned long alloc_iommu(int size)
81{
82 unsigned long offset, flags;
83
84 spin_lock_irqsave(&iommu_bitmap_lock, flags);
85 offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
86 if (offset == -1) {
87 need_flush = 1;
88 offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);
89 }
90 if (offset != -1) {
91 set_bit_string(iommu_gart_bitmap, offset, size);
92 next_bit = offset+size;
93 if (next_bit >= iommu_pages) {
94 next_bit = 0;
95 need_flush = 1;
96 }
97 }
98 if (iommu_fullflush)
99 need_flush = 1;
100 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
101 return offset;
102}
103
104static void free_iommu(unsigned long offset, int size)
105{
106 unsigned long flags;
107 spin_lock_irqsave(&iommu_bitmap_lock, flags);
108 __clear_bit_string(iommu_gart_bitmap, offset, size);
109 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
110}
111
112/*
113 * Use global flush state to avoid races with multiple flushers.
114 */
115static void flush_gart(void)
116{
117 unsigned long flags;
118 spin_lock_irqsave(&iommu_bitmap_lock, flags);
119 if (need_flush) {
120 k8_flush_garts();
121 need_flush = 0;
122 }
123 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
124}
125
126#ifdef CONFIG_IOMMU_LEAK
127
128#define SET_LEAK(x) if (iommu_leak_tab) \
129 iommu_leak_tab[x] = __builtin_return_address(0);
130#define CLEAR_LEAK(x) if (iommu_leak_tab) \
131 iommu_leak_tab[x] = NULL;
132
133/* Debugging aid for drivers that don't free their IOMMU tables */
134static void **iommu_leak_tab;
135static int leak_trace;
136int iommu_leak_pages = 20;
137void dump_leak(void)
138{
139 int i;
140 static int dump;
141 if (dump || !iommu_leak_tab) return;
142 dump = 1;
143 show_stack(NULL,NULL);
144 /* Very crude. dump some from the end of the table too */
145 printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages);
146 for (i = 0; i < iommu_leak_pages; i+=2) {
147 printk("%lu: ", iommu_pages-i);
148 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
149 printk("%c", (i+1)%2 == 0 ? '\n' : ' ');
150 }
151 printk("\n");
152}
153#else
154#define SET_LEAK(x)
155#define CLEAR_LEAK(x)
156#endif
157
158static void iommu_full(struct device *dev, size_t size, int dir)
159{
160 /*
161 * Ran out of IOMMU space for this operation. This is very bad.
162 * Unfortunately the drivers cannot handle this operation properly.
 163 * Return some non-mapped prereserved space in the aperture and
164 * let the Northbridge deal with it. This will result in garbage
165 * in the IO operation. When the size exceeds the prereserved space
166 * memory corruption will occur or random memory will be DMAed
167 * out. Hopefully no network devices use single mappings that big.
168 */
169
170 printk(KERN_ERR
171 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
172 size, dev->bus_id);
173
174 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
175 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
176 panic("PCI-DMA: Memory would be corrupted\n");
177 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
178 panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n");
179 }
180
181#ifdef CONFIG_IOMMU_LEAK
182 dump_leak();
183#endif
184}
185
186static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
187{
188 u64 mask = *dev->dma_mask;
189 int high = addr + size > mask;
190 int mmu = high;
191 if (force_iommu)
192 mmu = 1;
193 return mmu;
194}
195
196static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
197{
198 u64 mask = *dev->dma_mask;
199 int high = addr + size > mask;
200 int mmu = high;
201 return mmu;
202}
203
204/* Map a single continuous physical area into the IOMMU.
205 * Caller needs to check if the iommu is needed and flush.
206 */
207static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
208 size_t size, int dir)
209{
210 unsigned long npages = to_pages(phys_mem, size);
211 unsigned long iommu_page = alloc_iommu(npages);
212 int i;
213 if (iommu_page == -1) {
214 if (!nonforced_iommu(dev, phys_mem, size))
215 return phys_mem;
216 if (panic_on_overflow)
217 panic("dma_map_area overflow %lu bytes\n", size);
218 iommu_full(dev, size, dir);
219 return bad_dma_address;
220 }
221
222 for (i = 0; i < npages; i++) {
223 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
224 SET_LEAK(iommu_page + i);
225 phys_mem += PAGE_SIZE;
226 }
227 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
228}
229
230static dma_addr_t gart_map_simple(struct device *dev, char *buf,
231 size_t size, int dir)
232{
233 dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
234 flush_gart();
235 return map;
236}
237
238/* Map a single area into the IOMMU */
239static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
240{
241 unsigned long phys_mem, bus;
242
243 if (!dev)
244 dev = &fallback_dev;
245
246 phys_mem = virt_to_phys(addr);
247 if (!need_iommu(dev, phys_mem, size))
248 return phys_mem;
249
250 bus = gart_map_simple(dev, addr, size, dir);
251 return bus;
252}
253
254/*
255 * Free a DMA mapping.
256 */
257static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
258 size_t size, int direction)
259{
260 unsigned long iommu_page;
261 int npages;
262 int i;
263
264 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
265 dma_addr >= iommu_bus_base + iommu_size)
266 return;
267 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
268 npages = to_pages(dma_addr, size);
269 for (i = 0; i < npages; i++) {
270 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
271 CLEAR_LEAK(iommu_page + i);
272 }
273 free_iommu(iommu_page, npages);
274}
275
276/*
277 * Wrapper for pci_unmap_single working with scatterlists.
278 */
279static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
280{
281 int i;
282
283 for (i = 0; i < nents; i++) {
284 struct scatterlist *s = &sg[i];
285 if (!s->dma_length || !s->length)
286 break;
287 gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
288 }
289}
290
291/* Fallback for dma_map_sg in case of overflow */
292static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
293 int nents, int dir)
294{
295 int i;
296
297#ifdef CONFIG_IOMMU_DEBUG
298 printk(KERN_DEBUG "dma_map_sg overflow\n");
299#endif
300
301 for (i = 0; i < nents; i++ ) {
302 struct scatterlist *s = &sg[i];
303 unsigned long addr = page_to_phys(s->page) + s->offset;
304 if (nonforced_iommu(dev, addr, s->length)) {
305 addr = dma_map_area(dev, addr, s->length, dir);
306 if (addr == bad_dma_address) {
307 if (i > 0)
308 gart_unmap_sg(dev, sg, i, dir);
309 nents = 0;
310 sg[0].dma_length = 0;
311 break;
312 }
313 }
314 s->dma_address = addr;
315 s->dma_length = s->length;
316 }
317 flush_gart();
318 return nents;
319}
320
 321/* Map multiple scatterlist entries continuously into the first. */
322static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
323 struct scatterlist *sout, unsigned long pages)
324{
325 unsigned long iommu_start = alloc_iommu(pages);
326 unsigned long iommu_page = iommu_start;
327 int i;
328
329 if (iommu_start == -1)
330 return -1;
331
332 for (i = start; i < stopat; i++) {
333 struct scatterlist *s = &sg[i];
334 unsigned long pages, addr;
335 unsigned long phys_addr = s->dma_address;
336
337 BUG_ON(i > start && s->offset);
338 if (i == start) {
339 *sout = *s;
340 sout->dma_address = iommu_bus_base;
341 sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
342 sout->dma_length = s->length;
343 } else {
344 sout->dma_length += s->length;
345 }
346
347 addr = phys_addr;
348 pages = to_pages(s->offset, s->length);
349 while (pages--) {
350 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
351 SET_LEAK(iommu_page);
352 addr += PAGE_SIZE;
353 iommu_page++;
354 }
355 }
356 BUG_ON(iommu_page - iommu_start != pages);
357 return 0;
358}
359
360static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
361 struct scatterlist *sout,
362 unsigned long pages, int need)
363{
364 if (!need) {
365 BUG_ON(stopat - start != 1);
366 *sout = sg[start];
367 sout->dma_length = sg[start].length;
368 return 0;
369 }
370 return __dma_map_cont(sg, start, stopat, sout, pages);
371}
372
373/*
374 * DMA map all entries in a scatterlist.
375 * Merge chunks that have page aligned sizes into a continuous mapping.
376 */
377int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
378{
379 int i;
380 int out;
381 int start;
382 unsigned long pages = 0;
383 int need = 0, nextneed;
384
385 if (nents == 0)
386 return 0;
387
388 if (!dev)
389 dev = &fallback_dev;
390
391 out = 0;
392 start = 0;
393 for (i = 0; i < nents; i++) {
394 struct scatterlist *s = &sg[i];
395 dma_addr_t addr = page_to_phys(s->page) + s->offset;
396 s->dma_address = addr;
397 BUG_ON(s->length == 0);
398
399 nextneed = need_iommu(dev, addr, s->length);
400
401 /* Handle the previous not yet processed entries */
402 if (i > start) {
403 struct scatterlist *ps = &sg[i-1];
404 /* Can only merge when the last chunk ends on a page
405 boundary and the new one doesn't have an offset. */
406 if (!iommu_merge || !nextneed || !need || s->offset ||
407 (ps->offset + ps->length) % PAGE_SIZE) {
408 if (dma_map_cont(sg, start, i, sg+out, pages,
409 need) < 0)
410 goto error;
411 out++;
412 pages = 0;
413 start = i;
414 }
415 }
416
417 need = nextneed;
418 pages += to_pages(s->offset, s->length);
419 }
420 if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
421 goto error;
422 out++;
423 flush_gart();
424 if (out < nents)
425 sg[out].dma_length = 0;
426 return out;
427
428error:
429 flush_gart();
430 gart_unmap_sg(dev, sg, nents, dir);
431 /* When it was forced or merged try again in a dumb way */
432 if (force_iommu || iommu_merge) {
433 out = dma_map_sg_nonforce(dev, sg, nents, dir);
434 if (out > 0)
435 return out;
436 }
437 if (panic_on_overflow)
438 panic("dma_map_sg: overflow on %lu pages\n", pages);
439 iommu_full(dev, pages << PAGE_SHIFT, dir);
440 for (i = 0; i < nents; i++)
441 sg[i].dma_address = bad_dma_address;
442 return 0;
443}
444
445static int no_agp;
446
447static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
448{
449 unsigned long a;
450 if (!iommu_size) {
451 iommu_size = aper_size;
452 if (!no_agp)
453 iommu_size /= 2;
454 }
455
456 a = aper + iommu_size;
457 iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
458
459 if (iommu_size < 64*1024*1024)
460 printk(KERN_WARNING
461 "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20);
462
463 return iommu_size;
464}
465
466static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
467{
468 unsigned aper_size = 0, aper_base_32;
469 u64 aper_base;
470 unsigned aper_order;
471
472 pci_read_config_dword(dev, 0x94, &aper_base_32);
473 pci_read_config_dword(dev, 0x90, &aper_order);
474 aper_order = (aper_order >> 1) & 7;
475
476 aper_base = aper_base_32 & 0x7fff;
477 aper_base <<= 25;
478
479 aper_size = (32 * 1024 * 1024) << aper_order;
480 if (aper_base + aper_size > 0x100000000UL || !aper_size)
481 aper_base = 0;
482
483 *size = aper_size;
484 return aper_base;
485}
486
487/*
488 * Private Northbridge GATT initialization in case we cannot use the
489 * AGP driver for some reason.
490 */
491static __init int init_k8_gatt(struct agp_kern_info *info)
492{
493 struct pci_dev *dev;
494 void *gatt;
495 unsigned aper_base, new_aper_base;
496 unsigned aper_size, gatt_size, new_aper_size;
497 int i;
498
499 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
500 aper_size = aper_base = info->aper_size = 0;
501 dev = NULL;
502 for (i = 0; i < num_k8_northbridges; i++) {
503 dev = k8_northbridges[i];
504 new_aper_base = read_aperture(dev, &new_aper_size);
505 if (!new_aper_base)
506 goto nommu;
507
508 if (!aper_base) {
509 aper_size = new_aper_size;
510 aper_base = new_aper_base;
511 }
512 if (aper_size != new_aper_size || aper_base != new_aper_base)
513 goto nommu;
514 }
515 if (!aper_base)
516 goto nommu;
517 info->aper_base = aper_base;
518 info->aper_size = aper_size>>20;
519
520 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
521 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
522 if (!gatt)
523 panic("Cannot allocate GATT table");
524 if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE))
525 panic("Could not set GART PTEs to uncacheable pages");
526 global_flush_tlb();
527
528 memset(gatt, 0, gatt_size);
529 agp_gatt_table = gatt;
530
531 for (i = 0; i < num_k8_northbridges; i++) {
532 u32 ctl;
533 u32 gatt_reg;
534
535 dev = k8_northbridges[i];
536 gatt_reg = __pa(gatt) >> 12;
537 gatt_reg <<= 4;
538 pci_write_config_dword(dev, 0x98, gatt_reg);
539 pci_read_config_dword(dev, 0x90, &ctl);
540
541 ctl |= 1;
542 ctl &= ~((1<<4) | (1<<5));
543
544 pci_write_config_dword(dev, 0x90, ctl);
545 }
546 flush_gart();
547
548 printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10);
549 return 0;
550
551 nommu:
552 /* Should not happen anymore */
553 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
554 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
555 return -1;
556}
557
558extern int agp_amd64_init(void);
559
560static const struct dma_mapping_ops gart_dma_ops = {
561 .mapping_error = NULL,
562 .map_single = gart_map_single,
563 .map_simple = gart_map_simple,
564 .unmap_single = gart_unmap_single,
565 .sync_single_for_cpu = NULL,
566 .sync_single_for_device = NULL,
567 .sync_single_range_for_cpu = NULL,
568 .sync_single_range_for_device = NULL,
569 .sync_sg_for_cpu = NULL,
570 .sync_sg_for_device = NULL,
571 .map_sg = gart_map_sg,
572 .unmap_sg = gart_unmap_sg,
573};
574
575void gart_iommu_shutdown(void)
576{
577 struct pci_dev *dev;
578 int i;
579
580 if (no_agp && (dma_ops != &gart_dma_ops))
581 return;
582
583 for (i = 0; i < num_k8_northbridges; i++) {
584 u32 ctl;
585
586 dev = k8_northbridges[i];
587 pci_read_config_dword(dev, 0x90, &ctl);
588
589 ctl &= ~1;
590
591 pci_write_config_dword(dev, 0x90, ctl);
592 }
593}
594
595void __init gart_iommu_init(void)
596{
597 struct agp_kern_info info;
598 unsigned long aper_size;
599 unsigned long iommu_start;
600 unsigned long scratch;
601 long i;
602
603 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
604 printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
605 return;
606 }
607
608#ifndef CONFIG_AGP_AMD64
609 no_agp = 1;
610#else
611 /* Makefile puts PCI initialization via subsys_initcall first. */
612 /* Add other K8 AGP bridge drivers here */
613 no_agp = no_agp ||
614 (agp_amd64_init() < 0) ||
615 (agp_copy_info(agp_bridge, &info) < 0);
616#endif
617
618 if (swiotlb)
619 return;
620
621 /* Did we detect a different HW IOMMU? */
622 if (iommu_detected && !iommu_aperture)
623 return;
624
625 if (no_iommu ||
626 (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
627 !iommu_aperture ||
628 (no_agp && init_k8_gatt(&info) < 0)) {
629 if (end_pfn > MAX_DMA32_PFN) {
630 printk(KERN_ERR "WARNING more than 4GB of memory "
631 "but GART IOMMU not available.\n"
632 KERN_ERR "WARNING 32bit PCI may malfunction.\n");
633 }
634 return;
635 }
636
637 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
638 aper_size = info.aper_size * 1024 * 1024;
639 iommu_size = check_iommu_size(info.aper_base, aper_size);
640 iommu_pages = iommu_size >> PAGE_SHIFT;
641
642 iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL,
643 get_order(iommu_pages/8));
644 if (!iommu_gart_bitmap)
645 panic("Cannot allocate iommu bitmap\n");
646 memset(iommu_gart_bitmap, 0, iommu_pages/8);
647
648#ifdef CONFIG_IOMMU_LEAK
649 if (leak_trace) {
650 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
651 get_order(iommu_pages*sizeof(void *)));
652 if (iommu_leak_tab)
653 memset(iommu_leak_tab, 0, iommu_pages * 8);
654 else
655 printk("PCI-DMA: Cannot allocate leak trace area\n");
656 }
657#endif
658
659 /*
660 * Out of IOMMU space handling.
661 * Reserve some invalid pages at the beginning of the GART.
662 */
663 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
664
665 agp_memory_reserved = iommu_size;
666 printk(KERN_INFO
667 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
668 iommu_size>>20);
669
670 iommu_start = aper_size - iommu_size;
671 iommu_bus_base = info.aper_base + iommu_start;
672 bad_dma_address = iommu_bus_base;
673 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
674
675 /*
676 * Unmap the IOMMU part of the GART. The alias of the page is
677 * always mapped with cache enabled and there is no full cache
678 * coherency across the GART remapping. The unmapping avoids
679 * automatic prefetches from the CPU allocating cache lines in
680 * there. All CPU accesses are done via the direct mapping to
681 * the backing memory. The GART address is only used by PCI
682 * devices.
683 */
684 clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
685
686 /*
 687 * Try to work around a bug (thanks to BenH):
 688 * set unmapped entries to a scratch page instead of 0.
 689 * Any prefetches that hit unmapped entries won't get a bus abort
 690 * then.
691 */
692 scratch = get_zeroed_page(GFP_KERNEL);
693 if (!scratch)
694 panic("Cannot allocate iommu scratch page");
695 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
696 for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
697 iommu_gatt_base[i] = gart_unmapped_entry;
698
699 flush_gart();
700 dma_ops = &gart_dma_ops;
701}
702
703void __init gart_parse_options(char *p)
704{
705 int arg;
706
707#ifdef CONFIG_IOMMU_LEAK
708 if (!strncmp(p,"leak",4)) {
709 leak_trace = 1;
710 p += 4;
711 if (*p == '=') ++p;
712 if (isdigit(*p) && get_option(&p, &arg))
713 iommu_leak_pages = arg;
714 }
715#endif
716 if (isdigit(*p) && get_option(&p, &arg))
717 iommu_size = arg;
718 if (!strncmp(p, "fullflush",8))
719 iommu_fullflush = 1;
720 if (!strncmp(p, "nofullflush",11))
721 iommu_fullflush = 0;
722 if (!strncmp(p,"noagp",5))
723 no_agp = 1;
724 if (!strncmp(p, "noaperture",10))
725 fix_aperture = 0;
726 /* duplicated from pci-dma.c */
727 if (!strncmp(p,"force",5))
728 iommu_aperture_allowed = 1;
729 if (!strncmp(p,"allowed",7))
730 iommu_aperture_allowed = 1;
731 if (!strncmp(p, "memaper", 7)) {
732 fallback_aper_force = 1;
733 p += 7;
734 if (*p == '=') {
735 ++p;
736 if (get_option(&p, &arg))
737 fallback_aper_order = arg;
738 }
739 }
740}
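The GPTE_ENCODE()/GPTE_DECODE() macros near the top of this file pack a 40-bit physical address into a 32-bit GART PTE: bits 31:12 carry phys[31:12], bits 11:4 carry phys[39:32], and bits 1:0 are the coherent and valid flags. A stand-alone worked example with a made-up page-aligned address:

/* Round-trip check of the GART PTE encoding (user-space sketch, not kernel code). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GPTE_VALID	1
#define GPTE_COHERENT	2
#define GPTE_ENCODE(x) \
	(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((uint64_t)(x) & 0xff0) << 28))

int main(void)
{
	uint64_t phys = 0x1234567000ULL;	/* 40-bit, page aligned */
	uint32_t pte = GPTE_ENCODE(phys);	/* 0x34567123 */

	printf("phys %#llx -> pte %#x -> phys %#llx\n",
	       (unsigned long long)phys, pte,
	       (unsigned long long)GPTE_DECODE(pte));
	assert(GPTE_DECODE(pte) == phys);
	return 0;
}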
diff --git a/arch/x86/kernel/pci-nommu_64.c b/arch/x86/kernel/pci-nommu_64.c
new file mode 100644
index 000000000000..2a34c6c025a9
--- /dev/null
+++ b/arch/x86/kernel/pci-nommu_64.c
@@ -0,0 +1,97 @@
1/* Fallback functions when the main IOMMU code is not compiled in. This
2 code is roughly equivalent to i386. */
3#include <linux/mm.h>
4#include <linux/init.h>
5#include <linux/pci.h>
6#include <linux/string.h>
7#include <linux/dma-mapping.h>
8
9#include <asm/iommu.h>
10#include <asm/processor.h>
11#include <asm/dma.h>
12
13static int
14check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
15{
16 if (hwdev && bus + size > *hwdev->dma_mask) {
17 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
18 printk(KERN_ERR
19 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
20 name, (long long)bus, size,
21 (long long)*hwdev->dma_mask);
22 return 0;
23 }
24 return 1;
25}
26
27static dma_addr_t
28nommu_map_single(struct device *hwdev, void *ptr, size_t size,
29 int direction)
30{
31 dma_addr_t bus = virt_to_bus(ptr);
32 if (!check_addr("map_single", hwdev, bus, size))
33 return bad_dma_address;
34 return bus;
35}
36
37static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
38 int direction)
39{
40}
41
42/* Map a set of buffers described by scatterlist in streaming
43 * mode for DMA. This is the scatter-gather version of the
44 * above pci_map_single interface. Here the scatter gather list
45 * elements are each tagged with the appropriate dma address
46 * and length. They are obtained via sg_dma_{address,length}(SG).
47 *
48 * NOTE: An implementation may be able to use a smaller number of
49 * DMA address/length pairs than there are SG table elements.
50 * (for example via virtual mapping capabilities)
51 * The routine returns the number of addr/length pairs actually
52 * used, at most nents.
53 *
54 * Device ownership issues as mentioned above for pci_map_single are
55 * the same here.
56 */
57static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
58 int nents, int direction)
59{
60 int i;
61
62 for (i = 0; i < nents; i++ ) {
63 struct scatterlist *s = &sg[i];
64 BUG_ON(!s->page);
65 s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
66 if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
67 return 0;
68 s->dma_length = s->length;
69 }
70 return nents;
71}
72
73/* Unmap a set of streaming mode DMA translations.
74 * Again, cpu read rules concerning calls here are the same as for
75 * pci_unmap_single() above.
76 */
77static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
78 int nents, int dir)
79{
80}
81
82const struct dma_mapping_ops nommu_dma_ops = {
83 .map_single = nommu_map_single,
84 .unmap_single = nommu_unmap_single,
85 .map_sg = nommu_map_sg,
86 .unmap_sg = nommu_unmap_sg,
87 .is_phys = 1,
88};
89
90void __init no_iommu_init(void)
91{
92 if (dma_ops)
93 return;
94
95 force_iommu = 0; /* no HW IOMMU */
96 dma_ops = &nommu_dma_ops;
97}
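With no IOMMU the bus address handed to the device is simply the physical address, so check_addr() above has to reject any buffer that ends beyond the device's DMA mask. A stand-alone sketch of that comparison with made-up values:

/* Illustration of the overflow check in check_addr() (user-space sketch). */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static int fits_mask(uint64_t bus, size_t size, uint64_t mask)
{
	return bus + size <= mask;	/* same test as check_addr(), inverted */
}

int main(void)
{
	uint64_t mask = 0xffffffffULL;	/* a 32-bit-only device */

	printf("%d\n", fits_mask(0x00fff000ULL, 4096, mask));	/* 1: reachable  */
	printf("%d\n", fits_mask(0x100000000ULL, 4096, mask));	/* 0: above 4GB  */
	return 0;
}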
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
new file mode 100644
index 000000000000..b2f405ea7c85
--- /dev/null
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -0,0 +1,44 @@
1/* Glue code to lib/swiotlb.c */
2
3#include <linux/pci.h>
4#include <linux/cache.h>
5#include <linux/module.h>
6#include <linux/dma-mapping.h>
7
8#include <asm/iommu.h>
9#include <asm/swiotlb.h>
10#include <asm/dma.h>
11
12int swiotlb __read_mostly;
13EXPORT_SYMBOL(swiotlb);
14
15const struct dma_mapping_ops swiotlb_dma_ops = {
16 .mapping_error = swiotlb_dma_mapping_error,
17 .alloc_coherent = swiotlb_alloc_coherent,
18 .free_coherent = swiotlb_free_coherent,
19 .map_single = swiotlb_map_single,
20 .unmap_single = swiotlb_unmap_single,
21 .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
22 .sync_single_for_device = swiotlb_sync_single_for_device,
23 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
24 .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
25 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
26 .sync_sg_for_device = swiotlb_sync_sg_for_device,
27 .map_sg = swiotlb_map_sg,
28 .unmap_sg = swiotlb_unmap_sg,
29 .dma_supported = NULL,
30};
31
32void __init pci_swiotlb_init(void)
33{
34 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
35 if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
36 swiotlb = 1;
37 if (swiotlb_force)
38 swiotlb = 1;
39 if (swiotlb) {
40 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
41 swiotlb_init();
42 dma_ops = &swiotlb_dma_ops;
43 }
44}
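pci_swiotlb_init() above only turns on bounce buffering when some memory lies beyond what a 32-bit device can address, i.e. when the highest page frame sits above the 4GB mark (or when swiotlb is forced). A stand-alone sketch of that threshold; the MAX_DMA32_PFN constant is reconstructed here purely for illustration and the memory size is made up:

/* Illustration of the end_pfn > MAX_DMA32_PFN test (user-space sketch). */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define MAX_DMA32_PFN	(1ULL << (32 - PAGE_SHIFT))	/* first PFN above 4GB */

int main(void)
{
	uint64_t end_pfn = (6ULL << 30) >> PAGE_SHIFT;	/* e.g. 6GB of RAM */

	printf("swiotlb %s\n",
	       end_pfn > MAX_DMA32_PFN ? "enabled" : "not needed");
	return 0;
}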
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
new file mode 100644
index 000000000000..bc1f2d3ea277
--- /dev/null
+++ b/arch/x86/kernel/pcspeaker.c
@@ -0,0 +1,20 @@
1#include <linux/platform_device.h>
2#include <linux/errno.h>
3#include <linux/init.h>
4
5static __init int add_pcspkr(void)
6{
7 struct platform_device *pd;
8 int ret;
9
10 pd = platform_device_alloc("pcspkr", -1);
11 if (!pd)
12 return -ENOMEM;
13
14 ret = platform_device_add(pd);
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19}
20device_initcall(add_pcspkr);
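For a platform device with no resources, the alloc/add/put sequence above can also be expressed with the one-call helper; a hedged alternative sketch (the helper takes a name, an id, a resource array, and a resource count):

/* Hypothetical alternative using platform_device_register_simple(). */
#include <linux/err.h>
#include <linux/init.h>
#include <linux/platform_device.h>

static __init int add_pcspkr_simple(void)
{
	struct platform_device *pd;

	pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
	return IS_ERR(pd) ? PTR_ERR(pd) : 0;
}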
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
new file mode 100644
index 000000000000..ae8f91214f15
--- /dev/null
+++ b/arch/x86/kernel/pmtimer_64.c
@@ -0,0 +1,69 @@
1/* Ported over from i386 by AK, original copyright was:
2 *
3 * (C) Dominik Brodowski <linux@brodo.de> 2003
4 *
5 * Driver to use the Power Management Timer (PMTMR) available in some
6 * southbridges as primary timing source for the Linux kernel.
7 *
8 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
9 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
10 *
11 * This file is licensed under the GPL v2.
12 *
13 * Dropped all the hardware bug workarounds for now. Hopefully they
14 * are not needed on 64bit chipsets.
15 */
16
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/time.h>
20#include <linux/init.h>
21#include <linux/cpumask.h>
22#include <asm/io.h>
23#include <asm/proto.h>
24#include <asm/msr.h>
25#include <asm/vsyscall.h>
26
27#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
28
29static inline u32 cyc2us(u32 cycles)
30{
31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
32 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
33 *
34 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
35 * easily be multiplied with 286 (=0x11E) without having to fear
36 * u32 overflows.
37 */
38 cycles *= 286;
39 return (cycles >> 10);
40}
41
42static unsigned pmtimer_wait_tick(void)
43{
44 u32 a, b;
45 for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
46 a == b;
47 b = inl(pmtmr_ioport) & ACPI_PM_MASK)
48 cpu_relax();
49 return b;
50}
51
52/* note: wait time is rounded up to one tick */
53void pmtimer_wait(unsigned us)
54{
55 u32 a, b;
56 a = pmtimer_wait_tick();
57 do {
58 b = inl(pmtmr_ioport);
59 cpu_relax();
60 } while (cyc2us(b - a) < us);
61}
62
63static int __init nopmtimer_setup(char *s)
64{
65 pmtmr_ioport = 0;
66 return 1;
67}
68
69__setup("nopmtimer", nopmtimer_setup);
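cyc2us() above avoids a division by the 3.579545 MHz PM timer frequency by using the 286/1024 approximation mentioned in its comment. A stand-alone check of that shortcut against the exact conversion, using the roughly-one-tick delta from the comment:

/* Compare the 286/1024 shortcut with an exact conversion (user-space sketch). */
#include <stdint.h>
#include <stdio.h>

static uint32_t cyc2us(uint32_t cycles)
{
	cycles *= 286;		/* 1/3.579545 =~ 286/1024, error about 0.024% */
	return cycles >> 10;
}

int main(void)
{
	uint32_t delta = 35796;	/* about one tick at HZ=100 */

	printf("approx %u us, exact %.1f us\n", cyc2us(delta), delta / 3.579545);
	return 0;
}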
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
new file mode 100644
index 000000000000..84664710b784
--- /dev/null
+++ b/arch/x86/kernel/process_32.c
@@ -0,0 +1,951 @@
1/*
2 * linux/arch/i386/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */
9
10/*
11 * This file handles the architecture-dependent parts of process handling..
12 */
13
14#include <stdarg.h>
15
16#include <linux/cpu.h>
17#include <linux/errno.h>
18#include <linux/sched.h>
19#include <linux/fs.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/elfcore.h>
23#include <linux/smp.h>
24#include <linux/stddef.h>
25#include <linux/slab.h>
26#include <linux/vmalloc.h>
27#include <linux/user.h>
28#include <linux/a.out.h>
29#include <linux/interrupt.h>
30#include <linux/utsname.h>
31#include <linux/delay.h>
32#include <linux/reboot.h>
33#include <linux/init.h>
34#include <linux/mc146818rtc.h>
35#include <linux/module.h>
36#include <linux/kallsyms.h>
37#include <linux/ptrace.h>
38#include <linux/random.h>
39#include <linux/personality.h>
40#include <linux/tick.h>
41#include <linux/percpu.h>
42
43#include <asm/uaccess.h>
44#include <asm/pgtable.h>
45#include <asm/system.h>
46#include <asm/io.h>
47#include <asm/ldt.h>
48#include <asm/processor.h>
49#include <asm/i387.h>
50#include <asm/desc.h>
51#include <asm/vm86.h>
52#ifdef CONFIG_MATH_EMULATION
53#include <asm/math_emu.h>
54#endif
55
56#include <linux/err.h>
57
58#include <asm/tlbflush.h>
59#include <asm/cpu.h>
60
61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
62
63static int hlt_counter;
64
65unsigned long boot_option_idle_override = 0;
66EXPORT_SYMBOL(boot_option_idle_override);
67
68DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
69EXPORT_PER_CPU_SYMBOL(current_task);
70
71DEFINE_PER_CPU(int, cpu_number);
72EXPORT_PER_CPU_SYMBOL(cpu_number);
73
74/*
75 * Return saved PC of a blocked thread.
76 */
77unsigned long thread_saved_pc(struct task_struct *tsk)
78{
79 return ((unsigned long *)tsk->thread.esp)[3];
80}
81
82/*
 83 * Power management idle function, if any..
84 */
85void (*pm_idle)(void);
86EXPORT_SYMBOL(pm_idle);
87static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
88
89void disable_hlt(void)
90{
91 hlt_counter++;
92}
93
94EXPORT_SYMBOL(disable_hlt);
95
96void enable_hlt(void)
97{
98 hlt_counter--;
99}
100
101EXPORT_SYMBOL(enable_hlt);
102
103/*
104 * We use this if we don't have any better
105 * idle routine..
106 */
107void default_idle(void)
108{
109 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
110 current_thread_info()->status &= ~TS_POLLING;
111 /*
112 * TS_POLLING-cleared state must be visible before we
113 * test NEED_RESCHED:
114 */
115 smp_mb();
116
117 local_irq_disable();
118 if (!need_resched())
119 safe_halt(); /* enables interrupts racelessly */
120 else
121 local_irq_enable();
122 current_thread_info()->status |= TS_POLLING;
123 } else {
124 /* loop is done by the caller */
125 cpu_relax();
126 }
127}
128#ifdef CONFIG_APM_MODULE
129EXPORT_SYMBOL(default_idle);
130#endif
131
132/*
133 * On SMP it's slightly faster (but much more power-consuming!)
134 * to poll the ->work.need_resched flag instead of waiting for the
135 * cross-CPU IPI to arrive. Use this option with caution.
136 */
137static void poll_idle (void)
138{
139 cpu_relax();
140}
141
142#ifdef CONFIG_HOTPLUG_CPU
143#include <asm/nmi.h>
 144/* We don't actually take the CPU down, just spin without interrupts. */
145static inline void play_dead(void)
146{
147 /* This must be done before dead CPU ack */
148 cpu_exit_clear();
149 wbinvd();
150 mb();
151 /* Ack it */
152 __get_cpu_var(cpu_state) = CPU_DEAD;
153
154 /*
155 * With physical CPU hotplug, we should halt the cpu
156 */
157 local_irq_disable();
158 while (1)
159 halt();
160}
161#else
162static inline void play_dead(void)
163{
164 BUG();
165}
166#endif /* CONFIG_HOTPLUG_CPU */
167
168/*
169 * The idle thread. There's no useful work to be
170 * done, so just try to conserve power and have a
171 * low exit latency (ie sit in a loop waiting for
172 * somebody to say that they'd like to reschedule)
173 */
174void cpu_idle(void)
175{
176 int cpu = smp_processor_id();
177
178 current_thread_info()->status |= TS_POLLING;
179
180 /* endless idle loop with no priority at all */
181 while (1) {
182 tick_nohz_stop_sched_tick();
183 while (!need_resched()) {
184 void (*idle)(void);
185
186 if (__get_cpu_var(cpu_idle_state))
187 __get_cpu_var(cpu_idle_state) = 0;
188
189 check_pgt_cache();
190 rmb();
191 idle = pm_idle;
192
193 if (!idle)
194 idle = default_idle;
195
196 if (cpu_is_offline(cpu))
197 play_dead();
198
199 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
200 idle();
201 }
202 tick_nohz_restart_sched_tick();
203 preempt_enable_no_resched();
204 schedule();
205 preempt_disable();
206 }
207}
208
209void cpu_idle_wait(void)
210{
211 unsigned int cpu, this_cpu = get_cpu();
212 cpumask_t map, tmp = current->cpus_allowed;
213
214 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
215 put_cpu();
216
217 cpus_clear(map);
218 for_each_online_cpu(cpu) {
219 per_cpu(cpu_idle_state, cpu) = 1;
220 cpu_set(cpu, map);
221 }
222
223 __get_cpu_var(cpu_idle_state) = 0;
224
225 wmb();
226 do {
227 ssleep(1);
228 for_each_online_cpu(cpu) {
229 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
230 cpu_clear(cpu, map);
231 }
232 cpus_and(map, map, cpu_online_map);
233 } while (!cpus_empty(map));
234
235 set_cpus_allowed(current, tmp);
236}
237EXPORT_SYMBOL_GPL(cpu_idle_wait);
238
239/*
240 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
241 * which can obviate IPI to trigger checking of need_resched.
242 * We execute MONITOR against need_resched and enter optimized wait state
243 * through MWAIT. Whenever someone changes need_resched, we would be woken
244 * up from MWAIT (without an IPI).
245 *
246 * New with Core Duo processors, MWAIT can take some hints based on CPU
247 * capability.
248 */
249void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
250{
251 if (!need_resched()) {
252 __monitor((void *)&current_thread_info()->flags, 0, 0);
253 smp_mb();
254 if (!need_resched())
255 __mwait(eax, ecx);
256 }
257}
258
259/* Default MONITOR/MWAIT with no hints, used for default C1 state */
260static void mwait_idle(void)
261{
262 local_irq_enable();
263 mwait_idle_with_hints(0, 0);
264}
265
266void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
267{
268 if (cpu_has(c, X86_FEATURE_MWAIT)) {
269 printk("monitor/mwait feature present.\n");
270 /*
 271 * Skip if setup has overridden idle.
 272 * One CPU supports mwait => All CPUs support mwait
273 */
274 if (!pm_idle) {
275 printk("using mwait in idle threads.\n");
276 pm_idle = mwait_idle;
277 }
278 }
279}
280
281static int __init idle_setup(char *str)
282{
283 if (!strcmp(str, "poll")) {
284 printk("using polling idle threads.\n");
285 pm_idle = poll_idle;
286#ifdef CONFIG_X86_SMP
287 if (smp_num_siblings > 1)
288 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
289#endif
290 } else if (!strcmp(str, "mwait"))
291 force_mwait = 1;
292 else
293 return -1;
294
295 boot_option_idle_override = 1;
296 return 0;
297}
298early_param("idle", idle_setup);
299
300void show_regs(struct pt_regs * regs)
301{
302 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
303 unsigned long d0, d1, d2, d3, d6, d7;
304
305 printk("\n");
306 printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
307 printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
308 print_symbol("EIP is at %s\n", regs->eip);
309
310 if (user_mode_vm(regs))
311 printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
312 printk(" EFLAGS: %08lx %s (%s %.*s)\n",
313 regs->eflags, print_tainted(), init_utsname()->release,
314 (int)strcspn(init_utsname()->version, " "),
315 init_utsname()->version);
316 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
317 regs->eax,regs->ebx,regs->ecx,regs->edx);
318 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
319 regs->esi, regs->edi, regs->ebp);
320 printk(" DS: %04x ES: %04x FS: %04x\n",
321 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
322
323 cr0 = read_cr0();
324 cr2 = read_cr2();
325 cr3 = read_cr3();
326 cr4 = read_cr4_safe();
327 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
328
329 get_debugreg(d0, 0);
330 get_debugreg(d1, 1);
331 get_debugreg(d2, 2);
332 get_debugreg(d3, 3);
333 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
334 d0, d1, d2, d3);
335 get_debugreg(d6, 6);
336 get_debugreg(d7, 7);
337 printk("DR6: %08lx DR7: %08lx\n", d6, d7);
338
339 show_trace(NULL, regs, &regs->esp);
340}
341
342/*
343 * This gets run with %ebx containing the
344 * function to call, and %edx containing
345 * the "args".
346 */
347extern void kernel_thread_helper(void);
348
349/*
350 * Create a kernel thread
351 */
352int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
353{
354 struct pt_regs regs;
355
356 memset(&regs, 0, sizeof(regs));
357
358 regs.ebx = (unsigned long) fn;
359 regs.edx = (unsigned long) arg;
360
361 regs.xds = __USER_DS;
362 regs.xes = __USER_DS;
363 regs.xfs = __KERNEL_PERCPU;
364 regs.orig_eax = -1;
365 regs.eip = (unsigned long) kernel_thread_helper;
366 regs.xcs = __KERNEL_CS | get_kernel_rpl();
367 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
368
369 /* Ok, create the new process.. */
370 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
371}
372EXPORT_SYMBOL(kernel_thread);
373
374/*
375 * Free current thread data structures etc..
376 */
377void exit_thread(void)
378{
379 /* The process may have allocated an io port bitmap... nuke it. */
380 if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
381 struct task_struct *tsk = current;
382 struct thread_struct *t = &tsk->thread;
383 int cpu = get_cpu();
384 struct tss_struct *tss = &per_cpu(init_tss, cpu);
385
386 kfree(t->io_bitmap_ptr);
387 t->io_bitmap_ptr = NULL;
388 clear_thread_flag(TIF_IO_BITMAP);
389 /*
390 * Careful, clear this in the TSS too:
391 */
392 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
393 t->io_bitmap_max = 0;
394 tss->io_bitmap_owner = NULL;
395 tss->io_bitmap_max = 0;
396 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
397 put_cpu();
398 }
399}
400
401void flush_thread(void)
402{
403 struct task_struct *tsk = current;
404
405 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
406 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
407 clear_tsk_thread_flag(tsk, TIF_DEBUG);
408 /*
409 * Forget coprocessor state..
410 */
411 clear_fpu(tsk);
412 clear_used_math();
413}
414
415void release_thread(struct task_struct *dead_task)
416{
417 BUG_ON(dead_task->mm);
418 release_vm86_irqs(dead_task);
419}
420
421/*
422 * This gets called before we allocate a new thread and copy
423 * the current task into it.
424 */
425void prepare_to_copy(struct task_struct *tsk)
426{
427 unlazy_fpu(tsk);
428}
429
430int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
431 unsigned long unused,
432 struct task_struct * p, struct pt_regs * regs)
433{
434 struct pt_regs * childregs;
435 struct task_struct *tsk;
436 int err;
437
438 childregs = task_pt_regs(p);
439 *childregs = *regs;
440 childregs->eax = 0;
441 childregs->esp = esp;
442
443 p->thread.esp = (unsigned long) childregs;
444 p->thread.esp0 = (unsigned long) (childregs+1);
445
446 p->thread.eip = (unsigned long) ret_from_fork;
447
448 savesegment(gs,p->thread.gs);
449
450 tsk = current;
451 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
452 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
453 IO_BITMAP_BYTES, GFP_KERNEL);
454 if (!p->thread.io_bitmap_ptr) {
455 p->thread.io_bitmap_max = 0;
456 return -ENOMEM;
457 }
458 set_tsk_thread_flag(p, TIF_IO_BITMAP);
459 }
460
461 /*
462 * Set a new TLS for the child thread?
463 */
464 if (clone_flags & CLONE_SETTLS) {
465 struct desc_struct *desc;
466 struct user_desc info;
467 int idx;
468
469 err = -EFAULT;
470 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
471 goto out;
472 err = -EINVAL;
473 if (LDT_empty(&info))
474 goto out;
475
476 idx = info.entry_number;
477 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
478 goto out;
479
480 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
481 desc->a = LDT_entry_a(&info);
482 desc->b = LDT_entry_b(&info);
483 }
484
485 err = 0;
486 out:
487 if (err && p->thread.io_bitmap_ptr) {
488 kfree(p->thread.io_bitmap_ptr);
489 p->thread.io_bitmap_max = 0;
490 }
491 return err;
492}
493
494/*
495 * fill in the user structure for a core dump..
496 */
497void dump_thread(struct pt_regs * regs, struct user * dump)
498{
499 int i;
500
501/* changed the size calculations - should hopefully work better. lbt */
502 dump->magic = CMAGIC;
503 dump->start_code = 0;
504 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
505 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
506 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
507 dump->u_dsize -= dump->u_tsize;
508 dump->u_ssize = 0;
509 for (i = 0; i < 8; i++)
510 dump->u_debugreg[i] = current->thread.debugreg[i];
511
512 if (dump->start_stack < TASK_SIZE)
513 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
514
515 dump->regs.ebx = regs->ebx;
516 dump->regs.ecx = regs->ecx;
517 dump->regs.edx = regs->edx;
518 dump->regs.esi = regs->esi;
519 dump->regs.edi = regs->edi;
520 dump->regs.ebp = regs->ebp;
521 dump->regs.eax = regs->eax;
522 dump->regs.ds = regs->xds;
523 dump->regs.es = regs->xes;
524 dump->regs.fs = regs->xfs;
525 savesegment(gs,dump->regs.gs);
526 dump->regs.orig_eax = regs->orig_eax;
527 dump->regs.eip = regs->eip;
528 dump->regs.cs = regs->xcs;
529 dump->regs.eflags = regs->eflags;
530 dump->regs.esp = regs->esp;
531 dump->regs.ss = regs->xss;
532
533 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
534}
535EXPORT_SYMBOL(dump_thread);
536
537/*
538 * Capture the user space registers if the task is not running (in user space)
539 */
540int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
541{
542 struct pt_regs ptregs = *task_pt_regs(tsk);
543 ptregs.xcs &= 0xffff;
544 ptregs.xds &= 0xffff;
545 ptregs.xes &= 0xffff;
546 ptregs.xss &= 0xffff;
547
548 elf_core_copy_regs(regs, &ptregs);
549
550 return 1;
551}
552
553#ifdef CONFIG_SECCOMP
554void hard_disable_TSC(void)
555{
556 write_cr4(read_cr4() | X86_CR4_TSD);
557}
558void disable_TSC(void)
559{
560 preempt_disable();
561 if (!test_and_set_thread_flag(TIF_NOTSC))
562 /*
563 * Must flip the CPU state synchronously with
564 * TIF_NOTSC in the current running context.
565 */
566 hard_disable_TSC();
567 preempt_enable();
568}
569void hard_enable_TSC(void)
570{
571 write_cr4(read_cr4() & ~X86_CR4_TSD);
572}
573#endif /* CONFIG_SECCOMP */
574
575static noinline void
576__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
577 struct tss_struct *tss)
578{
579 struct thread_struct *next;
580
581 next = &next_p->thread;
582
583 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
584 set_debugreg(next->debugreg[0], 0);
585 set_debugreg(next->debugreg[1], 1);
586 set_debugreg(next->debugreg[2], 2);
587 set_debugreg(next->debugreg[3], 3);
588 /* no 4 and 5 */
589 set_debugreg(next->debugreg[6], 6);
590 set_debugreg(next->debugreg[7], 7);
591 }
592
593#ifdef CONFIG_SECCOMP
594 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
595 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
596 /* prev and next are different */
597 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
598 hard_disable_TSC();
599 else
600 hard_enable_TSC();
601 }
602#endif
603
604 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
605 /*
606 * Disable the bitmap via an invalid offset. We still cache
607 * the previous bitmap owner and the IO bitmap contents:
608 */
609 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
610 return;
611 }
612
613 if (likely(next == tss->io_bitmap_owner)) {
614 /*
615 * Previous owner of the bitmap (hence the bitmap content)
616	 * matches the next task, so we don't have to do anything but
617 * to set a valid offset in the TSS:
618 */
619 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
620 return;
621 }
622 /*
623	 * Lazy TSS I/O bitmap copy. We set an invalid offset here
624	 * and let the task get a GPF if it performs an I/O
625	 * instruction. The GPF handler will verify that the
626	 * faulting task has a valid I/O bitmap and, if true, does the
627	 * real copy and restarts the instruction. This will save us
628 * redundant copies when the currently switched task does not
629 * perform any I/O during its timeslice.
630 */
631 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
632}
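/*
 * Illustration (userspace sketch, not part of this file): the lazy bitmap
 * path above is what a port-banging program exercises.  Assumes the usual
 * glibc wrappers from <sys/io.h>; 0x378 is just a sample parallel-port
 * base and the call needs CAP_SYS_RAWIO.
 */
#include <sys/io.h>		/* ioperm(), outb() */

static int poke_parport(void)
{
	/* ioperm() makes the kernel allocate this task's io_bitmap and
	 * set TIF_IO_BITMAP, so __switch_to_xtra() starts caring. */
	if (ioperm(0x378, 4, 1) != 0)
		return -1;
	/* After a context switch the TSS offset is "lazy"; the first OUT
	 * takes a #GP, the handler copies the bitmap and restarts it. */
	outb(0x00, 0x378);
	return 0;
}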
633
634/*
635 * switch_to(x,y) should switch tasks from x to y.
636 *
637 * We fsave/fwait so that an exception goes off at the right time
638 * (as a call from the fsave or fwait in effect) rather than to
639 * the wrong process. Lazy FP saving no longer makes any sense
640 * with modern CPUs, and this simplifies a lot of things (SMP
641 * and UP become the same).
642 *
643 * NOTE! We used to use the x86 hardware context switching. The
644 * reason for not using it any more becomes apparent when you
645 * try to recover gracefully from saved state that is no longer
646 * valid (stale segment register values in particular). With the
647 * hardware task-switch, there is no way to fix up bad state in
648 * a reasonable manner.
649 *
650 * The fact that Intel documents the hardware task-switching to
651 * be slow is largely a red herring - this code is not noticeably
652 * faster. However, there _is_ some room for improvement here,
653 * so the performance issues may eventually be a valid point.
654 * More important, however, is the fact that this allows us much
655 * more flexibility.
656 *
657 * The return value (in %eax) will be the "prev" task after
658 * the task-switch, and shows up in ret_from_fork in entry.S,
659 * for example.
660 */
661struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
662{
663 struct thread_struct *prev = &prev_p->thread,
664 *next = &next_p->thread;
665 int cpu = smp_processor_id();
666 struct tss_struct *tss = &per_cpu(init_tss, cpu);
667
668 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
669
670 __unlazy_fpu(prev_p);
671
672
673 /* we're going to use this soon, after a few expensive things */
674 if (next_p->fpu_counter > 5)
675 prefetch(&next->i387.fxsave);
676
677 /*
678 * Reload esp0.
679 */
680 load_esp0(tss, next);
681
682 /*
683 * Save away %gs. No need to save %fs, as it was saved on the
684 * stack on entry. No need to save %es and %ds, as those are
685 * always kernel segments while inside the kernel. Doing this
686 * before setting the new TLS descriptors avoids the situation
687 * where we temporarily have non-reloadable segments in %fs
688 * and %gs. This could be an issue if the NMI handler ever
689 * used %fs or %gs (it does not today), or if the kernel is
690 * running inside of a hypervisor layer.
691 */
692 savesegment(gs, prev->gs);
693
694 /*
695 * Load the per-thread Thread-Local Storage descriptor.
696 */
697 load_TLS(next, cpu);
698
699 /*
700 * Restore IOPL if needed. In normal use, the flags restore
701 * in the switch assembly will handle this. But if the kernel
702 * is running virtualized at a non-zero CPL, the popf will
703 * not restore flags, so it must be done in a separate step.
704 */
705 if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
706 set_iopl_mask(next->iopl);
707
708 /*
709 * Now maybe handle debug registers and/or IO bitmaps
710 */
711 if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
712 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
713 __switch_to_xtra(prev_p, next_p, tss);
714
715 /*
716 * Leave lazy mode, flushing any hypercalls made here.
717 * This must be done before restoring TLS segments so
718 * the GDT and LDT are properly updated, and must be
719 * done before math_state_restore, so the TS bit is up
720 * to date.
721 */
722 arch_leave_lazy_cpu_mode();
723
724 /* If the task has used fpu the last 5 timeslices, just do a full
725 * restore of the math state immediately to avoid the trap; the
726 * chances of needing FPU soon are obviously high now
727 */
728 if (next_p->fpu_counter > 5)
729 math_state_restore();
730
731 /*
732 * Restore %gs if needed (which is common)
733 */
734 if (prev->gs | next->gs)
735 loadsegment(gs, next->gs);
736
737 x86_write_percpu(current_task, next_p);
738
739 return prev_p;
740}
741
742asmlinkage int sys_fork(struct pt_regs regs)
743{
744 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
745}
746
747asmlinkage int sys_clone(struct pt_regs regs)
748{
749 unsigned long clone_flags;
750 unsigned long newsp;
751 int __user *parent_tidptr, *child_tidptr;
752
753 clone_flags = regs.ebx;
754 newsp = regs.ecx;
755 parent_tidptr = (int __user *)regs.edx;
756 child_tidptr = (int __user *)regs.edi;
757 if (!newsp)
758 newsp = regs.esp;
759 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
760}
761
762/*
763 * This is trivial, and on the face of it looks like it
764 * could equally well be done in user mode.
765 *
766 * Not so, for quite unobvious reasons - register pressure.
767 * In user mode vfork() cannot have a stack frame, and if
768 * done by calling the "clone()" system call directly, you
769 * do not have enough call-clobbered registers to hold all
770 * the information you need.
771 */
772asmlinkage int sys_vfork(struct pt_regs regs)
773{
774 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
775}
776
777/*
778 * sys_execve() executes a new program.
779 */
780asmlinkage int sys_execve(struct pt_regs regs)
781{
782 int error;
783 char * filename;
784
785 filename = getname((char __user *) regs.ebx);
786 error = PTR_ERR(filename);
787 if (IS_ERR(filename))
788 goto out;
789 error = do_execve(filename,
790 (char __user * __user *) regs.ecx,
791 (char __user * __user *) regs.edx,
792 &regs);
793 if (error == 0) {
794 task_lock(current);
795 current->ptrace &= ~PT_DTRACE;
796 task_unlock(current);
797 /* Make sure we don't return using sysenter.. */
798 set_thread_flag(TIF_IRET);
799 }
800 putname(filename);
801out:
802 return error;
803}
804
805#define top_esp (THREAD_SIZE - sizeof(unsigned long))
806#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
807
808unsigned long get_wchan(struct task_struct *p)
809{
810 unsigned long ebp, esp, eip;
811 unsigned long stack_page;
812 int count = 0;
813 if (!p || p == current || p->state == TASK_RUNNING)
814 return 0;
815 stack_page = (unsigned long)task_stack_page(p);
816 esp = p->thread.esp;
817 if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
818 return 0;
819 /* include/asm-i386/system.h:switch_to() pushes ebp last. */
820 ebp = *(unsigned long *) esp;
821 do {
822 if (ebp < stack_page || ebp > top_ebp+stack_page)
823 return 0;
824 eip = *(unsigned long *) (ebp+4);
825 if (!in_sched_functions(eip))
826 return eip;
827 ebp = *(unsigned long *) ebp;
828 } while (count++ < 16);
829 return 0;
830}
831
832/*
833 * get_free_idx: get a yet unused TLS descriptor index.
834 */
835static int get_free_idx(void)
836{
837 struct thread_struct *t = &current->thread;
838 int idx;
839
840 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
841 if (desc_empty(t->tls_array + idx))
842 return idx + GDT_ENTRY_TLS_MIN;
843 return -ESRCH;
844}
845
846/*
847 * Set a given TLS descriptor:
848 */
849asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
850{
851 struct thread_struct *t = &current->thread;
852 struct user_desc info;
853 struct desc_struct *desc;
854 int cpu, idx;
855
856 if (copy_from_user(&info, u_info, sizeof(info)))
857 return -EFAULT;
858 idx = info.entry_number;
859
860 /*
861 * index -1 means the kernel should try to find and
862 * allocate an empty descriptor:
863 */
864 if (idx == -1) {
865 idx = get_free_idx();
866 if (idx < 0)
867 return idx;
868 if (put_user(idx, &u_info->entry_number))
869 return -EFAULT;
870 }
871
872 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
873 return -EINVAL;
874
875 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
876
877 /*
878 * We must not get preempted while modifying the TLS.
879 */
880 cpu = get_cpu();
881
882 if (LDT_empty(&info)) {
883 desc->a = 0;
884 desc->b = 0;
885 } else {
886 desc->a = LDT_entry_a(&info);
887 desc->b = LDT_entry_b(&info);
888 }
889 load_TLS(t, cpu);
890
891 put_cpu();
892
893 return 0;
894}
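/*
 * Illustration (userspace sketch, not part of this file): using the
 * "entry_number == -1" convention handled above.  Raw syscall shown for
 * clarity; glibc's threading code does essentially the same thing when it
 * sets up the thread pointer.
 */
#include <asm/ldt.h>		/* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>

static int install_tls(void *base)
{
	struct user_desc ud = {
		.entry_number	= -1,	/* ask the kernel for a free TLS slot */
		.base_addr	= (unsigned long)base,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};

	if (syscall(SYS_set_thread_area, &ud) != 0)
		return -1;
	return ud.entry_number;	/* the kernel wrote back the chosen index */
}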
895
896/*
897 * Get the current Thread-Local Storage area:
898 */
899
900#define GET_BASE(desc) ( \
901 (((desc)->a >> 16) & 0x0000ffff) | \
902 (((desc)->b << 16) & 0x00ff0000) | \
903 ( (desc)->b & 0xff000000) )
904
905#define GET_LIMIT(desc) ( \
906 ((desc)->a & 0x0ffff) | \
907 ((desc)->b & 0xf0000) )
908
909#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
910#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
911#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
912#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
913#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
914#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
915
916asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
917{
918 struct user_desc info;
919 struct desc_struct *desc;
920 int idx;
921
922 if (get_user(idx, &u_info->entry_number))
923 return -EFAULT;
924 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
925 return -EINVAL;
926
927 memset(&info, 0, sizeof(info));
928
929 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
930
931 info.entry_number = idx;
932 info.base_addr = GET_BASE(desc);
933 info.limit = GET_LIMIT(desc);
934 info.seg_32bit = GET_32BIT(desc);
935 info.contents = GET_CONTENTS(desc);
936 info.read_exec_only = !GET_WRITABLE(desc);
937 info.limit_in_pages = GET_LIMIT_PAGES(desc);
938 info.seg_not_present = !GET_PRESENT(desc);
939 info.useable = GET_USEABLE(desc);
940
941 if (copy_to_user(u_info, &info, sizeof(info)))
942 return -EFAULT;
943 return 0;
944}
945
946unsigned long arch_align_stack(unsigned long sp)
947{
948 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
949 sp -= get_random_int() % 8192;
950 return sp & ~0xf;
951}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
new file mode 100644
index 000000000000..98956555450b
--- /dev/null
+++ b/arch/x86/kernel/process_64.c
@@ -0,0 +1,903 @@
1/*
2 * linux/arch/x86-64/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * X86-64 port
10 * Andi Kleen.
11 *
12 * CPU hotplug support - ashok.raj@intel.com
13 */
14
15/*
16 * This file handles the architecture-dependent parts of process handling..
17 */
18
19#include <stdarg.h>
20
21#include <linux/cpu.h>
22#include <linux/errno.h>
23#include <linux/sched.h>
24#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/fs.h>
27#include <linux/elfcore.h>
28#include <linux/smp.h>
29#include <linux/slab.h>
30#include <linux/user.h>
31#include <linux/module.h>
32#include <linux/a.out.h>
33#include <linux/interrupt.h>
34#include <linux/delay.h>
35#include <linux/ptrace.h>
36#include <linux/utsname.h>
37#include <linux/random.h>
38#include <linux/notifier.h>
39#include <linux/kprobes.h>
40#include <linux/kdebug.h>
41
42#include <asm/uaccess.h>
43#include <asm/pgtable.h>
44#include <asm/system.h>
45#include <asm/io.h>
46#include <asm/processor.h>
47#include <asm/i387.h>
48#include <asm/mmu_context.h>
49#include <asm/pda.h>
50#include <asm/prctl.h>
51#include <asm/desc.h>
52#include <asm/proto.h>
53#include <asm/ia32.h>
54#include <asm/idle.h>
55
56asmlinkage extern void ret_from_fork(void);
57
58unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
59
60unsigned long boot_option_idle_override = 0;
61EXPORT_SYMBOL(boot_option_idle_override);
62
63/*
64 * Power management idle function, if any..
65 */
66void (*pm_idle)(void);
67EXPORT_SYMBOL(pm_idle);
68static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
69
70static ATOMIC_NOTIFIER_HEAD(idle_notifier);
71
72void idle_notifier_register(struct notifier_block *n)
73{
74 atomic_notifier_chain_register(&idle_notifier, n);
75}
76EXPORT_SYMBOL_GPL(idle_notifier_register);
77
78void idle_notifier_unregister(struct notifier_block *n)
79{
80 atomic_notifier_chain_unregister(&idle_notifier, n);
81}
82EXPORT_SYMBOL(idle_notifier_unregister);
83
84void enter_idle(void)
85{
86 write_pda(isidle, 1);
87 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
88}
89
90static void __exit_idle(void)
91{
92 if (test_and_clear_bit_pda(0, isidle) == 0)
93 return;
94 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
95}
96
97/* Called from interrupts to signify idle end */
98void exit_idle(void)
99{
100 /* idle loop has pid 0 */
101 if (current->pid)
102 return;
103 __exit_idle();
104}
105
106/*
107 * We use this if we don't have any better
108 * idle routine..
109 */
110static void default_idle(void)
111{
112 current_thread_info()->status &= ~TS_POLLING;
113 /*
114 * TS_POLLING-cleared state must be visible before we
115 * test NEED_RESCHED:
116 */
117 smp_mb();
118 local_irq_disable();
119 if (!need_resched()) {
120 /* Enables interrupts one instruction before HLT.
121 x86 special cases this so there is no race. */
122 safe_halt();
123 } else
124 local_irq_enable();
125 current_thread_info()->status |= TS_POLLING;
126}
127
128/*
129 * On SMP it's slightly faster (but much more power-consuming!)
130 * to poll the ->need_resched flag instead of waiting for the
131 * cross-CPU IPI to arrive. Use this option with caution.
132 */
133static void poll_idle (void)
134{
135 local_irq_enable();
136 cpu_relax();
137}
138
139void cpu_idle_wait(void)
140{
141 unsigned int cpu, this_cpu = get_cpu();
142 cpumask_t map, tmp = current->cpus_allowed;
143
144 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
145 put_cpu();
146
147 cpus_clear(map);
148 for_each_online_cpu(cpu) {
149 per_cpu(cpu_idle_state, cpu) = 1;
150 cpu_set(cpu, map);
151 }
152
153 __get_cpu_var(cpu_idle_state) = 0;
154
155 wmb();
156 do {
157 ssleep(1);
158 for_each_online_cpu(cpu) {
159 if (cpu_isset(cpu, map) &&
160 !per_cpu(cpu_idle_state, cpu))
161 cpu_clear(cpu, map);
162 }
163 cpus_and(map, map, cpu_online_map);
164 } while (!cpus_empty(map));
165
166 set_cpus_allowed(current, tmp);
167}
168EXPORT_SYMBOL_GPL(cpu_idle_wait);
169
170#ifdef CONFIG_HOTPLUG_CPU
171DECLARE_PER_CPU(int, cpu_state);
172
173#include <asm/nmi.h>
174/* We halt the CPU with physical CPU hotplug */
175static inline void play_dead(void)
176{
177 idle_task_exit();
178 wbinvd();
179 mb();
180 /* Ack it */
181 __get_cpu_var(cpu_state) = CPU_DEAD;
182
183 local_irq_disable();
184 while (1)
185 halt();
186}
187#else
188static inline void play_dead(void)
189{
190 BUG();
191}
192#endif /* CONFIG_HOTPLUG_CPU */
193
194/*
195 * The idle thread. There's no useful work to be
196 * done, so just try to conserve power and have a
197 * low exit latency (ie sit in a loop waiting for
198 * somebody to say that they'd like to reschedule)
199 */
200void cpu_idle (void)
201{
202 current_thread_info()->status |= TS_POLLING;
203 /* endless idle loop with no priority at all */
204 while (1) {
205 while (!need_resched()) {
206 void (*idle)(void);
207
208 if (__get_cpu_var(cpu_idle_state))
209 __get_cpu_var(cpu_idle_state) = 0;
210
211 rmb();
212 idle = pm_idle;
213 if (!idle)
214 idle = default_idle;
215 if (cpu_is_offline(smp_processor_id()))
216 play_dead();
217 /*
218 * Idle routines should keep interrupts disabled
219 * from here on, until they go to idle.
220 * Otherwise, idle callbacks can misfire.
221 */
222 local_irq_disable();
223 enter_idle();
224 idle();
225 /* In many cases the interrupt that ended idle
226 has already called exit_idle. But some idle
227 loops can be woken up without interrupt. */
228 __exit_idle();
229 }
230
231 preempt_enable_no_resched();
232 schedule();
233 preempt_disable();
234 }
235}
236
237/*
238 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
239 * which can obviate the IPI used to trigger checking of need_resched.
240 * We execute MONITOR against need_resched and enter optimized wait state
241 * through MWAIT. Whenever someone changes need_resched, we would be woken
242 * up from MWAIT (without an IPI).
243 *
244 * New with Core Duo processors, MWAIT can take some hints based on CPU
245 * capability.
246 */
247void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
248{
249 if (!need_resched()) {
250 __monitor((void *)&current_thread_info()->flags, 0, 0);
251 smp_mb();
252 if (!need_resched())
253 __mwait(eax, ecx);
254 }
255}
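/*
 * Note added for illustration (not in the original source): the hint
 * encoding follows the SDM convention -- roughly, eax[7:4] selects the
 * target C-state (0 = C1, 1 = C2, ...), eax[3:0] a sub-state, and ecx
 * bit 0 asks that masked interrupts still break the wait.  So a caller
 * wanting something like C2 with interrupt break events would pass
 * mwait_idle_with_hints(0x10, 0x01).
 */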
256
257/* Default MONITOR/MWAIT with no hints, used for default C1 state */
258static void mwait_idle(void)
259{
260 if (!need_resched()) {
261 __monitor((void *)&current_thread_info()->flags, 0, 0);
262 smp_mb();
263 if (!need_resched())
264 __sti_mwait(0, 0);
265 else
266 local_irq_enable();
267 } else {
268 local_irq_enable();
269 }
270}
271
272void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
273{
274 static int printed;
275 if (cpu_has(c, X86_FEATURE_MWAIT)) {
276 /*
277 * Skip, if setup has overridden idle.
278 * One CPU supports mwait => all CPUs support mwait
279 */
280 if (!pm_idle) {
281 if (!printed) {
282 printk(KERN_INFO "using mwait in idle threads.\n");
283 printed = 1;
284 }
285 pm_idle = mwait_idle;
286 }
287 }
288}
289
290static int __init idle_setup (char *str)
291{
292 if (!strcmp(str, "poll")) {
293 printk("using polling idle threads.\n");
294 pm_idle = poll_idle;
295 } else if (!strcmp(str, "mwait"))
296 force_mwait = 1;
297 else
298 return -1;
299
300 boot_option_idle_override = 1;
301 return 0;
302}
303early_param("idle", idle_setup);
304
305/* Prints also some state that isn't saved in the pt_regs */
306void __show_regs(struct pt_regs * regs)
307{
308 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
309 unsigned long d0, d1, d2, d3, d6, d7;
310 unsigned int fsindex,gsindex;
311 unsigned int ds,cs,es;
312
313 printk("\n");
314 print_modules();
315 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
316 current->pid, current->comm, print_tainted(),
317 init_utsname()->release,
318 (int)strcspn(init_utsname()->version, " "),
319 init_utsname()->version);
320 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
321 printk_address(regs->rip);
322 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
323 regs->eflags);
324 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
325 regs->rax, regs->rbx, regs->rcx);
326 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
327 regs->rdx, regs->rsi, regs->rdi);
328 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
329 regs->rbp, regs->r8, regs->r9);
330 printk("R10: %016lx R11: %016lx R12: %016lx\n",
331 regs->r10, regs->r11, regs->r12);
332 printk("R13: %016lx R14: %016lx R15: %016lx\n",
333 regs->r13, regs->r14, regs->r15);
334
335 asm("movl %%ds,%0" : "=r" (ds));
336 asm("movl %%cs,%0" : "=r" (cs));
337 asm("movl %%es,%0" : "=r" (es));
338 asm("movl %%fs,%0" : "=r" (fsindex));
339 asm("movl %%gs,%0" : "=r" (gsindex));
340
341 rdmsrl(MSR_FS_BASE, fs);
342 rdmsrl(MSR_GS_BASE, gs);
343 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
344
345 cr0 = read_cr0();
346 cr2 = read_cr2();
347 cr3 = read_cr3();
348 cr4 = read_cr4();
349
350 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
351 fs,fsindex,gs,gsindex,shadowgs);
352 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
353 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
354
355 get_debugreg(d0, 0);
356 get_debugreg(d1, 1);
357 get_debugreg(d2, 2);
358 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
359 get_debugreg(d3, 3);
360 get_debugreg(d6, 6);
361 get_debugreg(d7, 7);
362 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
363}
364
365void show_regs(struct pt_regs *regs)
366{
367 printk("CPU %d:", smp_processor_id());
368 __show_regs(regs);
369 show_trace(NULL, regs, (void *)(regs + 1));
370}
371
372/*
373 * Free current thread data structures etc..
374 */
375void exit_thread(void)
376{
377 struct task_struct *me = current;
378 struct thread_struct *t = &me->thread;
379
380 if (me->thread.io_bitmap_ptr) {
381 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
382
383 kfree(t->io_bitmap_ptr);
384 t->io_bitmap_ptr = NULL;
385 clear_thread_flag(TIF_IO_BITMAP);
386 /*
387 * Careful, clear this in the TSS too:
388 */
389 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
390 t->io_bitmap_max = 0;
391 put_cpu();
392 }
393}
394
395void flush_thread(void)
396{
397 struct task_struct *tsk = current;
398
399 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
400 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
401 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
402 clear_tsk_thread_flag(tsk, TIF_IA32);
403 } else {
404 set_tsk_thread_flag(tsk, TIF_IA32);
405 current_thread_info()->status |= TS_COMPAT;
406 }
407 }
408 clear_tsk_thread_flag(tsk, TIF_DEBUG);
409
410 tsk->thread.debugreg0 = 0;
411 tsk->thread.debugreg1 = 0;
412 tsk->thread.debugreg2 = 0;
413 tsk->thread.debugreg3 = 0;
414 tsk->thread.debugreg6 = 0;
415 tsk->thread.debugreg7 = 0;
416 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
417 /*
418 * Forget coprocessor state..
419 */
420 clear_fpu(tsk);
421 clear_used_math();
422}
423
424void release_thread(struct task_struct *dead_task)
425{
426 if (dead_task->mm) {
427 if (dead_task->mm->context.size) {
428 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
429 dead_task->comm,
430 dead_task->mm->context.ldt,
431 dead_task->mm->context.size);
432 BUG();
433 }
434 }
435}
436
437static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
438{
439 struct user_desc ud = {
440 .base_addr = addr,
441 .limit = 0xfffff,
442 .seg_32bit = 1,
443 .limit_in_pages = 1,
444 .useable = 1,
445 };
446 struct n_desc_struct *desc = (void *)t->thread.tls_array;
447 desc += tls;
448 desc->a = LDT_entry_a(&ud);
449 desc->b = LDT_entry_b(&ud);
450}
451
452static inline u32 read_32bit_tls(struct task_struct *t, int tls)
453{
454 struct desc_struct *desc = (void *)t->thread.tls_array;
455 desc += tls;
456 return desc->base0 |
457 (((u32)desc->base1) << 16) |
458 (((u32)desc->base2) << 24);
459}
460
461/*
462 * This gets called before we allocate a new thread and copy
463 * the current task into it.
464 */
465void prepare_to_copy(struct task_struct *tsk)
466{
467 unlazy_fpu(tsk);
468}
469
470int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
471 unsigned long unused,
472 struct task_struct * p, struct pt_regs * regs)
473{
474 int err;
475 struct pt_regs * childregs;
476 struct task_struct *me = current;
477
478 childregs = ((struct pt_regs *)
479 (THREAD_SIZE + task_stack_page(p))) - 1;
480 *childregs = *regs;
481
482 childregs->rax = 0;
483 childregs->rsp = rsp;
484 if (rsp == ~0UL)
485 childregs->rsp = (unsigned long)childregs;
486
487 p->thread.rsp = (unsigned long) childregs;
488 p->thread.rsp0 = (unsigned long) (childregs+1);
489 p->thread.userrsp = me->thread.userrsp;
490
491 set_tsk_thread_flag(p, TIF_FORK);
492
493 p->thread.fs = me->thread.fs;
494 p->thread.gs = me->thread.gs;
495
496 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
497 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
498 asm("mov %%es,%0" : "=m" (p->thread.es));
499 asm("mov %%ds,%0" : "=m" (p->thread.ds));
500
501 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
502 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
503 if (!p->thread.io_bitmap_ptr) {
504 p->thread.io_bitmap_max = 0;
505 return -ENOMEM;
506 }
507 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
508 IO_BITMAP_BYTES);
509 set_tsk_thread_flag(p, TIF_IO_BITMAP);
510 }
511
512 /*
513 * Set a new TLS for the child thread?
514 */
515 if (clone_flags & CLONE_SETTLS) {
516#ifdef CONFIG_IA32_EMULATION
517 if (test_thread_flag(TIF_IA32))
518 err = ia32_child_tls(p, childregs);
519 else
520#endif
521 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
522 if (err)
523 goto out;
524 }
525 err = 0;
526out:
527 if (err && p->thread.io_bitmap_ptr) {
528 kfree(p->thread.io_bitmap_ptr);
529 p->thread.io_bitmap_max = 0;
530 }
531 return err;
532}
533
534/*
535 * This special macro can be used to load a debugging register
536 */
537#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
538
539static inline void __switch_to_xtra(struct task_struct *prev_p,
540 struct task_struct *next_p,
541 struct tss_struct *tss)
542{
543 struct thread_struct *prev, *next;
544
545	prev = &prev_p->thread;
546 next = &next_p->thread;
547
548 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
549 loaddebug(next, 0);
550 loaddebug(next, 1);
551 loaddebug(next, 2);
552 loaddebug(next, 3);
553 /* no 4 and 5 */
554 loaddebug(next, 6);
555 loaddebug(next, 7);
556 }
557
558 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
559 /*
560 * Copy the relevant range of the IO bitmap.
561 * Normally this is 128 bytes or less:
562 */
563 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
564 max(prev->io_bitmap_max, next->io_bitmap_max));
565 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
566 /*
567 * Clear any possible leftover bits:
568 */
569 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
570 }
571}
572
573/*
574 * switch_to(x,y) should switch tasks from x to y.
575 *
576 * This could still be optimized:
577 * - fold all the options into a flag word and test it with a single test.
578 * - could test fs/gs bitsliced
579 *
580 * Kprobes not supported here. Set the probe on schedule instead.
581 */
582__kprobes struct task_struct *
583__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
584{
585 struct thread_struct *prev = &prev_p->thread,
586 *next = &next_p->thread;
587 int cpu = smp_processor_id();
588 struct tss_struct *tss = &per_cpu(init_tss, cpu);
589
590 /* we're going to use this soon, after a few expensive things */
591 if (next_p->fpu_counter>5)
592 prefetch(&next->i387.fxsave);
593
594 /*
595 * Reload esp0, LDT and the page table pointer:
596 */
597 tss->rsp0 = next->rsp0;
598
599 /*
600 * Switch DS and ES.
601 * This won't pick up thread selector changes, but I guess that is ok.
602 */
603 asm volatile("mov %%es,%0" : "=m" (prev->es));
604 if (unlikely(next->es | prev->es))
605 loadsegment(es, next->es);
606
607 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
608 if (unlikely(next->ds | prev->ds))
609 loadsegment(ds, next->ds);
610
611 load_TLS(next, cpu);
612
613 /*
614 * Switch FS and GS.
615 */
616 {
617 unsigned fsindex;
618 asm volatile("movl %%fs,%0" : "=r" (fsindex));
619 /* segment register != 0 always requires a reload.
620 also reload when it has changed.
621 when prev process used 64bit base always reload
622 to avoid an information leak. */
623 if (unlikely(fsindex | next->fsindex | prev->fs)) {
624 loadsegment(fs, next->fsindex);
625 /* check if the user used a selector != 0
626 * if yes clear 64bit base, since overloaded base
627 * is always mapped to the Null selector
628 */
629 if (fsindex)
630 prev->fs = 0;
631 }
632 /* when next process has a 64bit base use it */
633 if (next->fs)
634 wrmsrl(MSR_FS_BASE, next->fs);
635 prev->fsindex = fsindex;
636 }
637 {
638 unsigned gsindex;
639 asm volatile("movl %%gs,%0" : "=r" (gsindex));
640 if (unlikely(gsindex | next->gsindex | prev->gs)) {
641 load_gs_index(next->gsindex);
642 if (gsindex)
643 prev->gs = 0;
644 }
645 if (next->gs)
646 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
647 prev->gsindex = gsindex;
648 }
649
650 /* Must be after DS reload */
651 unlazy_fpu(prev_p);
652
653 /*
654 * Switch the PDA and FPU contexts.
655 */
656 prev->userrsp = read_pda(oldrsp);
657 write_pda(oldrsp, next->userrsp);
658 write_pda(pcurrent, next_p);
659
660 write_pda(kernelstack,
661 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
662#ifdef CONFIG_CC_STACKPROTECTOR
663 write_pda(stack_canary, next_p->stack_canary);
664 /*
665 * Build time only check to make sure the stack_canary is at
666 * offset 40 in the pda; this is a gcc ABI requirement
667 */
668 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
669#endif
670
671 /*
672 * Now maybe reload the debug registers and handle I/O bitmaps
673 */
674 if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
675 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
676 __switch_to_xtra(prev_p, next_p, tss);
677
678 /* If the task has used fpu the last 5 timeslices, just do a full
679 * restore of the math state immediately to avoid the trap; the
680 * chances of needing FPU soon are obviously high now
681 */
682 if (next_p->fpu_counter>5)
683 math_state_restore();
684 return prev_p;
685}
686
687/*
688 * sys_execve() executes a new program.
689 */
690asmlinkage
691long sys_execve(char __user *name, char __user * __user *argv,
692 char __user * __user *envp, struct pt_regs regs)
693{
694 long error;
695 char * filename;
696
697 filename = getname(name);
698 error = PTR_ERR(filename);
699 if (IS_ERR(filename))
700 return error;
701 error = do_execve(filename, argv, envp, &regs);
702 if (error == 0) {
703 task_lock(current);
704 current->ptrace &= ~PT_DTRACE;
705 task_unlock(current);
706 }
707 putname(filename);
708 return error;
709}
710
711void set_personality_64bit(void)
712{
713 /* inherit personality from parent */
714
715 /* Make sure to be in 64bit mode */
716 clear_thread_flag(TIF_IA32);
717
718 /* TBD: overwrites user setup. Should have two bits.
719 But 64bit processes have always behaved this way,
720 so it's not too bad. The main problem is just that
721	   32-bit children are affected again. */
722 current->personality &= ~READ_IMPLIES_EXEC;
723}
724
725asmlinkage long sys_fork(struct pt_regs *regs)
726{
727 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
728}
729
730asmlinkage long
731sys_clone(unsigned long clone_flags, unsigned long newsp,
732 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
733{
734 if (!newsp)
735 newsp = regs->rsp;
736 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
737}
738
739/*
740 * This is trivial, and on the face of it looks like it
741 * could equally well be done in user mode.
742 *
743 * Not so, for quite unobvious reasons - register pressure.
744 * In user mode vfork() cannot have a stack frame, and if
745 * done by calling the "clone()" system call directly, you
746 * do not have enough call-clobbered registers to hold all
747 * the information you need.
748 */
749asmlinkage long sys_vfork(struct pt_regs *regs)
750{
751 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
752 NULL, NULL);
753}
754
755unsigned long get_wchan(struct task_struct *p)
756{
757 unsigned long stack;
758 u64 fp,rip;
759 int count = 0;
760
761 if (!p || p == current || p->state==TASK_RUNNING)
762 return 0;
763 stack = (unsigned long)task_stack_page(p);
764 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
765 return 0;
766 fp = *(u64 *)(p->thread.rsp);
767 do {
768 if (fp < (unsigned long)stack ||
769 fp > (unsigned long)stack+THREAD_SIZE)
770 return 0;
771 rip = *(u64 *)(fp+8);
772 if (!in_sched_functions(rip))
773 return rip;
774 fp = *(u64 *)fp;
775 } while (count++ < 16);
776 return 0;
777}
778
779long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
780{
781 int ret = 0;
782 int doit = task == current;
783 int cpu;
784
785 switch (code) {
786 case ARCH_SET_GS:
787 if (addr >= TASK_SIZE_OF(task))
788 return -EPERM;
789 cpu = get_cpu();
790 /* handle small bases via the GDT because that's faster to
791 switch. */
792 if (addr <= 0xffffffff) {
793 set_32bit_tls(task, GS_TLS, addr);
794 if (doit) {
795 load_TLS(&task->thread, cpu);
796 load_gs_index(GS_TLS_SEL);
797 }
798 task->thread.gsindex = GS_TLS_SEL;
799 task->thread.gs = 0;
800 } else {
801 task->thread.gsindex = 0;
802 task->thread.gs = addr;
803 if (doit) {
804 load_gs_index(0);
805 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
806 }
807 }
808 put_cpu();
809 break;
810 case ARCH_SET_FS:
811 /* Not strictly needed for fs, but do it for symmetry
812 with gs */
813 if (addr >= TASK_SIZE_OF(task))
814 return -EPERM;
815 cpu = get_cpu();
816 /* handle small bases via the GDT because that's faster to
817 switch. */
818 if (addr <= 0xffffffff) {
819 set_32bit_tls(task, FS_TLS, addr);
820 if (doit) {
821 load_TLS(&task->thread, cpu);
822 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
823 }
824 task->thread.fsindex = FS_TLS_SEL;
825 task->thread.fs = 0;
826 } else {
827 task->thread.fsindex = 0;
828 task->thread.fs = addr;
829 if (doit) {
830 /* set the selector to 0 to not confuse
831 __switch_to */
832 asm volatile("movl %0,%%fs" :: "r" (0));
833 ret = checking_wrmsrl(MSR_FS_BASE, addr);
834 }
835 }
836 put_cpu();
837 break;
838 case ARCH_GET_FS: {
839 unsigned long base;
840 if (task->thread.fsindex == FS_TLS_SEL)
841 base = read_32bit_tls(task, FS_TLS);
842 else if (doit)
843 rdmsrl(MSR_FS_BASE, base);
844 else
845 base = task->thread.fs;
846 ret = put_user(base, (unsigned long __user *)addr);
847 break;
848 }
849 case ARCH_GET_GS: {
850 unsigned long base;
851 unsigned gsindex;
852 if (task->thread.gsindex == GS_TLS_SEL)
853 base = read_32bit_tls(task, GS_TLS);
854 else if (doit) {
855 asm("movl %%gs,%0" : "=r" (gsindex));
856 if (gsindex)
857 rdmsrl(MSR_KERNEL_GS_BASE, base);
858 else
859 base = task->thread.gs;
860 }
861 else
862 base = task->thread.gs;
863 ret = put_user(base, (unsigned long __user *)addr);
864 break;
865 }
866
867 default:
868 ret = -EINVAL;
869 break;
870 }
871
872 return ret;
873}
874
875long sys_arch_prctl(int code, unsigned long addr)
876{
877 return do_arch_prctl(current, code, addr);
878}
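/*
 * Illustration (userspace sketch, not part of this file): the syscall
 * above as seen from a 64-bit program.  Raw syscall shown; the ARCH_*
 * codes come from <asm/prctl.h>.  Threading libraries use ARCH_SET_FS
 * the same way to point %fs at the thread control block.
 */
#include <asm/prctl.h>		/* ARCH_SET_GS, ARCH_GET_GS */
#include <sys/syscall.h>
#include <unistd.h>

static long set_gs_base(void *base)
{
	/* Small bases (<4GB) land in a GDT TLS slot, larger ones in
	 * MSR_KERNEL_GS_BASE -- see the ARCH_SET_GS case above. */
	return syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)base);
}

static unsigned long get_gs_base(void)
{
	unsigned long base = 0;

	syscall(SYS_arch_prctl, ARCH_GET_GS, &base);
	return base;
}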
879
880/*
881 * Capture the user space registers if the task is not running (in user space)
882 */
883int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
884{
885 struct pt_regs *pp, ptregs;
886
887 pp = task_pt_regs(tsk);
888
889 ptregs = *pp;
890 ptregs.cs &= 0xffff;
891 ptregs.ss &= 0xffff;
892
893 elf_core_copy_regs(regs, &ptregs);
894
895 return 1;
896}
897
898unsigned long arch_align_stack(unsigned long sp)
899{
900 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
901 sp -= get_random_int() % 8192;
902 return sp & ~0xf;
903}
diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
new file mode 100644
index 000000000000..7c1b92522e95
--- /dev/null
+++ b/arch/x86/kernel/ptrace_32.c
@@ -0,0 +1,723 @@
1/* ptrace.c */
2/* By Ross Biro 1/23/92 */
3/*
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 */
7
8#include <linux/kernel.h>
9#include <linux/sched.h>
10#include <linux/mm.h>
11#include <linux/smp.h>
12#include <linux/errno.h>
13#include <linux/ptrace.h>
14#include <linux/user.h>
15#include <linux/security.h>
16#include <linux/audit.h>
17#include <linux/seccomp.h>
18#include <linux/signal.h>
19
20#include <asm/uaccess.h>
21#include <asm/pgtable.h>
22#include <asm/system.h>
23#include <asm/processor.h>
24#include <asm/i387.h>
25#include <asm/debugreg.h>
26#include <asm/ldt.h>
27#include <asm/desc.h>
28
29/*
30 * does not yet catch signals sent when the child dies.
31 * in exit.c or in signal.c.
32 */
33
34/*
35 * Determines which flags the user has access to [1 = access, 0 = no access].
36 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
37 * Also masks reserved bits (31-22, 15, 5, 3, 1).
38 */
39#define FLAG_MASK 0x00050dd5
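/* Decoded for reference (added note, not in the original source): with the
 * mask above the tracer may change CF, PF, AF, ZF, SF, TF, DF, OF, RF and
 * AC; everything else in eflags is preserved by putreg(). */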
40
41/* sets the trap flag. */
42#define TRAP_FLAG 0x100
43
44/*
45 * Offset of eflags on child stack..
46 */
47#define EFL_OFFSET offsetof(struct pt_regs, eflags)
48
49static inline struct pt_regs *get_child_regs(struct task_struct *task)
50{
51 void *stack_top = (void *)task->thread.esp0;
52 return stack_top - sizeof(struct pt_regs);
53}
54
55/*
56 * This routine will get a word off of the process's privileged stack.
57 * The offset is bytes into the pt_regs structure on the stack.
58 * This routine assumes that all the privileged stacks are in our
59 * data space.
60 */
61static inline int get_stack_long(struct task_struct *task, int offset)
62{
63 unsigned char *stack;
64
65 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
66 stack += offset;
67 return (*((int *)stack));
68}
69
70/*
71 * This routine will put a word on the process's privileged stack.
72 * The offset is bytes into the pt_regs structure on the stack.
73 * This routine assumes that all the privileged stacks are in our
74 * data space.
75 */
76static inline int put_stack_long(struct task_struct *task, int offset,
77 unsigned long data)
78{
79 unsigned char * stack;
80
81 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
82 stack += offset;
83 *(unsigned long *) stack = data;
84 return 0;
85}
86
87static int putreg(struct task_struct *child,
88 unsigned long regno, unsigned long value)
89{
90 switch (regno >> 2) {
91 case GS:
92 if (value && (value & 3) != 3)
93 return -EIO;
94 child->thread.gs = value;
95 return 0;
96 case DS:
97 case ES:
98 case FS:
99 if (value && (value & 3) != 3)
100 return -EIO;
101 value &= 0xffff;
102 break;
103 case SS:
104 case CS:
105 if ((value & 3) != 3)
106 return -EIO;
107 value &= 0xffff;
108 break;
109 case EFL:
110 value &= FLAG_MASK;
111 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
112 break;
113 }
114 if (regno > FS*4)
115 regno -= 1*4;
116 put_stack_long(child, regno, value);
117 return 0;
118}
119
120static unsigned long getreg(struct task_struct *child,
121 unsigned long regno)
122{
123 unsigned long retval = ~0UL;
124
125 switch (regno >> 2) {
126 case GS:
127 retval = child->thread.gs;
128 break;
129 case DS:
130 case ES:
131 case FS:
132 case SS:
133 case CS:
134 retval = 0xffff;
135 /* fall through */
136 default:
137 if (regno > FS*4)
138 regno -= 1*4;
139 retval &= get_stack_long(child, regno);
140 }
141 return retval;
142}
143
144#define LDT_SEGMENT 4
145
146static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
147{
148 unsigned long addr, seg;
149
150 addr = regs->eip;
151 seg = regs->xcs & 0xffff;
152 if (regs->eflags & VM_MASK) {
153 addr = (addr & 0xffff) + (seg << 4);
154 return addr;
155 }
156
157 /*
158 * We'll assume that the code segments in the GDT
159 * are all zero-based. That is largely true: the
160 * TLS segments are used for data, and the PNPBIOS
161 * and APM bios ones we just ignore here.
162 */
163 if (seg & LDT_SEGMENT) {
164 u32 *desc;
165 unsigned long base;
166
167 seg &= ~7UL;
168
169 down(&child->mm->context.sem);
170 if (unlikely((seg >> 3) >= child->mm->context.size))
171 addr = -1L; /* bogus selector, access would fault */
172 else {
173 desc = child->mm->context.ldt + seg;
174 base = ((desc[0] >> 16) |
175 ((desc[1] & 0xff) << 16) |
176 (desc[1] & 0xff000000));
177
178 /* 16-bit code segment? */
179 if (!((desc[1] >> 22) & 1))
180 addr &= 0xffff;
181 addr += base;
182 }
183 up(&child->mm->context.sem);
184 }
185 return addr;
186}
187
188static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
189{
190 int i, copied;
191 unsigned char opcode[15];
192 unsigned long addr = convert_eip_to_linear(child, regs);
193
194 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
195 for (i = 0; i < copied; i++) {
196 switch (opcode[i]) {
197 /* popf and iret */
198 case 0x9d: case 0xcf:
199 return 1;
200 /* opcode and address size prefixes */
201 case 0x66: case 0x67:
202 continue;
203 /* irrelevant prefixes (segment overrides and repeats) */
204 case 0x26: case 0x2e:
205 case 0x36: case 0x3e:
206 case 0x64: case 0x65:
207 case 0xf0: case 0xf2: case 0xf3:
208 continue;
209
210 /*
211 * pushf: NOTE! We should probably not let
212 * the user see the TF bit being set. But
213 * it's more pain than it's worth to avoid
214 * it, and a debugger could emulate this
215 * all in user space if it _really_ cares.
216 */
217 case 0x9c:
218 default:
219 return 0;
220 }
221 }
222 return 0;
223}
224
225static void set_singlestep(struct task_struct *child)
226{
227 struct pt_regs *regs = get_child_regs(child);
228
229 /*
230 * Always set TIF_SINGLESTEP - this guarantees that
231 * we single-step system calls etc.. This will also
232 * cause us to set TF when returning to user mode.
233 */
234 set_tsk_thread_flag(child, TIF_SINGLESTEP);
235
236 /*
237 * If TF was already set, don't do anything else
238 */
239 if (regs->eflags & TRAP_FLAG)
240 return;
241
242 /* Set TF on the kernel stack.. */
243 regs->eflags |= TRAP_FLAG;
244
245 /*
246 * ..but if TF is changed by the instruction we will trace,
247 * don't mark it as being "us" that set it, so that we
248 * won't clear it by hand later.
249 */
250 if (is_setting_trap_flag(child, regs))
251 return;
252
253 child->ptrace |= PT_DTRACE;
254}
255
256static void clear_singlestep(struct task_struct *child)
257{
258 /* Always clear TIF_SINGLESTEP... */
259 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
260
261 /* But touch TF only if it was set by us.. */
262 if (child->ptrace & PT_DTRACE) {
263 struct pt_regs *regs = get_child_regs(child);
264 regs->eflags &= ~TRAP_FLAG;
265 child->ptrace &= ~PT_DTRACE;
266 }
267}
268
269/*
270 * Called by kernel/ptrace.c when detaching..
271 *
272 * Make sure the single step bit is not set.
273 */
274void ptrace_disable(struct task_struct *child)
275{
276 clear_singlestep(child);
277 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
278}
279
280/*
281 * Perform get_thread_area on behalf of the traced child.
282 */
283static int
284ptrace_get_thread_area(struct task_struct *child,
285 int idx, struct user_desc __user *user_desc)
286{
287 struct user_desc info;
288 struct desc_struct *desc;
289
290/*
291 * Get the current Thread-Local Storage area:
292 */
293
294#define GET_BASE(desc) ( \
295 (((desc)->a >> 16) & 0x0000ffff) | \
296 (((desc)->b << 16) & 0x00ff0000) | \
297 ( (desc)->b & 0xff000000) )
298
299#define GET_LIMIT(desc) ( \
300 ((desc)->a & 0x0ffff) | \
301 ((desc)->b & 0xf0000) )
302
303#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
304#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
305#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
306#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
307#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
308#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
309
310 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
311 return -EINVAL;
312
313 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
314
315 info.entry_number = idx;
316 info.base_addr = GET_BASE(desc);
317 info.limit = GET_LIMIT(desc);
318 info.seg_32bit = GET_32BIT(desc);
319 info.contents = GET_CONTENTS(desc);
320 info.read_exec_only = !GET_WRITABLE(desc);
321 info.limit_in_pages = GET_LIMIT_PAGES(desc);
322 info.seg_not_present = !GET_PRESENT(desc);
323 info.useable = GET_USEABLE(desc);
324
325 if (copy_to_user(user_desc, &info, sizeof(info)))
326 return -EFAULT;
327
328 return 0;
329}
330
331/*
332 * Perform set_thread_area on behalf of the traced child.
333 */
334static int
335ptrace_set_thread_area(struct task_struct *child,
336 int idx, struct user_desc __user *user_desc)
337{
338 struct user_desc info;
339 struct desc_struct *desc;
340
341 if (copy_from_user(&info, user_desc, sizeof(info)))
342 return -EFAULT;
343
344 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
345 return -EINVAL;
346
347 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
348 if (LDT_empty(&info)) {
349 desc->a = 0;
350 desc->b = 0;
351 } else {
352 desc->a = LDT_entry_a(&info);
353 desc->b = LDT_entry_b(&info);
354 }
355
356 return 0;
357}
358
359long arch_ptrace(struct task_struct *child, long request, long addr, long data)
360{
361 struct user * dummy = NULL;
362 int i, ret;
363 unsigned long __user *datap = (unsigned long __user *)data;
364
365 switch (request) {
366 /* when I and D space are separate, these will need to be fixed. */
367 case PTRACE_PEEKTEXT: /* read word at location addr. */
368 case PTRACE_PEEKDATA:
369 ret = generic_ptrace_peekdata(child, addr, data);
370 break;
371
372 /* read the word at location addr in the USER area. */
373 case PTRACE_PEEKUSR: {
374 unsigned long tmp;
375
376 ret = -EIO;
377 if ((addr & 3) || addr < 0 ||
378 addr > sizeof(struct user) - 3)
379 break;
380
381 tmp = 0; /* Default return condition */
382 if(addr < FRAME_SIZE*sizeof(long))
383 tmp = getreg(child, addr);
384 if(addr >= (long) &dummy->u_debugreg[0] &&
385 addr <= (long) &dummy->u_debugreg[7]){
386 addr -= (long) &dummy->u_debugreg[0];
387 addr = addr >> 2;
388 tmp = child->thread.debugreg[addr];
389 }
390 ret = put_user(tmp, datap);
391 break;
392 }
393
394 /* when I and D space are separate, this will have to be fixed. */
395 case PTRACE_POKETEXT: /* write the word at location addr. */
396 case PTRACE_POKEDATA:
397 ret = generic_ptrace_pokedata(child, addr, data);
398 break;
399
400 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
401 ret = -EIO;
402 if ((addr & 3) || addr < 0 ||
403 addr > sizeof(struct user) - 3)
404 break;
405
406 if (addr < FRAME_SIZE*sizeof(long)) {
407 ret = putreg(child, addr, data);
408 break;
409 }
410 /* We need to be very careful here. We implicitly
411 want to modify a portion of the task_struct, and we
412 have to be selective about what portions we allow someone
413 to modify. */
414
415 ret = -EIO;
416 if(addr >= (long) &dummy->u_debugreg[0] &&
417 addr <= (long) &dummy->u_debugreg[7]){
418
419 if(addr == (long) &dummy->u_debugreg[4]) break;
420 if(addr == (long) &dummy->u_debugreg[5]) break;
421 if(addr < (long) &dummy->u_debugreg[4] &&
422 ((unsigned long) data) >= TASK_SIZE-3) break;
423
424			/* Sanity-check data. Take one half-byte at a time with
425 * check = (val >> (16 + 4*i)) & 0xf. It contains the
426 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
427 * 2 and 3 are LENi. Given a list of invalid values,
428 * we do mask |= 1 << invalid_value, so that
429 * (mask >> check) & 1 is a correct test for invalid
430 * values.
431 *
432 * R/Wi contains the type of the breakpoint /
433 * watchpoint, LENi contains the length of the watched
434 * data in the watchpoint case.
435 *
436 * The invalid values are:
437 * - LENi == 0x10 (undefined), so mask |= 0x0f00.
438 * - R/Wi == 0x10 (break on I/O reads or writes), so
439 * mask |= 0x4444.
440 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
441 * 0x1110.
442 *
443 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
444 *
445 * See the Intel Manual "System Programming Guide",
446 * 15.2.4
447 *
448 * Note that LENi == 0x10 is defined on x86_64 in long
449 * mode (i.e. even for 32-bit userspace software, but
450 * 64-bit kernel), so the x86_64 mask value is 0x5454.
451 * See the AMD manual no. 24593 (AMD64 System
452 * Programming)*/
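			/* Worked example (added for illustration, not in the
			 * original): R/W0 = 01b (data write) and LEN0 = 11b
			 * (4 bytes) gives check = 1101b = 0xd; bit 13 of
			 * 0x5f54 is 0, so the value passes.  R/W0 = 10b
			 * (break on I/O) gives check = xx10b, e.g. 0x2;
			 * bit 2 of 0x5f54 is 1, so the write is rejected. */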
453
454 if(addr == (long) &dummy->u_debugreg[7]) {
455 data &= ~DR_CONTROL_RESERVED;
456 for(i=0; i<4; i++)
457 if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
458 goto out_tsk;
459 if (data)
460 set_tsk_thread_flag(child, TIF_DEBUG);
461 else
462 clear_tsk_thread_flag(child, TIF_DEBUG);
463 }
464 addr -= (long) &dummy->u_debugreg;
465 addr = addr >> 2;
466 child->thread.debugreg[addr] = data;
467 ret = 0;
468 }
469 break;
470
471 case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
472 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
473 case PTRACE_CONT: /* restart after signal. */
474 ret = -EIO;
475 if (!valid_signal(data))
476 break;
477 if (request == PTRACE_SYSEMU) {
478 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
479 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
480 } else if (request == PTRACE_SYSCALL) {
481 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
482 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
483 } else {
484 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
485 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
486 }
487 child->exit_code = data;
488 /* make sure the single step bit is not set. */
489 clear_singlestep(child);
490 wake_up_process(child);
491 ret = 0;
492 break;
493
494/*
495 * make the child exit. Best I can do is send it a sigkill.
496 * perhaps it should be put in the status that it wants to
497 * exit.
498 */
499 case PTRACE_KILL:
500 ret = 0;
501 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
502 break;
503 child->exit_code = SIGKILL;
504 /* make sure the single step bit is not set. */
505 clear_singlestep(child);
506 wake_up_process(child);
507 break;
508
509 case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
510 case PTRACE_SINGLESTEP: /* set the trap flag. */
511 ret = -EIO;
512 if (!valid_signal(data))
513 break;
514
515 if (request == PTRACE_SYSEMU_SINGLESTEP)
516 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
517 else
518 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
519
520 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
521 set_singlestep(child);
522 child->exit_code = data;
523 /* give it a chance to run. */
524 wake_up_process(child);
525 ret = 0;
526 break;
527
528 case PTRACE_DETACH:
529 /* detach a process that was attached. */
530 ret = ptrace_detach(child, data);
531 break;
532
533 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
534 if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
535 ret = -EIO;
536 break;
537 }
538 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
539 __put_user(getreg(child, i), datap);
540 datap++;
541 }
542 ret = 0;
543 break;
544 }
545
546 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
547 unsigned long tmp;
548 if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
549 ret = -EIO;
550 break;
551 }
552 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
553 __get_user(tmp, datap);
554 putreg(child, i, tmp);
555 datap++;
556 }
557 ret = 0;
558 break;
559 }
560
561 case PTRACE_GETFPREGS: { /* Get the child FPU state. */
562 if (!access_ok(VERIFY_WRITE, datap,
563 sizeof(struct user_i387_struct))) {
564 ret = -EIO;
565 break;
566 }
567 ret = 0;
568 if (!tsk_used_math(child))
569 init_fpu(child);
570 get_fpregs((struct user_i387_struct __user *)data, child);
571 break;
572 }
573
574 case PTRACE_SETFPREGS: { /* Set the child FPU state. */
575 if (!access_ok(VERIFY_READ, datap,
576 sizeof(struct user_i387_struct))) {
577 ret = -EIO;
578 break;
579 }
580 set_stopped_child_used_math(child);
581 set_fpregs(child, (struct user_i387_struct __user *)data);
582 ret = 0;
583 break;
584 }
585
586 case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
587 if (!access_ok(VERIFY_WRITE, datap,
588 sizeof(struct user_fxsr_struct))) {
589 ret = -EIO;
590 break;
591 }
592 if (!tsk_used_math(child))
593 init_fpu(child);
594 ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
595 break;
596 }
597
598 case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
599 if (!access_ok(VERIFY_READ, datap,
600 sizeof(struct user_fxsr_struct))) {
601 ret = -EIO;
602 break;
603 }
604 set_stopped_child_used_math(child);
605 ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
606 break;
607 }
608
609 case PTRACE_GET_THREAD_AREA:
610 ret = ptrace_get_thread_area(child, addr,
611 (struct user_desc __user *) data);
612 break;
613
614 case PTRACE_SET_THREAD_AREA:
615 ret = ptrace_set_thread_area(child, addr,
616 (struct user_desc __user *) data);
617 break;
618
619 default:
620 ret = ptrace_request(child, request, addr, data);
621 break;
622 }
623 out_tsk:
624 return ret;
625}
626
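/*
 * Illustration (debugger-side userspace sketch, not part of this file):
 * driving the PTRACE_PEEKUSR path above to read DR6 of a stopped child.
 * Error handling omitted; the child is assumed to be ptrace-stopped.
 */
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>		/* struct user, u_debugreg[] */
#include <stddef.h>		/* offsetof */

static long read_dr6(pid_t child)
{
	/* The offset must be word aligned and inside struct user; the
	 * kernel maps u_debugreg offsets to child->thread.debugreg[]. */
	return ptrace(PTRACE_PEEKUSER, child,
		      offsetof(struct user, u_debugreg[6]), 0);
}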
627void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
628{
629 struct siginfo info;
630
631 tsk->thread.trap_no = 1;
632 tsk->thread.error_code = error_code;
633
634 memset(&info, 0, sizeof(info));
635 info.si_signo = SIGTRAP;
636 info.si_code = TRAP_BRKPT;
637
638 /* User-mode eip? */
639 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;
640
641 /* Send us the fakey SIGTRAP */
642 force_sig_info(SIGTRAP, &info, tsk);
643}
644
645/* notification of system call entry/exit
646 * - triggered by current->work.syscall_trace
647 */
648__attribute__((regparm(3)))
649int do_syscall_trace(struct pt_regs *regs, int entryexit)
650{
651 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
652 /*
653 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
654 * interception
655 */
656 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
657 int ret = 0;
658
659 /* do the secure computing check first */
660 if (!entryexit)
661 secure_computing(regs->orig_eax);
662
663 if (unlikely(current->audit_context)) {
664 if (entryexit)
665 audit_syscall_exit(AUDITSC_RESULT(regs->eax),
666 regs->eax);
667 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
668 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
669 * not used, entry.S will call us only on syscall exit, not
670 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
671 * calling send_sigtrap() on syscall entry.
672 *
673 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
674		 * is_singlestep is false, despite its name, so we will still do
675 * the correct thing.
676 */
677 else if (is_singlestep)
678 goto out;
679 }
680
681 if (!(current->ptrace & PT_PTRACED))
682 goto out;
683
684 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
685 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
686 * here. We have to check this and return */
687 if (is_sysemu && entryexit)
688 return 0;
689
690 /* Fake a debug trap */
691 if (is_singlestep)
692 send_sigtrap(current, regs, 0);
693
694 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
695 goto out;
696
697 /* the 0x80 provides a way for the tracing parent to distinguish
698 between a syscall stop and SIGTRAP delivery */
699 /* Note that the debugger could change the result of test_thread_flag!*/
700 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
701
702 /*
703 * this isn't the same as continuing with a signal, but it will do
704 * for normal use. strace only continues with a signal if the
705 * stopping signal is not SIGTRAP. -brl
706 */
707 if (current->exit_code) {
708 send_sig(current->exit_code, current, 1);
709 current->exit_code = 0;
710 }
711 ret = is_sysemu;
712out:
713 if (unlikely(current->audit_context) && !entryexit)
714 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
715 regs->ebx, regs->ecx, regs->edx, regs->esi);
716 if (ret == 0)
717 return 0;
718
719 regs->orig_eax = -1; /* force skip of syscall restarting */
720 if (unlikely(current->audit_context))
721 audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
722 return 1;
723}
diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c
new file mode 100644
index 000000000000..eea3702427b4
--- /dev/null
+++ b/arch/x86/kernel/ptrace_64.c
@@ -0,0 +1,627 @@
1/* ptrace.c */
2/* By Ross Biro 1/23/92 */
3/*
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * x86-64 port 2000-2002 Andi Kleen
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/errno.h>
15#include <linux/ptrace.h>
16#include <linux/user.h>
17#include <linux/security.h>
18#include <linux/audit.h>
19#include <linux/seccomp.h>
20#include <linux/signal.h>
21
22#include <asm/uaccess.h>
23#include <asm/pgtable.h>
24#include <asm/system.h>
25#include <asm/processor.h>
26#include <asm/i387.h>
27#include <asm/debugreg.h>
28#include <asm/ldt.h>
29#include <asm/desc.h>
30#include <asm/proto.h>
31#include <asm/ia32.h>
32
33/*
34 * does not yet catch signals sent when the child dies.
35 * in exit.c or in signal.c.
36 */
37
38/*
39 * Determines which flags the user has access to [1 = access, 0 = no access].
40 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
41 * Also masks reserved bits (63-22, 15, 5, 3, 1).
42 */
43#define FLAG_MASK 0x54dd5UL
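/* Decoded for reference (added note, not in the original source): this mask
 * lets the tracer change CF, PF, AF, ZF, SF, TF, DF, OF, NT, RF and AC;
 * unlike the 32-bit mask it also allows NT(14). */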
44
45/* sets the trap flag. */
46#define TRAP_FLAG 0x100UL
47
48/*
49 * eflags and offset of eflags on child stack..
50 */
51#define EFLAGS offsetof(struct pt_regs, eflags)
52#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
53
54/*
55 * this routine will get a word off of the process's privileged stack.
56 * the offset is how far from the base address stored in the TSS.
57 * this routine assumes that all the privileged stacks are in our
58 * data space.
59 */
60static inline unsigned long get_stack_long(struct task_struct *task, int offset)
61{
62 unsigned char *stack;
63
64 stack = (unsigned char *)task->thread.rsp0;
65 stack += offset;
66 return (*((unsigned long *)stack));
67}
68
69/*
70 * this routine will put a word on the process's privileged stack.
71 * the offset is how far from the base address stored in the TSS.
72 * this routine assumes that all the privileged stacks are in our
73 * data space.
74 */
75static inline long put_stack_long(struct task_struct *task, int offset,
76 unsigned long data)
77{
78 unsigned char * stack;
79
80 stack = (unsigned char *) task->thread.rsp0;
81 stack += offset;
82 *(unsigned long *) stack = data;
83 return 0;
84}
85
86#define LDT_SEGMENT 4
87
88unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
89{
90 unsigned long addr, seg;
91
92 addr = regs->rip;
93 seg = regs->cs & 0xffff;
94
95 /*
96 * We'll assume that the code segments in the GDT
97 * are all zero-based. That is largely true: the
98 * TLS segments are used for data, and the PNPBIOS
99 * and APM bios ones we just ignore here.
100 */
101 if (seg & LDT_SEGMENT) {
102 u32 *desc;
103 unsigned long base;
104
105 seg &= ~7UL;
106
107 down(&child->mm->context.sem);
108 if (unlikely((seg >> 3) >= child->mm->context.size))
109 addr = -1L; /* bogus selector, access would fault */
110 else {
111 desc = child->mm->context.ldt + seg;
112 base = ((desc[0] >> 16) |
113 ((desc[1] & 0xff) << 16) |
114 (desc[1] & 0xff000000));
115
116 /* 16-bit code segment? */
117 if (!((desc[1] >> 22) & 1))
118 addr &= 0xffff;
119 addr += base;
120 }
121 up(&child->mm->context.sem);
122 }
123
124 return addr;
125}
126
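/*
 * A minimal sketch of the descriptor decoding done above, assuming the usual
 * 32-bit descriptor layout: base 15..0 in bits 31..16 of the first word,
 * base 23..16 in bits 7..0 of the second word, and base 31..24 in bits
 * 31..24 of the second word.  The helper name is hypothetical.
 */
static inline unsigned long ldt_desc_base(const u32 *desc)
{
	return (desc[0] >> 16) |
	       ((desc[1] & 0xff) << 16) |
	       (desc[1] & 0xff000000);
}
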
127static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
128{
129 int i, copied;
130 unsigned char opcode[15];
131 unsigned long addr = convert_rip_to_linear(child, regs);
132
133 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
134 for (i = 0; i < copied; i++) {
135 switch (opcode[i]) {
136 /* popf and iret */
137 case 0x9d: case 0xcf:
138 return 1;
139
140 /* CHECKME: 64 65 */
141
142 /* opcode and address size prefixes */
143 case 0x66: case 0x67:
144 continue;
145 /* irrelevant prefixes (segment overrides and repeats) */
146 case 0x26: case 0x2e:
147 case 0x36: case 0x3e:
148 case 0x64: case 0x65:
149 case 0xf2: case 0xf3:
150 continue;
151
152 case 0x40 ... 0x4f:
153 if (regs->cs != __USER_CS)
154 /* 32-bit mode: register increment */
155 return 0;
156 /* 64-bit mode: REX prefix */
157 continue;
158
159 /* CHECKME: f2, f3 */
160
161 /*
162 * pushf: NOTE! We should probably not let
163 * the user see the TF bit being set. But
164 * it's more pain than it's worth to avoid
165 * it, and a debugger could emulate this
166 * all in user space if it _really_ cares.
167 */
168 case 0x9c:
169 default:
170 return 0;
171 }
172 }
173 return 0;
174}
175
176static void set_singlestep(struct task_struct *child)
177{
178 struct pt_regs *regs = task_pt_regs(child);
179
180 /*
181 * Always set TIF_SINGLESTEP - this guarantees that
182 * we single-step system calls etc.. This will also
183 * cause us to set TF when returning to user mode.
184 */
185 set_tsk_thread_flag(child, TIF_SINGLESTEP);
186
187 /*
188 * If TF was already set, don't do anything else
189 */
190 if (regs->eflags & TRAP_FLAG)
191 return;
192
193 /* Set TF on the kernel stack.. */
194 regs->eflags |= TRAP_FLAG;
195
196 /*
197 * ..but if TF is changed by the instruction we will trace,
198 * don't mark it as being "us" that set it, so that we
199 * won't clear it by hand later.
200 */
201 if (is_setting_trap_flag(child, regs))
202 return;
203
204 child->ptrace |= PT_DTRACE;
205}
206
207static void clear_singlestep(struct task_struct *child)
208{
209 /* Always clear TIF_SINGLESTEP... */
210 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
211
212 /* But touch TF only if it was set by us.. */
213 if (child->ptrace & PT_DTRACE) {
214 struct pt_regs *regs = task_pt_regs(child);
215 regs->eflags &= ~TRAP_FLAG;
216 child->ptrace &= ~PT_DTRACE;
217 }
218}
219
220/*
221 * Called by kernel/ptrace.c when detaching..
222 *
223 * Make sure the single step bit is not set.
224 */
225void ptrace_disable(struct task_struct *child)
226{
227 clear_singlestep(child);
228}
229
230static int putreg(struct task_struct *child,
231 unsigned long regno, unsigned long value)
232{
233 unsigned long tmp;
234
235 switch (regno) {
236 case offsetof(struct user_regs_struct,fs):
237 if (value && (value & 3) != 3)
238 return -EIO;
239 child->thread.fsindex = value & 0xffff;
240 return 0;
241 case offsetof(struct user_regs_struct,gs):
242 if (value && (value & 3) != 3)
243 return -EIO;
244 child->thread.gsindex = value & 0xffff;
245 return 0;
246 case offsetof(struct user_regs_struct,ds):
247 if (value && (value & 3) != 3)
248 return -EIO;
249 child->thread.ds = value & 0xffff;
250 return 0;
251 case offsetof(struct user_regs_struct,es):
252 if (value && (value & 3) != 3)
253 return -EIO;
254 child->thread.es = value & 0xffff;
255 return 0;
256 case offsetof(struct user_regs_struct,ss):
257 if ((value & 3) != 3)
258 return -EIO;
259 value &= 0xffff;
260 return 0;
261 case offsetof(struct user_regs_struct,fs_base):
262 if (value >= TASK_SIZE_OF(child))
263 return -EIO;
264 child->thread.fs = value;
265 return 0;
266 case offsetof(struct user_regs_struct,gs_base):
267 if (value >= TASK_SIZE_OF(child))
268 return -EIO;
269 child->thread.gs = value;
270 return 0;
271 case offsetof(struct user_regs_struct, eflags):
272 value &= FLAG_MASK;
273 tmp = get_stack_long(child, EFL_OFFSET);
274 tmp &= ~FLAG_MASK;
275 value |= tmp;
276 break;
277 case offsetof(struct user_regs_struct,cs):
278 if ((value & 3) != 3)
279 return -EIO;
280 value &= 0xffff;
281 break;
282 }
283 put_stack_long(child, regno - sizeof(struct pt_regs), value);
284 return 0;
285}
286
287static unsigned long getreg(struct task_struct *child, unsigned long regno)
288{
289 unsigned long val;
290 switch (regno) {
291 case offsetof(struct user_regs_struct, fs):
292 return child->thread.fsindex;
293 case offsetof(struct user_regs_struct, gs):
294 return child->thread.gsindex;
295 case offsetof(struct user_regs_struct, ds):
296 return child->thread.ds;
297 case offsetof(struct user_regs_struct, es):
298 return child->thread.es;
299 case offsetof(struct user_regs_struct, fs_base):
300 return child->thread.fs;
301 case offsetof(struct user_regs_struct, gs_base):
302 return child->thread.gs;
303 default:
304 regno = regno - sizeof(struct pt_regs);
305 val = get_stack_long(child, regno);
306 if (test_tsk_thread_flag(child, TIF_IA32))
307 val &= 0xffffffff;
308 return val;
309 }
310
311}
312
313long arch_ptrace(struct task_struct *child, long request, long addr, long data)
314{
315 long i, ret;
316 unsigned ui;
317
318 switch (request) {
319 /* when I and D space are separate, these will need to be fixed. */
320 case PTRACE_PEEKTEXT: /* read word at location addr. */
321 case PTRACE_PEEKDATA:
322 ret = generic_ptrace_peekdata(child, addr, data);
323 break;
324
325 /* read the word at location addr in the USER area. */
326 case PTRACE_PEEKUSR: {
327 unsigned long tmp;
328
329 ret = -EIO;
330 if ((addr & 7) ||
331 addr > sizeof(struct user) - 7)
332 break;
333
334 switch (addr) {
335 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
336 tmp = getreg(child, addr);
337 break;
338 case offsetof(struct user, u_debugreg[0]):
339 tmp = child->thread.debugreg0;
340 break;
341 case offsetof(struct user, u_debugreg[1]):
342 tmp = child->thread.debugreg1;
343 break;
344 case offsetof(struct user, u_debugreg[2]):
345 tmp = child->thread.debugreg2;
346 break;
347 case offsetof(struct user, u_debugreg[3]):
348 tmp = child->thread.debugreg3;
349 break;
350 case offsetof(struct user, u_debugreg[6]):
351 tmp = child->thread.debugreg6;
352 break;
353 case offsetof(struct user, u_debugreg[7]):
354 tmp = child->thread.debugreg7;
355 break;
356 default:
357 tmp = 0;
358 break;
359 }
360 ret = put_user(tmp,(unsigned long __user *) data);
361 break;
362 }
363
364 /* when I and D space are separate, this will have to be fixed. */
365 case PTRACE_POKETEXT: /* write the word at location addr. */
366 case PTRACE_POKEDATA:
367 ret = generic_ptrace_pokedata(child, addr, data);
368 break;
369
370 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
371 {
372 int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
373 ret = -EIO;
374 if ((addr & 7) ||
375 addr > sizeof(struct user) - 7)
376 break;
377
378 switch (addr) {
379 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
380 ret = putreg(child, addr, data);
381 break;
382		/* Disallow setting a breakpoint in the vsyscall area */
383 case offsetof(struct user, u_debugreg[0]):
384 if (data >= TASK_SIZE_OF(child) - dsize) break;
385 child->thread.debugreg0 = data;
386 ret = 0;
387 break;
388 case offsetof(struct user, u_debugreg[1]):
389 if (data >= TASK_SIZE_OF(child) - dsize) break;
390 child->thread.debugreg1 = data;
391 ret = 0;
392 break;
393 case offsetof(struct user, u_debugreg[2]):
394 if (data >= TASK_SIZE_OF(child) - dsize) break;
395 child->thread.debugreg2 = data;
396 ret = 0;
397 break;
398 case offsetof(struct user, u_debugreg[3]):
399 if (data >= TASK_SIZE_OF(child) - dsize) break;
400 child->thread.debugreg3 = data;
401 ret = 0;
402 break;
403 case offsetof(struct user, u_debugreg[6]):
404 if (data >> 32)
405 break;
406 child->thread.debugreg6 = data;
407 ret = 0;
408 break;
409 case offsetof(struct user, u_debugreg[7]):
410 /* See arch/i386/kernel/ptrace.c for an explanation of
411 * this awkward check.*/
412 data &= ~DR_CONTROL_RESERVED;
413 for(i=0; i<4; i++)
414 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
415 break;
416 if (i == 4) {
417 child->thread.debugreg7 = data;
418 if (data)
419 set_tsk_thread_flag(child, TIF_DEBUG);
420 else
421 clear_tsk_thread_flag(child, TIF_DEBUG);
422 ret = 0;
423 }
424 break;
425 }
426 break;
427 }
428 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
429 case PTRACE_CONT: /* restart after signal. */
430
431 ret = -EIO;
432 if (!valid_signal(data))
433 break;
434 if (request == PTRACE_SYSCALL)
435 set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
436 else
437 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
438 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
439 child->exit_code = data;
440 /* make sure the single step bit is not set. */
441 clear_singlestep(child);
442 wake_up_process(child);
443 ret = 0;
444 break;
445
446#ifdef CONFIG_IA32_EMULATION
447	/* This only makes sense with 32-bit programs. Allow a
448	   64-bit debugger to fully examine them too. Better
449	   not to use it against 64-bit processes; use
450	   PTRACE_ARCH_PRCTL instead. */
451 case PTRACE_SET_THREAD_AREA: {
452 struct user_desc __user *p;
453 int old;
454 p = (struct user_desc __user *)data;
455 get_user(old, &p->entry_number);
456 put_user(addr, &p->entry_number);
457 ret = do_set_thread_area(&child->thread, p);
458 put_user(old, &p->entry_number);
459 break;
460 case PTRACE_GET_THREAD_AREA:
461 p = (struct user_desc __user *)data;
462 get_user(old, &p->entry_number);
463 put_user(addr, &p->entry_number);
464 ret = do_get_thread_area(&child->thread, p);
465 put_user(old, &p->entry_number);
466 break;
467 }
468#endif
469 /* normal 64bit interface to access TLS data.
470 Works just like arch_prctl, except that the arguments
471 are reversed. */
472 case PTRACE_ARCH_PRCTL:
473 ret = do_arch_prctl(child, data, addr);
474 break;
475
476/*
477 * make the child exit. Best I can do is send it a sigkill.
478 * perhaps it should be put in the status that it wants to
479 * exit.
480 */
481 case PTRACE_KILL:
482 ret = 0;
483 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
484 break;
485 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
486 child->exit_code = SIGKILL;
487 /* make sure the single step bit is not set. */
488 clear_singlestep(child);
489 wake_up_process(child);
490 break;
491
492 case PTRACE_SINGLESTEP: /* set the trap flag. */
493 ret = -EIO;
494 if (!valid_signal(data))
495 break;
496 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
497 set_singlestep(child);
498 child->exit_code = data;
499 /* give it a chance to run. */
500 wake_up_process(child);
501 ret = 0;
502 break;
503
504 case PTRACE_DETACH:
505 /* detach a process that was attached. */
506 ret = ptrace_detach(child, data);
507 break;
508
509 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
510 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
511 sizeof(struct user_regs_struct))) {
512 ret = -EIO;
513 break;
514 }
515 ret = 0;
516 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
517 ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
518 data += sizeof(long);
519 }
520 break;
521 }
522
523 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
524 unsigned long tmp;
525 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
526 sizeof(struct user_regs_struct))) {
527 ret = -EIO;
528 break;
529 }
530 ret = 0;
531 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
532 ret = __get_user(tmp, (unsigned long __user *) data);
533 if (ret)
534 break;
535 ret = putreg(child, ui, tmp);
536 if (ret)
537 break;
538 data += sizeof(long);
539 }
540 break;
541 }
542
543 case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
544 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
545 sizeof(struct user_i387_struct))) {
546 ret = -EIO;
547 break;
548 }
549 ret = get_fpregs((struct user_i387_struct __user *)data, child);
550 break;
551 }
552
553 case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
554 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
555 sizeof(struct user_i387_struct))) {
556 ret = -EIO;
557 break;
558 }
559 set_stopped_child_used_math(child);
560 ret = set_fpregs(child, (struct user_i387_struct __user *)data);
561 break;
562 }
563
564 default:
565 ret = ptrace_request(child, request, addr, data);
566 break;
567 }
568 return ret;
569}
570
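/*
 * A minimal sketch of the u_debugreg[7] check used in PTRACE_POKEUSR above,
 * assuming the usual DR7 layout: breakpoint i owns a 4-bit field at bits
 * 16 + 4*i (two R/W-type bits plus two length bits).  0x5554 is a 16-entry
 * bitmap with a 1 for every field value that must be rejected: R/W type 10
 * (I/O breakpoints) and any non-zero length on an execute breakpoint.  The
 * helper name is hypothetical.
 */
static inline int dr7_bp_field_invalid(unsigned long dr7, int i)
{
	unsigned int field = (dr7 >> (16 + 4 * i)) & 0xf;

	return (0x5554 >> field) & 1;
}
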
571static void syscall_trace(struct pt_regs *regs)
572{
573
574#if 0
575 printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
576 current->comm,
577 regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
578 current_thread_info()->flags, current->ptrace);
579#endif
580
581 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
582 ? 0x80 : 0));
583 /*
584 * this isn't the same as continuing with a signal, but it will do
585 * for normal use. strace only continues with a signal if the
586 * stopping signal is not SIGTRAP. -brl
587 */
588 if (current->exit_code) {
589 send_sig(current->exit_code, current, 1);
590 current->exit_code = 0;
591 }
592}
593
594asmlinkage void syscall_trace_enter(struct pt_regs *regs)
595{
596 /* do the secure computing check first */
597 secure_computing(regs->orig_rax);
598
599 if (test_thread_flag(TIF_SYSCALL_TRACE)
600 && (current->ptrace & PT_PTRACED))
601 syscall_trace(regs);
602
603 if (unlikely(current->audit_context)) {
604 if (test_thread_flag(TIF_IA32)) {
605 audit_syscall_entry(AUDIT_ARCH_I386,
606 regs->orig_rax,
607 regs->rbx, regs->rcx,
608 regs->rdx, regs->rsi);
609 } else {
610 audit_syscall_entry(AUDIT_ARCH_X86_64,
611 regs->orig_rax,
612 regs->rdi, regs->rsi,
613 regs->rdx, regs->r10);
614 }
615 }
616}
617
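/*
 * (Illustrative note on the two audit_syscall_entry() branches above: the
 * register picks differ because the syscall ABIs differ -- 32-bit tasks pass
 * their first arguments in ebx/ecx/edx/esi, while native 64-bit tasks use
 * rdi/rsi/rdx/r10, r10 standing in for rcx because the syscall instruction
 * clobbers rcx.)
 */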
618asmlinkage void syscall_trace_leave(struct pt_regs *regs)
619{
620 if (unlikely(current->audit_context))
621 audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
622
623 if ((test_thread_flag(TIF_SYSCALL_TRACE)
624 || test_thread_flag(TIF_SINGLESTEP))
625 && (current->ptrace & PT_PTRACED))
626 syscall_trace(regs);
627}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
new file mode 100644
index 000000000000..6722469c2633
--- /dev/null
+++ b/arch/x86/kernel/quirks.c
@@ -0,0 +1,49 @@
1/*
2 * This file contains work-arounds for x86 and x86_64 platform bugs.
3 */
4#include <linux/pci.h>
5#include <linux/irq.h>
6
7#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
8
9static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
10{
11 u8 config, rev;
12 u32 word;
13
14 /* BIOS may enable hardware IRQ balancing for
15	 * E7520/E7320/E7525 (revision ID 0x9 and below)
16 * based platforms.
17 * Disable SW irqbalance/affinity on those platforms.
18 */
19 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
20 if (rev > 0x9)
21 return;
22
23 /* enable access to config space*/
24 pci_read_config_byte(dev, 0xf4, &config);
25 pci_write_config_byte(dev, 0xf4, config|0x2);
26
27 /* read xTPR register */
28 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
29
30 if (!(word & (1 << 13))) {
31 printk(KERN_INFO "Intel E7520/7320/7525 detected. "
32 "Disabling irq balancing and affinity\n");
33#ifdef CONFIG_IRQBALANCE
34 irqbalance_disable("");
35#endif
36 noirqdebug_setup("");
37#ifdef CONFIG_PROC_FS
38 no_irq_affinity = 1;
39#endif
40 }
41
42 /* put back the original value for config space*/
43 if (!(config & 0x2))
44 pci_write_config_byte(dev, 0xf4, config);
45}
46DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
47DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
48DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
49#endif
diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot_32.c
new file mode 100644
index 000000000000..b37ed226830a
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.c
@@ -0,0 +1,413 @@
1/*
2 * linux/arch/i386/kernel/reboot.c
3 */
4
5#include <linux/mm.h>
6#include <linux/module.h>
7#include <linux/delay.h>
8#include <linux/init.h>
9#include <linux/interrupt.h>
10#include <linux/mc146818rtc.h>
11#include <linux/efi.h>
12#include <linux/dmi.h>
13#include <linux/ctype.h>
14#include <linux/pm.h>
15#include <linux/reboot.h>
16#include <asm/uaccess.h>
17#include <asm/apic.h>
18#include <asm/desc.h>
19#include "mach_reboot.h"
20#include <asm/reboot_fixups.h>
21#include <asm/reboot.h>
22
23/*
24 * Power off function, if any
25 */
26void (*pm_power_off)(void);
27EXPORT_SYMBOL(pm_power_off);
28
29static int reboot_mode;
30static int reboot_thru_bios;
31
32#ifdef CONFIG_SMP
33static int reboot_cpu = -1;
34#endif
35static int __init reboot_setup(char *str)
36{
37 while(1) {
38 switch (*str) {
39 case 'w': /* "warm" reboot (no memory testing etc) */
40 reboot_mode = 0x1234;
41 break;
42 case 'c': /* "cold" reboot (with memory testing etc) */
43 reboot_mode = 0x0;
44 break;
45 case 'b': /* "bios" reboot by jumping through the BIOS */
46 reboot_thru_bios = 1;
47 break;
48 case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
49 reboot_thru_bios = 0;
50 break;
51#ifdef CONFIG_SMP
52 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
53 if (isdigit(*(str+1))) {
54 reboot_cpu = (int) (*(str+1) - '0');
55 if (isdigit(*(str+2)))
56 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
57 }
58 /* we will leave sorting out the final value
59 when we are ready to reboot, since we might not
60 have set up boot_cpu_id or smp_num_cpu */
61 break;
62#endif
63 }
64 if((str = strchr(str,',')) != NULL)
65 str++;
66 else
67 break;
68 }
69 return 1;
70}
71
72__setup("reboot=", reboot_setup);
73
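/*
 * Illustrative examples of the option string parsed above (options are
 * comma-separated; a single character selects each one):
 *
 *	reboot=w	warm reboot, skip the memory test
 *	reboot=b	reboot by jumping through the BIOS
 *	reboot=c,b	cold reboot, via the BIOS
 *	reboot=s2	on SMP, run the final reset on CPU 2
 */
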
74/*
75 * Reboot options and system auto-detection code provided by
76 * Dell Inc. so their systems "just work". :-)
77 */
78
79/*
80 * Some machines require the "reboot=b" command-line option; this quirk makes that automatic.
81 */
82static int __init set_bios_reboot(const struct dmi_system_id *d)
83{
84 if (!reboot_thru_bios) {
85 reboot_thru_bios = 1;
86 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
87 }
88 return 0;
89}
90
91static struct dmi_system_id __initdata reboot_dmi_table[] = {
92 { /* Handle problems with rebooting on Dell E520's */
93 .callback = set_bios_reboot,
94 .ident = "Dell E520",
95 .matches = {
96 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
97 DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
98 },
99 },
100 { /* Handle problems with rebooting on Dell 1300's */
101 .callback = set_bios_reboot,
102 .ident = "Dell PowerEdge 1300",
103 .matches = {
104 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
105 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
106 },
107 },
108 { /* Handle problems with rebooting on Dell 300's */
109 .callback = set_bios_reboot,
110 .ident = "Dell PowerEdge 300",
111 .matches = {
112 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
113 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
114 },
115 },
116 { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
117 .callback = set_bios_reboot,
118 .ident = "Dell OptiPlex 745",
119 .matches = {
120 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
121 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
122 DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
123 },
124 },
125 { /* Handle problems with rebooting on Dell 2400's */
126 .callback = set_bios_reboot,
127 .ident = "Dell PowerEdge 2400",
128 .matches = {
129 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
130 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
131 },
132 },
133 { /* Handle problems with rebooting on HP laptops */
134 .callback = set_bios_reboot,
135 .ident = "HP Compaq Laptop",
136 .matches = {
137 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
138 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
139 },
140 },
141 { }
142};
143
144static int __init reboot_init(void)
145{
146 dmi_check_system(reboot_dmi_table);
147 return 0;
148}
149
150core_initcall(reboot_init);
151
152/* The following code and data reboots the machine by switching to real
153 mode and jumping to the BIOS reset entry point, as if the CPU has
154 really been reset. The previous version asked the keyboard
155 controller to pulse the CPU reset line, which is more thorough, but
156 doesn't work with at least one type of 486 motherboard. It is easy
157 to stop this code working; hence the copious comments. */
158
159static unsigned long long
160real_mode_gdt_entries [3] =
161{
162 0x0000000000000000ULL, /* Null descriptor */
163 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
164 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
165};
166
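/*
 * A minimal sketch of how the two descriptors above decompose, assuming the
 * classic segment-descriptor layout (limit 15..0, base 23..0, access byte,
 * flags/limit 19..16, base 31..24).  The helper is hypothetical:
 * mk_realmode_desc(0x00000000, 0xffff, 0x9a) == 0x00009a000000ffffULL and
 * mk_realmode_desc(0x00000100, 0xffff, 0x92) == 0x000092000100ffffULL.
 */
static inline unsigned long long mk_realmode_desc(unsigned long base,
						  unsigned long limit,
						  unsigned char access)
{
	return (limit & 0xffffULL) |			/* limit 15..0  */
	       ((base & 0xffffffULL) << 16) |		/* base 23..0   */
	       ((unsigned long long)access << 40) |	/* access byte  */
	       ((limit & 0xf0000ULL) << 32) |		/* limit 19..16 */
	       ((base & 0xff000000ULL) << 32);		/* base 31..24  */
}
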
167static struct Xgt_desc_struct
168real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
169real_mode_idt = { 0x3ff, 0 },
170no_idt = { 0, 0 };
171
172
173/* This is 16-bit protected mode code to disable paging and the cache,
174 switch to real mode and jump to the BIOS reset code.
175
176 The instruction that switches to real mode by writing to CR0 must be
177   followed immediately by a far jump instruction, which sets CS to a
178 valid value for real mode, and flushes the prefetch queue to avoid
179 running instructions that have already been decoded in protected
180 mode.
181
182 Clears all the flags except ET, especially PG (paging), PE
183 (protected-mode enable) and TS (task switch for coprocessor state
184 save). Flushes the TLB after paging has been disabled. Sets CD and
185 NW, to disable the cache on a 486, and invalidates the cache. This
186 is more like the state of a 486 after reset. I don't know if
187 something else should be done for other chips.
188
189 More could be done here to set up the registers as if a CPU reset had
190 occurred; hopefully real BIOSs don't assume much. */
191
192static unsigned char real_mode_switch [] =
193{
194 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
195 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
196 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
197 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
198 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
199 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
200 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
201 0x74, 0x02, /* jz f */
202 0x0f, 0x09, /* wbinvd */
203 0x24, 0x10, /* f: andb $0x10,al */
204 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
205};
206static unsigned char jump_to_bios [] =
207{
208 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
209};
210
211/*
212 * Switch to real mode and then execute the code
213 * specified by the code and length parameters.
214 * We assume that length will always be less than 100!
215 */
216void machine_real_restart(unsigned char *code, int length)
217{
218 local_irq_disable();
219
220 /* Write zero to CMOS register number 0x0f, which the BIOS POST
221 routine will recognize as telling it to do a proper reboot. (Well
222 that's what this book in front of me says -- it may only apply to
223 the Phoenix BIOS though, it's not clear). At the same time,
224 disable NMIs by setting the top bit in the CMOS address register,
225 as we're about to do peculiar things to the CPU. I'm not sure if
226 `outb_p' is needed instead of just `outb'. Use it to be on the
227 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
228 */
229
230 spin_lock(&rtc_lock);
231 CMOS_WRITE(0x00, 0x8f);
232 spin_unlock(&rtc_lock);
233
234 /* Remap the kernel at virtual address zero, as well as offset zero
235 from the kernel segment. This assumes the kernel segment starts at
236 virtual address PAGE_OFFSET. */
237
238 memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
239 sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
240
241 /*
242 * Use `swapper_pg_dir' as our page directory.
243 */
244 load_cr3(swapper_pg_dir);
245
246 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
247 this on booting to tell it to "Bypass memory test (also warm
248 boot)". This seems like a fairly standard thing that gets set by
249 REBOOT.COM programs, and the previous reset routine did this
250 too. */
251
252 *((unsigned short *)0x472) = reboot_mode;
253
254 /* For the switch to real mode, copy some code to low memory. It has
255 to be in the first 64k because it is running in 16-bit mode, and it
256 has to have the same physical and virtual address, because it turns
257 off paging. Copy it near the end of the first page, out of the way
258 of BIOS variables. */
259
260 memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
261 real_mode_switch, sizeof (real_mode_switch));
262 memcpy ((void *) (0x1000 - 100), code, length);
263
264 /* Set up the IDT for real mode. */
265
266 load_idt(&real_mode_idt);
267
268 /* Set up a GDT from which we can load segment descriptors for real
269 mode. The GDT is not used in real mode; it is just needed here to
270 prepare the descriptors. */
271
272 load_gdt(&real_mode_gdt);
273
274 /* Load the data segment registers, and thus the descriptors ready for
275 real mode. The base address of each segment is 0x100, 16 times the
276 selector value being loaded here. This is so that the segment
277 registers don't have to be reloaded after switching to real mode:
278 the values are consistent for real mode operation already. */
279
280 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
281 "\tmovl %%eax,%%ds\n"
282 "\tmovl %%eax,%%es\n"
283 "\tmovl %%eax,%%fs\n"
284 "\tmovl %%eax,%%gs\n"
285 "\tmovl %%eax,%%ss" : : : "eax");
286
287 /* Jump to the 16-bit code that we copied earlier. It disables paging
288 and the cache, switches to real mode, and jumps to the BIOS reset
289 entry point. */
290
291 __asm__ __volatile__ ("ljmp $0x0008,%0"
292 :
293 : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
294}
295#ifdef CONFIG_APM_MODULE
296EXPORT_SYMBOL(machine_real_restart);
297#endif
298
299static void native_machine_shutdown(void)
300{
301#ifdef CONFIG_SMP
302 int reboot_cpu_id;
303
304 /* The boot cpu is always logical cpu 0 */
305 reboot_cpu_id = 0;
306
307 /* See if there has been given a command line override */
308 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
309 cpu_isset(reboot_cpu, cpu_online_map)) {
310 reboot_cpu_id = reboot_cpu;
311 }
312
313 /* Make certain the cpu I'm rebooting on is online */
314 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
315 reboot_cpu_id = smp_processor_id();
316 }
317
318 /* Make certain I only run on the appropriate processor */
319 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
320
321 /* O.K. Now that I'm on the appropriate processor, stop
322 * all of the others, and disable their local APICs.
323 */
324
325 smp_send_stop();
326#endif /* CONFIG_SMP */
327
328 lapic_shutdown();
329
330#ifdef CONFIG_X86_IO_APIC
331 disable_IO_APIC();
332#endif
333}
334
335void __attribute__((weak)) mach_reboot_fixups(void)
336{
337}
338
339static void native_machine_emergency_restart(void)
340{
341 if (!reboot_thru_bios) {
342 if (efi_enabled) {
343 efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
344 load_idt(&no_idt);
345 __asm__ __volatile__("int3");
346 }
347 /* rebooting needs to touch the page at absolute addr 0 */
348 *((unsigned short *)__va(0x472)) = reboot_mode;
349 for (;;) {
350 mach_reboot_fixups(); /* for board specific fixups */
351 mach_reboot();
352 /* That didn't work - force a triple fault.. */
353 load_idt(&no_idt);
354 __asm__ __volatile__("int3");
355 }
356 }
357 if (efi_enabled)
358 efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
359
360 machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
361}
362
363static void native_machine_restart(char * __unused)
364{
365 machine_shutdown();
366 machine_emergency_restart();
367}
368
369static void native_machine_halt(void)
370{
371}
372
373static void native_machine_power_off(void)
374{
375 if (pm_power_off) {
376 machine_shutdown();
377 pm_power_off();
378 }
379}
380
381
382struct machine_ops machine_ops = {
383 .power_off = native_machine_power_off,
384 .shutdown = native_machine_shutdown,
385 .emergency_restart = native_machine_emergency_restart,
386 .restart = native_machine_restart,
387 .halt = native_machine_halt,
388};
389
390void machine_power_off(void)
391{
392 machine_ops.power_off();
393}
394
395void machine_shutdown(void)
396{
397 machine_ops.shutdown();
398}
399
400void machine_emergency_restart(void)
401{
402 machine_ops.emergency_restart();
403}
404
405void machine_restart(char *cmd)
406{
407 machine_ops.restart(cmd);
408}
409
410void machine_halt(void)
411{
412 machine_ops.halt();
413}
diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c
new file mode 100644
index 000000000000..368db2b9c5ac
--- /dev/null
+++ b/arch/x86/kernel/reboot_64.c
@@ -0,0 +1,171 @@
1/* Various gunk just to reboot the machine. */
2#include <linux/module.h>
3#include <linux/reboot.h>
4#include <linux/init.h>
5#include <linux/smp.h>
6#include <linux/kernel.h>
7#include <linux/ctype.h>
8#include <linux/string.h>
9#include <linux/pm.h>
10#include <linux/kdebug.h>
11#include <linux/sched.h>
12#include <asm/io.h>
13#include <asm/delay.h>
14#include <asm/hw_irq.h>
15#include <asm/system.h>
16#include <asm/pgtable.h>
17#include <asm/tlbflush.h>
18#include <asm/apic.h>
19#include <asm/iommu.h>
20
21/*
22 * Power off function, if any
23 */
24void (*pm_power_off)(void);
25EXPORT_SYMBOL(pm_power_off);
26
27static long no_idt[3];
28static enum {
29 BOOT_TRIPLE = 't',
30 BOOT_KBD = 'k'
31} reboot_type = BOOT_KBD;
32static int reboot_mode = 0;
33int reboot_force;
34
35/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
36 warm Don't set the cold reboot flag
37 cold Set the cold reboot flag
38 triple Force a triple fault (init)
39 kbd Use the keyboard controller. cold reset (default)
40 force Avoid anything that could hang.
41 */
42static int __init reboot_setup(char *str)
43{
44 for (;;) {
45 switch (*str) {
46 case 'w':
47 reboot_mode = 0x1234;
48 break;
49
50 case 'c':
51 reboot_mode = 0;
52 break;
53
54 case 't':
55 case 'b':
56 case 'k':
57 reboot_type = *str;
58 break;
59 case 'f':
60 reboot_force = 1;
61 break;
62 }
63 if((str = strchr(str,',')) != NULL)
64 str++;
65 else
66 break;
67 }
68 return 1;
69}
70
71__setup("reboot=", reboot_setup);
72
73static inline void kb_wait(void)
74{
75 int i;
76
77 for (i=0; i<0x10000; i++)
78 if ((inb_p(0x64) & 0x02) == 0)
79 break;
80}
81
82void machine_shutdown(void)
83{
84 unsigned long flags;
85
86 /* Stop the cpus and apics */
87#ifdef CONFIG_SMP
88 int reboot_cpu_id;
89
90 /* The boot cpu is always logical cpu 0 */
91 reboot_cpu_id = 0;
92
93 /* Make certain the cpu I'm about to reboot on is online */
94 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
95 reboot_cpu_id = smp_processor_id();
96 }
97
98 /* Make certain I only run on the appropriate processor */
99 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
100
101	/* O.K. Now that I'm on the appropriate processor,
102 * stop all of the others.
103 */
104 smp_send_stop();
105#endif
106
107 local_irq_save(flags);
108
109#ifndef CONFIG_SMP
110 disable_local_APIC();
111#endif
112
113 disable_IO_APIC();
114
115 local_irq_restore(flags);
116
117 pci_iommu_shutdown();
118}
119
120void machine_emergency_restart(void)
121{
122 int i;
123
124 /* Tell the BIOS if we want cold or warm reboot */
125 *((unsigned short *)__va(0x472)) = reboot_mode;
126
127 for (;;) {
128 /* Could also try the reset bit in the Hammer NB */
129 switch (reboot_type) {
130 case BOOT_KBD:
131 for (i=0; i<10; i++) {
132 kb_wait();
133 udelay(50);
134 outb(0xfe,0x64); /* pulse reset low */
135 udelay(50);
136 }
137
138 case BOOT_TRIPLE:
139 __asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
140 __asm__ __volatile__("int3");
141
142 reboot_type = BOOT_KBD;
143 break;
144 }
145 }
146}
147
148void machine_restart(char * __unused)
149{
150 printk("machine restart\n");
151
152 if (!reboot_force) {
153 machine_shutdown();
154 }
155 machine_emergency_restart();
156}
157
158void machine_halt(void)
159{
160}
161
162void machine_power_off(void)
163{
164 if (pm_power_off) {
165 if (!reboot_force) {
166 machine_shutdown();
167 }
168 pm_power_off();
169 }
170}
171
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
new file mode 100644
index 000000000000..03e1cce58f49
--- /dev/null
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -0,0 +1,68 @@
1/*
2 * linux/arch/i386/kernel/reboot_fixups.c
3 *
4 * This is a good place to put board specific reboot fixups.
5 *
6 * List of supported fixups:
7 * geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz>
8 * geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org>
9 *
10 */
11
12#include <asm/delay.h>
13#include <linux/pci.h>
14#include <asm/reboot_fixups.h>
15#include <asm/msr.h>
16
17static void cs5530a_warm_reset(struct pci_dev *dev)
18{
19	/* writing 1 to the reset control register (0x44) causes the
20 cs5530a to perform a system warm reset */
21 pci_write_config_byte(dev, 0x44, 0x1);
22 udelay(50); /* shouldn't get here but be safe and spin-a-while */
23 return;
24}
25
26static void cs5536_warm_reset(struct pci_dev *dev)
27{
28 /*
29 * 6.6.2.12 Soft Reset (DIVIL_SOFT_RESET)
30 * writing 1 to the LSB of this MSR causes a hard reset.
31 */
32 wrmsrl(0x51400017, 1ULL);
33 udelay(50); /* shouldn't get here but be safe and spin a while */
34}
35
36struct device_fixup {
37 unsigned int vendor;
38 unsigned int device;
39 void (*reboot_fixup)(struct pci_dev *);
40};
41
42static struct device_fixup fixups_table[] = {
43{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
44{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
45};
46
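/*
 * (Illustrative note: supporting another board would mean adding one more
 * entry to the table above; the vendor/device IDs and handler here are
 * hypothetical.)
 *
 *	{ PCI_VENDOR_ID_FOO, PCI_DEVICE_ID_FOO_SB, foo_warm_reset },
 */
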
47/*
48 * we see if any fixup is available for our current hardware. if there
49 * is a fixup, we call it and we expect to never return from it. if we
50 * do return, we keep looking and then eventually fall back to the
51 * standard mach_reboot on return.
52 */
53void mach_reboot_fixups(void)
54{
55 struct device_fixup *cur;
56 struct pci_dev *dev;
57 int i;
58
59 for (i=0; i < ARRAY_SIZE(fixups_table); i++) {
60 cur = &(fixups_table[i]);
61 dev = pci_get_device(cur->vendor, cur->device, NULL);
62 if (!dev)
63 continue;
64
65 cur->reboot_fixup(dev);
66 }
67}
68
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
new file mode 100644
index 000000000000..f151d6fae462
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -0,0 +1,252 @@
1/*
2 * relocate_kernel.S - put the kernel image in place to boot
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/linkage.h>
10#include <asm/page.h>
11#include <asm/kexec.h>
12
13/*
14 * Must be relocatable PIC code callable as a C function
15 */
16
17#define PTR(x) (x << 2)
18#define PAGE_ALIGNED (1 << PAGE_SHIFT)
19#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
20#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
21
22 .text
23 .align PAGE_ALIGNED
24 .globl relocate_kernel
25relocate_kernel:
26 movl 8(%esp), %ebp /* list of pages */
27
28#ifdef CONFIG_X86_PAE
29 /* map the control page at its virtual address */
30
31 movl PTR(VA_PGD)(%ebp), %edi
32 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
33 andl $0xc0000000, %eax
34 shrl $27, %eax
35 addl %edi, %eax
36
37 movl PTR(PA_PMD_0)(%ebp), %edx
38 orl $PAE_PGD_ATTR, %edx
39 movl %edx, (%eax)
40
41 movl PTR(VA_PMD_0)(%ebp), %edi
42 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
43 andl $0x3fe00000, %eax
44 shrl $18, %eax
45 addl %edi, %eax
46
47 movl PTR(PA_PTE_0)(%ebp), %edx
48 orl $PAGE_ATTR, %edx
49 movl %edx, (%eax)
50
51 movl PTR(VA_PTE_0)(%ebp), %edi
52 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
53 andl $0x001ff000, %eax
54 shrl $9, %eax
55 addl %edi, %eax
56
57 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
58 orl $PAGE_ATTR, %edx
59 movl %edx, (%eax)
60
61 /* identity map the control page at its physical address */
62
63 movl PTR(VA_PGD)(%ebp), %edi
64 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
65 andl $0xc0000000, %eax
66 shrl $27, %eax
67 addl %edi, %eax
68
69 movl PTR(PA_PMD_1)(%ebp), %edx
70 orl $PAE_PGD_ATTR, %edx
71 movl %edx, (%eax)
72
73 movl PTR(VA_PMD_1)(%ebp), %edi
74 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
75 andl $0x3fe00000, %eax
76 shrl $18, %eax
77 addl %edi, %eax
78
79 movl PTR(PA_PTE_1)(%ebp), %edx
80 orl $PAGE_ATTR, %edx
81 movl %edx, (%eax)
82
83 movl PTR(VA_PTE_1)(%ebp), %edi
84 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
85 andl $0x001ff000, %eax
86 shrl $9, %eax
87 addl %edi, %eax
88
89 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
90 orl $PAGE_ATTR, %edx
91 movl %edx, (%eax)
92#else
93 /* map the control page at its virtual address */
94
95 movl PTR(VA_PGD)(%ebp), %edi
96 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
97 andl $0xffc00000, %eax
98 shrl $20, %eax
99 addl %edi, %eax
100
101 movl PTR(PA_PTE_0)(%ebp), %edx
102 orl $PAGE_ATTR, %edx
103 movl %edx, (%eax)
104
105 movl PTR(VA_PTE_0)(%ebp), %edi
106 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
107 andl $0x003ff000, %eax
108 shrl $10, %eax
109 addl %edi, %eax
110
111 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
112 orl $PAGE_ATTR, %edx
113 movl %edx, (%eax)
114
115 /* identity map the control page at its physical address */
116
117 movl PTR(VA_PGD)(%ebp), %edi
118 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
119 andl $0xffc00000, %eax
120 shrl $20, %eax
121 addl %edi, %eax
122
123 movl PTR(PA_PTE_1)(%ebp), %edx
124 orl $PAGE_ATTR, %edx
125 movl %edx, (%eax)
126
127 movl PTR(VA_PTE_1)(%ebp), %edi
128 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
129 andl $0x003ff000, %eax
130 shrl $10, %eax
131 addl %edi, %eax
132
133 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
134 orl $PAGE_ATTR, %edx
135 movl %edx, (%eax)
136#endif
137
138relocate_new_kernel:
139 /* read the arguments and say goodbye to the stack */
140 movl 4(%esp), %ebx /* page_list */
141 movl 8(%esp), %ebp /* list of pages */
142 movl 12(%esp), %edx /* start address */
143 movl 16(%esp), %ecx /* cpu_has_pae */
144
145 /* zero out flags, and disable interrupts */
146 pushl $0
147 popfl
148
149 /* get physical address of control page now */
150 /* this is impossible after page table switch */
151 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
152
153 /* switch to new set of page tables */
154 movl PTR(PA_PGD)(%ebp), %eax
155 movl %eax, %cr3
156
157 /* setup a new stack at the end of the physical control page */
158 lea 4096(%edi), %esp
159
160 /* jump to identity mapped page */
161 movl %edi, %eax
162 addl $(identity_mapped - relocate_kernel), %eax
163 pushl %eax
164 ret
165
166identity_mapped:
167 /* store the start address on the stack */
168 pushl %edx
169
170 /* Set cr0 to a known state:
171 * 31 0 == Paging disabled
172 * 18 0 == Alignment check disabled
173 * 16 0 == Write protect disabled
174 * 3 0 == No task switch
175 * 2 0 == Don't do FP software emulation.
176	 * 0 1 == Protected mode enabled
177 */
178 movl %cr0, %eax
179 andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
180 orl $(1<<0), %eax
181 movl %eax, %cr0
182
183 /* clear cr4 if applicable */
184 testl %ecx, %ecx
185 jz 1f
186 /* Set cr4 to a known state:
187 * Setting everything to zero seems safe.
188 */
189 movl %cr4, %eax
190 andl $0, %eax
191 movl %eax, %cr4
192
193 jmp 1f
1941:
195
196 /* Flush the TLB (needed?) */
197 xorl %eax, %eax
198 movl %eax, %cr3
199
200 /* Do the copies */
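	/* (Illustrative note on the loop below: each word in the indirection
	 * chain is a page address with kexec flag bits in its low bits --
	 * 0x1 (IND_DESTINATION) selects where the next copies land,
	 * 0x2 (IND_INDIRECTION) chains to the next page of entries,
	 * 0x8 (IND_SOURCE) names a 4 KiB page to copy to the current
	 * destination, and 0x4 (IND_DONE) terminates the walk.) */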
201 movl %ebx, %ecx
202 jmp 1f
203
2040: /* top, read another word from the indirection page */
205 movl (%ebx), %ecx
206 addl $4, %ebx
2071:
208 testl $0x1, %ecx /* is it a destination page */
209 jz 2f
210 movl %ecx, %edi
211 andl $0xfffff000, %edi
212 jmp 0b
2132:
214 testl $0x2, %ecx /* is it an indirection page */
215 jz 2f
216 movl %ecx, %ebx
217 andl $0xfffff000, %ebx
218 jmp 0b
2192:
220 testl $0x4, %ecx /* is it the done indicator */
221 jz 2f
222 jmp 3f
2232:
224 testl $0x8, %ecx /* is it the source indicator */
225 jz 0b /* Ignore it otherwise */
226 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi
228
229 movl $1024, %ecx
230 rep ; movsl
231 jmp 0b
232
2333:
234
235 /* To be certain of avoiding problems with self-modifying code
236 * I need to execute a serializing instruction here.
237	 * So I flush the TLB; it's handy and not processor dependent.
238 */
239 xorl %eax, %eax
240 movl %eax, %cr3
241
242 /* set all of the registers to known values */
243 /* leave %esp alone */
244
245 xorl %eax, %eax
246 xorl %ebx, %ebx
247 xorl %ecx, %ecx
248 xorl %edx, %edx
249 xorl %esi, %esi
250 xorl %edi, %edi
251 xorl %ebp, %ebp
252 ret
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
new file mode 100644
index 000000000000..14e95872c6a3
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -0,0 +1,276 @@
1/*
2 * relocate_kernel.S - put the kernel image in place to boot
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/linkage.h>
10#include <asm/page.h>
11#include <asm/kexec.h>
12
13/*
14 * Must be relocatable PIC code callable as a C function
15 */
16
17#define PTR(x) (x << 3)
18#define PAGE_ALIGNED (1 << PAGE_SHIFT)
19#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
20
21 .text
22 .align PAGE_ALIGNED
23 .code64
24 .globl relocate_kernel
25relocate_kernel:
26 /* %rdi indirection_page
27 * %rsi page_list
28 * %rdx start address
29 */
30
31 /* map the control page at its virtual address */
32
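	/* (Illustrative note: the mask/shift pairs below pick out the 9-bit
	 * index for each of the four page-table levels, roughly
	 * pgd_idx = (va >> 39) & 0x1ff, pud_idx = (va >> 30) & 0x1ff,
	 * pmd_idx = (va >> 21) & 0x1ff, pte_idx = (va >> 12) & 0x1ff in C.
	 * The "- 3" in the shift count folds in the scaling to an 8-byte
	 * table slot, which is why %cl starts at 36 and drops by 9 per
	 * level.) */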
33 movq $0x0000ff8000000000, %r10 /* mask */
34 mov $(39 - 3), %cl /* bits to shift */
35 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
36
37 movq %r11, %r9
38 andq %r10, %r9
39 shrq %cl, %r9
40
41 movq PTR(VA_PGD)(%rsi), %r8
42 addq %r8, %r9
43 movq PTR(PA_PUD_0)(%rsi), %r8
44 orq $PAGE_ATTR, %r8
45 movq %r8, (%r9)
46
47 shrq $9, %r10
48 sub $9, %cl
49
50 movq %r11, %r9
51 andq %r10, %r9
52 shrq %cl, %r9
53
54 movq PTR(VA_PUD_0)(%rsi), %r8
55 addq %r8, %r9
56 movq PTR(PA_PMD_0)(%rsi), %r8
57 orq $PAGE_ATTR, %r8
58 movq %r8, (%r9)
59
60 shrq $9, %r10
61 sub $9, %cl
62
63 movq %r11, %r9
64 andq %r10, %r9
65 shrq %cl, %r9
66
67 movq PTR(VA_PMD_0)(%rsi), %r8
68 addq %r8, %r9
69 movq PTR(PA_PTE_0)(%rsi), %r8
70 orq $PAGE_ATTR, %r8
71 movq %r8, (%r9)
72
73 shrq $9, %r10
74 sub $9, %cl
75
76 movq %r11, %r9
77 andq %r10, %r9
78 shrq %cl, %r9
79
80 movq PTR(VA_PTE_0)(%rsi), %r8
81 addq %r8, %r9
82 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
83 orq $PAGE_ATTR, %r8
84 movq %r8, (%r9)
85
86 /* identity map the control page at its physical address */
87
88 movq $0x0000ff8000000000, %r10 /* mask */
89 mov $(39 - 3), %cl /* bits to shift */
90 movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
91
92 movq %r11, %r9
93 andq %r10, %r9
94 shrq %cl, %r9
95
96 movq PTR(VA_PGD)(%rsi), %r8
97 addq %r8, %r9
98 movq PTR(PA_PUD_1)(%rsi), %r8
99 orq $PAGE_ATTR, %r8
100 movq %r8, (%r9)
101
102 shrq $9, %r10
103 sub $9, %cl
104
105 movq %r11, %r9
106 andq %r10, %r9
107 shrq %cl, %r9
108
109 movq PTR(VA_PUD_1)(%rsi), %r8
110 addq %r8, %r9
111 movq PTR(PA_PMD_1)(%rsi), %r8
112 orq $PAGE_ATTR, %r8
113 movq %r8, (%r9)
114
115 shrq $9, %r10
116 sub $9, %cl
117
118 movq %r11, %r9
119 andq %r10, %r9
120 shrq %cl, %r9
121
122 movq PTR(VA_PMD_1)(%rsi), %r8
123 addq %r8, %r9
124 movq PTR(PA_PTE_1)(%rsi), %r8
125 orq $PAGE_ATTR, %r8
126 movq %r8, (%r9)
127
128 shrq $9, %r10
129 sub $9, %cl
130
131 movq %r11, %r9
132 andq %r10, %r9
133 shrq %cl, %r9
134
135 movq PTR(VA_PTE_1)(%rsi), %r8
136 addq %r8, %r9
137 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
138 orq $PAGE_ATTR, %r8
139 movq %r8, (%r9)
140
141relocate_new_kernel:
142 /* %rdi indirection_page
143 * %rsi page_list
144 * %rdx start address
145 */
146
147 /* zero out flags, and disable interrupts */
148 pushq $0
149 popfq
150
151 /* get physical address of control page now */
152 /* this is impossible after page table switch */
153 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
154
155 /* get physical address of page table now too */
156 movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
157
158 /* switch to new set of page tables */
159 movq PTR(PA_PGD)(%rsi), %r9
160 movq %r9, %cr3
161
162 /* setup a new stack at the end of the physical control page */
163 lea 4096(%r8), %rsp
164
165 /* jump to identity mapped page */
166 addq $(identity_mapped - relocate_kernel), %r8
167 pushq %r8
168 ret
169
170identity_mapped:
171 /* store the start address on the stack */
172 pushq %rdx
173
174 /* Set cr0 to a known state:
175 * 31 1 == Paging enabled
176 * 18 0 == Alignment check disabled
177 * 16 0 == Write protect disabled
178 * 3 0 == No task switch
179 * 2 0 == Don't do FP software emulation.
180	 * 0 1 == Protected mode enabled
181 */
182 movq %cr0, %rax
183 andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
184 orl $((1<<31)|(1<<0)), %eax
185 movq %rax, %cr0
186
187 /* Set cr4 to a known state:
188 * 10 0 == xmm exceptions disabled
189 * 9 0 == xmm registers instructions disabled
190 * 8 0 == performance monitoring counter disabled
191 * 7 0 == page global disabled
192 * 6 0 == machine check exceptions disabled
193 * 5 1 == physical address extension enabled
194 * 4 0 == page size extensions disabled
195 * 3 0 == Debug extensions disabled
196 * 2 0 == Time stamp disable (disabled)
197 * 1 0 == Protected mode virtual interrupts disabled
198 * 0 0 == VME disabled
199 */
200
201 movq $((1<<5)), %rax
202 movq %rax, %cr4
203
204 jmp 1f
2051:
206
207 /* Switch to the identity mapped page tables,
208 * and flush the TLB.
209 */
210 movq %rcx, %cr3
211
212 /* Do the copies */
213 movq %rdi, %rcx /* Put the page_list in %rcx */
214 xorq %rdi, %rdi
215 xorq %rsi, %rsi
216 jmp 1f
217
2180:	/* top, read another word from the indirection page */
219
220 movq (%rbx), %rcx
221 addq $8, %rbx
2221:
223 testq $0x1, %rcx /* is it a destination page? */
224 jz 2f
225 movq %rcx, %rdi
226 andq $0xfffffffffffff000, %rdi
227 jmp 0b
2282:
229 testq $0x2, %rcx /* is it an indirection page? */
230 jz 2f
231 movq %rcx, %rbx
232 andq $0xfffffffffffff000, %rbx
233 jmp 0b
2342:
235 testq $0x4, %rcx /* is it the done indicator? */
236 jz 2f
237 jmp 3f
2382:
239 testq $0x8, %rcx /* is it the source indicator? */
240 jz 0b /* Ignore it otherwise */
241	movq	%rcx, %rsi	/* For every source page do a copy */
242 andq $0xfffffffffffff000, %rsi
243
244 movq $512, %rcx
245 rep ; movsq
246 jmp 0b
2473:
248
249 /* To be certain of avoiding problems with self-modifying code
250 * I need to execute a serializing instruction here.
251 * So I flush the TLB by reloading %cr3 here, it's handy,
252 * and not processor dependent.
253 */
254 movq %cr3, %rax
255 movq %rax, %cr3
256
257 /* set all of the registers to known values */
258 /* leave %rsp alone */
259
260 xorq %rax, %rax
261 xorq %rbx, %rbx
262 xorq %rcx, %rcx
263 xorq %rdx, %rdx
264 xorq %rsi, %rsi
265 xorq %rdi, %rdi
266 xorq %rbp, %rbp
267 xorq %r8, %r8
268 xorq %r9, %r9
269	xorq	%r10, %r10
270 xorq %r11, %r11
271 xorq %r12, %r12
272 xorq %r13, %r13
273 xorq %r14, %r14
274 xorq %r15, %r15
275
276 ret
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c
new file mode 100644
index 000000000000..c7d3df23f589
--- /dev/null
+++ b/arch/x86/kernel/scx200_32.c
@@ -0,0 +1,131 @@
1/* linux/arch/i386/kernel/scx200.c
2
3 Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
4
5 National Semiconductor SCx200 support. */
6
7#include <linux/module.h>
8#include <linux/errno.h>
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/mutex.h>
12#include <linux/pci.h>
13
14#include <linux/scx200.h>
15#include <linux/scx200_gpio.h>
16
17/* Verify that the configuration block really is there */
18#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
19
20#define NAME "scx200"
21
22MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
23MODULE_DESCRIPTION("NatSemi SCx200 Driver");
24MODULE_LICENSE("GPL");
25
26unsigned scx200_gpio_base = 0;
27long scx200_gpio_shadow[2];
28
29unsigned scx200_cb_base = 0;
30
31static struct pci_device_id scx200_tbl[] = {
32 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
33 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
34 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) },
35 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) },
36 { },
37};
38MODULE_DEVICE_TABLE(pci,scx200_tbl);
39
40static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *);
41
42static struct pci_driver scx200_pci_driver = {
43 .name = "scx200",
44 .id_table = scx200_tbl,
45 .probe = scx200_probe,
46};
47
48static DEFINE_MUTEX(scx200_gpio_config_lock);
49
50static void __devinit scx200_init_shadow(void)
51{
52 int bank;
53
54 /* read the current values driven on the GPIO signals */
55 for (bank = 0; bank < 2; ++bank)
56 scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
57}
58
59static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
60{
61 unsigned base;
62
63 if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
64 pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
65 base = pci_resource_start(pdev, 0);
66 printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
67
68 if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) {
69 printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
70 return -EBUSY;
71 }
72
73 scx200_gpio_base = base;
74 scx200_init_shadow();
75
76 } else {
77 /* find the base of the Configuration Block */
78 if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
79 scx200_cb_base = SCx200_CB_BASE_FIXED;
80 } else {
81 pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
82 if (scx200_cb_probe(base)) {
83 scx200_cb_base = base;
84 } else {
85 printk(KERN_WARNING NAME ": Configuration Block not found\n");
86 return -ENODEV;
87 }
88 }
89 printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);
90 }
91
92 return 0;
93}
94
95u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
96{
97 u32 config, new_config;
98
99 mutex_lock(&scx200_gpio_config_lock);
100
101 outl(index, scx200_gpio_base + 0x20);
102 config = inl(scx200_gpio_base + 0x24);
103
104 new_config = (config & mask) | bits;
105 outl(new_config, scx200_gpio_base + 0x24);
106
107 mutex_unlock(&scx200_gpio_config_lock);
108
109 return config;
110}
111
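/*
 * A minimal usage sketch for scx200_gpio_configure() above (the mask/bits
 * values are made up for illustration): keep every configuration bit of
 * GPIO 7 except the low two, force those two on, and remember the previous
 * word so it can be restored later.
 *
 *	u32 old = scx200_gpio_configure(7, ~3, 3);
 *	...
 *	scx200_gpio_configure(7, 0, old);
 */
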
112static int __init scx200_init(void)
113{
114 printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
115
116 return pci_register_driver(&scx200_pci_driver);
117}
118
119static void __exit scx200_cleanup(void)
120{
121 pci_unregister_driver(&scx200_pci_driver);
122 release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
123}
124
125module_init(scx200_init);
126module_exit(scx200_cleanup);
127
128EXPORT_SYMBOL(scx200_gpio_base);
129EXPORT_SYMBOL(scx200_gpio_shadow);
130EXPORT_SYMBOL(scx200_gpio_configure);
131EXPORT_SYMBOL(scx200_cb_base);
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
new file mode 100644
index 000000000000..1200aaac403e
--- /dev/null
+++ b/arch/x86/kernel/setup64.c
@@ -0,0 +1,289 @@
1/*
2 * X86-64 specific CPU setup.
3 * Copyright (C) 1995 Linus Torvalds
4 * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
5 * See setup.c for older changelog.
6 */
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/sched.h>
10#include <linux/string.h>
11#include <linux/bootmem.h>
12#include <linux/bitops.h>
13#include <linux/module.h>
14#include <asm/bootsetup.h>
15#include <asm/pda.h>
16#include <asm/pgtable.h>
17#include <asm/processor.h>
18#include <asm/desc.h>
19#include <asm/atomic.h>
20#include <asm/mmu_context.h>
21#include <asm/smp.h>
22#include <asm/i387.h>
23#include <asm/percpu.h>
24#include <asm/proto.h>
25#include <asm/sections.h>
26
27char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
28
29cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30
31struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
32EXPORT_SYMBOL(_cpu_pda);
33struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
34
35struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
36
37char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
38
39unsigned long __supported_pte_mask __read_mostly = ~0UL;
40static int do_not_nx __cpuinitdata = 0;
41
42/* noexec=on|off
43Control non-executable mappings for 64-bit processes.
44
45on	Enable (default)
46off Disable
47*/
48static int __init nonx_setup(char *str)
49{
50 if (!str)
51 return -EINVAL;
52 if (!strncmp(str, "on", 2)) {
53 __supported_pte_mask |= _PAGE_NX;
54 do_not_nx = 0;
55 } else if (!strncmp(str, "off", 3)) {
56 do_not_nx = 1;
57 __supported_pte_mask &= ~_PAGE_NX;
58 }
59 return 0;
60}
61early_param("noexec", nonx_setup);
62
63int force_personality32 = 0;
64
65/* noexec32=on|off
66Control non-executable heap for 32-bit processes.
67To control the stack too use noexec=off
68
69on PROT_READ does not imply PROT_EXEC for 32bit processes
70off PROT_READ implies PROT_EXEC (default)
71*/
72static int __init nonx32_setup(char *str)
73{
74 if (!strcmp(str, "on"))
75 force_personality32 &= ~READ_IMPLIES_EXEC;
76 else if (!strcmp(str, "off"))
77 force_personality32 |= READ_IMPLIES_EXEC;
78 return 1;
79}
80__setup("noexec32=", nonx32_setup);
81
82/*
83 * Great future plan:
84 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
85 * Always point %gs to its beginning
86 */
87void __init setup_per_cpu_areas(void)
88{
89 int i;
90 unsigned long size;
91
92#ifdef CONFIG_HOTPLUG_CPU
93 prefill_possible_map();
94#endif
95
96 /* Copy section for each CPU (we discard the original) */
97 size = PERCPU_ENOUGH_ROOM;
98
99 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
100 for_each_cpu_mask (i, cpu_possible_map) {
101 char *ptr;
102
103 if (!NODE_DATA(cpu_to_node(i))) {
104 printk("cpu with no node %d, num_online_nodes %d\n",
105 i, num_online_nodes());
106 ptr = alloc_bootmem_pages(size);
107 } else {
108 ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
109 }
110 if (!ptr)
111 panic("Cannot allocate cpu data for CPU %d\n", i);
112 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
113 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
114 }
115}
116
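/*
 * (Illustrative note: with data_offset filled in above, a CPU's copy of a
 * per-CPU variable is just the reference copy in .data.percpu shifted by
 * that offset -- roughly, with a hypothetical helper:
 *
 *	#define per_cpu_sketch(var, cpu) \
 *		(*(typeof(&(var)))((char *)&(var) + cpu_pda(cpu)->data_offset))
 * )
 */
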
117void pda_init(int cpu)
118{
119 struct x8664_pda *pda = cpu_pda(cpu);
120
121	/* Set up data that may be needed in __get_free_pages early */
122 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
123	/* Memory clobbers used to order PDA accesses */
124 mb();
125 wrmsrl(MSR_GS_BASE, pda);
126 mb();
127
128 pda->cpunumber = cpu;
129 pda->irqcount = -1;
130 pda->kernelstack =
131 (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
132 pda->active_mm = &init_mm;
133 pda->mmu_state = 0;
134
135 if (cpu == 0) {
136 /* others are initialized in smpboot.c */
137 pda->pcurrent = &init_task;
138 pda->irqstackptr = boot_cpu_stack;
139 } else {
140 pda->irqstackptr = (char *)
141 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
142 if (!pda->irqstackptr)
143 panic("cannot allocate irqstack for cpu %d", cpu);
144 }
145
146
147 pda->irqstackptr += IRQSTACKSIZE-64;
148}
149
150char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
151__attribute__((section(".bss.page_aligned")));
152
153extern asmlinkage void ignore_sysret(void);
154
155/* May not be marked __init: used by software suspend */
156void syscall_init(void)
157{
158 /*
159 * LSTAR and STAR live in a somewhat strange symbiosis.
160 * They both write to the same internal register. STAR allows setting CS/DS
161 * but only a 32-bit target. LSTAR sets the 64-bit rip.
162 */
163 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
164 wrmsrl(MSR_LSTAR, system_call);
165 wrmsrl(MSR_CSTAR, ignore_sysret);
166
167#ifdef CONFIG_IA32_EMULATION
168 syscall32_cpu_init ();
169#endif
170
171 /* Flags to clear on syscall */
172 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
173}
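/*
 * The MSR_STAR write above packs two code segment selectors into one MSR:
 * bits 63..48 hold the selector base used for 32-bit SYSRET and bits 47..32
 * the kernel selector used on SYSCALL.  A user-space sketch of the packing;
 * the selector values are illustrative stand-ins, not this kernel's actual
 * __USER32_CS/__KERNEL_CS definitions.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t user32_cs = 0x23;	/* illustrative __USER32_CS */
	uint64_t kernel_cs = 0x10;	/* illustrative __KERNEL_CS */
	uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

	/* prints 0x0023001000000000 with the values above */
	printf("MSR_STAR = %#018llx\n", (unsigned long long)star);
	return 0;
}
#endif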
174
175void __cpuinit check_efer(void)
176{
177 unsigned long efer;
178
179 rdmsrl(MSR_EFER, efer);
180 if (!(efer & EFER_NX) || do_not_nx) {
181 __supported_pte_mask &= ~_PAGE_NX;
182 }
183}
184
185unsigned long kernel_eflags;
186
187/*
188 * cpu_init() initializes state that is per-CPU. Some data is already
189 * initialized (naturally) in the bootstrap process, such as the GDT
190 * and IDT. We reload them nevertheless; this function acts as a
191 * 'CPU state barrier': nothing should get across it.
192 * A lot of state is already set up in PDA init.
193 */
194void __cpuinit cpu_init (void)
195{
196 int cpu = stack_smp_processor_id();
197 struct tss_struct *t = &per_cpu(init_tss, cpu);
198 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
199 unsigned long v;
200 char *estacks = NULL;
201 struct task_struct *me;
202 int i;
203
204 /* CPU 0 is initialised in head64.c */
205 if (cpu != 0) {
206 pda_init(cpu);
207 } else
208 estacks = boot_exception_stacks;
209
210 me = current;
211
212 if (cpu_test_and_set(cpu, cpu_initialized))
213 panic("CPU#%d already initialized!\n", cpu);
214
215 printk("Initializing CPU#%d\n", cpu);
216
217 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
218
219 /*
220 * Initialize the per-CPU GDT with the boot GDT,
221 * and set up the GDT descriptor:
222 */
223 if (cpu)
224 memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
225
226 cpu_gdt_descr[cpu].size = GDT_SIZE;
227 asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
228 asm volatile("lidt %0" :: "m" (idt_descr));
229
230 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
231 syscall_init();
232
233 wrmsrl(MSR_FS_BASE, 0);
234 wrmsrl(MSR_KERNEL_GS_BASE, 0);
235 barrier();
236
237 check_efer();
238
239 /*
240 * set up and load the per-CPU TSS
241 */
242 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
243 static const unsigned int order[N_EXCEPTION_STACKS] = {
244 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
245 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
246 };
247 if (cpu) {
248 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
249 if (!estacks)
250 panic("Cannot allocate exception stack %ld %d\n",
251 v, cpu);
252 }
253 estacks += PAGE_SIZE << order[v];
254 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
255 }
256
257 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
258 /*
259 * <= is required because the CPU will access up to
260 * 8 bits beyond the end of the IO permission bitmap.
261 */
262 for (i = 0; i <= IO_BITMAP_LONGS; i++)
263 t->io_bitmap[i] = ~0UL;
264
265 atomic_inc(&init_mm.mm_count);
266 me->active_mm = &init_mm;
267 if (me->mm)
268 BUG();
269 enter_lazy_tlb(&init_mm, me);
270
271 set_tss_desc(cpu, t);
272 load_TR_desc();
273 load_LDT(&init_mm.context);
274
275 /*
276 * Clear all 6 debug registers:
277 */
278
279 set_debugreg(0UL, 0);
280 set_debugreg(0UL, 1);
281 set_debugreg(0UL, 2);
282 set_debugreg(0UL, 3);
283 set_debugreg(0UL, 6);
284 set_debugreg(0UL, 7);
285
286 fpu_init();
287
288 raw_local_save_flags(kernel_eflags);
289}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
new file mode 100644
index 000000000000..d474cd639bcb
--- /dev/null
+++ b/arch/x86/kernel/setup_32.c
@@ -0,0 +1,653 @@
1/*
2 * linux/arch/i386/kernel/setup.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
7 *
8 * Memory region support
9 * David Parsons <orc@pell.chi.il.us>, July-August 1999
10 *
11 * Added E820 sanitization routine (removes overlapping memory regions);
12 * Brian Moyle <bmoyle@mvista.com>, February 2001
13 *
14 * Moved CPU detection code to cpu/${cpu}.c
15 * Patrick Mochel <mochel@osdl.org>, March 2002
16 *
17 * Provisions for empty E820 memory regions (reported by certain BIOSes).
18 * Alex Achenbach <xela@slit.de>, December 2002.
19 *
20 */
21
22/*
23 * This file handles the architecture-dependent parts of initialization
24 */
25
26#include <linux/sched.h>
27#include <linux/mm.h>
28#include <linux/mmzone.h>
29#include <linux/screen_info.h>
30#include <linux/ioport.h>
31#include <linux/acpi.h>
32#include <linux/apm_bios.h>
33#include <linux/initrd.h>
34#include <linux/bootmem.h>
35#include <linux/seq_file.h>
36#include <linux/console.h>
37#include <linux/mca.h>
38#include <linux/root_dev.h>
39#include <linux/highmem.h>
40#include <linux/module.h>
41#include <linux/efi.h>
42#include <linux/init.h>
43#include <linux/edd.h>
44#include <linux/nodemask.h>
45#include <linux/kexec.h>
46#include <linux/crash_dump.h>
47#include <linux/dmi.h>
48#include <linux/pfn.h>
49
50#include <video/edid.h>
51
52#include <asm/apic.h>
53#include <asm/e820.h>
54#include <asm/mpspec.h>
55#include <asm/mmzone.h>
56#include <asm/setup.h>
57#include <asm/arch_hooks.h>
58#include <asm/sections.h>
59#include <asm/io_apic.h>
60#include <asm/ist.h>
61#include <asm/io.h>
62#include <asm/vmi.h>
63#include <setup_arch.h>
64#include <bios_ebda.h>
65
66/* This value is set up by the early boot code to point to the address
67 immediately after the boot-time page tables. It contains a *physical*
68 address, and must not be in the .bss segment! */
69unsigned long init_pg_tables_end __initdata = ~0UL;
70
71int disable_pse __devinitdata = 0;
72
73/*
74 * Machine setup..
75 */
76extern struct resource code_resource;
77extern struct resource data_resource;
78
79/* cpu data as detected by the assembly code in head.S */
80struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
81/* common cpu data for all cpus */
82struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
83EXPORT_SYMBOL(boot_cpu_data);
84
85unsigned long mmu_cr4_features;
86
87/* for MCA, but anyone else can use it if they want */
88unsigned int machine_id;
89#ifdef CONFIG_MCA
90EXPORT_SYMBOL(machine_id);
91#endif
92unsigned int machine_submodel_id;
93unsigned int BIOS_revision;
94unsigned int mca_pentium_flag;
95
96/* Boot loader ID as an integer, for the benefit of proc_dointvec */
97int bootloader_type;
98
99/* user-defined highmem size */
100static unsigned int highmem_pages = -1;
101
102/*
103 * Setup options
104 */
105struct screen_info screen_info;
106EXPORT_SYMBOL(screen_info);
107struct apm_info apm_info;
108EXPORT_SYMBOL(apm_info);
109struct edid_info edid_info;
110EXPORT_SYMBOL_GPL(edid_info);
111struct ist_info ist_info;
112#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
113 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
114EXPORT_SYMBOL(ist_info);
115#endif
116
117extern void early_cpu_init(void);
118extern int root_mountflags;
119
120unsigned long saved_videomode;
121
122#define RAMDISK_IMAGE_START_MASK 0x07FF
123#define RAMDISK_PROMPT_FLAG 0x8000
124#define RAMDISK_LOAD_FLAG 0x4000
125
126static char __initdata command_line[COMMAND_LINE_SIZE];
127
128struct boot_params __initdata boot_params;
129
130#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
131struct edd edd;
132#ifdef CONFIG_EDD_MODULE
133EXPORT_SYMBOL(edd);
134#endif
135/**
136 * copy_edd() - Copy the BIOS EDD information
137 * from boot_params into a safe place.
138 *
139 */
140static inline void copy_edd(void)
141{
142 memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
143 memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
144 edd.mbr_signature_nr = EDD_MBR_SIG_NR;
145 edd.edd_info_nr = EDD_NR;
146}
147#else
148static inline void copy_edd(void)
149{
150}
151#endif
152
153int __initdata user_defined_memmap = 0;
154
155/*
156 * "mem=nopentium" disables the 4MB page tables.
157 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
158 * to <mem>, overriding the bios size.
159 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
160 * <start> to <start>+<mem>, overriding the bios size.
161 *
162 * HPA tells me bootloaders need to parse mem=, so no new
163 * option should reuse the mem= name [also see Documentation/i386/boot.txt]
164 */
165static int __init parse_mem(char *arg)
166{
167 if (!arg)
168 return -EINVAL;
169
170 if (strcmp(arg, "nopentium") == 0) {
171 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
172 disable_pse = 1;
173 } else {
174 /* If the user specifies memory size, we
175 * limit the BIOS-provided memory map to
176 * that size. exactmap can be used to specify
177 * the exact map. mem=number can be used to
178 * trim the existing memory map.
179 */
180 unsigned long long mem_size;
181
182 mem_size = memparse(arg, &arg);
183 limit_regions(mem_size);
184 user_defined_memmap = 1;
185 }
186 return 0;
187}
188early_param("mem", parse_mem);
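/*
 * memparse() turns a string such as "512M" or "2G" into a byte count before
 * limit_regions() runs above.  A rough user-space sketch of that suffix
 * handling (simplified; the real helper lives in lib/cmdline.c and the
 * sketch_ name is local to this illustration):
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

static unsigned long long sketch_memparse(const char *s)
{
	char *end;
	unsigned long long val = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': val <<= 10;	/* fall through */
	case 'M': case 'm': val <<= 10;	/* fall through */
	case 'K': case 'k': val <<= 10;
	}
	return val;
}

int main(void)
{
	printf("mem=512M -> %llu bytes\n", sketch_memparse("512M"));	/* 536870912 */
	printf("mem=2G   -> %llu bytes\n", sketch_memparse("2G"));	/* 2147483648 */
	return 0;
}
#endif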
189
190#ifdef CONFIG_PROC_VMCORE
191/* elfcorehdr= specifies the location of the ELF core header
192 * stored by the crashed kernel.
193 */
194static int __init parse_elfcorehdr(char *arg)
195{
196 if (!arg)
197 return -EINVAL;
198
199 elfcorehdr_addr = memparse(arg, &arg);
200 return 0;
201}
202early_param("elfcorehdr", parse_elfcorehdr);
203#endif /* CONFIG_PROC_VMCORE */
204
205/*
206 * highmem=size forces highmem to be exactly 'size' bytes.
207 * This works even on boxes that have no highmem otherwise.
208 * This also works to reduce highmem size on bigger boxes.
209 */
210static int __init parse_highmem(char *arg)
211{
212 if (!arg)
213 return -EINVAL;
214
215 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
216 return 0;
217}
218early_param("highmem", parse_highmem);
219
220/*
221 * vmalloc=size forces the vmalloc area to be exactly 'size'
222 * bytes. This can be used to increase (or decrease) the
223 * vmalloc area - the default is 128m.
224 */
225static int __init parse_vmalloc(char *arg)
226{
227 if (!arg)
228 return -EINVAL;
229
230 __VMALLOC_RESERVE = memparse(arg, &arg);
231 return 0;
232}
233early_param("vmalloc", parse_vmalloc);
234
235/*
236 * reservetop=size reserves a hole at the top of the kernel address space which
237 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
238 * so relocating the fixmap can be done before paging initialization.
239 */
240static int __init parse_reservetop(char *arg)
241{
242 unsigned long address;
243
244 if (!arg)
245 return -EINVAL;
246
247 address = memparse(arg, &arg);
248 reserve_top_address(address);
249 return 0;
250}
251early_param("reservetop", parse_reservetop);
252
253/*
254 * Determine low and high memory ranges:
255 */
256unsigned long __init find_max_low_pfn(void)
257{
258 unsigned long max_low_pfn;
259
260 max_low_pfn = max_pfn;
261 if (max_low_pfn > MAXMEM_PFN) {
262 if (highmem_pages == -1)
263 highmem_pages = max_pfn - MAXMEM_PFN;
264 if (highmem_pages + MAXMEM_PFN < max_pfn)
265 max_pfn = MAXMEM_PFN + highmem_pages;
266 if (highmem_pages + MAXMEM_PFN > max_pfn) {
267 printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
268 highmem_pages = 0;
269 }
270 max_low_pfn = MAXMEM_PFN;
271#ifndef CONFIG_HIGHMEM
272 /* Maximum memory usable is what is directly addressable */
273 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
274 MAXMEM>>20);
275 if (max_pfn > MAX_NONPAE_PFN)
276 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
277 else
278 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
279 max_pfn = MAXMEM_PFN;
280#else /* !CONFIG_HIGHMEM */
281#ifndef CONFIG_HIGHMEM64G
282 if (max_pfn > MAX_NONPAE_PFN) {
283 max_pfn = MAX_NONPAE_PFN;
284 printk(KERN_WARNING "Warning only 4GB will be used.\n");
285 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
286 }
287#endif /* !CONFIG_HIGHMEM64G */
288#endif /* !CONFIG_HIGHMEM */
289 } else {
290 if (highmem_pages == -1)
291 highmem_pages = 0;
292#ifdef CONFIG_HIGHMEM
293 if (highmem_pages >= max_pfn) {
294 printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
295 highmem_pages = 0;
296 }
297 if (highmem_pages) {
298 if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
299 printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
300 highmem_pages = 0;
301 }
302 max_low_pfn -= highmem_pages;
303 }
304#else
305 if (highmem_pages)
306 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
307#endif
308 }
309 return max_low_pfn;
310}
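/*
 * Worked example of the split above, under the usual assumption of a
 * ~896MB lowmem limit (MAXMEM_PFN == 0x38000) and 2GB of RAM
 * (max_pfn == 0x80000) with no highmem= override: highmem_pages defaults
 * to 0x80000 - 0x38000 == 0x48000 pages (~1152MB), max_low_pfn is clamped
 * to 0x38000, and the remainder goes to the highmem zone.
 */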
311
312/*
313 * workaround for Dell systems that neglect to reserve EBDA
314 */
315static void __init reserve_ebda_region(void)
316{
317 unsigned int addr;
318 addr = get_bios_ebda();
319 if (addr)
320 reserve_bootmem(addr, PAGE_SIZE);
321}
322
323#ifndef CONFIG_NEED_MULTIPLE_NODES
324void __init setup_bootmem_allocator(void);
325static unsigned long __init setup_memory(void)
326{
327 /*
328 * partially used pages are not usable - thus
329 * we are rounding upwards:
330 */
331 min_low_pfn = PFN_UP(init_pg_tables_end);
332
333 find_max_pfn();
334
335 max_low_pfn = find_max_low_pfn();
336
337#ifdef CONFIG_HIGHMEM
338 highstart_pfn = highend_pfn = max_pfn;
339 if (max_pfn > max_low_pfn) {
340 highstart_pfn = max_low_pfn;
341 }
342 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
343 pages_to_mb(highend_pfn - highstart_pfn));
344 num_physpages = highend_pfn;
345 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
346#else
347 num_physpages = max_low_pfn;
348 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
349#endif
350#ifdef CONFIG_FLATMEM
351 max_mapnr = num_physpages;
352#endif
353 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
354 pages_to_mb(max_low_pfn));
355
356 setup_bootmem_allocator();
357
358 return max_low_pfn;
359}
360
361void __init zone_sizes_init(void)
362{
363 unsigned long max_zone_pfns[MAX_NR_ZONES];
364 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
365 max_zone_pfns[ZONE_DMA] =
366 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
367 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
368#ifdef CONFIG_HIGHMEM
369 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
370 add_active_range(0, 0, highend_pfn);
371#else
372 add_active_range(0, 0, max_low_pfn);
373#endif
374
375 free_area_init_nodes(max_zone_pfns);
376}
377#else
378extern unsigned long __init setup_memory(void);
379extern void zone_sizes_init(void);
380#endif /* !CONFIG_NEED_MULTIPLE_NODES */
381
382void __init setup_bootmem_allocator(void)
383{
384 unsigned long bootmap_size;
385 /*
386 * Initialize the boot-time allocator (with low memory only):
387 */
388 bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
389
390 register_bootmem_low_pages(max_low_pfn);
391
392 /*
393 * Reserve the bootmem bitmap itself as well. We do this in two
394 * steps (first step was init_bootmem()) because this catches
395 * the (very unlikely) case of us accidentally initializing the
396 * bootmem allocator with an invalid RAM area.
397 */
398 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
399 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
400
401 /*
402 * reserve physical page 0 - it's a special BIOS page on many boxes,
403 * enabling clean reboots, SMP operation, laptop functions.
404 */
405 reserve_bootmem(0, PAGE_SIZE);
406
407 /* reserve EBDA region, it's a 4K region */
408 reserve_ebda_region();
409
410 /* This could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
411 PCI prefetch into it (errata #56). Usually the page is reserved anyway,
412 unless no PS/2 mouse is plugged in. */
413 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
414 boot_cpu_data.x86 == 6)
415 reserve_bootmem(0xa0000 - 4096, 4096);
416
417#ifdef CONFIG_SMP
418 /*
419 * But first pinch a few for the stack/trampoline stuff
420 * FIXME: Don't need the extra page at 4K, but need to fix
421 * trampoline before removing it. (see the GDT stuff)
422 */
423 reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
424#endif
425#ifdef CONFIG_ACPI_SLEEP
426 /*
427 * Reserve low memory region for sleep support.
428 */
429 acpi_reserve_bootmem();
430#endif
431#ifdef CONFIG_X86_FIND_SMP_CONFIG
432 /*
433 * Find and reserve possible boot-time SMP configuration:
434 */
435 find_smp_config();
436#endif
437 numa_kva_reserve();
438#ifdef CONFIG_BLK_DEV_INITRD
439 if (LOADER_TYPE && INITRD_START) {
440 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
441 reserve_bootmem(INITRD_START, INITRD_SIZE);
442 initrd_start = INITRD_START + PAGE_OFFSET;
443 initrd_end = initrd_start+INITRD_SIZE;
444 }
445 else {
446 printk(KERN_ERR "initrd extends beyond end of memory "
447 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
448 INITRD_START + INITRD_SIZE,
449 max_low_pfn << PAGE_SHIFT);
450 initrd_start = 0;
451 }
452 }
453#endif
454#ifdef CONFIG_KEXEC
455 if (crashk_res.start != crashk_res.end)
456 reserve_bootmem(crashk_res.start,
457 crashk_res.end - crashk_res.start + 1);
458#endif
459}
460
461/*
462 * The node 0 pgdat is initialized before all of these because
463 * it's needed for bootmem. node>0 pgdats have their virtual
464 * space allocated before the pagetables are in place to access
465 * them, so they can't be cleared then.
466 *
467 * This should all compile down to nothing when NUMA is off.
468 */
469static void __init remapped_pgdat_init(void)
470{
471 int nid;
472
473 for_each_online_node(nid) {
474 if (nid != 0)
475 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
476 }
477}
478
479#ifdef CONFIG_MCA
480static void set_mca_bus(int x)
481{
482 MCA_bus = x;
483}
484#else
485static void set_mca_bus(int x) { }
486#endif
487
488/* Overridden in paravirt.c if CONFIG_PARAVIRT */
489char * __init __attribute__((weak)) memory_setup(void)
490{
491 return machine_specific_memory_setup();
492}
493
494/*
495 * Determine if we were loaded by an EFI loader. If so, then we have also been
496 * passed the efi memmap, systab, etc., so we should use these data structures
497 * for initialization. Note, the efi init code path is determined by the
498 * global efi_enabled. This allows the same kernel image to be used on existing
499 * systems (with a traditional BIOS) as well as on EFI systems.
500 */
501void __init setup_arch(char **cmdline_p)
502{
503 unsigned long max_low_pfn;
504
505 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
506 pre_setup_arch_hook();
507 early_cpu_init();
508
509 /*
510 * FIXME: This isn't an official loader_type right
511 * now but does currently work with elilo.
512 * If we were configured as an EFI kernel, check to make
513 * sure that we were loaded correctly from elilo and that
514 * the system table is valid. If not, then initialize normally.
515 */
516#ifdef CONFIG_EFI
517 if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
518 efi_enabled = 1;
519#endif
520
521 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
522 screen_info = SCREEN_INFO;
523 edid_info = EDID_INFO;
524 apm_info.bios = APM_BIOS_INFO;
525 ist_info = IST_INFO;
526 saved_videomode = VIDEO_MODE;
527 if( SYS_DESC_TABLE.length != 0 ) {
528 set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
529 machine_id = SYS_DESC_TABLE.table[0];
530 machine_submodel_id = SYS_DESC_TABLE.table[1];
531 BIOS_revision = SYS_DESC_TABLE.table[2];
532 }
533 bootloader_type = LOADER_TYPE;
534
535#ifdef CONFIG_BLK_DEV_RAM
536 rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
537 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
538 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
539#endif
540 ARCH_SETUP
541 if (efi_enabled)
542 efi_init();
543 else {
544 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
545 print_memory_map(memory_setup());
546 }
547
548 copy_edd();
549
550 if (!MOUNT_ROOT_RDONLY)
551 root_mountflags &= ~MS_RDONLY;
552 init_mm.start_code = (unsigned long) _text;
553 init_mm.end_code = (unsigned long) _etext;
554 init_mm.end_data = (unsigned long) _edata;
555 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
556
557 code_resource.start = virt_to_phys(_text);
558 code_resource.end = virt_to_phys(_etext)-1;
559 data_resource.start = virt_to_phys(_etext);
560 data_resource.end = virt_to_phys(_edata)-1;
561
562 parse_early_param();
563
564 if (user_defined_memmap) {
565 printk(KERN_INFO "user-defined physical RAM map:\n");
566 print_memory_map("user");
567 }
568
569 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
570 *cmdline_p = command_line;
571
572 max_low_pfn = setup_memory();
573
574#ifdef CONFIG_VMI
575 /*
576 * Must be after max_low_pfn is determined, and before kernel
577 * pagetables are setup.
578 */
579 vmi_init();
580#endif
581
582 /*
583 * NOTE: before this point _nobody_ is allowed to allocate
584 * any memory using the bootmem allocator. Although the
585 * allocator is now initialised, only the first 8MB of the kernel
586 * virtual address space has been mapped. All allocations before
587 * paging_init() has completed must use the alloc_bootmem_low_pages()
588 * variant (which allocates DMA'able memory), and care must be taken
589 * not to exceed the 8MB limit.
590 */
591
592#ifdef CONFIG_SMP
593 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
594#endif
595 paging_init();
596 remapped_pgdat_init();
597 sparse_init();
598 zone_sizes_init();
599
600 /*
601 * NOTE: at this point the bootmem allocator is fully available.
602 */
603
604 paravirt_post_allocator_init();
605
606 dmi_scan_machine();
607
608#ifdef CONFIG_X86_GENERICARCH
609 generic_apic_probe();
610#endif
611 if (efi_enabled)
612 efi_map_memmap();
613
614#ifdef CONFIG_ACPI
615 /*
616 * Parse the ACPI tables for possible boot-time SMP configuration.
617 */
618 acpi_boot_table_init();
619#endif
620
621#ifdef CONFIG_PCI
622#ifdef CONFIG_X86_IO_APIC
623 check_acpi_pci(); /* Checks more than just ACPI actually */
624#endif
625#endif
626
627#ifdef CONFIG_ACPI
628 acpi_boot_init();
629
630#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
631 if (def_to_bigsmp)
632 printk(KERN_WARNING "More than 8 CPUs detected and "
633 "CONFIG_X86_PC cannot handle it.\nUse "
634 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
635#endif
636#endif
637#ifdef CONFIG_X86_LOCAL_APIC
638 if (smp_found_config)
639 get_smp_config();
640#endif
641
642 e820_register_memory();
643 e820_mark_nosave_regions();
644
645#ifdef CONFIG_VT
646#if defined(CONFIG_VGA_CONSOLE)
647 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
648 conswitchp = &vga_con;
649#elif defined(CONFIG_DUMMY_CONSOLE)
650 conswitchp = &dummy_con;
651#endif
652#endif
653}
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
new file mode 100644
index 000000000000..af838f6b0b7f
--- /dev/null
+++ b/arch/x86/kernel/setup_64.c
@@ -0,0 +1,1117 @@
1/*
2 * linux/arch/x86-64/kernel/setup.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Nov 2001 Dave Jones <davej@suse.de>
7 * Forked from i386 setup code.
8 */
9
10/*
11 * This file handles the architecture-dependent parts of initialization
12 */
13
14#include <linux/errno.h>
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/mm.h>
18#include <linux/stddef.h>
19#include <linux/unistd.h>
20#include <linux/ptrace.h>
21#include <linux/slab.h>
22#include <linux/user.h>
23#include <linux/a.out.h>
24#include <linux/screen_info.h>
25#include <linux/ioport.h>
26#include <linux/delay.h>
27#include <linux/init.h>
28#include <linux/initrd.h>
29#include <linux/highmem.h>
30#include <linux/bootmem.h>
31#include <linux/module.h>
32#include <asm/processor.h>
33#include <linux/console.h>
34#include <linux/seq_file.h>
35#include <linux/crash_dump.h>
36#include <linux/root_dev.h>
37#include <linux/pci.h>
38#include <linux/acpi.h>
39#include <linux/kallsyms.h>
40#include <linux/edd.h>
41#include <linux/mmzone.h>
42#include <linux/kexec.h>
43#include <linux/cpufreq.h>
44#include <linux/dmi.h>
45#include <linux/dma-mapping.h>
46#include <linux/ctype.h>
47
48#include <asm/mtrr.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/io.h>
52#include <asm/smp.h>
53#include <asm/msr.h>
54#include <asm/desc.h>
55#include <video/edid.h>
56#include <asm/e820.h>
57#include <asm/dma.h>
58#include <asm/mpspec.h>
59#include <asm/mmu_context.h>
60#include <asm/bootsetup.h>
61#include <asm/proto.h>
62#include <asm/setup.h>
63#include <asm/mach_apic.h>
64#include <asm/numa.h>
65#include <asm/sections.h>
66#include <asm/dmi.h>
67
68/*
69 * Machine setup..
70 */
71
72struct cpuinfo_x86 boot_cpu_data __read_mostly;
73EXPORT_SYMBOL(boot_cpu_data);
74
75unsigned long mmu_cr4_features;
76
77/* Boot loader ID as an integer, for the benefit of proc_dointvec */
78int bootloader_type;
79
80unsigned long saved_video_mode;
81
82int force_mwait __cpuinitdata;
83
84/*
85 * Early DMI memory
86 */
87int dmi_alloc_index;
88char dmi_alloc_data[DMI_MAX_DATA];
89
90/*
91 * Setup options
92 */
93struct screen_info screen_info;
94EXPORT_SYMBOL(screen_info);
95struct sys_desc_table_struct {
96 unsigned short length;
97 unsigned char table[0];
98};
99
100struct edid_info edid_info;
101EXPORT_SYMBOL_GPL(edid_info);
102
103extern int root_mountflags;
104
105char __initdata command_line[COMMAND_LINE_SIZE];
106
107struct resource standard_io_resources[] = {
108 { .name = "dma1", .start = 0x00, .end = 0x1f,
109 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
110 { .name = "pic1", .start = 0x20, .end = 0x21,
111 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
112 { .name = "timer0", .start = 0x40, .end = 0x43,
113 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
114 { .name = "timer1", .start = 0x50, .end = 0x53,
115 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
116 { .name = "keyboard", .start = 0x60, .end = 0x6f,
117 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
118 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
119 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
120 { .name = "pic2", .start = 0xa0, .end = 0xa1,
121 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
122 { .name = "dma2", .start = 0xc0, .end = 0xdf,
123 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
124 { .name = "fpu", .start = 0xf0, .end = 0xff,
125 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
126};
127
128#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
129
130struct resource data_resource = {
131 .name = "Kernel data",
132 .start = 0,
133 .end = 0,
134 .flags = IORESOURCE_RAM,
135};
136struct resource code_resource = {
137 .name = "Kernel code",
138 .start = 0,
139 .end = 0,
140 .flags = IORESOURCE_RAM,
141};
142
143#ifdef CONFIG_PROC_VMCORE
144/* elfcorehdr= specifies the location of elf core header
145 * stored by the crashed kernel. This option will be passed
146 * by kexec loader to the capture kernel.
147 */
148static int __init setup_elfcorehdr(char *arg)
149{
150 char *end;
151 if (!arg)
152 return -EINVAL;
153 elfcorehdr_addr = memparse(arg, &end);
154 return end > arg ? 0 : -EINVAL;
155}
156early_param("elfcorehdr", setup_elfcorehdr);
157#endif
158
159#ifndef CONFIG_NUMA
160static void __init
161contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
162{
163 unsigned long bootmap_size, bootmap;
164
165 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
166 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
167 if (bootmap == -1L)
168 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
169 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
170 e820_register_active_regions(0, start_pfn, end_pfn);
171 free_bootmem_with_active_regions(0, end_pfn);
172 reserve_bootmem(bootmap, bootmap_size);
173}
174#endif
175
176#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
177struct edd edd;
178#ifdef CONFIG_EDD_MODULE
179EXPORT_SYMBOL(edd);
180#endif
181/**
182 * copy_edd() - Copy the BIOS EDD information
183 * from boot_params into a safe place.
184 *
185 */
186static inline void copy_edd(void)
187{
188 memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
189 memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
190 edd.mbr_signature_nr = EDD_MBR_SIG_NR;
191 edd.edd_info_nr = EDD_NR;
192}
193#else
194static inline void copy_edd(void)
195{
196}
197#endif
198
199#define EBDA_ADDR_POINTER 0x40E
200
201unsigned __initdata ebda_addr;
202unsigned __initdata ebda_size;
203
204static void discover_ebda(void)
205{
206 /*
207 * there is a real-mode segmented pointer pointing to the
208 * 4K EBDA area at 0x40E
209 */
210 ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
211 ebda_addr <<= 4;
212
213 ebda_size = *(unsigned short *)__va(ebda_addr);
214
215 /* Round EBDA up to pages */
216 if (ebda_size == 0)
217 ebda_size = 1;
218 ebda_size <<= 10;
219 ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
220 if (ebda_size > 64*1024)
221 ebda_size = 64*1024;
222}
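/*
 * Worked example of the real-mode pointer arithmetic above: if the 16-bit
 * word at 0x40E happened to read 0x9fc0 (an illustrative value), the EBDA
 * would sit at 0x9fc0 << 4 == 0x9fc00, and the word found there would give
 * its size in KB, which is then rounded up to whole pages and capped at 64KB.
 */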
223
224void __init setup_arch(char **cmdline_p)
225{
226 printk(KERN_INFO "Command line: %s\n", boot_command_line);
227
228 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
229 screen_info = SCREEN_INFO;
230 edid_info = EDID_INFO;
231 saved_video_mode = SAVED_VIDEO_MODE;
232 bootloader_type = LOADER_TYPE;
233
234#ifdef CONFIG_BLK_DEV_RAM
235 rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
236 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
237 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
238#endif
239 setup_memory_region();
240 copy_edd();
241
242 if (!MOUNT_ROOT_RDONLY)
243 root_mountflags &= ~MS_RDONLY;
244 init_mm.start_code = (unsigned long) &_text;
245 init_mm.end_code = (unsigned long) &_etext;
246 init_mm.end_data = (unsigned long) &_edata;
247 init_mm.brk = (unsigned long) &_end;
248
249 code_resource.start = virt_to_phys(&_text);
250 code_resource.end = virt_to_phys(&_etext)-1;
251 data_resource.start = virt_to_phys(&_etext);
252 data_resource.end = virt_to_phys(&_edata)-1;
253
254 early_identify_cpu(&boot_cpu_data);
255
256 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
257 *cmdline_p = command_line;
258
259 parse_early_param();
260
261 finish_e820_parsing();
262
263 e820_register_active_regions(0, 0, -1UL);
264 /*
265 * partially used pages are not usable - thus
266 * we are rounding upwards:
267 */
268 end_pfn = e820_end_of_ram();
269 num_physpages = end_pfn;
270
271 check_efer();
272
273 discover_ebda();
274
275 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
276
277 dmi_scan_machine();
278
279#ifdef CONFIG_ACPI
280 /*
281 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
282 * Call this early for SRAT node setup.
283 */
284 acpi_boot_table_init();
285#endif
286
287 /* How many end-of-memory variables you have, grandma! */
288 max_low_pfn = end_pfn;
289 max_pfn = end_pfn;
290 high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
291
292 /* Remove active ranges so rediscovery with NUMA-awareness happens */
293 remove_all_active_ranges();
294
295#ifdef CONFIG_ACPI_NUMA
296 /*
297 * Parse SRAT to discover nodes.
298 */
299 acpi_numa_init();
300#endif
301
302#ifdef CONFIG_NUMA
303 numa_initmem_init(0, end_pfn);
304#else
305 contig_initmem_init(0, end_pfn);
306#endif
307
308 /* Reserve direct mapping */
309 reserve_bootmem_generic(table_start << PAGE_SHIFT,
310 (table_end - table_start) << PAGE_SHIFT);
311
312 /* reserve kernel */
313 reserve_bootmem_generic(__pa_symbol(&_text),
314 __pa_symbol(&_end) - __pa_symbol(&_text));
315
316 /*
317 * reserve physical page 0 - it's a special BIOS page on many boxes,
318 * enabling clean reboots, SMP operation, laptop functions.
319 */
320 reserve_bootmem_generic(0, PAGE_SIZE);
321
322 /* reserve ebda region */
323 if (ebda_addr)
324 reserve_bootmem_generic(ebda_addr, ebda_size);
325#ifdef CONFIG_NUMA
326 /* reserve nodemap region */
327 if (nodemap_addr)
328 reserve_bootmem_generic(nodemap_addr, nodemap_size);
329#endif
330
331#ifdef CONFIG_SMP
332 /* Reserve SMP trampoline */
333 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
334#endif
335
336#ifdef CONFIG_ACPI_SLEEP
337 /*
338 * Reserve low memory region for sleep support.
339 */
340 acpi_reserve_bootmem();
341#endif
342 /*
343 * Find and reserve possible boot-time SMP configuration:
344 */
345 find_smp_config();
346#ifdef CONFIG_BLK_DEV_INITRD
347 if (LOADER_TYPE && INITRD_START) {
348 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
349 reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
350 initrd_start = INITRD_START + PAGE_OFFSET;
351 initrd_end = initrd_start+INITRD_SIZE;
352 }
353 else {
354 printk(KERN_ERR "initrd extends beyond end of memory "
355 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
356 (unsigned long)(INITRD_START + INITRD_SIZE),
357 (unsigned long)(end_pfn << PAGE_SHIFT));
358 initrd_start = 0;
359 }
360 }
361#endif
362#ifdef CONFIG_KEXEC
363 if (crashk_res.start != crashk_res.end) {
364 reserve_bootmem_generic(crashk_res.start,
365 crashk_res.end - crashk_res.start + 1);
366 }
367#endif
368
369 paging_init();
370
371#ifdef CONFIG_PCI
372 early_quirks();
373#endif
374
375 /*
376 * Set this early, so we don't allocate cpu0
377 * if the MADT list doesn't list the BSP first.
378 * mpparse.c/MP_processor_info() allocates logical cpu numbers.
379 */
380 cpu_set(0, cpu_present_map);
381#ifdef CONFIG_ACPI
382 /*
383 * Read APIC and some other early information from ACPI tables.
384 */
385 acpi_boot_init();
386#endif
387
388 init_cpu_to_node();
389
390 /*
391 * get boot-time SMP configuration:
392 */
393 if (smp_found_config)
394 get_smp_config();
395 init_apic_mappings();
396
397 /*
398 * We trust e820 completely. No explicit ROM probing in memory.
399 */
400 e820_reserve_resources();
401 e820_mark_nosave_regions();
402
403 {
404 unsigned i;
405 /* request I/O space for devices used on all i[345]86 PCs */
406 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
407 request_resource(&ioport_resource, &standard_io_resources[i]);
408 }
409
410 e820_setup_gap();
411
412#ifdef CONFIG_VT
413#if defined(CONFIG_VGA_CONSOLE)
414 conswitchp = &vga_con;
415#elif defined(CONFIG_DUMMY_CONSOLE)
416 conswitchp = &dummy_con;
417#endif
418#endif
419}
420
421static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
422{
423 unsigned int *v;
424
425 if (c->extended_cpuid_level < 0x80000004)
426 return 0;
427
428 v = (unsigned int *) c->x86_model_id;
429 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
430 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
431 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
432 c->x86_model_id[48] = 0;
433 return 1;
434}
435
436
437static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
438{
439 unsigned int n, dummy, eax, ebx, ecx, edx;
440
441 n = c->extended_cpuid_level;
442
443 if (n >= 0x80000005) {
444 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
445 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
446 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
447 c->x86_cache_size=(ecx>>24)+(edx>>24);
448 /* On K8 L1 TLB is inclusive, so don't count it */
449 c->x86_tlbsize = 0;
450 }
451
452 if (n >= 0x80000006) {
453 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
454 ecx = cpuid_ecx(0x80000006);
455 c->x86_cache_size = ecx >> 16;
456 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
457
458 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
459 c->x86_cache_size, ecx & 0xFF);
460 }
461
462 if (n >= 0x80000007)
463 cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
464 if (n >= 0x80000008) {
465 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
466 c->x86_virt_bits = (eax >> 8) & 0xff;
467 c->x86_phys_bits = eax & 0xff;
468 }
469}
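/*
 * Sketch of the bit-field extraction used above, with a made-up CPUID
 * 0x80000006 ECX value: 0x02006140 would decode to a 512K L2 cache with
 * 64-byte lines (size in bits 31..16, line size in bits 7..0).
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t ecx = 0x02006140;	/* hypothetical CPUID 0x80000006 ECX */

	printf("L2 Cache: %uK (%u bytes/line)\n", ecx >> 16, ecx & 0xFF);
	return 0;
}
#endif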
470
471#ifdef CONFIG_NUMA
472static int nearby_node(int apicid)
473{
474 int i;
475 for (i = apicid - 1; i >= 0; i--) {
476 int node = apicid_to_node[i];
477 if (node != NUMA_NO_NODE && node_online(node))
478 return node;
479 }
480 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
481 int node = apicid_to_node[i];
482 if (node != NUMA_NO_NODE && node_online(node))
483 return node;
484 }
485 return first_node(node_online_map); /* Shouldn't happen */
486}
487#endif
488
489/*
490 * On an AMD dual-core setup the lower bits of the APIC id distinguish the cores.
491 * Assumes the number of cores is a power of two.
492 */
493static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
494{
495#ifdef CONFIG_SMP
496 unsigned bits;
497#ifdef CONFIG_NUMA
498 int cpu = smp_processor_id();
499 int node = 0;
500 unsigned apicid = hard_smp_processor_id();
501#endif
502 unsigned ecx = cpuid_ecx(0x80000008);
503
504 c->x86_max_cores = (ecx & 0xff) + 1;
505
506 /* CPU telling us the core id bits shift? */
507 bits = (ecx >> 12) & 0xF;
508
509 /* Otherwise recompute */
510 if (bits == 0) {
511 while ((1 << bits) < c->x86_max_cores)
512 bits++;
513 }
514
515 /* Low order bits define the core id (index of core in socket) */
516 c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
517 /* Convert the APIC ID into the socket ID */
518 c->phys_proc_id = phys_pkg_id(bits);
519
520#ifdef CONFIG_NUMA
521 node = c->phys_proc_id;
522 if (apicid_to_node[apicid] != NUMA_NO_NODE)
523 node = apicid_to_node[apicid];
524 if (!node_online(node)) {
525 /* Two possibilities here:
526 - The CPU is missing memory and no node was created.
527 In that case try picking one from a nearby CPU
528 - The APIC IDs differ from the HyperTransport node IDs
529 which the K8 northbridge parsing fills in.
530 Assume they are all increased by a constant offset,
531 but in the same order as the HT nodeids.
532 If that doesn't result in a usable node fall back to the
533 path for the previous case. */
534 int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
535 if (ht_nodeid >= 0 &&
536 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
537 node = apicid_to_node[ht_nodeid];
538 /* Pick a nearby node */
539 if (!node_online(node))
540 node = nearby_node(apicid);
541 }
542 numa_set_node(cpu, node);
543
544 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
545#endif
546#endif
547}
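/*
 * Sketch of the APIC-id split performed above, with hypothetical inputs: a
 * dual-core part reporting CPUID 0x80000008 ECX == 0x00001001 (core count 2,
 * core-id width 1 bit) and APIC id 5 would end up with core id 5 & 1 == 1
 * and socket id 5 >> 1 == 2.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned ecx = 0x00001001;	/* hypothetical CPUID 0x80000008 ECX */
	unsigned apicid = 5;		/* hypothetical initial APIC id */
	unsigned cores = (ecx & 0xff) + 1;
	unsigned bits = (ecx >> 12) & 0xF;

	if (bits == 0)			/* recompute if the CPU didn't say */
		while ((1u << bits) < cores)
			bits++;

	printf("cores=%u core_id=%u socket=%u\n",
	       cores, apicid & ((1u << bits) - 1), apicid >> bits);
	return 0;
}
#endif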
548
549static void __cpuinit init_amd(struct cpuinfo_x86 *c)
550{
551 unsigned level;
552
553#ifdef CONFIG_SMP
554 unsigned long value;
555
556 /*
557 * Disable the TLB flush filter by setting HWCR.FFDIS on K8
558 * (bit 6 of MSR C001_0015).
559 *
560 * Errata 63 for SH-B3 steppings
561 * Errata 122 for all steppings (F+ have it disabled by default)
562 */
563 if (c->x86 == 15) {
564 rdmsrl(MSR_K8_HWCR, value);
565 value |= 1 << 6;
566 wrmsrl(MSR_K8_HWCR, value);
567 }
568#endif
569
570 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
571 3DNow is identified by bit 31 in extended CPUID (1*32+31) anyway */
572 clear_bit(0*32+31, &c->x86_capability);
573
574 /* On C+ stepping K8 rep microcode works well for copy/memset */
575 level = cpuid_eax(1);
576 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
577 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
578 if (c->x86 == 0x10)
579 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
580
581 /* Enable workaround for FXSAVE leak */
582 if (c->x86 >= 6)
583 set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
584
585 level = get_model_name(c);
586 if (!level) {
587 switch (c->x86) {
588 case 15:
589 /* Should distinguish Models here, but this is only
590 a fallback anyway. */
591 strcpy(c->x86_model_id, "Hammer");
592 break;
593 }
594 }
595 display_cacheinfo(c);
596
597 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
598 if (c->x86_power & (1<<8))
599 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
600
601 /* Multi core CPU? */
602 if (c->extended_cpuid_level >= 0x80000008)
603 amd_detect_cmp(c);
604
605 if (c->extended_cpuid_level >= 0x80000006 &&
606 (cpuid_edx(0x80000006) & 0xf000))
607 num_cache_leaves = 4;
608 else
609 num_cache_leaves = 3;
610
611 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
612 set_bit(X86_FEATURE_K8, &c->x86_capability);
613
614 /* RDTSC can be speculated around */
615 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
616
617 /* Family 10 doesn't support C states in MWAIT so don't use it */
618 if (c->x86 == 0x10 && !force_mwait)
619 clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
620}
621
622static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
623{
624#ifdef CONFIG_SMP
625 u32 eax, ebx, ecx, edx;
626 int index_msb, core_bits;
627
628 cpuid(1, &eax, &ebx, &ecx, &edx);
629
630
631 if (!cpu_has(c, X86_FEATURE_HT))
632 return;
633 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
634 goto out;
635
636 smp_num_siblings = (ebx & 0xff0000) >> 16;
637
638 if (smp_num_siblings == 1) {
639 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
640 } else if (smp_num_siblings > 1 ) {
641
642 if (smp_num_siblings > NR_CPUS) {
643 printk(KERN_WARNING "CPU: Unsupported number of siblings %d\n", smp_num_siblings);
644 smp_num_siblings = 1;
645 return;
646 }
647
648 index_msb = get_count_order(smp_num_siblings);
649 c->phys_proc_id = phys_pkg_id(index_msb);
650
651 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
652
653 index_msb = get_count_order(smp_num_siblings) ;
654
655 core_bits = get_count_order(c->x86_max_cores);
656
657 c->cpu_core_id = phys_pkg_id(index_msb) &
658 ((1 << core_bits) - 1);
659 }
660out:
661 if ((c->x86_max_cores * smp_num_siblings) > 1) {
662 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
663 printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
664 }
665
666#endif
667}
668
669/*
670 * find out the number of processor cores on the die
671 */
672static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
673{
674 unsigned int eax, t;
675
676 if (c->cpuid_level < 4)
677 return 1;
678
679 cpuid_count(4, 0, &eax, &t, &t, &t);
680
681 if (eax & 0x1f)
682 return ((eax >> 26) + 1);
683 else
684 return 1;
685}
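/*
 * Worked example of the leaf-4 decode above, with a hypothetical
 * CPUID(4,0).EAX of 0x04000121: the low 5 bits are non-zero (a valid cache
 * descriptor), and bits 31..26 hold 1, so the function would report
 * 1 + 1 == 2 cores.
 */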
686
687static void srat_detect_node(void)
688{
689#ifdef CONFIG_NUMA
690 unsigned node;
691 int cpu = smp_processor_id();
692 int apicid = hard_smp_processor_id();
693
694 /* For now, don't do the funky fallback heuristics
695 that the AMD version employs. */
696 node = apicid_to_node[apicid];
697 if (node == NUMA_NO_NODE)
698 node = first_node(node_online_map);
699 numa_set_node(cpu, node);
700
701 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
702#endif
703}
704
705static void __cpuinit init_intel(struct cpuinfo_x86 *c)
706{
707 /* Cache sizes */
708 unsigned n;
709
710 init_intel_cacheinfo(c);
711 if (c->cpuid_level > 9 ) {
712 unsigned eax = cpuid_eax(10);
713 /* Check for version and the number of counters */
714 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
715 set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
716 }
717
718 if (cpu_has_ds) {
719 unsigned int l1, l2;
720 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
721 if (!(l1 & (1<<11)))
722 set_bit(X86_FEATURE_BTS, c->x86_capability);
723 if (!(l1 & (1<<12)))
724 set_bit(X86_FEATURE_PEBS, c->x86_capability);
725 }
726
727 n = c->extended_cpuid_level;
728 if (n >= 0x80000008) {
729 unsigned eax = cpuid_eax(0x80000008);
730 c->x86_virt_bits = (eax >> 8) & 0xff;
731 c->x86_phys_bits = eax & 0xff;
732 /* CPUID workaround for Intel 0F34 CPU */
733 if (c->x86_vendor == X86_VENDOR_INTEL &&
734 c->x86 == 0xF && c->x86_model == 0x3 &&
735 c->x86_mask == 0x4)
736 c->x86_phys_bits = 36;
737 }
738
739 if (c->x86 == 15)
740 c->x86_cache_alignment = c->x86_clflush_size * 2;
741 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
742 (c->x86 == 0x6 && c->x86_model >= 0x0e))
743 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
744 if (c->x86 == 6)
745 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
746 if (c->x86 == 15)
747 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
748 else
749 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
750 c->x86_max_cores = intel_num_cpu_cores(c);
751
752 srat_detect_node();
753}
754
755static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
756{
757 char *v = c->x86_vendor_id;
758
759 if (!strcmp(v, "AuthenticAMD"))
760 c->x86_vendor = X86_VENDOR_AMD;
761 else if (!strcmp(v, "GenuineIntel"))
762 c->x86_vendor = X86_VENDOR_INTEL;
763 else
764 c->x86_vendor = X86_VENDOR_UNKNOWN;
765}
766
767struct cpu_model_info {
768 int vendor;
769 int family;
770 char *model_names[16];
771};
772
773/* Do some early cpuid on the boot CPU to get some parameters that are
774 needed before check_bugs. Everything advanced is in identify_cpu
775 below. */
776void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
777{
778 u32 tfms;
779
780 c->loops_per_jiffy = loops_per_jiffy;
781 c->x86_cache_size = -1;
782 c->x86_vendor = X86_VENDOR_UNKNOWN;
783 c->x86_model = c->x86_mask = 0; /* So far unknown... */
784 c->x86_vendor_id[0] = '\0'; /* Unset */
785 c->x86_model_id[0] = '\0'; /* Unset */
786 c->x86_clflush_size = 64;
787 c->x86_cache_alignment = c->x86_clflush_size;
788 c->x86_max_cores = 1;
789 c->extended_cpuid_level = 0;
790 memset(&c->x86_capability, 0, sizeof c->x86_capability);
791
792 /* Get vendor name */
793 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
794 (unsigned int *)&c->x86_vendor_id[0],
795 (unsigned int *)&c->x86_vendor_id[8],
796 (unsigned int *)&c->x86_vendor_id[4]);
797
798 get_cpu_vendor(c);
799
800 /* Initialize the standard set of capabilities */
801 /* Note that the vendor-specific code below might override */
802
803 /* Intel-defined flags: level 0x00000001 */
804 if (c->cpuid_level >= 0x00000001) {
805 __u32 misc;
806 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
807 &c->x86_capability[0]);
808 c->x86 = (tfms >> 8) & 0xf;
809 c->x86_model = (tfms >> 4) & 0xf;
810 c->x86_mask = tfms & 0xf;
811 if (c->x86 == 0xf)
812 c->x86 += (tfms >> 20) & 0xff;
813 if (c->x86 >= 0x6)
814 c->x86_model += ((tfms >> 16) & 0xF) << 4;
815 if (c->x86_capability[0] & (1<<19))
816 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
817 } else {
818 /* Have CPUID level 0 only - unheard of */
819 c->x86 = 4;
820 }
821
822#ifdef CONFIG_SMP
823 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
824#endif
825}
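/*
 * Worked example of the family/model/stepping decode above, using a made-up
 * CPUID leaf 1 EAX value of 0x00100f42: base family 0xf plus extended
 * family 0x01 gives x86 == 0x10, the extended model nibble is 0 so
 * x86_model stays 0x4, and x86_mask (the stepping) is 0x2.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t tfms = 0x00100f42;	/* hypothetical CPUID(1).EAX */
	unsigned family = (tfms >> 8) & 0xf;
	unsigned model = (tfms >> 4) & 0xf;
	unsigned stepping = tfms & 0xf;

	if (family == 0xf)
		family += (tfms >> 20) & 0xff;
	if (family >= 0x6)
		model += ((tfms >> 16) & 0xf) << 4;

	/* prints: family 0x10 model 0x4 stepping 0x2 */
	printf("family %#x model %#x stepping %#x\n", family, model, stepping);
	return 0;
}
#endif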
826
827/*
828 * This does the hard work of actually picking apart the CPU stuff...
829 */
830void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
831{
832 int i;
833 u32 xlvl;
834
835 early_identify_cpu(c);
836
837 /* AMD-defined flags: level 0x80000001 */
838 xlvl = cpuid_eax(0x80000000);
839 c->extended_cpuid_level = xlvl;
840 if ((xlvl & 0xffff0000) == 0x80000000) {
841 if (xlvl >= 0x80000001) {
842 c->x86_capability[1] = cpuid_edx(0x80000001);
843 c->x86_capability[6] = cpuid_ecx(0x80000001);
844 }
845 if (xlvl >= 0x80000004)
846 get_model_name(c); /* Default name */
847 }
848
849 /* Transmeta-defined flags: level 0x80860001 */
850 xlvl = cpuid_eax(0x80860000);
851 if ((xlvl & 0xffff0000) == 0x80860000) {
852 /* Don't set x86_cpuid_level here for now, to avoid confusion. */
853 if (xlvl >= 0x80860001)
854 c->x86_capability[2] = cpuid_edx(0x80860001);
855 }
856
857 init_scattered_cpuid_features(c);
858
859 c->apicid = phys_pkg_id(0);
860
861 /*
862 * Vendor-specific initialization. In this section we
863 * canonicalize the feature flags: features a certain CPU
864 * supports but which CPUID doesn't tell us about, CPUID
865 * claiming incorrect flags, and other bugs are all
866 * handled here.
867 *
868 * At the end of this section, c->x86_capability better
869 * indicate the features this CPU genuinely supports!
870 */
871 switch (c->x86_vendor) {
872 case X86_VENDOR_AMD:
873 init_amd(c);
874 break;
875
876 case X86_VENDOR_INTEL:
877 init_intel(c);
878 break;
879
880 case X86_VENDOR_UNKNOWN:
881 default:
882 display_cacheinfo(c);
883 break;
884 }
885
886 select_idle_routine(c);
887 detect_ht(c);
888
889 /*
890 * On SMP, boot_cpu_data holds the common feature set between
891 * all CPUs; so make sure that we indicate which features are
892 * common between the CPUs. The first time this routine gets
893 * executed, c == &boot_cpu_data.
894 */
895 if (c != &boot_cpu_data) {
896 /* AND the already accumulated flags with these */
897 for (i = 0 ; i < NCAPINTS ; i++)
898 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
899 }
900
901#ifdef CONFIG_X86_MCE
902 mcheck_init(c);
903#endif
904 if (c != &boot_cpu_data)
905 mtrr_ap_init();
906#ifdef CONFIG_NUMA
907 numa_add_cpu(smp_processor_id());
908#endif
909}
910
911
912void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
913{
914 if (c->x86_model_id[0])
915 printk("%s", c->x86_model_id);
916
917 if (c->x86_mask || c->cpuid_level >= 0)
918 printk(" stepping %02x\n", c->x86_mask);
919 else
920 printk("\n");
921}
922
923/*
924 * Get CPU information for use by the procfs.
925 */
926
927static int show_cpuinfo(struct seq_file *m, void *v)
928{
929 struct cpuinfo_x86 *c = v;
930
931 /*
932 * These flag bits must match the definitions in <asm/cpufeature.h>.
933 * NULL means this bit is undefined or reserved; either way it doesn't
934 * have meaning as far as Linux is concerned. Note that it's important
935 * to realize there is a difference between this table and CPUID -- if
936 * applications want to get the raw CPUID data, they should access
937 * /dev/cpu/<cpu_nr>/cpuid instead.
938 */
939 static char *x86_cap_flags[] = {
940 /* Intel-defined */
941 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
942 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
943 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
944 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
945
946 /* AMD-defined */
947 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
948 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
949 NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
950 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
951 "3dnowext", "3dnow",
952
953 /* Transmeta-defined */
954 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
955 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
956 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
957 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
958
959 /* Other (Linux-defined) */
960 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
961 NULL, NULL, NULL, NULL,
962 "constant_tsc", "up", NULL, "arch_perfmon",
963 "pebs", "bts", NULL, "sync_rdtsc",
964 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
965 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
966
967 /* Intel-defined (#2) */
968 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
969 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
970 NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
971 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
972
973 /* VIA/Cyrix/Centaur-defined */
974 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
975 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
976 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
977 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
978
979 /* AMD-defined (#2) */
980 "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
981 "altmovcr8", "abm", "sse4a",
982 "misalignsse", "3dnowprefetch",
983 "osvw", "ibs", NULL, NULL, NULL, NULL,
984 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
985 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
986
987 /* Auxiliary (Linux-defined) */
988 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
989 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
990 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
991 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
992 };
993 static char *x86_power_flags[] = {
994 "ts", /* temperature sensor */
995 "fid", /* frequency id control */
996 "vid", /* voltage id control */
997 "ttp", /* thermal trip */
998 "tm",
999 "stc",
1000 "100mhzsteps",
1001 "hwpstate",
1002 "", /* tsc invariant mapped to constant_tsc */
1003 /* nothing */
1004 };
1005
1006
1007#ifdef CONFIG_SMP
1008 if (!cpu_online(c-cpu_data))
1009 return 0;
1010#endif
1011
1012 seq_printf(m,"processor\t: %u\n"
1013 "vendor_id\t: %s\n"
1014 "cpu family\t: %d\n"
1015 "model\t\t: %d\n"
1016 "model name\t: %s\n",
1017 (unsigned)(c-cpu_data),
1018 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
1019 c->x86,
1020 (int)c->x86_model,
1021 c->x86_model_id[0] ? c->x86_model_id : "unknown");
1022
1023 if (c->x86_mask || c->cpuid_level >= 0)
1024 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
1025 else
1026 seq_printf(m, "stepping\t: unknown\n");
1027
1028 if (cpu_has(c,X86_FEATURE_TSC)) {
1029 unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
1030 if (!freq)
1031 freq = cpu_khz;
1032 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
1033 freq / 1000, (freq % 1000));
1034 }
1035
1036 /* Cache size */
1037 if (c->x86_cache_size >= 0)
1038 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
1039
1040#ifdef CONFIG_SMP
1041 if (smp_num_siblings * c->x86_max_cores > 1) {
1042 int cpu = c - cpu_data;
1043 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
1044 seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
1045 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
1046 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
1047 }
1048#endif
1049
1050 seq_printf(m,
1051 "fpu\t\t: yes\n"
1052 "fpu_exception\t: yes\n"
1053 "cpuid level\t: %d\n"
1054 "wp\t\t: yes\n"
1055 "flags\t\t:",
1056 c->cpuid_level);
1057
1058 {
1059 int i;
1060 for ( i = 0 ; i < 32*NCAPINTS ; i++ )
1061 if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
1062 seq_printf(m, " %s", x86_cap_flags[i]);
1063 }
1064
1065 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
1066 c->loops_per_jiffy/(500000/HZ),
1067 (c->loops_per_jiffy/(5000/HZ)) % 100);
1068
1069 if (c->x86_tlbsize > 0)
1070 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
1071 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
1072 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
1073
1074 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
1075 c->x86_phys_bits, c->x86_virt_bits);
1076
1077 seq_printf(m, "power management:");
1078 {
1079 unsigned i;
1080 for (i = 0; i < 32; i++)
1081 if (c->x86_power & (1 << i)) {
1082 if (i < ARRAY_SIZE(x86_power_flags) &&
1083 x86_power_flags[i])
1084 seq_printf(m, "%s%s",
1085 x86_power_flags[i][0]?" ":"",
1086 x86_power_flags[i]);
1087 else
1088 seq_printf(m, " [%d]", i);
1089 }
1090 }
1091
1092 seq_printf(m, "\n\n");
1093
1094 return 0;
1095}
1096
1097static void *c_start(struct seq_file *m, loff_t *pos)
1098{
1099 return *pos < NR_CPUS ? cpu_data + *pos : NULL;
1100}
1101
1102static void *c_next(struct seq_file *m, void *v, loff_t *pos)
1103{
1104 ++*pos;
1105 return c_start(m, pos);
1106}
1107
1108static void c_stop(struct seq_file *m, void *v)
1109{
1110}
1111
1112struct seq_operations cpuinfo_op = {
1113 .start =c_start,
1114 .next = c_next,
1115 .stop = c_stop,
1116 .show = show_cpuinfo,
1117};
diff --git a/arch/x86/kernel/sigframe_32.h b/arch/x86/kernel/sigframe_32.h
new file mode 100644
index 000000000000..0b2221711dad
--- /dev/null
+++ b/arch/x86/kernel/sigframe_32.h
@@ -0,0 +1,21 @@
1struct sigframe
2{
3 char __user *pretcode;
4 int sig;
5 struct sigcontext sc;
6 struct _fpstate fpstate;
7 unsigned long extramask[_NSIG_WORDS-1];
8 char retcode[8];
9};
10
11struct rt_sigframe
12{
13 char __user *pretcode;
14 int sig;
15 struct siginfo __user *pinfo;
16 void __user *puc;
17 struct siginfo info;
18 struct ucontext uc;
19 struct _fpstate fpstate;
20 char retcode[8];
21};
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
new file mode 100644
index 000000000000..c03570f7fe8e
--- /dev/null
+++ b/arch/x86/kernel/signal_32.c
@@ -0,0 +1,667 @@
1/*
2 * linux/arch/i386/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
7 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
8 */
9
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/kernel.h>
14#include <linux/signal.h>
15#include <linux/errno.h>
16#include <linux/wait.h>
17#include <linux/unistd.h>
18#include <linux/stddef.h>
19#include <linux/personality.h>
20#include <linux/suspend.h>
21#include <linux/ptrace.h>
22#include <linux/elf.h>
23#include <linux/binfmts.h>
24#include <asm/processor.h>
25#include <asm/ucontext.h>
26#include <asm/uaccess.h>
27#include <asm/i387.h>
28#include "sigframe_32.h"
29
30#define DEBUG_SIG 0
31
32#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
33
34/*
35 * Atomically swap in the new signal mask, and wait for a signal.
36 */
37asmlinkage int
38sys_sigsuspend(int history0, int history1, old_sigset_t mask)
39{
40 mask &= _BLOCKABLE;
41 spin_lock_irq(&current->sighand->siglock);
42 current->saved_sigmask = current->blocked;
43 siginitset(&current->blocked, mask);
44 recalc_sigpending();
45 spin_unlock_irq(&current->sighand->siglock);
46
47 current->state = TASK_INTERRUPTIBLE;
48 schedule();
49 set_thread_flag(TIF_RESTORE_SIGMASK);
50 return -ERESTARTNOHAND;
51}
52
53asmlinkage int
54sys_sigaction(int sig, const struct old_sigaction __user *act,
55 struct old_sigaction __user *oact)
56{
57 struct k_sigaction new_ka, old_ka;
58 int ret;
59
60 if (act) {
61 old_sigset_t mask;
62 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
63 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
64 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
65 return -EFAULT;
66 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
67 __get_user(mask, &act->sa_mask);
68 siginitset(&new_ka.sa.sa_mask, mask);
69 }
70
71 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
72
73 if (!ret && oact) {
74 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
75 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
76 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
77 return -EFAULT;
78 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
79 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
80 }
81
82 return ret;
83}
84
85asmlinkage int
86sys_sigaltstack(unsigned long ebx)
87{
88 /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
89 struct pt_regs *regs = (struct pt_regs *)&ebx;
90 const stack_t __user *uss = (const stack_t __user *)ebx;
91 stack_t __user *uoss = (stack_t __user *)regs->ecx;
92
93 return do_sigaltstack(uss, uoss, regs->esp);
94}
95
96
97/*
98 * Do a signal return; undo the signal stack.
99 */
100
101static int
102restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
103{
104 unsigned int err = 0;
105
106 /* Always make any pending restarted system calls return -EINTR */
107 current_thread_info()->restart_block.fn = do_no_restart_syscall;
108
109#define COPY(x) err |= __get_user(regs->x, &sc->x)
110
111#define COPY_SEG(seg) \
112 { unsigned short tmp; \
113 err |= __get_user(tmp, &sc->seg); \
114 regs->x##seg = tmp; }
115
116#define COPY_SEG_STRICT(seg) \
117 { unsigned short tmp; \
118 err |= __get_user(tmp, &sc->seg); \
119 regs->x##seg = tmp|3; }
120
121#define GET_SEG(seg) \
122 { unsigned short tmp; \
123 err |= __get_user(tmp, &sc->seg); \
124 loadsegment(seg,tmp); }
125
126#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_RF | \
127 X86_EFLAGS_OF | X86_EFLAGS_DF | \
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130
131 GET_SEG(gs);
132 COPY_SEG(fs);
133 COPY_SEG(es);
134 COPY_SEG(ds);
135 COPY(edi);
136 COPY(esi);
137 COPY(ebp);
138 COPY(esp);
139 COPY(ebx);
140 COPY(edx);
141 COPY(ecx);
142 COPY(eip);
143 COPY_SEG_STRICT(cs);
144 COPY_SEG_STRICT(ss);
145
146 {
147 unsigned int tmpflags;
148 err |= __get_user(tmpflags, &sc->eflags);
149 regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
150 regs->orig_eax = -1; /* disable syscall checks */
151 }
152
153 {
154 struct _fpstate __user * buf;
155 err |= __get_user(buf, &sc->fpstate);
156 if (buf) {
157 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
158 goto badframe;
159 err |= restore_i387(buf);
160 } else {
161 struct task_struct *me = current;
162 if (used_math()) {
163 clear_fpu(me);
164 clear_used_math();
165 }
166 }
167 }
168
169 err |= __get_user(*peax, &sc->eax);
170 return err;
171
172badframe:
173 return 1;
174}
175
176asmlinkage int sys_sigreturn(unsigned long __unused)
177{
178 struct pt_regs *regs = (struct pt_regs *) &__unused;
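	/*
	 * The handler's return popped pretcode and the sigreturn stub's
	 * "popl %eax" popped sig, so the frame normally sits 8 bytes below
	 * the user esp at this point.
	 */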
179 struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
180 sigset_t set;
181 int eax;
182
183 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
184 goto badframe;
185 if (__get_user(set.sig[0], &frame->sc.oldmask)
186 || (_NSIG_WORDS > 1
187 && __copy_from_user(&set.sig[1], &frame->extramask,
188 sizeof(frame->extramask))))
189 goto badframe;
190
191 sigdelsetmask(&set, ~_BLOCKABLE);
192 spin_lock_irq(&current->sighand->siglock);
193 current->blocked = set;
194 recalc_sigpending();
195 spin_unlock_irq(&current->sighand->siglock);
196
197 if (restore_sigcontext(regs, &frame->sc, &eax))
198 goto badframe;
199 return eax;
200
201badframe:
202 if (show_unhandled_signals && printk_ratelimit())
203 printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx"
204 " esp:%lx oeax:%lx\n",
205 current->pid > 1 ? KERN_INFO : KERN_EMERG,
206 current->comm, current->pid, frame, regs->eip,
207 regs->esp, regs->orig_eax);
208
209 force_sig(SIGSEGV, current);
210 return 0;
211}
212
213asmlinkage int sys_rt_sigreturn(unsigned long __unused)
214{
215 struct pt_regs *regs = (struct pt_regs *) &__unused;
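	/*
	 * Only pretcode has been popped by the handler's return, so the
	 * rt frame sits 4 bytes below the user esp.
	 */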
216 struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
217 sigset_t set;
218 int eax;
219
220 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
221 goto badframe;
222 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
223 goto badframe;
224
225 sigdelsetmask(&set, ~_BLOCKABLE);
226 spin_lock_irq(&current->sighand->siglock);
227 current->blocked = set;
228 recalc_sigpending();
229 spin_unlock_irq(&current->sighand->siglock);
230
231 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
232 goto badframe;
233
234 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)
235 goto badframe;
236
237 return eax;
238
239badframe:
240 force_sig(SIGSEGV, current);
241 return 0;
242}
243
244/*
245 * Set up a signal frame.
246 */
247
248static int
249setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
250 struct pt_regs *regs, unsigned long mask)
251{
252 int tmp, err = 0;
253
254 err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
255 savesegment(gs, tmp);
256 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
257
258 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
259 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
260 err |= __put_user(regs->edi, &sc->edi);
261 err |= __put_user(regs->esi, &sc->esi);
262 err |= __put_user(regs->ebp, &sc->ebp);
263 err |= __put_user(regs->esp, &sc->esp);
264 err |= __put_user(regs->ebx, &sc->ebx);
265 err |= __put_user(regs->edx, &sc->edx);
266 err |= __put_user(regs->ecx, &sc->ecx);
267 err |= __put_user(regs->eax, &sc->eax);
268 err |= __put_user(current->thread.trap_no, &sc->trapno);
269 err |= __put_user(current->thread.error_code, &sc->err);
270 err |= __put_user(regs->eip, &sc->eip);
271 err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
272 err |= __put_user(regs->eflags, &sc->eflags);
273 err |= __put_user(regs->esp, &sc->esp_at_signal);
274 err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
275
276 tmp = save_i387(fpstate);
277 if (tmp < 0)
278 err = 1;
279 else
280 err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
281
282 /* non-iBCS2 extensions.. */
283 err |= __put_user(mask, &sc->oldmask);
284 err |= __put_user(current->thread.cr2, &sc->cr2);
285
286 return err;
287}
288
289/*
290 * Determine which stack to use..
291 */
292static inline void __user *
293get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
294{
295 unsigned long esp;
296
297 /* Default to using normal stack */
298 esp = regs->esp;
299
300 /* This is the X/Open sanctioned signal stack switching. */
301 if (ka->sa.sa_flags & SA_ONSTACK) {
302 if (sas_ss_flags(esp) == 0)
303 esp = current->sas_ss_sp + current->sas_ss_size;
304 }
305
306 /* This is the legacy signal stack switching. */
307 else if ((regs->xss & 0xffff) != __USER_DS &&
308 !(ka->sa.sa_flags & SA_RESTORER) &&
309 ka->sa.sa_restorer) {
310 esp = (unsigned long) ka->sa.sa_restorer;
311 }
312
313 esp -= frame_size;
314 /* Align the stack pointer according to the i386 ABI,
315 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
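	/* For example, esp = 0xbffff123 -> ((0xbffff127) & ~15) - 4 = 0xbffff11c,
	   and (0xbffff11c + 4) & 15 == 0 as required. */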
316 esp = ((esp + 4) & -16ul) - 4;
317 return (void __user *) esp;
318}
319
320/* These symbols are defined with the addresses in the vsyscall page.
321 See vsyscall-sigreturn.S. */
322extern void __user __kernel_sigreturn;
323extern void __user __kernel_rt_sigreturn;
324
325static int setup_frame(int sig, struct k_sigaction *ka,
326 sigset_t *set, struct pt_regs * regs)
327{
328 void __user *restorer;
329 struct sigframe __user *frame;
330 int err = 0;
331 int usig;
332
333 frame = get_sigframe(ka, regs, sizeof(*frame));
334
335 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
336 goto give_sigsegv;
337
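	/*
	 * Translate the signal number through the exec domain's inverse map,
	 * so that personalities with non-Linux signal numbering see their
	 * own value in the frame.
	 */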
338 usig = current_thread_info()->exec_domain
339 && current_thread_info()->exec_domain->signal_invmap
340 && sig < 32
341 ? current_thread_info()->exec_domain->signal_invmap[sig]
342 : sig;
343
344 err = __put_user(usig, &frame->sig);
345 if (err)
346 goto give_sigsegv;
347
348 err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
349 if (err)
350 goto give_sigsegv;
351
352 if (_NSIG_WORDS > 1) {
353 err = __copy_to_user(&frame->extramask, &set->sig[1],
354 sizeof(frame->extramask));
355 if (err)
356 goto give_sigsegv;
357 }
358
359 if (current->binfmt->hasvdso)
360 restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
361 else
362 restorer = (void *)&frame->retcode;
363 if (ka->sa.sa_flags & SA_RESTORER)
364 restorer = ka->sa.sa_restorer;
365
366 /* Set up to return from userspace. */
367 err |= __put_user(restorer, &frame->pretcode);
368
369 /*
370 * This is popl %eax ; movl $,%eax ; int $0x80
371 *
372 * WE DO NOT USE IT ANY MORE! It's only left here for historical
373 * reasons and because gdb uses it as a signature to notice
374 * signal handler stack frames.
375 */
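	/*
	 * Byte layout (little endian): 0x58 = popl %eax, 0xb8 imm32 =
	 * movl $__NR_sigreturn,%eax, 0xcd 0x80 = int $0x80 -- 8 bytes total.
	 */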
376 err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
377 err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
378 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
379
380 if (err)
381 goto give_sigsegv;
382
383 /* Set up registers for signal handler */
384 regs->esp = (unsigned long) frame;
385 regs->eip = (unsigned long) ka->sa.sa_handler;
386 regs->eax = (unsigned long) sig;
387 regs->edx = (unsigned long) 0;
388 regs->ecx = (unsigned long) 0;
389
390 set_fs(USER_DS);
391 regs->xds = __USER_DS;
392 regs->xes = __USER_DS;
393 regs->xss = __USER_DS;
394 regs->xcs = __USER_CS;
395
396 /*
397 * Clear TF when entering the signal handler, but
398 * notify any tracer that was single-stepping it.
399 * The tracer may want to single-step inside the
400 * handler too.
401 */
402 regs->eflags &= ~TF_MASK;
403 if (test_thread_flag(TIF_SINGLESTEP))
404 ptrace_notify(SIGTRAP);
405
406#if DEBUG_SIG
407 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
408 current->comm, current->pid, frame, regs->eip, frame->pretcode);
409#endif
410
411 return 0;
412
413give_sigsegv:
414 force_sigsegv(sig, current);
415 return -EFAULT;
416}
417
418static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
419 sigset_t *set, struct pt_regs * regs)
420{
421 void __user *restorer;
422 struct rt_sigframe __user *frame;
423 int err = 0;
424 int usig;
425
426 frame = get_sigframe(ka, regs, sizeof(*frame));
427
428 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
429 goto give_sigsegv;
430
431 usig = current_thread_info()->exec_domain
432 && current_thread_info()->exec_domain->signal_invmap
433 && sig < 32
434 ? current_thread_info()->exec_domain->signal_invmap[sig]
435 : sig;
436
437 err |= __put_user(usig, &frame->sig);
438 err |= __put_user(&frame->info, &frame->pinfo);
439 err |= __put_user(&frame->uc, &frame->puc);
440 err |= copy_siginfo_to_user(&frame->info, info);
441 if (err)
442 goto give_sigsegv;
443
444 /* Create the ucontext. */
445 err |= __put_user(0, &frame->uc.uc_flags);
446 err |= __put_user(0, &frame->uc.uc_link);
447 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
448 err |= __put_user(sas_ss_flags(regs->esp),
449 &frame->uc.uc_stack.ss_flags);
450 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
451 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
452 regs, set->sig[0]);
453 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
454 if (err)
455 goto give_sigsegv;
456
457 /* Set up to return from userspace. */
458 restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
459 if (ka->sa.sa_flags & SA_RESTORER)
460 restorer = ka->sa.sa_restorer;
461 err |= __put_user(restorer, &frame->pretcode);
462
463 /*
464 * This is movl $,%eax ; int $0x80
465 *
466 * WE DO NOT USE IT ANY MORE! It's only left here for historical
467 * reasons and because gdb uses it as a signature to notice
468 * signal handler stack frames.
469 */
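	/*
	 * Byte layout: 0xb8 imm32 = movl $__NR_rt_sigreturn,%eax at offset 0,
	 * 0xcd 0x80 = int $0x80 at offset 5.
	 */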
470 err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
471 err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
472 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
473
474 if (err)
475 goto give_sigsegv;
476
477 /* Set up registers for signal handler */
478 regs->esp = (unsigned long) frame;
479 regs->eip = (unsigned long) ka->sa.sa_handler;
480 regs->eax = (unsigned long) usig;
481 regs->edx = (unsigned long) &frame->info;
482 regs->ecx = (unsigned long) &frame->uc;
483
484 set_fs(USER_DS);
485 regs->xds = __USER_DS;
486 regs->xes = __USER_DS;
487 regs->xss = __USER_DS;
488 regs->xcs = __USER_CS;
489
490 /*
491 * Clear TF when entering the signal handler, but
492 * notify any tracer that was single-stepping it.
493 * The tracer may want to single-step inside the
494 * handler too.
495 */
496 regs->eflags &= ~TF_MASK;
497 if (test_thread_flag(TIF_SINGLESTEP))
498 ptrace_notify(SIGTRAP);
499
500#if DEBUG_SIG
501 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
502 current->comm, current->pid, frame, regs->eip, frame->pretcode);
503#endif
504
505 return 0;
506
507give_sigsegv:
508 force_sigsegv(sig, current);
509 return -EFAULT;
510}
511
512/*
513 * OK, we're invoking a handler
514 */
515
516static int
517handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
518 sigset_t *oldset, struct pt_regs * regs)
519{
520 int ret;
521
522 /* Are we from a system call? */
523 if (regs->orig_eax >= 0) {
524 /* If so, check system call restarting.. */
525 switch (regs->eax) {
526 case -ERESTART_RESTARTBLOCK:
527 case -ERESTARTNOHAND:
528 regs->eax = -EINTR;
529 break;
530
531 case -ERESTARTSYS:
532 if (!(ka->sa.sa_flags & SA_RESTART)) {
533 regs->eax = -EINTR;
534 break;
535 }
536 /* fallthrough */
537 case -ERESTARTNOINTR:
538 regs->eax = regs->orig_eax;
539 regs->eip -= 2;
540 }
541 }
542
543 /*
544 * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
545 * that register information in the sigcontext is correct.
546 */
547 if (unlikely(regs->eflags & TF_MASK)
548 && likely(current->ptrace & PT_DTRACE)) {
549 current->ptrace &= ~PT_DTRACE;
550 regs->eflags &= ~TF_MASK;
551 }
552
553 /* Set up the stack frame */
554 if (ka->sa.sa_flags & SA_SIGINFO)
555 ret = setup_rt_frame(sig, ka, info, oldset, regs);
556 else
557 ret = setup_frame(sig, ka, oldset, regs);
558
559 if (ret == 0) {
560 spin_lock_irq(&current->sighand->siglock);
561 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
562 if (!(ka->sa.sa_flags & SA_NODEFER))
563 sigaddset(&current->blocked,sig);
564 recalc_sigpending();
565 spin_unlock_irq(&current->sighand->siglock);
566 }
567
568 return ret;
569}
570
571/*
572 * Note that 'init' is a special process: it doesn't get signals it doesn't
573 * want to handle. Thus you cannot kill init even with a SIGKILL even by
574 * mistake.
575 */
576static void fastcall do_signal(struct pt_regs *regs)
577{
578 siginfo_t info;
579 int signr;
580 struct k_sigaction ka;
581 sigset_t *oldset;
582
583 /*
584 * We want the common case to go fast, which
585 * is why we may in certain cases get here from
586 * kernel mode. Just return without doing anything
587 * if so. vm86 regs switched out by assembly code
588 * before reaching here, so testing against kernel
589 * CS suffices.
590 */
591 if (!user_mode(regs))
592 return;
593
594 if (test_thread_flag(TIF_RESTORE_SIGMASK))
595 oldset = &current->saved_sigmask;
596 else
597 oldset = &current->blocked;
598
599 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
600 if (signr > 0) {
601 /* Reenable any watchpoints before delivering the
602 * signal to user space. The processor register will
603 * have been cleared if the watchpoint triggered
604 * inside the kernel.
605 */
606 if (unlikely(current->thread.debugreg[7]))
607 set_debugreg(current->thread.debugreg[7], 7);
608
609 /* Whee! Actually deliver the signal. */
610 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
611 /* a signal was successfully delivered; the saved
612 * sigmask will have been stored in the signal frame,
613 * and will be restored by sigreturn, so we can simply
614 * clear the TIF_RESTORE_SIGMASK flag */
615 if (test_thread_flag(TIF_RESTORE_SIGMASK))
616 clear_thread_flag(TIF_RESTORE_SIGMASK);
617 }
618
619 return;
620 }
621
622 /* Did we come from a system call? */
623 if (regs->orig_eax >= 0) {
624 /* Restart the system call - no handlers present */
625 switch (regs->eax) {
626 case -ERESTARTNOHAND:
627 case -ERESTARTSYS:
628 case -ERESTARTNOINTR:
629 regs->eax = regs->orig_eax;
630 regs->eip -= 2;
631 break;
632
633 case -ERESTART_RESTARTBLOCK:
634 regs->eax = __NR_restart_syscall;
635 regs->eip -= 2;
636 break;
637 }
638 }
639
640 /* if there's no signal to deliver, we just put the saved sigmask
641 * back */
642 if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
643 clear_thread_flag(TIF_RESTORE_SIGMASK);
644 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
645 }
646}
647
648/*
649 * notification of userspace execution resumption
650 * - triggered by the TIF_WORK_MASK flags
651 */
652__attribute__((regparm(3)))
653void do_notify_resume(struct pt_regs *regs, void *_unused,
654 __u32 thread_info_flags)
655{
656 /* Pending single-step? */
657 if (thread_info_flags & _TIF_SINGLESTEP) {
658 regs->eflags |= TF_MASK;
659 clear_thread_flag(TIF_SINGLESTEP);
660 }
661
662 /* deal with pending signal delivery */
663 if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
664 do_signal(regs);
665
666 clear_thread_flag(TIF_IRET);
667}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
new file mode 100644
index 000000000000..739175b01e06
--- /dev/null
+++ b/arch/x86/kernel/signal_64.c
@@ -0,0 +1,495 @@
1/*
2 * linux/arch/x86_64/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 *
7 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
8 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
9 * 2000-2002 x86-64 support by Andi Kleen
10 */
11
12#include <linux/sched.h>
13#include <linux/mm.h>
14#include <linux/smp.h>
15#include <linux/kernel.h>
16#include <linux/signal.h>
17#include <linux/errno.h>
18#include <linux/wait.h>
19#include <linux/ptrace.h>
20#include <linux/unistd.h>
21#include <linux/stddef.h>
22#include <linux/personality.h>
23#include <linux/compiler.h>
24#include <asm/ucontext.h>
25#include <asm/uaccess.h>
26#include <asm/i387.h>
27#include <asm/proto.h>
28#include <asm/ia32_unistd.h>
29#include <asm/mce.h>
30
31/* #define DEBUG_SIG 1 */
32
33#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
34
35int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
36 sigset_t *set, struct pt_regs * regs);
37int ia32_setup_frame(int sig, struct k_sigaction *ka,
38 sigset_t *set, struct pt_regs * regs);
39
40asmlinkage long
41sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
42 struct pt_regs *regs)
43{
44 return do_sigaltstack(uss, uoss, regs->rsp);
45}
46
47
48/*
49 * Do a signal return; undo the signal stack.
50 */
51
52struct rt_sigframe
53{
54 char __user *pretcode;
55 struct ucontext uc;
56 struct siginfo info;
57};
58
59static int
60restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
61{
62 unsigned int err = 0;
63
64 /* Always make any pending restarted system calls return -EINTR */
65 current_thread_info()->restart_block.fn = do_no_restart_syscall;
66
67#define COPY(x) err |= __get_user(regs->x, &sc->x)
68
69 COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
70 COPY(rdx); COPY(rcx); COPY(rip);
71 COPY(r8);
72 COPY(r9);
73 COPY(r10);
74 COPY(r11);
75 COPY(r12);
76 COPY(r13);
77 COPY(r14);
78 COPY(r15);
79
80 /* Kernel saves and restores only the CS segment register on signals,
81 * which is the bare minimum needed to allow mixed 32/64-bit code.
82 * App's signal handler can save/restore other segments if needed. */
83 {
84 unsigned cs;
85 err |= __get_user(cs, &sc->cs);
86 regs->cs = cs | 3; /* Force into user mode */
87 }
88
89 {
90 unsigned int tmpflags;
91 err |= __get_user(tmpflags, &sc->eflags);
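		/*
		 * 0x40DD5 = AC|OF|DF|TF|SF|ZF|AF|PF|CF: only these user-visible
		 * flags are taken over from the signal context.
		 */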
92 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
93 regs->orig_rax = -1; /* disable syscall checks */
94 }
95
96 {
97 struct _fpstate __user * buf;
98 err |= __get_user(buf, &sc->fpstate);
99
100 if (buf) {
101 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
102 goto badframe;
103 err |= restore_i387(buf);
104 } else {
105 struct task_struct *me = current;
106 if (used_math()) {
107 clear_fpu(me);
108 clear_used_math();
109 }
110 }
111 }
112
113 err |= __get_user(*prax, &sc->rax);
114 return err;
115
116badframe:
117 return 1;
118}
119
120asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
121{
122 struct rt_sigframe __user *frame;
123 sigset_t set;
124 unsigned long eax;
125
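	/* The return into the SA_RESTORER stub popped pretcode, so the frame
	   starts 8 bytes below the user rsp. */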
126 frame = (struct rt_sigframe __user *)(regs->rsp - 8);
127 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
128 goto badframe;
129 }
130 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) {
131 goto badframe;
132 }
133
134 sigdelsetmask(&set, ~_BLOCKABLE);
135 spin_lock_irq(&current->sighand->siglock);
136 current->blocked = set;
137 recalc_sigpending();
138 spin_unlock_irq(&current->sighand->siglock);
139
140 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
141 goto badframe;
142
143#ifdef DEBUG_SIG
144 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax);
145#endif
146
147 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
148 goto badframe;
149
150 return eax;
151
152badframe:
153 signal_fault(regs,frame,"sigreturn");
154 return 0;
155}
156
157/*
158 * Set up a signal frame.
159 */
160
161static inline int
162setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
163{
164 int err = 0;
165
166 err |= __put_user(regs->cs, &sc->cs);
167 err |= __put_user(0, &sc->gs);
168 err |= __put_user(0, &sc->fs);
169
170 err |= __put_user(regs->rdi, &sc->rdi);
171 err |= __put_user(regs->rsi, &sc->rsi);
172 err |= __put_user(regs->rbp, &sc->rbp);
173 err |= __put_user(regs->rsp, &sc->rsp);
174 err |= __put_user(regs->rbx, &sc->rbx);
175 err |= __put_user(regs->rdx, &sc->rdx);
176 err |= __put_user(regs->rcx, &sc->rcx);
177 err |= __put_user(regs->rax, &sc->rax);
178 err |= __put_user(regs->r8, &sc->r8);
179 err |= __put_user(regs->r9, &sc->r9);
180 err |= __put_user(regs->r10, &sc->r10);
181 err |= __put_user(regs->r11, &sc->r11);
182 err |= __put_user(regs->r12, &sc->r12);
183 err |= __put_user(regs->r13, &sc->r13);
184 err |= __put_user(regs->r14, &sc->r14);
185 err |= __put_user(regs->r15, &sc->r15);
186 err |= __put_user(me->thread.trap_no, &sc->trapno);
187 err |= __put_user(me->thread.error_code, &sc->err);
188 err |= __put_user(regs->rip, &sc->rip);
189 err |= __put_user(regs->eflags, &sc->eflags);
190 err |= __put_user(mask, &sc->oldmask);
191 err |= __put_user(me->thread.cr2, &sc->cr2);
192
193 return err;
194}
195
196/*
197 * Determine which stack to use..
198 */
199
200static void __user *
201get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
202{
203 unsigned long rsp;
204
205	/* Default to the normal stack, below the 128-byte red zone under %rsp */
206 rsp = regs->rsp - 128;
207
208 /* This is the X/Open sanctioned signal stack switching. */
209 if (ka->sa.sa_flags & SA_ONSTACK) {
210 if (sas_ss_flags(rsp) == 0)
211 rsp = current->sas_ss_sp + current->sas_ss_size;
212 }
213
214 return (void __user *)round_down(rsp - size, 16);
215}
216
217static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
218 sigset_t *set, struct pt_regs * regs)
219{
220 struct rt_sigframe __user *frame;
221 struct _fpstate __user *fp = NULL;
222 int err = 0;
223 struct task_struct *me = current;
224
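	/*
	 * The "- 8" below places the frame at 16*n - 8, so that %rsp + 8 is
	 * 16-byte aligned on handler entry, just as after a normal call.
	 */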
225 if (used_math()) {
226 fp = get_stack(ka, regs, sizeof(struct _fpstate));
227 frame = (void __user *)round_down(
228 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
229
230 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
231 goto give_sigsegv;
232
233 if (save_i387(fp) < 0)
234 err |= -1;
235 } else
236 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
237
238 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
239 goto give_sigsegv;
240
241 if (ka->sa.sa_flags & SA_SIGINFO) {
242 err |= copy_siginfo_to_user(&frame->info, info);
243 if (err)
244 goto give_sigsegv;
245 }
246
247 /* Create the ucontext. */
248 err |= __put_user(0, &frame->uc.uc_flags);
249 err |= __put_user(0, &frame->uc.uc_link);
250 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
251 err |= __put_user(sas_ss_flags(regs->rsp),
252 &frame->uc.uc_stack.ss_flags);
253 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
254 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
255 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
256 if (sizeof(*set) == 16) {
257 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
258 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
259 } else
260 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
261
262 /* Set up to return from userspace. If provided, use a stub
263 already in userspace. */
264 /* x86-64 should always use SA_RESTORER. */
265 if (ka->sa.sa_flags & SA_RESTORER) {
266 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
267 } else {
268		/* could use a vsyscall stub here */
269 goto give_sigsegv;
270 }
271
272 if (err)
273 goto give_sigsegv;
274
275#ifdef DEBUG_SIG
276 printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
277#endif
278
279 /* Set up registers for signal handler */
280 regs->rdi = sig;
281 /* In case the signal handler was declared without prototypes */
282 regs->rax = 0;
283
284 /* This also works for non SA_SIGINFO handlers because they expect the
285 next argument after the signal number on the stack. */
286 regs->rsi = (unsigned long)&frame->info;
287 regs->rdx = (unsigned long)&frame->uc;
288 regs->rip = (unsigned long) ka->sa.sa_handler;
289
290 regs->rsp = (unsigned long)frame;
291
292 /* Set up the CS register to run signal handlers in 64-bit mode,
293 even if the handler happens to be interrupting 32-bit code. */
294 regs->cs = __USER_CS;
295
296 /* This, by contrast, has nothing to do with segment registers -
297 see include/asm-x86_64/uaccess.h for details. */
298 set_fs(USER_DS);
299
300 regs->eflags &= ~TF_MASK;
301 if (test_thread_flag(TIF_SINGLESTEP))
302 ptrace_notify(SIGTRAP);
303#ifdef DEBUG_SIG
304 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
305 current->comm, current->pid, frame, regs->rip, frame->pretcode);
306#endif
307
308 return 0;
309
310give_sigsegv:
311 force_sigsegv(sig, current);
312 return -EFAULT;
313}
314
315/*
316 * OK, we're invoking a handler
317 */
318
319static int
320handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
321 sigset_t *oldset, struct pt_regs *regs)
322{
323 int ret;
324
325#ifdef DEBUG_SIG
326 printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
327 current->pid, sig,
328 regs->rip, regs->rsp, regs);
329#endif
330
331 /* Are we from a system call? */
332 if ((long)regs->orig_rax >= 0) {
333 /* If so, check system call restarting.. */
334 switch (regs->rax) {
335 case -ERESTART_RESTARTBLOCK:
336 case -ERESTARTNOHAND:
337 regs->rax = -EINTR;
338 break;
339
340 case -ERESTARTSYS:
341 if (!(ka->sa.sa_flags & SA_RESTART)) {
342 regs->rax = -EINTR;
343 break;
344 }
345 /* fallthrough */
346 case -ERESTARTNOINTR:
347 regs->rax = regs->orig_rax;
348 regs->rip -= 2;
349 break;
350 }
351 }
352
353 /*
354 * If TF is set due to a debugger (PT_DTRACE), clear the TF
355 * flag so that register information in the sigcontext is
356 * correct.
357 */
358 if (unlikely(regs->eflags & TF_MASK)) {
359 if (likely(current->ptrace & PT_DTRACE)) {
360 current->ptrace &= ~PT_DTRACE;
361 regs->eflags &= ~TF_MASK;
362 }
363 }
364
365#ifdef CONFIG_IA32_EMULATION
366 if (test_thread_flag(TIF_IA32)) {
367 if (ka->sa.sa_flags & SA_SIGINFO)
368 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
369 else
370 ret = ia32_setup_frame(sig, ka, oldset, regs);
371 } else
372#endif
373 ret = setup_rt_frame(sig, ka, info, oldset, regs);
374
375 if (ret == 0) {
376 spin_lock_irq(&current->sighand->siglock);
377 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
378 if (!(ka->sa.sa_flags & SA_NODEFER))
379 sigaddset(&current->blocked,sig);
380 recalc_sigpending();
381 spin_unlock_irq(&current->sighand->siglock);
382 }
383
384 return ret;
385}
386
387/*
388 * Note that 'init' is a special process: it doesn't get signals it doesn't
389 * want to handle. Thus you cannot kill init even with a SIGKILL even by
390 * mistake.
391 */
392static void do_signal(struct pt_regs *regs)
393{
394 struct k_sigaction ka;
395 siginfo_t info;
396 int signr;
397 sigset_t *oldset;
398
399 /*
400 * We want the common case to go fast, which
401 * is why we may in certain cases get here from
402 * kernel mode. Just return without doing anything
403 * if so.
404 */
405 if (!user_mode(regs))
406 return;
407
408 if (test_thread_flag(TIF_RESTORE_SIGMASK))
409 oldset = &current->saved_sigmask;
410 else
411 oldset = &current->blocked;
412
413 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
414 if (signr > 0) {
415 /* Reenable any watchpoints before delivering the
416 * signal to user space. The processor register will
417 * have been cleared if the watchpoint triggered
418 * inside the kernel.
419 */
420 if (current->thread.debugreg7)
421 set_debugreg(current->thread.debugreg7, 7);
422
423 /* Whee! Actually deliver the signal. */
424 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
425 /* a signal was successfully delivered; the saved
426 * sigmask will have been stored in the signal frame,
427 * and will be restored by sigreturn, so we can simply
428 * clear the TIF_RESTORE_SIGMASK flag */
429 clear_thread_flag(TIF_RESTORE_SIGMASK);
430 }
431 return;
432 }
433
434 /* Did we come from a system call? */
435 if ((long)regs->orig_rax >= 0) {
436 /* Restart the system call - no handlers present */
437 long res = regs->rax;
438 switch (res) {
439 case -ERESTARTNOHAND:
440 case -ERESTARTSYS:
441 case -ERESTARTNOINTR:
442 regs->rax = regs->orig_rax;
443 regs->rip -= 2;
444 break;
445 case -ERESTART_RESTARTBLOCK:
446 regs->rax = test_thread_flag(TIF_IA32) ?
447 __NR_ia32_restart_syscall :
448 __NR_restart_syscall;
449 regs->rip -= 2;
450 break;
451 }
452 }
453
454 /* if there's no signal to deliver, we just put the saved sigmask
455 back. */
456 if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
457 clear_thread_flag(TIF_RESTORE_SIGMASK);
458 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
459 }
460}
461
462void
463do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
464{
465#ifdef DEBUG_SIG
466 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n",
467 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current));
468#endif
469
470 /* Pending single-step? */
471 if (thread_info_flags & _TIF_SINGLESTEP) {
472 regs->eflags |= TF_MASK;
473 clear_thread_flag(TIF_SINGLESTEP);
474 }
475
476#ifdef CONFIG_X86_MCE
477 /* notify userspace of pending MCEs */
478 if (thread_info_flags & _TIF_MCE_NOTIFY)
479 mce_notify_user();
480#endif /* CONFIG_X86_MCE */
481
482 /* deal with pending signal delivery */
483 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
484 do_signal(regs);
485}
486
487void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
488{
489 struct task_struct *me = current;
490 if (show_unhandled_signals && printk_ratelimit())
491 printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
492 me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax);
493
494 force_sig(SIGSEGV, me);
495}
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
new file mode 100644
index 000000000000..2d35d8502029
--- /dev/null
+++ b/arch/x86/kernel/smp_32.c
@@ -0,0 +1,707 @@
1/*
2 * Intel SMP support routines.
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
6 *
7 * This code is released under the GNU General Public License version 2 or
8 * later.
9 */
10
11#include <linux/init.h>
12
13#include <linux/mm.h>
14#include <linux/delay.h>
15#include <linux/spinlock.h>
16#include <linux/kernel_stat.h>
17#include <linux/mc146818rtc.h>
18#include <linux/cache.h>
19#include <linux/interrupt.h>
20#include <linux/cpu.h>
21#include <linux/module.h>
22
23#include <asm/mtrr.h>
24#include <asm/tlbflush.h>
25#include <asm/mmu_context.h>
26#include <mach_apic.h>
27
28/*
29 * Some notes on x86 processor bugs affecting SMP operation:
30 *
31 * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
32 * The Linux implications for SMP are handled as follows:
33 *
34 * Pentium III / [Xeon]
35 * None of the E1AP-E3AP errata are visible to the user.
36 *
37 * E1AP. see PII A1AP
38 * E2AP. see PII A2AP
39 * E3AP. see PII A3AP
40 *
41 * Pentium II / [Xeon]
42 * None of the A1AP-A3AP errata are visible to the user.
43 *
44 * A1AP. see PPro 1AP
45 * A2AP. see PPro 2AP
46 * A3AP. see PPro 7AP
47 *
48 * Pentium Pro
49 *	None of the 1AP-9AP errata are visible to the normal user,
50 * except occasional delivery of 'spurious interrupt' as trap #15.
51 * This is very rare and a non-problem.
52 *
53 * 1AP. Linux maps APIC as non-cacheable
54 * 2AP. worked around in hardware
55 * 3AP. fixed in C0 and above steppings microcode update.
56 * Linux does not use excessive STARTUP_IPIs.
57 * 4AP. worked around in hardware
58 * 5AP. symmetric IO mode (normal Linux operation) not affected.
59 * 'noapic' mode has vector 0xf filled out properly.
60 * 6AP. 'noapic' mode might be affected - fixed in later steppings
61 *	7AP.	We do not assume writes to the LVT deasserting IRQs
62 * 8AP. We do not enable low power mode (deep sleep) during MP bootup
63 * 9AP. We do not use mixed mode
64 *
65 * Pentium
66 * There is a marginal case where REP MOVS on 100MHz SMP
67 * machines with B stepping processors can fail. XXX should provide
68 * an L1cache=Writethrough or L1cache=off option.
69 *
70 * B stepping CPUs may hang. There are hardware work arounds
71 * for this. We warn about it in case your board doesn't have the work
72 *	arounds. Basically that's so I can tell anyone with a B stepping
73 * CPU and SMP problems "tough".
74 *
75 * Specific items [From Pentium Processor Specification Update]
76 *
77 * 1AP. Linux doesn't use remote read
78 * 2AP. Linux doesn't trust APIC errors
79 * 3AP. We work around this
80 * 4AP. Linux never generated 3 interrupts of the same priority
81 * to cause a lost local interrupt.
82 * 5AP. Remote read is never used
83 * 6AP. not affected - worked around in hardware
84 * 7AP. not affected - worked around in hardware
85 * 8AP. worked around in hardware - we get explicit CS errors if not
86 * 9AP. only 'noapic' mode affected. Might generate spurious
87 * interrupts, we log only the first one and count the
88 * rest silently.
89 * 10AP. not affected - worked around in hardware
90 * 11AP. Linux reads the APIC between writes to avoid this, as per
91 * the documentation. Make sure you preserve this as it affects
92 * the C stepping chips too.
93 * 12AP. not affected - worked around in hardware
94 * 13AP. not affected - worked around in hardware
95 * 14AP. we always deassert INIT during bootup
96 * 15AP. not affected - worked around in hardware
97 * 16AP. not affected - worked around in hardware
98 * 17AP. not affected - worked around in hardware
99 * 18AP. not affected - worked around in hardware
100 * 19AP. not affected - worked around in BIOS
101 *
102 * If this sounds worrying, believe me these bugs are either ___RARE___,
103 * or are signal timing bugs worked around in hardware and there's
104 * next to nothing of note from the C stepping upwards.
105 */
106
107DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
108
109/*
110 * the following functions deal with sending IPIs between CPUs.
111 *
112 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
113 */
114
115static inline int __prepare_ICR (unsigned int shortcut, int vector)
116{
117 unsigned int icr = shortcut | APIC_DEST_LOGICAL;
118
119 switch (vector) {
120 default:
121 icr |= APIC_DM_FIXED | vector;
122 break;
123 case NMI_VECTOR:
124 icr |= APIC_DM_NMI;
125 break;
126 }
127 return icr;
128}
129
130static inline int __prepare_ICR2 (unsigned int mask)
131{
132 return SET_APIC_DEST_FIELD(mask);
133}
134
135void __send_IPI_shortcut(unsigned int shortcut, int vector)
136{
137 /*
138 * Subtle. In the case of the 'never do double writes' workaround
139	 * we have to lock out interrupts to be safe. As we don't care
140	 * about the value read, we use an atomic rmw access to avoid costly
141 * cli/sti. Otherwise we use an even cheaper single atomic write
142 * to the APIC.
143 */
144 unsigned int cfg;
145
146 /*
147 * Wait for idle.
148 */
149 apic_wait_icr_idle();
150
151 /*
152 * No need to touch the target chip field
153 */
154 cfg = __prepare_ICR(shortcut, vector);
155
156 /*
157 * Send the IPI. The write to APIC_ICR fires this off.
158 */
159 apic_write_around(APIC_ICR, cfg);
160}
161
162void fastcall send_IPI_self(int vector)
163{
164 __send_IPI_shortcut(APIC_DEST_SELF, vector);
165}
166
167/*
168 * This is used to send an IPI with no shorthand notation (the destination is
169 * specified in bits 56 to 63 of the ICR).
170 */
171static inline void __send_IPI_dest_field(unsigned long mask, int vector)
172{
173 unsigned long cfg;
174
175 /*
176 * Wait for idle.
177 */
178 if (unlikely(vector == NMI_VECTOR))
179 safe_apic_wait_icr_idle();
180 else
181 apic_wait_icr_idle();
182
183 /*
184 * prepare target chip field
185 */
186 cfg = __prepare_ICR2(mask);
187 apic_write_around(APIC_ICR2, cfg);
188
189 /*
190 * program the ICR
191 */
192 cfg = __prepare_ICR(0, vector);
193
194 /*
195 * Send the IPI. The write to APIC_ICR fires this off.
196 */
197 apic_write_around(APIC_ICR, cfg);
198}
199
200/*
201 * This is only used on smaller machines.
202 */
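/*
 * Only the first word of the cpumask is used here, so this path can
 * address at most BITS_PER_LONG CPUs.
 */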
203void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
204{
205 unsigned long mask = cpus_addr(cpumask)[0];
206 unsigned long flags;
207
208 local_irq_save(flags);
209 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
210 __send_IPI_dest_field(mask, vector);
211 local_irq_restore(flags);
212}
213
214void send_IPI_mask_sequence(cpumask_t mask, int vector)
215{
216 unsigned long flags;
217 unsigned int query_cpu;
218
219 /*
220 * Hack. The clustered APIC addressing mode doesn't allow us to send
221	 * to an arbitrary mask, so I do unicasts to each CPU instead. This
222 * should be modified to do 1 message per cluster ID - mbligh
223 */
224
225 local_irq_save(flags);
226 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
227 if (cpu_isset(query_cpu, mask)) {
228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
229 vector);
230 }
231 }
232 local_irq_restore(flags);
233}
234
235#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
236
237/*
238 * Smarter SMP flushing macros.
239 * c/o Linus Torvalds.
240 *
241 * These mean you can really definitely utterly forget about
242 *	writing to user space from interrupts. (It's not allowed anyway).
243 *
244 * Optimizations Manfred Spraul <manfred@colorfullife.com>
245 */
246
247static cpumask_t flush_cpumask;
248static struct mm_struct * flush_mm;
249static unsigned long flush_va;
250static DEFINE_SPINLOCK(tlbstate_lock);
251
252/*
253 * We cannot call mmdrop() because we are in interrupt context,
254 * instead update mm->cpu_vm_mask.
255 *
256 * We need to reload %cr3 since the page tables may be going
257 * away from under us..
258 */
259void leave_mm(unsigned long cpu)
260{
261 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
262 BUG();
263 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
264 load_cr3(swapper_pg_dir);
265}
266
267/*
268 *
269 * The flush IPI assumes that a thread switch happens in this order:
270 * [cpu0: the cpu that switches]
271 * 1) switch_mm() either 1a) or 1b)
272 * 1a) thread switch to a different mm
273 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
274 * Stop ipi delivery for the old mm. This is not synchronized with
275 * 	Stop ipi delivery for the old mm. This is not synchronized with
276 * 	the other cpus, but smp_invalidate_interrupt ignores flush ipis
277 * tlb flush.
278 * 1a2) set cpu_tlbstate to TLBSTATE_OK
279 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
280 * was in lazy tlb mode.
281 * 1a3) update cpu_tlbstate[].active_mm
282 * Now cpu0 accepts tlb flushes for the new mm.
283 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
284 * Now the other cpus will send tlb flush ipis.
285 * 1a4) change cr3.
286 * 1b) thread switch without mm change
287 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
288 * flush ipis.
289 * 1b1) set cpu_tlbstate to TLBSTATE_OK
290 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
291 * Atomically set the bit [other cpus will start sending flush ipis],
292 * and test the bit.
293 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
294 * 2) switch %%esp, ie current
295 *
296 * The interrupt must handle 2 special cases:
297 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
298 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
299 * runs in kernel space, the cpu could load tlb entries for user space
300 * pages.
301 *
302 * The good news is that cpu_tlbstate is local to each cpu, no
303 * write/read ordering problems.
304 */
305
306/*
307 * TLB flush IPI:
308 *
309 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
310 * 2) Leave the mm if we are in the lazy tlb mode.
311 */
312
313fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
314{
315 unsigned long cpu;
316
317 cpu = get_cpu();
318
319 if (!cpu_isset(cpu, flush_cpumask))
320 goto out;
321 /*
322 * This was a BUG() but until someone can quote me the
323 * line from the intel manual that guarantees an IPI to
324 * multiple CPUs is retried _only_ on the erroring CPUs
325	 * it's staying as a return
326 *
327 * BUG();
328 */
329
330 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
331 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
332 if (flush_va == TLB_FLUSH_ALL)
333 local_flush_tlb();
334 else
335 __flush_tlb_one(flush_va);
336 } else
337 leave_mm(cpu);
338 }
339 ack_APIC_irq();
340 smp_mb__before_clear_bit();
341 cpu_clear(cpu, flush_cpumask);
342 smp_mb__after_clear_bit();
343out:
344 put_cpu_no_resched();
345}
346
347void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
348 unsigned long va)
349{
350 cpumask_t cpumask = *cpumaskp;
351
352 /*
353 * A couple of (to be removed) sanity checks:
354 *
355 * - current CPU must not be in mask
356 * - mask must exist :)
357 */
358 BUG_ON(cpus_empty(cpumask));
359 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
360 BUG_ON(!mm);
361
362#ifdef CONFIG_HOTPLUG_CPU
363 /* If a CPU which we ran on has gone down, OK. */
364 cpus_and(cpumask, cpumask, cpu_online_map);
365 if (unlikely(cpus_empty(cpumask)))
366 return;
367#endif
368
369 /*
370	 * I'm not happy about this global shared spinlock in the
371 * MM hot path, but we'll see how contended it is.
372 * AK: x86-64 has a faster method that could be ported.
373 */
374 spin_lock(&tlbstate_lock);
375
376 flush_mm = mm;
377 flush_va = va;
378 cpus_or(flush_cpumask, cpumask, flush_cpumask);
379 /*
380 * We have to send the IPI only to
381 * CPUs affected.
382 */
383 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
384
385 while (!cpus_empty(flush_cpumask))
386 /* nothing. lockup detection does not belong here */
387 cpu_relax();
388
389 flush_mm = NULL;
390 flush_va = 0;
391 spin_unlock(&tlbstate_lock);
392}
393
394void flush_tlb_current_task(void)
395{
396 struct mm_struct *mm = current->mm;
397 cpumask_t cpu_mask;
398
399 preempt_disable();
400 cpu_mask = mm->cpu_vm_mask;
401 cpu_clear(smp_processor_id(), cpu_mask);
402
403 local_flush_tlb();
404 if (!cpus_empty(cpu_mask))
405 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
406 preempt_enable();
407}
408
409void flush_tlb_mm (struct mm_struct * mm)
410{
411 cpumask_t cpu_mask;
412
413 preempt_disable();
414 cpu_mask = mm->cpu_vm_mask;
415 cpu_clear(smp_processor_id(), cpu_mask);
416
417 if (current->active_mm == mm) {
418 if (current->mm)
419 local_flush_tlb();
420 else
421 leave_mm(smp_processor_id());
422 }
423 if (!cpus_empty(cpu_mask))
424 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
425
426 preempt_enable();
427}
428
429void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
430{
431 struct mm_struct *mm = vma->vm_mm;
432 cpumask_t cpu_mask;
433
434 preempt_disable();
435 cpu_mask = mm->cpu_vm_mask;
436 cpu_clear(smp_processor_id(), cpu_mask);
437
438 if (current->active_mm == mm) {
439		if (current->mm)
440 __flush_tlb_one(va);
441 else
442 leave_mm(smp_processor_id());
443 }
444
445 if (!cpus_empty(cpu_mask))
446 flush_tlb_others(cpu_mask, mm, va);
447
448 preempt_enable();
449}
450EXPORT_SYMBOL(flush_tlb_page);
451
452static void do_flush_tlb_all(void* info)
453{
454 unsigned long cpu = smp_processor_id();
455
456 __flush_tlb_all();
457 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
458 leave_mm(cpu);
459}
460
461void flush_tlb_all(void)
462{
463 on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
464}
465
466/*
467 * this function sends a 'reschedule' IPI to another CPU.
468 * it goes straight through and wastes no time serializing
469 * anything. Worst case is that we lose a reschedule ...
470 */
471static void native_smp_send_reschedule(int cpu)
472{
473 WARN_ON(cpu_is_offline(cpu));
474 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
475}
476
477/*
478 * Structure and data for smp_call_function(). This is designed to minimise
479 * static memory requirements. It also looks cleaner.
480 */
481static DEFINE_SPINLOCK(call_lock);
482
483struct call_data_struct {
484 void (*func) (void *info);
485 void *info;
486 atomic_t started;
487 atomic_t finished;
488 int wait;
489};
490
491void lock_ipi_call_lock(void)
492{
493 spin_lock_irq(&call_lock);
494}
495
496void unlock_ipi_call_lock(void)
497{
498 spin_unlock_irq(&call_lock);
499}
500
501static struct call_data_struct *call_data;
502
503static void __smp_call_function(void (*func) (void *info), void *info,
504 int nonatomic, int wait)
505{
506 struct call_data_struct data;
507 int cpus = num_online_cpus() - 1;
508
509 if (!cpus)
510 return;
511
512 data.func = func;
513 data.info = info;
514 atomic_set(&data.started, 0);
515 data.wait = wait;
516 if (wait)
517 atomic_set(&data.finished, 0);
518
519 call_data = &data;
520 mb();
521
522 /* Send a message to all other CPUs and wait for them to respond */
523 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
524
525 /* Wait for response */
526 while (atomic_read(&data.started) != cpus)
527 cpu_relax();
528
529 if (wait)
530 while (atomic_read(&data.finished) != cpus)
531 cpu_relax();
532}
533
534
535/**
536 * smp_call_function_mask(): Run a function on a set of other CPUs.
537 * @mask: The set of cpus to run on. Must not include the current cpu.
538 * @func: The function to run. This must be fast and non-blocking.
539 * @info: An arbitrary pointer to pass to the function.
540 * @wait: If true, wait (atomically) until function has completed on other CPUs.
541 *
542 * Returns 0 on success, else a negative status code.
543 *
544 * If @wait is true, then returns once @func has returned; otherwise
545 * it returns just before the target cpu calls @func.
546 *
547 * You must not call this function with disabled interrupts or from a
548 * hardware interrupt handler or from a bottom half handler.
549 */
550static int
551native_smp_call_function_mask(cpumask_t mask,
552 void (*func)(void *), void *info,
553 int wait)
554{
555 struct call_data_struct data;
556 cpumask_t allbutself;
557 int cpus;
558
559 /* Can deadlock when called with interrupts disabled */
560 WARN_ON(irqs_disabled());
561
562 /* Holding any lock stops cpus from going down. */
563 spin_lock(&call_lock);
564
565 allbutself = cpu_online_map;
566 cpu_clear(smp_processor_id(), allbutself);
567
568 cpus_and(mask, mask, allbutself);
569 cpus = cpus_weight(mask);
570
571 if (!cpus) {
572 spin_unlock(&call_lock);
573 return 0;
574 }
575
576 data.func = func;
577 data.info = info;
578 atomic_set(&data.started, 0);
579 data.wait = wait;
580 if (wait)
581 atomic_set(&data.finished, 0);
582
583 call_data = &data;
584 mb();
585
586 /* Send a message to other CPUs */
587 if (cpus_equal(mask, allbutself))
588 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
589 else
590 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
591
592 /* Wait for response */
593 while (atomic_read(&data.started) != cpus)
594 cpu_relax();
595
596 if (wait)
597 while (atomic_read(&data.finished) != cpus)
598 cpu_relax();
599 spin_unlock(&call_lock);
600
601 return 0;
602}
603
604static void stop_this_cpu (void * dummy)
605{
606 local_irq_disable();
607 /*
608 * Remove this CPU:
609 */
610 cpu_clear(smp_processor_id(), cpu_online_map);
611 disable_local_APIC();
612 if (cpu_data[smp_processor_id()].hlt_works_ok)
613 for(;;) halt();
614 for (;;);
615}
616
617/*
618 * this function calls the 'stop' function on all other CPUs in the system.
619 */
620
621static void native_smp_send_stop(void)
622{
623 /* Don't deadlock on the call lock in panic */
624 int nolock = !spin_trylock(&call_lock);
625 unsigned long flags;
626
627 local_irq_save(flags);
628 __smp_call_function(stop_this_cpu, NULL, 0, 0);
629 if (!nolock)
630 spin_unlock(&call_lock);
631 disable_local_APIC();
632 local_irq_restore(flags);
633}
634
635/*
636 * Reschedule call back. Nothing to do,
637 * all the work is done automatically when
638 * we return from the interrupt.
639 */
640fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
641{
642 ack_APIC_irq();
643}
644
645fastcall void smp_call_function_interrupt(struct pt_regs *regs)
646{
647 void (*func) (void *info) = call_data->func;
648 void *info = call_data->info;
649 int wait = call_data->wait;
650
651 ack_APIC_irq();
652 /*
653 * Notify initiating CPU that I've grabbed the data and am
654 * about to execute the function
655 */
656 mb();
657 atomic_inc(&call_data->started);
658 /*
659 * At this point the info structure may be out of scope unless wait==1
660 */
661 irq_enter();
662 (*func)(info);
663 irq_exit();
664
665 if (wait) {
666 mb();
667 atomic_inc(&call_data->finished);
668 }
669}
670
671static int convert_apicid_to_cpu(int apic_id)
672{
673 int i;
674
675 for (i = 0; i < NR_CPUS; i++) {
676 if (x86_cpu_to_apicid[i] == apic_id)
677 return i;
678 }
679 return -1;
680}
681
682int safe_smp_processor_id(void)
683{
684 int apicid, cpuid;
685
686 if (!boot_cpu_has(X86_FEATURE_APIC))
687 return 0;
688
689 apicid = hard_smp_processor_id();
690 if (apicid == BAD_APICID)
691 return 0;
692
693 cpuid = convert_apicid_to_cpu(apicid);
694
695 return cpuid >= 0 ? cpuid : 0;
696}
697
698struct smp_ops smp_ops = {
699 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
700 .smp_prepare_cpus = native_smp_prepare_cpus,
701 .cpu_up = native_cpu_up,
702 .smp_cpus_done = native_smp_cpus_done,
703
704 .smp_send_stop = native_smp_send_stop,
705 .smp_send_reschedule = native_smp_send_reschedule,
706 .smp_call_function_mask = native_smp_call_function_mask,
707};
diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c
new file mode 100644
index 000000000000..df4a82812adb
--- /dev/null
+++ b/arch/x86/kernel/smp_64.c
@@ -0,0 +1,523 @@
1/*
2 * Intel SMP support routines.
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
6 * (c) 2002,2003 Andi Kleen, SuSE Labs.
7 *
8 * This code is released under the GNU General Public License version 2 or
9 * later.
10 */
11
12#include <linux/init.h>
13
14#include <linux/mm.h>
15#include <linux/delay.h>
16#include <linux/spinlock.h>
17#include <linux/smp.h>
18#include <linux/kernel_stat.h>
19#include <linux/mc146818rtc.h>
20#include <linux/interrupt.h>
21
22#include <asm/mtrr.h>
23#include <asm/pgalloc.h>
24#include <asm/tlbflush.h>
25#include <asm/mach_apic.h>
26#include <asm/mmu_context.h>
27#include <asm/proto.h>
28#include <asm/apicdef.h>
29#include <asm/idle.h>
30
31/*
32 * Smarter SMP flushing macros.
33 * c/o Linus Torvalds.
34 *
35 * These mean you can really definitely utterly forget about
36 *	writing to user space from interrupts. (It's not allowed anyway).
37 *
38 * Optimizations Manfred Spraul <manfred@colorfullife.com>
39 *
40 * More scalable flush, from Andi Kleen
41 *
42 * To avoid global state use 8 different call vectors.
43 * Each CPU uses a specific vector to trigger flushes on other
44 * CPUs. Depending on the received vector the target CPUs look into
45 * the right per cpu variable for the flush data.
46 *
47 * With more than 8 CPUs they are hashed to the 8 available
48 * vectors. The limited global vector space forces us to this right now.
49 * In future when interrupts are split into per CPU domains this could be
50 * fixed, at the cost of triggering multiple IPIs in some cases.
51 */
52
53union smp_flush_state {
54 struct {
55 cpumask_t flush_cpumask;
56 struct mm_struct *flush_mm;
57 unsigned long flush_va;
58#define FLUSH_ALL -1ULL
59 spinlock_t tlbstate_lock;
60 };
61 char pad[SMP_CACHE_BYTES];
62} ____cacheline_aligned;
63
64/* State is put into the per CPU data section, but padded
65 to a full cache line because other CPUs can access it and we don't
66 want false sharing in the per cpu data segment. */
67static DEFINE_PER_CPU(union smp_flush_state, flush_state);
68
69/*
70 * We cannot call mmdrop() because we are in interrupt context,
71 * instead update mm->cpu_vm_mask.
72 */
73static inline void leave_mm(int cpu)
74{
75 if (read_pda(mmu_state) == TLBSTATE_OK)
76 BUG();
77 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
78 load_cr3(swapper_pg_dir);
79}
80
81/*
82 *
83 * The flush IPI assumes that a thread switch happens in this order:
84 * [cpu0: the cpu that switches]
85 * 1) switch_mm() either 1a) or 1b)
86 * 1a) thread switch to a different mm
87 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
88 * Stop ipi delivery for the old mm. This is not synchronized with
89 * 	the other cpus, but smp_invalidate_interrupt ignores flush ipis
90 * for the wrong mm, and in the worst case we perform a superfluous
91 * tlb flush.
92 * 1a2) set cpu mmu_state to TLBSTATE_OK
93 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
94 * was in lazy tlb mode.
95 * 1a3) update cpu active_mm
96 * Now cpu0 accepts tlb flushes for the new mm.
97 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
98 * Now the other cpus will send tlb flush ipis.
99 * 1a4) change cr3.
100 * 1b) thread switch without mm change
101 * cpu active_mm is correct, cpu0 already handles
102 * flush ipis.
103 * 1b1) set cpu mmu_state to TLBSTATE_OK
104 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
105 * Atomically set the bit [other cpus will start sending flush ipis],
106 * and test the bit.
107 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
108 * 2) switch %%esp, ie current
109 *
110 * The interrupt must handle 2 special cases:
111 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
112 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
113 * runs in kernel space, the cpu could load tlb entries for user space
114 * pages.
115 *
116 * The good news is that cpu mmu_state is local to each cpu, no
117 * write/read ordering problems.
118 */
119
120/*
121 * TLB flush IPI:
122 *
123 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
124 * 2) Leave the mm if we are in the lazy tlb mode.
125 *
126 * Interrupts are disabled.
127 */
128
129asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
130{
131 int cpu;
132 int sender;
133 union smp_flush_state *f;
134
135 cpu = smp_processor_id();
136 /*
137 * orig_rax contains the negated interrupt vector.
138 * Use that to determine where the sender put the data.
139 */
140 sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
141 f = &per_cpu(flush_state, sender);
142
143 if (!cpu_isset(cpu, f->flush_cpumask))
144 goto out;
145 /*
146 * This was a BUG() but until someone can quote me the
147 * line from the Intel manual that guarantees an IPI to
148 * multiple CPUs is retried _only_ on the erroring CPUs,
149 * it's staying as a return
150 *
151 * BUG();
152 */
153
154 if (f->flush_mm == read_pda(active_mm)) {
155 if (read_pda(mmu_state) == TLBSTATE_OK) {
156 if (f->flush_va == FLUSH_ALL)
157 local_flush_tlb();
158 else
159 __flush_tlb_one(f->flush_va);
160 } else
161 leave_mm(cpu);
162 }
163out:
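 /*
 * Acknowledge the interrupt and then clear our bit in flush_cpumask;
 * clearing the bit is what releases the sender, which spins in
 * flush_tlb_others() until the mask is empty.
 */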
164 ack_APIC_irq();
165 cpu_clear(cpu, f->flush_cpumask);
166}
167
168static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
169 unsigned long va)
170{
171 int sender;
172 union smp_flush_state *f;
173
174 /* Caller has disabled preemption */
175 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
176 f = &per_cpu(flush_state, sender);
177
178 /* Could avoid this lock when
179 num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
180 probably not worth checking this for a cache-hot lock. */
181 spin_lock(&f->tlbstate_lock);
182
183 f->flush_mm = mm;
184 f->flush_va = va;
185 cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
186
187 /*
188 * We have to send the IPI only to
189 * CPUs affected.
190 */
191 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
192
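 /*
 * Spin until every CPU named in flush_cpumask has taken the IPI and
 * cleared its bit in smp_invalidate_interrupt(); only then can this
 * slot's flush_mm/flush_va safely be reused.
 */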
193 while (!cpus_empty(f->flush_cpumask))
194 cpu_relax();
195
196 f->flush_mm = NULL;
197 f->flush_va = 0;
198 spin_unlock(&f->tlbstate_lock);
199}
200
201int __cpuinit init_smp_flush(void)
202{
203 int i;
204 for_each_cpu_mask(i, cpu_possible_map) {
205 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
206 }
207 return 0;
208}
209
210core_initcall(init_smp_flush);
211
212void flush_tlb_current_task(void)
213{
214 struct mm_struct *mm = current->mm;
215 cpumask_t cpu_mask;
216
217 preempt_disable();
218 cpu_mask = mm->cpu_vm_mask;
219 cpu_clear(smp_processor_id(), cpu_mask);
220
221 local_flush_tlb();
222 if (!cpus_empty(cpu_mask))
223 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
224 preempt_enable();
225}
226EXPORT_SYMBOL(flush_tlb_current_task);
227
228void flush_tlb_mm (struct mm_struct * mm)
229{
230 cpumask_t cpu_mask;
231
232 preempt_disable();
233 cpu_mask = mm->cpu_vm_mask;
234 cpu_clear(smp_processor_id(), cpu_mask);
235
236 if (current->active_mm == mm) {
237 if (current->mm)
238 local_flush_tlb();
239 else
240 leave_mm(smp_processor_id());
241 }
242 if (!cpus_empty(cpu_mask))
243 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
244
245 preempt_enable();
246}
247EXPORT_SYMBOL(flush_tlb_mm);
248
249void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
250{
251 struct mm_struct *mm = vma->vm_mm;
252 cpumask_t cpu_mask;
253
254 preempt_disable();
255 cpu_mask = mm->cpu_vm_mask;
256 cpu_clear(smp_processor_id(), cpu_mask);
257
258 if (current->active_mm == mm) {
259 if(current->mm)
260 __flush_tlb_one(va);
261 else
262 leave_mm(smp_processor_id());
263 }
264
265 if (!cpus_empty(cpu_mask))
266 flush_tlb_others(cpu_mask, mm, va);
267
268 preempt_enable();
269}
270EXPORT_SYMBOL(flush_tlb_page);
271
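/*
 * Helper run on every CPU by flush_tlb_all() below: flush the entire
 * local TLB, and if this CPU was only lazily borrowing an mm, drop out
 * of it via leave_mm().
 */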
272static void do_flush_tlb_all(void* info)
273{
274 unsigned long cpu = smp_processor_id();
275
276 __flush_tlb_all();
277 if (read_pda(mmu_state) == TLBSTATE_LAZY)
278 leave_mm(cpu);
279}
280
281void flush_tlb_all(void)
282{
283 on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
284}
285
286/*
287 * this function sends a 'reschedule' IPI to another CPU.
288 * it goes straight through and wastes no time serializing
289 * anything. Worst case is that we lose a reschedule ...
290 */
291
292void smp_send_reschedule(int cpu)
293{
294 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
295}
296
297/*
298 * Structure and data for smp_call_function(). This is designed to minimise
299 * static memory requirements. It also looks cleaner.
300 */
301static DEFINE_SPINLOCK(call_lock);
302
303struct call_data_struct {
304 void (*func) (void *info);
305 void *info;
306 atomic_t started;
307 atomic_t finished;
308 int wait;
309};
310
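/*
 * call_data points at a call_data_struct living on the initiating CPU's
 * stack.  The IPI handler (smp_call_function_interrupt) bumps ->started
 * once it has read func/info, and ->finished (when wait is set) after
 * the function has run, so the initiator knows when the structure may
 * go out of scope.
 */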
311static struct call_data_struct * call_data;
312
313void lock_ipi_call_lock(void)
314{
315 spin_lock_irq(&call_lock);
316}
317
318void unlock_ipi_call_lock(void)
319{
320 spin_unlock_irq(&call_lock);
321}
322
323/*
324 * this function sends a 'generic call function' IPI to one other CPU
325 * in the system.
326 *
327 * cpu is a standard Linux logical CPU number.
328 */
329static void
330__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
331 int nonatomic, int wait)
332{
333 struct call_data_struct data;
334 int cpus = 1;
335
336 data.func = func;
337 data.info = info;
338 atomic_set(&data.started, 0);
339 data.wait = wait;
340 if (wait)
341 atomic_set(&data.finished, 0);
342
343 call_data = &data;
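 /*
 * Order the stores above against the IPI below: the receiving CPU must
 * see the fully initialised call_data before it dereferences it in
 * smp_call_function_interrupt().
 */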
344 wmb();
345 /* Send a message to the target CPU and wait for it to respond */
346 send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
347
348 /* Wait for response */
349 while (atomic_read(&data.started) != cpus)
350 cpu_relax();
351
352 if (!wait)
353 return;
354
355 while (atomic_read(&data.finished) != cpus)
356 cpu_relax();
357}
358
359/*
360 * smp_call_function_single - Run a function on a specific CPU
361 * @func: The function to run. This must be fast and non-blocking.
362 * @info: An arbitrary pointer to pass to the function.
363 * @nonatomic: Currently unused.
364 * @wait: If true, wait until function has completed on other CPUs.
365 *
366 * Returns 0 on success, else a negative status code.
367 *
368 * Does not return until the remote CPU is nearly ready to execute <func>,
369 * is executing it, or has already executed it.
370 */
371
372int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
373 int nonatomic, int wait)
374{
375 /* prevent preemption and reschedule on another processor */
376 int me = get_cpu();
377
378 /* Can deadlock when called with interrupts disabled */
379 WARN_ON(irqs_disabled());
380
381 if (cpu == me) {
382 local_irq_disable();
383 func(info);
384 local_irq_enable();
385 put_cpu();
386 return 0;
387 }
388
389 spin_lock(&call_lock);
390 __smp_call_function_single(cpu, func, info, nonatomic, wait);
391 spin_unlock(&call_lock);
392 put_cpu();
393 return 0;
394}
395EXPORT_SYMBOL(smp_call_function_single);
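/*
 * Illustrative use of smp_call_function_single() (hypothetical callback,
 * not part of this file): run a fast, non-blocking function on one CPU
 * and wait for it to finish.
 *
 *	static void read_local_counter(void *info)
 *	{
 *		*(u64 *)info = ...;	/- whatever per-CPU work is needed
 *	}
 *
 *	u64 val;
 *	smp_call_function_single(cpu, read_local_counter, &val, 0, 1);
 */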
396
397/*
398 * this function sends a 'generic call function' IPI to all other CPUs
399 * in the system.
400 */
401static void __smp_call_function (void (*func) (void *info), void *info,
402 int nonatomic, int wait)
403{
404 struct call_data_struct data;
405 int cpus = num_online_cpus()-1;
406
407 if (!cpus)
408 return;
409
410 data.func = func;
411 data.info = info;
412 atomic_set(&data.started, 0);
413 data.wait = wait;
414 if (wait)
415 atomic_set(&data.finished, 0);
416
417 call_data = &data;
418 wmb();
419 /* Send a message to all other CPUs and wait for them to respond */
420 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
421
422 /* Wait for response */
423 while (atomic_read(&data.started) != cpus)
424 cpu_relax();
425
426 if (!wait)
427 return;
428
429 while (atomic_read(&data.finished) != cpus)
430 cpu_relax();
431}
432
433/*
434 * smp_call_function - run a function on all other CPUs.
435 * @func: The function to run. This must be fast and non-blocking.
436 * @info: An arbitrary pointer to pass to the function.
437 * @nonatomic: currently unused.
438 * @wait: If true, wait (atomically) until function has completed on other
439 * CPUs.
440 *
441 * Returns 0 on success, else a negative status code. Does not return until
442 * remote CPUs are nearly ready to execute func, are executing it, or have executed it.
443 *
444 * You must not call this function with disabled interrupts or from a
445 * hardware interrupt handler or from a bottom half handler.
446 * Actually there are a few legal cases, like panic.
447 */
448int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
449 int wait)
450{
451 spin_lock(&call_lock);
452 __smp_call_function(func, info, nonatomic, wait);
453 spin_unlock(&call_lock);
454 return 0;
455}
456EXPORT_SYMBOL(smp_call_function);
457
458static void stop_this_cpu(void *dummy)
459{
460 local_irq_disable();
461 /*
462 * Remove this CPU:
463 */
464 cpu_clear(smp_processor_id(), cpu_online_map);
465 disable_local_APIC();
466 for (;;)
467 halt();
468}
469
470void smp_send_stop(void)
471{
472 int nolock;
473 unsigned long flags;
474
475 if (reboot_force)
476 return;
477
478 /* Don't deadlock on the call lock in panic */
479 nolock = !spin_trylock(&call_lock);
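 /*
 * If the trylock failed (e.g. the lock holder is the CPU that
 * panicked), carry on anyway; we only skip the unlock below.
 */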
480 local_irq_save(flags);
481 __smp_call_function(stop_this_cpu, NULL, 0, 0);
482 if (!nolock)
483 spin_unlock(&call_lock);
484 disable_local_APIC();
485 local_irq_restore(flags);
486}
487
488/*
489 * Reschedule callback. Nothing to do;
490 * all the work is done automatically when
491 * we return from the interrupt.
492 */
493asmlinkage void smp_reschedule_interrupt(void)
494{
495 ack_APIC_irq();
496}
497
498asmlinkage void smp_call_function_interrupt(void)
499{
500 void (*func) (void *info) = call_data->func;
501 void *info = call_data->info;
502 int wait = call_data->wait;
503
504 ack_APIC_irq();
505 /*
506 * Notify initiating CPU that I've grabbed the data and am
507 * about to execute the function
508 */
509 mb();
510 atomic_inc(&call_data->started);
511 /*
512 * At this point the info structure may be out of scope unless wait==1
513 */
514 exit_idle();
515 irq_enter();
516 (*func)(info);
517 irq_exit();
518 if (wait) {
519 mb();
520 atomic_inc(&call_data->finished);
521 }
522}
523
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c
new file mode 100644
index 000000000000..e4f61d1c6248
--- /dev/null
+++ b/arch/x86/kernel/smpboot_32.c
@@ -0,0 +1,1322 @@
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 *
7 * Much of the core SMP work is based on previous work by Thomas Radke, to
8 * whom a great many thanks are extended.
9 *
10 * Thanks to Intel for making available several different Pentium,
11 * Pentium Pro and Pentium-II/Xeon MP machines.
12 * Original development of Linux SMP code supported by Caldera.
13 *
14 * This code is released under the GNU General Public License version 2 or
15 * later.
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIPS report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Martin J. Bligh : Added support for multi-quad systems
33 * Dave Jones : Report invalid combinations of Athlon CPUs.
34 * Rusty Russell : Hacked into shape for new "hotplug" boot process. */
35
36#include <linux/module.h>
37#include <linux/init.h>
38#include <linux/kernel.h>
39
40#include <linux/mm.h>
41#include <linux/sched.h>
42#include <linux/kernel_stat.h>
43#include <linux/bootmem.h>
44#include <linux/notifier.h>
45#include <linux/cpu.h>
46#include <linux/percpu.h>
47#include <linux/nmi.h>
48
49#include <linux/delay.h>
50#include <linux/mc146818rtc.h>
51#include <asm/tlbflush.h>
52#include <asm/desc.h>
53#include <asm/arch_hooks.h>
54#include <asm/nmi.h>
55
56#include <mach_apic.h>
57#include <mach_wakecpu.h>
58#include <smpboot_hooks.h>
59#include <asm/vmi.h>
60#include <asm/mtrr.h>
61
62/* Set if we find a B stepping CPU */
63static int __devinitdata smp_b_stepping;
64
65/* Number of siblings per CPU package */
66int smp_num_siblings = 1;
67EXPORT_SYMBOL(smp_num_siblings);
68
69/* Last level cache ID of each logical CPU */
70int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
71
72/* representing HT siblings of each logical CPU */
73cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
74EXPORT_SYMBOL(cpu_sibling_map);
75
76/* representing HT and core siblings of each logical CPU */
77cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
78EXPORT_SYMBOL(cpu_core_map);
79
80/* bitmap of online cpus */
81cpumask_t cpu_online_map __read_mostly;
82EXPORT_SYMBOL(cpu_online_map);
83
84cpumask_t cpu_callin_map;
85cpumask_t cpu_callout_map;
86EXPORT_SYMBOL(cpu_callout_map);
87cpumask_t cpu_possible_map;
88EXPORT_SYMBOL(cpu_possible_map);
89static cpumask_t smp_commenced_mask;
90
91/* Per CPU bogomips and other parameters */
92struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
93EXPORT_SYMBOL(cpu_data);
94
95u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
96 { [0 ... NR_CPUS-1] = 0xff };
97EXPORT_SYMBOL(x86_cpu_to_apicid);
98
99u8 apicid_2_node[MAX_APICID];
100
101/*
102 * Trampoline 80x86 program as an array.
103 */
104
105extern unsigned char trampoline_data [];
106extern unsigned char trampoline_end [];
107static unsigned char *trampoline_base;
108static int trampoline_exec;
109
110static void map_cpu_to_logical_apicid(void);
111
112/* State of each CPU. */
113DEFINE_PER_CPU(int, cpu_state) = { 0 };
114
115/*
116 * Currently trivial. Write the real->protected mode
117 * bootstrap into the page concerned. The caller
118 * has made sure it's suitably aligned.
119 */
120
121static unsigned long __devinit setup_trampoline(void)
122{
123 memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
124 return virt_to_phys(trampoline_base);
125}
126
127/*
128 * We are called very early to get the low memory for the
129 * SMP bootup trampoline page.
130 */
131void __init smp_alloc_memory(void)
132{
133 trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
134 /*
135 * Has to be in very low memory so we can execute
136 * real-mode AP code.
137 */
138 if (__pa(trampoline_base) >= 0x9F000)
139 BUG();
140 /*
141 * Make the SMP trampoline executable:
142 */
143 trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
144}
145
146/*
147 * The bootstrap kernel entry code has set these up. Save them for
148 * a given CPU
149 */
150
151void __cpuinit smp_store_cpu_info(int id)
152{
153 struct cpuinfo_x86 *c = cpu_data + id;
154
155 *c = boot_cpu_data;
156 if (id != 0)
157 identify_secondary_cpu(c);
158 /*
159 * Mask B, Pentium, but not Pentium MMX
160 */
161 if (c->x86_vendor == X86_VENDOR_INTEL &&
162 c->x86 == 5 &&
163 c->x86_mask >= 1 && c->x86_mask <= 4 &&
164 c->x86_model <= 3)
165 /*
166 * Remember we have B step Pentia with bugs
167 */
168 smp_b_stepping = 1;
169
170 /*
171 * Certain Athlons might work (for various values of 'work') in SMP
172 * but they are not certified as MP capable.
173 */
174 if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
175
176 if (num_possible_cpus() == 1)
177 goto valid_k7;
178
179 /* Athlon 660/661 is valid. */
180 if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
181 goto valid_k7;
182
183 /* Duron 670 is valid */
184 if ((c->x86_model==7) && (c->x86_mask==0))
185 goto valid_k7;
186
187 /*
188 * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
189 * It's worth noting that the A5 stepping (662) of some Athlon XP's
190 * have the MP bit set.
191 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
192 */
193 if (((c->x86_model==6) && (c->x86_mask>=2)) ||
194 ((c->x86_model==7) && (c->x86_mask>=1)) ||
195 (c->x86_model> 7))
196 if (cpu_has_mp)
197 goto valid_k7;
198
199 /* If we get here, it's not a certified SMP capable AMD system. */
200 add_taint(TAINT_UNSAFE_SMP);
201 }
202
203valid_k7:
204 ;
205}
206
207extern void calibrate_delay(void);
208
209static atomic_t init_deasserted;
210
211static void __cpuinit smp_callin(void)
212{
213 int cpuid, phys_id;
214 unsigned long timeout;
215
216 /*
217 * If woken up by an INIT in an 82489DX configuration
218 * we may get here before an INIT-deassert IPI reaches
219 * our local APIC. We have to wait for the IPI or we'll
220 * lock up on an APIC access.
221 */
222 wait_for_init_deassert(&init_deasserted);
223
224 /*
225 * (This works even if the APIC is not enabled.)
226 */
227 phys_id = GET_APIC_ID(apic_read(APIC_ID));
228 cpuid = smp_processor_id();
229 if (cpu_isset(cpuid, cpu_callin_map)) {
230 printk("huh, phys CPU#%d, CPU#%d already present??\n",
231 phys_id, cpuid);
232 BUG();
233 }
234 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
235
236 /*
237 * STARTUP IPIs are fragile beasts as they might sometimes
238 * trigger some glue motherboard logic. Complete APIC bus
239 * silence for 1 second, this overestimates the time the
240 * boot CPU is spending to send the up to 2 STARTUP IPIs
241 * by a factor of two. This should be enough.
242 */
243
244 /*
245 * Waiting 2s total for startup (udelay is not yet working)
246 */
247 timeout = jiffies + 2*HZ;
248 while (time_before(jiffies, timeout)) {
249 /*
250 * Has the boot CPU finished its STARTUP sequence?
251 */
252 if (cpu_isset(cpuid, cpu_callout_map))
253 break;
254 rep_nop();
255 }
256
257 if (!time_before(jiffies, timeout)) {
258 printk("BUG: CPU%d started up but did not get a callout!\n",
259 cpuid);
260 BUG();
261 }
262
263 /*
264 * the boot CPU has finished the init stage and is spinning
265 * on callin_map until we finish. We are free to set up this
266 * CPU, first the APIC. (this is probably redundant on most
267 * boards)
268 */
269
270 Dprintk("CALLIN, before setup_local_APIC().\n");
271 smp_callin_clear_local_apic();
272 setup_local_APIC();
273 map_cpu_to_logical_apicid();
274
275 /*
276 * Get our bogomips.
277 */
278 calibrate_delay();
279 Dprintk("Stack at about %p\n",&cpuid);
280
281 /*
282 * Save our processor parameters
283 */
284 smp_store_cpu_info(cpuid);
285
286 /*
287 * Allow the master to continue.
288 */
289 cpu_set(cpuid, cpu_callin_map);
290}
291
292static int cpucount;
293
294/* maps the cpu to the sched domain representing multi-core */
295cpumask_t cpu_coregroup_map(int cpu)
296{
297 struct cpuinfo_x86 *c = cpu_data + cpu;
298 /*
299 * For perf, we return last level cache shared map.
300 * And for power savings, we return cpu_core_map
301 */
302 if (sched_mc_power_savings || sched_smt_power_savings)
303 return cpu_core_map[cpu];
304 else
305 return c->llc_shared_map;
306}
307
308/* representing cpus for which sibling maps can be computed */
309static cpumask_t cpu_sibling_setup_map;
310
311void __cpuinit set_cpu_sibling_map(int cpu)
312{
313 int i;
314 struct cpuinfo_x86 *c = cpu_data;
315
316 cpu_set(cpu, cpu_sibling_setup_map);
317
318 if (smp_num_siblings > 1) {
319 for_each_cpu_mask(i, cpu_sibling_setup_map) {
320 if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
321 c[cpu].cpu_core_id == c[i].cpu_core_id) {
322 cpu_set(i, cpu_sibling_map[cpu]);
323 cpu_set(cpu, cpu_sibling_map[i]);
324 cpu_set(i, cpu_core_map[cpu]);
325 cpu_set(cpu, cpu_core_map[i]);
326 cpu_set(i, c[cpu].llc_shared_map);
327 cpu_set(cpu, c[i].llc_shared_map);
328 }
329 }
330 } else {
331 cpu_set(cpu, cpu_sibling_map[cpu]);
332 }
333
334 cpu_set(cpu, c[cpu].llc_shared_map);
335
336 if (current_cpu_data.x86_max_cores == 1) {
337 cpu_core_map[cpu] = cpu_sibling_map[cpu];
338 c[cpu].booted_cores = 1;
339 return;
340 }
341
342 for_each_cpu_mask(i, cpu_sibling_setup_map) {
343 if (cpu_llc_id[cpu] != BAD_APICID &&
344 cpu_llc_id[cpu] == cpu_llc_id[i]) {
345 cpu_set(i, c[cpu].llc_shared_map);
346 cpu_set(cpu, c[i].llc_shared_map);
347 }
348 if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
349 cpu_set(i, cpu_core_map[cpu]);
350 cpu_set(cpu, cpu_core_map[i]);
351 /*
352 * Does this new cpu bring up a new core?
353 */
354 if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
355 /*
356 * for each core in package, increment
357 * the booted_cores for this new cpu
358 */
359 if (first_cpu(cpu_sibling_map[i]) == i)
360 c[cpu].booted_cores++;
361 /*
362 * increment the core count for all
363 * the other cpus in this package
364 */
365 if (i != cpu)
366 c[i].booted_cores++;
367 } else if (i != cpu && !c[cpu].booted_cores)
368 c[cpu].booted_cores = c[i].booted_cores;
369 }
370 }
371}
372
373/*
374 * Activate a secondary processor.
375 */
376static void __cpuinit start_secondary(void *unused)
377{
378 /*
379 * Don't put *anything* before cpu_init(); SMP booting is
380 * fragile enough that we want to limit the things done here
381 * to the bare minimum.
382 */
383#ifdef CONFIG_VMI
384 vmi_bringup();
385#endif
386 cpu_init();
387 preempt_disable();
388 smp_callin();
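 /*
 * Wait here until the boot CPU releases us by setting our bit in
 * smp_commenced_mask (done from native_cpu_up()).
 */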
389 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
390 rep_nop();
391 /*
392 * Check TSC synchronization with the BP:
393 */
394 check_tsc_sync_target();
395
396 setup_secondary_clock();
397 if (nmi_watchdog == NMI_IO_APIC) {
398 disable_8259A_irq(0);
399 enable_NMI_through_LVT0(NULL);
400 enable_8259A_irq(0);
401 }
402 /*
403 * low-memory mappings have been cleared, flush them from
404 * the local TLBs too.
405 */
406 local_flush_tlb();
407
408 /* This must be done before setting cpu_online_map */
409 set_cpu_sibling_map(raw_smp_processor_id());
410 wmb();
411
412 /*
413 * We need to hold call_lock, so there is no inconsistency
414 * between the time smp_call_function() determines the number of
415 * IPI recipients, and the time when the determination is made
416 * for which cpus receive the IPI. Holding this
417 * lock helps us to not include this cpu in a currently in progress
418 * smp_call_function().
419 */
420 lock_ipi_call_lock();
421 cpu_set(smp_processor_id(), cpu_online_map);
422 unlock_ipi_call_lock();
423 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
424
425 /* We can take interrupts now: we're officially "up". */
426 local_irq_enable();
427
428 wmb();
429 cpu_idle();
430}
431
432/*
433 * Everything has been set up for the secondary
434 * CPUs - they just need to reload everything
435 * from the task structure
436 * This function must not return.
437 */
438void __devinit initialize_secondary(void)
439{
440 /*
441 * We don't actually need to load the full TSS,
442 * basically just the stack pointer and the eip.
443 */
444
445 asm volatile(
446 "movl %0,%%esp\n\t"
447 "jmp *%1"
448 :
449 :"m" (current->thread.esp),"m" (current->thread.eip));
450}
451
452/* Static state in head.S used to set up a CPU */
453extern struct {
454 void * esp;
455 unsigned short ss;
456} stack_start;
457
458#ifdef CONFIG_NUMA
459
460/* which logical CPUs are on which nodes */
461cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
462 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
463EXPORT_SYMBOL(node_2_cpu_mask);
464/* which node each logical CPU is on */
465int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
466EXPORT_SYMBOL(cpu_2_node);
467
468/* set up a mapping between cpu and node. */
469static inline void map_cpu_to_node(int cpu, int node)
470{
471 printk("Mapping cpu %d to node %d\n", cpu, node);
472 cpu_set(cpu, node_2_cpu_mask[node]);
473 cpu_2_node[cpu] = node;
474}
475
476/* undo a mapping between cpu and node. */
477static inline void unmap_cpu_to_node(int cpu)
478{
479 int node;
480
481 printk("Unmapping cpu %d from all nodes\n", cpu);
482 for (node = 0; node < MAX_NUMNODES; node ++)
483 cpu_clear(cpu, node_2_cpu_mask[node]);
484 cpu_2_node[cpu] = 0;
485}
486#else /* !CONFIG_NUMA */
487
488#define map_cpu_to_node(cpu, node) ({})
489#define unmap_cpu_to_node(cpu) ({})
490
491#endif /* CONFIG_NUMA */
492
493u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
494
495static void map_cpu_to_logical_apicid(void)
496{
497 int cpu = smp_processor_id();
498 int apicid = logical_smp_processor_id();
499 int node = apicid_to_node(apicid);
500
501 if (!node_online(node))
502 node = first_online_node;
503
504 cpu_2_logical_apicid[cpu] = apicid;
505 map_cpu_to_node(cpu, node);
506}
507
508static void unmap_cpu_to_logical_apicid(int cpu)
509{
510 cpu_2_logical_apicid[cpu] = BAD_APICID;
511 unmap_cpu_to_node(cpu);
512}
513
514static inline void __inquire_remote_apic(int apicid)
515{
516 int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
517 char *names[] = { "ID", "VERSION", "SPIV" };
518 int timeout;
519 unsigned long status;
520
521 printk("Inquiring remote APIC #%d...\n", apicid);
522
523 for (i = 0; i < ARRAY_SIZE(regs); i++) {
524 printk("... APIC #%d %s: ", apicid, names[i]);
525
526 /*
527 * Wait for idle.
528 */
529 status = safe_apic_wait_icr_idle();
530 if (status)
531 printk("a previous APIC delivery may have failed\n");
532
533 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
534 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
535
536 timeout = 0;
537 do {
538 udelay(100);
539 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
540 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
541
542 switch (status) {
543 case APIC_ICR_RR_VALID:
544 status = apic_read(APIC_RRR);
545 printk("%lx\n", status);
546 break;
547 default:
548 printk("failed\n");
549 }
550 }
551}
552
553#ifdef WAKE_SECONDARY_VIA_NMI
554/*
555 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
556 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
557 * won't ... remember to clear down the APIC, etc later.
558 */
559static int __devinit
560wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
561{
562 unsigned long send_status, accept_status = 0;
563 int maxlvt;
564
565 /* Target chip */
566 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
567
568 /* Boot on the stack */
569 /* Kick the second */
570 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
571
572 Dprintk("Waiting for send to finish...\n");
573 send_status = safe_apic_wait_icr_idle();
574
575 /*
576 * Give the other CPU some time to accept the IPI.
577 */
578 udelay(200);
579 /*
580 * Due to the Pentium erratum 3AP.
581 */
582 maxlvt = lapic_get_maxlvt();
583 if (maxlvt > 3) {
584 apic_read_around(APIC_SPIV);
585 apic_write(APIC_ESR, 0);
586 }
587 accept_status = (apic_read(APIC_ESR) & 0xEF);
588 Dprintk("NMI sent.\n");
589
590 if (send_status)
591 printk("APIC never delivered???\n");
592 if (accept_status)
593 printk("APIC delivery error (%lx).\n", accept_status);
594
595 return (send_status | accept_status);
596}
597#endif /* WAKE_SECONDARY_VIA_NMI */
598
599#ifdef WAKE_SECONDARY_VIA_INIT
600static int __devinit
601wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
602{
603 unsigned long send_status, accept_status = 0;
604 int maxlvt, num_starts, j;
605
606 /*
607 * Be paranoid about clearing APIC errors.
608 */
609 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
610 apic_read_around(APIC_SPIV);
611 apic_write(APIC_ESR, 0);
612 apic_read(APIC_ESR);
613 }
614
615 Dprintk("Asserting INIT.\n");
616
617 /*
618 * Turn INIT on target chip
619 */
620 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
621
622 /*
623 * Send IPI
624 */
625 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
626 | APIC_DM_INIT);
627
628 Dprintk("Waiting for send to finish...\n");
629 send_status = safe_apic_wait_icr_idle();
630
631 mdelay(10);
632
633 Dprintk("Deasserting INIT.\n");
634
635 /* Target chip */
636 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
637
638 /* Send IPI */
639 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
640
641 Dprintk("Waiting for send to finish...\n");
642 send_status = safe_apic_wait_icr_idle();
643
644 atomic_set(&init_deasserted, 1);
645
646 /*
647 * Should we send STARTUP IPIs ?
648 *
649 * Determine this based on the APIC version.
650 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
651 */
652 if (APIC_INTEGRATED(apic_version[phys_apicid]))
653 num_starts = 2;
654 else
655 num_starts = 0;
656
657 /*
658 * Paravirt / VMI wants a startup IPI hook here to set up the
659 * target processor state.
660 */
661 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
662 (unsigned long) stack_start.esp);
663
664 /*
665 * Run STARTUP IPI loop.
666 */
667 Dprintk("#startup loops: %d.\n", num_starts);
668
669 maxlvt = lapic_get_maxlvt();
670
671 for (j = 1; j <= num_starts; j++) {
672 Dprintk("Sending STARTUP #%d.\n",j);
673 apic_read_around(APIC_SPIV);
674 apic_write(APIC_ESR, 0);
675 apic_read(APIC_ESR);
676 Dprintk("After apic_write.\n");
677
678 /*
679 * STARTUP IPI
680 */
681
682 /* Target chip */
683 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
684
685 /* Boot on the stack */
686 /* Kick the second */
687 apic_write_around(APIC_ICR, APIC_DM_STARTUP
688 | (start_eip >> 12));
689
690 /*
691 * Give the other CPU some time to accept the IPI.
692 */
693 udelay(300);
694
695 Dprintk("Startup point 1.\n");
696
697 Dprintk("Waiting for send to finish...\n");
698 send_status = safe_apic_wait_icr_idle();
699
700 /*
701 * Give the other CPU some time to accept the IPI.
702 */
703 udelay(200);
704 /*
705 * Due to the Pentium erratum 3AP.
706 */
707 if (maxlvt > 3) {
708 apic_read_around(APIC_SPIV);
709 apic_write(APIC_ESR, 0);
710 }
711 accept_status = (apic_read(APIC_ESR) & 0xEF);
712 if (send_status || accept_status)
713 break;
714 }
715 Dprintk("After Startup.\n");
716
717 if (send_status)
718 printk("APIC never delivered???\n");
719 if (accept_status)
720 printk("APIC delivery error (%lx).\n", accept_status);
721
722 return (send_status | accept_status);
723}
724#endif /* WAKE_SECONDARY_VIA_INIT */
725
726extern cpumask_t cpu_initialized;
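/*
 * Pick the lowest CPU number that is not yet in cpu_present_map, or
 * return -ENODEV if every slot up to NR_CPUS is already taken.
 */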
727static inline int alloc_cpu_id(void)
728{
729 cpumask_t tmp_map;
730 int cpu;
731 cpus_complement(tmp_map, cpu_present_map);
732 cpu = first_cpu(tmp_map);
733 if (cpu >= NR_CPUS)
734 return -ENODEV;
735 return cpu;
736}
737
738#ifdef CONFIG_HOTPLUG_CPU
739static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
740static inline struct task_struct * alloc_idle_task(int cpu)
741{
742 struct task_struct *idle;
743
744 if ((idle = cpu_idle_tasks[cpu]) != NULL) {
745 /* initialize thread_struct. we really want to avoid destroying
746 * the idle thread
747 */
748 idle->thread.esp = (unsigned long)task_pt_regs(idle);
749 init_idle(idle, cpu);
750 return idle;
751 }
752 idle = fork_idle(cpu);
753
754 if (!IS_ERR(idle))
755 cpu_idle_tasks[cpu] = idle;
756 return idle;
757}
758#else
759#define alloc_idle_task(cpu) fork_idle(cpu)
760#endif
761
762static int __cpuinit do_boot_cpu(int apicid, int cpu)
763/*
764 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
765 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
766 * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
767 */
768{
769 struct task_struct *idle;
770 unsigned long boot_error;
771 int timeout;
772 unsigned long start_eip;
773 unsigned short nmi_high = 0, nmi_low = 0;
774
775 /*
776 * Save current MTRR state in case it was changed since early boot
777 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
778 */
779 mtrr_save_state();
780
781 /*
782 * We can't use kernel_thread since we must avoid
783 * rescheduling the child.
784 */
785 idle = alloc_idle_task(cpu);
786 if (IS_ERR(idle))
787 panic("failed fork for CPU %d", cpu);
788
789 init_gdt(cpu);
790 per_cpu(current_task, cpu) = idle;
791 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
792
793 idle->thread.eip = (unsigned long) start_secondary;
794 /* start_eip had better be page-aligned! */
795 start_eip = setup_trampoline();
796
797 ++cpucount;
798 alternatives_smp_switch(1);
799
800 /* So we see what's up */
801 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
802 /* Stack for startup_32 can be just as for start_secondary onwards */
803 stack_start.esp = (void *) idle->thread.esp;
804
805 irq_ctx_init(cpu);
806
807 x86_cpu_to_apicid[cpu] = apicid;
808 /*
809 * This grunge runs the startup process for
810 * the targeted processor.
811 */
812
813 atomic_set(&init_deasserted, 0);
814
815 Dprintk("Setting warm reset code and vector.\n");
816
817 store_NMI_vector(&nmi_high, &nmi_low);
818
819 smpboot_setup_warm_reset_vector(start_eip);
820
821 /*
822 * Starting actual IPI sequence...
823 */
824 boot_error = wakeup_secondary_cpu(apicid, start_eip);
825
826 if (!boot_error) {
827 /*
828 * allow APs to start initializing.
829 */
830 Dprintk("Before Callout %d.\n", cpu);
831 cpu_set(cpu, cpu_callout_map);
832 Dprintk("After Callout %d.\n", cpu);
833
834 /*
835 * Wait 5s total for a response
836 */
837 for (timeout = 0; timeout < 50000; timeout++) {
838 if (cpu_isset(cpu, cpu_callin_map))
839 break; /* It has booted */
840 udelay(100);
841 }
842
843 if (cpu_isset(cpu, cpu_callin_map)) {
844 /* number CPUs logically, starting from 1 (BSP is 0) */
845 Dprintk("OK.\n");
846 printk("CPU%d: ", cpu);
847 print_cpu_info(&cpu_data[cpu]);
848 Dprintk("CPU has booted.\n");
849 } else {
850 boot_error = 1;
851 if (*((volatile unsigned char *)trampoline_base)
852 == 0xA5)
853 /* trampoline started but...? */
854 printk("Stuck ??\n");
855 else
856 /* trampoline code not run */
857 printk("Not responding.\n");
858 inquire_remote_apic(apicid);
859 }
860 }
861
862 if (boot_error) {
863 /* Try to put things back the way they were before ... */
864 unmap_cpu_to_logical_apicid(cpu);
865 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
866 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
867 cpucount--;
868 } else {
869 x86_cpu_to_apicid[cpu] = apicid;
870 cpu_set(cpu, cpu_present_map);
871 }
872
873 /* mark "stuck" area as not stuck */
874 *((volatile unsigned long *)trampoline_base) = 0;
875
876 return boot_error;
877}
878
879#ifdef CONFIG_HOTPLUG_CPU
880void cpu_exit_clear(void)
881{
882 int cpu = raw_smp_processor_id();
883
884 idle_task_exit();
885
886 cpucount --;
887 cpu_uninit();
888 irq_ctx_exit(cpu);
889
890 cpu_clear(cpu, cpu_callout_map);
891 cpu_clear(cpu, cpu_callin_map);
892
893 cpu_clear(cpu, smp_commenced_mask);
894 unmap_cpu_to_logical_apicid(cpu);
895}
896
897struct warm_boot_cpu_info {
898 struct completion *complete;
899 struct work_struct task;
900 int apicid;
901 int cpu;
902};
903
904static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
905{
906 struct warm_boot_cpu_info *info =
907 container_of(work, struct warm_boot_cpu_info, task);
908 do_boot_cpu(info->apicid, info->cpu);
909 complete(info->complete);
910}
911
912static int __cpuinit __smp_prepare_cpu(int cpu)
913{
914 DECLARE_COMPLETION_ONSTACK(done);
915 struct warm_boot_cpu_info info;
916 int apicid, ret;
917
918 apicid = x86_cpu_to_apicid[cpu];
919 if (apicid == BAD_APICID) {
920 ret = -ENODEV;
921 goto exit;
922 }
923
924 info.complete = &done;
925 info.apicid = apicid;
926 info.cpu = cpu;
927 INIT_WORK(&info.task, do_warm_boot_cpu);
928
929 /* init low mem mapping */
930 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
931 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
932 flush_tlb_all();
933 schedule_work(&info.task);
934 wait_for_completion(&done);
935
936 zap_low_mappings();
937 ret = 0;
938exit:
939 return ret;
940}
941#endif
942
943/*
944 * Cycle through the processors sending APIC IPIs to boot each.
945 */
946
947static int boot_cpu_logical_apicid;
948/* Where the IO area was mapped on multiquad, always 0 otherwise */
949void *xquad_portio;
950#ifdef CONFIG_X86_NUMAQ
951EXPORT_SYMBOL(xquad_portio);
952#endif
953
954static void __init smp_boot_cpus(unsigned int max_cpus)
955{
956 int apicid, cpu, bit, kicked;
957 unsigned long bogosum = 0;
958
959 /*
960 * Setup boot CPU information
961 */
962 smp_store_cpu_info(0); /* Final full version of the data */
963 printk("CPU%d: ", 0);
964 print_cpu_info(&cpu_data[0]);
965
966 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
967 boot_cpu_logical_apicid = logical_smp_processor_id();
968 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
969
970 current_thread_info()->cpu = 0;
971
972 set_cpu_sibling_map(0);
973
974 /*
975 * If we couldn't find an SMP configuration at boot time,
976 * get out of here now!
977 */
978 if (!smp_found_config && !acpi_lapic) {
979 printk(KERN_NOTICE "SMP motherboard not detected.\n");
980 smpboot_clear_io_apic_irqs();
981 phys_cpu_present_map = physid_mask_of_physid(0);
982 if (APIC_init_uniprocessor())
983 printk(KERN_NOTICE "Local APIC not detected."
984 " Using dummy APIC emulation.\n");
985 map_cpu_to_logical_apicid();
986 cpu_set(0, cpu_sibling_map[0]);
987 cpu_set(0, cpu_core_map[0]);
988 return;
989 }
990
991 /*
992 * Should not be necessary because the MP table should list the boot
993 * CPU too, but we do it for the sake of robustness anyway.
994 * Makes no sense to do this check in clustered apic mode, so skip it
995 */
996 if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
997 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
998 boot_cpu_physical_apicid);
999 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1000 }
1001
1002 /*
1003 * If we couldn't find a local APIC, then get out of here now!
1004 */
1005 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
1006 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1007 boot_cpu_physical_apicid);
1008 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
1009 smpboot_clear_io_apic_irqs();
1010 phys_cpu_present_map = physid_mask_of_physid(0);
1011 cpu_set(0, cpu_sibling_map[0]);
1012 cpu_set(0, cpu_core_map[0]);
1013 return;
1014 }
1015
1016 verify_local_APIC();
1017
1018 /*
1019 * If SMP should be disabled, then really disable it!
1020 */
1021 if (!max_cpus) {
1022 smp_found_config = 0;
1023 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
1024 smpboot_clear_io_apic_irqs();
1025 phys_cpu_present_map = physid_mask_of_physid(0);
1026 cpu_set(0, cpu_sibling_map[0]);
1027 cpu_set(0, cpu_core_map[0]);
1028 return;
1029 }
1030
1031 connect_bsp_APIC();
1032 setup_local_APIC();
1033 map_cpu_to_logical_apicid();
1034
1035
1036 setup_portio_remap();
1037
1038 /*
1039 * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
1040 *
1041 * In clustered apic mode, phys_cpu_present_map is constructed thus:
1042 * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
1043 * clustered apic ID.
1044 */
1045 Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
1046
1047 kicked = 1;
1048 for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
1049 apicid = cpu_present_to_apicid(bit);
1050 /*
1051 * Don't even attempt to start the boot CPU!
1052 */
1053 if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
1054 continue;
1055
1056 if (!check_apicid_present(bit))
1057 continue;
1058 if (max_cpus <= cpucount+1)
1059 continue;
1060
1061 if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
1062 printk("CPU #%d not responding - cannot use it.\n",
1063 apicid);
1064 else
1065 ++kicked;
1066 }
1067
1068 /*
1069 * Cleanup possible dangling ends...
1070 */
1071 smpboot_restore_warm_reset_vector();
1072
1073 /*
1074 * Allow the user to impress friends.
1075 */
1076 Dprintk("Before bogomips.\n");
1077 for (cpu = 0; cpu < NR_CPUS; cpu++)
1078 if (cpu_isset(cpu, cpu_callout_map))
1079 bogosum += cpu_data[cpu].loops_per_jiffy;
1080 printk(KERN_INFO
1081 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
1082 cpucount+1,
1083 bogosum/(500000/HZ),
1084 (bogosum/(5000/HZ))%100);
1085
1086 Dprintk("Before bogocount - setting activated=1.\n");
1087
1088 if (smp_b_stepping)
1089 printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
1090
1091 /*
1092 * Don't taint if we are running SMP kernel on a single non-MP
1093 * approved Athlon
1094 */
1095 if (tainted & TAINT_UNSAFE_SMP) {
1096 if (cpucount)
1097 printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
1098 else
1099 tainted &= ~TAINT_UNSAFE_SMP;
1100 }
1101
1102 Dprintk("Boot done.\n");
1103
1104 /*
1105 * construct cpu_sibling_map[], so that we can tell sibling CPUs
1106 * efficiently.
1107 */
1108 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1109 cpus_clear(cpu_sibling_map[cpu]);
1110 cpus_clear(cpu_core_map[cpu]);
1111 }
1112
1113 cpu_set(0, cpu_sibling_map[0]);
1114 cpu_set(0, cpu_core_map[0]);
1115
1116 smpboot_setup_io_apic();
1117
1118 setup_boot_clock();
1119}
1120
1121/* These are wrappers to interface to the new boot process. Someone
1122 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
1123void __init native_smp_prepare_cpus(unsigned int max_cpus)
1124{
1125 smp_commenced_mask = cpumask_of_cpu(0);
1126 cpu_callin_map = cpumask_of_cpu(0);
1127 mb();
1128 smp_boot_cpus(max_cpus);
1129}
1130
1131void __init native_smp_prepare_boot_cpu(void)
1132{
1133 unsigned int cpu = smp_processor_id();
1134
1135 init_gdt(cpu);
1136 switch_to_new_gdt();
1137
1138 cpu_set(cpu, cpu_online_map);
1139 cpu_set(cpu, cpu_callout_map);
1140 cpu_set(cpu, cpu_present_map);
1141 cpu_set(cpu, cpu_possible_map);
1142 __get_cpu_var(cpu_state) = CPU_ONLINE;
1143}
1144
1145#ifdef CONFIG_HOTPLUG_CPU
1146void remove_siblinginfo(int cpu)
1147{
1148 int sibling;
1149 struct cpuinfo_x86 *c = cpu_data;
1150
1151 for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
1152 cpu_clear(cpu, cpu_core_map[sibling]);
1153 /*
1154 * last thread sibling in this cpu core going down
1155 */
1156 if (cpus_weight(cpu_sibling_map[cpu]) == 1)
1157 c[sibling].booted_cores--;
1158 }
1159
1160 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
1161 cpu_clear(cpu, cpu_sibling_map[sibling]);
1162 cpus_clear(cpu_sibling_map[cpu]);
1163 cpus_clear(cpu_core_map[cpu]);
1164 c[cpu].phys_proc_id = 0;
1165 c[cpu].cpu_core_id = 0;
1166 cpu_clear(cpu, cpu_sibling_setup_map);
1167}
1168
1169int __cpu_disable(void)
1170{
1171 cpumask_t map = cpu_online_map;
1172 int cpu = smp_processor_id();
1173
1174 /*
1175 * Perhaps use cpufreq to drop frequency, but that could go
1176 * into generic code.
1177 *
1178 * We won't take down the boot processor on i386 due to some
1179 * interrupts only being able to be serviced by the BSP.
1180 * Especially so if we're not using an IOAPIC -zwane
1181 */
1182 if (cpu == 0)
1183 return -EBUSY;
1184 if (nmi_watchdog == NMI_LOCAL_APIC)
1185 stop_apic_nmi_watchdog(NULL);
1186 clear_local_APIC();
1187 /* Allow any queued timer interrupts to get serviced */
1188 local_irq_enable();
1189 mdelay(1);
1190 local_irq_disable();
1191
1192 remove_siblinginfo(cpu);
1193
1194 cpu_clear(cpu, map);
1195 fixup_irqs(map);
1196 /* It's now safe to remove this processor from the online map */
1197 cpu_clear(cpu, cpu_online_map);
1198 return 0;
1199}
1200
1201void __cpu_die(unsigned int cpu)
1202{
1203 /* We don't do anything here: idle task is faking death itself. */
1204 unsigned int i;
1205
1206 for (i = 0; i < 10; i++) {
1207 /* They ack this in play_dead by setting CPU_DEAD */
1208 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1209 printk ("CPU %d is now offline\n", cpu);
1210 if (1 == num_online_cpus())
1211 alternatives_smp_switch(0);
1212 return;
1213 }
1214 msleep(100);
1215 }
1216 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1217}
1218#else /* ... !CONFIG_HOTPLUG_CPU */
1219int __cpu_disable(void)
1220{
1221 return -ENOSYS;
1222}
1223
1224void __cpu_die(unsigned int cpu)
1225{
1226 /* We said "no" in __cpu_disable */
1227 BUG();
1228}
1229#endif /* CONFIG_HOTPLUG_CPU */
1230
1231int __cpuinit native_cpu_up(unsigned int cpu)
1232{
1233 unsigned long flags;
1234#ifdef CONFIG_HOTPLUG_CPU
1235 int ret = 0;
1236
1237 /*
1238 * We do warm boot only on cpus that had booted earlier.
1239 * Otherwise cold boot is all handled from smp_boot_cpus().
1240 * cpu_callin_map is set during the AP kickstart process. It's reset
1241 * when a cpu is taken offline from cpu_exit_clear().
1242 */
1243 if (!cpu_isset(cpu, cpu_callin_map))
1244 ret = __smp_prepare_cpu(cpu);
1245
1246 if (ret)
1247 return -EIO;
1248#endif
1249
1250 /* In case one didn't come up */
1251 if (!cpu_isset(cpu, cpu_callin_map)) {
1252 printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
1253 return -EIO;
1254 }
1255
1256 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
1257 /* Unleash the CPU! */
1258 cpu_set(cpu, smp_commenced_mask);
1259
1260 /*
1261 * Check TSC synchronization with the AP (keep irqs disabled
1262 * while doing so):
1263 */
1264 local_irq_save(flags);
1265 check_tsc_sync_source(cpu);
1266 local_irq_restore(flags);
1267
1268 while (!cpu_isset(cpu, cpu_online_map)) {
1269 cpu_relax();
1270 touch_nmi_watchdog();
1271 }
1272
1273 return 0;
1274}
1275
1276void __init native_smp_cpus_done(unsigned int max_cpus)
1277{
1278#ifdef CONFIG_X86_IO_APIC
1279 setup_ioapic_dest();
1280#endif
1281 zap_low_mappings();
1282#ifndef CONFIG_HOTPLUG_CPU
1283 /*
1284 * Disable executability of the SMP trampoline:
1285 */
1286 set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
1287#endif
1288}
1289
1290void __init smp_intr_init(void)
1291{
1292 /*
1293 * IRQ0 must be given a fixed assignment and initialized,
1294 * because it's used before the IO-APIC is set up.
1295 */
1296 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1297
1298 /*
1299 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1300 * IPI, driven by wakeup.
1301 */
1302 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1303
1304 /* IPI for invalidation */
1305 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1306
1307 /* IPI for generic function call */
1308 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1309}
1310
1311/*
1312 * If the BIOS enumerates physical processors before logical,
1313 * maxcpus=N at enumeration-time can be used to disable HT.
1314 */
1315static int __init parse_maxcpus(char *arg)
1316{
1317 extern unsigned int maxcpus;
1318
1319 maxcpus = simple_strtoul(arg, NULL, 0);
1320 return 0;
1321}
1322early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
new file mode 100644
index 000000000000..32f50783edc8
--- /dev/null
+++ b/arch/x86/kernel/smpboot_64.c
@@ -0,0 +1,1085 @@
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIP report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Andi Kleen : Changed for SMP boot into long mode.
33 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now.
37 * Ashok Raj : CPU hotplug support
38 */
39
40
41#include <linux/init.h>
42
43#include <linux/mm.h>
44#include <linux/kernel_stat.h>
45#include <linux/bootmem.h>
46#include <linux/thread_info.h>
47#include <linux/module.h>
48#include <linux/delay.h>
49#include <linux/mc146818rtc.h>
50#include <linux/smp.h>
51#include <linux/kdebug.h>
52
53#include <asm/mtrr.h>
54#include <asm/pgalloc.h>
55#include <asm/desc.h>
56#include <asm/tlbflush.h>
57#include <asm/proto.h>
58#include <asm/nmi.h>
59#include <asm/irq.h>
60#include <asm/hw_irq.h>
61#include <asm/numa.h>
62
63/* Number of siblings per CPU package */
64int smp_num_siblings = 1;
65EXPORT_SYMBOL(smp_num_siblings);
66
67/* Last level cache ID of each logical CPU */
68u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
69
70/* Bitmask of currently online CPUs */
71cpumask_t cpu_online_map __read_mostly;
72
73EXPORT_SYMBOL(cpu_online_map);
74
75/*
76 * Private maps to synchronize booting between AP and BP.
77 * Probably not needed anymore, but it makes for easier debugging. -AK
78 */
79cpumask_t cpu_callin_map;
80cpumask_t cpu_callout_map;
81EXPORT_SYMBOL(cpu_callout_map);
82
83cpumask_t cpu_possible_map;
84EXPORT_SYMBOL(cpu_possible_map);
85
86/* Per CPU bogomips and other parameters */
87struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
88EXPORT_SYMBOL(cpu_data);
89
90/* Set when the idlers are all forked */
91int smp_threads_ready;
92
93/* representing HT siblings of each logical CPU */
94cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
95EXPORT_SYMBOL(cpu_sibling_map);
96
97/* representing HT and core siblings of each logical CPU */
98cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
99EXPORT_SYMBOL(cpu_core_map);
100
101/*
102 * Trampoline 80x86 program as an array.
103 */
104
105extern unsigned char trampoline_data[];
106extern unsigned char trampoline_end[];
107
108/* State of each CPU */
109DEFINE_PER_CPU(int, cpu_state) = { 0 };
110
111/*
112 * Store all idle threads; these can be reused instead of creating
113 * new threads. This also avoids complicated thread-destroy functionality
114 * for idle threads.
115 */
116struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
117
118#define get_idle_for_cpu(x) (idle_thread_array[(x)])
119#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
120
121/*
122 * Currently trivial. Write the real->protected mode
123 * bootstrap into the page concerned. The caller
124 * has made sure it's suitably aligned.
125 */
126
127static unsigned long __cpuinit setup_trampoline(void)
128{
129 void *tramp = __va(SMP_TRAMPOLINE_BASE);
130 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
131 return virt_to_phys(tramp);
132}
133
134/*
135 * The bootstrap kernel entry code has set these up. Save them for
136 * a given CPU
137 */
138
139static void __cpuinit smp_store_cpu_info(int id)
140{
141 struct cpuinfo_x86 *c = cpu_data + id;
142
143 *c = boot_cpu_data;
144 identify_cpu(c);
145 print_cpu_info(c);
146}
147
148static atomic_t init_deasserted __cpuinitdata;
149
150/*
151 * Report back to the Boot Processor.
152 * Running on AP.
153 */
154void __cpuinit smp_callin(void)
155{
156 int cpuid, phys_id;
157 unsigned long timeout;
158
159 /*
160 * If woken up by an INIT in an 82489DX configuration
161 * we may get here before an INIT-deassert IPI reaches
162 * our local APIC. We have to wait for the IPI or we'll
163 * lock up on an APIC access.
164 */
165 while (!atomic_read(&init_deasserted))
166 cpu_relax();
167
168 /*
169 * (This works even if the APIC is not enabled.)
170 */
171 phys_id = GET_APIC_ID(apic_read(APIC_ID));
172 cpuid = smp_processor_id();
173 if (cpu_isset(cpuid, cpu_callin_map)) {
174 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
175 phys_id, cpuid);
176 }
177 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
178
179 /*
180 * STARTUP IPIs are fragile beasts as they might sometimes
181 * trigger some glue motherboard logic. Complete APIC bus
182 * silence for 1 second, this overestimates the time the
183 * boot CPU is spending to send the up to 2 STARTUP IPIs
184 * by a factor of two. This should be enough.
185 */
186
187 /*
188 * Waiting 2s total for startup (udelay is not yet working)
189 */
190 timeout = jiffies + 2*HZ;
191 while (time_before(jiffies, timeout)) {
192 /*
193 * Has the boot CPU finished its STARTUP sequence?
194 */
195 if (cpu_isset(cpuid, cpu_callout_map))
196 break;
197 cpu_relax();
198 }
199
200 if (!time_before(jiffies, timeout)) {
201 panic("smp_callin: CPU%d started up but did not get a callout!\n",
202 cpuid);
203 }
204
205 /*
206 * the boot CPU has finished the init stage and is spinning
207 * on callin_map until we finish. We are free to set up this
208 * CPU, first the APIC. (this is probably redundant on most
209 * boards)
210 */
211
212 Dprintk("CALLIN, before setup_local_APIC().\n");
213 setup_local_APIC();
214
215 /*
216 * Get our bogomips.
217 *
218 * Need to enable IRQs because calibration can take a while,
219 * and the NMI watchdog might otherwise kill us.
220 */
221 local_irq_enable();
222 calibrate_delay();
223 local_irq_disable();
224 Dprintk("Stack at about %p\n",&cpuid);
225
226 disable_APIC_timer();
227
228 /*
229 * Save our processor parameters
230 */
231 smp_store_cpu_info(cpuid);
232
233 /*
234 * Allow the master to continue.
235 */
236 cpu_set(cpuid, cpu_callin_map);
237}
238
239/* maps the cpu to the sched domain representing multi-core */
240cpumask_t cpu_coregroup_map(int cpu)
241{
242 struct cpuinfo_x86 *c = cpu_data + cpu;
243 /*
244 * For perf, we return last level cache shared map.
245 * And for power savings, we return cpu_core_map
246 */
247 if (sched_mc_power_savings || sched_smt_power_savings)
248 return cpu_core_map[cpu];
249 else
250 return c->llc_shared_map;
251}
252
253/* representing cpus for which sibling maps can be computed */
254static cpumask_t cpu_sibling_setup_map;
255
256static inline void set_cpu_sibling_map(int cpu)
257{
258 int i;
259 struct cpuinfo_x86 *c = cpu_data;
260
261 cpu_set(cpu, cpu_sibling_setup_map);
262
263 if (smp_num_siblings > 1) {
264 for_each_cpu_mask(i, cpu_sibling_setup_map) {
265 if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
266 c[cpu].cpu_core_id == c[i].cpu_core_id) {
267 cpu_set(i, cpu_sibling_map[cpu]);
268 cpu_set(cpu, cpu_sibling_map[i]);
269 cpu_set(i, cpu_core_map[cpu]);
270 cpu_set(cpu, cpu_core_map[i]);
271 cpu_set(i, c[cpu].llc_shared_map);
272 cpu_set(cpu, c[i].llc_shared_map);
273 }
274 }
275 } else {
276 cpu_set(cpu, cpu_sibling_map[cpu]);
277 }
278
279 cpu_set(cpu, c[cpu].llc_shared_map);
280
281 if (current_cpu_data.x86_max_cores == 1) {
282 cpu_core_map[cpu] = cpu_sibling_map[cpu];
283 c[cpu].booted_cores = 1;
284 return;
285 }
286
287 for_each_cpu_mask(i, cpu_sibling_setup_map) {
288 if (cpu_llc_id[cpu] != BAD_APICID &&
289 cpu_llc_id[cpu] == cpu_llc_id[i]) {
290 cpu_set(i, c[cpu].llc_shared_map);
291 cpu_set(cpu, c[i].llc_shared_map);
292 }
293 if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
294 cpu_set(i, cpu_core_map[cpu]);
295 cpu_set(cpu, cpu_core_map[i]);
296 /*
297 * Does this new cpu bring up a new core?
298 */
299 if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
300 /*
301 * for each core in package, increment
302 * the booted_cores for this new cpu
303 */
304 if (first_cpu(cpu_sibling_map[i]) == i)
305 c[cpu].booted_cores++;
306 /*
307 * increment the core count for all
308 * the other cpus in this package
309 */
310 if (i != cpu)
311 c[i].booted_cores++;
312 } else if (i != cpu && !c[cpu].booted_cores)
313 c[cpu].booted_cores = c[i].booted_cores;
314 }
315 }
316}
317
318/*
319 * Setup code on secondary processor (after coming out of the trampoline)
320 */
321void __cpuinit start_secondary(void)
322{
323 /*
324 * Don't put anything before smp_callin(); SMP
325 * booting is fragile enough that we want to limit the
326 * things done here to the bare minimum.
327 */
328 cpu_init();
329 preempt_disable();
330 smp_callin();
331
332 /* otherwise gcc will move up smp_processor_id() before cpu_init() */
333 barrier();
334
335 /*
336 * Check TSC sync first:
337 */
338 check_tsc_sync_target();
339
340 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
341 setup_secondary_APIC_clock();
342
343 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
344
345 if (nmi_watchdog == NMI_IO_APIC) {
346 disable_8259A_irq(0);
347 enable_NMI_through_LVT0(NULL);
348 enable_8259A_irq(0);
349 }
350
351 enable_APIC_timer();
352
353 /*
354 * The sibling maps must be set before turning the online map on for
355 * this cpu
356 */
357 set_cpu_sibling_map(smp_processor_id());
358
359 /*
360 * We need to hold call_lock, so there is no inconsistency
361 * between the time smp_call_function() determines the number of
362 * IPI recipients, and the time when the determination is made
363 * for which cpus receive the IPI in genapic_flat.c. Holding this
364 * lock helps us to not include this cpu in a currently in progress
365 * smp_call_function().
366 */
367 lock_ipi_call_lock();
368 spin_lock(&vector_lock);
369
370 /* Setup the per cpu irq handling data structures */
371 __setup_vector_irq(smp_processor_id());
372 /*
373 * Allow the master to continue.
374 */
375 cpu_set(smp_processor_id(), cpu_online_map);
376 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
377 spin_unlock(&vector_lock);
378
379 unlock_ipi_call_lock();
380
381 cpu_idle();
382}
383
384extern volatile unsigned long init_rsp;
385extern void (*initial_code)(void);
386
387#ifdef APIC_DEBUG
388static void inquire_remote_apic(int apicid)
389{
390 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
391 char *names[] = { "ID", "VERSION", "SPIV" };
392 int timeout;
393 unsigned int status;
394
395 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
396
397 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
398 printk("... APIC #%d %s: ", apicid, names[i]);
399
400 /*
401 * Wait for idle.
402 */
403 status = safe_apic_wait_icr_idle();
404 if (status)
405 printk("a previous APIC delivery may have failed\n");
406
407 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
408 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
409
410 timeout = 0;
411 do {
412 udelay(100);
413 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
414 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
415
416 switch (status) {
417 case APIC_ICR_RR_VALID:
418 status = apic_read(APIC_RRR);
419 printk("%08x\n", status);
420 break;
421 default:
422 printk("failed\n");
423 }
424 }
425}
426#endif
427
428/*
429 * Kick the secondary to wake up.
430 */
431static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
432{
433 unsigned long send_status, accept_status = 0;
434 int maxlvt, num_starts, j;
435
436 Dprintk("Asserting INIT.\n");
437
438 /*
439 * Turn INIT on target chip
440 */
441 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
442
443 /*
444 * Send IPI
445 */
446 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
447 | APIC_DM_INIT);
448
449 Dprintk("Waiting for send to finish...\n");
450 send_status = safe_apic_wait_icr_idle();
451
452 mdelay(10);
453
454 Dprintk("Deasserting INIT.\n");
455
456 /* Target chip */
457 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
458
459 /* Send IPI */
460 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
461
462 Dprintk("Waiting for send to finish...\n");
463 send_status = safe_apic_wait_icr_idle();
464
465 mb();
466 atomic_set(&init_deasserted, 1);
467
468 num_starts = 2;
469
470 /*
471 * Run STARTUP IPI loop.
472 */
473 Dprintk("#startup loops: %d.\n", num_starts);
474
475 maxlvt = get_maxlvt();
476
477 for (j = 1; j <= num_starts; j++) {
478 Dprintk("Sending STARTUP #%d.\n",j);
479 apic_write(APIC_ESR, 0);
480 apic_read(APIC_ESR);
481 Dprintk("After apic_write.\n");
482
483 /*
484 * STARTUP IPI
485 */
486
487 /* Target chip */
488 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
489
490 /* Boot on the stack */
491		/* Kick the secondary */
492 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12));
493
494 /*
495 * Give the other CPU some time to accept the IPI.
496 */
497 udelay(300);
498
499 Dprintk("Startup point 1.\n");
500
501 Dprintk("Waiting for send to finish...\n");
502 send_status = safe_apic_wait_icr_idle();
503
504 /*
505 * Give the other CPU some time to accept the IPI.
506 */
507 udelay(200);
508 /*
509 * Due to the Pentium erratum 3AP.
510 */
511 if (maxlvt > 3) {
512 apic_write(APIC_ESR, 0);
513 }
514 accept_status = (apic_read(APIC_ESR) & 0xEF);
515 if (send_status || accept_status)
516 break;
517 }
518 Dprintk("After Startup.\n");
519
520 if (send_status)
521 printk(KERN_ERR "APIC never delivered???\n");
522 if (accept_status)
523 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
524
525 return (send_status | accept_status);
526}
527
528struct create_idle {
529 struct work_struct work;
530 struct task_struct *idle;
531 struct completion done;
532 int cpu;
533};
534
535void do_fork_idle(struct work_struct *work)
536{
537 struct create_idle *c_idle =
538 container_of(work, struct create_idle, work);
539
540 c_idle->idle = fork_idle(c_idle->cpu);
541 complete(&c_idle->done);
542}
543
544/*
545 * Boot one CPU.
546 */
547static int __cpuinit do_boot_cpu(int cpu, int apicid)
548{
549 unsigned long boot_error;
550 int timeout;
551 unsigned long start_rip;
552 struct create_idle c_idle = {
553 .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle),
554 .cpu = cpu,
555 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
556 };
557
558 /* allocate memory for gdts of secondary cpus. Hotplug is considered */
559 if (!cpu_gdt_descr[cpu].address &&
560 !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
561 printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
562 return -1;
563 }
564
565 /* Allocate node local memory for AP pdas */
566 if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
567 struct x8664_pda *newpda, *pda;
568 int node = cpu_to_node(cpu);
569 pda = cpu_pda(cpu);
570 newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC,
571 node);
572 if (newpda) {
573 memcpy(newpda, pda, sizeof (struct x8664_pda));
574 cpu_pda(cpu) = newpda;
575 } else
576 printk(KERN_ERR
577 "Could not allocate node local PDA for CPU %d on node %d\n",
578 cpu, node);
579 }
580
581 alternatives_smp_switch(1);
582
583 c_idle.idle = get_idle_for_cpu(cpu);
584
585 if (c_idle.idle) {
586 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
587 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
588 init_idle(c_idle.idle, cpu);
589 goto do_rest;
590 }
591
592 /*
593	 * During the cold boot process the keventd thread is not spun up yet.
594	 * When we do cpu hot-add we create idle threads on the fly, and they
595	 * should not inherit any attributes from the calling context; hence
596	 * the clean way to create kernel threads is to do it from keventd().
597	 * The current_is_keventd() check is needed because the ACPI notifier
598	 * also queues work to keventd(), and if the caller is already running
599	 * in keventd() context, queuing and waiting here would deadlock the
600	 * keventd thread.
601 */
602 if (!keventd_up() || current_is_keventd())
603 c_idle.work.func(&c_idle.work);
604 else {
605 schedule_work(&c_idle.work);
606 wait_for_completion(&c_idle.done);
607 }
608
609 if (IS_ERR(c_idle.idle)) {
610 printk("failed fork for CPU %d\n", cpu);
611 return PTR_ERR(c_idle.idle);
612 }
613
614 set_idle_for_cpu(cpu, c_idle.idle);
615
616do_rest:
617
618 cpu_pda(cpu)->pcurrent = c_idle.idle;
619
620 start_rip = setup_trampoline();
621
622 init_rsp = c_idle.idle->thread.rsp;
623 per_cpu(init_tss,cpu).rsp0 = init_rsp;
624 initial_code = start_secondary;
625 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
626
627 printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
628 cpus_weight(cpu_present_map),
629 apicid);
630
631 /*
632 * This grunge runs the startup process for
633 * the targeted processor.
634 */
635
636 atomic_set(&init_deasserted, 0);
637
638 Dprintk("Setting warm reset code and vector.\n");
639
640 CMOS_WRITE(0xa, 0xf);
641 local_flush_tlb();
642 Dprintk("1.\n");
643 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
644 Dprintk("2.\n");
645 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
646 Dprintk("3.\n");
647
648 /*
649 * Be paranoid about clearing APIC errors.
650 */
651 apic_write(APIC_ESR, 0);
652 apic_read(APIC_ESR);
653
654 /*
655 * Status is now clean
656 */
657 boot_error = 0;
658
659 /*
660 * Starting actual IPI sequence...
661 */
662 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
663
664 if (!boot_error) {
665 /*
666 * allow APs to start initializing.
667 */
668 Dprintk("Before Callout %d.\n", cpu);
669 cpu_set(cpu, cpu_callout_map);
670 Dprintk("After Callout %d.\n", cpu);
671
672 /*
673 * Wait 5s total for a response
674 */
675 for (timeout = 0; timeout < 50000; timeout++) {
676 if (cpu_isset(cpu, cpu_callin_map))
677 break; /* It has booted */
678 udelay(100);
679 }
680
681 if (cpu_isset(cpu, cpu_callin_map)) {
682 /* number CPUs logically, starting from 1 (BSP is 0) */
683 Dprintk("CPU has booted.\n");
684 } else {
685 boot_error = 1;
686 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
687 == 0xA5)
688 /* trampoline started but...? */
689 printk("Stuck ??\n");
690 else
691 /* trampoline code not run */
692 printk("Not responding.\n");
693#ifdef APIC_DEBUG
694 inquire_remote_apic(apicid);
695#endif
696 }
697 }
698 if (boot_error) {
699 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
700 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
701 clear_node_cpumask(cpu); /* was set by numa_add_cpu */
702 cpu_clear(cpu, cpu_present_map);
703 cpu_clear(cpu, cpu_possible_map);
704 x86_cpu_to_apicid[cpu] = BAD_APICID;
705 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
706 return -EIO;
707 }
708
709 return 0;
710}
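
As a purely illustrative aside (not part of this file), the address arithmetic used above can be checked in user space: do_boot_cpu() stores the trampoline address as a real-mode segment:offset pair in the warm-reset vector at physical 0x467/0x469, and wakeup_secondary_via_INIT() sends bits 12-19 of the same address as the STARTUP IPI vector, which is why the trampoline must be 4 KiB aligned and live below 1 MiB. The 0x6000 base below is a made-up example value, not the kernel's actual SMP_TRAMPOLINE_BASE.

#include <stdio.h>

int main(void)
{
	unsigned long start_rip = 0x6000;	/* hypothetical trampoline base */
	unsigned short seg  = start_rip >> 4;	/* written to phys 0x469 */
	unsigned short off  = start_rip & 0xf;	/* written to phys 0x467 */
	unsigned char  sipi = start_rip >> 12;	/* vector in the STARTUP IPI */

	/* segment:offset must reconstruct the original address */
	printf("warm reset vector %04x:%04x -> %#lx\n",
	       (unsigned)seg, (unsigned)off,
	       ((unsigned long)seg << 4) + off);
	/* a SIPI vector of V makes the AP start at real-mode CS = V << 8, IP = 0 */
	printf("SIPI vector %#x -> real-mode CS %#x, IP 0\n",
	       (unsigned)sipi, (unsigned)sipi << 8);
	return 0;
}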
711
712cycles_t cacheflush_time;
713unsigned long cache_decay_ticks;
714
715/*
716 * Cleanup possible dangling ends...
717 */
718static __cpuinit void smp_cleanup_boot(void)
719{
720 /*
721 * Paranoid: Set warm reset code and vector here back
722 * to default values.
723 */
724 CMOS_WRITE(0, 0xf);
725
726 /*
727 * Reset trampoline flag
728 */
729 *((volatile int *) phys_to_virt(0x467)) = 0;
730}
731
732/*
733 * Fall back to non SMP mode after errors.
734 *
735 * RED-PEN audit/test this more. I bet there is more state messed up here.
736 */
737static __init void disable_smp(void)
738{
739 cpu_present_map = cpumask_of_cpu(0);
740 cpu_possible_map = cpumask_of_cpu(0);
741 if (smp_found_config)
742 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
743 else
744 phys_cpu_present_map = physid_mask_of_physid(0);
745 cpu_set(0, cpu_sibling_map[0]);
746 cpu_set(0, cpu_core_map[0]);
747}
748
749#ifdef CONFIG_HOTPLUG_CPU
750
751int additional_cpus __initdata = -1;
752
753/*
754 * cpu_possible_map should be static: it cannot change as cpus are
755 * onlined or offlined. The reason is that per-cpu data structures
756 * are allocated by some modules at init time, and they don't expect
757 * to do this dynamically on cpu arrival/departure.
758 * cpu_present_map, on the other hand, can change dynamically.
759 * When cpu hotplug is not compiled in, we fall back to the current
760 * behaviour, which is cpu_possible == cpu_present.
761 * - Ashok Raj
762 *
763 * Three ways to find out the number of additional hotplug CPUs:
764 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
765 * - The user can override it with additional_cpus=NUM
766 * - Otherwise don't reserve additional CPUs.
767 * We do this because additional CPUs waste a lot of memory.
768 * -AK
769 */
770__init void prefill_possible_map(void)
771{
772 int i;
773 int possible;
774
775 if (additional_cpus == -1) {
776 if (disabled_cpus > 0)
777 additional_cpus = disabled_cpus;
778 else
779 additional_cpus = 0;
780 }
781 possible = num_processors + additional_cpus;
782 if (possible > NR_CPUS)
783 possible = NR_CPUS;
784
785 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
786 possible,
787 max_t(int, possible - num_processors, 0));
788
789 for (i = 0; i < possible; i++)
790 cpu_set(i, cpu_possible_map);
791}
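
A user-space sketch of the sizing policy that prefill_possible_map() above implements, with made-up numbers: additional_cpus defaults to the BIOS-disabled CPU count and the sum is clamped to the NR_CPUS limit.

#include <stdio.h>

int main(void)
{
	int num_processors  = 2;	/* CPUs enabled in the MP/ACPI tables */
	int disabled_cpus   = 2;	/* CPUs the BIOS listed as disabled */
	int additional_cpus = -1;	/* no additional_cpus= on the command line */
	int nr_cpus_limit   = 8;	/* stand-in for the NR_CPUS config limit */
	int possible;

	if (additional_cpus == -1)
		additional_cpus = disabled_cpus > 0 ? disabled_cpus : 0;

	possible = num_processors + additional_cpus;
	if (possible > nr_cpus_limit)
		possible = nr_cpus_limit;

	printf("SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible,
	       possible - num_processors > 0 ? possible - num_processors : 0);
	return 0;
}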
792#endif
793
794/*
795 * Various sanity checks.
796 */
797static int __init smp_sanity_check(unsigned max_cpus)
798{
799 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
800 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
801 hard_smp_processor_id());
802 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
803 }
804
805 /*
806 * If we couldn't find an SMP configuration at boot time,
807 * get out of here now!
808 */
809 if (!smp_found_config) {
810 printk(KERN_NOTICE "SMP motherboard not detected.\n");
811 disable_smp();
812 if (APIC_init_uniprocessor())
813 printk(KERN_NOTICE "Local APIC not detected."
814 " Using dummy APIC emulation.\n");
815 return -1;
816 }
817
818 /*
819 * Should not be necessary because the MP table should list the boot
820 * CPU too, but we do it for the sake of robustness anyway.
821 */
822 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
823 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
824 boot_cpu_id);
825 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
826 }
827
828 /*
829 * If we couldn't find a local APIC, then get out of here now!
830 */
831 if (!cpu_has_apic) {
832 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
833 boot_cpu_id);
834 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
835 nr_ioapics = 0;
836 return -1;
837 }
838
839 /*
840 * If SMP should be disabled, then really disable it!
841 */
842 if (!max_cpus) {
843 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
844 nr_ioapics = 0;
845 return -1;
846 }
847
848 return 0;
849}
850
851/*
852 * Prepare for SMP bootup. The MP table or ACPI has been read
853 * earlier. Just do some sanity checking here and enable APIC mode.
854 */
855void __init smp_prepare_cpus(unsigned int max_cpus)
856{
857 nmi_watchdog_default();
858 current_cpu_data = boot_cpu_data;
859 current_thread_info()->cpu = 0; /* needed? */
860 set_cpu_sibling_map(0);
861
862 if (smp_sanity_check(max_cpus) < 0) {
863 printk(KERN_INFO "SMP disabled\n");
864 disable_smp();
865 return;
866 }
867
868
869 /*
870 * Switch from PIC to APIC mode.
871 */
872 setup_local_APIC();
873
874 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
875 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
876 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
877 /* Or can we switch back to PIC here? */
878 }
879
880 /*
881 * Now start the IO-APICs
882 */
883 if (!skip_ioapic_setup && nr_ioapics)
884 setup_IO_APIC();
885 else
886 nr_ioapics = 0;
887
888 /*
889 * Set up local APIC timer on boot CPU.
890 */
891
892 setup_boot_APIC_clock();
893}
894
895/*
896 * Early setup to make printk work.
897 */
898void __init smp_prepare_boot_cpu(void)
899{
900 int me = smp_processor_id();
901 cpu_set(me, cpu_online_map);
902 cpu_set(me, cpu_callout_map);
903 per_cpu(cpu_state, me) = CPU_ONLINE;
904}
905
906/*
907 * Entry point to boot a CPU.
908 */
909int __cpuinit __cpu_up(unsigned int cpu)
910{
911 int apicid = cpu_present_to_apicid(cpu);
912 unsigned long flags;
913 int err;
914
915 WARN_ON(irqs_disabled());
916
917 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
918
919 if (apicid == BAD_APICID || apicid == boot_cpu_id ||
920 !physid_isset(apicid, phys_cpu_present_map)) {
921 printk("__cpu_up: bad cpu %d\n", cpu);
922 return -EINVAL;
923 }
924
925 /*
926 * Already booted CPU?
927 */
928 if (cpu_isset(cpu, cpu_callin_map)) {
929 Dprintk("do_boot_cpu %d Already started\n", cpu);
930 return -ENOSYS;
931 }
932
933 /*
934 * Save current MTRR state in case it was changed since early boot
935 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
936 */
937 mtrr_save_state();
938
939 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
940 /* Boot it! */
941 err = do_boot_cpu(cpu, apicid);
942 if (err < 0) {
943 Dprintk("do_boot_cpu failed %d\n", err);
944 return err;
945 }
946
947 /* Unleash the CPU! */
948 Dprintk("waiting for cpu %d\n", cpu);
949
950 /*
951	 * Make sure to check TSC sync:
952 */
953 local_irq_save(flags);
954 check_tsc_sync_source(cpu);
955 local_irq_restore(flags);
956
957 while (!cpu_isset(cpu, cpu_online_map))
958 cpu_relax();
959 err = 0;
960
961 return err;
962}
963
964/*
965 * Finish the SMP boot.
966 */
967void __init smp_cpus_done(unsigned int max_cpus)
968{
969 smp_cleanup_boot();
970 setup_ioapic_dest();
971 check_nmi_watchdog();
972}
973
974#ifdef CONFIG_HOTPLUG_CPU
975
976static void remove_siblinginfo(int cpu)
977{
978 int sibling;
979 struct cpuinfo_x86 *c = cpu_data;
980
981 for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
982 cpu_clear(cpu, cpu_core_map[sibling]);
983 /*
984 * last thread sibling in this cpu core going down
985 */
986 if (cpus_weight(cpu_sibling_map[cpu]) == 1)
987 c[sibling].booted_cores--;
988 }
989
990 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
991 cpu_clear(cpu, cpu_sibling_map[sibling]);
992 cpus_clear(cpu_sibling_map[cpu]);
993 cpus_clear(cpu_core_map[cpu]);
994 c[cpu].phys_proc_id = 0;
995 c[cpu].cpu_core_id = 0;
996 cpu_clear(cpu, cpu_sibling_setup_map);
997}
998
999void remove_cpu_from_maps(void)
1000{
1001 int cpu = smp_processor_id();
1002
1003 cpu_clear(cpu, cpu_callout_map);
1004 cpu_clear(cpu, cpu_callin_map);
1005 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
1006 clear_node_cpumask(cpu);
1007}
1008
1009int __cpu_disable(void)
1010{
1011 int cpu = smp_processor_id();
1012
1013 /*
1014 * Perhaps use cpufreq to drop frequency, but that could go
1015 * into generic code.
1016 *
1017 * We won't take down the boot processor on i386 due to some
1018 * interrupts only being able to be serviced by the BSP.
1019 * Especially so if we're not using an IOAPIC -zwane
1020 */
1021 if (cpu == 0)
1022 return -EBUSY;
1023
1024 if (nmi_watchdog == NMI_LOCAL_APIC)
1025 stop_apic_nmi_watchdog(NULL);
1026 clear_local_APIC();
1027
1028 /*
1029 * HACK:
1030 * Allow any queued timer interrupts to get serviced
1031	 * This is only a temporary solution until we clean up
1032 * fixup_irqs as we do for IA64.
1033 */
1034 local_irq_enable();
1035 mdelay(1);
1036
1037 local_irq_disable();
1038 remove_siblinginfo(cpu);
1039
1040 spin_lock(&vector_lock);
1041 /* It's now safe to remove this processor from the online map */
1042 cpu_clear(cpu, cpu_online_map);
1043 spin_unlock(&vector_lock);
1044 remove_cpu_from_maps();
1045 fixup_irqs(cpu_online_map);
1046 return 0;
1047}
1048
1049void __cpu_die(unsigned int cpu)
1050{
1051 /* We don't do anything here: idle task is faking death itself. */
1052 unsigned int i;
1053
1054 for (i = 0; i < 10; i++) {
1055 /* They ack this in play_dead by setting CPU_DEAD */
1056 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1057 printk ("CPU %d is now offline\n", cpu);
1058 if (1 == num_online_cpus())
1059 alternatives_smp_switch(0);
1060 return;
1061 }
1062 msleep(100);
1063 }
1064 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1065}
1066
1067static __init int setup_additional_cpus(char *s)
1068{
1069 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1070}
1071early_param("additional_cpus", setup_additional_cpus);
1072
1073#else /* ... !CONFIG_HOTPLUG_CPU */
1074
1075int __cpu_disable(void)
1076{
1077 return -ENOSYS;
1078}
1079
1080void __cpu_die(unsigned int cpu)
1081{
1082 /* We said "no" in __cpu_disable */
1083 BUG();
1084}
1085#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
new file mode 100644
index 000000000000..bbfe85a0f699
--- /dev/null
+++ b/arch/x86/kernel/smpcommon_32.c
@@ -0,0 +1,81 @@
1/*
2 * SMP stuff which is common to all sub-architectures.
3 */
4#include <linux/module.h>
5#include <asm/smp.h>
6
7DEFINE_PER_CPU(unsigned long, this_cpu_off);
8EXPORT_PER_CPU_SYMBOL(this_cpu_off);
9
10/* Initialize the CPU's GDT. This is either the boot CPU doing it for itself
11 (still using the master per-cpu area), or a CPU doing it for a
12 secondary which will soon come up. */
13__cpuinit void init_gdt(int cpu)
14{
15 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
16
17 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
18 (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
19 __per_cpu_offset[cpu], 0xFFFFF,
20 0x80 | DESCTYPE_S | 0x2, 0x8);
21
22 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
23 per_cpu(cpu_number, cpu) = cpu;
24}
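
A conceptual, user-space-only sketch of what the GDT_ENTRY_PERCPU segment set up above provides: its base is this CPU's entry in __per_cpu_offset, so a segment-relative access at a per-cpu variable's link-time offset lands in that CPU's private copy. All addresses below are invented for illustration.

#include <stdio.h>

int main(void)
{
	/* hypothetical per-cpu area bases, one per CPU (used as segment bases) */
	unsigned long per_cpu_offset[2] = { 0xc1800000UL, 0xc1804000UL };
	unsigned long var_offset = 0x10;	/* link-time offset of one per-cpu variable */

	for (int cpu = 0; cpu < 2; cpu++)
		printf("cpu%d: segment base %#lx + offset %#lx -> %#lx\n",
		       cpu, per_cpu_offset[cpu], var_offset,
		       per_cpu_offset[cpu] + var_offset);
	return 0;
}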
25
26
27/**
28 * smp_call_function(): Run a function on all other CPUs.
29 * @func: The function to run. This must be fast and non-blocking.
30 * @info: An arbitrary pointer to pass to the function.
31 * @nonatomic: Unused.
32 * @wait: If true, wait (atomically) until function has completed on other CPUs.
33 *
34 * Returns 0 on success, else a negative status code.
35 *
36 * If @wait is true, then returns once @func has returned; otherwise
37 * it returns just before the target cpu calls @func.
38 *
39 * You must not call this function with disabled interrupts or from a
40 * hardware interrupt handler or from a bottom half handler.
41 */
42int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
43 int wait)
44{
45 return smp_call_function_mask(cpu_online_map, func, info, wait);
46}
47EXPORT_SYMBOL(smp_call_function);
48
49/**
50 * smp_call_function_single - Run a function on a specific CPU
51 * @cpu: The target CPU. Cannot be the calling CPU.
52 * @func: The function to run. This must be fast and non-blocking.
53 * @info: An arbitrary pointer to pass to the function.
54 * @nonatomic: Unused.
55 * @wait: If true, wait until function has completed on other CPUs.
56 *
57 * Returns 0 on success, else a negative status code.
58 *
59 * If @wait is true, then returns once @func has returned; otherwise
60 * it returns just before the target cpu calls @func.
61 */
62int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
63 int nonatomic, int wait)
64{
65 /* prevent preemption and reschedule on another processor */
66 int ret;
67 int me = get_cpu();
68 if (cpu == me) {
69 local_irq_disable();
70 func(info);
71 local_irq_enable();
72 put_cpu();
73 return 0;
74 }
75
76 ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
77
78 put_cpu();
79 return ret;
80}
81EXPORT_SYMBOL(smp_call_function_single);
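
A hedged usage sketch for the helpers above (not part of this file): run a short, non-blocking callback on one specific CPU and wait for it to finish. The callback name and the target CPU number are made up for the example; the call signature is the one defined just above.

/* in code that includes <linux/smp.h> and <linux/kernel.h> */
static void flush_my_state(void *info)
{
	/* runs on the target CPU, possibly in interrupt context: keep it short */
}

static void example_caller(void)
{
	/* must not be called with interrupts disabled or from IRQ context */
	if (smp_call_function_single(2, flush_my_state, NULL, 0, 1))
		printk(KERN_WARNING "flush_my_state did not run on CPU 2\n");
}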
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
new file mode 100644
index 000000000000..2a8713ec0f9a
--- /dev/null
+++ b/arch/x86/kernel/srat_32.c
@@ -0,0 +1,360 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34
35/*
36 * proximity macros and definitions
37 */
38#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
39#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
40#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
41#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
42/* bitmap length; _PXM is at most 255 */
43#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
44static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
45
46#define MAX_CHUNKS_PER_NODE 3
47#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
48struct node_memory_chunk_s {
49 unsigned long start_pfn;
50 unsigned long end_pfn;
51 u8 pxm; // proximity domain of node
52 u8 nid; // which cnode contains this chunk?
53 u8 bank; // which mem bank on this node
54};
55static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
56
57static int num_memory_chunks; /* total number of memory chunks */
58static u8 __initdata apicid_to_pxm[MAX_APICID];
59
60extern void * boot_ioremap(unsigned long, unsigned long);
61
62/* Identify CPU proximity domains */
63static void __init parse_cpu_affinity_structure(char *p)
64{
65 struct acpi_srat_cpu_affinity *cpu_affinity =
66 (struct acpi_srat_cpu_affinity *) p;
67
68 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
69 return; /* empty entry */
70
71 /* mark this node as "seen" in node bitmap */
72 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
73
74 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
75
76 printk("CPU 0x%02X in proximity domain 0x%02X\n",
77 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
78}
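
A small user-space rendering of the pxm_bitmap helpers used above, just to make the bit arithmetic concrete: proximity domain N lives in byte N/8, bit N%8 of the bitmap. The domain numbers are arbitrary examples.

#include <stdio.h>

#define NODE_ARRAY_INDEX(x)	((x) / 8)
#define NODE_ARRAY_OFFSET(x)	((x) % 8)
#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))

int main(void)
{
	unsigned char bitmap[256 / 8] = { 0 };

	BMAP_SET(bitmap, 0);	/* proximity domain 0  -> byte 0, bit 0 */
	BMAP_SET(bitmap, 10);	/* proximity domain 10 -> byte 1, bit 2 */

	printf("domain 10 seen: %d\n", BMAP_TEST(bitmap, 10) ? 1 : 0);
	printf("domain 11 seen: %d\n", BMAP_TEST(bitmap, 11) ? 1 : 0);
	return 0;
}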
79
80/*
81 * Identify memory proximity domains and hot-remove capabilities.
82 * Fill node memory chunk list structure.
83 */
84static void __init parse_memory_affinity_structure (char *sratp)
85{
86 unsigned long long paddr, size;
87 unsigned long start_pfn, end_pfn;
88 u8 pxm;
89 struct node_memory_chunk_s *p, *q, *pend;
90 struct acpi_srat_mem_affinity *memory_affinity =
91 (struct acpi_srat_mem_affinity *) sratp;
92
93 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
94 return; /* empty entry */
95
96 pxm = memory_affinity->proximity_domain & 0xff;
97
98 /* mark this node as "seen" in node bitmap */
99 BMAP_SET(pxm_bitmap, pxm);
100
101 /* calculate info for memory chunk structure */
102 paddr = memory_affinity->base_address;
103 size = memory_affinity->length;
104
105 start_pfn = paddr >> PAGE_SHIFT;
106 end_pfn = (paddr + size) >> PAGE_SHIFT;
107
108
109 if (num_memory_chunks >= MAXCHUNKS) {
110 printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
111 size/(1024*1024), paddr);
112 return;
113 }
114
115 /* Insertion sort based on base address */
116 pend = &node_memory_chunk[num_memory_chunks];
117 for (p = &node_memory_chunk[0]; p < pend; p++) {
118 if (start_pfn < p->start_pfn)
119 break;
120 }
121 if (p < pend) {
122 for (q = pend; q >= p; q--)
123 *(q + 1) = *q;
124 }
125 p->start_pfn = start_pfn;
126 p->end_pfn = end_pfn;
127 p->pxm = pxm;
128
129 num_memory_chunks++;
130
131 printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
132 start_pfn, end_pfn,
133 memory_affinity->memory_type,
134 pxm,
135 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
136 "enabled and removable" : "enabled" ) );
137}
138
139/*
140 * The SRAT table always lists ascending addresses, so we can always
141 * assume that the first "start" address that you see is the real
142 * start of the node, and that the current "end" address is after
143 * the previous one.
144 */
145static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
146{
147 /*
148 * Only add present memory as told by the e820.
149 * There is no guarantee from the SRAT that the memory it
150 * enumerates is present at boot time because it represents
151 * *possible* memory hotplug areas the same as normal RAM.
152 */
153 if (memory_chunk->start_pfn >= max_pfn) {
154 printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n",
155 memory_chunk->start_pfn, memory_chunk->end_pfn);
156 return;
157 }
158 if (memory_chunk->nid != nid)
159 return;
160
161 if (!node_has_online_mem(nid))
162 node_start_pfn[nid] = memory_chunk->start_pfn;
163
164 if (node_start_pfn[nid] > memory_chunk->start_pfn)
165 node_start_pfn[nid] = memory_chunk->start_pfn;
166
167 if (node_end_pfn[nid] < memory_chunk->end_pfn)
168 node_end_pfn[nid] = memory_chunk->end_pfn;
169}
170
171/* Parse the ACPI Static Resource Affinity Table */
172static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
173{
174 u8 *start, *end, *p;
175 int i, j, nid;
176
177 start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
178 p = start;
179 end = (u8 *)sratp + sratp->header.length;
180
181 memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
182 memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
183
184 num_memory_chunks = 0;
185 while (p < end) {
186 switch (*p) {
187 case ACPI_SRAT_TYPE_CPU_AFFINITY:
188 parse_cpu_affinity_structure(p);
189 break;
190 case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
191 parse_memory_affinity_structure(p);
192 break;
193 default:
194 printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
195 break;
196 }
197 p += p[1];
198 if (p[1] == 0) {
199 printk("acpi20_parse_srat: Entry length value is zero;"
200 " can't parse any further!\n");
201 break;
202 }
203 }
204
205 if (num_memory_chunks == 0) {
206		printk("could not find any ACPI SRAT memory areas.\n");
207 goto out_fail;
208 }
209
210 /* Calculate total number of nodes in system from PXM bitmap and create
211 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
212 * to specify the range of _PXM values.)
213 */
214 /*
215 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
216 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
217 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
218 * approaches MAX_PXM_DOMAINS for i386.
219 */
220 nodes_clear(node_online_map);
221 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
222 if (BMAP_TEST(pxm_bitmap, i)) {
223 int nid = acpi_map_pxm_to_node(i);
224 node_set_online(nid);
225 }
226 }
227 BUG_ON(num_online_nodes() == 0);
228
229 /* set cnode id in memory chunk structure */
230 for (i = 0; i < num_memory_chunks; i++)
231 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
232
233 printk("pxm bitmap: ");
234 for (i = 0; i < sizeof(pxm_bitmap); i++) {
235 printk("%02X ", pxm_bitmap[i]);
236 }
237 printk("\n");
238 printk("Number of logical nodes in system = %d\n", num_online_nodes());
239 printk("Number of memory chunks in system = %d\n", num_memory_chunks);
240
241 for (i = 0; i < MAX_APICID; i++)
242 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
243
244 for (j = 0; j < num_memory_chunks; j++){
245 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
246 printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
247 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
248 node_read_chunk(chunk->nid, chunk);
249 add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
250 }
251
252 for_each_online_node(nid) {
253 unsigned long start = node_start_pfn[nid];
254 unsigned long end = node_end_pfn[nid];
255
256 memory_present(nid, start, end);
257 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
258 }
259 return 1;
260out_fail:
261 return 0;
262}
263
264struct acpi_static_rsdt {
265 struct acpi_table_rsdt table;
266 u32 padding[7]; /* Allow for 7 more table entries */
267};
268
269int __init get_memcfg_from_srat(void)
270{
271 struct acpi_table_header *header = NULL;
272 struct acpi_table_rsdp *rsdp = NULL;
273 struct acpi_table_rsdt *rsdt = NULL;
274 acpi_native_uint rsdp_address = 0;
275 struct acpi_static_rsdt saved_rsdt;
276 int tables = 0;
277 int i = 0;
278
279 rsdp_address = acpi_find_rsdp();
280 if (!rsdp_address) {
281 printk("%s: System description tables not found\n",
282 __FUNCTION__);
283 goto out_err;
284 }
285
286 printk("%s: assigning address to rsdp\n", __FUNCTION__);
287 rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
288 if (!rsdp) {
289 printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
290 goto out_err;
291 }
292
293 printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
294 rsdp->oem_id);
295
296 if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
297 printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
298 goto out_err;
299 }
300
301 rsdt = (struct acpi_table_rsdt *)
302 boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
303
304 if (!rsdt) {
305 printk(KERN_WARNING
306 "%s: ACPI: Invalid root system description tables (RSDT)\n",
307 __FUNCTION__);
308 goto out_err;
309 }
310
311 header = &rsdt->header;
312
313 if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
314 printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
315 goto out_err;
316 }
317
318 /*
319 * The number of tables is computed by taking the
320	 * size of all entries (total size of the RSDT minus
321	 * the header size) divided by the size of each entry
322 * (4-byte table pointers).
323 */
324 tables = (header->length - sizeof(struct acpi_table_header)) / 4;
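	/*
	 * Worked example with hypothetical numbers: a 36-byte ACPI table
	 * header followed by five 4-byte entry pointers gives
	 * header->length == 36 + 5*4 == 56, so tables == (56 - 36) / 4 == 5.
	 */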
325
326 if (!tables)
327 goto out_err;
328
329 memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
330
331 if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
332 printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
333 saved_rsdt.table.header.length);
334 goto out_err;
335 }
336
337 printk("Begin SRAT table scan....\n");
338
339 for (i = 0; i < tables; i++) {
340 /* Map in header, then map in full table length. */
341 header = (struct acpi_table_header *)
342 boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
343 if (!header)
344 break;
345 header = (struct acpi_table_header *)
346 boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
347 if (!header)
348 break;
349
350 if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
351 continue;
352
353 /* we've found the srat table. don't need to look at any more tables */
354 return acpi20_parse_srat((struct acpi_table_srat *)header);
355 }
356out_err:
357 remove_all_active_ranges();
358 printk("failed to get NUMA memory information from SRAT table\n");
359 return 0;
360}
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
new file mode 100644
index 000000000000..cb9109113584
--- /dev/null
+++ b/arch/x86/kernel/stacktrace.c
@@ -0,0 +1,54 @@
1/*
2 * arch/x86_64/kernel/stacktrace.c
3 *
4 * Stack trace management functions
5 *
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/sched.h>
9#include <linux/stacktrace.h>
10#include <linux/module.h>
11#include <asm/stacktrace.h>
12
13static void save_stack_warning(void *data, char *msg)
14{
15}
16
17static void
18save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
19{
20}
21
22static int save_stack_stack(void *data, char *name)
23{
24 return -1;
25}
26
27static void save_stack_address(void *data, unsigned long addr)
28{
29 struct stack_trace *trace = (struct stack_trace *)data;
30 if (trace->skip > 0) {
31 trace->skip--;
32 return;
33 }
34 if (trace->nr_entries < trace->max_entries)
35 trace->entries[trace->nr_entries++] = addr;
36}
37
38static struct stacktrace_ops save_stack_ops = {
39 .warning = save_stack_warning,
40 .warning_symbol = save_stack_warning_symbol,
41 .stack = save_stack_stack,
42 .address = save_stack_address,
43};
44
45/*
46 * Save stack-backtrace addresses into a stack_trace buffer.
47 */
48void save_stack_trace(struct stack_trace *trace)
49{
50 dump_trace(current, NULL, NULL, &save_stack_ops, trace);
51 if (trace->nr_entries < trace->max_entries)
52 trace->entries[trace->nr_entries++] = ULONG_MAX;
53}
54EXPORT_SYMBOL(save_stack_trace);
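
A hedged usage sketch (not part of this file): capture the current call chain into a caller-supplied buffer using the interface exported above. The 16-entry buffer size is arbitrary, and print_stack_trace() is assumed to be the generic helper from kernel/stacktrace.c.

static void example_capture(void)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.max_entries	= 16,
		.entries	= entries,
		.skip		= 1,	/* drop example_capture() itself */
	};

	save_stack_trace(&trace);
	print_stack_trace(&trace, 0);	/* assumed helper; 0 = no extra indentation */
}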
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
new file mode 100644
index 000000000000..d0e01a3acf35
--- /dev/null
+++ b/arch/x86/kernel/summit_32.c
@@ -0,0 +1,180 @@
1/*
2 * arch/i386/kernel/summit.c - IBM Summit-Specific Code
3 *
4 * Written By: Matthew Dobson, IBM Corporation
5 *
6 * Copyright (c) 2003 IBM Corp.
7 *
8 * All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * Send feedback to <colpatch@us.ibm.com>
26 *
27 */
28
29#include <linux/mm.h>
30#include <linux/init.h>
31#include <asm/io.h>
32#include <asm/mach-summit/mach_mpparse.h>
33
34static struct rio_table_hdr *rio_table_hdr __initdata;
35static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
36static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
37
38static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
39{
40 int twister = 0, node = 0;
41 int i, bus, num_buses;
42
43 for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
44 if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id){
45 twister = rio_devs[i]->owner_id;
46 break;
47 }
48 }
49 if (i == rio_table_hdr->num_rio_dev){
50 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__);
51 return last_bus;
52 }
53
54 for(i = 0; i < rio_table_hdr->num_scal_dev; i++){
55 if (scal_devs[i]->node_id == twister){
56 node = scal_devs[i]->node_id;
57 break;
58 }
59 }
60 if (i == rio_table_hdr->num_scal_dev){
61 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__);
62 return last_bus;
63 }
64
65 switch (rio_devs[wpeg_num]->type){
66 case CompatWPEG:
67		/* The Compatibility Winnipeg controls the 2 legacy buses,
68 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
69 * a PCI-PCI bridge card is used in either slot: total 5 buses.
70 */
71 num_buses = 5;
72 break;
73 case AltWPEG:
74 /* The Alternate Winnipeg controls the 2 133MHz buses [1 slot
75 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
76 * the "extra" buses for each of those slots: total 7 buses.
77 */
78 num_buses = 7;
79 break;
80 case LookOutAWPEG:
81 case LookOutBWPEG:
82 /* A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
83 * & the "extra" buses for each of those slots: total 9 buses.
84 */
85 num_buses = 9;
86 break;
87 default:
88 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__);
89 return last_bus;
90 }
91
92 for(bus = last_bus; bus < last_bus + num_buses; bus++)
93 mp_bus_id_to_node[bus] = node;
94 return bus;
95}
96
97static int __init build_detail_arrays(void)
98{
99 unsigned long ptr;
100 int i, scal_detail_size, rio_detail_size;
101
102 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
103 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
104 return 0;
105 }
106
107 switch (rio_table_hdr->version){
108 default:
109 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version);
110 return 0;
111 case 2:
112 scal_detail_size = 11;
113 rio_detail_size = 13;
114 break;
115 case 3:
116 scal_detail_size = 12;
117 rio_detail_size = 15;
118 break;
119 }
120
121 ptr = (unsigned long)rio_table_hdr + 3;
122 for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
123 scal_devs[i] = (struct scal_detail *)ptr;
124
125 for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
126 rio_devs[i] = (struct rio_detail *)ptr;
127
128 return 1;
129}
130
131void __init setup_summit(void)
132{
133 unsigned long ptr;
134 unsigned short offset;
135 int i, next_wpeg, next_bus = 0;
136
137 /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
138 ptr = *(unsigned short *)phys_to_virt(0x40Eul);
139 ptr = (unsigned long)phys_to_virt(ptr << 4);
140
141 rio_table_hdr = NULL;
142 offset = 0x180;
143 while (offset){
144 /* The block id is stored in the 2nd word */
145 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
146 /* set the pointer past the offset & block id */
147 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
148 break;
149 }
150 /* The next offset is stored in the 1st word. 0 means no more */
151 offset = *((unsigned short *)(ptr + offset));
152 }
153 if (!rio_table_hdr){
154 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__);
155 return;
156 }
157
158 if (!build_detail_arrays())
159 return;
160
161 /* The first Winnipeg we're looking for has an index of 0 */
162 next_wpeg = 0;
163 do {
164 for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
165 if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg){
166 /* It's the Winnipeg we're looking for! */
167 next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
168 next_wpeg++;
169 break;
170 }
171 }
172 /*
173 * If we go through all Rio devices and don't find one with
174 * the next index, it means we've found all the Winnipegs,
175 * and thus all the PCI buses.
176 */
177 if (i == rio_table_hdr->num_rio_dev)
178 next_wpeg = 0;
179 } while (next_wpeg != 0);
180}
diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c
new file mode 100644
index 000000000000..573c0a6e0ac6
--- /dev/null
+++ b/arch/x86/kernel/suspend_64.c
@@ -0,0 +1,239 @@
1/*
2 * Suspend support specific for i386.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/smp.h>
11#include <linux/suspend.h>
12#include <asm/proto.h>
13#include <asm/page.h>
14#include <asm/pgtable.h>
15#include <asm/mtrr.h>
16
17/* References to section boundaries */
18extern const void __nosave_begin, __nosave_end;
19
20struct saved_context saved_context;
21
22unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx;
23unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi;
24unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11;
25unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
26unsigned long saved_context_eflags;
27
28void __save_processor_state(struct saved_context *ctxt)
29{
30 kernel_fpu_begin();
31
32 /*
33 * descriptor tables
34 */
35 asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
36 asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
37 asm volatile ("str %0" : "=m" (ctxt->tr));
38
39 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
40 /*
41 * segment registers
42 */
43 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
44 asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
45 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
46 asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
47 asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
48
49 rdmsrl(MSR_FS_BASE, ctxt->fs_base);
50 rdmsrl(MSR_GS_BASE, ctxt->gs_base);
51 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
52 mtrr_save_fixed_ranges(NULL);
53
54 /*
55 * control registers
56 */
57 rdmsrl(MSR_EFER, ctxt->efer);
58 ctxt->cr0 = read_cr0();
59 ctxt->cr2 = read_cr2();
60 ctxt->cr3 = read_cr3();
61 ctxt->cr4 = read_cr4();
62 ctxt->cr8 = read_cr8();
63}
64
65void save_processor_state(void)
66{
67 __save_processor_state(&saved_context);
68}
69
70static void do_fpu_end(void)
71{
72 /*
73 * Restore FPU regs if necessary
74 */
75 kernel_fpu_end();
76}
77
78void __restore_processor_state(struct saved_context *ctxt)
79{
80 /*
81 * control registers
82 */
83 wrmsrl(MSR_EFER, ctxt->efer);
84 write_cr8(ctxt->cr8);
85 write_cr4(ctxt->cr4);
86 write_cr3(ctxt->cr3);
87 write_cr2(ctxt->cr2);
88 write_cr0(ctxt->cr0);
89
90 /*
91 * now restore the descriptor tables to their proper values
92	 * ltr is done in fix_processor_context().
93 */
94 asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
95 asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
96
97 /*
98 * segment registers
99 */
100 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
101 asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
102 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
103 load_gs_index(ctxt->gs);
104 asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
105
106 wrmsrl(MSR_FS_BASE, ctxt->fs_base);
107 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
108 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
109
110 fix_processor_context();
111
112 do_fpu_end();
113 mtrr_ap_init();
114}
115
116void restore_processor_state(void)
117{
118 __restore_processor_state(&saved_context);
119}
120
121void fix_processor_context(void)
122{
123 int cpu = smp_processor_id();
124 struct tss_struct *t = &per_cpu(init_tss, cpu);
125
126	set_tss_desc(cpu, t);	/* This just modifies memory; it should not be necessary. But... it is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */
127
128 cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9;
129
130 syscall_init(); /* This sets MSR_*STAR and related */
131 load_TR_desc(); /* This does ltr */
132 load_LDT(&current->active_mm->context); /* This does lldt */
133
134 /*
135 * Now maybe reload the debug registers
136 */
137 if (current->thread.debugreg7){
138 loaddebug(&current->thread, 0);
139 loaddebug(&current->thread, 1);
140 loaddebug(&current->thread, 2);
141 loaddebug(&current->thread, 3);
142 /* no 4 and 5 */
143 loaddebug(&current->thread, 6);
144 loaddebug(&current->thread, 7);
145 }
146
147}
148
149#ifdef CONFIG_HIBERNATION
150/* Defined in arch/x86_64/kernel/suspend_asm.S */
151extern int restore_image(void);
152
153pgd_t *temp_level4_pgt;
154
155static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
156{
157 long i, j;
158
159 i = pud_index(address);
160 pud = pud + i;
161 for (; i < PTRS_PER_PUD; pud++, i++) {
162 unsigned long paddr;
163 pmd_t *pmd;
164
165 paddr = address + i*PUD_SIZE;
166 if (paddr >= end)
167 break;
168
169 pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
170 if (!pmd)
171 return -ENOMEM;
172 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
173 for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
174 unsigned long pe;
175
176 if (paddr >= end)
177 break;
178 pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
179 pe &= __supported_pte_mask;
180 set_pmd(pmd, __pmd(pe));
181 }
182 }
183 return 0;
184}
185
186static int set_up_temporary_mappings(void)
187{
188 unsigned long start, end, next;
189 int error;
190
191 temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
192 if (!temp_level4_pgt)
193 return -ENOMEM;
194
195 /* It is safe to reuse the original kernel mapping */
196 set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
197 init_level4_pgt[pgd_index(__START_KERNEL_map)]);
198
199 /* Set up the direct mapping from scratch */
200 start = (unsigned long)pfn_to_kaddr(0);
201 end = (unsigned long)pfn_to_kaddr(end_pfn);
202
203 for (; start < end; start = next) {
204 pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
205 if (!pud)
206 return -ENOMEM;
207 next = start + PGDIR_SIZE;
208 if (next > end)
209 next = end;
210 if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
211 return error;
212 set_pgd(temp_level4_pgt + pgd_index(start),
213 mk_kernel_pgd(__pa(pud)));
214 }
215 return 0;
216}
217
218int swsusp_arch_resume(void)
219{
220 int error;
221
222 /* We have got enough memory and from now on we cannot recover */
223 if ((error = set_up_temporary_mappings()))
224 return error;
225 restore_image();
226 return 0;
227}
228
229/*
230 * pfn_is_nosave - check if given pfn is in the 'nosave' section
231 */
232
233int pfn_is_nosave(unsigned long pfn)
234{
235 unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
236 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
237 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
238}
239#endif /* CONFIG_HIBERNATION */
diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S
new file mode 100644
index 000000000000..16d183f67bc1
--- /dev/null
+++ b/arch/x86/kernel/suspend_asm_64.S
@@ -0,0 +1,110 @@
1/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
2 *
3 * Distribute under GPLv2.
4 *
5 * swsusp_arch_resume may not use any stack or any variable that is
6 * not "NoSave" while copying pages:
7 *
8 * It's rewriting one kernel image with another. What is the stack in
9 * the "old" image could very well be a data page in the "new" image,
10 * and overwriting your own stack under you is a bad idea.
11 */
12
13 .text
14#include <linux/linkage.h>
15#include <asm/segment.h>
16#include <asm/page.h>
17#include <asm/asm-offsets.h>
18
19ENTRY(swsusp_arch_suspend)
20
21 movq %rsp, saved_context_esp(%rip)
22 movq %rax, saved_context_eax(%rip)
23 movq %rbx, saved_context_ebx(%rip)
24 movq %rcx, saved_context_ecx(%rip)
25 movq %rdx, saved_context_edx(%rip)
26 movq %rbp, saved_context_ebp(%rip)
27 movq %rsi, saved_context_esi(%rip)
28 movq %rdi, saved_context_edi(%rip)
29 movq %r8, saved_context_r08(%rip)
30 movq %r9, saved_context_r09(%rip)
31 movq %r10, saved_context_r10(%rip)
32 movq %r11, saved_context_r11(%rip)
33 movq %r12, saved_context_r12(%rip)
34 movq %r13, saved_context_r13(%rip)
35 movq %r14, saved_context_r14(%rip)
36 movq %r15, saved_context_r15(%rip)
37 pushfq ; popq saved_context_eflags(%rip)
38
39 call swsusp_save
40 ret
41
42ENTRY(restore_image)
43 /* switch to temporary page tables */
44 movq $__PAGE_OFFSET, %rdx
45 movq temp_level4_pgt(%rip), %rax
46 subq %rdx, %rax
47 movq %rax, %cr3
48 /* Flush TLB */
49 movq mmu_cr4_features(%rip), %rax
50 movq %rax, %rdx
51 andq $~(1<<7), %rdx # PGE
52 movq %rdx, %cr4; # turn off PGE
53 movq %cr3, %rcx; # flush TLB
54 movq %rcx, %cr3;
55 movq %rax, %cr4; # turn PGE back on
56
57 movq restore_pblist(%rip), %rdx
58loop:
59 testq %rdx, %rdx
60 jz done
61
62 /* get addresses from the pbe and copy the page */
63 movq pbe_address(%rdx), %rsi
64 movq pbe_orig_address(%rdx), %rdi
65 movq $512, %rcx
66 rep
67 movsq
68
69 /* progress to the next pbe */
70 movq pbe_next(%rdx), %rdx
71 jmp loop
72done:
73 /* go back to the original page tables */
74 movq $(init_level4_pgt - __START_KERNEL_map), %rax
75 addq phys_base(%rip), %rax
76 movq %rax, %cr3
77
78 /* Flush TLB, including "global" things (vmalloc) */
79 movq mmu_cr4_features(%rip), %rax
80 movq %rax, %rdx
81 andq $~(1<<7), %rdx; # PGE
82 movq %rdx, %cr4; # turn off PGE
83 movq %cr3, %rcx; # flush TLB
84 movq %rcx, %cr3
85 movq %rax, %cr4; # turn PGE back on
86
87 movl $24, %eax
88 movl %eax, %ds
89
90 movq saved_context_esp(%rip), %rsp
91 movq saved_context_ebp(%rip), %rbp
92 /* Don't restore %rax, it must be 0 anyway */
93 movq saved_context_ebx(%rip), %rbx
94 movq saved_context_ecx(%rip), %rcx
95 movq saved_context_edx(%rip), %rdx
96 movq saved_context_esi(%rip), %rsi
97 movq saved_context_edi(%rip), %rdi
98 movq saved_context_r08(%rip), %r8
99 movq saved_context_r09(%rip), %r9
100 movq saved_context_r10(%rip), %r10
101 movq saved_context_r11(%rip), %r11
102 movq saved_context_r12(%rip), %r12
103 movq saved_context_r13(%rip), %r13
104 movq saved_context_r14(%rip), %r14
105 movq saved_context_r15(%rip), %r15
106 pushq saved_context_eflags(%rip) ; popfq
107
108 xorq %rax, %rax
109
110 ret
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
new file mode 100644
index 000000000000..42147304de88
--- /dev/null
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -0,0 +1,265 @@
1/*
2 * linux/arch/i386/kernel/sys_i386.c
3 *
4 * This file contains various random system calls that
5 * have a non-standard calling sequence on the Linux/i386
6 * platform.
7 */
8
9#include <linux/errno.h>
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/fs.h>
13#include <linux/smp.h>
14#include <linux/sem.h>
15#include <linux/msg.h>
16#include <linux/shm.h>
17#include <linux/stat.h>
18#include <linux/syscalls.h>
19#include <linux/mman.h>
20#include <linux/file.h>
21#include <linux/utsname.h>
22
23#include <asm/uaccess.h>
24#include <asm/unistd.h>
25#include <asm/ipc.h>
26
27/*
28 * sys_pipe() is the normal C calling standard for creating
29 * a pipe. It's not the way Unix traditionally does this, though.
30 */
31asmlinkage int sys_pipe(unsigned long __user * fildes)
32{
33 int fd[2];
34 int error;
35
36 error = do_pipe(fd);
37 if (!error) {
38 if (copy_to_user(fildes, fd, 2*sizeof(int)))
39 error = -EFAULT;
40 }
41 return error;
42}
43
44asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
45 unsigned long prot, unsigned long flags,
46 unsigned long fd, unsigned long pgoff)
47{
48 int error = -EBADF;
49 struct file *file = NULL;
50 struct mm_struct *mm = current->mm;
51
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) {
54 file = fget(fd);
55 if (!file)
56 goto out;
57 }
58
59 down_write(&mm->mmap_sem);
60 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
61 up_write(&mm->mmap_sem);
62
63 if (file)
64 fput(file);
65out:
66 return error;
67}
68
69/*
70 * Perform the select(nd, in, out, ex, tv) and mmap() system
71 * calls. Linux/i386 originally could not handle more than
72 * 4 system call parameters, so these system calls used a memory
73 * block for parameter passing.
74 */
75
76struct mmap_arg_struct {
77 unsigned long addr;
78 unsigned long len;
79 unsigned long prot;
80 unsigned long flags;
81 unsigned long fd;
82 unsigned long offset;
83};
84
85asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
86{
87 struct mmap_arg_struct a;
88 int err = -EFAULT;
89
90 if (copy_from_user(&a, arg, sizeof(a)))
91 goto out;
92
93 err = -EINVAL;
94 if (a.offset & ~PAGE_MASK)
95 goto out;
96
97 err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
98 a.fd, a.offset >> PAGE_SHIFT);
99out:
100 return err;
101}
102
103
104struct sel_arg_struct {
105 unsigned long n;
106 fd_set __user *inp, *outp, *exp;
107 struct timeval __user *tvp;
108};
109
110asmlinkage int old_select(struct sel_arg_struct __user *arg)
111{
112 struct sel_arg_struct a;
113
114 if (copy_from_user(&a, arg, sizeof(a)))
115 return -EFAULT;
116 /* sys_select() does the appropriate kernel locking */
117 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
118}
119
120/*
121 * sys_ipc() is the de-multiplexer for the SysV IPC calls.
122 *
123 * This is really horribly ugly.
124 */
125asmlinkage int sys_ipc (uint call, int first, int second,
126 int third, void __user *ptr, long fifth)
127{
128 int version, ret;
129
130 version = call >> 16; /* hack for backward compatibility */
131 call &= 0xffff;
132
133 switch (call) {
134 case SEMOP:
135 return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
136 case SEMTIMEDOP:
137 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
138 (const struct timespec __user *)fifth);
139
140 case SEMGET:
141 return sys_semget (first, second, third);
142 case SEMCTL: {
143 union semun fourth;
144 if (!ptr)
145 return -EINVAL;
146 if (get_user(fourth.__pad, (void __user * __user *) ptr))
147 return -EFAULT;
148 return sys_semctl (first, second, third, fourth);
149 }
150
151 case MSGSND:
152 return sys_msgsnd (first, (struct msgbuf __user *) ptr,
153 second, third);
154 case MSGRCV:
155 switch (version) {
156 case 0: {
157 struct ipc_kludge tmp;
158 if (!ptr)
159 return -EINVAL;
160
161 if (copy_from_user(&tmp,
162 (struct ipc_kludge __user *) ptr,
163 sizeof (tmp)))
164 return -EFAULT;
165 return sys_msgrcv (first, tmp.msgp, second,
166 tmp.msgtyp, third);
167 }
168 default:
169 return sys_msgrcv (first,
170 (struct msgbuf __user *) ptr,
171 second, fifth, third);
172 }
173 case MSGGET:
174 return sys_msgget ((key_t) first, second);
175 case MSGCTL:
176 return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
177
178 case SHMAT:
179 switch (version) {
180 default: {
181 ulong raddr;
182 ret = do_shmat (first, (char __user *) ptr, second, &raddr);
183 if (ret)
184 return ret;
185 return put_user (raddr, (ulong __user *) third);
186 }
187 case 1: /* iBCS2 emulator entry point */
188 if (!segment_eq(get_fs(), get_ds()))
189 return -EINVAL;
190 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
191 return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
192 }
193 case SHMDT:
194 return sys_shmdt ((char __user *)ptr);
195 case SHMGET:
196 return sys_shmget (first, second, third);
197 case SHMCTL:
198 return sys_shmctl (first, second,
199 (struct shmid_ds __user *) ptr);
200 default:
201 return -ENOSYS;
202 }
203}
204
205/*
206 * Old cruft
207 */
208asmlinkage int sys_uname(struct old_utsname __user * name)
209{
210 int err;
211 if (!name)
212 return -EFAULT;
213 down_read(&uts_sem);
214 err = copy_to_user(name, utsname(), sizeof (*name));
215 up_read(&uts_sem);
216 return err?-EFAULT:0;
217}
218
219asmlinkage int sys_olduname(struct oldold_utsname __user * name)
220{
221 int error;
222
223 if (!name)
224 return -EFAULT;
225 if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
226 return -EFAULT;
227
228 down_read(&uts_sem);
229
230 error = __copy_to_user(&name->sysname, &utsname()->sysname,
231 __OLD_UTS_LEN);
232 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
233 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
234 __OLD_UTS_LEN);
235 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
236 error |= __copy_to_user(&name->release, &utsname()->release,
237 __OLD_UTS_LEN);
238 error |= __put_user(0, name->release + __OLD_UTS_LEN);
239 error |= __copy_to_user(&name->version, &utsname()->version,
240 __OLD_UTS_LEN);
241 error |= __put_user(0, name->version + __OLD_UTS_LEN);
242 error |= __copy_to_user(&name->machine, &utsname()->machine,
243 __OLD_UTS_LEN);
244 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
245
246 up_read(&uts_sem);
247
248 error = error ? -EFAULT : 0;
249
250 return error;
251}
252
253
254/*
255 * Do a system call from kernel instead of calling sys_execve so we
256 * end up with proper pt_regs.
257 */
258int kernel_execve(const char *filename, char *const argv[], char *const envp[])
259{
260 long __res;
261 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
262 : "=a" (__res)
263 : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
264 return __res;
265}
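
For context, a minimal user-space sketch (not part of the diff) of how the multiplexed ipc(2) entry point above is driven: SEMOP is call number 1 and the argument order follows the switch in sys_ipc(). This assumes a 32-bit i386 build where SYS_ipc is defined; error handling is mostly omitted.

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* One private semaphore, then a single "up" operation on it. */
	int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
	struct sembuf op = { .sem_num = 0, .sem_op = 1, .sem_flg = 0 };

	if (id < 0) {
		perror("semget");
		return 1;
	}
	/* Equivalent to semop(id, &op, 1): call=SEMOP(1), first=id,
	 * second=nsops, third unused, ptr=ops. */
	long ret = syscall(SYS_ipc, 1, id, 1, 0, &op, 0);
	printf("ipc(SEMOP) returned %ld\n", ret);
	return 0;
}
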
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
new file mode 100644
index 000000000000..4770b7a2052c
--- /dev/null
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -0,0 +1,159 @@
1/*
2 * linux/arch/x86_64/kernel/sys_x86_64.c
3 */
4
5#include <linux/errno.h>
6#include <linux/sched.h>
7#include <linux/syscalls.h>
8#include <linux/mm.h>
9#include <linux/fs.h>
10#include <linux/smp.h>
11#include <linux/sem.h>
12#include <linux/msg.h>
13#include <linux/shm.h>
14#include <linux/stat.h>
15#include <linux/mman.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/personality.h>
19
20#include <asm/uaccess.h>
21#include <asm/ia32.h>
22
23/*
24 * sys_pipe() is the normal C calling standard for creating
25 * a pipe. It's not the way Unix traditionally does this, though.
26 */
27asmlinkage long sys_pipe(int __user *fildes)
28{
29 int fd[2];
30 int error;
31
32 error = do_pipe(fd);
33 if (!error) {
34 if (copy_to_user(fildes, fd, 2*sizeof(int)))
35 error = -EFAULT;
36 }
37 return error;
38}
39
40asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
41 unsigned long fd, unsigned long off)
42{
43 long error;
44 struct file * file;
45
46 error = -EINVAL;
47 if (off & ~PAGE_MASK)
48 goto out;
49
50 error = -EBADF;
51 file = NULL;
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) {
54 file = fget(fd);
55 if (!file)
56 goto out;
57 }
58 down_write(&current->mm->mmap_sem);
59 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
60 up_write(&current->mm->mmap_sem);
61
62 if (file)
63 fput(file);
64out:
65 return error;
66}
67
68static void find_start_end(unsigned long flags, unsigned long *begin,
69 unsigned long *end)
70{
71 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
 72		/* This is usually needed to map code in the small
 73		   code model, so it has to live in the first 31 bits.
 74		   Limit it to that. This means we need to move the
 75		   unmapped base down for this case. This can conflict
 76		   with the heap, but we assume that glibc malloc
 77		   knows how to fall back to mmap. Give it 1GB
 78		   of playground for now. -AK */
79 *begin = 0x40000000;
80 *end = 0x80000000;
81 } else {
82 *begin = TASK_UNMAPPED_BASE;
83 *end = TASK_SIZE;
84 }
85}
86
87unsigned long
88arch_get_unmapped_area(struct file *filp, unsigned long addr,
89 unsigned long len, unsigned long pgoff, unsigned long flags)
90{
91 struct mm_struct *mm = current->mm;
92 struct vm_area_struct *vma;
93 unsigned long start_addr;
94 unsigned long begin, end;
95
96 if (flags & MAP_FIXED)
97 return addr;
98
99 find_start_end(flags, &begin, &end);
100
101 if (len > end)
102 return -ENOMEM;
103
104 if (addr) {
105 addr = PAGE_ALIGN(addr);
106 vma = find_vma(mm, addr);
107 if (end - len >= addr &&
108 (!vma || addr + len <= vma->vm_start))
109 return addr;
110 }
111 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
112 && len <= mm->cached_hole_size) {
113 mm->cached_hole_size = 0;
114 mm->free_area_cache = begin;
115 }
116 addr = mm->free_area_cache;
117 if (addr < begin)
118 addr = begin;
119 start_addr = addr;
120
121full_search:
122 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
123 /* At this point: (!vma || addr < vma->vm_end). */
124 if (end - len < addr) {
125 /*
126 * Start a new search - just in case we missed
127 * some holes.
128 */
129 if (start_addr != begin) {
130 start_addr = addr = begin;
131 mm->cached_hole_size = 0;
132 goto full_search;
133 }
134 return -ENOMEM;
135 }
136 if (!vma || addr + len <= vma->vm_start) {
137 /*
138 * Remember the place where we stopped the search:
139 */
140 mm->free_area_cache = addr + len;
141 return addr;
142 }
143 if (addr + mm->cached_hole_size < vma->vm_start)
144 mm->cached_hole_size = vma->vm_start - addr;
145
146 addr = vma->vm_end;
147 }
148}
149
150asmlinkage long sys_uname(struct new_utsname __user * name)
151{
152 int err;
153 down_read(&uts_sem);
154 err = copy_to_user(name, utsname(), sizeof (*name));
155 up_read(&uts_sem);
156 if (personality(current->personality) == PER_LINUX32)
157 err |= copy_to_user(&name->machine, "i686", 5);
158 return err ? -EFAULT : 0;
159}
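
A small sketch of what find_start_end() above means for user space: with MAP_32BIT (outside IA32 mode), addresses come from the 1GB window starting at 0x40000000. Illustration only, for 64-bit x86; the exact address depends on what is already mapped.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Ask for a low mapping; arch_get_unmapped_area() should pick an
	 * address in the 0x40000000-0x80000000 range set up above. */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("MAP_32BIT mapping at %p\n", p);
	munmap(p, 4096);
	return 0;
}
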
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
new file mode 100644
index 000000000000..9d498c2f8eea
--- /dev/null
+++ b/arch/x86/kernel/syscall_64.c
@@ -0,0 +1,26 @@
1/* System call table for x86-64. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __NO_STUBS
9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_
12#include <asm/unistd_64.h>
13
14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [ nr ] = sym,
16#undef _ASM_X86_64_UNISTD_H_
17
18typedef void (*sys_call_ptr_t)(void);
19
20extern void sys_ni_syscall(void);
21
22const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 23	/* Smells like a compiler bug -- it doesn't work when the & below is removed. */
24 [0 ... __NR_syscall_max] = &sys_ni_syscall,
25#include <asm/unistd_64.h>
26};
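
The table above leans on a GNU C extension: a range designator pre-fills every slot with sys_ni_syscall before the per-call entries generated from unistd_64.h override individual slots. A standalone sketch of the same idiom (hypothetical handler names, GCC/Clang only):

#include <stdio.h>

typedef void (*handler_t)(void);

static void default_handler(void) { }
static void real_handler(void) { }

/* Fill everything first, then override slot 2 -- the same pattern as
 * sys_call_table, just with made-up handlers. */
static const handler_t table[16] = {
	[0 ... 15] = default_handler,
	[2] = real_handler,
};

int main(void)
{
	printf("slot 2 overridden: %d\n", table[2] == real_handler);
	printf("slot 5 still default: %d\n", table[5] == default_handler);
	return 0;
}
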
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
new file mode 100644
index 000000000000..8344c70adf61
--- /dev/null
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -0,0 +1,326 @@
1ENTRY(sys_call_table)
2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
3 .long sys_exit
4 .long sys_fork
5 .long sys_read
6 .long sys_write
7 .long sys_open /* 5 */
8 .long sys_close
9 .long sys_waitpid
10 .long sys_creat
11 .long sys_link
12 .long sys_unlink /* 10 */
13 .long sys_execve
14 .long sys_chdir
15 .long sys_time
16 .long sys_mknod
17 .long sys_chmod /* 15 */
18 .long sys_lchown16
19 .long sys_ni_syscall /* old break syscall holder */
20 .long sys_stat
21 .long sys_lseek
22 .long sys_getpid /* 20 */
23 .long sys_mount
24 .long sys_oldumount
25 .long sys_setuid16
26 .long sys_getuid16
27 .long sys_stime /* 25 */
28 .long sys_ptrace
29 .long sys_alarm
30 .long sys_fstat
31 .long sys_pause
32 .long sys_utime /* 30 */
33 .long sys_ni_syscall /* old stty syscall holder */
34 .long sys_ni_syscall /* old gtty syscall holder */
35 .long sys_access
36 .long sys_nice
37 .long sys_ni_syscall /* 35 - old ftime syscall holder */
38 .long sys_sync
39 .long sys_kill
40 .long sys_rename
41 .long sys_mkdir
42 .long sys_rmdir /* 40 */
43 .long sys_dup
44 .long sys_pipe
45 .long sys_times
46 .long sys_ni_syscall /* old prof syscall holder */
47 .long sys_brk /* 45 */
48 .long sys_setgid16
49 .long sys_getgid16
50 .long sys_signal
51 .long sys_geteuid16
52 .long sys_getegid16 /* 50 */
53 .long sys_acct
54 .long sys_umount /* recycled never used phys() */
55 .long sys_ni_syscall /* old lock syscall holder */
56 .long sys_ioctl
57 .long sys_fcntl /* 55 */
58 .long sys_ni_syscall /* old mpx syscall holder */
59 .long sys_setpgid
60 .long sys_ni_syscall /* old ulimit syscall holder */
61 .long sys_olduname
62 .long sys_umask /* 60 */
63 .long sys_chroot
64 .long sys_ustat
65 .long sys_dup2
66 .long sys_getppid
67 .long sys_getpgrp /* 65 */
68 .long sys_setsid
69 .long sys_sigaction
70 .long sys_sgetmask
71 .long sys_ssetmask
72 .long sys_setreuid16 /* 70 */
73 .long sys_setregid16
74 .long sys_sigsuspend
75 .long sys_sigpending
76 .long sys_sethostname
77 .long sys_setrlimit /* 75 */
78 .long sys_old_getrlimit
79 .long sys_getrusage
80 .long sys_gettimeofday
81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16
84 .long old_select
85 .long sys_symlink
86 .long sys_lstat
87 .long sys_readlink /* 85 */
88 .long sys_uselib
89 .long sys_swapon
90 .long sys_reboot
91 .long old_readdir
92 .long old_mmap /* 90 */
93 .long sys_munmap
94 .long sys_truncate
95 .long sys_ftruncate
96 .long sys_fchmod
97 .long sys_fchown16 /* 95 */
98 .long sys_getpriority
99 .long sys_setpriority
100 .long sys_ni_syscall /* old profil syscall holder */
101 .long sys_statfs
102 .long sys_fstatfs /* 100 */
103 .long sys_ioperm
104 .long sys_socketcall
105 .long sys_syslog
106 .long sys_setitimer
107 .long sys_getitimer /* 105 */
108 .long sys_newstat
109 .long sys_newlstat
110 .long sys_newfstat
111 .long sys_uname
112 .long sys_iopl /* 110 */
113 .long sys_vhangup
114 .long sys_ni_syscall /* old "idle" system call */
115 .long sys_vm86old
116 .long sys_wait4
117 .long sys_swapoff /* 115 */
118 .long sys_sysinfo
119 .long sys_ipc
120 .long sys_fsync
121 .long sys_sigreturn
122 .long sys_clone /* 120 */
123 .long sys_setdomainname
124 .long sys_newuname
125 .long sys_modify_ldt
126 .long sys_adjtimex
127 .long sys_mprotect /* 125 */
128 .long sys_sigprocmask
129 .long sys_ni_syscall /* old "create_module" */
130 .long sys_init_module
131 .long sys_delete_module
132 .long sys_ni_syscall /* 130: old "get_kernel_syms" */
133 .long sys_quotactl
134 .long sys_getpgid
135 .long sys_fchdir
136 .long sys_bdflush
137 .long sys_sysfs /* 135 */
138 .long sys_personality
139 .long sys_ni_syscall /* reserved for afs_syscall */
140 .long sys_setfsuid16
141 .long sys_setfsgid16
142 .long sys_llseek /* 140 */
143 .long sys_getdents
144 .long sys_select
145 .long sys_flock
146 .long sys_msync
147 .long sys_readv /* 145 */
148 .long sys_writev
149 .long sys_getsid
150 .long sys_fdatasync
151 .long sys_sysctl
152 .long sys_mlock /* 150 */
153 .long sys_munlock
154 .long sys_mlockall
155 .long sys_munlockall
156 .long sys_sched_setparam
157 .long sys_sched_getparam /* 155 */
158 .long sys_sched_setscheduler
159 .long sys_sched_getscheduler
160 .long sys_sched_yield
161 .long sys_sched_get_priority_max
162 .long sys_sched_get_priority_min /* 160 */
163 .long sys_sched_rr_get_interval
164 .long sys_nanosleep
165 .long sys_mremap
166 .long sys_setresuid16
167 .long sys_getresuid16 /* 165 */
168 .long sys_vm86
169 .long sys_ni_syscall /* Old sys_query_module */
170 .long sys_poll
171 .long sys_nfsservctl
172 .long sys_setresgid16 /* 170 */
173 .long sys_getresgid16
174 .long sys_prctl
175 .long sys_rt_sigreturn
176 .long sys_rt_sigaction
177 .long sys_rt_sigprocmask /* 175 */
178 .long sys_rt_sigpending
179 .long sys_rt_sigtimedwait
180 .long sys_rt_sigqueueinfo
181 .long sys_rt_sigsuspend
182 .long sys_pread64 /* 180 */
183 .long sys_pwrite64
184 .long sys_chown16
185 .long sys_getcwd
186 .long sys_capget
187 .long sys_capset /* 185 */
188 .long sys_sigaltstack
189 .long sys_sendfile
190 .long sys_ni_syscall /* reserved for streams1 */
191 .long sys_ni_syscall /* reserved for streams2 */
192 .long sys_vfork /* 190 */
193 .long sys_getrlimit
194 .long sys_mmap2
195 .long sys_truncate64
196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */
198 .long sys_lstat64
199 .long sys_fstat64
200 .long sys_lchown
201 .long sys_getuid
202 .long sys_getgid /* 200 */
203 .long sys_geteuid
204 .long sys_getegid
205 .long sys_setreuid
206 .long sys_setregid
207 .long sys_getgroups /* 205 */
208 .long sys_setgroups
209 .long sys_fchown
210 .long sys_setresuid
211 .long sys_getresuid
212 .long sys_setresgid /* 210 */
213 .long sys_getresgid
214 .long sys_chown
215 .long sys_setuid
216 .long sys_setgid
217 .long sys_setfsuid /* 215 */
218 .long sys_setfsgid
219 .long sys_pivot_root
220 .long sys_mincore
221 .long sys_madvise
222 .long sys_getdents64 /* 220 */
223 .long sys_fcntl64
224 .long sys_ni_syscall /* reserved for TUX */
225 .long sys_ni_syscall
226 .long sys_gettid
227 .long sys_readahead /* 225 */
228 .long sys_setxattr
229 .long sys_lsetxattr
230 .long sys_fsetxattr
231 .long sys_getxattr
232 .long sys_lgetxattr /* 230 */
233 .long sys_fgetxattr
234 .long sys_listxattr
235 .long sys_llistxattr
236 .long sys_flistxattr
237 .long sys_removexattr /* 235 */
238 .long sys_lremovexattr
239 .long sys_fremovexattr
240 .long sys_tkill
241 .long sys_sendfile64
242 .long sys_futex /* 240 */
243 .long sys_sched_setaffinity
244 .long sys_sched_getaffinity
245 .long sys_set_thread_area
246 .long sys_get_thread_area
247 .long sys_io_setup /* 245 */
248 .long sys_io_destroy
249 .long sys_io_getevents
250 .long sys_io_submit
251 .long sys_io_cancel
252 .long sys_fadvise64 /* 250 */
253 .long sys_ni_syscall
254 .long sys_exit_group
255 .long sys_lookup_dcookie
256 .long sys_epoll_create
257 .long sys_epoll_ctl /* 255 */
258 .long sys_epoll_wait
259 .long sys_remap_file_pages
260 .long sys_set_tid_address
261 .long sys_timer_create
262 .long sys_timer_settime /* 260 */
263 .long sys_timer_gettime
264 .long sys_timer_getoverrun
265 .long sys_timer_delete
266 .long sys_clock_settime
267 .long sys_clock_gettime /* 265 */
268 .long sys_clock_getres
269 .long sys_clock_nanosleep
270 .long sys_statfs64
271 .long sys_fstatfs64
272 .long sys_tgkill /* 270 */
273 .long sys_utimes
274 .long sys_fadvise64_64
275 .long sys_ni_syscall /* sys_vserver */
276 .long sys_mbind
277 .long sys_get_mempolicy
278 .long sys_set_mempolicy
279 .long sys_mq_open
280 .long sys_mq_unlink
281 .long sys_mq_timedsend
282 .long sys_mq_timedreceive /* 280 */
283 .long sys_mq_notify
284 .long sys_mq_getsetattr
285 .long sys_kexec_load
286 .long sys_waitid
287 .long sys_ni_syscall /* 285 */ /* available */
288 .long sys_add_key
289 .long sys_request_key
290 .long sys_keyctl
291 .long sys_ioprio_set
292 .long sys_ioprio_get /* 290 */
293 .long sys_inotify_init
294 .long sys_inotify_add_watch
295 .long sys_inotify_rm_watch
296 .long sys_migrate_pages
297 .long sys_openat /* 295 */
298 .long sys_mkdirat
299 .long sys_mknodat
300 .long sys_fchownat
301 .long sys_futimesat
302 .long sys_fstatat64 /* 300 */
303 .long sys_unlinkat
304 .long sys_renameat
305 .long sys_linkat
306 .long sys_symlinkat
307 .long sys_readlinkat /* 305 */
308 .long sys_fchmodat
309 .long sys_faccessat
310 .long sys_pselect6
311 .long sys_ppoll
312 .long sys_unshare /* 310 */
313 .long sys_set_robust_list
314 .long sys_get_robust_list
315 .long sys_splice
316 .long sys_sync_file_range
317 .long sys_tee /* 315 */
318 .long sys_vmsplice
319 .long sys_move_pages
320 .long sys_getcpu
321 .long sys_epoll_pwait
322 .long sys_utimensat /* 320 */
323 .long sys_signalfd
324 .long sys_timerfd
325 .long sys_eventfd
326 .long sys_fallocate
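
To make the indexing concrete: entry 20 in the table above is sys_getpid, and an int $0x80 with that number in %eax lands there. A rough sketch, only meaningful when compiled as 32-bit i386 code (gcc -m32):

#include <stdio.h>

static long int80_syscall0(long nr)
{
	long ret;

	/* The number in %eax indexes sys_call_table; the return value
	 * comes back in %eax. */
	asm volatile ("int $0x80" : "=a" (ret) : "0" (nr) : "memory");
	return ret;
}

int main(void)
{
	printf("getpid via int $0x80: %ld\n", int80_syscall0(20));
	return 0;
}
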
diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/kernel/sysenter_32.c
new file mode 100644
index 000000000000..4eb2e408764f
--- /dev/null
+++ b/arch/x86/kernel/sysenter_32.c
@@ -0,0 +1,348 @@
1/*
2 * linux/arch/i386/kernel/sysenter.c
3 *
4 * (C) Copyright 2002 Linus Torvalds
5 * Portions based on the vdso-randomization code from exec-shield:
6 * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
7 *
8 * This file contains the needed initializations to support sysenter.
9 */
10
11#include <linux/init.h>
12#include <linux/smp.h>
13#include <linux/thread_info.h>
14#include <linux/sched.h>
15#include <linux/gfp.h>
16#include <linux/string.h>
17#include <linux/elf.h>
18#include <linux/mm.h>
19#include <linux/err.h>
20#include <linux/module.h>
21
22#include <asm/cpufeature.h>
23#include <asm/msr.h>
24#include <asm/pgtable.h>
25#include <asm/unistd.h>
26#include <asm/elf.h>
27#include <asm/tlbflush.h>
28
29enum {
30 VDSO_DISABLED = 0,
31 VDSO_ENABLED = 1,
32 VDSO_COMPAT = 2,
33};
34
35#ifdef CONFIG_COMPAT_VDSO
36#define VDSO_DEFAULT VDSO_COMPAT
37#else
38#define VDSO_DEFAULT VDSO_ENABLED
39#endif
40
41/*
42 * Should the kernel map a VDSO page into processes and pass its
43 * address down to glibc upon exec()?
44 */
45unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
46
47EXPORT_SYMBOL_GPL(vdso_enabled);
48
49static int __init vdso_setup(char *s)
50{
51 vdso_enabled = simple_strtoul(s, NULL, 0);
52
53 return 1;
54}
55
56__setup("vdso=", vdso_setup);
57
58extern asmlinkage void sysenter_entry(void);
59
60static __init void reloc_symtab(Elf32_Ehdr *ehdr,
61 unsigned offset, unsigned size)
62{
63 Elf32_Sym *sym = (void *)ehdr + offset;
64 unsigned nsym = size / sizeof(*sym);
65 unsigned i;
66
67 for(i = 0; i < nsym; i++, sym++) {
68 if (sym->st_shndx == SHN_UNDEF ||
69 sym->st_shndx == SHN_ABS)
70 continue; /* skip */
71
72 if (sym->st_shndx > SHN_LORESERVE) {
73 printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
74 sym->st_shndx);
75 continue;
76 }
77
78 switch(ELF_ST_TYPE(sym->st_info)) {
79 case STT_OBJECT:
80 case STT_FUNC:
81 case STT_SECTION:
82 case STT_FILE:
83 sym->st_value += VDSO_HIGH_BASE;
84 }
85 }
86}
87
88static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
89{
90 Elf32_Dyn *dyn = (void *)ehdr + offset;
91
92 for(; dyn->d_tag != DT_NULL; dyn++)
93 switch(dyn->d_tag) {
94 case DT_PLTGOT:
95 case DT_HASH:
96 case DT_STRTAB:
97 case DT_SYMTAB:
98 case DT_RELA:
99 case DT_INIT:
100 case DT_FINI:
101 case DT_REL:
102 case DT_DEBUG:
103 case DT_JMPREL:
104 case DT_VERSYM:
105 case DT_VERDEF:
106 case DT_VERNEED:
107 case DT_ADDRRNGLO ... DT_ADDRRNGHI:
108 /* definitely pointers needing relocation */
109 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
110 break;
111
112 case DT_ENCODING ... OLD_DT_LOOS-1:
113 case DT_LOOS ... DT_HIOS-1:
114 /* Tags above DT_ENCODING are pointers if
115 they're even */
116 if (dyn->d_tag >= DT_ENCODING &&
117 (dyn->d_tag & 1) == 0)
118 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
119 break;
120
121 case DT_VERDEFNUM:
122 case DT_VERNEEDNUM:
123 case DT_FLAGS_1:
124 case DT_RELACOUNT:
125 case DT_RELCOUNT:
126 case DT_VALRNGLO ... DT_VALRNGHI:
127 /* definitely not pointers */
128 break;
129
130 case OLD_DT_LOOS ... DT_LOOS-1:
131 case DT_HIOS ... DT_VALRNGLO-1:
132 default:
133 if (dyn->d_tag > DT_ENCODING)
134 printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
135 dyn->d_tag);
136 break;
137 }
138}
139
140static __init void relocate_vdso(Elf32_Ehdr *ehdr)
141{
142 Elf32_Phdr *phdr;
143 Elf32_Shdr *shdr;
144 int i;
145
146 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
147 !elf_check_arch(ehdr) ||
148 ehdr->e_type != ET_DYN);
149
150 ehdr->e_entry += VDSO_HIGH_BASE;
151
152 /* rebase phdrs */
153 phdr = (void *)ehdr + ehdr->e_phoff;
154 for (i = 0; i < ehdr->e_phnum; i++) {
155 phdr[i].p_vaddr += VDSO_HIGH_BASE;
156
157 /* relocate dynamic stuff */
158 if (phdr[i].p_type == PT_DYNAMIC)
159 reloc_dyn(ehdr, phdr[i].p_offset);
160 }
161
162 /* rebase sections */
163 shdr = (void *)ehdr + ehdr->e_shoff;
164 for(i = 0; i < ehdr->e_shnum; i++) {
165 if (!(shdr[i].sh_flags & SHF_ALLOC))
166 continue;
167
168 shdr[i].sh_addr += VDSO_HIGH_BASE;
169
170 if (shdr[i].sh_type == SHT_SYMTAB ||
171 shdr[i].sh_type == SHT_DYNSYM)
172 reloc_symtab(ehdr, shdr[i].sh_offset,
173 shdr[i].sh_size);
174 }
175}
176
177void enable_sep_cpu(void)
178{
179 int cpu = get_cpu();
180 struct tss_struct *tss = &per_cpu(init_tss, cpu);
181
182 if (!boot_cpu_has(X86_FEATURE_SEP)) {
183 put_cpu();
184 return;
185 }
186
187 tss->x86_tss.ss1 = __KERNEL_CS;
188 tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
189 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
190 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
191 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
192 put_cpu();
193}
194
195static struct vm_area_struct gate_vma;
196
197static int __init gate_vma_init(void)
198{
199 gate_vma.vm_mm = NULL;
200 gate_vma.vm_start = FIXADDR_USER_START;
201 gate_vma.vm_end = FIXADDR_USER_END;
202 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
203 gate_vma.vm_page_prot = __P101;
204 /*
205 * Make sure the vDSO gets into every core dump.
206 * Dumping its contents makes post-mortem fully interpretable later
207 * without matching up the same kernel and hardware config to see
208 * what PC values meant.
209 */
210 gate_vma.vm_flags |= VM_ALWAYSDUMP;
211 return 0;
212}
213
214/*
215 * These symbols are defined by vsyscall.o to mark the bounds
216 * of the ELF DSO images included therein.
217 */
218extern const char vsyscall_int80_start, vsyscall_int80_end;
219extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
220static struct page *syscall_pages[1];
221
222static void map_compat_vdso(int map)
223{
224 static int vdso_mapped;
225
226 if (map == vdso_mapped)
227 return;
228
229 vdso_mapped = map;
230
231 __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
232 map ? PAGE_READONLY_EXEC : PAGE_NONE);
233
234 /* flush stray tlbs */
235 flush_tlb_all();
236}
237
238int __init sysenter_setup(void)
239{
240 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
241 const void *vsyscall;
242 size_t vsyscall_len;
243
244 syscall_pages[0] = virt_to_page(syscall_page);
245
246 gate_vma_init();
247
248 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
249
250 if (!boot_cpu_has(X86_FEATURE_SEP)) {
251 vsyscall = &vsyscall_int80_start;
252 vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
253 } else {
254 vsyscall = &vsyscall_sysenter_start;
255 vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
256 }
257
258 memcpy(syscall_page, vsyscall, vsyscall_len);
259 relocate_vdso(syscall_page);
260
261 return 0;
262}
263
264/* Defined in vsyscall-sysenter.S */
265extern void SYSENTER_RETURN;
266
267/* Setup a VMA at program startup for the vsyscall page */
268int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
269{
270 struct mm_struct *mm = current->mm;
271 unsigned long addr;
272 int ret = 0;
273 bool compat;
274
275 down_write(&mm->mmap_sem);
276
277 /* Test compat mode once here, in case someone
278 changes it via sysctl */
279 compat = (vdso_enabled == VDSO_COMPAT);
280
281 map_compat_vdso(compat);
282
283 if (compat)
284 addr = VDSO_HIGH_BASE;
285 else {
286 addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
287 if (IS_ERR_VALUE(addr)) {
288 ret = addr;
289 goto up_fail;
290 }
291
292 /*
293 * MAYWRITE to allow gdb to COW and set breakpoints
294 *
295 * Make sure the vDSO gets into every core dump.
296 * Dumping its contents makes post-mortem fully
297 * interpretable later without matching up the same
298 * kernel and hardware config to see what PC values
299 * meant.
300 */
301 ret = install_special_mapping(mm, addr, PAGE_SIZE,
302 VM_READ|VM_EXEC|
303 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
304 VM_ALWAYSDUMP,
305 syscall_pages);
306
307 if (ret)
308 goto up_fail;
309 }
310
311 current->mm->context.vdso = (void *)addr;
312 current_thread_info()->sysenter_return =
313 (void *)VDSO_SYM(&SYSENTER_RETURN);
314
315 up_fail:
316 up_write(&mm->mmap_sem);
317
318 return ret;
319}
320
321const char *arch_vma_name(struct vm_area_struct *vma)
322{
323 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
324 return "[vdso]";
325 return NULL;
326}
327
328struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
329{
330 struct mm_struct *mm = tsk->mm;
331
332 /* Check to see if this task was created in compat vdso mode */
333 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
334 return &gate_vma;
335 return NULL;
336}
337
338int in_gate_area(struct task_struct *task, unsigned long addr)
339{
340 const struct vm_area_struct *vma = get_gate_vma(task);
341
342 return vma && addr >= vma->vm_start && addr < vma->vm_end;
343}
344
345int in_gate_area_no_task(unsigned long addr)
346{
347 return 0;
348}
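
The address chosen in arch_setup_additional_pages() is what user space later sees in its ELF auxiliary vector. A rough sketch using the (much later) glibc helper getauxval(); on i386, AT_SYSINFO carries the syscall entry point and AT_SYSINFO_EHDR the vDSO image base.

#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	/* Both values come straight from the kernel's auxv entries;
	 * either may be 0 if the kernel did not provide it. */
	printf("AT_SYSINFO_EHDR (vDSO base): %#lx\n",
	       (unsigned long)getauxval(AT_SYSINFO_EHDR));
#ifdef AT_SYSINFO
	printf("AT_SYSINFO (syscall entry):  %#lx\n",
	       (unsigned long)getauxval(AT_SYSINFO));
#endif
	return 0;
}
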
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
new file mode 100644
index 000000000000..e3f2569b2c44
--- /dev/null
+++ b/arch/x86/kernel/tce_64.c
@@ -0,0 +1,189 @@
1/*
2 * This file manages the translation entries for the IBM Calgary IOMMU.
3 *
4 * Derived from arch/powerpc/platforms/pseries/iommu.c
5 *
6 * Copyright (C) IBM Corporation, 2006
7 *
8 * Author: Jon Mason <jdmason@us.ibm.com>
9 * Author: Muli Ben-Yehuda <muli@il.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/mm.h>
29#include <linux/spinlock.h>
30#include <linux/string.h>
31#include <linux/pci.h>
32#include <linux/dma-mapping.h>
33#include <linux/bootmem.h>
34#include <asm/tce.h>
35#include <asm/calgary.h>
36#include <asm/proto.h>
37
38/* flush a tce at 'tceaddr' to main memory */
39static inline void flush_tce(void* tceaddr)
40{
41 /* a single tce can't cross a cache line */
42 if (cpu_has_clflush)
43 asm volatile("clflush (%0)" :: "r" (tceaddr));
44 else
45 asm volatile("wbinvd":::"memory");
46}
47
48void tce_build(struct iommu_table *tbl, unsigned long index,
49 unsigned int npages, unsigned long uaddr, int direction)
50{
51 u64* tp;
52 u64 t;
53 u64 rpn;
54
55 t = (1 << TCE_READ_SHIFT);
56 if (direction != DMA_TO_DEVICE)
57 t |= (1 << TCE_WRITE_SHIFT);
58
59 tp = ((u64*)tbl->it_base) + index;
60
61 while (npages--) {
62 rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
63 t &= ~TCE_RPN_MASK;
64 t |= (rpn << TCE_RPN_SHIFT);
65
66 *tp = cpu_to_be64(t);
67 flush_tce(tp);
68
69 uaddr += PAGE_SIZE;
70 tp++;
71 }
72}
73
74void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
75{
76 u64* tp;
77
78 tp = ((u64*)tbl->it_base) + index;
79
80 while (npages--) {
81 *tp = cpu_to_be64(0);
82 flush_tce(tp);
83 tp++;
84 }
85}
86
87static inline unsigned int table_size_to_number_of_entries(unsigned char size)
88{
89 /*
90 * size is the order of the table, 0-7
91 * smallest table is 8K entries, so shift result by 13 to
92 * multiply by 8K
93 */
94 return (1 << size) << 13;
95}
96
97static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
98{
99 unsigned int bitmapsz;
100 unsigned long bmppages;
101 int ret;
102
103 tbl->it_busno = dev->bus->number;
104
105 /* set the tce table size - measured in entries */
106 tbl->it_size = table_size_to_number_of_entries(specified_table_size);
107
108 /*
 109	 * number of bytes needed for the allocation bitmap:
 110	 * one bit per table entry
111 */
112 bitmapsz = tbl->it_size / BITS_PER_BYTE;
113 bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
114 if (!bmppages) {
115 printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
116 ret = -ENOMEM;
117 goto done;
118 }
119
120 tbl->it_map = (unsigned long*)bmppages;
121
122 memset(tbl->it_map, 0, bitmapsz);
123
124 tbl->it_hint = 0;
125
126 spin_lock_init(&tbl->it_lock);
127
128 return 0;
129
130done:
131 return ret;
132}
133
134int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
135{
136 struct iommu_table *tbl;
137 int ret;
138
139 if (pci_iommu(dev->bus)) {
140 printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
141 dev, pci_iommu(dev->bus));
142 BUG();
143 }
144
145 tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
146 if (!tbl) {
147 printk(KERN_ERR "Calgary: error allocating iommu_table\n");
148 ret = -ENOMEM;
149 goto done;
150 }
151
152 ret = tce_table_setparms(dev, tbl);
153 if (ret)
154 goto free_tbl;
155
156 tbl->bbar = bbar;
157
158 set_pci_iommu(dev->bus, tbl);
159
160 return 0;
161
162free_tbl:
163 kfree(tbl);
164done:
165 return ret;
166}
167
168void * __init alloc_tce_table(void)
169{
170 unsigned int size;
171
172 size = table_size_to_number_of_entries(specified_table_size);
173 size *= TCE_ENTRY_SIZE;
174
175 return __alloc_bootmem_low(size, size, 0);
176}
177
178void __init free_tce_table(void *tbl)
179{
180 unsigned int size;
181
182 if (!tbl)
183 return;
184
185 size = table_size_to_number_of_entries(specified_table_size);
186 size *= TCE_ENTRY_SIZE;
187
188 free_bootmem(__pa(tbl), size);
189}
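
A quick standalone check of the sizing arithmetic in table_size_to_number_of_entries() and tce_table_setparms(): the order runs 0-7, the smallest table is 8K entries, and the allocation bitmap needs one bit per entry. The TCE_ENTRY_SIZE of 8 bytes (one u64 per entry) is an assumption made for the printout.

#include <stdio.h>

#define TCE_ENTRY_SIZE	8	/* assumed: one 64-bit TCE per entry */
#define BITS_PER_BYTE	8

static unsigned int table_size_to_number_of_entries(unsigned char size)
{
	return (1u << size) << 13;	/* 8K entries times 2^order */
}

int main(void)
{
	unsigned char order;

	for (order = 0; order <= 7; order++) {
		unsigned int n = table_size_to_number_of_entries(order);

		printf("order %u: %8u entries, %5u KB table, %3u KB bitmap\n",
		       order, n,
		       n * TCE_ENTRY_SIZE / 1024,
		       n / BITS_PER_BYTE / 1024);
	}
	return 0;
}
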
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
new file mode 100644
index 000000000000..19a6c678d02e
--- /dev/null
+++ b/arch/x86/kernel/time_32.c
@@ -0,0 +1,236 @@
1/*
2 * linux/arch/i386/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
5 *
6 * This file contains the PC-specific time handling details:
7 * reading the RTC at bootup, etc..
8 * 1994-07-02 Alan Modra
9 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10 * 1995-03-26 Markus Kuhn
11 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12 * precision CMOS clock update
13 * 1996-05-03 Ingo Molnar
14 * fixed time warps in do_[slow|fast]_gettimeoffset()
15 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
16 * "A Kernel Model for Precision Timekeeping" by Dave Mills
17 * 1998-09-05 (Various)
18 * More robust do_fast_gettimeoffset() algorithm implemented
19 * (works with APM, Cyrix 6x86MX and Centaur C6),
20 * monotonic gettimeofday() with fast_get_timeoffset(),
21 * drift-proof precision TSC calibration on boot
22 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25 * 1998-12-16 Andrea Arcangeli
26 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
 27 *		because it was not accounting for lost_ticks.
28 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
29 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30 * serialize accesses to xtime/lost_ticks).
31 */
32
33#include <linux/errno.h>
34#include <linux/sched.h>
35#include <linux/kernel.h>
36#include <linux/param.h>
37#include <linux/string.h>
38#include <linux/mm.h>
39#include <linux/interrupt.h>
40#include <linux/time.h>
41#include <linux/delay.h>
42#include <linux/init.h>
43#include <linux/smp.h>
44#include <linux/module.h>
45#include <linux/sysdev.h>
46#include <linux/bcd.h>
47#include <linux/efi.h>
48#include <linux/mca.h>
49
50#include <asm/io.h>
51#include <asm/smp.h>
52#include <asm/irq.h>
53#include <asm/msr.h>
54#include <asm/delay.h>
55#include <asm/mpspec.h>
56#include <asm/uaccess.h>
57#include <asm/processor.h>
58#include <asm/timer.h>
59#include <asm/time.h>
60
61#include "mach_time.h"
62
63#include <linux/timex.h>
64
65#include <asm/hpet.h>
66
67#include <asm/arch_hooks.h>
68
69#include "io_ports.h"
70
71#include <asm/i8259.h>
72
73#include "do_timer.h"
74
75unsigned int cpu_khz; /* Detected as we calibrate the TSC */
76EXPORT_SYMBOL(cpu_khz);
77
78DEFINE_SPINLOCK(rtc_lock);
79EXPORT_SYMBOL(rtc_lock);
80
81/*
82 * This is a special lock that is owned by the CPU and holds the index
83 * register we are working with. It is required for NMI access to the
84 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
85 */
86volatile unsigned long cmos_lock = 0;
87EXPORT_SYMBOL(cmos_lock);
88
89/* Routines for accessing the CMOS RAM/RTC. */
90unsigned char rtc_cmos_read(unsigned char addr)
91{
92 unsigned char val;
93 lock_cmos_prefix(addr);
94 outb_p(addr, RTC_PORT(0));
95 val = inb_p(RTC_PORT(1));
96 lock_cmos_suffix(addr);
97 return val;
98}
99EXPORT_SYMBOL(rtc_cmos_read);
100
101void rtc_cmos_write(unsigned char val, unsigned char addr)
102{
103 lock_cmos_prefix(addr);
104 outb_p(addr, RTC_PORT(0));
105 outb_p(val, RTC_PORT(1));
106 lock_cmos_suffix(addr);
107}
108EXPORT_SYMBOL(rtc_cmos_write);
109
110static int set_rtc_mmss(unsigned long nowtime)
111{
112 int retval;
113 unsigned long flags;
114
115 /* gets recalled with irq locally disabled */
116 /* XXX - does irqsave resolve this? -johnstul */
117 spin_lock_irqsave(&rtc_lock, flags);
118 retval = set_wallclock(nowtime);
119 spin_unlock_irqrestore(&rtc_lock, flags);
120
121 return retval;
122}
123
124
125int timer_ack;
126
127unsigned long profile_pc(struct pt_regs *regs)
128{
129 unsigned long pc = instruction_pointer(regs);
130
131#ifdef CONFIG_SMP
132 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
133 in_lock_functions(pc)) {
134#ifdef CONFIG_FRAME_POINTER
135 return *(unsigned long *)(regs->ebp + 4);
136#else
137 unsigned long *sp = (unsigned long *)&regs->esp;
138
 139		/* The return address is either directly at the stack pointer
 140		   or above a saved EFLAGS. EFLAGS has bits 22-31 zero,
 141		   whereas kernel addresses don't. */
142 if (sp[0] >> 22)
143 return sp[0];
144 if (sp[1] >> 22)
145 return sp[1];
146#endif
147 }
148#endif
149 return pc;
150}
151EXPORT_SYMBOL(profile_pc);
152
153/*
154 * This is the same as the above, except we _also_ save the current
155 * Time Stamp Counter value at the time of the timer interrupt, so that
 156 * we can later estimate the time of day more exactly.
157 */
158irqreturn_t timer_interrupt(int irq, void *dev_id)
159{
160#ifdef CONFIG_X86_IO_APIC
161 if (timer_ack) {
162 /*
163 * Subtle, when I/O APICs are used we have to ack timer IRQ
164 * manually to reset the IRR bit for do_slow_gettimeoffset().
165 * This will also deassert NMI lines for the watchdog if run
166 * on an 82489DX-based system.
167 */
168 spin_lock(&i8259A_lock);
169 outb(0x0c, PIC_MASTER_OCW3);
170 /* Ack the IRQ; AEOI will end it automatically. */
171 inb(PIC_MASTER_POLL);
172 spin_unlock(&i8259A_lock);
173 }
174#endif
175
176 do_timer_interrupt_hook();
177
178 if (MCA_bus) {
179 /* The PS/2 uses level-triggered interrupts. You can't
180 turn them off, nor would you want to (any attempt to
181 enable edge-triggered interrupts usually gets intercepted by a
182 special hardware circuit). Hence we have to acknowledge
183 the timer interrupt. Through some incredibly stupid
184 design idea, the reset for IRQ 0 is done by setting the
185 high bit of the PPI port B (0x61). Note that some PS/2s,
186 notably the 55SX, work fine if this is removed. */
187
188 u8 irq_v = inb_p( 0x61 ); /* read the current state */
189 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
190 }
191
192 return IRQ_HANDLED;
193}
194
195/* not static: needed by APM */
196unsigned long read_persistent_clock(void)
197{
198 unsigned long retval;
199 unsigned long flags;
200
201 spin_lock_irqsave(&rtc_lock, flags);
202
203 retval = get_wallclock();
204
205 spin_unlock_irqrestore(&rtc_lock, flags);
206
207 return retval;
208}
209
210int update_persistent_clock(struct timespec now)
211{
212 return set_rtc_mmss(now.tv_sec);
213}
214
215extern void (*late_time_init)(void);
216/* Duplicate of time_init() below, with hpet_enable part added */
217void __init hpet_time_init(void)
218{
219 if (!hpet_enable())
220 setup_pit_timer();
221 time_init_hook();
222}
223
224/*
225 * This is called directly from init code; we must delay timer setup in the
226 * HPET case as we can't make the decision to turn on HPET this early in the
227 * boot process.
228 *
229 * The chosen time_init function will usually be hpet_time_init, above, but
230 * in the case of virtual hardware, an alternative function may be substituted.
231 */
232void __init time_init(void)
233{
234 tsc_init();
235 late_time_init = choose_time_init();
236}
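
For reference, the same index/data handshake that rtc_cmos_read() performs can be reproduced from user space with ioperm(). An illustrative sketch only (x86, needs root); register 0x00 is the RTC seconds byte, stored in BCD on PC hardware.

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	if (ioperm(0x70, 2, 1)) {	/* ports 0x70 (index) and 0x71 (data) */
		perror("ioperm");
		return 1;
	}
	outb(0x00, 0x70);		/* select the RTC seconds register */
	printf("RTC seconds (BCD): %02x\n", inb(0x71));
	return 0;
}
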
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
new file mode 100644
index 000000000000..6d48a4e826d9
--- /dev/null
+++ b/arch/x86/kernel/time_64.c
@@ -0,0 +1,447 @@
1/*
2 * linux/arch/x86-64/kernel/time.c
3 *
4 * "High Precision Event Timer" based timekeeping.
5 *
6 * Copyright (c) 1991,1992,1995 Linus Torvalds
7 * Copyright (c) 1994 Alan Modra
8 * Copyright (c) 1995 Markus Kuhn
9 * Copyright (c) 1996 Ingo Molnar
10 * Copyright (c) 1998 Andrea Arcangeli
11 * Copyright (c) 2002,2006 Vojtech Pavlik
12 * Copyright (c) 2003 Andi Kleen
13 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
14 */
15
16#include <linux/kernel.h>
17#include <linux/sched.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/mc146818rtc.h>
21#include <linux/time.h>
22#include <linux/ioport.h>
23#include <linux/module.h>
24#include <linux/device.h>
25#include <linux/sysdev.h>
26#include <linux/bcd.h>
27#include <linux/notifier.h>
28#include <linux/cpu.h>
29#include <linux/kallsyms.h>
30#include <linux/acpi.h>
31#ifdef CONFIG_ACPI
32#include <acpi/achware.h> /* for PM timer frequency */
33#include <acpi/acpi_bus.h>
34#endif
35#include <asm/8253pit.h>
36#include <asm/i8253.h>
37#include <asm/pgtable.h>
38#include <asm/vsyscall.h>
39#include <asm/timex.h>
40#include <asm/proto.h>
41#include <asm/hpet.h>
42#include <asm/sections.h>
43#include <linux/hpet.h>
44#include <asm/apic.h>
45#include <asm/hpet.h>
46#include <asm/mpspec.h>
47#include <asm/nmi.h>
48#include <asm/vgtod.h>
49
50static char *timename = NULL;
51
52DEFINE_SPINLOCK(rtc_lock);
53EXPORT_SYMBOL(rtc_lock);
54DEFINE_SPINLOCK(i8253_lock);
55EXPORT_SYMBOL(i8253_lock);
56
57volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
58
59unsigned long profile_pc(struct pt_regs *regs)
60{
61 unsigned long pc = instruction_pointer(regs);
62
 63	/* Assume the lock function has either no stack frame or a copy
 64	   of EFLAGS from PUSHF.
 65	   EFLAGS always has bits 22 and up cleared, unlike kernel addresses. */
66 if (!user_mode(regs) && in_lock_functions(pc)) {
67 unsigned long *sp = (unsigned long *)regs->rsp;
68 if (sp[0] >> 22)
69 return sp[0];
70 if (sp[1] >> 22)
71 return sp[1];
72 }
73 return pc;
74}
75EXPORT_SYMBOL(profile_pc);
76
77/*
78 * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
79 * ms after the second nowtime has started, because when nowtime is written
80 * into the registers of the CMOS clock, it will jump to the next second
81 * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
82 * sheet for details.
83 */
84
85static int set_rtc_mmss(unsigned long nowtime)
86{
87 int retval = 0;
88 int real_seconds, real_minutes, cmos_minutes;
89 unsigned char control, freq_select;
90
91/*
92 * IRQs are disabled when we're called from the timer interrupt,
93 * no need for spin_lock_irqsave()
94 */
95
96 spin_lock(&rtc_lock);
97
98/*
99 * Tell the clock it's being set and stop it.
100 */
101
102 control = CMOS_READ(RTC_CONTROL);
103 CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
104
105 freq_select = CMOS_READ(RTC_FREQ_SELECT);
106 CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
107
108 cmos_minutes = CMOS_READ(RTC_MINUTES);
109 BCD_TO_BIN(cmos_minutes);
110
111/*
112 * since we're only adjusting minutes and seconds, don't interfere with hour
113 * overflow. This avoids messing with unknown time zones but requires your RTC
114 * not to be off by more than 15 minutes. Since we're calling it only when
115 * our clock is externally synchronized using NTP, this shouldn't be a problem.
116 */
117
118 real_seconds = nowtime % 60;
119 real_minutes = nowtime / 60;
120 if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
121 real_minutes += 30; /* correct for half hour time zone */
122 real_minutes %= 60;
123
124 if (abs(real_minutes - cmos_minutes) >= 30) {
125 printk(KERN_WARNING "time.c: can't update CMOS clock "
126 "from %d to %d\n", cmos_minutes, real_minutes);
127 retval = -1;
128 } else {
129 BIN_TO_BCD(real_seconds);
130 BIN_TO_BCD(real_minutes);
131 CMOS_WRITE(real_seconds, RTC_SECONDS);
132 CMOS_WRITE(real_minutes, RTC_MINUTES);
133 }
134
135/*
136 * The following flags have to be released exactly in this order, otherwise the
137 * DS12887 (popular MC146818A clone with integrated battery and quartz) will
138 * not reset the oscillator and will not update precisely 500 ms later. You
139 * won't find this mentioned in the Dallas Semiconductor data sheets, but who
140 * believes data sheets anyway ... -- Markus Kuhn
141 */
142
143 CMOS_WRITE(control, RTC_CONTROL);
144 CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
145
146 spin_unlock(&rtc_lock);
147
148 return retval;
149}
150
151int update_persistent_clock(struct timespec now)
152{
153 return set_rtc_mmss(now.tv_sec);
154}
155
156void main_timer_handler(void)
157{
158/*
159 * Here we are in the timer irq handler. We have irqs locally disabled (so we
160 * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
161 * on the other CPU, so we need a lock. We also need to lock the vsyscall
 162 * variables, because both do_timer() and this code change them. -arca+vojtech
163 */
164
165 write_seqlock(&xtime_lock);
166
167/*
168 * Do the timer stuff.
169 */
170
171 do_timer(1);
172#ifndef CONFIG_SMP
173 update_process_times(user_mode(get_irq_regs()));
174#endif
175
176/*
177 * In the SMP case we use the local APIC timer interrupt to do the profiling,
 178 * except when we simulate SMP mode on a uniprocessor system, in which case
 179 * we have to call the local interrupt handler.
180 */
181
182 if (!using_apic_timer)
183 smp_local_timer_interrupt();
184
185 write_sequnlock(&xtime_lock);
186}
187
188static irqreturn_t timer_interrupt(int irq, void *dev_id)
189{
190 if (apic_runs_main_timer > 1)
191 return IRQ_HANDLED;
192 main_timer_handler();
193 if (using_apic_timer)
194 smp_send_timer_broadcast_ipi();
195 return IRQ_HANDLED;
196}
197
198unsigned long read_persistent_clock(void)
199{
200 unsigned int year, mon, day, hour, min, sec;
201 unsigned long flags;
202 unsigned century = 0;
203
204 spin_lock_irqsave(&rtc_lock, flags);
205
206 do {
207 sec = CMOS_READ(RTC_SECONDS);
208 min = CMOS_READ(RTC_MINUTES);
209 hour = CMOS_READ(RTC_HOURS);
210 day = CMOS_READ(RTC_DAY_OF_MONTH);
211 mon = CMOS_READ(RTC_MONTH);
212 year = CMOS_READ(RTC_YEAR);
213#ifdef CONFIG_ACPI
214 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
215 acpi_gbl_FADT.century)
216 century = CMOS_READ(acpi_gbl_FADT.century);
217#endif
218 } while (sec != CMOS_READ(RTC_SECONDS));
219
220 spin_unlock_irqrestore(&rtc_lock, flags);
221
222 /*
223 * We know that x86-64 always uses BCD format, no need to check the
224 * config register.
225 */
226
227 BCD_TO_BIN(sec);
228 BCD_TO_BIN(min);
229 BCD_TO_BIN(hour);
230 BCD_TO_BIN(day);
231 BCD_TO_BIN(mon);
232 BCD_TO_BIN(year);
233
234 if (century) {
235 BCD_TO_BIN(century);
236 year += century * 100;
237 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
238 } else {
239 /*
 240	 * x86-64 systems have only existed since 2002.
 241	 * This will work up to Dec 31, 2100.
242 */
243 year += 2000;
244 }
245
246 return mktime(year, mon, day, hour, min, sec);
247}
248
249/* calibrate_cpu is used on systems with fixed rate TSCs to determine
250 * processor frequency */
251#define TICK_COUNT 100000000
252static unsigned int __init tsc_calibrate_cpu_khz(void)
253{
254 int tsc_start, tsc_now;
255 int i, no_ctr_free;
256 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
257 unsigned long flags;
258
259 for (i = 0; i < 4; i++)
260 if (avail_to_resrv_perfctr_nmi_bit(i))
261 break;
262 no_ctr_free = (i == 4);
263 if (no_ctr_free) {
264 i = 3;
265 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
266 wrmsrl(MSR_K7_EVNTSEL3, 0);
267 rdmsrl(MSR_K7_PERFCTR3, pmc3);
268 } else {
269 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
270 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
271 }
272 local_irq_save(flags);
 273	/* start measuring cycles, incrementing from 0 */
274 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
275 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
276 rdtscl(tsc_start);
277 do {
278 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
279 tsc_now = get_cycles_sync();
280 } while ((tsc_now - tsc_start) < TICK_COUNT);
281
282 local_irq_restore(flags);
283 if (no_ctr_free) {
284 wrmsrl(MSR_K7_EVNTSEL3, 0);
285 wrmsrl(MSR_K7_PERFCTR3, pmc3);
286 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
287 } else {
288 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
289 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
290 }
291
292 return pmc_now * tsc_khz / (tsc_now - tsc_start);
293}
294
295/*
296 * pit_calibrate_tsc() uses the speaker output (channel 2) of
297 * the PIT. This is better than using the timer interrupt output,
298 * because we can read the value of the speaker with just one inb(),
 299 * whereas we need three I/O operations for the interrupt channel.
300 * We count how many ticks the TSC does in 50 ms.
301 */
302
303static unsigned int __init pit_calibrate_tsc(void)
304{
305 unsigned long start, end;
306 unsigned long flags;
307
308 spin_lock_irqsave(&i8253_lock, flags);
309
310 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
311
312 outb(0xb0, 0x43);
313 outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
314 outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
315 start = get_cycles_sync();
316 while ((inb(0x61) & 0x20) == 0);
317 end = get_cycles_sync();
318
319 spin_unlock_irqrestore(&i8253_lock, flags);
320
321 return (end - start) / 50;
322}
323
324#define PIT_MODE 0x43
325#define PIT_CH0 0x40
326
327static void __pit_init(int val, u8 mode)
328{
329 unsigned long flags;
330
331 spin_lock_irqsave(&i8253_lock, flags);
332 outb_p(mode, PIT_MODE);
333 outb_p(val & 0xff, PIT_CH0); /* LSB */
334 outb_p(val >> 8, PIT_CH0); /* MSB */
335 spin_unlock_irqrestore(&i8253_lock, flags);
336}
337
338void __init pit_init(void)
339{
340 __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */
341}
342
343void pit_stop_interrupt(void)
344{
345 __pit_init(0, 0x30); /* mode 0 */
346}
347
348void stop_timer_interrupt(void)
349{
350 char *name;
351 if (hpet_address) {
352 name = "HPET";
353 hpet_timer_stop_set_go(0);
354 } else {
355 name = "PIT";
356 pit_stop_interrupt();
357 }
358 printk(KERN_INFO "timer: %s interrupt stopped.\n", name);
359}
360
361static struct irqaction irq0 = {
362 .handler = timer_interrupt,
363 .flags = IRQF_DISABLED | IRQF_IRQPOLL,
364 .mask = CPU_MASK_NONE,
365 .name = "timer"
366};
367
368void __init time_init(void)
369{
370 if (nohpet)
371 hpet_address = 0;
372
373 if (hpet_arch_init())
374 hpet_address = 0;
375
376 if (hpet_use_timer) {
377 /* set tick_nsec to use the proper rate for HPET */
378 tick_nsec = TICK_NSEC_HPET;
379 tsc_khz = hpet_calibrate_tsc();
380 timename = "HPET";
381 } else {
382 pit_init();
383 tsc_khz = pit_calibrate_tsc();
384 timename = "PIT";
385 }
386
387 cpu_khz = tsc_khz;
388 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
389 boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
390 boot_cpu_data.x86 == 16)
391 cpu_khz = tsc_calibrate_cpu_khz();
392
393 if (unsynchronized_tsc())
394 mark_tsc_unstable("TSCs unsynchronized");
395
396 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
397 vgetcpu_mode = VGETCPU_RDTSCP;
398 else
399 vgetcpu_mode = VGETCPU_LSL;
400
401 set_cyc2ns_scale(tsc_khz);
402 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
403 cpu_khz / 1000, cpu_khz % 1000);
404 init_tsc_clocksource();
405
406 setup_irq(0, &irq0);
407}
408
409/*
410 * sysfs support for the timer.
411 */
412
413static int timer_suspend(struct sys_device *dev, pm_message_t state)
414{
415 return 0;
416}
417
418static int timer_resume(struct sys_device *dev)
419{
420 if (hpet_address)
421 hpet_reenable();
422 else
423 i8254_timer_resume();
424 return 0;
425}
426
427static struct sysdev_class timer_sysclass = {
428 .resume = timer_resume,
429 .suspend = timer_suspend,
430 set_kset_name("timer"),
431};
432
433/* XXX this sysfs stuff should probably go elsewhere later -john */
434static struct sys_device device_timer = {
435 .id = 0,
436 .cls = &timer_sysclass,
437};
438
439static int time_init_device(void)
440{
441 int error = sysdev_class_register(&timer_sysclass);
442 if (!error)
443 error = sysdev_register(&device_timer);
444 return error;
445}
446
447device_initcall(time_init_device);
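
Both set_rtc_mmss() and read_persistent_clock() above rely on the CMOS clock storing each field in BCD. A tiny sketch of what the BCD_TO_BIN()/BIN_TO_BCD() macros boil down to for a single byte:

#include <stdio.h>

static unsigned int bcd_to_bin(unsigned int v)
{
	return (v & 0x0f) + (v >> 4) * 10;
}

static unsigned int bin_to_bcd(unsigned int v)
{
	return ((v / 10) << 4) | (v % 10);
}

int main(void)
{
	printf("BCD 0x59 -> %u\n", bcd_to_bin(0x59));	/* 59 */
	printf("42 -> BCD 0x%02x\n", bin_to_bcd(42));	/* 0x42 */
	return 0;
}
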
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
new file mode 100644
index 000000000000..45782356a618
--- /dev/null
+++ b/arch/x86/kernel/topology.c
@@ -0,0 +1,77 @@
1/*
2 * arch/i386/kernel/topology.c - Populate sysfs with topology information
3 *
4 * Written by: Matthew Dobson, IBM Corporation
5 * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
6 *
7 * Copyright (C) 2002, IBM Corp.
8 *
9 * All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
19 * NON INFRINGEMENT. See the GNU General Public License for more
20 * details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 * Send feedback to <colpatch@us.ibm.com>
27 */
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/nodemask.h>
31#include <linux/mmzone.h>
32#include <asm/cpu.h>
33
34static struct i386_cpu cpu_devices[NR_CPUS];
35
36int arch_register_cpu(int num)
37{
38 /*
 39	 * CPU0 cannot be offlined due to several restrictions and
 40	 * assumptions in the kernel. This simply does not add a
 41	 * control file for it, so one cannot attempt to offline
 42	 * the BSP.
 43	 *
 44	 * Also, certain PCI quirks require that hotplug control not be
 45	 * enabled for all CPUs.
46 */
47 if (num && enable_cpu_hotplug)
48 cpu_devices[num].cpu.hotpluggable = 1;
49
50 return register_cpu(&cpu_devices[num].cpu, num);
51}
52
53#ifdef CONFIG_HOTPLUG_CPU
54int enable_cpu_hotplug = 1;
55
56void arch_unregister_cpu(int num) {
57 return unregister_cpu(&cpu_devices[num].cpu);
58}
59EXPORT_SYMBOL(arch_register_cpu);
60EXPORT_SYMBOL(arch_unregister_cpu);
61#endif /*CONFIG_HOTPLUG_CPU*/
62
63static int __init topology_init(void)
64{
65 int i;
66
67#ifdef CONFIG_NUMA
68 for_each_online_node(i)
69 register_one_node(i);
70#endif /* CONFIG_NUMA */
71
72 for_each_present_cpu(i)
73 arch_register_cpu(i);
74 return 0;
75}
76
77subsys_initcall(topology_init);
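
The hotpluggable flag set in arch_register_cpu() is what makes the per-CPU "online" control file appear under sysfs (for every CPU except CPU0). A small sketch that reads it back; the path assumes the usual sysfs mount at /sys.

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/cpu/cpu1/online";
	FILE *f = fopen(path, "r");
	int online;

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &online) == 1)
		printf("cpu1 online: %d\n", online);
	fclose(f);
	return 0;
}
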
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
new file mode 100644
index 000000000000..f62815f8d06a
--- /dev/null
+++ b/arch/x86/kernel/trampoline_32.S
@@ -0,0 +1,85 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 *
7 * This is only used for booting secondary CPUs in SMP machine
8 *
 9 *	Entry: CS:IP point to the start of our code; we are
 10 *	in real mode with no stack. The rest of the trampoline
 11 *	page is available to make our stack, but everything else
 12 *	is a mystery.
13 *
14 * In fact we don't actually need a stack so we don't
15 * set one up.
16 *
17 * We jump into the boot/compressed/head.S code. So you'd
18 * better be running a compressed kernel image or you
19 * won't get very far.
20 *
21 * On entry to trampoline_data, the processor is in real mode
22 * with 16-bit addressing and 16-bit data. CS has some value
23 * and IP is zero. Thus, data addresses need to be absolute
24 * (no relocation) and are taken with regard to r_base.
25 *
26 * If you work on this file, check the object module with
27 * objdump --reloc to make sure there are no relocation
28 * entries except for:
29 *
30 * TYPE VALUE
31 * R_386_32 startup_32_smp
32 * R_386_32 boot_gdt
33 */
34
35#include <linux/linkage.h>
36#include <asm/segment.h>
37#include <asm/page.h>
38
39.data
40
41/* We can free up trampoline after bootup if cpu hotplug is not supported. */
42#ifndef CONFIG_HOTPLUG_CPU
43.section ".init.data","aw",@progbits
44#endif
45
46.code16
47
48ENTRY(trampoline_data)
49r_base = .
 50	wbinvd			# Needed for NUMA-Q; should be harmless for others
51 mov %cs, %ax # Code and data in the same place
52 mov %ax, %ds
53
54 cli # We should be safe anyway
55
56 movl $0xA5A5A5A5, trampoline_data - r_base
 57				# write marker so the master knows we're running
58
 59	/* With the GDT tables in a non-default location the kernel can be
 60	 * beyond 16MB, and lgdt will not be able to load the address, since
 61	 * the default operand size in real mode is 16 bit. Use lgdtl instead
 62	 * to force a 32-bit operand size.
63 */
64
65 lidtl boot_idt_descr - r_base # load idt with 0, 0
66 lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
67
68 xor %ax, %ax
69 inc %ax # protected mode (PE) bit
70 lmsw %ax # into protected mode
71 # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
72 ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
73
74 # These need to be in the same 64K segment as the above;
75 # hence we don't use the boot_gdt_descr defined in head.S
76boot_gdt_descr:
77 .word __BOOT_DS + 7 # gdt limit
78 .long boot_gdt - __PAGE_OFFSET # gdt base
79
80boot_idt_descr:
81 .word 0 # idt limit = 0
82 .long 0 # idt base = 0L
83
84.globl trampoline_end
85trampoline_end:
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
new file mode 100644
index 000000000000..607983b0d27b
--- /dev/null
+++ b/arch/x86/kernel/trampoline_64.S
@@ -0,0 +1,166 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 * 15 Sept 2005 Eric Biederman: 64bit PIC support
7 *
 8 *	Entry: CS:IP point to the start of our code; we are
 9 *	in real mode with no stack. The rest of the trampoline
 10 *	page is available to make our stack, but everything else
 11 *	is a mystery.
12 *
13 * In fact we don't actually need a stack so we don't
14 * set one up.
15 *
16 * On entry to trampoline_data, the processor is in real mode
17 * with 16-bit addressing and 16-bit data. CS has some value
18 * and IP is zero. Thus, data addresses need to be absolute
19 * (no relocation) and are taken with regard to r_base.
20 *
21 * With the addition of trampoline_level4_pgt this code can
22 * now enter a 64bit kernel that lives at arbitrary 64bit
23 * physical addresses.
24 *
25 * If you work on this file, check the object module with objdump
26 * --full-contents --reloc to make sure there are no relocation
27 * entries.
28 */
29
30#include <linux/linkage.h>
31#include <asm/pgtable.h>
32#include <asm/page.h>
33#include <asm/msr.h>
34#include <asm/segment.h>
35
36.data
37
38.code16
39
40ENTRY(trampoline_data)
41r_base = .
42 cli # We should be safe anyway
43 wbinvd
44 mov %cs, %ax # Code and data in the same place
45 mov %ax, %ds
46 mov %ax, %es
47 mov %ax, %ss
48
49
50 movl $0xA5A5A5A5, trampoline_data - r_base
 51				# write marker so the master knows we're running
52
53 # Setup stack
54 movw $(trampoline_stack_end - r_base), %sp
55
56 call verify_cpu # Verify the cpu supports long mode
57 testl %eax, %eax # Check for return code
58 jnz no_longmode
59
60 mov %cs, %ax
61 movzx %ax, %esi # Find the 32bit trampoline location
62 shll $4, %esi
63
64 # Fixup the vectors
65 addl %esi, startup_32_vector - r_base
66 addl %esi, startup_64_vector - r_base
67 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer
68
69 /*
 70	 * With the GDT tables in a non-default location the kernel can be
 71	 * beyond 16MB, and lgdt will not be able to load the address, since
 72	 * the default operand size in real mode is 16 bit. Use lgdtl instead
 73	 * to force a 32-bit operand size.
74 */
75
76 lidtl tidt - r_base # load idt with 0, 0
77 lgdtl tgdt - r_base # load gdt with whatever is appropriate
78
79 xor %ax, %ax
80 inc %ax # protected mode (PE) bit
81 lmsw %ax # into protected mode
82
83 # flush prefetch and jump to startup_32
84 ljmpl *(startup_32_vector - r_base)
85
86 .code32
87 .balign 4
88startup_32:
89 movl $__KERNEL_DS, %eax # Initialize the %ds segment register
90 movl %eax, %ds
91
92 xorl %eax, %eax
93 btsl $5, %eax # Enable PAE mode
94 movl %eax, %cr4
95
96 # Setup trampoline 4 level pagetables
97 leal (trampoline_level4_pgt - r_base)(%esi), %eax
98 movl %eax, %cr3
99
100 movl $MSR_EFER, %ecx
101 movl $(1 << _EFER_LME), %eax # Enable Long Mode
102 xorl %edx, %edx
103 wrmsr
104
105 xorl %eax, %eax
106 btsl $31, %eax # Enable paging and in turn activate Long Mode
107 btsl $0, %eax # Enable protected mode
108 movl %eax, %cr0
109
110 /*
111 * At this point we're in long mode but in 32bit compatibility mode
112 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
113 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
114 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
115 */
116 ljmp *(startup_64_vector - r_base)(%esi)
117
118 .code64
119 .balign 4
120startup_64:
121 # Now jump into the kernel using virtual addresses
122 movq $secondary_startup_64, %rax
123 jmp *%rax
124
125 .code16
126no_longmode:
127 hlt
128 jmp no_longmode
129#include "verify_cpu_64.S"
130
 131	# Careful: these need to be in the same 64K segment as the above.
132tidt:
133 .word 0 # idt limit = 0
134 .word 0, 0 # idt base = 0L
135
136 # Duplicate the global descriptor table
137 # so the kernel can live anywhere
138 .balign 4
139tgdt:
140 .short tgdt_end - tgdt # gdt limit
141 .long tgdt - r_base
142 .short 0
143 .quad 0x00cf9b000000ffff # __KERNEL32_CS
144 .quad 0x00af9b000000ffff # __KERNEL_CS
145 .quad 0x00cf93000000ffff # __KERNEL_DS
146tgdt_end:
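	# Editor's note (not part of the original commit): a sketch of how the
	# descriptor quads above decode, assuming the standard x86 GDT layout:
	#   0x00cf9b000000ffff  base 0, limit 0xfffff pages, access 0x9b
	#                       (present, ring 0, execute/read code), flags 0xc
	#                       (4K granularity, 32-bit) -> __KERNEL32_CS
	#   0x00af9b000000ffff  same, but flags 0xa (L=1, D=0), i.e. a 64bit
	#                       code segment -> __KERNEL_CS
	#   0x00cf93000000ffff  access 0x93 (read/write data), 32-bit, 4GB
	#                       -> __KERNEL_DS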
147
148 .balign 4
149startup_32_vector:
150 .long startup_32 - r_base
151 .word __KERNEL32_CS, 0
152
153 .balign 4
154startup_64_vector:
155 .long startup_64 - r_base
156 .word __KERNEL_CS, 0
157
158trampoline_stack:
159 .org 0x1000
160trampoline_stack_end:
161ENTRY(trampoline_level4_pgt)
162 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
163 .fill 510,8,0
164 .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
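	# Editor's note (not part of the original commit): entry 0 above
	# identity-maps low memory via level3_ident_pgt and entry 511 maps
	# the kernel region via level3_kernel_pgt; the 510 entries in
	# between are zeroed by the .fill directive.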
165
166ENTRY(trampoline_end)
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
new file mode 100644
index 000000000000..47b0bef335bd
--- /dev/null
+++ b/arch/x86/kernel/traps_32.c
@@ -0,0 +1,1250 @@
1/*
2 * linux/arch/i386/traps.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */
9
10/*
11 * 'Traps.c' handles hardware traps and faults after we have saved some
12 * state in 'asm.s'.
13 */
14#include <linux/sched.h>
15#include <linux/kernel.h>
16#include <linux/string.h>
17#include <linux/errno.h>
18#include <linux/timer.h>
19#include <linux/mm.h>
20#include <linux/init.h>
21#include <linux/delay.h>
22#include <linux/spinlock.h>
23#include <linux/interrupt.h>
24#include <linux/highmem.h>
25#include <linux/kallsyms.h>
26#include <linux/ptrace.h>
27#include <linux/utsname.h>
28#include <linux/kprobes.h>
29#include <linux/kexec.h>
30#include <linux/unwind.h>
31#include <linux/uaccess.h>
32#include <linux/nmi.h>
33#include <linux/bug.h>
34
35#ifdef CONFIG_EISA
36#include <linux/ioport.h>
37#include <linux/eisa.h>
38#endif
39
40#ifdef CONFIG_MCA
41#include <linux/mca.h>
42#endif
43
44#if defined(CONFIG_EDAC)
45#include <linux/edac.h>
46#endif
47
48#include <asm/processor.h>
49#include <asm/system.h>
50#include <asm/io.h>
51#include <asm/atomic.h>
52#include <asm/debugreg.h>
53#include <asm/desc.h>
54#include <asm/i387.h>
55#include <asm/nmi.h>
56#include <asm/unwind.h>
57#include <asm/smp.h>
58#include <asm/arch_hooks.h>
59#include <linux/kdebug.h>
60#include <asm/stacktrace.h>
61
62#include <linux/module.h>
63
64#include "mach_traps.h"
65
66int panic_on_unrecovered_nmi;
67
68asmlinkage int system_call(void);
69
70/* Do we ignore FPU interrupts ? */
71char ignore_fpu_irq = 0;
72
73/*
74 * The IDT has to be page-aligned to simplify the Pentium
75 * F0 0F bug workaround. We have a special link segment
76 * for this.
77 */
78struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
79
80asmlinkage void divide_error(void);
81asmlinkage void debug(void);
82asmlinkage void nmi(void);
83asmlinkage void int3(void);
84asmlinkage void overflow(void);
85asmlinkage void bounds(void);
86asmlinkage void invalid_op(void);
87asmlinkage void device_not_available(void);
88asmlinkage void coprocessor_segment_overrun(void);
89asmlinkage void invalid_TSS(void);
90asmlinkage void segment_not_present(void);
91asmlinkage void stack_segment(void);
92asmlinkage void general_protection(void);
93asmlinkage void page_fault(void);
94asmlinkage void coprocessor_error(void);
95asmlinkage void simd_coprocessor_error(void);
96asmlinkage void alignment_check(void);
97asmlinkage void spurious_interrupt_bug(void);
98asmlinkage void machine_check(void);
99
100int kstack_depth_to_print = 24;
101static unsigned int code_bytes = 64;
102
103static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
104{
105 return p > (void *)tinfo &&
106 p <= (void *)tinfo + THREAD_SIZE - size;
107}
108
109/* The form of the top of the frame on the stack */
110struct stack_frame {
111 struct stack_frame *next_frame;
112 unsigned long return_address;
113};
114
115static inline unsigned long print_context_stack(struct thread_info *tinfo,
116 unsigned long *stack, unsigned long ebp,
117 struct stacktrace_ops *ops, void *data)
118{
119#ifdef CONFIG_FRAME_POINTER
120 struct stack_frame *frame = (struct stack_frame *)ebp;
121 while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
122 struct stack_frame *next;
123 unsigned long addr;
124
125 addr = frame->return_address;
126 ops->address(data, addr);
127 /*
128 * break out of recursive entries (such as
129 * end_of_stack_stop_unwind_function). Also,
130 * we can never allow a frame pointer to
131 * move downwards!
132 */
133 next = frame->next_frame;
134 if (next <= frame)
135 break;
136 frame = next;
137 }
138#else
139 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
140 unsigned long addr;
141
142 addr = *stack++;
143 if (__kernel_text_address(addr))
144 ops->address(data, addr);
145 }
146#endif
147 return ebp;
148}
149
150#define MSG(msg) ops->warning(data, msg)
151
152void dump_trace(struct task_struct *task, struct pt_regs *regs,
153 unsigned long *stack,
154 struct stacktrace_ops *ops, void *data)
155{
156 unsigned long ebp = 0;
157
158 if (!task)
159 task = current;
160
161 if (!stack) {
162 unsigned long dummy;
163 stack = &dummy;
164 if (task != current)
165 stack = (unsigned long *)task->thread.esp;
166 }
167
168#ifdef CONFIG_FRAME_POINTER
169 if (!ebp) {
170 if (task == current) {
171 /* Grab ebp right from our regs */
172 asm ("movl %%ebp, %0" : "=r" (ebp) : );
173 } else {
174 /* ebp is the last reg pushed by switch_to */
175 ebp = *(unsigned long *) task->thread.esp;
176 }
177 }
178#endif
179
180 while (1) {
181 struct thread_info *context;
182 context = (struct thread_info *)
183 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
184 ebp = print_context_stack(context, stack, ebp, ops, data);
185 /* Should be after the line below, but somewhere
186 in early boot context comes out corrupted and we
187 can't reference it -AK */
188 if (ops->stack(data, "IRQ") < 0)
189 break;
190 stack = (unsigned long*)context->previous_esp;
191 if (!stack)
192 break;
193 touch_nmi_watchdog();
194 }
195}
196EXPORT_SYMBOL(dump_trace);
197
198static void
199print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
200{
201 printk(data);
202 print_symbol(msg, symbol);
203 printk("\n");
204}
205
206static void print_trace_warning(void *data, char *msg)
207{
208 printk("%s%s\n", (char *)data, msg);
209}
210
211static int print_trace_stack(void *data, char *name)
212{
213 return 0;
214}
215
216/*
217 * Print one address/symbol entry per line.
218 */
219static void print_trace_address(void *data, unsigned long addr)
220{
221 printk("%s [<%08lx>] ", (char *)data, addr);
222 print_symbol("%s\n", addr);
223 touch_nmi_watchdog();
224}
225
226static struct stacktrace_ops print_trace_ops = {
227 .warning = print_trace_warning,
228 .warning_symbol = print_trace_warning_symbol,
229 .stack = print_trace_stack,
230 .address = print_trace_address,
231};
232
233static void
234show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
235 unsigned long * stack, char *log_lvl)
236{
237 dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
238 printk("%s =======================\n", log_lvl);
239}
240
241void show_trace(struct task_struct *task, struct pt_regs *regs,
242 unsigned long * stack)
243{
244 show_trace_log_lvl(task, regs, stack, "");
245}
246
247static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
248 unsigned long *esp, char *log_lvl)
249{
250 unsigned long *stack;
251 int i;
252
253 if (esp == NULL) {
254 if (task)
255 esp = (unsigned long*)task->thread.esp;
256 else
257 esp = (unsigned long *)&esp;
258 }
259
260 stack = esp;
261 for(i = 0; i < kstack_depth_to_print; i++) {
262 if (kstack_end(stack))
263 break;
264 if (i && ((i % 8) == 0))
265 printk("\n%s ", log_lvl);
266 printk("%08lx ", *stack++);
267 }
268 printk("\n%sCall Trace:\n", log_lvl);
269 show_trace_log_lvl(task, regs, esp, log_lvl);
270}
271
272void show_stack(struct task_struct *task, unsigned long *esp)
273{
274 printk(" ");
275 show_stack_log_lvl(task, NULL, esp, "");
276}
277
278/*
279 * The architecture-independent dump_stack generator
280 */
281void dump_stack(void)
282{
283 unsigned long stack;
284
285 show_trace(current, NULL, &stack);
286}
287
288EXPORT_SYMBOL(dump_stack);
289
290void show_registers(struct pt_regs *regs)
291{
292 int i;
293 int in_kernel = 1;
294 unsigned long esp;
295 unsigned short ss, gs;
296
297 esp = (unsigned long) (&regs->esp);
298 savesegment(ss, ss);
299 savesegment(gs, gs);
300 if (user_mode_vm(regs)) {
301 in_kernel = 0;
302 esp = regs->esp;
303 ss = regs->xss & 0xffff;
304 }
305 print_modules();
306 printk(KERN_EMERG "CPU: %d\n"
307 KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n"
308 KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n",
309 smp_processor_id(), 0xffff & regs->xcs, regs->eip,
310 print_tainted(), regs->eflags, init_utsname()->release,
311 (int)strcspn(init_utsname()->version, " "),
312 init_utsname()->version);
313 print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
314 printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
315 regs->eax, regs->ebx, regs->ecx, regs->edx);
316 printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
317 regs->esi, regs->edi, regs->ebp, esp);
318 printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
319 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
320 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
321 TASK_COMM_LEN, current->comm, current->pid,
322 current_thread_info(), current, task_thread_info(current));
323 /*
324 * When in-kernel, we also print out the stack and code at the
325 * time of the fault..
326 */
327 if (in_kernel) {
328 u8 *eip;
329 unsigned int code_prologue = code_bytes * 43 / 64;
330 unsigned int code_len = code_bytes;
331 unsigned char c;
332
333 printk("\n" KERN_EMERG "Stack: ");
334 show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
335
336 printk(KERN_EMERG "Code: ");
337
338 eip = (u8 *)regs->eip - code_prologue;
339 if (eip < (u8 *)PAGE_OFFSET ||
340 probe_kernel_address(eip, c)) {
341 /* try starting at EIP */
342 eip = (u8 *)regs->eip;
343 code_len = code_len - code_prologue + 1;
344 }
345 for (i = 0; i < code_len; i++, eip++) {
346 if (eip < (u8 *)PAGE_OFFSET ||
347 probe_kernel_address(eip, c)) {
348 printk(" Bad EIP value.");
349 break;
350 }
351 if (eip == (u8 *)regs->eip)
352 printk("<%02x> ", c);
353 else
354 printk("%02x ", c);
355 }
356 }
357 printk("\n");
358}
359
360int is_valid_bugaddr(unsigned long eip)
361{
362 unsigned short ud2;
363
364 if (eip < PAGE_OFFSET)
365 return 0;
366 if (probe_kernel_address((unsigned short *)eip, ud2))
367 return 0;
368
369 return ud2 == 0x0b0f;
370}
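/*
 * Editor's note (not part of the original file): BUG() is emitted as the
 * ud2 instruction, whose opcode bytes are 0f 0b; read as a little-endian
 * 16-bit value at the trapping eip that is 0x0b0f, which is what the check
 * above compares against.
 */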
371
372/*
373 * This is gone through when something in the kernel has done something bad and
374 * is about to be terminated.
375 */
376void die(const char * str, struct pt_regs * regs, long err)
377{
378 static struct {
379 spinlock_t lock;
380 u32 lock_owner;
381 int lock_owner_depth;
382 } die = {
383 .lock = __SPIN_LOCK_UNLOCKED(die.lock),
384 .lock_owner = -1,
385 .lock_owner_depth = 0
386 };
387 static int die_counter;
388 unsigned long flags;
389
390 oops_enter();
391
392 if (die.lock_owner != raw_smp_processor_id()) {
393 console_verbose();
394 spin_lock_irqsave(&die.lock, flags);
395 die.lock_owner = smp_processor_id();
396 die.lock_owner_depth = 0;
397 bust_spinlocks(1);
398 }
399 else
400 local_save_flags(flags);
401
402 if (++die.lock_owner_depth < 3) {
403 int nl = 0;
404 unsigned long esp;
405 unsigned short ss;
406
407 report_bug(regs->eip, regs);
408
409 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
410#ifdef CONFIG_PREEMPT
411 printk(KERN_EMERG "PREEMPT ");
412 nl = 1;
413#endif
414#ifdef CONFIG_SMP
415 if (!nl)
416 printk(KERN_EMERG);
417 printk("SMP ");
418 nl = 1;
419#endif
420#ifdef CONFIG_DEBUG_PAGEALLOC
421 if (!nl)
422 printk(KERN_EMERG);
423 printk("DEBUG_PAGEALLOC");
424 nl = 1;
425#endif
426 if (nl)
427 printk("\n");
428 if (notify_die(DIE_OOPS, str, regs, err,
429 current->thread.trap_no, SIGSEGV) !=
430 NOTIFY_STOP) {
431 show_registers(regs);
432 /* Executive summary in case the oops scrolled away */
433 esp = (unsigned long) (&regs->esp);
434 savesegment(ss, ss);
435 if (user_mode(regs)) {
436 esp = regs->esp;
437 ss = regs->xss & 0xffff;
438 }
439 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
440 print_symbol("%s", regs->eip);
441 printk(" SS:ESP %04x:%08lx\n", ss, esp);
442 }
443 else
444 regs = NULL;
445 } else
446 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
447
448 bust_spinlocks(0);
449 die.lock_owner = -1;
450 add_taint(TAINT_DIE);
451 spin_unlock_irqrestore(&die.lock, flags);
452
453 if (!regs)
454 return;
455
456 if (kexec_should_crash(current))
457 crash_kexec(regs);
458
459 if (in_interrupt())
460 panic("Fatal exception in interrupt");
461
462 if (panic_on_oops)
463 panic("Fatal exception");
464
465 oops_exit();
466 do_exit(SIGSEGV);
467}
468
469static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
470{
471 if (!user_mode_vm(regs))
472 die(str, regs, err);
473}
474
475static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
476 struct pt_regs * regs, long error_code,
477 siginfo_t *info)
478{
479 struct task_struct *tsk = current;
480
481 if (regs->eflags & VM_MASK) {
482 if (vm86)
483 goto vm86_trap;
484 goto trap_signal;
485 }
486
487 if (!user_mode(regs))
488 goto kernel_trap;
489
490 trap_signal: {
491 /*
492 * We want error_code and trap_no set for userspace faults and
493 * kernelspace faults which result in die(), but not
494 * kernelspace faults which are fixed up. die() gives the
495 * process no chance to handle the signal and notice the
496 * kernel fault information, so that won't result in polluting
497 * the information about previously queued, but not yet
498 * delivered, faults. See also do_general_protection below.
499 */
500 tsk->thread.error_code = error_code;
501 tsk->thread.trap_no = trapnr;
502
503 if (info)
504 force_sig_info(signr, info, tsk);
505 else
506 force_sig(signr, tsk);
507 return;
508 }
509
510 kernel_trap: {
511 if (!fixup_exception(regs)) {
512 tsk->thread.error_code = error_code;
513 tsk->thread.trap_no = trapnr;
514 die(str, regs, error_code);
515 }
516 return;
517 }
518
519 vm86_trap: {
520 int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
521 if (ret) goto trap_signal;
522 return;
523 }
524}
525
526#define DO_ERROR(trapnr, signr, str, name) \
527fastcall void do_##name(struct pt_regs * regs, long error_code) \
528{ \
529 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
530 == NOTIFY_STOP) \
531 return; \
532 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
533}
534
535#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
536fastcall void do_##name(struct pt_regs * regs, long error_code) \
537{ \
538 siginfo_t info; \
539 if (irq) \
540 local_irq_enable(); \
541 info.si_signo = signr; \
542 info.si_errno = 0; \
543 info.si_code = sicode; \
544 info.si_addr = (void __user *)siaddr; \
545 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
546 == NOTIFY_STOP) \
547 return; \
548 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
549}
550
551#define DO_VM86_ERROR(trapnr, signr, str, name) \
552fastcall void do_##name(struct pt_regs * regs, long error_code) \
553{ \
554 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
555 == NOTIFY_STOP) \
556 return; \
557 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
558}
559
560#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
561fastcall void do_##name(struct pt_regs * regs, long error_code) \
562{ \
563 siginfo_t info; \
564 info.si_signo = signr; \
565 info.si_errno = 0; \
566 info.si_code = sicode; \
567 info.si_addr = (void __user *)siaddr; \
568 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
569 == NOTIFY_STOP) \
570 return; \
571 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
572}
573
574DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
575#ifndef CONFIG_KPROBES
576DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
577#endif
578DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
579DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
580DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
581DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
582DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
583DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
584DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
585DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
586DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
587
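/*
 * Editor's illustration (not part of the original file): as a concrete
 * example of what the macros above generate, DO_ERROR(10, SIGSEGV,
 * "invalid TSS", invalid_TSS) expands to roughly:
 *
 *	fastcall void do_invalid_TSS(struct pt_regs *regs, long error_code)
 *	{
 *		if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code,
 *					10, SIGSEGV) == NOTIFY_STOP)
 *			return;
 *		do_trap(10, SIGSEGV, "invalid TSS", 0, regs, error_code, NULL);
 *	}
 */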
588fastcall void __kprobes do_general_protection(struct pt_regs * regs,
589 long error_code)
590{
591 int cpu = get_cpu();
592 struct tss_struct *tss = &per_cpu(init_tss, cpu);
593 struct thread_struct *thread = &current->thread;
594
595 /*
596 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
597 * invalid offset set (the LAZY one) and the faulting thread has
598	 * a valid I/O bitmap pointer, we copy the I/O bitmap into the TSS,
599	 * set the offset field correctly and then let the CPU
600	 * restart the faulting instruction.
601 */
602 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
603 thread->io_bitmap_ptr) {
604 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
605 thread->io_bitmap_max);
606 /*
607	 * If the previous map extended to higher ports
608	 * than the current one, pad the extra space with 0xff (no access).
609 */
610 if (thread->io_bitmap_max < tss->io_bitmap_max)
611 memset((char *) tss->io_bitmap +
612 thread->io_bitmap_max, 0xff,
613 tss->io_bitmap_max - thread->io_bitmap_max);
614 tss->io_bitmap_max = thread->io_bitmap_max;
615 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
616 tss->io_bitmap_owner = thread;
617 put_cpu();
618 return;
619 }
620 put_cpu();
621
622 if (regs->eflags & VM_MASK)
623 goto gp_in_vm86;
624
625 if (!user_mode(regs))
626 goto gp_in_kernel;
627
628 current->thread.error_code = error_code;
629 current->thread.trap_no = 13;
630 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
631 printk_ratelimit())
632 printk(KERN_INFO
633 "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
634 current->comm, current->pid,
635 regs->eip, regs->esp, error_code);
636
637 force_sig(SIGSEGV, current);
638 return;
639
640gp_in_vm86:
641 local_irq_enable();
642 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
643 return;
644
645gp_in_kernel:
646 if (!fixup_exception(regs)) {
647 current->thread.error_code = error_code;
648 current->thread.trap_no = 13;
649 if (notify_die(DIE_GPF, "general protection fault", regs,
650 error_code, 13, SIGSEGV) == NOTIFY_STOP)
651 return;
652 die("general protection fault", regs, error_code);
653 }
654}
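/*
 * Editor's sketch (not part of the original file): the lazy I/O bitmap copy
 * above is exercised by a task that has been granted port access with
 * ioperm(); a minimal, hypothetical user-space example:
 *
 *	#include <sys/io.h>
 *
 *	if (ioperm(0x378, 3, 1) == 0)	// allow ports 0x378-0x37a
 *		outb(0x00, 0x378);	// the first port access after a
 *					// context switch faults with #GP;
 *					// the handler above copies the
 *					// thread's bitmap into the TSS and
 *					// lets the CPU retry the instruction
 */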
655
656static __kprobes void
657mem_parity_error(unsigned char reason, struct pt_regs * regs)
658{
659 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
660 "CPU %d.\n", reason, smp_processor_id());
661 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
662
663#if defined(CONFIG_EDAC)
664 if(edac_handler_set()) {
665 edac_atomic_assert_error();
666 return;
667 }
668#endif
669
670 if (panic_on_unrecovered_nmi)
671 panic("NMI: Not continuing");
672
673 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
674
675 /* Clear and disable the memory parity error line. */
676 clear_mem_error(reason);
677}
678
679static __kprobes void
680io_check_error(unsigned char reason, struct pt_regs * regs)
681{
682 unsigned long i;
683
684 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
685 show_registers(regs);
686
687 /* Re-enable the IOCK line, wait for a few seconds */
688 reason = (reason & 0xf) | 8;
689 outb(reason, 0x61);
690 i = 2000;
691 while (--i) udelay(1000);
692 reason &= ~8;
693 outb(reason, 0x61);
694}
695
696static __kprobes void
697unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
698{
699#ifdef CONFIG_MCA
700 /* Might actually be able to figure out what the guilty party
701 * is. */
702 if( MCA_bus ) {
703 mca_handle_nmi();
704 return;
705 }
706#endif
707 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
708 "CPU %d.\n", reason, smp_processor_id());
709 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
710 if (panic_on_unrecovered_nmi)
711 panic("NMI: Not continuing");
712
713 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
714}
715
716static DEFINE_SPINLOCK(nmi_print_lock);
717
718void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
719{
720 if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
721 NOTIFY_STOP)
722 return;
723
724 spin_lock(&nmi_print_lock);
725 /*
726	 * We are in trouble anyway, let's at least try
727 * to get a message out.
728 */
729 bust_spinlocks(1);
730 printk(KERN_EMERG "%s", msg);
731 printk(" on CPU%d, eip %08lx, registers:\n",
732 smp_processor_id(), regs->eip);
733 show_registers(regs);
734 console_silent();
735 spin_unlock(&nmi_print_lock);
736 bust_spinlocks(0);
737
738	/* If we are in the kernel we are probably nested up pretty badly
739	 * and might as well get out now while we still can.
740 */
741 if (!user_mode_vm(regs)) {
742 current->thread.trap_no = 2;
743 crash_kexec(regs);
744 }
745
746 do_exit(SIGSEGV);
747}
748
749static __kprobes void default_do_nmi(struct pt_regs * regs)
750{
751 unsigned char reason = 0;
752
753 /* Only the BSP gets external NMIs from the system. */
754 if (!smp_processor_id())
755 reason = get_nmi_reason();
756
757 if (!(reason & 0xc0)) {
758 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
759 == NOTIFY_STOP)
760 return;
761#ifdef CONFIG_X86_LOCAL_APIC
762 /*
763 * Ok, so this is none of the documented NMI sources,
764 * so it must be the NMI watchdog.
765 */
766 if (nmi_watchdog_tick(regs, reason))
767 return;
768 if (!do_nmi_callback(regs, smp_processor_id()))
769#endif
770 unknown_nmi_error(reason, regs);
771
772 return;
773 }
774 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
775 return;
776 if (reason & 0x80)
777 mem_parity_error(reason, regs);
778 if (reason & 0x40)
779 io_check_error(reason, regs);
780 /*
781 * Reassert NMI in case it became active meanwhile
782 * as it's edge-triggered.
783 */
784 reassert_nmi();
785}
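/*
 * Editor's note (not part of the original file): the reason byte tested
 * above is read from port 0x61 by get_nmi_reason(); bit 7 reports a memory
 * parity/SERR# NMI and bit 6 an I/O check (IOCHK#) NMI, which is why the
 * code masks with 0x80 and 0x40 respectively.
 */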
786
787static int ignore_nmis;
788
789fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
790{
791 int cpu;
792
793 nmi_enter();
794
795 cpu = smp_processor_id();
796
797 ++nmi_count(cpu);
798
799 if (!ignore_nmis)
800 default_do_nmi(regs);
801
802 nmi_exit();
803}
804
805void stop_nmi(void)
806{
807 acpi_nmi_disable();
808 ignore_nmis++;
809}
810
811void restart_nmi(void)
812{
813 ignore_nmis--;
814 acpi_nmi_enable();
815}
816
817#ifdef CONFIG_KPROBES
818fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
819{
820 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
821 == NOTIFY_STOP)
822 return;
823 /* This is an interrupt gate, because kprobes wants interrupts
824 disabled. Normal trap handlers don't. */
825 restore_interrupts(regs);
826 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
827}
828#endif
829
830/*
831 * Our handling of the processor debug registers is non-trivial.
832 * We do not clear them on entry and exit from the kernel. Therefore
833 * it is possible to get a watchpoint trap here from inside the kernel.
834 * However, the code in ./ptrace.c has ensured that the user can
835 * only set watchpoints on userspace addresses. Therefore the in-kernel
836 * watchpoint trap can only occur in code which is reading/writing
837 * from user space. Such code must not hold kernel locks (since it
838 * can equally take a page fault), therefore it is safe to call
839 * force_sig_info even though that claims and releases locks.
840 *
841 * Code in ./signal.c ensures that the debug control register
842 * is restored before we deliver any signal, and therefore that
843 * user code runs with the correct debug control register even though
844 * we clear it here.
845 *
846 * Being careful here means that we don't have to be as careful in a
847 * lot of more complicated places (task switching can be a bit lazy
848 * about restoring all the debug state, and ptrace doesn't have to
849 * find every occurrence of the TF bit that could be saved away even
850 * by user code)
851 */
852fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
853{
854 unsigned int condition;
855 struct task_struct *tsk = current;
856
857 get_debugreg(condition, 6);
858
859 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
860 SIGTRAP) == NOTIFY_STOP)
861 return;
862 /* It's safe to allow irq's after DR6 has been saved */
863 if (regs->eflags & X86_EFLAGS_IF)
864 local_irq_enable();
865
866 /* Mask out spurious debug traps due to lazy DR7 setting */
867 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
868 if (!tsk->thread.debugreg[7])
869 goto clear_dr7;
870 }
871
872 if (regs->eflags & VM_MASK)
873 goto debug_vm86;
874
875 /* Save debug status register where ptrace can see it */
876 tsk->thread.debugreg[6] = condition;
877
878 /*
879 * Single-stepping through TF: make sure we ignore any events in
880 * kernel space (but re-enable TF when returning to user mode).
881 */
882 if (condition & DR_STEP) {
883 /*
884 * We already checked v86 mode above, so we can
885 * check for kernel mode by just checking the CPL
886 * of CS.
887 */
888 if (!user_mode(regs))
889 goto clear_TF_reenable;
890 }
891
892 /* Ok, finally something we can handle */
893 send_sigtrap(tsk, regs, error_code);
894
895 /* Disable additional traps. They'll be re-enabled when
896 * the signal is delivered.
897 */
898clear_dr7:
899 set_debugreg(0, 7);
900 return;
901
902debug_vm86:
903 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
904 return;
905
906clear_TF_reenable:
907 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
908 regs->eflags &= ~TF_MASK;
909 return;
910}
911
912/*
913 * Note that we play around with the 'TS' bit in an attempt to get
914 * the correct behaviour even in the presence of the asynchronous
915 * IRQ13 behaviour
916 */
917void math_error(void __user *eip)
918{
919 struct task_struct * task;
920 siginfo_t info;
921 unsigned short cwd, swd;
922
923 /*
924 * Save the info for the exception handler and clear the error.
925 */
926 task = current;
927 save_init_fpu(task);
928 task->thread.trap_no = 16;
929 task->thread.error_code = 0;
930 info.si_signo = SIGFPE;
931 info.si_errno = 0;
932 info.si_code = __SI_FAULT;
933 info.si_addr = eip;
934 /*
935 * (~cwd & swd) will mask out exceptions that are not set to unmasked
936 * status. 0x3f is the exception bits in these regs, 0x200 is the
937 * C1 reg you need in case of a stack fault, 0x040 is the stack
938 * fault bit. We should only be taking one exception at a time,
939 * so if this combination doesn't produce any single exception,
940	 * then we have a bad program that isn't synchronizing its FPU usage
941 * and it will suffer the consequences since we won't be able to
942 * fully reproduce the context of the exception
943 */
944 cwd = get_fpu_cwd(task);
945 swd = get_fpu_swd(task);
946 switch (swd & ~cwd & 0x3f) {
947 case 0x000: /* No unmasked exception */
948 return;
949 default: /* Multiple exceptions */
950 break;
951 case 0x001: /* Invalid Op */
952 /*
953 * swd & 0x240 == 0x040: Stack Underflow
954 * swd & 0x240 == 0x240: Stack Overflow
955 * User must clear the SF bit (0x40) if set
956 */
957 info.si_code = FPE_FLTINV;
958 break;
959 case 0x002: /* Denormalize */
960 case 0x010: /* Underflow */
961 info.si_code = FPE_FLTUND;
962 break;
963 case 0x004: /* Zero Divide */
964 info.si_code = FPE_FLTDIV;
965 break;
966 case 0x008: /* Overflow */
967 info.si_code = FPE_FLTOVF;
968 break;
969 case 0x020: /* Precision */
970 info.si_code = FPE_FLTRES;
971 break;
972 }
973 force_sig_info(SIGFPE, &info, task);
974}
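/*
 * Editor's worked example (not part of the original file), assuming the
 * default x87 control word 0x037f: with the divide-by-zero mask (bit 2)
 * cleared, cwd == 0x037b, and a 1.0/0.0 sets ZE (bit 2) in the status
 * word, so swd & ~cwd & 0x3f == 0x004 and the switch above selects
 * FPE_FLTDIV.
 */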
975
976fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
977{
978 ignore_fpu_irq = 1;
979 math_error((void __user *)regs->eip);
980}
981
982static void simd_math_error(void __user *eip)
983{
984 struct task_struct * task;
985 siginfo_t info;
986 unsigned short mxcsr;
987
988 /*
989 * Save the info for the exception handler and clear the error.
990 */
991 task = current;
992 save_init_fpu(task);
993 task->thread.trap_no = 19;
994 task->thread.error_code = 0;
995 info.si_signo = SIGFPE;
996 info.si_errno = 0;
997 info.si_code = __SI_FAULT;
998 info.si_addr = eip;
999 /*
1000 * The SIMD FPU exceptions are handled a little differently, as there
1001 * is only a single status/control register. Thus, to determine which
1002 * unmasked exception was caught we must mask the exception mask bits
1003 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1004 */
1005 mxcsr = get_fpu_mxcsr(task);
1006 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1007 case 0x000:
1008 default:
1009 break;
1010 case 0x001: /* Invalid Op */
1011 info.si_code = FPE_FLTINV;
1012 break;
1013 case 0x002: /* Denormalize */
1014 case 0x010: /* Underflow */
1015 info.si_code = FPE_FLTUND;
1016 break;
1017 case 0x004: /* Zero Divide */
1018 info.si_code = FPE_FLTDIV;
1019 break;
1020 case 0x008: /* Overflow */
1021 info.si_code = FPE_FLTOVF;
1022 break;
1023 case 0x020: /* Precision */
1024 info.si_code = FPE_FLTRES;
1025 break;
1026 }
1027 force_sig_info(SIGFPE, &info, task);
1028}
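/*
 * Editor's worked example (not part of the original file), assuming the
 * standard MXCSR layout (flag bits 0-5, mask bits 7-12): with only the
 * divide-by-zero mask (bit 9) cleared and the ZE flag (bit 2) set,
 * mxcsr == 0x1d84, so ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f) == 0x004
 * and the switch above selects FPE_FLTDIV.
 */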
1029
1030fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
1031 long error_code)
1032{
1033 if (cpu_has_xmm) {
1034 /* Handle SIMD FPU exceptions on PIII+ processors. */
1035 ignore_fpu_irq = 1;
1036 simd_math_error((void __user *)regs->eip);
1037 } else {
1038 /*
1039 * Handle strange cache flush from user space exception
1040 * in all other cases. This is undocumented behaviour.
1041 */
1042 if (regs->eflags & VM_MASK) {
1043 handle_vm86_fault((struct kernel_vm86_regs *)regs,
1044 error_code);
1045 return;
1046 }
1047 current->thread.trap_no = 19;
1048 current->thread.error_code = error_code;
1049 die_if_kernel("cache flush denied", regs, error_code);
1050 force_sig(SIGSEGV, current);
1051 }
1052}
1053
1054fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1055 long error_code)
1056{
1057#if 0
1058 /* No need to warn about this any longer. */
1059 printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
1060#endif
1061}
1062
1063fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1064 unsigned long kesp)
1065{
1066 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
1067 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1068 unsigned long new_kesp = kesp - base;
1069 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
1070 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
1071 /* Set up base for espfix segment */
1072 desc &= 0x00f0ff0000000000ULL;
1073 desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
1074 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
1075 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
1076 (lim_pages & 0xffff);
1077 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
1078 return new_kesp;
1079}
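/*
 * Editor's note (not part of the original file): the constants above follow
 * the standard segment descriptor layout -- bits 0-15 and 48-51 hold the
 * limit, bits 16-39 and 56-63 hold the base, and the preserved mask
 * 0x00f0ff0000000000 keeps the access byte and the granularity/size flags
 * while the base and limit of the espfix stack segment are rewritten.
 */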
1080
1081/*
1082 * 'math_state_restore()' saves the current math information in the
1083 * old math state array, and gets the new ones from the current task
1084 *
1085 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1086 * Don't touch unless you *really* know how it works.
1087 *
1088 * Must be called with kernel preemption disabled (in this case,
1089 * local interrupts are disabled at the call-site in entry.S).
1090 */
1091asmlinkage void math_state_restore(void)
1092{
1093 struct thread_info *thread = current_thread_info();
1094 struct task_struct *tsk = thread->task;
1095
1096 clts(); /* Allow maths ops (or we recurse) */
1097 if (!tsk_used_math(tsk))
1098 init_fpu(tsk);
1099 restore_fpu(tsk);
1100 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1101 tsk->fpu_counter++;
1102}
1103EXPORT_SYMBOL_GPL(math_state_restore);
1104
1105#ifndef CONFIG_MATH_EMULATION
1106
1107asmlinkage void math_emulate(long arg)
1108{
1109 printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
1110 printk(KERN_EMERG "killing %s.\n",current->comm);
1111 force_sig(SIGFPE,current);
1112 schedule();
1113}
1114
1115#endif /* CONFIG_MATH_EMULATION */
1116
1117#ifdef CONFIG_X86_F00F_BUG
1118void __init trap_init_f00f_bug(void)
1119{
1120 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
1121
1122 /*
1123 * Update the IDT descriptor and reload the IDT so that
1124 * it uses the read-only mapped virtual address.
1125 */
1126 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
1127 load_idt(&idt_descr);
1128}
1129#endif
1130
1131/*
1132 * This needs to use 'idt_table' rather than 'idt', and
1133 * thus use the _nonmapped_ version of the IDT, as the
1134 * Pentium F0 0F bugfix can have resulted in the mapped
1135 * IDT being write-protected.
1136 */
1137void set_intr_gate(unsigned int n, void *addr)
1138{
1139 _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
1140}
1141
1142/*
1143 * This routine sets up an interrupt gate at directory privilege level 3.
1144 */
1145static inline void set_system_intr_gate(unsigned int n, void *addr)
1146{
1147 _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
1148}
1149
1150static void __init set_trap_gate(unsigned int n, void *addr)
1151{
1152 _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
1153}
1154
1155static void __init set_system_gate(unsigned int n, void *addr)
1156{
1157 _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
1158}
1159
1160static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
1161{
1162 _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
1163}
1164
1165
1166void __init trap_init(void)
1167{
1168#ifdef CONFIG_EISA
1169 void __iomem *p = ioremap(0x0FFFD9, 4);
1170 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
1171 EISA_bus = 1;
1172 }
1173 iounmap(p);
1174#endif
1175
1176#ifdef CONFIG_X86_LOCAL_APIC
1177 init_apic_mappings();
1178#endif
1179
1180 set_trap_gate(0,&divide_error);
1181 set_intr_gate(1,&debug);
1182 set_intr_gate(2,&nmi);
1183 set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
1184 set_system_gate(4,&overflow);
1185 set_trap_gate(5,&bounds);
1186 set_trap_gate(6,&invalid_op);
1187 set_trap_gate(7,&device_not_available);
1188 set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
1189 set_trap_gate(9,&coprocessor_segment_overrun);
1190 set_trap_gate(10,&invalid_TSS);
1191 set_trap_gate(11,&segment_not_present);
1192 set_trap_gate(12,&stack_segment);
1193 set_trap_gate(13,&general_protection);
1194 set_intr_gate(14,&page_fault);
1195 set_trap_gate(15,&spurious_interrupt_bug);
1196 set_trap_gate(16,&coprocessor_error);
1197 set_trap_gate(17,&alignment_check);
1198#ifdef CONFIG_X86_MCE
1199 set_trap_gate(18,&machine_check);
1200#endif
1201 set_trap_gate(19,&simd_coprocessor_error);
1202
1203 if (cpu_has_fxsr) {
1204 /*
1205 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
1206 * Generates a compile-time "error: zero width for bit-field" if
1207 * the alignment is wrong.
1208 */
1209 struct fxsrAlignAssert {
1210 int _:!(offsetof(struct task_struct,
1211 thread.i387.fxsave) & 15);
1212 };
1213
1214 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1215 set_in_cr4(X86_CR4_OSFXSR);
1216 printk("done.\n");
1217 }
1218 if (cpu_has_xmm) {
1219 printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
1220 "support... ");
1221 set_in_cr4(X86_CR4_OSXMMEXCPT);
1222 printk("done.\n");
1223 }
1224
1225 set_system_gate(SYSCALL_VECTOR,&system_call);
1226
1227 /*
1228 * Should be a barrier for any external CPU state.
1229 */
1230 cpu_init();
1231
1232 trap_init_hook();
1233}
1234
1235static int __init kstack_setup(char *s)
1236{
1237 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1238 return 1;
1239}
1240__setup("kstack=", kstack_setup);
1241
1242static int __init code_bytes_setup(char *s)
1243{
1244 code_bytes = simple_strtoul(s, NULL, 0);
1245 if (code_bytes > 8192)
1246 code_bytes = 8192;
1247
1248 return 1;
1249}
1250__setup("code_bytes=", code_bytes_setup);
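/*
 * Editor's usage note (not part of the original file): both knobs are boot
 * parameters, e.g. booting with "kstack=48 code_bytes=128" on the kernel
 * command line makes an oops dump 48 stack words and 128 code bytes.
 */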
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
new file mode 100644
index 000000000000..03888420775d
--- /dev/null
+++ b/arch/x86/kernel/traps_64.c
@@ -0,0 +1,1138 @@
1/*
2 * linux/arch/x86-64/traps.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 */
10
11/*
12 * 'Traps.c' handles hardware traps and faults after we have saved some
13 * state in 'entry.S'.
14 */
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/ptrace.h>
20#include <linux/timer.h>
21#include <linux/mm.h>
22#include <linux/init.h>
23#include <linux/delay.h>
24#include <linux/spinlock.h>
25#include <linux/interrupt.h>
26#include <linux/kallsyms.h>
27#include <linux/module.h>
28#include <linux/moduleparam.h>
29#include <linux/nmi.h>
30#include <linux/kprobes.h>
31#include <linux/kexec.h>
32#include <linux/unwind.h>
33#include <linux/uaccess.h>
34#include <linux/bug.h>
35#include <linux/kdebug.h>
36
37#if defined(CONFIG_EDAC)
38#include <linux/edac.h>
39#endif
40
41#include <asm/system.h>
42#include <asm/io.h>
43#include <asm/atomic.h>
44#include <asm/debugreg.h>
45#include <asm/desc.h>
46#include <asm/i387.h>
47#include <asm/processor.h>
48#include <asm/unwind.h>
49#include <asm/smp.h>
50#include <asm/pgalloc.h>
51#include <asm/pda.h>
52#include <asm/proto.h>
53#include <asm/nmi.h>
54#include <asm/stacktrace.h>
55
56asmlinkage void divide_error(void);
57asmlinkage void debug(void);
58asmlinkage void nmi(void);
59asmlinkage void int3(void);
60asmlinkage void overflow(void);
61asmlinkage void bounds(void);
62asmlinkage void invalid_op(void);
63asmlinkage void device_not_available(void);
64asmlinkage void double_fault(void);
65asmlinkage void coprocessor_segment_overrun(void);
66asmlinkage void invalid_TSS(void);
67asmlinkage void segment_not_present(void);
68asmlinkage void stack_segment(void);
69asmlinkage void general_protection(void);
70asmlinkage void page_fault(void);
71asmlinkage void coprocessor_error(void);
72asmlinkage void simd_coprocessor_error(void);
73asmlinkage void reserved(void);
74asmlinkage void alignment_check(void);
75asmlinkage void machine_check(void);
76asmlinkage void spurious_interrupt_bug(void);
77
78static inline void conditional_sti(struct pt_regs *regs)
79{
80 if (regs->eflags & X86_EFLAGS_IF)
81 local_irq_enable();
82}
83
84static inline void preempt_conditional_sti(struct pt_regs *regs)
85{
86 preempt_disable();
87 if (regs->eflags & X86_EFLAGS_IF)
88 local_irq_enable();
89}
90
91static inline void preempt_conditional_cli(struct pt_regs *regs)
92{
93 if (regs->eflags & X86_EFLAGS_IF)
94 local_irq_disable();
95 /* Make sure to not schedule here because we could be running
96 on an exception stack. */
97 preempt_enable_no_resched();
98}
99
100int kstack_depth_to_print = 12;
101
102#ifdef CONFIG_KALLSYMS
103void printk_address(unsigned long address)
104{
105 unsigned long offset = 0, symsize;
106 const char *symname;
107 char *modname;
108 char *delim = ":";
109 char namebuf[128];
110
111 symname = kallsyms_lookup(address, &symsize, &offset,
112 &modname, namebuf);
113 if (!symname) {
114 printk(" [<%016lx>]\n", address);
115 return;
116 }
117 if (!modname)
118 modname = delim = "";
119 printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
120 address, delim, modname, delim, symname, offset, symsize);
121}
122#else
123void printk_address(unsigned long address)
124{
125 printk(" [<%016lx>]\n", address);
126}
127#endif
128
129static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
130 unsigned *usedp, char **idp)
131{
132 static char ids[][8] = {
133 [DEBUG_STACK - 1] = "#DB",
134 [NMI_STACK - 1] = "NMI",
135 [DOUBLEFAULT_STACK - 1] = "#DF",
136 [STACKFAULT_STACK - 1] = "#SS",
137 [MCE_STACK - 1] = "#MC",
138#if DEBUG_STKSZ > EXCEPTION_STKSZ
139 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
140#endif
141 };
142 unsigned k;
143
144 /*
145 * Iterate over all exception stacks, and figure out whether
146 * 'stack' is in one of them:
147 */
148 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
149 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
150 /*
151 * Is 'stack' above this exception frame's end?
152 * If yes then skip to the next frame.
153 */
154 if (stack >= end)
155 continue;
156 /*
157 * Is 'stack' above this exception frame's start address?
158 * If yes then we found the right frame.
159 */
160 if (stack >= end - EXCEPTION_STKSZ) {
161 /*
162 * Make sure we only iterate through an exception
163 * stack once. If it comes up for the second time
164 * then there's something wrong going on - just
165 * break out and return NULL:
166 */
167 if (*usedp & (1U << k))
168 break;
169 *usedp |= 1U << k;
170 *idp = ids[k];
171 return (unsigned long *)end;
172 }
173 /*
174 * If this is a debug stack, and if it has a larger size than
175 * the usual exception stacks, then 'stack' might still
176 * be within the lower portion of the debug stack:
177 */
178#if DEBUG_STKSZ > EXCEPTION_STKSZ
179 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
180 unsigned j = N_EXCEPTION_STACKS - 1;
181
182 /*
183 * Black magic. A large debug stack is composed of
184 * multiple exception stack entries, which we
185			 * iterate through now. Don't look:
186 */
187 do {
188 ++j;
189 end -= EXCEPTION_STKSZ;
190 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
191 } while (stack < end - EXCEPTION_STKSZ);
192 if (*usedp & (1U << j))
193 break;
194 *usedp |= 1U << j;
195 *idp = ids[j];
196 return (unsigned long *)end;
197 }
198#endif
199 }
200 return NULL;
201}
202
203#define MSG(txt) ops->warning(data, txt)
204
205/*
206 * x86-64 can have up to three kernel stacks:
207 * process stack
208 * interrupt stack
209 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
210 */
211
212static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
213{
214 void *t = (void *)tinfo;
215 return p > t && p < t + THREAD_SIZE - 3;
216}
217
218void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
219 unsigned long *stack,
220 struct stacktrace_ops *ops, void *data)
221{
222 const unsigned cpu = get_cpu();
223 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
224 unsigned used = 0;
225 struct thread_info *tinfo;
226
227 if (!tsk)
228 tsk = current;
229
230 if (!stack) {
231 unsigned long dummy;
232 stack = &dummy;
233 if (tsk && tsk != current)
234 stack = (unsigned long *)tsk->thread.rsp;
235 }
236
237 /*
238 * Print function call entries within a stack. 'cond' is the
239 * "end of stackframe" condition, that the 'stack++'
240 * iteration will eventually trigger.
241 */
242#define HANDLE_STACK(cond) \
243 do while (cond) { \
244 unsigned long addr = *stack++; \
245 /* Use unlocked access here because except for NMIs \
246 we should be already protected against module unloads */ \
247 if (__kernel_text_address(addr)) { \
248 /* \
249 * If the address is either in the text segment of the \
250 * kernel, or in the region which contains vmalloc'ed \
251 * memory, it *may* be the address of a calling \
252 * routine; if so, print it so that someone tracing \
253 * down the cause of the crash will be able to figure \
254 * out the call path that was taken. \
255 */ \
256 ops->address(data, addr); \
257 } \
258 } while (0)
259
260 /*
261 * Print function call entries in all stacks, starting at the
262	 * current stack address. If the stacks consist of nested
263	 * exceptions, each one is walked in turn down to the process stack.
264 */
265 for (;;) {
266 char *id;
267 unsigned long *estack_end;
268 estack_end = in_exception_stack(cpu, (unsigned long)stack,
269 &used, &id);
270
271 if (estack_end) {
272 if (ops->stack(data, id) < 0)
273 break;
274 HANDLE_STACK (stack < estack_end);
275 ops->stack(data, "<EOE>");
276 /*
277 * We link to the next stack via the
278 * second-to-last pointer (index -2 to end) in the
279 * exception stack:
280 */
281 stack = (unsigned long *) estack_end[-2];
282 continue;
283 }
284 if (irqstack_end) {
285 unsigned long *irqstack;
286 irqstack = irqstack_end -
287 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
288
289 if (stack >= irqstack && stack < irqstack_end) {
290 if (ops->stack(data, "IRQ") < 0)
291 break;
292 HANDLE_STACK (stack < irqstack_end);
293 /*
294				 * We link to the next stack (which would normally
295				 * be the process stack) via the last
296				 * pointer (index -1 to end) in the IRQ stack:
297 */
298 stack = (unsigned long *) (irqstack_end[-1]);
299 irqstack_end = NULL;
300 ops->stack(data, "EOI");
301 continue;
302 }
303 }
304 break;
305 }
306
307 /*
308 * This handles the process stack:
309 */
310 tinfo = task_thread_info(tsk);
311 HANDLE_STACK (valid_stack_ptr(tinfo, stack));
312#undef HANDLE_STACK
313 put_cpu();
314}
315EXPORT_SYMBOL(dump_trace);
316
317static void
318print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
319{
320 print_symbol(msg, symbol);
321 printk("\n");
322}
323
324static void print_trace_warning(void *data, char *msg)
325{
326 printk("%s\n", msg);
327}
328
329static int print_trace_stack(void *data, char *name)
330{
331 printk(" <%s> ", name);
332 return 0;
333}
334
335static void print_trace_address(void *data, unsigned long addr)
336{
337 touch_nmi_watchdog();
338 printk_address(addr);
339}
340
341static struct stacktrace_ops print_trace_ops = {
342 .warning = print_trace_warning,
343 .warning_symbol = print_trace_warning_symbol,
344 .stack = print_trace_stack,
345 .address = print_trace_address,
346};
347
348void
349show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
350{
351 printk("\nCall Trace:\n");
352 dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
353 printk("\n");
354}
355
356static void
357_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
358{
359 unsigned long *stack;
360 int i;
361 const int cpu = smp_processor_id();
362 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
363 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
364
365 // debugging aid: "show_stack(NULL, NULL);" prints the
366 // back trace for this cpu.
367
368 if (rsp == NULL) {
369 if (tsk)
370 rsp = (unsigned long *)tsk->thread.rsp;
371 else
372 rsp = (unsigned long *)&rsp;
373 }
374
375 stack = rsp;
376 for(i=0; i < kstack_depth_to_print; i++) {
377 if (stack >= irqstack && stack <= irqstack_end) {
378 if (stack == irqstack_end) {
379 stack = (unsigned long *) (irqstack_end[-1]);
380 printk(" <EOI> ");
381 }
382 } else {
383 if (((long) stack & (THREAD_SIZE-1)) == 0)
384 break;
385 }
386 if (i && ((i % 4) == 0))
387 printk("\n");
388 printk(" %016lx", *stack++);
389 touch_nmi_watchdog();
390 }
391 show_trace(tsk, regs, rsp);
392}
393
394void show_stack(struct task_struct *tsk, unsigned long * rsp)
395{
396 _show_stack(tsk, NULL, rsp);
397}
398
399/*
400 * The architecture-independent dump_stack generator
401 */
402void dump_stack(void)
403{
404 unsigned long dummy;
405 show_trace(NULL, NULL, &dummy);
406}
407
408EXPORT_SYMBOL(dump_stack);
409
410void show_registers(struct pt_regs *regs)
411{
412 int i;
413 int in_kernel = !user_mode(regs);
414 unsigned long rsp;
415 const int cpu = smp_processor_id();
416 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
417
418 rsp = regs->rsp;
419 printk("CPU %d ", cpu);
420 __show_regs(regs);
421 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
422 cur->comm, cur->pid, task_thread_info(cur), cur);
423
424 /*
425 * When in-kernel, we also print out the stack and code at the
426 * time of the fault..
427 */
428 if (in_kernel) {
429 printk("Stack: ");
430 _show_stack(NULL, regs, (unsigned long*)rsp);
431
432 printk("\nCode: ");
433 if (regs->rip < PAGE_OFFSET)
434 goto bad;
435
436 for (i=0; i<20; i++) {
437 unsigned char c;
438 if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
439bad:
440 printk(" Bad RIP value.");
441 break;
442 }
443 printk("%02x ", c);
444 }
445 }
446 printk("\n");
447}
448
449int is_valid_bugaddr(unsigned long rip)
450{
451 unsigned short ud2;
452
453 if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
454 return 0;
455
456 return ud2 == 0x0b0f;
457}
458
459#ifdef CONFIG_BUG
460void out_of_line_bug(void)
461{
462 BUG();
463}
464EXPORT_SYMBOL(out_of_line_bug);
465#endif
466
467static DEFINE_SPINLOCK(die_lock);
468static int die_owner = -1;
469static unsigned int die_nest_count;
470
471unsigned __kprobes long oops_begin(void)
472{
473 int cpu;
474 unsigned long flags;
475
476 oops_enter();
477
478 /* racy, but better than risking deadlock. */
479 local_irq_save(flags);
480 cpu = smp_processor_id();
481 if (!spin_trylock(&die_lock)) {
482 if (cpu == die_owner)
483 /* nested oops. should stop eventually */;
484 else
485 spin_lock(&die_lock);
486 }
487 die_nest_count++;
488 die_owner = cpu;
489 console_verbose();
490 bust_spinlocks(1);
491 return flags;
492}
493
494void __kprobes oops_end(unsigned long flags)
495{
496 die_owner = -1;
497 bust_spinlocks(0);
498 die_nest_count--;
499 if (die_nest_count)
500 /* We still own the lock */
501 local_irq_restore(flags);
502 else
503 /* Nest count reaches zero, release the lock. */
504 spin_unlock_irqrestore(&die_lock, flags);
505 if (panic_on_oops)
506 panic("Fatal exception");
507 oops_exit();
508}
509
510void __kprobes __die(const char * str, struct pt_regs * regs, long err)
511{
512 static int die_counter;
513 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
514#ifdef CONFIG_PREEMPT
515 printk("PREEMPT ");
516#endif
517#ifdef CONFIG_SMP
518 printk("SMP ");
519#endif
520#ifdef CONFIG_DEBUG_PAGEALLOC
521 printk("DEBUG_PAGEALLOC");
522#endif
523 printk("\n");
524 notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
525 show_registers(regs);
526 add_taint(TAINT_DIE);
527 /* Executive summary in case the oops scrolled away */
528 printk(KERN_ALERT "RIP ");
529 printk_address(regs->rip);
530 printk(" RSP <%016lx>\n", regs->rsp);
531 if (kexec_should_crash(current))
532 crash_kexec(regs);
533}
534
535void die(const char * str, struct pt_regs * regs, long err)
536{
537 unsigned long flags = oops_begin();
538
539 if (!user_mode(regs))
540 report_bug(regs->rip, regs);
541
542 __die(str, regs, err);
543 oops_end(flags);
544 do_exit(SIGSEGV);
545}
546
547void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
548{
549 unsigned long flags = oops_begin();
550
551 /*
552	 * We are in trouble anyway, let's at least try
553 * to get a message out.
554 */
555 printk(str, smp_processor_id());
556 show_registers(regs);
557 if (kexec_should_crash(current))
558 crash_kexec(regs);
559 if (do_panic || panic_on_oops)
560 panic("Non maskable interrupt");
561 oops_end(flags);
562 nmi_exit();
563 local_irq_enable();
564 do_exit(SIGSEGV);
565}
566
567static void __kprobes do_trap(int trapnr, int signr, char *str,
568 struct pt_regs * regs, long error_code,
569 siginfo_t *info)
570{
571 struct task_struct *tsk = current;
572
573 if (user_mode(regs)) {
574 /*
575 * We want error_code and trap_no set for userspace
576 * faults and kernelspace faults which result in
577 * die(), but not kernelspace faults which are fixed
578 * up. die() gives the process no chance to handle
579 * the signal and notice the kernel fault information,
580 * so that won't result in polluting the information
581 * about previously queued, but not yet delivered,
582 * faults. See also do_general_protection below.
583 */
584 tsk->thread.error_code = error_code;
585 tsk->thread.trap_no = trapnr;
586
587 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
588 printk_ratelimit())
589 printk(KERN_INFO
590 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
591 tsk->comm, tsk->pid, str,
592 regs->rip, regs->rsp, error_code);
593
594 if (info)
595 force_sig_info(signr, info, tsk);
596 else
597 force_sig(signr, tsk);
598 return;
599 }
600
601
602 /* kernel trap */
603 {
604 const struct exception_table_entry *fixup;
605 fixup = search_exception_tables(regs->rip);
606 if (fixup)
607 regs->rip = fixup->fixup;
608 else {
609 tsk->thread.error_code = error_code;
610 tsk->thread.trap_no = trapnr;
611 die(str, regs, error_code);
612 }
613 return;
614 }
615}
616
617#define DO_ERROR(trapnr, signr, str, name) \
618asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
619{ \
620 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
621 == NOTIFY_STOP) \
622 return; \
623 conditional_sti(regs); \
624 do_trap(trapnr, signr, str, regs, error_code, NULL); \
625}
626
627#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
628asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
629{ \
630 siginfo_t info; \
631 info.si_signo = signr; \
632 info.si_errno = 0; \
633 info.si_code = sicode; \
634 info.si_addr = (void __user *)siaddr; \
635 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
636 == NOTIFY_STOP) \
637 return; \
638 conditional_sti(regs); \
639 do_trap(trapnr, signr, str, regs, error_code, &info); \
640}
641
642DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
643DO_ERROR( 4, SIGSEGV, "overflow", overflow)
644DO_ERROR( 5, SIGSEGV, "bounds", bounds)
645DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
646DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
647DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
648DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
649DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
650DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
651DO_ERROR(18, SIGSEGV, "reserved", reserved)
652
653/* Runs on IST stack */
654asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
655{
656 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
657 12, SIGBUS) == NOTIFY_STOP)
658 return;
659 preempt_conditional_sti(regs);
660 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
661 preempt_conditional_cli(regs);
662}
663
664asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
665{
666 static const char str[] = "double fault";
667 struct task_struct *tsk = current;
668
669	/* Return value not checked because a double fault cannot be ignored */
670 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
671
672 tsk->thread.error_code = error_code;
673 tsk->thread.trap_no = 8;
674
675 /* This is always a kernel trap and never fixable (and thus must
676 never return). */
677 for (;;)
678 die(str, regs, error_code);
679}
680
681asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
682 long error_code)
683{
684 struct task_struct *tsk = current;
685
686 conditional_sti(regs);
687
688 if (user_mode(regs)) {
689 tsk->thread.error_code = error_code;
690 tsk->thread.trap_no = 13;
691
692 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
693 printk_ratelimit())
694 printk(KERN_INFO
695 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
696 tsk->comm, tsk->pid,
697 regs->rip, regs->rsp, error_code);
698
699 force_sig(SIGSEGV, tsk);
700 return;
701 }
702
703 /* kernel gp */
704 {
705 const struct exception_table_entry *fixup;
706 fixup = search_exception_tables(regs->rip);
707 if (fixup) {
708 regs->rip = fixup->fixup;
709 return;
710 }
711
712 tsk->thread.error_code = error_code;
713 tsk->thread.trap_no = 13;
714 if (notify_die(DIE_GPF, "general protection fault", regs,
715 error_code, 13, SIGSEGV) == NOTIFY_STOP)
716 return;
717 die("general protection fault", regs, error_code);
718 }
719}
720
721static __kprobes void
722mem_parity_error(unsigned char reason, struct pt_regs * regs)
723{
724 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
725 reason);
726 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
727
728#if defined(CONFIG_EDAC)
729 if(edac_handler_set()) {
730 edac_atomic_assert_error();
731 return;
732 }
733#endif
734
735 if (panic_on_unrecovered_nmi)
736 panic("NMI: Not continuing");
737
738 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
739
740 /* Clear and disable the memory parity error line. */
741 reason = (reason & 0xf) | 4;
742 outb(reason, 0x61);
743}
744
745static __kprobes void
746io_check_error(unsigned char reason, struct pt_regs * regs)
747{
748 printk("NMI: IOCK error (debug interrupt?)\n");
749 show_registers(regs);
750
751 /* Re-enable the IOCK line, wait for a few seconds */
752 reason = (reason & 0xf) | 8;
753 outb(reason, 0x61);
754 mdelay(2000);
755 reason &= ~8;
756 outb(reason, 0x61);
757}
758
759static __kprobes void
760unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
761{
762 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
763 reason);
764 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
765
766 if (panic_on_unrecovered_nmi)
767 panic("NMI: Not continuing");
768
769 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
770}
771
772/* Runs on IST stack. This code must keep interrupts off all the time.
773 Nested NMIs are prevented by the CPU. */
774asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
775{
776 unsigned char reason = 0;
777 int cpu;
778
779 cpu = smp_processor_id();
780
781 /* Only the BSP gets external NMIs from the system. */
782 if (!cpu)
783 reason = get_nmi_reason();
784
785 if (!(reason & 0xc0)) {
786 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
787 == NOTIFY_STOP)
788 return;
789 /*
790 * Ok, so this is none of the documented NMI sources,
791 * so it must be the NMI watchdog.
792 */
793 if (nmi_watchdog_tick(regs,reason))
794 return;
795 if (!do_nmi_callback(regs,cpu))
796 unknown_nmi_error(reason, regs);
797
798 return;
799 }
800 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
801 return;
802
803 /* AK: following checks seem to be broken on modern chipsets. FIXME */
804
805 if (reason & 0x80)
806 mem_parity_error(reason, regs);
807 if (reason & 0x40)
808 io_check_error(reason, regs);
809}
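
The reason byte read from port 0x61 above encodes the NMI source: bit 7 (0x80) reports a memory parity/SERR error and bit 6 (0x40) an I/O check (IOCHK) error, and writing bit 2 or bit 3 back clears and re-enables the corresponding line. A minimal standalone sketch of the same decoding, for illustration only (the port read is faked here, since only the kernel can touch port 0x61):

#include <stdio.h>

/* Illustrative only: decode an NMI status byte the way default_do_nmi() does. */
static void decode_nmi_reason(unsigned char reason)
{
	if (!(reason & 0xc0)) {
		printf("no documented source set: watchdog or unknown NMI\n");
		return;
	}
	if (reason & 0x80)
		printf("memory parity/SERR error (cleared by writing bit 2)\n");
	if (reason & 0x40)
		printf("I/O check error (cleared by writing bit 3)\n");
}

int main(void)
{
	decode_nmi_reason(0x80);	/* parity/SERR error */
	decode_nmi_reason(0x05);	/* neither bit set: watchdog or unknown */
	return 0;
}
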
810
811/* runs on IST stack. */
812asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
813{
814 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
815 return;
816 }
817 preempt_conditional_sti(regs);
818 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
819 preempt_conditional_cli(regs);
820}
821
822/* Help a handler running on the IST stack to switch back to the user stack
823 for scheduling or signal handling. The actual stack switch is done in
824 entry.S */
825asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
826{
827 struct pt_regs *regs = eregs;
828	/* Already synced */
829 if (eregs == (struct pt_regs *)eregs->rsp)
830 ;
831 /* Exception from user space */
832 else if (user_mode(eregs))
833 regs = task_pt_regs(current);
834 /* Exception from kernel and interrupts are enabled. Move to
835 kernel process stack. */
836 else if (eregs->eflags & X86_EFLAGS_IF)
837 regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
838 if (eregs != regs)
839 *regs = *eregs;
840 return regs;
841}
842
843/* runs on IST stack. */
844asmlinkage void __kprobes do_debug(struct pt_regs * regs,
845 unsigned long error_code)
846{
847 unsigned long condition;
848 struct task_struct *tsk = current;
849 siginfo_t info;
850
851 get_debugreg(condition, 6);
852
853 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
854 SIGTRAP) == NOTIFY_STOP)
855 return;
856
857 preempt_conditional_sti(regs);
858
859 /* Mask out spurious debug traps due to lazy DR7 setting */
860 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
861 if (!tsk->thread.debugreg7) {
862 goto clear_dr7;
863 }
864 }
865
866 tsk->thread.debugreg6 = condition;
867
868 /* Mask out spurious TF errors due to lazy TF clearing */
869 if (condition & DR_STEP) {
870 /*
871 * The TF error should be masked out only if the current
872 * process is not traced and if the TRAP flag has been set
873 * previously by a tracing process (condition detected by
874 * the PT_DTRACE flag); remember that the i386 TRAP flag
875 * can be modified by the process itself in user mode,
876 * allowing programs to debug themselves without the ptrace()
877 * interface.
878 */
879 if (!user_mode(regs))
880 goto clear_TF_reenable;
881 /*
882 * Was the TF flag set by a debugger? If so, clear it now,
883 * so that register information is correct.
884 */
885 if (tsk->ptrace & PT_DTRACE) {
886 regs->eflags &= ~TF_MASK;
887 tsk->ptrace &= ~PT_DTRACE;
888 }
889 }
890
891 /* Ok, finally something we can handle */
892 tsk->thread.trap_no = 1;
893 tsk->thread.error_code = error_code;
894 info.si_signo = SIGTRAP;
895 info.si_errno = 0;
896 info.si_code = TRAP_BRKPT;
897 info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
898 force_sig_info(SIGTRAP, &info, tsk);
899
900clear_dr7:
901 set_debugreg(0UL, 7);
902 preempt_conditional_cli(regs);
903 return;
904
905clear_TF_reenable:
906 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
907 regs->eflags &= ~TF_MASK;
908 preempt_conditional_cli(regs);
909}
910
911static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
912{
913 const struct exception_table_entry *fixup;
914 fixup = search_exception_tables(regs->rip);
915 if (fixup) {
916 regs->rip = fixup->fixup;
917 return 1;
918 }
919 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
920 /* Illegal floating point operation in the kernel */
921 current->thread.trap_no = trapnr;
922 die(str, regs, 0);
923 return 0;
924}
925
926/*
927 * Note that we play around with the 'TS' bit in an attempt to get
928 * the correct behaviour even in the presence of the asynchronous
929 * IRQ13 behaviour
930 */
931asmlinkage void do_coprocessor_error(struct pt_regs *regs)
932{
933 void __user *rip = (void __user *)(regs->rip);
934 struct task_struct * task;
935 siginfo_t info;
936 unsigned short cwd, swd;
937
938 conditional_sti(regs);
939 if (!user_mode(regs) &&
940 kernel_math_error(regs, "kernel x87 math error", 16))
941 return;
942
943 /*
944 * Save the info for the exception handler and clear the error.
945 */
946 task = current;
947 save_init_fpu(task);
948 task->thread.trap_no = 16;
949 task->thread.error_code = 0;
950 info.si_signo = SIGFPE;
951 info.si_errno = 0;
952 info.si_code = __SI_FAULT;
953 info.si_addr = rip;
954 /*
955 * (~cwd & swd) will mask out exceptions that are not set to unmasked
956 * status. 0x3f is the exception bits in these regs, 0x200 is the
957 * C1 reg you need in case of a stack fault, 0x040 is the stack
958 * fault bit. We should only be taking one exception at a time,
959 * so if this combination doesn't produce any single exception,
960 * then we have a bad program that isn't synchronizing its FPU usage
961 * and it will suffer the consequences since we won't be able to
962 * fully reproduce the context of the exception
963 */
964 cwd = get_fpu_cwd(task);
965 swd = get_fpu_swd(task);
966 switch (swd & ~cwd & 0x3f) {
967 case 0x000:
968 default:
969 break;
970 case 0x001: /* Invalid Op */
971 /*
972 * swd & 0x240 == 0x040: Stack Underflow
973 * swd & 0x240 == 0x240: Stack Overflow
974 * User must clear the SF bit (0x40) if set
975 */
976 info.si_code = FPE_FLTINV;
977 break;
978 case 0x002: /* Denormalize */
979 case 0x010: /* Underflow */
980 info.si_code = FPE_FLTUND;
981 break;
982 case 0x004: /* Zero Divide */
983 info.si_code = FPE_FLTDIV;
984 break;
985 case 0x008: /* Overflow */
986 info.si_code = FPE_FLTOVF;
987 break;
988 case 0x020: /* Precision */
989 info.si_code = FPE_FLTRES;
990 break;
991 }
992 force_sig_info(SIGFPE, &info, task);
993}
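
In the switch above, (swd & ~cwd & 0x3f) keeps only the exception bits that are both raised in the x87 status word and unmasked in the control word. A small standalone sketch of that decoding; the sample cwd/swd values are invented for illustration:

#include <stdio.h>

/* Map an unmasked x87 exception bit to a short name, as the switch above does. */
static const char *fpu_exception_name(unsigned short cwd, unsigned short swd)
{
	switch (swd & ~cwd & 0x3f) {
	case 0x001: return "invalid op (FPE_FLTINV)";
	case 0x002:
	case 0x010: return "denormal/underflow (FPE_FLTUND)";
	case 0x004: return "zero divide (FPE_FLTDIV)";
	case 0x008: return "overflow (FPE_FLTOVF)";
	case 0x020: return "precision (FPE_FLTRES)";
	default:    return "none/ambiguous";
	}
}

int main(void)
{
	/* cwd = 0x037e leaves only the invalid-op exception (bit 0) unmasked. */
	printf("%s\n", fpu_exception_name(0x037e, 0x0001));
	return 0;
}
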
994
995asmlinkage void bad_intr(void)
996{
997 printk("bad interrupt");
998}
999
1000asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1001{
1002 void __user *rip = (void __user *)(regs->rip);
1003 struct task_struct * task;
1004 siginfo_t info;
1005 unsigned short mxcsr;
1006
1007 conditional_sti(regs);
1008 if (!user_mode(regs) &&
1009 kernel_math_error(regs, "kernel simd math error", 19))
1010 return;
1011
1012 /*
1013 * Save the info for the exception handler and clear the error.
1014 */
1015 task = current;
1016 save_init_fpu(task);
1017 task->thread.trap_no = 19;
1018 task->thread.error_code = 0;
1019 info.si_signo = SIGFPE;
1020 info.si_errno = 0;
1021 info.si_code = __SI_FAULT;
1022 info.si_addr = rip;
1023 /*
1024 * The SIMD FPU exceptions are handled a little differently, as there
1025 * is only a single status/control register. Thus, to determine which
1026 * unmasked exception was caught we must mask the exception mask bits
1027 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1028 */
1029 mxcsr = get_fpu_mxcsr(task);
1030 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1031 case 0x000:
1032 default:
1033 break;
1034 case 0x001: /* Invalid Op */
1035 info.si_code = FPE_FLTINV;
1036 break;
1037 case 0x002: /* Denormalize */
1038 case 0x010: /* Underflow */
1039 info.si_code = FPE_FLTUND;
1040 break;
1041 case 0x004: /* Zero Divide */
1042 info.si_code = FPE_FLTDIV;
1043 break;
1044 case 0x008: /* Overflow */
1045 info.si_code = FPE_FLTOVF;
1046 break;
1047 case 0x020: /* Precision */
1048 info.si_code = FPE_FLTRES;
1049 break;
1050 }
1051 force_sig_info(SIGFPE, &info, task);
1052}
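
MXCSR keeps the SSE exception mask bits at 0x1f80 and the exception flag bits at 0x3f, so shifting the masks right by 7 and inverting them isolates the unmasked, pending exceptions, as the comment above describes. A standalone sketch of just that computation with an invented MXCSR value:

#include <stdio.h>

/* Extract the unmasked, pending SSE exception bits from an MXCSR value. */
static unsigned int simd_unmasked_exceptions(unsigned int mxcsr)
{
	return ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f);
}

int main(void)
{
	/* All exceptions masked except divide-by-zero (mask bit 9 cleared);
	 * the ZE flag (bit 2) is raised. */
	unsigned int mxcsr = (0x1f80 & ~(1u << 9)) | (1u << 2);

	printf("unmasked exceptions: 0x%03x\n", simd_unmasked_exceptions(mxcsr));
	return 0;
}
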
1053
1054asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
1055{
1056}
1057
1058asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
1059{
1060}
1061
1062asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1063{
1064}
1065
1066/*
1067 * 'math_state_restore()' saves the current math information in the
1068 * old math state array, and gets the new ones from the current task
1069 *
1070 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1071 * Don't touch unless you *really* know how it works.
1072 */
1073asmlinkage void math_state_restore(void)
1074{
1075 struct task_struct *me = current;
1076 clts(); /* Allow maths ops (or we recurse) */
1077
1078 if (!used_math())
1079 init_fpu(me);
1080 restore_fpu_checking(&me->thread.i387.fxsave);
1081 task_thread_info(me)->status |= TS_USEDFPU;
1082 me->fpu_counter++;
1083}
1084
1085void __init trap_init(void)
1086{
1087 set_intr_gate(0,&divide_error);
1088 set_intr_gate_ist(1,&debug,DEBUG_STACK);
1089 set_intr_gate_ist(2,&nmi,NMI_STACK);
1090 set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
1091 set_system_gate(4,&overflow); /* int4 can be called from all */
1092 set_intr_gate(5,&bounds);
1093 set_intr_gate(6,&invalid_op);
1094 set_intr_gate(7,&device_not_available);
1095 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
1096 set_intr_gate(9,&coprocessor_segment_overrun);
1097 set_intr_gate(10,&invalid_TSS);
1098 set_intr_gate(11,&segment_not_present);
1099 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
1100 set_intr_gate(13,&general_protection);
1101 set_intr_gate(14,&page_fault);
1102 set_intr_gate(15,&spurious_interrupt_bug);
1103 set_intr_gate(16,&coprocessor_error);
1104 set_intr_gate(17,&alignment_check);
1105#ifdef CONFIG_X86_MCE
1106 set_intr_gate_ist(18,&machine_check, MCE_STACK);
1107#endif
1108 set_intr_gate(19,&simd_coprocessor_error);
1109
1110#ifdef CONFIG_IA32_EMULATION
1111 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1112#endif
1113
1114 /*
1115 * Should be a barrier for any external CPU state.
1116 */
1117 cpu_init();
1118}
1119
1120
1121static int __init oops_setup(char *s)
1122{
1123 if (!s)
1124 return -EINVAL;
1125 if (!strcmp(s, "panic"))
1126 panic_on_oops = 1;
1127 return 0;
1128}
1129early_param("oops", oops_setup);
1130
1131static int __init kstack_setup(char *s)
1132{
1133 if (!s)
1134 return -EINVAL;
1135 kstack_depth_to_print = simple_strtoul(s,NULL,0);
1136 return 0;
1137}
1138early_param("kstack", kstack_setup);
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
new file mode 100644
index 000000000000..3ed0ae8c918d
--- /dev/null
+++ b/arch/x86/kernel/tsc_32.c
@@ -0,0 +1,413 @@
1/*
2 * This code largely moved from arch/i386/kernel/timer/timer_tsc.c
3 * which was originally moved from arch/i386/kernel/time.c.
4 * See comments there for proper credits.
5 */
6
7#include <linux/sched.h>
8#include <linux/clocksource.h>
9#include <linux/workqueue.h>
10#include <linux/cpufreq.h>
11#include <linux/jiffies.h>
12#include <linux/init.h>
13#include <linux/dmi.h>
14
15#include <asm/delay.h>
16#include <asm/tsc.h>
17#include <asm/io.h>
18#include <asm/timer.h>
19
20#include "mach_timer.h"
21
22static int tsc_enabled;
23
24/*
25 * On some systems the TSC frequency does not
26 * change with the cpu frequency. So we need
27 * an extra value to store the TSC freq
28 */
29unsigned int tsc_khz;
30EXPORT_SYMBOL_GPL(tsc_khz);
31
32int tsc_disable;
33
34#ifdef CONFIG_X86_TSC
35static int __init tsc_setup(char *str)
36{
37 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
38 "cannot disable TSC.\n");
39 return 1;
40}
41#else
42/*
43 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
44 * in cpu/common.c
45 */
46static int __init tsc_setup(char *str)
47{
48 tsc_disable = 1;
49
50 return 1;
51}
52#endif
53
54__setup("notsc", tsc_setup);
55
56/*
57 * code to mark and check if the TSC is unstable
58 * due to cpufreq or due to unsynced TSCs
59 */
60static int tsc_unstable;
61
62int check_tsc_unstable(void)
63{
64 return tsc_unstable;
65}
66EXPORT_SYMBOL_GPL(check_tsc_unstable);
67
68/* Accelerators for sched_clock()
69 * convert from cycles(64bits) => nanoseconds (64bits)
70 * basic equation:
71 * ns = cycles / (freq / ns_per_sec)
72 * ns = cycles * (ns_per_sec / freq)
73 * ns = cycles * (10^9 / (cpu_khz * 10^3))
74 * ns = cycles * (10^6 / cpu_khz)
75 *
76 * Then we use scaling math (suggested by george@mvista.com) to get:
77 * ns = cycles * (10^6 * SC / cpu_khz) / SC
78 * ns = cycles * cyc2ns_scale / SC
79 *
80 * And since SC is a constant power of two, we can convert the div
81 * into a shift.
82 *
83 * We can use a khz divisor instead of mhz to keep better precision, since
84 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
85 * (mathieu.desnoyers@polymtl.ca)
86 *
87 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
88 */
89unsigned long cyc2ns_scale __read_mostly;
90
91#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
92
93static inline void set_cyc2ns_scale(unsigned long cpu_khz)
94{
95 cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
96}
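
With CYC2NS_SCALE_FACTOR = 10, the scale factor is (10^6 << 10) / cpu_khz, so a cycle count converts to nanoseconds with one multiply and one shift. A quick standalone check of the arithmetic; the 2 GHz figure is only an example:

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10

int main(void)
{
	unsigned long cpu_khz = 2000000;		/* example: 2 GHz */
	unsigned long scale = (1000000UL << CYC2NS_SCALE_FACTOR) / cpu_khz;
	unsigned long long cycles = 2000000000ULL;	/* one second at 2 GHz */
	unsigned long long ns = (cycles * scale) >> CYC2NS_SCALE_FACTOR;

	/* scale == 512; 2e9 cycles * 512 >> 10 == 1e9 ns, i.e. exactly one second. */
	printf("scale=%lu, ns=%llu\n", scale, ns);
	return 0;
}
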
97
98/*
99 * Scheduler clock - returns current time in nanosec units.
100 */
101unsigned long long native_sched_clock(void)
102{
103 unsigned long long this_offset;
104
105 /*
106 * Fall back to jiffies if there's no TSC available:
107 * ( But note that we still use it if the TSC is marked
108 * unstable. We do this because unlike Time Of Day,
109 * the scheduler clock tolerates small errors and it's
110 * very important for it to be as fast as the platform
111 *   can achieve it. )
112 */
113 if (unlikely(!tsc_enabled && !tsc_unstable))
114 /* No locking but a rare wrong value is not a big deal: */
115 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
116
117 /* read the Time Stamp Counter: */
118 rdtscll(this_offset);
119
120 /* return the value in ns */
121 return cycles_2_ns(this_offset);
122}
123
124/* We need to define a real function for sched_clock, to override the
125 weak default version */
126#ifdef CONFIG_PARAVIRT
127unsigned long long sched_clock(void)
128{
129 return paravirt_sched_clock();
130}
131#else
132unsigned long long sched_clock(void)
133 __attribute__((alias("native_sched_clock")));
134#endif
135
136unsigned long native_calculate_cpu_khz(void)
137{
138 unsigned long long start, end;
139 unsigned long count;
140 u64 delta64;
141 int i;
142 unsigned long flags;
143
144 local_irq_save(flags);
145
146 /* run 3 times to ensure the cache is warm */
147 for (i = 0; i < 3; i++) {
148 mach_prepare_counter();
149 rdtscll(start);
150 mach_countup(&count);
151 rdtscll(end);
152 }
153 /*
154 * Error: ECTCNEVERSET
155 * The CTC wasn't reliable: we got a hit on the very first read,
156 * or the CPU was so fast/slow that the quotient wouldn't fit in
157 * 32 bits..
158 */
159 if (count <= 1)
160 goto err;
161
162 delta64 = end - start;
163
164 /* cpu freq too fast: */
165 if (delta64 > (1ULL<<32))
166 goto err;
167
168 /* cpu freq too slow: */
169 if (delta64 <= CALIBRATE_TIME_MSEC)
170 goto err;
171
172 delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
173 do_div(delta64,CALIBRATE_TIME_MSEC);
174
175 local_irq_restore(flags);
176 return (unsigned long)delta64;
177err:
178 local_irq_restore(flags);
179 return 0;
180}
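
The calibration above counts TSC cycles across a fixed PIT interval of CALIBRATE_TIME_MSEC milliseconds (the constant comes from the included mach_timer.h), so cycles divided by milliseconds is the CPU clock in kHz. A standalone sketch of just that arithmetic; the 30 ms interval and the cycle count are assumed stand-ins:

#include <stdio.h>

/* Assumed stand-in for illustration; the real value lives in mach_timer.h. */
#define CALIBRATE_TIME_MSEC 30

int main(void)
{
	/* Pretend the TSC advanced by this many cycles over the interval. */
	unsigned long long delta_cycles = 72000000ULL;
	unsigned long long khz;

	/* Round, then divide: cycles per millisecond is exactly kHz. */
	khz = (delta_cycles + CALIBRATE_TIME_MSEC / 2) / CALIBRATE_TIME_MSEC;
	printf("calibrated cpu_khz = %llu (%.3f MHz)\n", khz, khz / 1000.0);
	return 0;
}
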
181
182int recalibrate_cpu_khz(void)
183{
184#ifndef CONFIG_SMP
185 unsigned long cpu_khz_old = cpu_khz;
186
187 if (cpu_has_tsc) {
188 cpu_khz = calculate_cpu_khz();
189 tsc_khz = cpu_khz;
190 cpu_data[0].loops_per_jiffy =
191 cpufreq_scale(cpu_data[0].loops_per_jiffy,
192 cpu_khz_old, cpu_khz);
193 return 0;
194 } else
195 return -ENODEV;
196#else
197 return -ENODEV;
198#endif
199}
200
201EXPORT_SYMBOL(recalibrate_cpu_khz);
202
203#ifdef CONFIG_CPU_FREQ
204
205/*
206 * if the CPU frequency is scaled, TSC-based delays will need a different
207 * loops_per_jiffy value to function properly.
208 */
209static unsigned int ref_freq = 0;
210static unsigned long loops_per_jiffy_ref = 0;
211static unsigned long cpu_khz_ref = 0;
212
213static int
214time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
215{
216 struct cpufreq_freqs *freq = data;
217
218 if (!ref_freq) {
219 if (!freq->old){
220 ref_freq = freq->new;
221 return 0;
222 }
223 ref_freq = freq->old;
224 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
225 cpu_khz_ref = cpu_khz;
226 }
227
228 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
229 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
230 (val == CPUFREQ_RESUMECHANGE)) {
231 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
232 cpu_data[freq->cpu].loops_per_jiffy =
233 cpufreq_scale(loops_per_jiffy_ref,
234 ref_freq, freq->new);
235
236 if (cpu_khz) {
237
238 if (num_online_cpus() == 1)
239 cpu_khz = cpufreq_scale(cpu_khz_ref,
240 ref_freq, freq->new);
241 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
242 tsc_khz = cpu_khz;
243 set_cyc2ns_scale(cpu_khz);
244 /*
245 * TSC based sched_clock turns
246 * to junk w/ cpufreq
247 */
248 mark_tsc_unstable("cpufreq changes");
249 }
250 }
251 }
252
253 return 0;
254}
255
256static struct notifier_block time_cpufreq_notifier_block = {
257 .notifier_call = time_cpufreq_notifier
258};
259
260static int __init cpufreq_tsc(void)
261{
262 return cpufreq_register_notifier(&time_cpufreq_notifier_block,
263 CPUFREQ_TRANSITION_NOTIFIER);
264}
265core_initcall(cpufreq_tsc);
266
267#endif
268
269/* clock source code */
270
271static unsigned long current_tsc_khz = 0;
272
273static cycle_t read_tsc(void)
274{
275 cycle_t ret;
276
277 rdtscll(ret);
278
279 return ret;
280}
281
282static struct clocksource clocksource_tsc = {
283 .name = "tsc",
284 .rating = 300,
285 .read = read_tsc,
286 .mask = CLOCKSOURCE_MASK(64),
287 .mult = 0, /* to be set */
288 .shift = 22,
289 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
290 CLOCK_SOURCE_MUST_VERIFY,
291};
292
293void mark_tsc_unstable(char *reason)
294{
295 if (!tsc_unstable) {
296 tsc_unstable = 1;
297 tsc_enabled = 0;
298 printk("Marking TSC unstable due to: %s.\n", reason);
299 /* Can be called before registration */
300 if (clocksource_tsc.mult)
301 clocksource_change_rating(&clocksource_tsc, 0);
302 else
303 clocksource_tsc.rating = 0;
304 }
305}
306EXPORT_SYMBOL_GPL(mark_tsc_unstable);
307
308static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
309{
310 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
311 d->ident);
312 tsc_unstable = 1;
313 return 0;
314}
315
316/* List of systems that have known TSC problems */
317static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
318 {
319 .callback = dmi_mark_tsc_unstable,
320 .ident = "IBM Thinkpad 380XD",
321 .matches = {
322 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
323 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
324 },
325 },
326 {}
327};
328
329/*
330 * Make an educated guess if the TSC is trustworthy and synchronized
331 * over all CPUs.
332 */
333__cpuinit int unsynchronized_tsc(void)
334{
335 if (!cpu_has_tsc || tsc_unstable)
336 return 1;
337 /*
338 * Intel systems are normally all synchronized.
339 * Exceptions must mark TSC as unstable:
340 */
341 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
342 /* assume multi socket systems are not synchronized: */
343 if (num_possible_cpus() > 1)
344 tsc_unstable = 1;
345 }
346 return tsc_unstable;
347}
348
349/*
350 * Geode_LX - the OLPC CPU has a possibly very reliable TSC
351 */
352#ifdef CONFIG_MGEODE_LX
353/* RTSC counts during suspend */
354#define RTSC_SUSP 0x100
355
356static void __init check_geode_tsc_reliable(void)
357{
358 unsigned long val;
359
360 rdmsrl(MSR_GEODE_BUSCONT_CONF0, val);
361 if ((val & RTSC_SUSP))
362 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
363}
364#else
365static inline void check_geode_tsc_reliable(void) { }
366#endif
367
368
369void __init tsc_init(void)
370{
371 if (!cpu_has_tsc || tsc_disable)
372 goto out_no_tsc;
373
374 cpu_khz = calculate_cpu_khz();
375 tsc_khz = cpu_khz;
376
377 if (!cpu_khz)
378 goto out_no_tsc;
379
380 printk("Detected %lu.%03lu MHz processor.\n",
381 (unsigned long)cpu_khz / 1000,
382 (unsigned long)cpu_khz % 1000);
383
384 set_cyc2ns_scale(cpu_khz);
385 use_tsc_delay();
386
387 /* Check and install the TSC clocksource */
388 dmi_check_system(bad_tsc_dmi_table);
389
390 unsynchronized_tsc();
391 check_geode_tsc_reliable();
392 current_tsc_khz = tsc_khz;
393 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
394 clocksource_tsc.shift);
395	/* lower the rating if we already know it's unstable: */
396 if (check_tsc_unstable()) {
397 clocksource_tsc.rating = 0;
398 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
399 } else
400 tsc_enabled = 1;
401
402 clocksource_register(&clocksource_tsc);
403
404 return;
405
406out_no_tsc:
407 /*
408 * Set the tsc_disable flag if there's no TSC support, this
409 * makes it a fast flag for the kernel to see whether it
410 * should be using the TSC.
411 */
412 tsc_disable = 1;
413}
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
new file mode 100644
index 000000000000..2a59bde663f2
--- /dev/null
+++ b/arch/x86/kernel/tsc_64.c
@@ -0,0 +1,207 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/interrupt.h>
4#include <linux/init.h>
5#include <linux/clocksource.h>
6#include <linux/time.h>
7#include <linux/acpi.h>
8#include <linux/cpufreq.h>
9
10#include <asm/timex.h>
11
12static int notsc __initdata = 0;
13
14unsigned int cpu_khz; /* TSC clocks / usec, not used here */
15EXPORT_SYMBOL(cpu_khz);
16unsigned int tsc_khz;
17EXPORT_SYMBOL(tsc_khz);
18
19static unsigned int cyc2ns_scale __read_mostly;
20
21void set_cyc2ns_scale(unsigned long khz)
22{
23 cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
24}
25
26static unsigned long long cycles_2_ns(unsigned long long cyc)
27{
28 return (cyc * cyc2ns_scale) >> NS_SCALE;
29}
30
31unsigned long long sched_clock(void)
32{
33 unsigned long a = 0;
34
35 /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
36 * which means it is not completely exact and may not be monotonic
37 * between CPUs. But the errors should be too small to matter for
38 * scheduling purposes.
39 */
40
41 rdtscll(a);
42 return cycles_2_ns(a);
43}
44
45static int tsc_unstable;
46
47inline int check_tsc_unstable(void)
48{
49 return tsc_unstable;
50}
51#ifdef CONFIG_CPU_FREQ
52
53/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
54 * changes.
55 *
56 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
57 * not that important because current Opteron setups do not support
58 * scaling on SMP anyway.
59 *
60 * Should fix up last_tsc too. Currently gettimeofday in the
61 * first tick after the change will be slightly wrong.
62 */
63
64static unsigned int ref_freq;
65static unsigned long loops_per_jiffy_ref;
66static unsigned long tsc_khz_ref;
67
68static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
69 void *data)
70{
71 struct cpufreq_freqs *freq = data;
72 unsigned long *lpj, dummy;
73
74 if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
75 return 0;
76
77 lpj = &dummy;
78 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
79#ifdef CONFIG_SMP
80 lpj = &cpu_data[freq->cpu].loops_per_jiffy;
81#else
82 lpj = &boot_cpu_data.loops_per_jiffy;
83#endif
84
85 if (!ref_freq) {
86 ref_freq = freq->old;
87 loops_per_jiffy_ref = *lpj;
88 tsc_khz_ref = tsc_khz;
89 }
90 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
91 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
92 (val == CPUFREQ_RESUMECHANGE)) {
93 *lpj =
94 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
95
96 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
97 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
98 mark_tsc_unstable("cpufreq changes");
99 }
100
101 set_cyc2ns_scale(tsc_khz_ref);
102
103 return 0;
104}
105
106static struct notifier_block time_cpufreq_notifier_block = {
107 .notifier_call = time_cpufreq_notifier
108};
109
110static int __init cpufreq_tsc(void)
111{
112 cpufreq_register_notifier(&time_cpufreq_notifier_block,
113 CPUFREQ_TRANSITION_NOTIFIER);
114 return 0;
115}
116
117core_initcall(cpufreq_tsc);
118
119#endif
120
121/*
122 * Make an educated guess if the TSC is trustworthy and synchronized
123 * over all CPUs.
124 */
125__cpuinit int unsynchronized_tsc(void)
126{
127 if (tsc_unstable)
128 return 1;
129
130#ifdef CONFIG_SMP
131 if (apic_is_clustered_box())
132 return 1;
133#endif
134	/* Most Intel systems have synchronized TSCs except for
135	   multi-node systems */
136 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
137#ifdef CONFIG_ACPI
138 /* But TSC doesn't tick in C3 so don't use it there */
139 if (acpi_gbl_FADT.header.length > 0 &&
140 acpi_gbl_FADT.C3latency < 1000)
141 return 1;
142#endif
143 return 0;
144 }
145
146 /* Assume multi socket systems are not synchronized */
147 return num_present_cpus() > 1;
148}
149
150int __init notsc_setup(char *s)
151{
152 notsc = 1;
153 return 1;
154}
155
156__setup("notsc", notsc_setup);
157
158
159/* clock source code: */
160static cycle_t read_tsc(void)
161{
162 cycle_t ret = (cycle_t)get_cycles_sync();
163 return ret;
164}
165
166static cycle_t __vsyscall_fn vread_tsc(void)
167{
168 cycle_t ret = (cycle_t)get_cycles_sync();
169 return ret;
170}
171
172static struct clocksource clocksource_tsc = {
173 .name = "tsc",
174 .rating = 300,
175 .read = read_tsc,
176 .mask = CLOCKSOURCE_MASK(64),
177 .shift = 22,
178 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
179 CLOCK_SOURCE_MUST_VERIFY,
180 .vread = vread_tsc,
181};
182
183void mark_tsc_unstable(char *reason)
184{
185 if (!tsc_unstable) {
186 tsc_unstable = 1;
187 printk("Marking TSC unstable due to %s\n", reason);
188 /* Change only the rating, when not registered */
189 if (clocksource_tsc.mult)
190 clocksource_change_rating(&clocksource_tsc, 0);
191 else
192 clocksource_tsc.rating = 0;
193 }
194}
195EXPORT_SYMBOL_GPL(mark_tsc_unstable);
196
197void __init init_tsc_clocksource(void)
198{
199 if (!notsc) {
200 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
201 clocksource_tsc.shift);
202 if (check_tsc_unstable())
203 clocksource_tsc.rating = 0;
204
205 clocksource_register(&clocksource_tsc);
206 }
207}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
new file mode 100644
index 000000000000..355f5f506c81
--- /dev/null
+++ b/arch/x86/kernel/tsc_sync.c
@@ -0,0 +1,187 @@
1/*
2 * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
3 *
4 * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
5 *
6 * We check whether all boot CPUs have their TSC's synchronized,
7 * print a warning if not and turn off the TSC clock-source.
8 *
9 * The warp-check is point-to-point between two CPUs, the CPU
10 * initiating the bootup is the 'source CPU', the freshly booting
11 * CPU is the 'target CPU'.
12 *
13 * Only two CPUs may participate - they can enter in any order.
14 * ( The serial nature of the boot logic and the CPU hotplug lock
15 * protects against more than 2 CPUs entering this code. )
16 */
17#include <linux/spinlock.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/smp.h>
21#include <linux/nmi.h>
22#include <asm/tsc.h>
23
24/*
25 * Entry/exit counters that make sure that both CPUs
26 * run the measurement code at once:
27 */
28static __cpuinitdata atomic_t start_count;
29static __cpuinitdata atomic_t stop_count;
30
31/*
32 * We use a raw spinlock in this exceptional case, because
33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps:
35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps;
40
41/*
42 * TSC-warp measurement loop running on both CPUs:
43 */
44static __cpuinit void check_tsc_warp(void)
45{
46 cycles_t start, now, prev, end;
47 int i;
48
49 start = get_cycles_sync();
50 /*
51 * The measurement runs for 20 msecs:
52 */
53 end = start + tsc_khz * 20ULL;
54 now = start;
55
56 for (i = 0; ; i++) {
57 /*
58 * We take the global lock, measure TSC, save the
59 * previous TSC that was measured (possibly on
60 * another CPU) and update the previous TSC timestamp.
61 */
62 __raw_spin_lock(&sync_lock);
63 prev = last_tsc;
64 now = get_cycles_sync();
65 last_tsc = now;
66 __raw_spin_unlock(&sync_lock);
67
68 /*
69 * Be nice every now and then (and also check whether
70 * measurement is done [we also insert a 100 million
71		 * loops safety exit, so we don't lock up in case the
72 * TSC readout is totally broken]):
73 */
74 if (unlikely(!(i & 7))) {
75 if (now > end || i > 100000000)
76 break;
77 cpu_relax();
78 touch_nmi_watchdog();
79 }
80 /*
81 * Outside the critical section we can now see whether
82 * we saw a time-warp of the TSC going backwards:
83 */
84 if (unlikely(prev > now)) {
85 __raw_spin_lock(&sync_lock);
86 max_warp = max(max_warp, prev - now);
87 nr_warps++;
88 __raw_spin_unlock(&sync_lock);
89 }
90
91 }
92}
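
check_tsc_warp() serializes TSC reads from both CPUs through a single lock, so any backwards step between two consecutive global readings shows up as prev > now. A rough user-space analogue of the idea using two threads and a mutex (x86 only, via the compiler's __rdtsc(); the warp bookkeeping is done inside the lock here for simplicity, unlike the kernel version):

#include <pthread.h>
#include <stdio.h>
#include <x86intrin.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long last_tsc, max_warp;
static int nr_warps;

static void *warp_check(void *arg)
{
	for (int i = 0; i < 1000000; i++) {
		pthread_mutex_lock(&lock);
		unsigned long long prev = last_tsc;
		unsigned long long now = __rdtsc();
		last_tsc = now;
		if (prev > now) {		/* time went backwards */
			nr_warps++;
			if (prev - now > max_warp)
				max_warp = prev - now;
		}
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, warp_check, NULL);
	pthread_create(&b, NULL, warp_check, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("%d warps, max %llu cycles\n", nr_warps, max_warp);
	return 0;
}

Build with gcc -pthread; on a machine with synchronized TSCs it should report zero warps.
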
93
94/*
95 * Source CPU calls into this - it waits for the freshly booted
96 * target CPU to arrive and then starts the measurement:
97 */
98void __cpuinit check_tsc_sync_source(int cpu)
99{
100 int cpus = 2;
101
102 /*
103 * No need to check if we already know that the TSC is not
104 * synchronized:
105 */
106 if (unsynchronized_tsc())
107 return;
108
109 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
110 smp_processor_id(), cpu);
111
112 /*
113 * Reset it - in case this is a second bootup:
114 */
115 atomic_set(&stop_count, 0);
116
117 /*
118 * Wait for the target to arrive:
119 */
120 while (atomic_read(&start_count) != cpus-1)
121 cpu_relax();
122 /*
123 * Trigger the target to continue into the measurement too:
124 */
125 atomic_inc(&start_count);
126
127 check_tsc_warp();
128
129 while (atomic_read(&stop_count) != cpus-1)
130 cpu_relax();
131
132 /*
133 * Reset it - just in case we boot another CPU later:
134 */
135 atomic_set(&start_count, 0);
136
137 if (nr_warps) {
138 printk("\n");
139 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
140 " turning off TSC clock.\n", max_warp);
141 mark_tsc_unstable("check_tsc_sync_source failed");
142 nr_warps = 0;
143 max_warp = 0;
144 last_tsc = 0;
145 } else {
146 printk(" passed.\n");
147 }
148
149 /*
150 * Let the target continue with the bootup:
151 */
152 atomic_inc(&stop_count);
153}
154
155/*
156 * Freshly booted CPUs call into this:
157 */
158void __cpuinit check_tsc_sync_target(void)
159{
160 int cpus = 2;
161
162 if (unsynchronized_tsc())
163 return;
164
165 /*
166 * Register this CPU's participation and wait for the
167 * source CPU to start the measurement:
168 */
169 atomic_inc(&start_count);
170 while (atomic_read(&start_count) != cpus)
171 cpu_relax();
172
173 check_tsc_warp();
174
175 /*
176 * Ok, we are done:
177 */
178 atomic_inc(&stop_count);
179
180 /*
181 * Wait for the source CPU to print stuff:
182 */
183 while (atomic_read(&stop_count) != cpus)
184 cpu_relax();
185}
186#undef NR_LOOPS
187
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
new file mode 100644
index 000000000000..45b6f8a975a1
--- /dev/null
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -0,0 +1,105 @@
1/*
2 *
3 * verify_cpu.S - Code for cpu long mode and SSE verification. This
4 * code has been borrowed from boot/setup.S and was introduced by
5 * Andi Kleen.
6 *
7 * Copyright (c) 2007 Andi Kleen (ak@suse.de)
8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
10 *
11 * This source code is licensed under the GNU General Public License,
12 * Version 2. See the file COPYING for more details.
13 *
14 * This is common code for verifying whether the CPU supports
15 * long mode and SSE. It is not called directly; instead, this
16 * file is included at various places and compiled in that context.
17 * The current usages are:
18 *
19 * This file is included by both 16bit and 32bit code.
20 *
21 * arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
22 * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
23 *	arch/x86_64/kernel/trampoline.S: secondary processor verification (16bit)
24 *	arch/x86_64/kernel/acpi/wakeup.S: Verification at resume (16bit)
25 *
26 * verify_cpu returns the status of the cpu check in register %eax.
27 * 0: Success 1: Failure
28 *
29 * The caller needs to check the error code and take the appropriate
30 * action: either display a message or halt.
31 */
32
33#include <asm/cpufeature.h>
34
35verify_cpu:
36 pushfl # Save caller passed flags
37 pushl $0 # Kill any dangerous flags
38 popfl
39
40 pushfl # standard way to check for cpuid
41 popl %eax
42 movl %eax,%ebx
43 xorl $0x200000,%eax
44 pushl %eax
45 popfl
46 pushfl
47 popl %eax
48 cmpl %eax,%ebx
49 jz verify_cpu_no_longmode # cpu has no cpuid
50
51 movl $0x0,%eax # See if cpuid 1 is implemented
52 cpuid
53 cmpl $0x1,%eax
54 jb verify_cpu_no_longmode # no cpuid 1
55
56 xor %di,%di
57 cmpl $0x68747541,%ebx # AuthenticAMD
58 jnz verify_cpu_noamd
59 cmpl $0x69746e65,%edx
60 jnz verify_cpu_noamd
61 cmpl $0x444d4163,%ecx
62 jnz verify_cpu_noamd
63 mov $1,%di # cpu is from AMD
64
65verify_cpu_noamd:
66 movl $0x1,%eax # Does the cpu have what it takes
67 cpuid
68 andl $REQUIRED_MASK0,%edx
69 xorl $REQUIRED_MASK0,%edx
70 jnz verify_cpu_no_longmode
71
72 movl $0x80000000,%eax # See if extended cpuid is implemented
73 cpuid
74 cmpl $0x80000001,%eax
75 jb verify_cpu_no_longmode # no extended cpuid
76
77 movl $0x80000001,%eax # Does the cpu have what it takes
78 cpuid
79 andl $REQUIRED_MASK1,%edx
80 xorl $REQUIRED_MASK1,%edx
81 jnz verify_cpu_no_longmode
82
83verify_cpu_sse_test:
84 movl $1,%eax
85 cpuid
86 andl $SSE_MASK,%edx
87 cmpl $SSE_MASK,%edx
88 je verify_cpu_sse_ok
89 test %di,%di
90 jz verify_cpu_no_longmode # only try to force SSE on AMD
91 movl $0xc0010015,%ecx # HWCR
92 rdmsr
93 btr $15,%eax # enable SSE
94 wrmsr
95 xor %di,%di # don't loop
96 jmp verify_cpu_sse_test # try again
97
98verify_cpu_no_longmode:
99 popfl # Restore caller passed flags
100 movl $1,%eax
101 ret
102verify_cpu_sse_ok:
103 popfl # Restore caller passed flags
104 xorl %eax, %eax
105 ret
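
The assembly above first proves that CPUID exists by toggling EFLAGS bit 21, then checks extended leaf 0x80000001 against REQUIRED_MASK1, which for a 64-bit kernel is expected to cover the long-mode bit. A sketch of the extended-leaf check in C using the compiler's <cpuid.h> helper; EDX bit 29 is the LM flag (illustrative only, not the boot-path code):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 0x80000001, EDX bit 29 = long mode (LM). */
	if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
		printf("extended CPUID leaves not available\n");
		return 1;
	}
	printf("long mode %ssupported\n", (edx & (1u << 29)) ? "" : "not ");
	return 0;
}
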
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
new file mode 100644
index 000000000000..f2dcd1d27c0a
--- /dev/null
+++ b/arch/x86/kernel/vm86_32.c
@@ -0,0 +1,843 @@
1/*
2 * linux/kernel/vm86.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86
7 * stack - Manfred Spraul <manfred@colorfullife.com>
8 *
9 * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle
10 * them correctly. Now the emulation will be in a
11 * consistent state after stackfaults - Kasper Dupont
12 * <kasperd@daimi.au.dk>
13 *
14 * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont
15 * <kasperd@daimi.au.dk>
16 *
17 * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault
18 * caused by Kasper Dupont's changes - Stas Sergeev
19 *
20 * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes.
21 * Kasper Dupont <kasperd@daimi.au.dk>
22 *
23 * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault.
24 * Kasper Dupont <kasperd@daimi.au.dk>
25 *
26 * 9 apr 2002 - Changed stack access macros to jump to a label
27 * instead of returning to userspace. This simplifies
28 *              do_int, and is needed by handle_vm86_fault. Kasper
29 * Dupont <kasperd@daimi.au.dk>
30 *
31 */
32
33#include <linux/capability.h>
34#include <linux/errno.h>
35#include <linux/interrupt.h>
36#include <linux/sched.h>
37#include <linux/kernel.h>
38#include <linux/signal.h>
39#include <linux/string.h>
40#include <linux/mm.h>
41#include <linux/smp.h>
42#include <linux/highmem.h>
43#include <linux/ptrace.h>
44#include <linux/audit.h>
45#include <linux/stddef.h>
46
47#include <asm/uaccess.h>
48#include <asm/io.h>
49#include <asm/tlbflush.h>
50#include <asm/irq.h>
51
52/*
53 * Known problems:
54 *
55 * Interrupt handling is not guaranteed:
56 * - a real x86 will disable all interrupts for one instruction
57 * after a "mov ss,xx" to make stack handling atomic even without
58 * the 'lss' instruction. We can't guarantee this in v86 mode,
59 * as the next instruction might result in a page fault or similar.
60 * - a real x86 will have interrupts disabled for one instruction
61 * past the 'sti' that enables them. We don't bother with all the
62 * details yet.
63 *
64 * Let's hope these problems do not actually matter for anything.
65 */
66
67
68#define KVM86 ((struct kernel_vm86_struct *)regs)
69#define VMPI KVM86->vm86plus
70
71
72/*
73 * 8- and 16-bit register defines..
74 */
75#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
76#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
77#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
78#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
79
80/*
81 * virtual flags (16 and 32-bit versions)
82 */
83#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
84#define VEFLAGS (current->thread.v86flags)
85
86#define set_flags(X,new,mask) \
87((X) = ((X) & ~(mask)) | ((new) & (mask)))
88
89#define SAFE_MASK (0xDD5)
90#define RETURN_MASK (0xDFF)
91
92/* convert kernel_vm86_regs to vm86_regs */
93static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
94 const struct kernel_vm86_regs *regs)
95{
96 int ret = 0;
97
98 /* kernel_vm86_regs is missing xgs, so copy everything up to
99	   (but not including) orig_eax, and then the rest, including orig_eax. */
100 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
101 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
102 sizeof(struct kernel_vm86_regs) -
103 offsetof(struct kernel_vm86_regs, pt.orig_eax));
104
105 return ret;
106}
107
108/* convert vm86_regs to kernel_vm86_regs */
109static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
110 const struct vm86_regs __user *user,
111 unsigned extra)
112{
113 int ret = 0;
114
115 /* copy eax-xfs inclusive */
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
117 /* copy orig_eax-__gsh+extra */
118 ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
119 sizeof(struct kernel_vm86_regs) -
120 offsetof(struct kernel_vm86_regs, pt.orig_eax) +
121 extra);
122 return ret;
123}
124
125struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
126struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
127{
128 struct tss_struct *tss;
129 struct pt_regs *ret;
130 unsigned long tmp;
131
132 /*
133 * This gets called from entry.S with interrupts disabled, but
134 * from process context. Enable interrupts here, before trying
135 * to access user space.
136 */
137 local_irq_enable();
138
139 if (!current->thread.vm86_info) {
140 printk("no vm86_info: BAD\n");
141 do_exit(SIGSEGV);
142 }
143 set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
144 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
145 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
146 if (tmp) {
147 printk("vm86: could not access userspace vm86_info\n");
148 do_exit(SIGSEGV);
149 }
150
151 tss = &per_cpu(init_tss, get_cpu());
152 current->thread.esp0 = current->thread.saved_esp0;
153 current->thread.sysenter_cs = __KERNEL_CS;
154 load_esp0(tss, &current->thread);
155 current->thread.saved_esp0 = 0;
156 put_cpu();
157
158 ret = KVM86->regs32;
159
160 ret->xfs = current->thread.saved_fs;
161 loadsegment(gs, current->thread.saved_gs);
162
163 return ret;
164}
165
166static void mark_screen_rdonly(struct mm_struct *mm)
167{
168 pgd_t *pgd;
169 pud_t *pud;
170 pmd_t *pmd;
171 pte_t *pte;
172 spinlock_t *ptl;
173 int i;
174
175 pgd = pgd_offset(mm, 0xA0000);
176 if (pgd_none_or_clear_bad(pgd))
177 goto out;
178 pud = pud_offset(pgd, 0xA0000);
179 if (pud_none_or_clear_bad(pud))
180 goto out;
181 pmd = pmd_offset(pud, 0xA0000);
182 if (pmd_none_or_clear_bad(pmd))
183 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
185 for (i = 0; i < 32; i++) {
186 if (pte_present(*pte))
187 set_pte(pte, pte_wrprotect(*pte));
188 pte++;
189 }
190 pte_unmap_unlock(pte, ptl);
191out:
192 flush_tlb();
193}
194
195
196
197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199
200asmlinkage int sys_vm86old(struct pt_regs regs)
201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx;
203 struct kernel_vm86_struct info; /* declare this _on top_,
204					 * this avoids wasting stack space.
205 * This remains on the stack until we
206 * return to 32 bit user space.
207 */
208 struct task_struct *tsk;
209 int tmp, ret = -EPERM;
210
211 tsk = current;
212 if (tsk->thread.saved_esp0)
213 goto out;
214 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
215 offsetof(struct kernel_vm86_struct, vm86plus) -
216 sizeof(info.regs));
217 ret = -EFAULT;
218 if (tmp)
219 goto out;
220 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
221 info.regs32 = &regs;
222 tsk->thread.vm86_info = v86;
223 do_sys_vm86(&info, tsk);
224 ret = 0; /* we never return here */
225out:
226 return ret;
227}
228
229
230asmlinkage int sys_vm86(struct pt_regs regs)
231{
232 struct kernel_vm86_struct info; /* declare this _on top_,
233					 * this avoids wasting stack space.
234 * This remains on the stack until we
235 * return to 32 bit user space.
236 */
237 struct task_struct *tsk;
238 int tmp, ret;
239 struct vm86plus_struct __user *v86;
240
241 tsk = current;
242 switch (regs.ebx) {
243 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx);
248 goto out;
249 case VM86_PLUS_INSTALL_CHECK:
250 /* NOTE: on old vm86 stuff this will return the error
251 from access_ok(), because the subfunction is
252		   interpreted as an (invalid) address of a vm86_struct.
253 So the installation check works.
254 */
255 ret = 0;
256 goto out;
257 }
258
259 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
260 ret = -EPERM;
261 if (tsk->thread.saved_esp0)
262 goto out;
263 v86 = (struct vm86plus_struct __user *)regs.ecx;
264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
265 offsetof(struct kernel_vm86_struct, regs32) -
266 sizeof(info.regs));
267 ret = -EFAULT;
268 if (tmp)
269 goto out;
270 info.regs32 = &regs;
271 info.vm86plus.is_vm86pus = 1;
272 tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
273 do_sys_vm86(&info, tsk);
274 ret = 0; /* we never return here */
275out:
276 return ret;
277}
278
279
280static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
281{
282 struct tss_struct *tss;
283/*
284 * make sure the vm86() system call doesn't try to do anything silly
285 */
286 info->regs.pt.xds = 0;
287 info->regs.pt.xes = 0;
288 info->regs.pt.xfs = 0;
289
290/* we are clearing gs later just before "jmp resume_userspace",
291 * because it is not saved/restored.
292 */
293
294/*
295 * The eflags register is also special: we cannot trust that the user
295 * has set it up safely, so this makes sure the interrupt and other flags are
297 * inherited from protected mode.
298 */
299 VEFLAGS = info->regs.pt.eflags;
300 info->regs.pt.eflags &= SAFE_MASK;
301 info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
302 info->regs.pt.eflags |= VM_MASK;
303
304 switch (info->cpu_type) {
305 case CPU_286:
306 tsk->thread.v86mask = 0;
307 break;
308 case CPU_386:
309 tsk->thread.v86mask = NT_MASK | IOPL_MASK;
310 break;
311 case CPU_486:
312 tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK;
313 break;
314 default:
315 tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK;
316 break;
317 }
318
319/*
320 * Save old state, set default return value (%eax) to 0
321 */
322 info->regs32->eax = 0;
323 tsk->thread.saved_esp0 = tsk->thread.esp0;
324 tsk->thread.saved_fs = info->regs32->xfs;
325 savesegment(gs, tsk->thread.saved_gs);
326
327 tss = &per_cpu(init_tss, get_cpu());
328 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
329 if (cpu_has_sep)
330 tsk->thread.sysenter_cs = 0;
331 load_esp0(tss, &tsk->thread);
332 put_cpu();
333
334 tsk->thread.screen_bitmap = info->screen_bitmap;
335 if (info->flags & VM86_SCREEN_BITMAP)
336 mark_screen_rdonly(tsk->mm);
337
338 /*call audit_syscall_exit since we do not exit via the normal paths */
339 if (unlikely(current->audit_context))
340 audit_syscall_exit(AUDITSC_RESULT(0), 0);
341
342 __asm__ __volatile__(
343 "movl %0,%%esp\n\t"
344 "movl %1,%%ebp\n\t"
345 "mov %2, %%gs\n\t"
346 "jmp resume_userspace"
347 : /* no outputs */
348 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
349 /* we never return here */
350}
351
352static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
353{
354 struct pt_regs * regs32;
355
356 regs32 = save_v86_state(regs16);
357 regs32->eax = retval;
358 __asm__ __volatile__("movl %0,%%esp\n\t"
359 "movl %1,%%ebp\n\t"
360 "jmp resume_userspace"
361 : : "r" (regs32), "r" (current_thread_info()));
362}
363
364static inline void set_IF(struct kernel_vm86_regs * regs)
365{
366 VEFLAGS |= VIF_MASK;
367 if (VEFLAGS & VIP_MASK)
368 return_to_32bit(regs, VM86_STI);
369}
370
371static inline void clear_IF(struct kernel_vm86_regs * regs)
372{
373 VEFLAGS &= ~VIF_MASK;
374}
375
376static inline void clear_TF(struct kernel_vm86_regs * regs)
377{
378 regs->pt.eflags &= ~TF_MASK;
379}
380
381static inline void clear_AC(struct kernel_vm86_regs * regs)
382{
383 regs->pt.eflags &= ~AC_MASK;
384}
385
386/* It is correct to call set_IF(regs) from the set_vflags_*
387 * functions. However someone forgot to call clear_IF(regs)
388 * in the opposite case.
389 * After the command sequence CLI PUSHF STI POPF you should
390 * end up with interrupts disabled, but you ended up with
391 * interrupts enabled.
392 * ( I was testing my own changes, but the only bug I
393 * could find was in a function I had not changed. )
394 * [KD]
395 */
396
397static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
398{
399 set_flags(VEFLAGS, eflags, current->thread.v86mask);
400 set_flags(regs->pt.eflags, eflags, SAFE_MASK);
401 if (eflags & IF_MASK)
402 set_IF(regs);
403 else
404 clear_IF(regs);
405}
406
407static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
408{
409 set_flags(VFLAGS, flags, current->thread.v86mask);
410 set_flags(regs->pt.eflags, flags, SAFE_MASK);
411 if (flags & IF_MASK)
412 set_IF(regs);
413 else
414 clear_IF(regs);
415}
416
417static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
418{
419 unsigned long flags = regs->pt.eflags & RETURN_MASK;
420
421 if (VEFLAGS & VIF_MASK)
422 flags |= IF_MASK;
423 flags |= IOPL_MASK;
424 return flags | (VEFLAGS & current->thread.v86mask);
425}
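
get_vflags() builds the flags image a vm86 guest sees from the real EFLAGS filtered by RETURN_MASK, the virtual IF bit, a forced IOPL, and whatever bits the cpu_type mask lets the guest own. A standalone sketch of that composition; the architectural EFLAGS bit values are spelled out as defines:

#include <stdio.h>

#define IF_MASK		0x00000200	/* architectural EFLAGS.IF */
#define IOPL_MASK	0x00003000	/* architectural EFLAGS.IOPL */
#define VIF_MASK	0x00080000	/* architectural EFLAGS.VIF */
#define RETURN_MASK	0xDFF		/* same value as the define above */

/* Rebuild the flags image a vm86 guest sees, as get_vflags() does. */
static unsigned long guest_flags(unsigned long eflags, unsigned long veflags,
				 unsigned long v86mask)
{
	unsigned long flags = eflags & RETURN_MASK;

	if (veflags & VIF_MASK)
		flags |= IF_MASK;
	flags |= IOPL_MASK;
	return flags | (veflags & v86mask);
}

int main(void)
{
	/* Virtual IF set: the guest sees IF=1 and IOPL=3 regardless of real EFLAGS. */
	printf("0x%lx\n", guest_flags(0x00000002, VIF_MASK, 0));
	return 0;
}
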
426
427static inline int is_revectored(int nr, struct revectored_struct * bitmap)
428{
429 __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
430 :"=r" (nr)
431 :"m" (*bitmap),"r" (nr));
432 return nr;
433}
434
435#define val_byte(val, n) (((__u8 *)&val)[n])
436
437#define pushb(base, ptr, val, err_label) \
438 do { \
439 __u8 __val = val; \
440 ptr--; \
441 if (put_user(__val, base + ptr) < 0) \
442 goto err_label; \
443 } while(0)
444
445#define pushw(base, ptr, val, err_label) \
446 do { \
447 __u16 __val = val; \
448 ptr--; \
449 if (put_user(val_byte(__val, 1), base + ptr) < 0) \
450 goto err_label; \
451 ptr--; \
452 if (put_user(val_byte(__val, 0), base + ptr) < 0) \
453 goto err_label; \
454 } while(0)
455
456#define pushl(base, ptr, val, err_label) \
457 do { \
458 __u32 __val = val; \
459 ptr--; \
460 if (put_user(val_byte(__val, 3), base + ptr) < 0) \
461 goto err_label; \
462 ptr--; \
463 if (put_user(val_byte(__val, 2), base + ptr) < 0) \
464 goto err_label; \
465 ptr--; \
466 if (put_user(val_byte(__val, 1), base + ptr) < 0) \
467 goto err_label; \
468 ptr--; \
469 if (put_user(val_byte(__val, 0), base + ptr) < 0) \
470 goto err_label; \
471 } while(0)
472
473#define popb(base, ptr, err_label) \
474 ({ \
475 __u8 __res; \
476 if (get_user(__res, base + ptr) < 0) \
477 goto err_label; \
478 ptr++; \
479 __res; \
480 })
481
482#define popw(base, ptr, err_label) \
483 ({ \
484 __u16 __res; \
485 if (get_user(val_byte(__res, 0), base + ptr) < 0) \
486 goto err_label; \
487 ptr++; \
488 if (get_user(val_byte(__res, 1), base + ptr) < 0) \
489 goto err_label; \
490 ptr++; \
491 __res; \
492 })
493
494#define popl(base, ptr, err_label) \
495 ({ \
496 __u32 __res; \
497 if (get_user(val_byte(__res, 0), base + ptr) < 0) \
498 goto err_label; \
499 ptr++; \
500 if (get_user(val_byte(__res, 1), base + ptr) < 0) \
501 goto err_label; \
502 ptr++; \
503 if (get_user(val_byte(__res, 2), base + ptr) < 0) \
504 goto err_label; \
505 ptr++; \
506 if (get_user(val_byte(__res, 3), base + ptr) < 0) \
507 goto err_label; \
508 ptr++; \
509 __res; \
510 })
511
512/* There are so many possible reasons for this function to return
513 * VM86_INTx, so adding another doesn't bother me. We can expect
514 * userspace programs to be able to handle it. (Getting a problem
515 * in userspace is always better than an Oops anyway.) [KD]
516 */
517static void do_int(struct kernel_vm86_regs *regs, int i,
518 unsigned char __user * ssp, unsigned short sp)
519{
520 unsigned long __user *intr_ptr;
521 unsigned long segoffs;
522
523 if (regs->pt.xcs == BIOSSEG)
524 goto cannot_handle;
525 if (is_revectored(i, &KVM86->int_revectored))
526 goto cannot_handle;
527 if (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored))
528 goto cannot_handle;
529 intr_ptr = (unsigned long __user *) (i << 2);
530 if (get_user(segoffs, intr_ptr))
531 goto cannot_handle;
532 if ((segoffs >> 16) == BIOSSEG)
533 goto cannot_handle;
534 pushw(ssp, sp, get_vflags(regs), cannot_handle);
535 pushw(ssp, sp, regs->pt.xcs, cannot_handle);
536 pushw(ssp, sp, IP(regs), cannot_handle);
537 regs->pt.xcs = segoffs >> 16;
538 SP(regs) -= 6;
539 IP(regs) = segoffs & 0xffff;
540 clear_TF(regs);
541 clear_IF(regs);
542 clear_AC(regs);
543 return;
544
545cannot_handle:
546 return_to_32bit(regs, VM86_INTx + (i << 8));
547}
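
do_int() emulates a real-mode software interrupt: the handler's CS:IP pair sits in the interrupt vector table at linear address vector*4, FLAGS/CS/IP are pushed onto the guest stack, and execution resumes at the new CS:IP. A standalone sketch of the address arithmetic only; the segment and offset values are invented:

#include <stdio.h>

int main(void)
{
	unsigned int vector = 0x21;			/* DOS services, as an example */
	unsigned long ivt_entry = vector << 2;		/* IVT slot: 4 bytes per vector */
	unsigned long segoffs = 0x08001234;		/* pretend the slot held 0800:1234 */
	unsigned int cs = segoffs >> 16;
	unsigned int ip = segoffs & 0xffff;

	/* Real-mode linear address of the handler: segment*16 + offset. */
	printf("IVT slot at 0x%03lx -> handler %04x:%04x (linear 0x%05x)\n",
	       ivt_entry, cs, ip, (cs << 4) + ip);
	return 0;
}
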
548
549int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno)
550{
551 if (VMPI.is_vm86pus) {
552 if ( (trapno==3) || (trapno==1) )
553 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
554 do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
555 return 0;
556 }
557 if (trapno !=1)
558		return 1; /* we let this be handled by the calling routine */
559 if (current->ptrace & PT_PTRACED) {
560 unsigned long flags;
561 spin_lock_irqsave(&current->sighand->siglock, flags);
562 sigdelset(&current->blocked, SIGTRAP);
563 recalc_sigpending();
564 spin_unlock_irqrestore(&current->sighand->siglock, flags);
565 }
566 send_sig(SIGTRAP, current, 1);
567 current->thread.trap_no = trapno;
568 current->thread.error_code = error_code;
569 return 0;
570}
571
572void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
573{
574 unsigned char opcode;
575 unsigned char __user *csp;
576 unsigned char __user *ssp;
577 unsigned short ip, sp, orig_flags;
578 int data32, pref_done;
579
580#define CHECK_IF_IN_TRAP \
581 if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
582 newflags |= TF_MASK
583#define VM86_FAULT_RETURN do { \
584 if (VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \
585 return_to_32bit(regs, VM86_PICRETURN); \
586 if (orig_flags & TF_MASK) \
587 handle_vm86_trap(regs, 0, 1); \
588 return; } while (0)
589
590 orig_flags = *(unsigned short *)&regs->pt.eflags;
591
592 csp = (unsigned char __user *) (regs->pt.xcs << 4);
593 ssp = (unsigned char __user *) (regs->pt.xss << 4);
594 sp = SP(regs);
595 ip = IP(regs);
596
597 data32 = 0;
598 pref_done = 0;
599 do {
600 switch (opcode = popb(csp, ip, simulate_sigsegv)) {
601 case 0x66: /* 32-bit data */ data32=1; break;
602 case 0x67: /* 32-bit address */ break;
603 case 0x2e: /* CS */ break;
604 case 0x3e: /* DS */ break;
605 case 0x26: /* ES */ break;
606 case 0x36: /* SS */ break;
607 case 0x65: /* GS */ break;
608 case 0x64: /* FS */ break;
609 case 0xf2: /* repnz */ break;
610 case 0xf3: /* rep */ break;
611 default: pref_done = 1;
612 }
613 } while (!pref_done);
614
615 switch (opcode) {
616
617 /* pushf */
618 case 0x9c:
619 if (data32) {
620 pushl(ssp, sp, get_vflags(regs), simulate_sigsegv);
621 SP(regs) -= 4;
622 } else {
623 pushw(ssp, sp, get_vflags(regs), simulate_sigsegv);
624 SP(regs) -= 2;
625 }
626 IP(regs) = ip;
627 VM86_FAULT_RETURN;
628
629 /* popf */
630 case 0x9d:
631 {
632 unsigned long newflags;
633 if (data32) {
634 newflags=popl(ssp, sp, simulate_sigsegv);
635 SP(regs) += 4;
636 } else {
637 newflags = popw(ssp, sp, simulate_sigsegv);
638 SP(regs) += 2;
639 }
640 IP(regs) = ip;
641 CHECK_IF_IN_TRAP;
642 if (data32) {
643 set_vflags_long(newflags, regs);
644 } else {
645 set_vflags_short(newflags, regs);
646 }
647 VM86_FAULT_RETURN;
648 }
649
650 /* int xx */
651 case 0xcd: {
652 int intno=popb(csp, ip, simulate_sigsegv);
653 IP(regs) = ip;
654 if (VMPI.vm86dbg_active) {
655 if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] )
656 return_to_32bit(regs, VM86_INTx + (intno << 8));
657 }
658 do_int(regs, intno, ssp, sp);
659 return;
660 }
661
662 /* iret */
663 case 0xcf:
664 {
665 unsigned long newip;
666 unsigned long newcs;
667 unsigned long newflags;
668 if (data32) {
669 newip=popl(ssp, sp, simulate_sigsegv);
670 newcs=popl(ssp, sp, simulate_sigsegv);
671 newflags=popl(ssp, sp, simulate_sigsegv);
672 SP(regs) += 12;
673 } else {
674 newip = popw(ssp, sp, simulate_sigsegv);
675 newcs = popw(ssp, sp, simulate_sigsegv);
676 newflags = popw(ssp, sp, simulate_sigsegv);
677 SP(regs) += 6;
678 }
679 IP(regs) = newip;
680 regs->pt.xcs = newcs;
681 CHECK_IF_IN_TRAP;
682 if (data32) {
683 set_vflags_long(newflags, regs);
684 } else {
685 set_vflags_short(newflags, regs);
686 }
687 VM86_FAULT_RETURN;
688 }
689
690 /* cli */
691 case 0xfa:
692 IP(regs) = ip;
693 clear_IF(regs);
694 VM86_FAULT_RETURN;
695
696 /* sti */
697 /*
698 * Damn. This is incorrect: the 'sti' instruction should actually
699 * enable interrupts after the /next/ instruction. Not good.
700 *
701 * Probably needs some horsing around with the TF flag. Aiee..
702 */
703 case 0xfb:
704 IP(regs) = ip;
705 set_IF(regs);
706 VM86_FAULT_RETURN;
707
708 default:
709 return_to_32bit(regs, VM86_UNKNOWN);
710 }
711
712 return;
713
714simulate_sigsegv:
715 /* FIXME: After a long discussion with Stas we finally
716	 * agreed that this is wrong. Here we should
717 * really send a SIGSEGV to the user program.
718 * But how do we create the correct context? We
719 * are inside a general protection fault handler
720	 * and have just returned from a page fault handler.
721 * The correct context for the signal handler
722 * should be a mixture of the two, but how do we
723 * get the information? [KD]
724 */
725 return_to_32bit(regs, VM86_UNKNOWN);
726}
727
728/* ---------------- vm86 special IRQ passing stuff ----------------- */
729
730#define VM86_IRQNAME "vm86irq"
731
732static struct vm86_irqs {
733 struct task_struct *tsk;
734 int sig;
735} vm86_irqs[16];
736
737static DEFINE_SPINLOCK(irqbits_lock);
738static int irqbits;
739
740#define ALLOWED_SIGS ( 1 /* 0 = don't send a signal */ \
741 | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \
742 | (1 << SIGUNUSED) )
743
744static irqreturn_t irq_handler(int intno, void *dev_id)
745{
746 int irq_bit;
747 unsigned long flags;
748
749 spin_lock_irqsave(&irqbits_lock, flags);
750 irq_bit = 1 << intno;
751 if ((irqbits & irq_bit) || ! vm86_irqs[intno].tsk)
752 goto out;
753 irqbits |= irq_bit;
754 if (vm86_irqs[intno].sig)
755 send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1);
756 /*
757 * IRQ will be re-enabled when user asks for the irq (whether
758 * polling or as a result of the signal)
759 */
760 disable_irq_nosync(intno);
761 spin_unlock_irqrestore(&irqbits_lock, flags);
762 return IRQ_HANDLED;
763
764out:
765 spin_unlock_irqrestore(&irqbits_lock, flags);
766 return IRQ_NONE;
767}
768
769static inline void free_vm86_irq(int irqnumber)
770{
771 unsigned long flags;
772
773 free_irq(irqnumber, NULL);
774 vm86_irqs[irqnumber].tsk = NULL;
775
776 spin_lock_irqsave(&irqbits_lock, flags);
777 irqbits &= ~(1 << irqnumber);
778 spin_unlock_irqrestore(&irqbits_lock, flags);
779}
780
781void release_vm86_irqs(struct task_struct *task)
782{
783 int i;
784 for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++)
785 if (vm86_irqs[i].tsk == task)
786 free_vm86_irq(i);
787}
788
789static inline int get_and_reset_irq(int irqnumber)
790{
791 int bit;
792 unsigned long flags;
793 int ret = 0;
794
795 if (invalid_vm86_irq(irqnumber)) return 0;
796 if (vm86_irqs[irqnumber].tsk != current) return 0;
797 spin_lock_irqsave(&irqbits_lock, flags);
798 bit = irqbits & (1 << irqnumber);
799 irqbits &= ~bit;
800 if (bit) {
801 enable_irq(irqnumber);
802 ret = 1;
803 }
804
805 spin_unlock_irqrestore(&irqbits_lock, flags);
806 return ret;
807}
808
809
810static int do_vm86_irq_handling(int subfunction, int irqnumber)
811{
812 int ret;
813 switch (subfunction) {
814 case VM86_GET_AND_RESET_IRQ: {
815 return get_and_reset_irq(irqnumber);
816 }
817 case VM86_GET_IRQ_BITS: {
818 return irqbits;
819 }
820 case VM86_REQUEST_IRQ: {
821 int sig = irqnumber >> 8;
822 int irq = irqnumber & 255;
823 if (!capable(CAP_SYS_ADMIN)) return -EPERM;
824 if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM;
825 if (invalid_vm86_irq(irq)) return -EPERM;
826 if (vm86_irqs[irq].tsk) return -EPERM;
827 ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL);
828 if (ret) return ret;
829 vm86_irqs[irq].sig = sig;
830 vm86_irqs[irq].tsk = current;
831 return irq;
832 }
833 case VM86_FREE_IRQ: {
834 if (invalid_vm86_irq(irqnumber)) return -EPERM;
835 if (!vm86_irqs[irqnumber].tsk) return 0;
836 if (vm86_irqs[irqnumber].tsk != current) return -EPERM;
837 free_vm86_irq(irqnumber);
838 return 0;
839 }
840 }
841 return -EINVAL;
842}
843
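The VM86_REQUEST_IRQ case above decodes its single argument as sig = irqnumber >> 8 and irq = irqnumber & 255, so a caller has to pack both values into one integer. A minimal standalone sketch of that packing (a hypothetical user-side helper, not part of this patch; the signal must be in ALLOWED_SIGS and the IRQ must be a valid vm86 IRQ):

#include <stdio.h>
#include <signal.h>

/* Pack a signal number and an IRQ line into the encoding that
 * do_vm86_irq_handling(VM86_REQUEST_IRQ, ...) expects:
 * signal in the high byte, IRQ in the low byte. */
static unsigned int pack_vm86_irq_request(int sig, int irq)
{
	return ((unsigned int)sig << 8) | ((unsigned int)irq & 0xff);
}

int main(void)
{
	/* e.g. request SIGUSR1 delivery for IRQ 5 */
	printf("packed value: 0x%x\n", pack_vm86_irq_request(SIGUSR1, 5));
	return 0;
}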
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
new file mode 100644
index 000000000000..18673e0f193b
--- /dev/null
+++ b/arch/x86/kernel/vmi_32.c
@@ -0,0 +1,981 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/bootmem.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/sched.h>
31#include <asm/vmi.h>
32#include <asm/io.h>
33#include <asm/fixmap.h>
34#include <asm/apicdef.h>
35#include <asm/apic.h>
36#include <asm/processor.h>
37#include <asm/timer.h>
38#include <asm/vmi_time.h>
39#include <asm/kmap_types.h>
40
41/* Convenient for calling VMI functions indirectly in the ROM */
42typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
43typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
44
45#define call_vrom_func(rom,func) \
46 (((VROMFUNC *)(rom->func))())
47
48#define call_vrom_long_func(rom,func,arg) \
49 (((VROMLONGFUNC *)(rom->func)) (arg))
50
51static struct vrom_header *vmi_rom;
52static int disable_pge;
53static int disable_pse;
54static int disable_sep;
55static int disable_tsc;
56static int disable_mtrr;
57static int disable_noidle;
58static int disable_vmi_timer;
59
60/* Cached VMI operations */
61static struct {
62 void (*cpuid)(void /* non-c */);
63 void (*_set_ldt)(u32 selector);
64 void (*set_tr)(u32 selector);
65 void (*set_kernel_stack)(u32 selector, u32 esp0);
66 void (*allocate_page)(u32, u32, u32, u32, u32);
67 void (*release_page)(u32, u32);
68 void (*set_pte)(pte_t, pte_t *, unsigned);
69 void (*update_pte)(pte_t *, unsigned);
70 void (*set_linear_mapping)(int, void *, u32, u32);
71 void (*_flush_tlb)(int);
72 void (*set_initial_ap_state)(int, int);
73 void (*halt)(void);
74 void (*set_lazy_mode)(int mode);
75} vmi_ops;
76
77/* Cached VMI operations */
78struct vmi_timer_ops vmi_timer_ops;
79
80/*
81 * VMI patching routines.
82 */
83#define MNEM_CALL 0xe8
84#define MNEM_JMP 0xe9
85#define MNEM_RET 0xc3
86
87#define IRQ_PATCH_INT_MASK 0
88#define IRQ_PATCH_DISABLE 5
89
90static inline void patch_offset(void *insnbuf,
91 unsigned long eip, unsigned long dest)
92{
93 *(unsigned long *)(insnbuf+1) = dest-eip-5;
94}
95
96static unsigned patch_internal(int call, unsigned len, void *insnbuf,
97 unsigned long eip)
98{
99 u64 reloc;
100 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
101 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
102 switch(rel->type) {
103 case VMI_RELOCATION_CALL_REL:
104 BUG_ON(len < 5);
105 *(char *)insnbuf = MNEM_CALL;
106 patch_offset(insnbuf, eip, (unsigned long)rel->eip);
107 return 5;
108
109 case VMI_RELOCATION_JUMP_REL:
110 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_JMP;
112 patch_offset(insnbuf, eip, (unsigned long)rel->eip);
113 return 5;
114
115 case VMI_RELOCATION_NOP:
116 /* obliterate the whole thing */
117 return 0;
118
119 case VMI_RELOCATION_NONE:
120 /* leave native code in place */
121 break;
122
123 default:
124 BUG();
125 }
126 return len;
127}
128
129/*
130 * Apply patch if appropriate, return length of new instruction
131 * sequence. The callee does nop padding for us.
132 */
133static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
134 unsigned long eip, unsigned len)
135{
136 switch (type) {
137 case PARAVIRT_PATCH(irq_disable):
138 return patch_internal(VMI_CALL_DisableInterrupts, len,
139 insns, eip);
140 case PARAVIRT_PATCH(irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len,
142 insns, eip);
143 case PARAVIRT_PATCH(restore_fl):
144 return patch_internal(VMI_CALL_SetInterruptMask, len,
145 insns, eip);
146 case PARAVIRT_PATCH(save_fl):
147 return patch_internal(VMI_CALL_GetInterruptMask, len,
148 insns, eip);
149 case PARAVIRT_PATCH(iret):
150 return patch_internal(VMI_CALL_IRET, len, insns, eip);
151 case PARAVIRT_PATCH(irq_enable_sysexit):
152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
153 default:
154 break;
155 }
156 return len;
157}
158
159/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
160static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
161 unsigned int *ecx, unsigned int *edx)
162{
163 int override = 0;
164 if (*eax == 1)
165 override = 1;
166 asm volatile ("call *%6"
167 : "=a" (*eax),
168 "=b" (*ebx),
169 "=c" (*ecx),
170 "=d" (*edx)
171 : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
172 if (override) {
173 if (disable_pse)
174 *edx &= ~X86_FEATURE_PSE;
175 if (disable_pge)
176 *edx &= ~X86_FEATURE_PGE;
177 if (disable_sep)
178 *edx &= ~X86_FEATURE_SEP;
179 if (disable_tsc)
180 *edx &= ~X86_FEATURE_TSC;
181 if (disable_mtrr)
182 *edx &= ~X86_FEATURE_MTRR;
183 }
184}
185
186static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
187{
188 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
189 write_gdt_entry(gdt, nr, new->a, new->b);
190}
191
192static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
193{
194 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
195 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
196 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
197 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
198}
199
200static void vmi_set_ldt(const void *addr, unsigned entries)
201{
202 unsigned cpu = smp_processor_id();
203 u32 low, high;
204
205 pack_descriptor(&low, &high, (unsigned long)addr,
206 entries * sizeof(struct desc_struct) - 1,
207 DESCTYPE_LDT, 0);
208 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
209 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
210}
211
212static void vmi_set_tr(void)
213{
214 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
215}
216
217static void vmi_load_esp0(struct tss_struct *tss,
218 struct thread_struct *thread)
219{
220 tss->x86_tss.esp0 = thread->esp0;
221
222 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
223 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
224 tss->x86_tss.ss1 = thread->sysenter_cs;
225 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
226 }
227 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
228}
229
230static void vmi_flush_tlb_user(void)
231{
232 vmi_ops._flush_tlb(VMI_FLUSH_TLB);
233}
234
235static void vmi_flush_tlb_kernel(void)
236{
237 vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
238}
239
240/* Stub to do nothing at all; used for delays and unimplemented calls */
241static void vmi_nop(void)
242{
243}
244
245#ifdef CONFIG_DEBUG_PAGE_TYPE
246
247#ifdef CONFIG_X86_PAE
248#define MAX_BOOT_PTS (2048+4+1)
249#else
250#define MAX_BOOT_PTS (1024+1)
251#endif
252
253/*
254 * During boot, mem_map is not yet available in paging_init, so stash
255 * all the boot page allocations here.
256 */
257static struct {
258 u32 pfn;
259 int type;
260} boot_page_allocations[MAX_BOOT_PTS];
261static int num_boot_page_allocations;
262static int boot_allocations_applied;
263
264void vmi_apply_boot_page_allocations(void)
265{
266 int i;
267 BUG_ON(!mem_map);
268 for (i = 0; i < num_boot_page_allocations; i++) {
269 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
270 page->type = boot_page_allocations[i].type;
271 page->type = boot_page_allocations[i].type &
272 ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
273 }
274 boot_allocations_applied = 1;
275}
276
277static void record_page_type(u32 pfn, int type)
278{
279 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
280 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
281 boot_page_allocations[num_boot_page_allocations].type = type;
282 num_boot_page_allocations++;
283}
284
285static void check_zeroed_page(u32 pfn, int type, struct page *page)
286{
287 u32 *ptr;
288 int i;
289 int limit = PAGE_SIZE / sizeof(int);
290
291 if (page_address(page))
292 ptr = (u32 *)page_address(page);
293 else
294 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
295 /*
296 * When cloning the root in non-PAE mode, only the userspace
297 * pdes need to be zeroed.
298 */
299 if (type & VMI_PAGE_CLONE)
300 limit = USER_PTRS_PER_PGD;
301 for (i = 0; i < limit; i++)
302 BUG_ON(ptr[i]);
303}
304
305/*
306 * We stash the page type into struct page so we can verify the page
307 * types are used properly.
308 */
309static void vmi_set_page_type(u32 pfn, int type)
310{
311 /* PAE can have multiple roots per page - don't track */
312 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
313 return;
314
315 if (boot_allocations_applied) {
316 struct page *page = pfn_to_page(pfn);
317 if (type != VMI_PAGE_NORMAL)
318 BUG_ON(page->type);
319 else
320 BUG_ON(page->type == VMI_PAGE_NORMAL);
321 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
322 if (type & VMI_PAGE_ZEROED)
323 check_zeroed_page(pfn, type, page);
324 } else {
325 record_page_type(pfn, type);
326 }
327}
328
329static void vmi_check_page_type(u32 pfn, int type)
330{
331 /* PAE can have multiple roots per page - skip checks */
332 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
333 return;
334
335 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
336 if (boot_allocations_applied) {
337 struct page *page = pfn_to_page(pfn);
338 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
339 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
340 BUG_ON((type & page->type) == 0);
341 }
342}
343#else
344#define vmi_set_page_type(p,t) do { } while (0)
345#define vmi_check_page_type(p,t) do { } while (0)
346#endif
347
348#ifdef CONFIG_HIGHPTE
349static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
350{
351 void *va = kmap_atomic(page, type);
352
353 /*
354 * Internally, the VMI ROM must map virtual addresses to physical
355 * addresses for processing MMU updates. By the time MMU updates
356 * are issued, this information is typically already lost.
357 * Fortunately, the VMI provides a cache of mapping slots for active
358 * page tables.
359 *
360 * We use slot zero for the linear mapping of physical memory, and
361 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
362 *
363 * args: SLOT VA COUNT PFN
364 */
365 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
366 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
367
368 return va;
369}
370#endif
371
372static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
373{
374 vmi_set_page_type(pfn, VMI_PAGE_L1);
375 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
376}
377
378static void vmi_allocate_pd(u32 pfn)
379{
380 /*
381 * This call comes in very early, before mem_map is setup.
382 * It is called only for swapper_pg_dir, which already has
383 * data on it.
384 */
385 vmi_set_page_type(pfn, VMI_PAGE_L2);
386 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
387}
388
389static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
390{
391 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
392 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
393 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
394}
395
396static void vmi_release_pt(u32 pfn)
397{
398 vmi_ops.release_page(pfn, VMI_PAGE_L1);
399 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
400}
401
402static void vmi_release_pd(u32 pfn)
403{
404 vmi_ops.release_page(pfn, VMI_PAGE_L2);
405 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
406}
407
408/*
409 * Helper macros for MMU update flags. We can defer updates until a flush
410 * or page invalidation only if the update is to the current address space
411 * (otherwise, there is no flush). We must check against init_mm, since
412 * this could be a kernel update, which usually passes init_mm, although
413 * sometimes this check can be skipped if we know the particular function
414 * is only called on user mode PTEs. We could change the kernel to pass
415 * current->active_mm here, but in particular, I was unsure if changing
416 * mm/highmem.c to do this would still be correct on other architectures.
417 */
418#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
419 (!mustbeuser && (mm) == &init_mm))
420#define vmi_flags_addr(mm, addr, level, user) \
421 ((level) | (is_current_as(mm, user) ? \
422 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
423#define vmi_flags_addr_defer(mm, addr, level, user) \
424 ((level) | (is_current_as(mm, user) ? \
425 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
426
427static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
428{
429 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
430 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
431}
432
433static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
434{
435 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
436 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
437}
438
439static void vmi_set_pte(pte_t *ptep, pte_t pte)
440{
441 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
442 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
443 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
444}
445
446static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
447{
448 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
449 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
450}
451
452static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
453{
454#ifdef CONFIG_X86_PAE
455 const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
456 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
457#else
458 const pte_t pte = { pmdval.pud.pgd.pgd };
459 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
460#endif
461 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
462}
463
464#ifdef CONFIG_X86_PAE
465
466static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
467{
468 /*
469 * XXX This is called from set_pmd_pte, but at both PT
470 * and PD layers so the VMI_PAGE_PT flag is wrong. But
471 * it is only called for large page mapping changes,
472 * the Xen backend, doesn't support large pages, and the
473 * ESX backend doesn't depend on the flag.
474 */
475 set_64bit((unsigned long long *)ptep,pte_val(pteval));
476 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
477}
478
479static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
480{
481 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
482 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
483}
484
485static void vmi_set_pud(pud_t *pudp, pud_t pudval)
486{
487 /* Um, eww */
488 const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
489 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
490 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
491}
492
493static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
494{
495 const pte_t pte = { 0 };
496 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
497 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
498}
499
500static void vmi_pmd_clear(pmd_t *pmd)
501{
502 const pte_t pte = { 0 };
503 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
504 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
505}
506#endif
507
508#ifdef CONFIG_SMP
509static void __devinit
510vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
511 unsigned long start_esp)
512{
513 struct vmi_ap_state ap;
514
515 /* Default everything to zero. This is fine for most GPRs. */
516 memset(&ap, 0, sizeof(struct vmi_ap_state));
517
518 ap.gdtr_limit = GDT_SIZE - 1;
519 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
520
521 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
522 ap.idtr_base = (unsigned long) idt_table;
523
524 ap.ldtr = 0;
525
526 ap.cs = __KERNEL_CS;
527 ap.eip = (unsigned long) start_eip;
528 ap.ss = __KERNEL_DS;
529 ap.esp = (unsigned long) start_esp;
530
531 ap.ds = __USER_DS;
532 ap.es = __USER_DS;
533 ap.fs = __KERNEL_PERCPU;
534 ap.gs = 0;
535
536 ap.eflags = 0;
537
538#ifdef CONFIG_X86_PAE
539 /* efer should match BSP efer. */
540 if (cpu_has_nx) {
541 unsigned l, h;
542 rdmsr(MSR_EFER, l, h);
543 ap.efer = (unsigned long long) h << 32 | l;
544 }
545#endif
546
547 ap.cr3 = __pa(swapper_pg_dir);
548 /* Protected mode, paging, AM, WP, NE, MP. */
549 ap.cr0 = 0x80050023;
550 ap.cr4 = mmu_cr4_features;
551 vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
552}
553#endif
554
555static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
556{
557 static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
558
559 if (!vmi_ops.set_lazy_mode)
560 return;
561
562 /* Modes should never nest or overlap */
563 BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE ||
564 mode == PARAVIRT_LAZY_FLUSH));
565
566 if (mode == PARAVIRT_LAZY_FLUSH) {
567 vmi_ops.set_lazy_mode(0);
568 vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode));
569 } else {
570 vmi_ops.set_lazy_mode(mode);
571 __get_cpu_var(lazy_mode) = mode;
572 }
573}
574
575static inline int __init check_vmi_rom(struct vrom_header *rom)
576{
577 struct pci_header *pci;
578 struct pnp_header *pnp;
579 const char *manufacturer = "UNKNOWN";
580 const char *product = "UNKNOWN";
581 const char *license = "unspecified";
582
583 if (rom->rom_signature != 0xaa55)
584 return 0;
585 if (rom->vrom_signature != VMI_SIGNATURE)
586 return 0;
587 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
588 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
589 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
590 rom->api_version_maj,
591 rom->api_version_min);
592 return 0;
593 }
594
595 /*
596 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
597 * the PCI header and device type to make sure this is really a
598 * VMI device.
599 */
600 if (!rom->pci_header_offs) {
601 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
602 return 0;
603 }
604
605 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
606 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
607 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
608 /* Allow it to run... anyways, but warn */
609 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
610 }
611
612 if (rom->pnp_header_offs) {
613 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
614 if (pnp->manufacturer_offset)
615 manufacturer = (const char *)rom+pnp->manufacturer_offset;
616 if (pnp->product_offset)
617 product = (const char *)rom+pnp->product_offset;
618 }
619
620 if (rom->license_offs)
621 license = (char *)rom+rom->license_offs;
622
623 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
624 manufacturer, product,
625 rom->api_version_maj, rom->api_version_min,
626 pci->rom_version_maj, pci->rom_version_min);
627
628 /* Don't allow BSD/MIT here for now because we don't want to end up
629 with any binary only shim layers */
630 if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
631 printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
632 license);
633 return 0;
634 }
635
636 return 1;
637}
638
639/*
640 * Probe for the VMI option ROM
641 */
642static inline int __init probe_vmi_rom(void)
643{
644 unsigned long base;
645
646 /* VMI ROM is in option ROM area, check signature */
647 for (base = 0xC0000; base < 0xE0000; base += 2048) {
648 struct vrom_header *romstart;
649 romstart = (struct vrom_header *)isa_bus_to_virt(base);
650 if (check_vmi_rom(romstart)) {
651 vmi_rom = romstart;
652 return 1;
653 }
654 }
655 return 0;
656}
657
658/*
659 * VMI setup common to all processors
660 */
661void vmi_bringup(void)
662{
663 /* We must establish the lowmem mapping for MMU ops to work */
664 if (vmi_ops.set_linear_mapping)
665 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
666}
667
668/*
669 * Return a pointer to a VMI function or NULL if unimplemented
670 */
671static void *vmi_get_function(int vmicall)
672{
673 u64 reloc;
674 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
675 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
676 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
677 if (rel->type == VMI_RELOCATION_CALL_REL)
678 return (void *)rel->eip;
679 else
680 return NULL;
681}
682
683/*
684 * Helper macro for making the VMI paravirt-ops fill code readable.
685 * For unimplemented operations, fall back to default, unless nop
686 * is returned by the ROM.
687 */
688#define para_fill(opname, vmicall) \
689do { \
690 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
691 VMI_CALL_##vmicall); \
692 if (rel->type == VMI_RELOCATION_CALL_REL) \
693 paravirt_ops.opname = (void *)rel->eip; \
694 else if (rel->type == VMI_RELOCATION_NOP) \
695 paravirt_ops.opname = (void *)vmi_nop; \
696 else if (rel->type != VMI_RELOCATION_NONE) \
697 printk(KERN_WARNING "VMI: Unknown relocation " \
698 "type %d for " #vmicall"\n",\
699 rel->type); \
700} while (0)
701
702/*
703 * Helper macro for making the VMI paravirt-ops fill code readable.
704 * For cached operations which do not match the VMI ROM ABI and must
705 * go through a tranlation stub. Ignore NOPs, since it is not clear
706 * a NOP * VMI function corresponds to a NOP paravirt-op when the
707 * functions are not in 1-1 correspondence.
708 */
709#define para_wrap(opname, wrapper, cache, vmicall) \
710do { \
711 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
712 VMI_CALL_##vmicall); \
713 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
714 if (rel->type == VMI_RELOCATION_CALL_REL) { \
715 paravirt_ops.opname = wrapper; \
716 vmi_ops.cache = (void *)rel->eip; \
717 } \
718} while (0)
719
720/*
721 * Activate the VMI interface and switch into paravirtualized mode
722 */
723static inline int __init activate_vmi(void)
724{
725 short kernel_cs;
726 u64 reloc;
727 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
728
729 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
730 printk(KERN_ERR "VMI ROM failed to initialize!");
731 return 0;
732 }
733 savesegment(cs, kernel_cs);
734
735 paravirt_ops.paravirt_enabled = 1;
736 paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
737
738 paravirt_ops.patch = vmi_patch;
739 paravirt_ops.name = "vmi";
740
741 /*
742 * Many of these operations are ABI compatible with VMI.
743 * This means we can fill in the paravirt-ops with direct
744 * pointers into the VMI ROM. If the calling convention for
745 * these operations changes, this code needs to be updated.
746 *
747 * Exceptions
748 * CPUID paravirt-op uses pointers, not the native ISA
749 * halt has no VMI equivalent; all VMI halts are "safe"
750 * no MSR support yet - just trap and emulate. VMI uses the
751 * same ABI as the native ISA, but Linux wants exceptions
752 * from bogus MSR read / write handled
753 * rdpmc is not yet used in Linux
754 */
755
756 /* CPUID is special, so very special it gets wrapped like a present */
757 para_wrap(cpuid, vmi_cpuid, cpuid, CPUID);
758
759 para_fill(clts, CLTS);
760 para_fill(get_debugreg, GetDR);
761 para_fill(set_debugreg, SetDR);
762 para_fill(read_cr0, GetCR0);
763 para_fill(read_cr2, GetCR2);
764 para_fill(read_cr3, GetCR3);
765 para_fill(read_cr4, GetCR4);
766 para_fill(write_cr0, SetCR0);
767 para_fill(write_cr2, SetCR2);
768 para_fill(write_cr3, SetCR3);
769 para_fill(write_cr4, SetCR4);
770 para_fill(save_fl, GetInterruptMask);
771 para_fill(restore_fl, SetInterruptMask);
772 para_fill(irq_disable, DisableInterrupts);
773 para_fill(irq_enable, EnableInterrupts);
774
775 para_fill(wbinvd, WBINVD);
776 para_fill(read_tsc, RDTSC);
777
778 /* The following we emulate with trap and emulate for now */
779 /* paravirt_ops.read_msr = vmi_rdmsr */
780 /* paravirt_ops.write_msr = vmi_wrmsr */
781 /* paravirt_ops.rdpmc = vmi_rdpmc */
782
783 /* TR interface doesn't pass TR value, wrap */
784 para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR);
785
786 /* LDT is special, too */
787 para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
788
789 para_fill(load_gdt, SetGDT);
790 para_fill(load_idt, SetIDT);
791 para_fill(store_gdt, GetGDT);
792 para_fill(store_idt, GetIDT);
793 para_fill(store_tr, GetTR);
794 paravirt_ops.load_tls = vmi_load_tls;
795 para_fill(write_ldt_entry, WriteLDTEntry);
796 para_fill(write_gdt_entry, WriteGDTEntry);
797 para_fill(write_idt_entry, WriteIDTEntry);
798 para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
799 para_fill(set_iopl_mask, SetIOPLMask);
800 para_fill(io_delay, IODelay);
801 para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
802
803 /* user and kernel flush are just handled with different flags to FlushTLB */
804 para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
805 para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
806 para_fill(flush_tlb_single, InvalPage);
807
808 /*
809 * Until a standard flag format can be agreed on, we need to
810 * implement these as wrappers in Linux. Get the VMI ROM
811 * function pointers for the two backend calls.
812 */
813#ifdef CONFIG_X86_PAE
814 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
815 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
816#else
817 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
818 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
819#endif
820
821 if (vmi_ops.set_pte) {
822 paravirt_ops.set_pte = vmi_set_pte;
823 paravirt_ops.set_pte_at = vmi_set_pte_at;
824 paravirt_ops.set_pmd = vmi_set_pmd;
825#ifdef CONFIG_X86_PAE
826 paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
827 paravirt_ops.set_pte_present = vmi_set_pte_present;
828 paravirt_ops.set_pud = vmi_set_pud;
829 paravirt_ops.pte_clear = vmi_pte_clear;
830 paravirt_ops.pmd_clear = vmi_pmd_clear;
831#endif
832 }
833
834 if (vmi_ops.update_pte) {
835 paravirt_ops.pte_update = vmi_update_pte;
836 paravirt_ops.pte_update_defer = vmi_update_pte_defer;
837 }
838
839 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
840 if (vmi_ops.allocate_page) {
841 paravirt_ops.alloc_pt = vmi_allocate_pt;
842 paravirt_ops.alloc_pd = vmi_allocate_pd;
843 paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
844 }
845
846 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
847 if (vmi_ops.release_page) {
848 paravirt_ops.release_pt = vmi_release_pt;
849 paravirt_ops.release_pd = vmi_release_pd;
850 }
851
852 /* Set linear is needed in all cases */
853 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
854#ifdef CONFIG_HIGHPTE
855 if (vmi_ops.set_linear_mapping)
856 paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
857#endif
858
859 /*
860 * These MUST always be patched. Don't support indirect jumps
861 * through these operations, as the VMI interface may use either
862 * a jump or a call to get to these operations, depending on
863 * the backend. They are performance critical anyway, so requiring
864 * a patch is not a big problem.
865 */
866 paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
867 paravirt_ops.iret = (void *)0xbadbab0;
868
869#ifdef CONFIG_SMP
870 para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
871#endif
872
873#ifdef CONFIG_X86_LOCAL_APIC
874 para_fill(apic_read, APICRead);
875 para_fill(apic_write, APICWrite);
876 para_fill(apic_write_atomic, APICWrite);
877#endif
878
879 /*
880 * Check for VMI timer functionality by probing for a cycle frequency method
881 */
882 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
883 if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
884 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
885 vmi_timer_ops.get_cycle_counter =
886 vmi_get_function(VMI_CALL_GetCycleCounter);
887 vmi_timer_ops.get_wallclock =
888 vmi_get_function(VMI_CALL_GetWallclockTime);
889 vmi_timer_ops.wallclock_updated =
890 vmi_get_function(VMI_CALL_WallclockUpdated);
891 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
892 vmi_timer_ops.cancel_alarm =
893 vmi_get_function(VMI_CALL_CancelAlarm);
894 paravirt_ops.time_init = vmi_time_init;
895 paravirt_ops.get_wallclock = vmi_get_wallclock;
896 paravirt_ops.set_wallclock = vmi_set_wallclock;
897#ifdef CONFIG_X86_LOCAL_APIC
898 paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
899 paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
900#endif
901 paravirt_ops.sched_clock = vmi_sched_clock;
902 paravirt_ops.get_cpu_khz = vmi_cpu_khz;
903
904 /* We have true wallclock functions; disable CMOS clock sync */
905 no_sync_cmos_clock = 1;
906 } else {
907 disable_noidle = 1;
908 disable_vmi_timer = 1;
909 }
910
911 para_fill(safe_halt, Halt);
912
913 /*
914 * Alternative instruction rewriting doesn't happen soon enough
915 * to convert VMI_IRET to a call instead of a jump; so we have
916 * to do this before IRQs get reenabled. Fortunately, it is
917 * idempotent.
918 */
919 apply_paravirt(__parainstructions, __parainstructions_end);
920
921 vmi_bringup();
922
923 return 1;
924}
925
926#undef para_fill
927
928void __init vmi_init(void)
929{
930 unsigned long flags;
931
932 if (!vmi_rom)
933 probe_vmi_rom();
934 else
935 check_vmi_rom(vmi_rom);
936
937	/* In case probing for or validating the ROM failed, bail */
938 if (!vmi_rom)
939 return;
940
941 reserve_top_address(-vmi_rom->virtual_top);
942
943 local_irq_save(flags);
944 activate_vmi();
945
946#ifdef CONFIG_X86_IO_APIC
947 /* This is virtual hardware; timer routing is wired correctly */
948 no_timer_check = 1;
949#endif
950 local_irq_restore(flags & X86_EFLAGS_IF);
951}
952
953static int __init parse_vmi(char *arg)
954{
955 if (!arg)
956 return -EINVAL;
957
958 if (!strcmp(arg, "disable_pge")) {
959 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
960 disable_pge = 1;
961 } else if (!strcmp(arg, "disable_pse")) {
962 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
963 disable_pse = 1;
964 } else if (!strcmp(arg, "disable_sep")) {
965 clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
966 disable_sep = 1;
967 } else if (!strcmp(arg, "disable_tsc")) {
968 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
969 disable_tsc = 1;
970 } else if (!strcmp(arg, "disable_mtrr")) {
971 clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
972 disable_mtrr = 1;
973 } else if (!strcmp(arg, "disable_timer")) {
974 disable_vmi_timer = 1;
975 disable_noidle = 1;
976 } else if (!strcmp(arg, "disable_noidle"))
977 disable_noidle = 1;
978 return 0;
979}
980
981early_param("vmi", parse_vmi);
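The patching helpers near the top of this file (MNEM_CALL, patch_offset() and patch_internal()) rewrite a paravirt call site into a 5-byte CALL or JMP whose rel32 operand is dest - eip - 5, i.e. the displacement measured from the end of the instruction. A minimal standalone sketch of that encoding, using made-up addresses:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MNEM_CALL 0xe8

/* Emit "call dest" at address eip: one opcode byte followed by a signed
 * 32-bit displacement relative to the end of the 5-byte instruction
 * (the same arithmetic patch_offset() performs). */
static void encode_call(uint8_t insn[5], uint32_t eip, uint32_t dest)
{
	int32_t rel = (int32_t)(dest - eip - 5);
	insn[0] = MNEM_CALL;
	memcpy(insn + 1, &rel, sizeof(rel));	/* host byte order; little-endian on x86 */
}

int main(void)
{
	uint8_t buf[5];
	encode_call(buf, 0xc0100000u, 0xc0200000u);
	printf("%02x %02x %02x %02x %02x\n",
	       buf[0], buf[1], buf[2], buf[3], buf[4]);
	return 0;
}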
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
new file mode 100644
index 000000000000..b1b5ab08b26e
--- /dev/null
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -0,0 +1,320 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2007, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23#include <linux/smp.h>
24#include <linux/interrupt.h>
25#include <linux/cpumask.h>
26#include <linux/clocksource.h>
27#include <linux/clockchips.h>
28
29#include <asm/vmi.h>
30#include <asm/vmi_time.h>
31#include <asm/arch_hooks.h>
32#include <asm/apicdef.h>
33#include <asm/apic.h>
34#include <asm/timer.h>
35#include <asm/i8253.h>
36
37#include <irq_vectors.h>
38#include "io_ports.h"
39
40#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
41#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
42
43static DEFINE_PER_CPU(struct clock_event_device, local_events);
44
45static inline u32 vmi_counter(u32 flags)
46{
47 /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
48 * cycle counter. */
49 return flags & VMI_ALARM_COUNTER_MASK;
50}
51
52/* paravirt_ops.get_wallclock = vmi_get_wallclock */
53unsigned long vmi_get_wallclock(void)
54{
55 unsigned long long wallclock;
56 wallclock = vmi_timer_ops.get_wallclock(); // nsec
57 (void)do_div(wallclock, 1000000000); // sec
58
59 return wallclock;
60}
61
62/* paravirt_ops.set_wallclock = vmi_set_wallclock */
63int vmi_set_wallclock(unsigned long now)
64{
65 return 0;
66}
67
68/* paravirt_ops.sched_clock = vmi_sched_clock */
69unsigned long long vmi_sched_clock(void)
70{
71 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
72}
73
74/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
75unsigned long vmi_cpu_khz(void)
76{
77 unsigned long long khz;
78 khz = vmi_timer_ops.get_cycle_frequency();
79 (void)do_div(khz, 1000);
80 return khz;
81}
82
83static inline unsigned int vmi_get_timer_vector(void)
84{
85#ifdef CONFIG_X86_IO_APIC
86 return FIRST_DEVICE_VECTOR;
87#else
88 return FIRST_EXTERNAL_VECTOR;
89#endif
90}
91
92/** vmi clockchip */
93#ifdef CONFIG_X86_LOCAL_APIC
94static unsigned int startup_timer_irq(unsigned int irq)
95{
96 unsigned long val = apic_read(APIC_LVTT);
97 apic_write(APIC_LVTT, vmi_get_timer_vector());
98
99 return (val & APIC_SEND_PENDING);
100}
101
102static void mask_timer_irq(unsigned int irq)
103{
104 unsigned long val = apic_read(APIC_LVTT);
105 apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
106}
107
108static void unmask_timer_irq(unsigned int irq)
109{
110 unsigned long val = apic_read(APIC_LVTT);
111 apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
112}
113
114static void ack_timer_irq(unsigned int irq)
115{
116 ack_APIC_irq();
117}
118
119static struct irq_chip vmi_chip __read_mostly = {
120 .name = "VMI-LOCAL",
121 .startup = startup_timer_irq,
122 .mask = mask_timer_irq,
123 .unmask = unmask_timer_irq,
124 .ack = ack_timer_irq
125};
126#endif
127
128/** vmi clockevent */
129#define VMI_ALARM_WIRED_IRQ0 0x00000000
130#define VMI_ALARM_WIRED_LVTT 0x00010000
131static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
132
133static inline int vmi_get_alarm_wiring(void)
134{
135 return vmi_wiring;
136}
137
138static void vmi_timer_set_mode(enum clock_event_mode mode,
139 struct clock_event_device *evt)
140{
141 cycle_t now, cycles_per_hz;
142 BUG_ON(!irqs_disabled());
143
144 switch (mode) {
145 case CLOCK_EVT_MODE_ONESHOT:
146 case CLOCK_EVT_MODE_RESUME:
147 break;
148 case CLOCK_EVT_MODE_PERIODIC:
149 cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
150 (void)do_div(cycles_per_hz, HZ);
151 now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
152 vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
153 break;
154 case CLOCK_EVT_MODE_UNUSED:
155 case CLOCK_EVT_MODE_SHUTDOWN:
156 switch (evt->mode) {
157 case CLOCK_EVT_MODE_ONESHOT:
158 vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
159 break;
160 case CLOCK_EVT_MODE_PERIODIC:
161 vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
162 break;
163 default:
164 break;
165 }
166 break;
167 default:
168 break;
169 }
170}
171
172static int vmi_timer_next_event(unsigned long delta,
173 struct clock_event_device *evt)
174{
175 /* Unfortunately, set_next_event interface only passes relative
176	 * expiry, but we want absolute expiry.  It'd be better if we
177	 * were passed an absolute expiry, since a bunch of time may
178 * have been stolen between the time the delta is computed and
179 * when we set the alarm below. */
180 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
181
182 BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
183 vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
184 return 0;
185}
186
187static struct clock_event_device vmi_clockevent = {
188 .name = "vmi-timer",
189 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
190 .shift = 22,
191 .set_mode = vmi_timer_set_mode,
192 .set_next_event = vmi_timer_next_event,
193 .rating = 1000,
194 .irq = 0,
195};
196
197static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
198{
199 struct clock_event_device *evt = &__get_cpu_var(local_events);
200 evt->event_handler(evt);
201 return IRQ_HANDLED;
202}
203
204static struct irqaction vmi_clock_action = {
205 .name = "vmi-timer",
206 .handler = vmi_timer_interrupt,
207 .flags = IRQF_DISABLED | IRQF_NOBALANCING,
208 .mask = CPU_MASK_ALL,
209};
210
211static void __devinit vmi_time_init_clockevent(void)
212{
213 cycle_t cycles_per_msec;
214 struct clock_event_device *evt;
215
216 int cpu = smp_processor_id();
217 evt = &__get_cpu_var(local_events);
218
219 /* Use cycles_per_msec since div_sc params are 32-bits. */
220 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
221 (void)do_div(cycles_per_msec, 1000);
222
223 memcpy(evt, &vmi_clockevent, sizeof(*evt));
224	/* Must pick .shift such that .mult fits in 32 bits.  Choosing
225	 * .shift to be 22 allows 2^(32-22) cycles per nanosecond
226 * before overflow. */
227 evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
228 /* Upper bound is clockevent's use of ulong for cycle deltas. */
229 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
230 evt->min_delta_ns = clockevent_delta2ns(1, evt);
231 evt->cpumask = cpumask_of_cpu(cpu);
232
233 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
234 evt->name, evt->mult, evt->shift);
235 clockevents_register_device(evt);
236}
237
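The mult/shift comment in vmi_time_init_clockevent() above amounts to mult being roughly (cycles_per_msec << shift) / NSEC_PER_MSEC, which is what the div_sc() call computes; with shift = 22 the counter may run at up to 2^10 cycles per nanosecond before mult overflows 32 bits. A worked standalone sketch, assuming a hypothetical 1 GHz cycle counter:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
	unsigned int shift = 22;
	uint64_t cycles_per_msec = 1000000;	/* assumed 1 GHz counter */

	/* what div_sc(cycles_per_msec, NSEC_PER_MSEC, shift) boils down to */
	uint64_t mult = (cycles_per_msec << shift) / NSEC_PER_MSEC;

	/* converting a 1 ms (1,000,000 ns) event delta back into cycles */
	uint64_t delta_ns = 1000000;
	uint64_t delta_cycles = (delta_ns * mult) >> shift;

	printf("mult=%llu, 1 ms -> %llu cycles\n",
	       (unsigned long long)mult, (unsigned long long)delta_cycles);
	return 0;
}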
238void __init vmi_time_init(void)
239{
240	/* Disable PIT: BIOSes start PIT CH0 with an 18.2 Hz periodic tick. */
241 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
242
243 vmi_time_init_clockevent();
244 setup_irq(0, &vmi_clock_action);
245}
246
247#ifdef CONFIG_X86_LOCAL_APIC
248void __devinit vmi_time_bsp_init(void)
249{
250 /*
251 * On APIC systems, we want local timers to fire on each cpu. We do
252 * this by programming LVTT to deliver timer events to the IRQ handler
253 * for IRQ-0, since we can't re-use the APIC local timer handler
254 * without interfering with that code.
255 */
256 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
257 local_irq_disable();
258#ifdef CONFIG_X86_SMP
259 /*
260 * XXX handle_percpu_irq only defined for SMP; we need to switch over
261 * to using it, since this is a local interrupt, which each CPU must
262 * handle individually without locking out or dropping simultaneous
263 * local timers on other CPUs. We also don't want to trigger the
264 * quirk workaround code for interrupts which gets invoked from
265 * handle_percpu_irq via eoi, so we use our own IRQ chip.
266 */
267 set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
268#else
269 set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
270#endif
271 vmi_wiring = VMI_ALARM_WIRED_LVTT;
272 apic_write(APIC_LVTT, vmi_get_timer_vector());
273 local_irq_enable();
274 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
275}
276
277void __devinit vmi_time_ap_init(void)
278{
279 vmi_time_init_clockevent();
280 apic_write(APIC_LVTT, vmi_get_timer_vector());
281}
282#endif
283
284/** vmi clocksource */
285
286static cycle_t read_real_cycles(void)
287{
288 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
289}
290
291static struct clocksource clocksource_vmi = {
292 .name = "vmi-timer",
293 .rating = 450,
294 .read = read_real_cycles,
295 .mask = CLOCKSOURCE_MASK(64),
296 .mult = 0, /* to be set */
297 .shift = 22,
298 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
299};
300
301static int __init init_vmi_clocksource(void)
302{
303 cycle_t cycles_per_msec;
304
305 if (!vmi_timer_ops.get_cycle_frequency)
306 return 0;
307 /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
308 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
309 (void)do_div(cycles_per_msec, 1000);
310
311 /* Note that clocksource.{mult, shift} converts in the opposite direction
312 * as clockevents. */
313 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
314 clocksource_vmi.shift);
315
316 printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
317 return clocksource_register(&clocksource_vmi);
318
319}
320module_init(init_vmi_clocksource);
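As the comment in init_vmi_clocksource() above notes, clocksource mult/shift convert in the opposite direction from clockevents: a clocksource maps cycles to nanoseconds as ns = (cycles * mult) >> shift, with mult coming from clocksource_khz2mult() (roughly (10^6 << shift) / khz). A standalone sketch, assuming a hypothetical 1 MHz counter:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int shift = 22;
	uint64_t khz = 1000;			/* assumed 1 MHz counter */

	/* roughly what clocksource_khz2mult(khz, shift) returns */
	uint32_t mult = (uint32_t)((1000000ULL << shift) / khz);

	uint64_t cycles = 5000;			/* 5 ms worth of cycles at 1 MHz */
	uint64_t ns = (cycles * mult) >> shift;	/* cycles -> nanoseconds */

	printf("mult=%u, %llu cycles -> %llu ns\n",
	       mult, (unsigned long long)cycles, (unsigned long long)ns);
	return 0;
}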
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
new file mode 100644
index 000000000000..849ee611f013
--- /dev/null
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "vmlinux_32.lds.S"
3#else
4# include "vmlinux_64.lds.S"
5#endif
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
new file mode 100644
index 000000000000..7d72cce00529
--- /dev/null
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -0,0 +1,213 @@
1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value should remain constant even if the kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
9 */
10
11/* Don't define absolute symbols until and unless you know that symbol
12 * value should remain constant even if the kernel image is relocated
13 * at run time. Absolute symbols are not relocated. If symbol value should
14 * change if kernel is relocated, make the symbol section relative and
15 * put it inside the section definition.
16 */
17#define LOAD_OFFSET __PAGE_OFFSET
18
19#include <asm-generic/vmlinux.lds.h>
20#include <asm/thread_info.h>
21#include <asm/page.h>
22#include <asm/cache.h>
23#include <asm/boot.h>
24
25OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
26OUTPUT_ARCH(i386)
27ENTRY(phys_startup_32)
28jiffies = jiffies_64;
29
30PHDRS {
31 text PT_LOAD FLAGS(5); /* R_E */
32 data PT_LOAD FLAGS(7); /* RWE */
33 note PT_NOTE FLAGS(0); /* ___ */
34}
35SECTIONS
36{
37 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
38 phys_startup_32 = startup_32 - LOAD_OFFSET;
39
40 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
41 _text = .; /* Text and read-only data */
42 *(.text.head)
43 } :text = 0x9090
44
45 /* read-only */
46 .text : AT(ADDR(.text) - LOAD_OFFSET) {
47 TEXT_TEXT
48 SCHED_TEXT
49 LOCK_TEXT
50 KPROBES_TEXT
51 *(.fixup)
52 *(.gnu.warning)
53 _etext = .; /* End of text section */
54 } :text = 0x9090
55
56 . = ALIGN(16); /* Exception table */
57 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
58 __start___ex_table = .;
59 *(__ex_table)
60 __stop___ex_table = .;
61 }
62
63 NOTES :text :note
64
65 BUG_TABLE :text
66
67 . = ALIGN(4);
68 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
69 __tracedata_start = .;
70 *(.tracedata)
71 __tracedata_end = .;
72 }
73
74 RODATA
75
76 /* writeable */
77 . = ALIGN(4096);
78 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
79 DATA_DATA
80 CONSTRUCTORS
81 } :data
82
83 . = ALIGN(4096);
84 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
85 __nosave_begin = .;
86 *(.data.nosave)
87 . = ALIGN(4096);
88 __nosave_end = .;
89 }
90
91 . = ALIGN(4096);
92 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
93 *(.data.page_aligned)
94 *(.data.idt)
95 }
96
97 . = ALIGN(32);
98 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
99 *(.data.cacheline_aligned)
100 }
101
102 /* rarely changed data like cpu maps */
103 . = ALIGN(32);
104 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
105 *(.data.read_mostly)
106 _edata = .; /* End of data section */
107 }
108
109 . = ALIGN(THREAD_SIZE); /* init_task */
110 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
111 *(.data.init_task)
112 }
113
114 /* might get freed after init */
115 . = ALIGN(4096);
116 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
117 __smp_locks = .;
118 *(.smp_locks)
119 __smp_locks_end = .;
120 }
121 /* will be freed after init
122 * Following ALIGN() is required to make sure no other data falls on the
123 * same page where __smp_alt_end is pointing as that page might be freed
124 * after boot. Always make sure that ALIGN() directive is present after
125 * the section which contains __smp_alt_end.
126 */
127 . = ALIGN(4096);
128
129 /* will be freed after init */
130 . = ALIGN(4096); /* Init code and data */
131 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
132 __init_begin = .;
133 _sinittext = .;
134 *(.init.text)
135 _einittext = .;
136 }
137 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
138 . = ALIGN(16);
139 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
140 __setup_start = .;
141 *(.init.setup)
142 __setup_end = .;
143 }
144 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
145 __initcall_start = .;
146 INITCALLS
147 __initcall_end = .;
148 }
149 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
150 __con_initcall_start = .;
151 *(.con_initcall.init)
152 __con_initcall_end = .;
153 }
154 SECURITY_INIT
155 . = ALIGN(4);
156 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
157 __alt_instructions = .;
158 *(.altinstructions)
159 __alt_instructions_end = .;
160 }
161 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
162 *(.altinstr_replacement)
163 }
164 . = ALIGN(4);
165 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
166 __parainstructions = .;
167 *(.parainstructions)
168 __parainstructions_end = .;
169 }
170  /* .exit.text is discarded at runtime, not link time, to deal with references
171 from .altinstructions and .eh_frame */
172 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
173 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
174#if defined(CONFIG_BLK_DEV_INITRD)
175 . = ALIGN(4096);
176 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
177 __initramfs_start = .;
178 *(.init.ramfs)
179 __initramfs_end = .;
180 }
181#endif
182 . = ALIGN(4096);
183 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
184 __per_cpu_start = .;
185 *(.data.percpu)
186 *(.data.percpu.shared_aligned)
187 __per_cpu_end = .;
188 }
189 . = ALIGN(4096);
190 /* freed after init ends here */
191
192 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
193 __init_end = .;
194 __bss_start = .; /* BSS */
195 *(.bss.page_aligned)
196 *(.bss)
197 . = ALIGN(4);
198 __bss_stop = .;
199 _end = . ;
200 /* This is where the kernel creates the early boot page tables */
201 . = ALIGN(4096);
202 pg0 = . ;
203 }
204
205 /* Sections to be discarded */
206 /DISCARD/ : {
207 *(.exitcall.exit)
208 }
209
210 STABS_DEBUG
211
212 DWARF_DEBUG
213}
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
new file mode 100644
index 000000000000..ba8ea97abd21
--- /dev/null
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -0,0 +1,235 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#define LOAD_OFFSET __START_KERNEL_map
6
7#include <asm-generic/vmlinux.lds.h>
8#include <asm/page.h>
9
10#undef i386 /* in case the preprocessor is a 32bit one */
11
12OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
13OUTPUT_ARCH(i386:x86-64)
14ENTRY(phys_startup_64)
15jiffies_64 = jiffies;
16_proxy_pda = 1;
17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */
22 note PT_NOTE FLAGS(4); /* R__ */
23}
24SECTIONS
25{
26 . = __START_KERNEL;
27 phys_startup_64 = startup_64 - LOAD_OFFSET;
28 _text = .; /* Text and read-only data */
29 .text : AT(ADDR(.text) - LOAD_OFFSET) {
30 /* First the code that has to be first for bootstrapping */
31 *(.text.head)
32 _stext = .;
33 /* Then the rest */
34 TEXT_TEXT
35 SCHED_TEXT
36 LOCK_TEXT
37 KPROBES_TEXT
38 *(.fixup)
39 *(.gnu.warning)
40 } :text = 0x9090
41 /* out-of-line lock text */
42 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
43
44 _etext = .; /* End of text section */
45
46 . = ALIGN(16); /* Exception table */
47 __start___ex_table = .;
48 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
49 __stop___ex_table = .;
50
51 NOTES :text :note
52
53 BUG_TABLE :text
54
55 RODATA
56
57 . = ALIGN(4);
58 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
59 __tracedata_start = .;
60 *(.tracedata)
61 __tracedata_end = .;
62 }
63
64 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
65 /* Data */
66 .data : AT(ADDR(.data) - LOAD_OFFSET) {
67 DATA_DATA
68 CONSTRUCTORS
69 } :data
70
71 _edata = .; /* End of data section */
72
73 . = ALIGN(PAGE_SIZE);
74 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
75 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
76 *(.data.cacheline_aligned)
77 }
78 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
79 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
80 *(.data.read_mostly)
81 }
82
83#define VSYSCALL_ADDR (-10*1024*1024)
84#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
85#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
86
87#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
88#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
89
90#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
91#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
92
93 . = VSYSCALL_ADDR;
94 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
95 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
96
97 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
98 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
99 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
100 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
101 { *(.vsyscall_gtod_data) }
102 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
103 .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
104 { *(.vsyscall_clock) }
105 vsyscall_clock = VVIRT(.vsyscall_clock);
106
107
108 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
109 { *(.vsyscall_1) }
110 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
111 { *(.vsyscall_2) }
112
113 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
114 vgetcpu_mode = VVIRT(.vgetcpu_mode);
115
116 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
117 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
118 jiffies = VVIRT(.jiffies);
119
120 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
121 { *(.vsyscall_3) }
122
123 . = VSYSCALL_VIRT_ADDR + 4096;
124
125#undef VSYSCALL_ADDR
126#undef VSYSCALL_PHYS_ADDR
127#undef VSYSCALL_VIRT_ADDR
128#undef VLOAD_OFFSET
129#undef VLOAD
130#undef VVIRT_OFFSET
131#undef VVIRT
132
133 . = ALIGN(8192); /* init_task */
134 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
135 *(.data.init_task)
136 }:data.init
137
138 . = ALIGN(4096);
139 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
140 *(.data.page_aligned)
141 }
142
143 /* might get freed after init */
144 . = ALIGN(4096);
145 __smp_alt_begin = .;
146 __smp_locks = .;
147 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
148 *(.smp_locks)
149 }
150 __smp_locks_end = .;
151 . = ALIGN(4096);
152 __smp_alt_end = .;
153
154 . = ALIGN(4096); /* Init code and data */
155 __init_begin = .;
156 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
157 _sinittext = .;
158 *(.init.text)
159 _einittext = .;
160 }
161 __initdata_begin = .;
162 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
163 __initdata_end = .;
164 . = ALIGN(16);
165 __setup_start = .;
166 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
167 __setup_end = .;
168 __initcall_start = .;
169 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
170 INITCALLS
171 }
172 __initcall_end = .;
173 __con_initcall_start = .;
174 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
175 *(.con_initcall.init)
176 }
177 __con_initcall_end = .;
178 SECURITY_INIT
179 . = ALIGN(8);
180 __alt_instructions = .;
181 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
182 *(.altinstructions)
183 }
184 __alt_instructions_end = .;
185 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
186 *(.altinstr_replacement)
187 }
188  /* .exit.text is discarded at runtime, not link time, to deal with references
189 from .altinstructions and .eh_frame */
190 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
191 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
192
193/* vdso blob that is mapped into user space */
194 vdso_start = . ;
195 .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
196 . = ALIGN(4096);
197 vdso_end = .;
198
199#ifdef CONFIG_BLK_DEV_INITRD
200 . = ALIGN(4096);
201 __initramfs_start = .;
202 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
203 __initramfs_end = .;
204#endif
205
206 PERCPU(4096)
207
208 . = ALIGN(4096);
209 __init_end = .;
210
211 . = ALIGN(4096);
212 __nosave_begin = .;
213 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
214 . = ALIGN(4096);
215 __nosave_end = .;
216
217 __bss_start = .; /* BSS */
218 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
219 *(.bss.page_aligned)
220 *(.bss)
221 }
222 __bss_stop = .;
223
224 _end = . ;
225
226 /* Sections to be discarded */
227 /DISCARD/ : {
228 *(.exitcall.exit)
229 *(.eh_frame)
230 }
231
232 STABS_DEBUG
233
234 DWARF_DEBUG
235}
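The VSYSCALL_PHYS_ADDR and VSYSCALL_VIRT_ADDR macros above place the vsyscall page at the first 4 KiB boundary after .data.read_mostly using the round-up idiom (x + 4095) & ~4095. A tiny standalone illustration of that alignment arithmetic, with a made-up address:

#include <stdio.h>
#include <stdint.h>

/* Round x up to the next 4 KiB boundary, as the linker-script macros do. */
static uint64_t page_align_up(uint64_t x)
{
	return (x + 4095) & ~4095ULL;
}

int main(void)
{
	uint64_t addr = 0x1234ffULL;
	printf("0x%llx -> 0x%llx\n",
	       (unsigned long long)addr, (unsigned long long)page_align_up(addr));
	return 0;
}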
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
new file mode 100644
index 000000000000..414caf0c5f9a
--- /dev/null
+++ b/arch/x86/kernel/vsmp_64.c
@@ -0,0 +1,49 @@
1/*
2 * vSMPowered(tm) systems specific initialization
3 * Copyright (C) 2005 ScaleMP Inc.
4 *
5 * Use of this code is subject to the terms and conditions of the
6 * GNU general public license version 2. See "COPYING" or
7 * http://www.gnu.org/licenses/gpl.html
8 *
9 * Ravikiran Thirumalai <kiran@scalemp.com>,
10 * Shai Fultheim <shai@scalemp.com>
11 */
12
13#include <linux/init.h>
14#include <linux/pci_ids.h>
15#include <linux/pci_regs.h>
16#include <asm/pci-direct.h>
17#include <asm/io.h>
18
19static int __init vsmp_init(void)
20{
21 void *address;
22 unsigned int cap, ctl;
23
24 if (!early_pci_allowed())
25 return 0;
26
27 /* Check if we are running on a ScaleMP vSMP box */
28 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
29 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
30 return 0;
31
32 /* set vSMP magic bits to indicate vSMP capable kernel */
33 address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
34 cap = readl(address);
35 ctl = readl(address + 4);
36 printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl);
37 if (cap & ctl & (1 << 4)) {
38 /* Turn on vSMP IRQ fastpath handling (see system.h) */
39 ctl &= ~(1 << 4);
40 writel(ctl, address + 4);
41 ctl = readl(address + 4);
42 printk("vSMP CTL: control set to:0x%08x\n", ctl);
43 }
44
45 iounmap(address);
46 return 0;
47}
48
49core_initcall(vsmp_init);
diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/kernel/vsyscall-int80_32.S
new file mode 100644
index 000000000000..103cab6aa7c0
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-int80_32.S
@@ -0,0 +1,53 @@
1/*
2 * Code for the vsyscall page. This version uses the old int $0x80 method.
3 *
4 * NOTE:
5 * 1) __kernel_vsyscall _must_ be first in this page.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */
9
10 .text
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 int $0x80
16 ret
17.LEND_vsyscall:
18 .size __kernel_vsyscall,.-.LSTART_vsyscall
19 .previous
20
21 .section .eh_frame,"a",@progbits
22.LSTARTFRAMEDLSI:
23 .long .LENDCIEDLSI-.LSTARTCIEDLSI
24.LSTARTCIEDLSI:
25 .long 0 /* CIE ID */
26 .byte 1 /* Version number */
27 .string "zR" /* NUL-terminated augmentation string */
28 .uleb128 1 /* Code alignment factor */
29 .sleb128 -4 /* Data alignment factor */
30 .byte 8 /* Return address register column */
31 .uleb128 1 /* Augmentation value length */
32 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
33 .byte 0x0c /* DW_CFA_def_cfa */
34 .uleb128 4
35 .uleb128 4
36 .byte 0x88 /* DW_CFA_offset, column 0x8 */
37 .uleb128 1
38 .align 4
39.LENDCIEDLSI:
40 .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
41.LSTARTFDEDLSI:
42 .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
43 .long .LSTART_vsyscall-. /* PC-relative start address */
44 .long .LEND_vsyscall-.LSTART_vsyscall
45 .uleb128 0
46 .align 4
47.LENDFDEDLSI:
48 .previous
49
50/*
51 * Get the common code for the sigreturn entry points.
52 */
53#include "vsyscall-sigreturn_32.S"
diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/kernel/vsyscall-note_32.S
new file mode 100644
index 000000000000..fcf376a37f79
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-note_32.S
@@ -0,0 +1,45 @@
1/*
2 * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
3 * Here we can supply some information useful to userland.
4 */
5
6#include <linux/version.h>
7#include <linux/elfnote.h>
8
9/* Ideally this would use UTS_NAME, but using a quoted string here
10 doesn't work. Remember to change this when changing the
11 kernel's name. */
12ELFNOTE_START(Linux, 0, "a")
13 .long LINUX_VERSION_CODE
14ELFNOTE_END
15
16#ifdef CONFIG_XEN
17/*
18 * Add a special note telling glibc's dynamic linker a fake hardware
19 * flavor that it will use to choose the search path for libraries in the
20 * same way it uses real hardware capabilities like "mmx".
21 * We supply "nosegneg" as the fake capability, to indicate that we
22 * do not like negative offsets in instructions using segment overrides,
23 * since we implement those inefficiently. This makes it possible to
24 * install libraries optimized to avoid those access patterns in someplace
25 * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/ file
26 * corresponding to the bits here is needed to make ldconfig work right.
27 * It should contain:
28 * hwcap 1 nosegneg
29 * to match the mapping of bit to name that we give here.
30 *
31 * At runtime, the fake hardware feature will be considered to be present
32 * if its bit is set in the mask word. So, we start with the mask 0, and
33 * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
34 */
35
36#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
37
38 .globl VDSO_NOTE_MASK
39ELFNOTE_START(GNU, 2, "a")
40 .long 1 /* ncaps */
41VDSO_NOTE_MASK:
42 .long 0 /* mask */
43 .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
44ELFNOTE_END
45#endif
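The nosegneg note above only takes effect if glibc's ldconfig is told about the fake hardware capability; the kernel comment names the single required line but not a file. A minimal sketch of the matching drop-in, assuming a file name of /etc/ld.so.conf.d/nosegneg.conf (the name is an assumption; only the hwcap line comes from the comment above):

	# /etc/ld.so.conf.d/nosegneg.conf (hypothetical file name)
	hwcap 1 nosegneg

With that mapping in place, ldconfig prefers libraries installed under a nosegneg directory such as /lib/i686/tls/nosegneg whenever the mask word in the vDSO note has the bit set at runtime.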
diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/kernel/vsyscall-sigreturn_32.S
new file mode 100644
index 000000000000..a92262f41659
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-sigreturn_32.S
@@ -0,0 +1,143 @@
1/*
2 * Common code for the sigreturn entry points on the vsyscall page.
3 * So far this code is the same for both int80 and sysenter versions.
4 * This file is #include'd by vsyscall-*.S to define them after the
5 * vsyscall entry point. The kernel assumes that the addresses of these
6 * routines are constant for all vsyscall implementations.
7 */
8
9#include <asm/unistd.h>
10#include <asm/asm-offsets.h>
11
12
13/* XXX
14 Should these be named "_sigtramp" or something?
15*/
16
17 .text
18 .org __kernel_vsyscall+32,0x90
19 .globl __kernel_sigreturn
20 .type __kernel_sigreturn,@function
21__kernel_sigreturn:
22.LSTART_sigreturn:
23 popl %eax /* XXX does this mean it needs unwind info? */
24 movl $__NR_sigreturn, %eax
25 int $0x80
26.LEND_sigreturn:
27 .size __kernel_sigreturn,.-.LSTART_sigreturn
28
29 .balign 32
30 .globl __kernel_rt_sigreturn
31 .type __kernel_rt_sigreturn,@function
32__kernel_rt_sigreturn:
33.LSTART_rt_sigreturn:
34 movl $__NR_rt_sigreturn, %eax
35 int $0x80
36.LEND_rt_sigreturn:
37 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
38 .balign 32
39 .previous
40
41 .section .eh_frame,"a",@progbits
42.LSTARTFRAMEDLSI1:
43 .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
44.LSTARTCIEDLSI1:
45 .long 0 /* CIE ID */
46 .byte 1 /* Version number */
47 .string "zRS" /* NUL-terminated augmentation string */
48 .uleb128 1 /* Code alignment factor */
49 .sleb128 -4 /* Data alignment factor */
50 .byte 8 /* Return address register column */
51 .uleb128 1 /* Augmentation value length */
52 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
53 .byte 0 /* DW_CFA_nop */
54 .align 4
55.LENDCIEDLSI1:
56 .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
57.LSTARTFDEDLSI1:
58 .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
59 /* HACK: The dwarf2 unwind routines will subtract 1 from the
60 return address to get an address in the middle of the
61 presumed call instruction. Since we didn't get here via
62 a call, we need to include the nop before the real start
63 to make up for it. */
64 .long .LSTART_sigreturn-1-. /* PC-relative start address */
65 .long .LEND_sigreturn-.LSTART_sigreturn+1
66 .uleb128 0 /* Augmentation */
67 /* What follows are the instructions for the table generation.
68 We record the locations of each register saved. This is
69 complicated by the fact that the "CFA" is always assumed to
70 be the value of the stack pointer in the caller. This means
71 that we must define the CFA of this body of code to be the
72 saved value of the stack pointer in the sigcontext. Which
73 also means that there is no fixed relation to the other
74 saved registers, which means that we must use DW_CFA_expression
75 to compute their addresses. It also means that when we
76 adjust the stack with the popl, we have to do it all over again. */
77
78#define do_cfa_expr(offset) \
79 .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
80 .uleb128 1f-0f; /* length */ \
810: .byte 0x74; /* DW_OP_breg4 */ \
82 .sleb128 offset; /* offset */ \
83 .byte 0x06; /* DW_OP_deref */ \
841:
85
86#define do_expr(regno, offset) \
87 .byte 0x10; /* DW_CFA_expression */ \
88 .uleb128 regno; /* regno */ \
89 .uleb128 1f-0f; /* length */ \
900: .byte 0x74; /* DW_OP_breg4 */ \
91 .sleb128 offset; /* offset */ \
921:
93
94 do_cfa_expr(SIGCONTEXT_esp+4)
95 do_expr(0, SIGCONTEXT_eax+4)
96 do_expr(1, SIGCONTEXT_ecx+4)
97 do_expr(2, SIGCONTEXT_edx+4)
98 do_expr(3, SIGCONTEXT_ebx+4)
99 do_expr(5, SIGCONTEXT_ebp+4)
100 do_expr(6, SIGCONTEXT_esi+4)
101 do_expr(7, SIGCONTEXT_edi+4)
102 do_expr(8, SIGCONTEXT_eip+4)
103
104 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
105
106 do_cfa_expr(SIGCONTEXT_esp)
107 do_expr(0, SIGCONTEXT_eax)
108 do_expr(1, SIGCONTEXT_ecx)
109 do_expr(2, SIGCONTEXT_edx)
110 do_expr(3, SIGCONTEXT_ebx)
111 do_expr(5, SIGCONTEXT_ebp)
112 do_expr(6, SIGCONTEXT_esi)
113 do_expr(7, SIGCONTEXT_edi)
114 do_expr(8, SIGCONTEXT_eip)
115
116 .align 4
117.LENDFDEDLSI1:
118
119 .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
120.LSTARTFDEDLSI2:
121 .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
122 /* HACK: See above wrt unwind library assumptions. */
123 .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
124 .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
125 .uleb128 0 /* Augmentation */
126 /* What follows are the instructions for the table generation.
127 We record the locations of each register saved. This is
128 slightly less complicated than the above, since we don't
129 modify the stack pointer in the process. */
130
131 do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
132 do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
133 do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
134 do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
135 do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
136 do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
137 do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
138 do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
139 do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
140
141 .align 4
142.LENDFDEDLSI2:
143 .previous
diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/kernel/vsyscall-sysenter_32.S
new file mode 100644
index 000000000000..ed879bf42995
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-sysenter_32.S
@@ -0,0 +1,122 @@
1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction.
3 *
4 * NOTE:
5 * 1) __kernel_vsyscall _must_ be first in this page.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */
9
10/*
11 * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
12 * %ecx itself for arg2. The pushing is because the sysexit instruction
13 * (found in entry.S) requires that we clobber %ecx with the desired %esp.
14 * User code might expect that %ecx is unclobbered though, as it would be
15 * for returning via the iret instruction, so we must push and pop.
16 *
17 * The caller puts arg3 in %edx, which the sysexit instruction requires
18 * for %eip. Thus, exactly as for arg2, we must push and pop.
19 *
20 * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
21 * instruction clobbers %esp, the user's %esp won't even survive entry
22 * into the kernel. We store %esp in %ebp. Code in entry.S must fetch
23 * arg6 from the stack.
24 *
25 * You cannot use this vsyscall for the clone() syscall because the
26 * three dwords on the parent stack do not get copied to the child.
27 */
28 .text
29 .globl __kernel_vsyscall
30 .type __kernel_vsyscall,@function
31__kernel_vsyscall:
32.LSTART_vsyscall:
33 push %ecx
34.Lpush_ecx:
35 push %edx
36.Lpush_edx:
37 push %ebp
38.Lenter_kernel:
39 movl %esp,%ebp
40 sysenter
41
42	/* 7: align return point with nops to make disassembly easier */
43 .space 7,0x90
44
45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
46 jmp .Lenter_kernel
47 /* 16: System call normal return point is here! */
48 .globl SYSENTER_RETURN /* Symbol used by sysenter.c */
49SYSENTER_RETURN:
50 pop %ebp
51.Lpop_ebp:
52 pop %edx
53.Lpop_edx:
54 pop %ecx
55.Lpop_ecx:
56 ret
57.LEND_vsyscall:
58 .size __kernel_vsyscall,.-.LSTART_vsyscall
59 .previous
60
61 .section .eh_frame,"a",@progbits
62.LSTARTFRAMEDLSI:
63 .long .LENDCIEDLSI-.LSTARTCIEDLSI
64.LSTARTCIEDLSI:
65 .long 0 /* CIE ID */
66 .byte 1 /* Version number */
67 .string "zR" /* NUL-terminated augmentation string */
68 .uleb128 1 /* Code alignment factor */
69 .sleb128 -4 /* Data alignment factor */
70 .byte 8 /* Return address register column */
71 .uleb128 1 /* Augmentation value length */
72 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
73 .byte 0x0c /* DW_CFA_def_cfa */
74 .uleb128 4
75 .uleb128 4
76 .byte 0x88 /* DW_CFA_offset, column 0x8 */
77 .uleb128 1
78 .align 4
79.LENDCIEDLSI:
80 .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
81.LSTARTFDEDLSI:
82 .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
83 .long .LSTART_vsyscall-. /* PC-relative start address */
84 .long .LEND_vsyscall-.LSTART_vsyscall
85 .uleb128 0
86 /* What follows are the instructions for the table generation.
87 We have to record all changes of the stack pointer. */
88 .byte 0x04 /* DW_CFA_advance_loc4 */
89 .long .Lpush_ecx-.LSTART_vsyscall
90 .byte 0x0e /* DW_CFA_def_cfa_offset */
91 .byte 0x08 /* RA at offset 8 now */
92 .byte 0x04 /* DW_CFA_advance_loc4 */
93 .long .Lpush_edx-.Lpush_ecx
94 .byte 0x0e /* DW_CFA_def_cfa_offset */
95 .byte 0x0c /* RA at offset 12 now */
96 .byte 0x04 /* DW_CFA_advance_loc4 */
97 .long .Lenter_kernel-.Lpush_edx
98 .byte 0x0e /* DW_CFA_def_cfa_offset */
99 .byte 0x10 /* RA at offset 16 now */
100 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
101 /* Finally the epilogue. */
102 .byte 0x04 /* DW_CFA_advance_loc4 */
103 .long .Lpop_ebp-.Lenter_kernel
104 .byte 0x0e /* DW_CFA_def_cfa_offset */
105 .byte 0x0c /* RA at offset 12 now */
106 .byte 0xc5 /* DW_CFA_restore %ebp */
107 .byte 0x04 /* DW_CFA_advance_loc4 */
108 .long .Lpop_edx-.Lpop_ebp
109 .byte 0x0e /* DW_CFA_def_cfa_offset */
110 .byte 0x08 /* RA at offset 8 now */
111 .byte 0x04 /* DW_CFA_advance_loc4 */
112 .long .Lpop_ecx-.Lpop_edx
113 .byte 0x0e /* DW_CFA_def_cfa_offset */
114 .byte 0x04 /* RA at offset 4 now */
115 .align 4
116.LENDFDEDLSI:
117 .previous
118
119/*
120 * Get the common code for the sigreturn entry points.
121 */
122#include "vsyscall-sigreturn_32.S"
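Both the int $0x80 and sysenter flavours of this page export the same __kernel_vsyscall entry, and the kernel picks one at boot. A minimal user-space sketch of calling it, assuming a 32-bit process; getauxval() is a much later glibc helper used here purely for illustration, and error handling is omitted:

	/* Hedged sketch: issue getpid() through __kernel_vsyscall, found
	 * via the AT_SYSINFO entry of the ELF auxiliary vector. */
	#include <sys/auxv.h>
	#include <asm/unistd.h>

	static long vsyscall_getpid(void)
	{
		long entry = (long)getauxval(AT_SYSINFO);
		long ret;

		/* Syscall number goes in %eax, just as for int $0x80;
		 * getpid() takes no arguments. */
		asm volatile("call *%1"
			     : "=a" (ret)
			     : "r" (entry), "a" (__NR_getpid)
			     : "memory");
		return ret;
	}

Because the stub saves and restores %ecx, %edx and %ebp itself (see the comment at the top of this file), the caller needs no extra register clobbers for them.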
diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S
new file mode 100644
index 000000000000..a5ab3dc4fd25
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_32.S
@@ -0,0 +1,15 @@
1#include <linux/init.h>
2
3__INITDATA
4
5 .globl vsyscall_int80_start, vsyscall_int80_end
6vsyscall_int80_start:
7 .incbin "arch/x86/kernel/vsyscall-int80_32.so"
8vsyscall_int80_end:
9
10 .globl vsyscall_sysenter_start, vsyscall_sysenter_end
11vsyscall_sysenter_start:
12 .incbin "arch/x86/kernel/vsyscall-sysenter_32.so"
13vsyscall_sysenter_end:
14
15__FINIT
diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S
new file mode 100644
index 000000000000..4a8b0ed9b8fb
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_32.lds.S
@@ -0,0 +1,67 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address, and with only one read-only
4 * segment (that fits in one page). This script controls its layout.
5 */
6#include <asm/asm-offsets.h>
7
8SECTIONS
9{
10 . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
11
12 .hash : { *(.hash) } :text
13 .gnu.hash : { *(.gnu.hash) }
14 .dynsym : { *(.dynsym) }
15 .dynstr : { *(.dynstr) }
16 .gnu.version : { *(.gnu.version) }
17 .gnu.version_d : { *(.gnu.version_d) }
18 .gnu.version_r : { *(.gnu.version_r) }
19
20 /* This linker script is used both with -r and with -shared.
21 For the layouts to match, we need to skip more than enough
22 space for the dynamic symbol table et al. If this amount
23 is insufficient, ld -shared will barf. Just increase it here. */
24 . = VDSO_PRELINK_asm + 0x400;
25
26 .text : { *(.text) } :text =0x90909090
27 .note : { *(.note.*) } :text :note
28 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
29 .eh_frame : { KEEP (*(.eh_frame)) } :text
30 .dynamic : { *(.dynamic) } :text :dynamic
31 .useless : {
32 *(.got.plt) *(.got)
33 *(.data .data.* .gnu.linkonce.d.*)
34 *(.dynbss)
35 *(.bss .bss.* .gnu.linkonce.b.*)
36 } :text
37}
38
39/*
40 * We must supply the ELF program headers explicitly to get just one
41 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
42 */
43PHDRS
44{
45 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
46 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
47 note PT_NOTE FLAGS(4); /* PF_R */
48 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
49}
50
51/*
52 * This controls what symbols we export from the DSO.
53 */
54VERSION
55{
56 LINUX_2.5 {
57 global:
58 __kernel_vsyscall;
59 __kernel_sigreturn;
60 __kernel_rt_sigreturn;
61
62 local: *;
63 };
64}
65
66/* The ELF entry point can be used to set the AT_SYSINFO value. */
67ENTRY(__kernel_vsyscall);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
new file mode 100644
index 000000000000..06c34949bfdc
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -0,0 +1,349 @@
1/*
2 * linux/arch/x86_64/kernel/vsyscall.c
3 *
4 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright 2003 Andi Kleen, SuSE Labs.
6 *
7 * Thanks to hpa@transmeta.com for some useful hints.
8 * Special thanks to Ingo Molnar for his early experience with
9 * a different vsyscall implementation for Linux/IA32 and for the name.
10 *
11 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
12 * at virtual address -10Mbyte+1024bytes etc. There are at most 4
13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
14 * jumping out of line if necessary. We cannot add more with this
15 * mechanism because older kernels won't return -ENOSYS.
16 * If we want more than four we need a vDSO.
17 *
18 * Note: the concept clashes with user mode linux. If you use UML and
19 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
20 */
21
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/timer.h>
26#include <linux/seqlock.h>
27#include <linux/jiffies.h>
28#include <linux/sysctl.h>
29#include <linux/clocksource.h>
30#include <linux/getcpu.h>
31#include <linux/cpu.h>
32#include <linux/smp.h>
33#include <linux/notifier.h>
34
35#include <asm/vsyscall.h>
36#include <asm/pgtable.h>
37#include <asm/page.h>
38#include <asm/unistd.h>
39#include <asm/fixmap.h>
40#include <asm/errno.h>
41#include <asm/io.h>
42#include <asm/segment.h>
43#include <asm/desc.h>
44#include <asm/topology.h>
45#include <asm/vgtod.h>
46
47#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
48#define __syscall_clobber "r11","rcx","memory"
49#define __pa_vsymbol(x) \
50 ({unsigned long v; \
51 extern char __vsyscall_0; \
52 asm("" : "=r" (v) : "0" (x)); \
53 ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
54
55/*
56 * vsyscall_gtod_data contains data that is:
57 * - read-only from vsyscalls
58 * - written by the timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
59 * Try to keep this structure as small as possible to avoid cache-line ping-pong.
60 */
61int __vgetcpu_mode __section_vgetcpu_mode;
62
63struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
64{
65 .lock = SEQLOCK_UNLOCKED,
66 .sysctl_enabled = 1,
67};
68
69void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
70{
71 unsigned long flags;
72
73 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
74 /* copy vsyscall data */
75 vsyscall_gtod_data.clock.vread = clock->vread;
76 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
77 vsyscall_gtod_data.clock.mask = clock->mask;
78 vsyscall_gtod_data.clock.mult = clock->mult;
79 vsyscall_gtod_data.clock.shift = clock->shift;
80 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
81 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
82 vsyscall_gtod_data.sys_tz = sys_tz;
83 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
84 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
85 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
86}
87
88/* RED-PEN may want to re-add seq locking, but then the variable should be
89 * write-once.
90 */
91static __always_inline void do_get_tz(struct timezone * tz)
92{
93 *tz = __vsyscall_gtod_data.sys_tz;
94}
95
96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
97{
98 int ret;
99 asm volatile("vsysc2: syscall"
100 : "=a" (ret)
101 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
102 : __syscall_clobber );
103 return ret;
104}
105
106static __always_inline long time_syscall(long *t)
107{
108 long secs;
109 asm volatile("vsysc1: syscall"
110 : "=a" (secs)
111 : "0" (__NR_time),"D" (t) : __syscall_clobber);
112 return secs;
113}
114
115static __always_inline void do_vgettimeofday(struct timeval * tv)
116{
117 cycle_t now, base, mask, cycle_delta;
118 unsigned seq;
119 unsigned long mult, shift, nsec;
120 cycle_t (*vread)(void);
121 do {
122 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
123
124 vread = __vsyscall_gtod_data.clock.vread;
125 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
126 gettimeofday(tv,NULL);
127 return;
128 }
129 now = vread();
130 base = __vsyscall_gtod_data.clock.cycle_last;
131 mask = __vsyscall_gtod_data.clock.mask;
132 mult = __vsyscall_gtod_data.clock.mult;
133 shift = __vsyscall_gtod_data.clock.shift;
134
135 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
136 nsec = __vsyscall_gtod_data.wall_time_nsec;
137 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
138
139 /* calculate interval: */
140 cycle_delta = (now - base) & mask;
141 /* convert to nsecs: */
142 nsec += (cycle_delta * mult) >> shift;
143
144 while (nsec >= NSEC_PER_SEC) {
145 tv->tv_sec += 1;
146 nsec -= NSEC_PER_SEC;
147 }
148 tv->tv_usec = nsec / NSEC_PER_USEC;
149}
150
151int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
152{
153 if (tv)
154 do_vgettimeofday(tv);
155 if (tz)
156 do_get_tz(tz);
157 return 0;
158}
159
160/* This will break when the xtime seconds get inaccurate, but that is
161 * unlikely */
162time_t __vsyscall(1) vtime(time_t *t)
163{
164 struct timeval tv;
165 time_t result;
166 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
167 return time_syscall(t);
168
169 vgettimeofday(&tv, 0);
170 result = tv.tv_sec;
171 if (t)
172 *t = result;
173 return result;
174}
175
176/* Fast way to get the current CPU and node.
177   This helps to do per-node and per-CPU caches in user space.
178   The result is not guaranteed without CPU affinity, but it usually
179   works out because the scheduler tries to keep a thread on the same
180   CPU.
181
182   tcache must point to a two-element array of longs.
183   All arguments can be NULL. */
184long __vsyscall(2)
185vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
186{
187 unsigned int dummy, p;
188 unsigned long j = 0;
189
190	/* Fast cache - only recompute the value once per jiffy and avoid
191	   the relatively costly rdtscp/cpuid otherwise.
192	   This works because the scheduler usually keeps the process
193	   on the same CPU and this syscall doesn't guarantee its
194	   results anyway.
195	   We do this here because otherwise user space would do it on
196	   its own in a likely inferior way (no access to jiffies).
197	   If you don't like it, pass NULL. */
198 if (tcache && tcache->blob[0] == (j = __jiffies)) {
199 p = tcache->blob[1];
200 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
201 /* Load per CPU data from RDTSCP */
202 rdtscp(dummy, dummy, p);
203 } else {
204 /* Load per CPU data from GDT */
205 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
206 }
207 if (tcache) {
208 tcache->blob[0] = j;
209 tcache->blob[1] = p;
210 }
211 if (cpu)
212 *cpu = p & 0xfff;
213 if (node)
214 *node = p >> 12;
215 return 0;
216}
217
218long __vsyscall(3) venosys_1(void)
219{
220 return -ENOSYS;
221}
222
223#ifdef CONFIG_SYSCTL
224
225#define SYSCALL 0x050f
226#define NOP2 0x9090
227
228/*
229 * NOP out syscall in vsyscall page when not needed.
230 */
231static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
232 void __user *buffer, size_t *lenp, loff_t *ppos)
233{
234 extern u16 vsysc1, vsysc2;
235 u16 __iomem *map1;
236 u16 __iomem *map2;
237 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
238 if (!write)
239 return ret;
240 /* gcc has some trouble with __va(__pa()), so just do it this
241 way. */
242 map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
243 if (!map1)
244 return -ENOMEM;
245 map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
246 if (!map2) {
247 ret = -ENOMEM;
248 goto out;
249 }
250 if (!vsyscall_gtod_data.sysctl_enabled) {
251 writew(SYSCALL, map1);
252 writew(SYSCALL, map2);
253 } else {
254 writew(NOP2, map1);
255 writew(NOP2, map2);
256 }
257 iounmap(map2);
258out:
259 iounmap(map1);
260 return ret;
261}
262
263static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
264 void __user *oldval, size_t __user *oldlenp,
265 void __user *newval, size_t newlen)
266{
267 return -ENOSYS;
268}
269
270static ctl_table kernel_table2[] = {
271 { .ctl_name = 99, .procname = "vsyscall64",
272 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
273 .mode = 0644,
274 .strategy = vsyscall_sysctl_nostrat,
275 .proc_handler = vsyscall_sysctl_change },
276 {}
277};
278
279static ctl_table kernel_root_table2[] = {
280 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
281 .child = kernel_table2 },
282 {}
283};
284
285#endif
286
287/* Assume __initcall executes before all user space. Hopefully kmod
288 doesn't violate that. We'll find out if it does. */
289static void __cpuinit vsyscall_set_cpu(int cpu)
290{
291 unsigned long *d;
292 unsigned long node = 0;
293#ifdef CONFIG_NUMA
294 node = cpu_to_node[cpu];
295#endif
296 if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
297 write_rdtscp_aux((node << 12) | cpu);
298
299 /* Store cpu number in limit so that it can be loaded quickly
300 in user space in vgetcpu.
301 12 bits for the CPU and 8 bits for the node. */
302 d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
303 *d = 0x0f40000000000ULL;
304 *d |= cpu;
305 *d |= (node & 0xf) << 12;
306 *d |= (node >> 4) << 48;
307}
308
309static void __cpuinit cpu_vsyscall_init(void *arg)
310{
311 /* preemption should be already off */
312 vsyscall_set_cpu(raw_smp_processor_id());
313}
314
315static int __cpuinit
316cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
317{
318 long cpu = (long)arg;
319 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
320 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
321 return NOTIFY_DONE;
322}
323
324static void __init map_vsyscall(void)
325{
326 extern char __vsyscall_0;
327 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
328
329 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
330 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
331}
332
333static int __init vsyscall_init(void)
334{
335 BUG_ON(((unsigned long) &vgettimeofday !=
336 VSYSCALL_ADDR(__NR_vgettimeofday)));
337 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
338 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
339 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
340 map_vsyscall();
341#ifdef CONFIG_SYSCTL
342 register_sysctl_table(kernel_root_table2);
343#endif
344 on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
345 hotcpu_notifier(cpu_vsyscall_notifier, 0);
346 return 0;
347}
348
349__initcall(vsyscall_init);
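vgetcpu() above lives at a fixed address in every 64-bit process, so user space can call it without any setup. A minimal sketch, assuming the traditional layout from the header comment (vsyscall page at -10 MB, 1024 bytes per slot, so slot 2 sits at 0xffffffffff600800; that address is derived from the comment, not quoted from this patch, and the tcache argument is simply skipped):

	/* Hedged sketch: ask the vsyscall page which CPU and NUMA node
	 * this thread is currently running on. */
	#include <stdio.h>

	typedef long (*vgetcpu_t)(unsigned *cpu, unsigned *node, void *tcache);

	int main(void)
	{
		vgetcpu_t vgetcpu = (vgetcpu_t)0xffffffffff600800UL;
		unsigned cpu, node;

		vgetcpu(&cpu, &node, NULL);
		printf("cpu %u, node %u\n", cpu, node);
		return 0;
	}

The result is only a hint: as the comment above the function says, nothing pins the thread to that CPU after the call returns.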
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
new file mode 100644
index 000000000000..77c25b307635
--- /dev/null
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -0,0 +1,62 @@
1/* Exports for assembly files.
2 All C exports should go in the respective C files. */
3
4#include <linux/module.h>
5#include <linux/smp.h>
6
7#include <asm/semaphore.h>
8#include <asm/processor.h>
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11
12EXPORT_SYMBOL(kernel_thread);
13
14EXPORT_SYMBOL(__down_failed);
15EXPORT_SYMBOL(__down_failed_interruptible);
16EXPORT_SYMBOL(__down_failed_trylock);
17EXPORT_SYMBOL(__up_wakeup);
18
19EXPORT_SYMBOL(__get_user_1);
20EXPORT_SYMBOL(__get_user_2);
21EXPORT_SYMBOL(__get_user_4);
22EXPORT_SYMBOL(__get_user_8);
23EXPORT_SYMBOL(__put_user_1);
24EXPORT_SYMBOL(__put_user_2);
25EXPORT_SYMBOL(__put_user_4);
26EXPORT_SYMBOL(__put_user_8);
27
28EXPORT_SYMBOL(copy_user_generic);
29EXPORT_SYMBOL(__copy_user_nocache);
30EXPORT_SYMBOL(copy_from_user);
31EXPORT_SYMBOL(copy_to_user);
32EXPORT_SYMBOL(__copy_from_user_inatomic);
33
34EXPORT_SYMBOL(copy_page);
35EXPORT_SYMBOL(clear_page);
36
37#ifdef CONFIG_SMP
38extern void __write_lock_failed(rwlock_t *rw);
39extern void __read_lock_failed(rwlock_t *rw);
40EXPORT_SYMBOL(__write_lock_failed);
41EXPORT_SYMBOL(__read_lock_failed);
42#endif
43
44/* Export string functions. We normally rely on gcc builtins for most of these,
45 but gcc sometimes decides not to inline them. */
46#undef memcpy
47#undef memset
48#undef memmove
49
50extern void * memset(void *,int,__kernel_size_t);
51extern void * memcpy(void *,const void *,__kernel_size_t);
52extern void * __memcpy(void *,const void *,__kernel_size_t);
53
54EXPORT_SYMBOL(memset);
55EXPORT_SYMBOL(memcpy);
56EXPORT_SYMBOL(__memcpy);
57
58EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index);
61
62EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
new file mode 100644
index 000000000000..329da276c6f1
--- /dev/null
+++ b/arch/x86/lib/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/lib/Makefile_32
3else
4include ${srctree}/arch/x86/lib/Makefile_64
5endif
diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32
new file mode 100644
index 000000000000..98d1f1e2e2ef
--- /dev/null
+++ b/arch/x86/lib/Makefile_32
@@ -0,0 +1,11 @@
1#
2# Makefile for i386-specific library files.
3#
4
5
6lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \
7 bitops_32.o semaphore_32.o string_32.o
8
9lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
10
11obj-$(CONFIG_SMP) += msr-on-cpu.o
diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64
new file mode 100644
index 000000000000..bbabad3c9335
--- /dev/null
+++ b/arch/x86/lib/Makefile_64
@@ -0,0 +1,13 @@
1#
2# Makefile for x86_64-specific library files.
3#
4
5CFLAGS_csum-partial_64.o := -funroll-loops
6
7obj-y := io_64.o iomap_copy_64.o
8obj-$(CONFIG_SMP) += msr-on-cpu.o
9
10lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
11 usercopy_64.o getuser_64.o putuser_64.o \
12 thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
13lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c
new file mode 100644
index 000000000000..afd0045595d4
--- /dev/null
+++ b/arch/x86/lib/bitops_32.c
@@ -0,0 +1,70 @@
1#include <linux/bitops.h>
2#include <linux/module.h>
3
4/**
5 * find_next_bit - find the next set bit in a memory region
6 * @addr: The address to base the search on
7 * @offset: The bitnumber to start searching at
8 * @size: The maximum size to search
9 */
10int find_next_bit(const unsigned long *addr, int size, int offset)
11{
12 const unsigned long *p = addr + (offset >> 5);
13 int set = 0, bit = offset & 31, res;
14
15 if (bit) {
16 /*
17 * Look for nonzero in the first 32 bits:
18 */
19 __asm__("bsfl %1,%0\n\t"
20 "jne 1f\n\t"
21 "movl $32, %0\n"
22 "1:"
23 : "=r" (set)
24 : "r" (*p >> bit));
25 if (set < (32 - bit))
26 return set + offset;
27 set = 32 - bit;
28 p++;
29 }
30 /*
31 * No set bit yet, search remaining full words for a bit
32 */
33 res = find_first_bit (p, size - 32 * (p - addr));
34 return (offset + set + res);
35}
36EXPORT_SYMBOL(find_next_bit);
37
38/**
39 * find_next_zero_bit - find the next zero bit in a memory region
40 * @addr: The address to base the search on
41 * @offset: The bitnumber to start searching at
42 * @size: The maximum size to search
43 */
44int find_next_zero_bit(const unsigned long *addr, int size, int offset)
45{
46 const unsigned long *p = addr + (offset >> 5);
47 int set = 0, bit = offset & 31, res;
48
49 if (bit) {
50 /*
51 * Look for zero in the first 32 bits.
52 */
53 __asm__("bsfl %1,%0\n\t"
54 "jne 1f\n\t"
55 "movl $32, %0\n"
56 "1:"
57 : "=r" (set)
58 : "r" (~(*p >> bit)));
59 if (set < (32 - bit))
60 return set + offset;
61 set = 32 - bit;
62 p++;
63 }
64 /*
65 * No zero yet, search remaining full words for a zero
66 */
67 res = find_first_zero_bit(p, size - 32 * (p - addr));
68 return (offset + set + res);
69}
70EXPORT_SYMBOL(find_next_zero_bit);
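find_next_bit() and find_next_zero_bit() above return the index of the next matching bit, or a value greater than or equal to size when there is none, so callers usually drive them from a simple scan loop. A minimal sketch (the helper name is made up; this tree has no for_each_set_bit wrapper):

	#include <linux/kernel.h>
	#include <linux/bitops.h>

	/* Print every set bit of a bitmap using the routine above. */
	static void dump_set_bits(const unsigned long *map, int size)
	{
		int bit;

		for (bit = find_next_bit(map, size, 0);
		     bit < size;
		     bit = find_next_bit(map, size, bit + 1))
			printk(KERN_DEBUG "bit %d is set\n", bit);
	}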
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
new file mode 100644
index 000000000000..95b6d9639fba
--- /dev/null
+++ b/arch/x86/lib/bitops_64.c
@@ -0,0 +1,175 @@
1#include <linux/bitops.h>
2
3#undef find_first_zero_bit
4#undef find_next_zero_bit
5#undef find_first_bit
6#undef find_next_bit
7
8static inline long
9__find_first_zero_bit(const unsigned long * addr, unsigned long size)
10{
11 long d0, d1, d2;
12 long res;
13
14 /*
15 * We must test the size in words, not in bits, because
16 * otherwise incoming sizes in the range -63..-1 will not run
17 * any scasq instructions, and then the flags used by the je
18 * instruction will have whatever random value was in place
19 * before. Nobody should call us like that, but
20 * find_next_zero_bit() does when offset and size are at the
21 * same word and it fails to find a zero itself.
22 */
23 size += 63;
24 size >>= 6;
25 if (!size)
26 return 0;
27 asm volatile(
28 " repe; scasq\n"
29 " je 1f\n"
30 " xorq -8(%%rdi),%%rax\n"
31 " subq $8,%%rdi\n"
32 " bsfq %%rax,%%rdx\n"
33 "1: subq %[addr],%%rdi\n"
34 " shlq $3,%%rdi\n"
35 " addq %%rdi,%%rdx"
36 :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
37 :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
38 [addr] "S" (addr) : "memory");
39 /*
40 * Any register would do for [addr] above, but GCC tends to
41 * prefer rbx over rsi, even though rsi is readily available
42 * and doesn't have to be saved.
43 */
44 return res;
45}
46
47/**
48 * find_first_zero_bit - find the first zero bit in a memory region
49 * @addr: The address to start the search at
50 * @size: The maximum size to search
51 *
52 * Returns the bit-number of the first zero bit, not the number of the byte
53 * containing a bit.
54 */
55long find_first_zero_bit(const unsigned long * addr, unsigned long size)
56{
57 return __find_first_zero_bit (addr, size);
58}
59
60/**
61 * find_next_zero_bit - find the next zero bit in a memory region
62 * @addr: The address to base the search on
63 * @offset: The bitnumber to start searching at
64 * @size: The maximum size to search
65 */
66long find_next_zero_bit (const unsigned long * addr, long size, long offset)
67{
68 const unsigned long * p = addr + (offset >> 6);
69 unsigned long set = 0;
70 unsigned long res, bit = offset&63;
71
72 if (bit) {
73 /*
74 * Look for zero in first word
75 */
76 asm("bsfq %1,%0\n\t"
77 "cmoveq %2,%0"
78 : "=r" (set)
79 : "r" (~(*p >> bit)), "r"(64L));
80 if (set < (64 - bit))
81 return set + offset;
82 set = 64 - bit;
83 p++;
84 }
85 /*
86 * No zero yet, search remaining full words for a zero
87 */
88 res = __find_first_zero_bit (p, size - 64 * (p - addr));
89
90 return (offset + set + res);
91}
92
93static inline long
94__find_first_bit(const unsigned long * addr, unsigned long size)
95{
96 long d0, d1;
97 long res;
98
99 /*
100 * We must test the size in words, not in bits, because
101 * otherwise incoming sizes in the range -63..-1 will not run
102 * any scasq instructions, and then the flags used by the jz
103 * instruction will have whatever random value was in place
104 * before. Nobody should call us like that, but
105 * find_next_bit() does when offset and size are at the same
106 * word and it fails to find a one itself.
107 */
108 size += 63;
109 size >>= 6;
110 if (!size)
111 return 0;
112 asm volatile(
113 " repe; scasq\n"
114 " jz 1f\n"
115 " subq $8,%%rdi\n"
116 " bsfq (%%rdi),%%rax\n"
117 "1: subq %[addr],%%rdi\n"
118 " shlq $3,%%rdi\n"
119 " addq %%rdi,%%rax"
120 :"=a" (res), "=&c" (d0), "=&D" (d1)
121 :"0" (0ULL), "1" (size), "2" (addr),
122 [addr] "r" (addr) : "memory");
123 return res;
124}
125
126/**
127 * find_first_bit - find the first set bit in a memory region
128 * @addr: The address to start the search at
129 * @size: The maximum size to search
130 *
131 * Returns the bit-number of the first set bit, not the number of the byte
132 * containing a bit.
133 */
134long find_first_bit(const unsigned long * addr, unsigned long size)
135{
136 return __find_first_bit(addr,size);
137}
138
139/**
140 * find_next_bit - find the next set bit in a memory region
141 * @addr: The address to base the search on
142 * @offset: The bitnumber to start searching at
143 * @size: The maximum size to search
144 */
145long find_next_bit(const unsigned long * addr, long size, long offset)
146{
147 const unsigned long * p = addr + (offset >> 6);
148 unsigned long set = 0, bit = offset & 63, res;
149
150 if (bit) {
151 /*
152 * Look for nonzero in the first 64 bits:
153 */
154 asm("bsfq %1,%0\n\t"
155 "cmoveq %2,%0\n\t"
156 : "=r" (set)
157 : "r" (*p >> bit), "r" (64L));
158 if (set < (64 - bit))
159 return set + offset;
160 set = 64 - bit;
161 p++;
162 }
163 /*
164 * No set bit yet, search remaining full words for a bit
165 */
166 res = __find_first_bit (p, size - 64 * (p - addr));
167 return (offset + set + res);
168}
169
170#include <linux/module.h>
171
172EXPORT_SYMBOL(find_next_bit);
173EXPORT_SYMBOL(find_first_bit);
174EXPORT_SYMBOL(find_first_zero_bit);
175EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c
new file mode 100644
index 000000000000..24676609a6ac
--- /dev/null
+++ b/arch/x86/lib/bitstr_64.c
@@ -0,0 +1,28 @@
1#include <linux/module.h>
2#include <linux/bitops.h>
3
4/* Find string of zero bits in a bitmap */
5unsigned long
6find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
7{
8 unsigned long n, end, i;
9
10 again:
11 n = find_next_zero_bit(bitmap, nbits, start);
12 if (n == -1)
13 return -1;
14
15 /* could test bitsliced, but it's hardly worth it */
16 end = n+len;
17 if (end >= nbits)
18 return -1;
19 for (i = n+1; i < end; i++) {
20 if (test_bit(i, bitmap)) {
21 start = i+1;
22 goto again;
23 }
24 }
25 return n;
26}
27
28EXPORT_SYMBOL(find_next_zero_string);
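find_next_zero_string() above returns the start of a run of len clear bits below nbits, or -1 when no such run exists, which makes it the core of simple range allocators. A hedged sketch of that pattern (the function name and the locking policy are illustrative only):

	#include <linux/bitops.h>

	/* Reserve a contiguous run of 'len' slots in an allocation bitmap.
	 * Returns the first slot, or -1 if no run is free.  The caller is
	 * assumed to hold whatever lock protects the bitmap. */
	static long alloc_slot_range(unsigned long *bitmap, long nbits, int len)
	{
		long start, i;

		start = find_next_zero_string(bitmap, 0, nbits, len);
		if (start == -1)
			return -1;
		for (i = start; i < start + len; i++)
			__set_bit(i, bitmap);
		return start;
	}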
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
new file mode 100644
index 000000000000..adbccd0bbb78
--- /dev/null
+++ b/arch/x86/lib/checksum_32.S
@@ -0,0 +1,546 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IP/TCP/UDP checksumming routines
7 *
8 * Authors: Jorge Cwik, <jorge@laser.satlink.net>
9 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
10 * Tom May, <ftom@netcom.com>
11 * Pentium Pro/II routines:
12 * Alexander Kjeldaas <astor@guardian.no>
13 * Finn Arne Gangstad <finnag@guardian.no>
14 * Lots of code moved from tcp.c and ip.c; see those files
15 * for more names.
16 *
17 * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
18 * handling.
19 * Andi Kleen, add zeroing on error
20 * converted to pure assembler
21 *
22 * This program is free software; you can redistribute it and/or
23 * modify it under the terms of the GNU General Public License
24 * as published by the Free Software Foundation; either version
25 * 2 of the License, or (at your option) any later version.
26 */
27
28#include <linux/linkage.h>
29#include <asm/dwarf2.h>
30#include <asm/errno.h>
31
32/*
33 * computes a partial checksum, e.g. for TCP/UDP fragments
34 */
35
36/*
37unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
38 */
39
40.text
41
42#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
43
44 /*
45 * Experiments with Ethernet and SLIP connections show that buff
46 * is aligned on either a 2-byte or 4-byte boundary. We get at
47 * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
48 * Fortunately, it is easy to convert 2-byte alignment to 4-byte
49 * alignment for the unrolled loop.
50 */
51ENTRY(csum_partial)
52 CFI_STARTPROC
53 pushl %esi
54 CFI_ADJUST_CFA_OFFSET 4
55 CFI_REL_OFFSET esi, 0
56 pushl %ebx
57 CFI_ADJUST_CFA_OFFSET 4
58 CFI_REL_OFFSET ebx, 0
59 movl 20(%esp),%eax # Function arg: unsigned int sum
60 movl 16(%esp),%ecx # Function arg: int len
61 movl 12(%esp),%esi # Function arg: unsigned char *buff
62 testl $3, %esi # Check alignment.
63 jz 2f # Jump if alignment is ok.
64 testl $1, %esi # Check alignment.
65 jz 10f # Jump if alignment is boundary of 2bytes.
66
67 # buf is odd
68 dec %ecx
69 jl 8f
70 movzbl (%esi), %ebx
71 adcl %ebx, %eax
72 roll $8, %eax
73 inc %esi
74 testl $2, %esi
75 jz 2f
7610:
77 subl $2, %ecx # Alignment uses up two bytes.
78 jae 1f # Jump if we had at least two bytes.
79 addl $2, %ecx # ecx was < 2. Deal with it.
80 jmp 4f
811: movw (%esi), %bx
82 addl $2, %esi
83 addw %bx, %ax
84 adcl $0, %eax
852:
86 movl %ecx, %edx
87 shrl $5, %ecx
88 jz 2f
89 testl %esi, %esi
901: movl (%esi), %ebx
91 adcl %ebx, %eax
92 movl 4(%esi), %ebx
93 adcl %ebx, %eax
94 movl 8(%esi), %ebx
95 adcl %ebx, %eax
96 movl 12(%esi), %ebx
97 adcl %ebx, %eax
98 movl 16(%esi), %ebx
99 adcl %ebx, %eax
100 movl 20(%esi), %ebx
101 adcl %ebx, %eax
102 movl 24(%esi), %ebx
103 adcl %ebx, %eax
104 movl 28(%esi), %ebx
105 adcl %ebx, %eax
106 lea 32(%esi), %esi
107 dec %ecx
108 jne 1b
109 adcl $0, %eax
1102: movl %edx, %ecx
111 andl $0x1c, %edx
112 je 4f
113 shrl $2, %edx # This clears CF
1143: adcl (%esi), %eax
115 lea 4(%esi), %esi
116 dec %edx
117 jne 3b
118 adcl $0, %eax
1194: andl $3, %ecx
120 jz 7f
121 cmpl $2, %ecx
122 jb 5f
123 movw (%esi),%cx
124 leal 2(%esi),%esi
125 je 6f
126 shll $16,%ecx
1275: movb (%esi),%cl
1286: addl %ecx,%eax
129 adcl $0, %eax
1307:
131 testl $1, 12(%esp)
132 jz 8f
133 roll $8, %eax
1348:
135 popl %ebx
136 CFI_ADJUST_CFA_OFFSET -4
137 CFI_RESTORE ebx
138 popl %esi
139 CFI_ADJUST_CFA_OFFSET -4
140 CFI_RESTORE esi
141 ret
142 CFI_ENDPROC
143ENDPROC(csum_partial)
144
145#else
146
147/* Version for PentiumII/PPro */
148
149ENTRY(csum_partial)
150 CFI_STARTPROC
151 pushl %esi
152 CFI_ADJUST_CFA_OFFSET 4
153 CFI_REL_OFFSET esi, 0
154 pushl %ebx
155 CFI_ADJUST_CFA_OFFSET 4
156 CFI_REL_OFFSET ebx, 0
157 movl 20(%esp),%eax # Function arg: unsigned int sum
158 movl 16(%esp),%ecx # Function arg: int len
159 movl 12(%esp),%esi # Function arg: const unsigned char *buf
160
161 testl $3, %esi
162 jnz 25f
16310:
164 movl %ecx, %edx
165 movl %ecx, %ebx
166 andl $0x7c, %ebx
167 shrl $7, %ecx
168 addl %ebx,%esi
169 shrl $2, %ebx
170 negl %ebx
171 lea 45f(%ebx,%ebx,2), %ebx
172 testl %esi, %esi
173 jmp *%ebx
174
175 # Handle 2-byte-aligned regions
17620: addw (%esi), %ax
177 lea 2(%esi), %esi
178 adcl $0, %eax
179 jmp 10b
18025:
181 testl $1, %esi
182 jz 30f
183 # buf is odd
184 dec %ecx
185 jl 90f
186 movzbl (%esi), %ebx
187 addl %ebx, %eax
188 adcl $0, %eax
189 roll $8, %eax
190 inc %esi
191 testl $2, %esi
192 jz 10b
193
19430: subl $2, %ecx
195 ja 20b
196 je 32f
197 addl $2, %ecx
198 jz 80f
199 movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
200 addl %ebx, %eax
201 adcl $0, %eax
202 jmp 80f
20332:
204 addw (%esi), %ax # csumming 2 bytes, 2-aligned
205 adcl $0, %eax
206 jmp 80f
207
20840:
209 addl -128(%esi), %eax
210 adcl -124(%esi), %eax
211 adcl -120(%esi), %eax
212 adcl -116(%esi), %eax
213 adcl -112(%esi), %eax
214 adcl -108(%esi), %eax
215 adcl -104(%esi), %eax
216 adcl -100(%esi), %eax
217 adcl -96(%esi), %eax
218 adcl -92(%esi), %eax
219 adcl -88(%esi), %eax
220 adcl -84(%esi), %eax
221 adcl -80(%esi), %eax
222 adcl -76(%esi), %eax
223 adcl -72(%esi), %eax
224 adcl -68(%esi), %eax
225 adcl -64(%esi), %eax
226 adcl -60(%esi), %eax
227 adcl -56(%esi), %eax
228 adcl -52(%esi), %eax
229 adcl -48(%esi), %eax
230 adcl -44(%esi), %eax
231 adcl -40(%esi), %eax
232 adcl -36(%esi), %eax
233 adcl -32(%esi), %eax
234 adcl -28(%esi), %eax
235 adcl -24(%esi), %eax
236 adcl -20(%esi), %eax
237 adcl -16(%esi), %eax
238 adcl -12(%esi), %eax
239 adcl -8(%esi), %eax
240 adcl -4(%esi), %eax
24145:
242 lea 128(%esi), %esi
243 adcl $0, %eax
244 dec %ecx
245 jge 40b
246 movl %edx, %ecx
24750: andl $3, %ecx
248 jz 80f
249
250 # Handle the last 1-3 bytes without jumping
251 notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
252 movl $0xffffff,%ebx # by the shll and shrl instructions
253 shll $3,%ecx
254 shrl %cl,%ebx
255 andl -128(%esi),%ebx # esi is 4-aligned so should be ok
256 addl %ebx,%eax
257 adcl $0,%eax
25880:
259 testl $1, 12(%esp)
260 jz 90f
261 roll $8, %eax
26290:
263 popl %ebx
264 CFI_ADJUST_CFA_OFFSET -4
265 CFI_RESTORE ebx
266 popl %esi
267 CFI_ADJUST_CFA_OFFSET -4
268 CFI_RESTORE esi
269 ret
270 CFI_ENDPROC
271ENDPROC(csum_partial)
272
273#endif
274
275/*
276unsigned int csum_partial_copy_generic (const char *src, char *dst,
277 int len, int sum, int *src_err_ptr, int *dst_err_ptr)
278 */
279
280/*
281 * Copy from ds while checksumming, otherwise like csum_partial
282 *
283 * The macros SRC and DST specify the type of access for the instruction.
284 * thus we can call a custom exception handler for all access types.
285 *
286 * FIXME: could someone double-check whether I haven't mixed up some SRC and
287 * DST definitions? It's damn hard to trigger all cases. I hope I got
288 * them all but there's no guarantee.
289 */
290
291#define SRC(y...) \
292 9999: y; \
293 .section __ex_table, "a"; \
294 .long 9999b, 6001f ; \
295 .previous
296
297#define DST(y...) \
298 9999: y; \
299 .section __ex_table, "a"; \
300 .long 9999b, 6002f ; \
301 .previous
302
303#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
304
305#define ARGBASE 16
306#define FP 12
307
308ENTRY(csum_partial_copy_generic)
309 CFI_STARTPROC
310 subl $4,%esp
311 CFI_ADJUST_CFA_OFFSET 4
312 pushl %edi
313 CFI_ADJUST_CFA_OFFSET 4
314 CFI_REL_OFFSET edi, 0
315 pushl %esi
316 CFI_ADJUST_CFA_OFFSET 4
317 CFI_REL_OFFSET esi, 0
318 pushl %ebx
319 CFI_ADJUST_CFA_OFFSET 4
320 CFI_REL_OFFSET ebx, 0
321 movl ARGBASE+16(%esp),%eax # sum
322 movl ARGBASE+12(%esp),%ecx # len
323 movl ARGBASE+4(%esp),%esi # src
324 movl ARGBASE+8(%esp),%edi # dst
325
326 testl $2, %edi # Check alignment.
327 jz 2f # Jump if alignment is ok.
328 subl $2, %ecx # Alignment uses up two bytes.
329 jae 1f # Jump if we had at least two bytes.
330 addl $2, %ecx # ecx was < 2. Deal with it.
331 jmp 4f
332SRC(1: movw (%esi), %bx )
333 addl $2, %esi
334DST( movw %bx, (%edi) )
335 addl $2, %edi
336 addw %bx, %ax
337 adcl $0, %eax
3382:
339 movl %ecx, FP(%esp)
340 shrl $5, %ecx
341 jz 2f
342 testl %esi, %esi
343SRC(1: movl (%esi), %ebx )
344SRC( movl 4(%esi), %edx )
345 adcl %ebx, %eax
346DST( movl %ebx, (%edi) )
347 adcl %edx, %eax
348DST( movl %edx, 4(%edi) )
349
350SRC( movl 8(%esi), %ebx )
351SRC( movl 12(%esi), %edx )
352 adcl %ebx, %eax
353DST( movl %ebx, 8(%edi) )
354 adcl %edx, %eax
355DST( movl %edx, 12(%edi) )
356
357SRC( movl 16(%esi), %ebx )
358SRC( movl 20(%esi), %edx )
359 adcl %ebx, %eax
360DST( movl %ebx, 16(%edi) )
361 adcl %edx, %eax
362DST( movl %edx, 20(%edi) )
363
364SRC( movl 24(%esi), %ebx )
365SRC( movl 28(%esi), %edx )
366 adcl %ebx, %eax
367DST( movl %ebx, 24(%edi) )
368 adcl %edx, %eax
369DST( movl %edx, 28(%edi) )
370
371 lea 32(%esi), %esi
372 lea 32(%edi), %edi
373 dec %ecx
374 jne 1b
375 adcl $0, %eax
3762: movl FP(%esp), %edx
377 movl %edx, %ecx
378 andl $0x1c, %edx
379 je 4f
380 shrl $2, %edx # This clears CF
381SRC(3: movl (%esi), %ebx )
382 adcl %ebx, %eax
383DST( movl %ebx, (%edi) )
384 lea 4(%esi), %esi
385 lea 4(%edi), %edi
386 dec %edx
387 jne 3b
388 adcl $0, %eax
3894: andl $3, %ecx
390 jz 7f
391 cmpl $2, %ecx
392 jb 5f
393SRC( movw (%esi), %cx )
394 leal 2(%esi), %esi
395DST( movw %cx, (%edi) )
396 leal 2(%edi), %edi
397 je 6f
398 shll $16,%ecx
399SRC(5: movb (%esi), %cl )
400DST( movb %cl, (%edi) )
4016: addl %ecx, %eax
402 adcl $0, %eax
4037:
4045000:
405
406# Exception handler:
407.section .fixup, "ax"
408
4096001:
410 movl ARGBASE+20(%esp), %ebx # src_err_ptr
411 movl $-EFAULT, (%ebx)
412
413 # zero the complete destination - computing the rest
414 # is too much work
415 movl ARGBASE+8(%esp), %edi # dst
416 movl ARGBASE+12(%esp), %ecx # len
417 xorl %eax,%eax
418 rep ; stosb
419
420 jmp 5000b
421
4226002:
423 movl ARGBASE+24(%esp), %ebx # dst_err_ptr
424 movl $-EFAULT,(%ebx)
425 jmp 5000b
426
427.previous
428
429 popl %ebx
430 CFI_ADJUST_CFA_OFFSET -4
431 CFI_RESTORE ebx
432 popl %esi
433 CFI_ADJUST_CFA_OFFSET -4
434 CFI_RESTORE esi
435 popl %edi
436 CFI_ADJUST_CFA_OFFSET -4
437 CFI_RESTORE edi
438 popl %ecx # equivalent to addl $4,%esp
439 CFI_ADJUST_CFA_OFFSET -4
440 ret
441 CFI_ENDPROC
442ENDPROC(csum_partial_copy_generic)
443
444#else
445
446/* Version for PentiumII/PPro */
447
448#define ROUND1(x) \
449 SRC(movl x(%esi), %ebx ) ; \
450 addl %ebx, %eax ; \
451 DST(movl %ebx, x(%edi) ) ;
452
453#define ROUND(x) \
454 SRC(movl x(%esi), %ebx ) ; \
455 adcl %ebx, %eax ; \
456 DST(movl %ebx, x(%edi) ) ;
457
458#define ARGBASE 12
459
460ENTRY(csum_partial_copy_generic)
461 CFI_STARTPROC
462 pushl %ebx
463 CFI_ADJUST_CFA_OFFSET 4
464 CFI_REL_OFFSET ebx, 0
465 pushl %edi
466 CFI_ADJUST_CFA_OFFSET 4
467 CFI_REL_OFFSET edi, 0
468 pushl %esi
469 CFI_ADJUST_CFA_OFFSET 4
470 CFI_REL_OFFSET esi, 0
471 movl ARGBASE+4(%esp),%esi #src
472 movl ARGBASE+8(%esp),%edi #dst
473 movl ARGBASE+12(%esp),%ecx #len
474 movl ARGBASE+16(%esp),%eax #sum
475# movl %ecx, %edx
476 movl %ecx, %ebx
477 movl %esi, %edx
478 shrl $6, %ecx
479 andl $0x3c, %ebx
480 negl %ebx
481 subl %ebx, %esi
482 subl %ebx, %edi
483 lea -1(%esi),%edx
484 andl $-32,%edx
485 lea 3f(%ebx,%ebx), %ebx
486 testl %esi, %esi
487 jmp *%ebx
4881: addl $64,%esi
489 addl $64,%edi
490 SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
491 ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
492 ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
493 ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
494 ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
4953: adcl $0,%eax
496 addl $64, %edx
497 dec %ecx
498 jge 1b
4994: movl ARGBASE+12(%esp),%edx #len
500 andl $3, %edx
501 jz 7f
502 cmpl $2, %edx
503 jb 5f
504SRC( movw (%esi), %dx )
505 leal 2(%esi), %esi
506DST( movw %dx, (%edi) )
507 leal 2(%edi), %edi
508 je 6f
509 shll $16,%edx
5105:
511SRC( movb (%esi), %dl )
512DST( movb %dl, (%edi) )
5136: addl %edx, %eax
514 adcl $0, %eax
5157:
516.section .fixup, "ax"
5176001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
518 movl $-EFAULT, (%ebx)
519 # zero the complete destination (computing the rest is too much work)
520 movl ARGBASE+8(%esp),%edi # dst
521 movl ARGBASE+12(%esp),%ecx # len
522 xorl %eax,%eax
523 rep; stosb
524 jmp 7b
5256002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
526 movl $-EFAULT, (%ebx)
527 jmp 7b
528.previous
529
530 popl %esi
531 CFI_ADJUST_CFA_OFFSET -4
532 CFI_RESTORE esi
533 popl %edi
534 CFI_ADJUST_CFA_OFFSET -4
535 CFI_RESTORE edi
536 popl %ebx
537 CFI_ADJUST_CFA_OFFSET -4
538 CFI_RESTORE ebx
539 ret
540 CFI_ENDPROC
541ENDPROC(csum_partial_copy_generic)
542
543#undef ROUND
544#undef ROUND1
545
546#endif
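csum_partial() and csum_partial_copy_generic() above produce a 32-bit folded-carry partial sum; callers finish it with csum_fold() from <asm/checksum.h> to get the 16-bit ones-complement Internet checksum. A minimal sketch (sparse __wsum/__sum16 annotations are elided for brevity):

	#include <net/checksum.h>

	/* Checksum a flat buffer with the assembly csum_partial() above
	 * and fold it down to the 16-bit value stored in IP/TCP headers. */
	static u16 ip_style_checksum(const unsigned char *buf, int len)
	{
		return csum_fold(csum_partial(buf, len, 0));
	}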
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
new file mode 100644
index 000000000000..9a10a78bb4a4
--- /dev/null
+++ b/arch/x86/lib/clear_page_64.S
@@ -0,0 +1,59 @@
1#include <linux/linkage.h>
2#include <asm/dwarf2.h>
3
4/*
5 * Zero a page.
6 * rdi page
7 */
8 ALIGN
9clear_page_c:
10 CFI_STARTPROC
11 movl $4096/8,%ecx
12 xorl %eax,%eax
13 rep stosq
14 ret
15 CFI_ENDPROC
16ENDPROC(clear_page)
17
18ENTRY(clear_page)
19 CFI_STARTPROC
20 xorl %eax,%eax
21 movl $4096/64,%ecx
22 .p2align 4
23.Lloop:
24 decl %ecx
25#define PUT(x) movq %rax,x*8(%rdi)
26 movq %rax,(%rdi)
27 PUT(1)
28 PUT(2)
29 PUT(3)
30 PUT(4)
31 PUT(5)
32 PUT(6)
33 PUT(7)
34 leaq 64(%rdi),%rdi
35 jnz .Lloop
36 nop
37 ret
38 CFI_ENDPROC
39.Lclear_page_end:
40ENDPROC(clear_page)
41
42 /* Some CPUs run faster using the string instructions.
43    It is also a lot simpler. Use it when possible. */
44
45#include <asm/cpufeature.h>
46
47 .section .altinstr_replacement,"ax"
481: .byte 0xeb /* jmp <disp8> */
49 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
502:
51 .previous
52 .section .altinstructions,"a"
53 .align 8
54 .quad clear_page
55 .quad 1b
56 .byte X86_FEATURE_REP_GOOD
57 .byte .Lclear_page_end - clear_page
58 .byte 2b - 1b
59 .previous
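The .altinstructions and .altinstr_replacement records above are consumed by apply_alternatives() at boot: when the CPU advertises the named feature bit, the original code is patched with the replacement jump to clear_page_c. As a rough illustration only (field names approximate this tree's struct alt_instr and are not a quotation), each record emitted above corresponds to:

	/* Approximate shape of one .altinstructions record. */
	struct alt_instr_sketch {
		u8 *instr;		/* .quad clear_page: code to patch      */
		u8 *replacement;	/* .quad 1b: the jmp to clear_page_c    */
		u8  cpuid;		/* .byte X86_FEATURE_REP_GOOD           */
		u8  instrlen;		/* .byte .Lclear_page_end - clear_page  */
		u8  replacementlen;	/* .byte 2b - 1b                        */
	};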
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
new file mode 100644
index 000000000000..727a5d46d2fc
--- /dev/null
+++ b/arch/x86/lib/copy_page_64.S
@@ -0,0 +1,119 @@
1/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
2
3#include <linux/linkage.h>
4#include <asm/dwarf2.h>
5
6 ALIGN
7copy_page_c:
8 CFI_STARTPROC
9 movl $4096/8,%ecx
10 rep movsq
11 ret
12 CFI_ENDPROC
13ENDPROC(copy_page_c)
14
15/* Don't use streaming stores, because it is better when the target
16   ends up in the cache. */
17
18/* Could vary the prefetch distance based on SMP/UP */
19
20ENTRY(copy_page)
21 CFI_STARTPROC
22 subq $3*8,%rsp
23 CFI_ADJUST_CFA_OFFSET 3*8
24 movq %rbx,(%rsp)
25 CFI_REL_OFFSET rbx, 0
26 movq %r12,1*8(%rsp)
27 CFI_REL_OFFSET r12, 1*8
28 movq %r13,2*8(%rsp)
29 CFI_REL_OFFSET r13, 2*8
30
31 movl $(4096/64)-5,%ecx
32 .p2align 4
33.Loop64:
34 dec %rcx
35
36 movq (%rsi), %rax
37 movq 8 (%rsi), %rbx
38 movq 16 (%rsi), %rdx
39 movq 24 (%rsi), %r8
40 movq 32 (%rsi), %r9
41 movq 40 (%rsi), %r10
42 movq 48 (%rsi), %r11
43 movq 56 (%rsi), %r12
44
45 prefetcht0 5*64(%rsi)
46
47 movq %rax, (%rdi)
48 movq %rbx, 8 (%rdi)
49 movq %rdx, 16 (%rdi)
50 movq %r8, 24 (%rdi)
51 movq %r9, 32 (%rdi)
52 movq %r10, 40 (%rdi)
53 movq %r11, 48 (%rdi)
54 movq %r12, 56 (%rdi)
55
56 leaq 64 (%rsi), %rsi
57 leaq 64 (%rdi), %rdi
58
59 jnz .Loop64
60
61 movl $5,%ecx
62 .p2align 4
63.Loop2:
64 decl %ecx
65
66 movq (%rsi), %rax
67 movq 8 (%rsi), %rbx
68 movq 16 (%rsi), %rdx
69 movq 24 (%rsi), %r8
70 movq 32 (%rsi), %r9
71 movq 40 (%rsi), %r10
72 movq 48 (%rsi), %r11
73 movq 56 (%rsi), %r12
74
75 movq %rax, (%rdi)
76 movq %rbx, 8 (%rdi)
77 movq %rdx, 16 (%rdi)
78 movq %r8, 24 (%rdi)
79 movq %r9, 32 (%rdi)
80 movq %r10, 40 (%rdi)
81 movq %r11, 48 (%rdi)
82 movq %r12, 56 (%rdi)
83
84 leaq 64(%rdi),%rdi
85 leaq 64(%rsi),%rsi
86
87 jnz .Loop2
88
89 movq (%rsp),%rbx
90 CFI_RESTORE rbx
91 movq 1*8(%rsp),%r12
92 CFI_RESTORE r12
93 movq 2*8(%rsp),%r13
94 CFI_RESTORE r13
95 addq $3*8,%rsp
96 CFI_ADJUST_CFA_OFFSET -3*8
97 ret
98.Lcopy_page_end:
99 CFI_ENDPROC
100ENDPROC(copy_page)
101
102 /* Some CPUs run faster using the string copy instructions.
103    It is also a lot simpler. Use it when possible. */
104
105#include <asm/cpufeature.h>
106
107 .section .altinstr_replacement,"ax"
1081: .byte 0xeb /* jmp <disp8> */
109 .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
1102:
111 .previous
112 .section .altinstructions,"a"
113 .align 8
114 .quad copy_page
115 .quad 1b
116 .byte X86_FEATURE_REP_GOOD
117 .byte .Lcopy_page_end - copy_page
118 .byte 2b - 1b
119 .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
new file mode 100644
index 000000000000..70bebd310408
--- /dev/null
+++ b/arch/x86/lib/copy_user_64.S
@@ -0,0 +1,354 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2.
3 *
4 * Functions to copy from and to user space.
5 */
6
7#include <linux/linkage.h>
8#include <asm/dwarf2.h>
9
10#define FIX_ALIGNMENT 1
11
12#include <asm/current.h>
13#include <asm/asm-offsets.h>
14#include <asm/thread_info.h>
15#include <asm/cpufeature.h>
16
17 .macro ALTERNATIVE_JUMP feature,orig,alt
180:
19 .byte 0xe9 /* 32bit jump */
20 .long \orig-1f /* by default jump to orig */
211:
22 .section .altinstr_replacement,"ax"
232: .byte 0xe9 /* near jump with 32bit immediate */
24 .long \alt-1b /* offset */ /* or alternatively to alt */
25 .previous
26 .section .altinstructions,"a"
27 .align 8
28 .quad 0b
29 .quad 2b
30 .byte \feature /* when feature is set */
31 .byte 5
32 .byte 5
33 .previous
34 .endm
35
36/* Standard copy_to_user with segment limit checking */
37ENTRY(copy_to_user)
38 CFI_STARTPROC
39 GET_THREAD_INFO(%rax)
40 movq %rdi,%rcx
41 addq %rdx,%rcx
42 jc bad_to_user
43 cmpq threadinfo_addr_limit(%rax),%rcx
44 jae bad_to_user
45 xorl %eax,%eax /* clear zero flag */
46 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
47 CFI_ENDPROC
48
49ENTRY(copy_user_generic)
50 CFI_STARTPROC
51 movl $1,%ecx /* set zero flag */
52 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
53 CFI_ENDPROC
54
55ENTRY(__copy_from_user_inatomic)
56 CFI_STARTPROC
57 xorl %ecx,%ecx /* clear zero flag */
58 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
59 CFI_ENDPROC
60
61/* Standard copy_from_user with segment limit checking */
62ENTRY(copy_from_user)
63 CFI_STARTPROC
64 GET_THREAD_INFO(%rax)
65 movq %rsi,%rcx
66 addq %rdx,%rcx
67 jc bad_from_user
68 cmpq threadinfo_addr_limit(%rax),%rcx
69 jae bad_from_user
70 movl $1,%ecx /* set zero flag */
71 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
72 CFI_ENDPROC
73ENDPROC(copy_from_user)
74
75 .section .fixup,"ax"
76 /* must zero dest */
77bad_from_user:
78 CFI_STARTPROC
79 movl %edx,%ecx
80 xorl %eax,%eax
81 rep
82 stosb
83bad_to_user:
84 movl %edx,%eax
85 ret
86 CFI_ENDPROC
87END(bad_from_user)
88 .previous
89
90
91/*
92 * copy_user_generic_unrolled - memory copy with exception handling.
93 * This version is for CPUs like P4 that don't have efficient microcode for rep movsq
94 *
95 * Input:
96 * rdi destination
97 * rsi source
98 * rdx count
99 * ecx zero flag -- if true zero destination on error
100 *
101 * Output:
102 * eax uncopied bytes or 0 if successful.
103 */
104ENTRY(copy_user_generic_unrolled)
105 CFI_STARTPROC
106 pushq %rbx
107 CFI_ADJUST_CFA_OFFSET 8
108 CFI_REL_OFFSET rbx, 0
109 pushq %rcx
110 CFI_ADJUST_CFA_OFFSET 8
111 CFI_REL_OFFSET rcx, 0
112	xorl %eax,%eax		/* zero for the exception handler */
113
114#ifdef FIX_ALIGNMENT
115 /* check for bad alignment of destination */
116 movl %edi,%ecx
117 andl $7,%ecx
118 jnz .Lbad_alignment
119.Lafter_bad_alignment:
120#endif
121
122 movq %rdx,%rcx
123
124 movl $64,%ebx
125 shrq $6,%rdx
126 decq %rdx
127 js .Lhandle_tail
128
129 .p2align 4
130.Lloop:
131.Ls1: movq (%rsi),%r11
132.Ls2: movq 1*8(%rsi),%r8
133.Ls3: movq 2*8(%rsi),%r9
134.Ls4: movq 3*8(%rsi),%r10
135.Ld1: movq %r11,(%rdi)
136.Ld2: movq %r8,1*8(%rdi)
137.Ld3: movq %r9,2*8(%rdi)
138.Ld4: movq %r10,3*8(%rdi)
139
140.Ls5: movq 4*8(%rsi),%r11
141.Ls6: movq 5*8(%rsi),%r8
142.Ls7: movq 6*8(%rsi),%r9
143.Ls8: movq 7*8(%rsi),%r10
144.Ld5: movq %r11,4*8(%rdi)
145.Ld6: movq %r8,5*8(%rdi)
146.Ld7: movq %r9,6*8(%rdi)
147.Ld8: movq %r10,7*8(%rdi)
148
149 decq %rdx
150
151 leaq 64(%rsi),%rsi
152 leaq 64(%rdi),%rdi
153
154 jns .Lloop
155
156 .p2align 4
157.Lhandle_tail:
158 movl %ecx,%edx
159 andl $63,%ecx
160 shrl $3,%ecx
161 jz .Lhandle_7
162 movl $8,%ebx
163 .p2align 4
164.Lloop_8:
165.Ls9: movq (%rsi),%r8
166.Ld9: movq %r8,(%rdi)
167 decl %ecx
168 leaq 8(%rdi),%rdi
169 leaq 8(%rsi),%rsi
170 jnz .Lloop_8
171
172.Lhandle_7:
173 movl %edx,%ecx
174 andl $7,%ecx
175 jz .Lende
176 .p2align 4
177.Lloop_1:
178.Ls10: movb (%rsi),%bl
179.Ld10: movb %bl,(%rdi)
180 incq %rdi
181 incq %rsi
182 decl %ecx
183 jnz .Lloop_1
184
185 CFI_REMEMBER_STATE
186.Lende:
187 popq %rcx
188 CFI_ADJUST_CFA_OFFSET -8
189 CFI_RESTORE rcx
190 popq %rbx
191 CFI_ADJUST_CFA_OFFSET -8
192 CFI_RESTORE rbx
193 ret
194 CFI_RESTORE_STATE
195
196#ifdef FIX_ALIGNMENT
197 /* align destination */
198 .p2align 4
199.Lbad_alignment:
200 movl $8,%r9d
201 subl %ecx,%r9d
202 movl %r9d,%ecx
203 cmpq %r9,%rdx
204 jz .Lhandle_7
205 js .Lhandle_7
206.Lalign_1:
207.Ls11: movb (%rsi),%bl
208.Ld11: movb %bl,(%rdi)
209 incq %rsi
210 incq %rdi
211 decl %ecx
212 jnz .Lalign_1
213 subq %r9,%rdx
214 jmp .Lafter_bad_alignment
215#endif
216
217 /* table sorted by exception address */
218 .section __ex_table,"a"
219 .align 8
220 .quad .Ls1,.Ls1e
221 .quad .Ls2,.Ls2e
222 .quad .Ls3,.Ls3e
223 .quad .Ls4,.Ls4e
224 .quad .Ld1,.Ls1e
225 .quad .Ld2,.Ls2e
226 .quad .Ld3,.Ls3e
227 .quad .Ld4,.Ls4e
228 .quad .Ls5,.Ls5e
229 .quad .Ls6,.Ls6e
230 .quad .Ls7,.Ls7e
231 .quad .Ls8,.Ls8e
232 .quad .Ld5,.Ls5e
233 .quad .Ld6,.Ls6e
234 .quad .Ld7,.Ls7e
235 .quad .Ld8,.Ls8e
236 .quad .Ls9,.Le_quad
237 .quad .Ld9,.Le_quad
238 .quad .Ls10,.Le_byte
239 .quad .Ld10,.Le_byte
240#ifdef FIX_ALIGNMENT
241 .quad .Ls11,.Lzero_rest
242 .quad .Ld11,.Lzero_rest
243#endif
244 .quad .Le5,.Le_zero
245 .previous
246
247 /* compute 64-offset for main loop. 8 bytes accuracy with error on the
248 pessimistic side. this is gross. it would be better to fix the
249 interface. */
250 /* eax: zero, ebx: 64 */
251.Ls1e: addl $8,%eax
252.Ls2e: addl $8,%eax
253.Ls3e: addl $8,%eax
254.Ls4e: addl $8,%eax
255.Ls5e: addl $8,%eax
256.Ls6e: addl $8,%eax
257.Ls7e: addl $8,%eax
258.Ls8e: addl $8,%eax
259 addq %rbx,%rdi /* +64 */
260 subq %rax,%rdi /* correct destination with computed offset */
261
262 shlq $6,%rdx /* loop counter * 64 (stride length) */
263 addq %rax,%rdx /* add offset to loopcnt */
264 andl $63,%ecx /* remaining bytes */
265 addq %rcx,%rdx /* add them */
266 jmp .Lzero_rest
267
268 /* exception on quad word loop in tail handling */
269 /* ecx: loopcnt/8, %edx: length, rdi: correct */
270.Le_quad:
271 shll $3,%ecx
272 andl $7,%edx
273 addl %ecx,%edx
274 /* edx: bytes to zero, rdi: dest, eax:zero */
275.Lzero_rest:
276 cmpl $0,(%rsp)
277 jz .Le_zero
278 movq %rdx,%rcx
279.Le_byte:
280 xorl %eax,%eax
281.Le5: rep
282 stosb
283 /* when there is another exception while zeroing the rest just return */
284.Le_zero:
285 movq %rdx,%rax
286 jmp .Lende
287 CFI_ENDPROC
288ENDPROC(copy_user_generic_unrolled)
289
290
291 /* Some CPUs run faster using the string copy instructions.
292 This is also a lot simpler. Use them when possible.
293 Patch in jmps to this code instead of copying it fully
294 to avoid unwanted aliasing in the exception tables. */
295
296 /* rdi destination
297 * rsi source
298 * rdx count
299 * ecx zero flag
300 *
301 * Output:
302 * eax uncopied bytes or 0 if successful.
303 *
304 * Only 4GB of copy is supported. This shouldn't be a problem
305 * because the kernel normally only writes from/to page sized chunks
306 * even if user space passed a longer buffer.
307 * Copying more would also be dangerous because both Intel and AMD have
308 * errata with rep movsq > 4GB. If someone feels the need to fix this,
309 * please keep those errata in mind.
310 */
311ENTRY(copy_user_generic_string)
312 CFI_STARTPROC
313 movl %ecx,%r8d /* save zero flag */
314 movl %edx,%ecx
315 shrl $3,%ecx
316 andl $7,%edx
317 jz 10f
3181: rep
319 movsq
320 movl %edx,%ecx
3212: rep
322 movsb
3239: movl %ecx,%eax
324 ret
325
326 /* multiple of 8 byte */
32710: rep
328 movsq
329 xor %eax,%eax
330 ret
331
332 /* exception handling */
3333: lea (%rdx,%rcx,8),%rax /* exception on quad loop */
334 jmp 6f
3355: movl %ecx,%eax /* exception on byte loop */
336 /* eax: left over bytes */
3376: testl %r8d,%r8d /* zero flag set? */
338 jz 7f
339 movl %eax,%ecx /* initialize x86 loop counter */
340 push %rax
341 xorl %eax,%eax
3428: rep
343 stosb /* zero the rest */
34411: pop %rax
3457: ret
346 CFI_ENDPROC
347END(copy_user_generic_string)
348
349 .section __ex_table,"a"
350 .quad 1b,3b
351 .quad 2b,5b
352 .quad 8b,11b
353 .quad 10b,3b
354 .previous
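A usage sketch of the calling convention these routines implement (the return value is the count of bytes left uncopied; the buffer names here are hypothetical):

	char kbuf[64];

	/* copy_from_user() returns 0 on success, or the number of bytes that
	 * could not be copied; on a fault the fixup above zero-fills the
	 * remainder of the kernel buffer before returning. */
	if (copy_from_user(kbuf, user_ptr, sizeof(kbuf)))
		return -EFAULT;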
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
new file mode 100644
index 000000000000..4620efb12f13
--- /dev/null
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -0,0 +1,217 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2.
3 *
4 * Functions to copy from and to user space.
5 */
6
7#include <linux/linkage.h>
8#include <asm/dwarf2.h>
9
10#define FIX_ALIGNMENT 1
11
12#include <asm/current.h>
13#include <asm/asm-offsets.h>
14#include <asm/thread_info.h>
15#include <asm/cpufeature.h>
16
17/*
18 * copy_user_nocache - Uncached memory copy with exception handling
19 * This will force destination/source out of cache for more performance.
20 *
21 * Input:
22 * rdi destination
23 * rsi source
24 * rdx count
25 * rcx zero flag: when 1, zero the destination on an exception
26 *
27 * Output:
28 * eax uncopied bytes or 0 if successful.
29 */
30ENTRY(__copy_user_nocache)
31 CFI_STARTPROC
32 pushq %rbx
33 CFI_ADJUST_CFA_OFFSET 8
34 CFI_REL_OFFSET rbx, 0
35 pushq %rcx /* save zero flag */
36 CFI_ADJUST_CFA_OFFSET 8
37 CFI_REL_OFFSET rcx, 0
38
39 xorl %eax,%eax /* zero for the exception handler */
40
41#ifdef FIX_ALIGNMENT
42 /* check for bad alignment of destination */
43 movl %edi,%ecx
44 andl $7,%ecx
45 jnz .Lbad_alignment
46.Lafter_bad_alignment:
47#endif
48
49 movq %rdx,%rcx
50
51 movl $64,%ebx
52 shrq $6,%rdx
53 decq %rdx
54 js .Lhandle_tail
55
56 .p2align 4
57.Lloop:
58.Ls1: movq (%rsi),%r11
59.Ls2: movq 1*8(%rsi),%r8
60.Ls3: movq 2*8(%rsi),%r9
61.Ls4: movq 3*8(%rsi),%r10
62.Ld1: movnti %r11,(%rdi)
63.Ld2: movnti %r8,1*8(%rdi)
64.Ld3: movnti %r9,2*8(%rdi)
65.Ld4: movnti %r10,3*8(%rdi)
66
67.Ls5: movq 4*8(%rsi),%r11
68.Ls6: movq 5*8(%rsi),%r8
69.Ls7: movq 6*8(%rsi),%r9
70.Ls8: movq 7*8(%rsi),%r10
71.Ld5: movnti %r11,4*8(%rdi)
72.Ld6: movnti %r8,5*8(%rdi)
73.Ld7: movnti %r9,6*8(%rdi)
74.Ld8: movnti %r10,7*8(%rdi)
75
76 dec %rdx
77
78 leaq 64(%rsi),%rsi
79 leaq 64(%rdi),%rdi
80
81 jns .Lloop
82
83 .p2align 4
84.Lhandle_tail:
85 movl %ecx,%edx
86 andl $63,%ecx
87 shrl $3,%ecx
88 jz .Lhandle_7
89 movl $8,%ebx
90 .p2align 4
91.Lloop_8:
92.Ls9: movq (%rsi),%r8
93.Ld9: movnti %r8,(%rdi)
94 decl %ecx
95 leaq 8(%rdi),%rdi
96 leaq 8(%rsi),%rsi
97 jnz .Lloop_8
98
99.Lhandle_7:
100 movl %edx,%ecx
101 andl $7,%ecx
102 jz .Lende
103 .p2align 4
104.Lloop_1:
105.Ls10: movb (%rsi),%bl
106.Ld10: movb %bl,(%rdi)
107 incq %rdi
108 incq %rsi
109 decl %ecx
110 jnz .Lloop_1
111
112 CFI_REMEMBER_STATE
113.Lende:
114 popq %rcx
115 CFI_ADJUST_CFA_OFFSET -8
116 CFI_RESTORE %rcx
117 popq %rbx
118 CFI_ADJUST_CFA_OFFSET -8
119 CFI_RESTORE rbx
120 ret
121 CFI_RESTORE_STATE
122
123#ifdef FIX_ALIGNMENT
124 /* align destination */
125 .p2align 4
126.Lbad_alignment:
127 movl $8,%r9d
128 subl %ecx,%r9d
129 movl %r9d,%ecx
130 cmpq %r9,%rdx
131 jz .Lhandle_7
132 js .Lhandle_7
133.Lalign_1:
134.Ls11: movb (%rsi),%bl
135.Ld11: movb %bl,(%rdi)
136 incq %rsi
137 incq %rdi
138 decl %ecx
139 jnz .Lalign_1
140 subq %r9,%rdx
141 jmp .Lafter_bad_alignment
142#endif
143
144 /* table sorted by exception address */
145 .section __ex_table,"a"
146 .align 8
147 .quad .Ls1,.Ls1e
148 .quad .Ls2,.Ls2e
149 .quad .Ls3,.Ls3e
150 .quad .Ls4,.Ls4e
151 .quad .Ld1,.Ls1e
152 .quad .Ld2,.Ls2e
153 .quad .Ld3,.Ls3e
154 .quad .Ld4,.Ls4e
155 .quad .Ls5,.Ls5e
156 .quad .Ls6,.Ls6e
157 .quad .Ls7,.Ls7e
158 .quad .Ls8,.Ls8e
159 .quad .Ld5,.Ls5e
160 .quad .Ld6,.Ls6e
161 .quad .Ld7,.Ls7e
162 .quad .Ld8,.Ls8e
163 .quad .Ls9,.Le_quad
164 .quad .Ld9,.Le_quad
165 .quad .Ls10,.Le_byte
166 .quad .Ld10,.Le_byte
167#ifdef FIX_ALIGNMENT
168 .quad .Ls11,.Lzero_rest
169 .quad .Ld11,.Lzero_rest
170#endif
171 .quad .Le5,.Le_zero
172 .previous
173
174 /* compute 64-offset for main loop. 8 bytes accuracy with error on the
175 pessimistic side. this is gross. it would be better to fix the
176 interface. */
177 /* eax: zero, ebx: 64 */
178.Ls1e: addl $8,%eax
179.Ls2e: addl $8,%eax
180.Ls3e: addl $8,%eax
181.Ls4e: addl $8,%eax
182.Ls5e: addl $8,%eax
183.Ls6e: addl $8,%eax
184.Ls7e: addl $8,%eax
185.Ls8e: addl $8,%eax
186 addq %rbx,%rdi /* +64 */
187 subq %rax,%rdi /* correct destination with computed offset */
188
189 shlq $6,%rdx /* loop counter * 64 (stride length) */
190 addq %rax,%rdx /* add offset to loopcnt */
191 andl $63,%ecx /* remaining bytes */
192 addq %rcx,%rdx /* add them */
193 jmp .Lzero_rest
194
195 /* exception on quad word loop in tail handling */
196 /* ecx: loopcnt/8, %edx: length, rdi: correct */
197.Le_quad:
198 shll $3,%ecx
199 andl $7,%edx
200 addl %ecx,%edx
201 /* edx: bytes to zero, rdi: dest, eax:zero */
202.Lzero_rest:
203 cmpl $0,(%rsp) /* zero flag set? */
204 jz .Le_zero
205 movq %rdx,%rcx
206.Le_byte:
207 xorl %eax,%eax
208.Le5: rep
209 stosb
210 /* when there is another exception while zeroing the rest just return */
211.Le_zero:
212 movq %rdx,%rax
213 jmp .Lende
214 CFI_ENDPROC
215ENDPROC(__copy_user_nocache)
216
217
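A hedged caller sketch; the C prototype is inferred from the register interface documented above (rdi/rsi/rdx plus the rcx zero flag), so treat it as an assumption:

	/* dst will not be read back soon (e.g. a buffer about to be handed to
	 * a device), so the non-temporal movnti stores avoid polluting the
	 * cache with data we will not touch again. */
	long left = __copy_user_nocache(dst, user_src, len, 1 /* zero on fault */);
	if (left)
		return -EFAULT;		/* 'left' bytes were not copied */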
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
new file mode 100644
index 000000000000..f0dba36578ea
--- /dev/null
+++ b/arch/x86/lib/csum-copy_64.S
@@ -0,0 +1,249 @@
1/*
2 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all.
7 */
8#include <linux/linkage.h>
9#include <asm/dwarf2.h>
10#include <asm/errno.h>
11
12/*
13 * Checksum copy with exception handling.
14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
15 * destination is zeroed.
16 *
17 * Input
18 * rdi source
19 * rsi destination
20 * edx len (32bit)
21 * ecx sum (32bit)
22 * r8 src_err_ptr (int)
23 * r9 dst_err_ptr (int)
24 *
25 * Output
26 * eax 32bit unfolded sum. Undefined in case of an exception.
27 *
28 * Wrappers need to take care of valid exception sum and zeroing.
29 * They also should align source or destination to 8 bytes.
30 */
31
32 .macro source
3310:
34 .section __ex_table,"a"
35 .align 8
36 .quad 10b,.Lbad_source
37 .previous
38 .endm
39
40 .macro dest
4120:
42 .section __ex_table,"a"
43 .align 8
44 .quad 20b,.Lbad_dest
45 .previous
46 .endm
47
48 .macro ignore L=.Lignore
4930:
50 .section __ex_table,"a"
51 .align 8
52 .quad 30b,\L
53 .previous
54 .endm
55
56
57ENTRY(csum_partial_copy_generic)
58 CFI_STARTPROC
59 cmpl $3*64,%edx
60 jle .Lignore
61
62.Lignore:
63 subq $7*8,%rsp
64 CFI_ADJUST_CFA_OFFSET 7*8
65 movq %rbx,2*8(%rsp)
66 CFI_REL_OFFSET rbx, 2*8
67 movq %r12,3*8(%rsp)
68 CFI_REL_OFFSET r12, 3*8
69 movq %r14,4*8(%rsp)
70 CFI_REL_OFFSET r14, 4*8
71 movq %r13,5*8(%rsp)
72 CFI_REL_OFFSET r13, 5*8
73 movq %rbp,6*8(%rsp)
74 CFI_REL_OFFSET rbp, 6*8
75
76 movq %r8,(%rsp)
77 movq %r9,1*8(%rsp)
78
79 movl %ecx,%eax
80 movl %edx,%ecx
81
82 xorl %r9d,%r9d
83 movq %rcx,%r12
84
85 shrq $6,%r12
86 jz .Lhandle_tail /* < 64 */
87
88 clc
89
90	/* main loop. checksum and copy in 64 byte blocks */
91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
92 /* r11: temp3, rdx: temp4, r12 loopcnt */
93 /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
94 .p2align 4
95.Lloop:
96 source
97 movq (%rdi),%rbx
98 source
99 movq 8(%rdi),%r8
100 source
101 movq 16(%rdi),%r11
102 source
103 movq 24(%rdi),%rdx
104
105 source
106 movq 32(%rdi),%r10
107 source
108 movq 40(%rdi),%rbp
109 source
110 movq 48(%rdi),%r14
111 source
112 movq 56(%rdi),%r13
113
114 ignore 2f
115 prefetcht0 5*64(%rdi)
1162:
117 adcq %rbx,%rax
118 adcq %r8,%rax
119 adcq %r11,%rax
120 adcq %rdx,%rax
121 adcq %r10,%rax
122 adcq %rbp,%rax
123 adcq %r14,%rax
124 adcq %r13,%rax
125
126 decl %r12d
127
128 dest
129 movq %rbx,(%rsi)
130 dest
131 movq %r8,8(%rsi)
132 dest
133 movq %r11,16(%rsi)
134 dest
135 movq %rdx,24(%rsi)
136
137 dest
138 movq %r10,32(%rsi)
139 dest
140 movq %rbp,40(%rsi)
141 dest
142 movq %r14,48(%rsi)
143 dest
144 movq %r13,56(%rsi)
145
1463:
147
148 leaq 64(%rdi),%rdi
149 leaq 64(%rsi),%rsi
150
151 jnz .Lloop
152
153 adcq %r9,%rax
154
155	/* do last up to 56 bytes */
156.Lhandle_tail:
157 /* ecx: count */
158 movl %ecx,%r10d
159 andl $63,%ecx
160 shrl $3,%ecx
161 jz .Lfold
162 clc
163 .p2align 4
164.Lloop_8:
165 source
166 movq (%rdi),%rbx
167 adcq %rbx,%rax
168 decl %ecx
169 dest
170 movq %rbx,(%rsi)
171 leaq 8(%rsi),%rsi /* preserve carry */
172 leaq 8(%rdi),%rdi
173 jnz .Lloop_8
174 adcq %r9,%rax /* add in carry */
175
176.Lfold:
177 /* reduce checksum to 32bits */
178 movl %eax,%ebx
179 shrq $32,%rax
180 addl %ebx,%eax
181 adcl %r9d,%eax
182
183	/* do last up to 6 bytes */
184.Lhandle_7:
185 movl %r10d,%ecx
186 andl $7,%ecx
187 shrl $1,%ecx
188 jz .Lhandle_1
189 movl $2,%edx
190 xorl %ebx,%ebx
191 clc
192 .p2align 4
193.Lloop_1:
194 source
195 movw (%rdi),%bx
196 adcl %ebx,%eax
197 decl %ecx
198 dest
199 movw %bx,(%rsi)
200 leaq 2(%rdi),%rdi
201 leaq 2(%rsi),%rsi
202 jnz .Lloop_1
203 adcl %r9d,%eax /* add in carry */
204
205 /* handle last odd byte */
206.Lhandle_1:
207 testl $1,%r10d
208 jz .Lende
209 xorl %ebx,%ebx
210 source
211 movb (%rdi),%bl
212 dest
213 movb %bl,(%rsi)
214 addl %ebx,%eax
215 adcl %r9d,%eax /* carry */
216
217 CFI_REMEMBER_STATE
218.Lende:
219 movq 2*8(%rsp),%rbx
220 CFI_RESTORE rbx
221 movq 3*8(%rsp),%r12
222 CFI_RESTORE r12
223 movq 4*8(%rsp),%r14
224 CFI_RESTORE r14
225 movq 5*8(%rsp),%r13
226 CFI_RESTORE r13
227 movq 6*8(%rsp),%rbp
228 CFI_RESTORE rbp
229 addq $7*8,%rsp
230 CFI_ADJUST_CFA_OFFSET -7*8
231 ret
232 CFI_RESTORE_STATE
233
234 /* Exception handlers. Very simple, zeroing is done in the wrappers */
235.Lbad_source:
236 movq (%rsp),%rax
237 testq %rax,%rax
238 jz .Lende
239 movl $-EFAULT,(%rax)
240 jmp .Lende
241
242.Lbad_dest:
243 movq 8(%rsp),%rax
244 testq %rax,%rax
245 jz .Lende
246 movl $-EFAULT,(%rax)
247 jmp .Lende
248 CFI_ENDPROC
249ENDPROC(csum_partial_copy_generic)
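The .Lfold block above reduces the running 64-bit sum to the 32-bit unfolded form the C wrappers expect; an equivalent sketch in C:

	/* illustrative equivalent of .Lfold: add the high and low halves and
	 * fold the resulting carry back in (ones-complement addition). */
	static inline unsigned int fold_to_32(unsigned long sum)
	{
		unsigned int lo = (unsigned int)sum;
		unsigned int hi = (unsigned int)(sum >> 32);

		lo += hi;
		if (lo < hi)		/* carry out of the 32-bit add */
			lo++;
		return lo;
	}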
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
new file mode 100644
index 000000000000..bc503f506903
--- /dev/null
+++ b/arch/x86/lib/csum-partial_64.c
@@ -0,0 +1,150 @@
1/*
2 * arch/x86_64/lib/csum-partial.c
3 *
4 * This file contains network checksum routines that are better done
5 * in an architecture-specific manner due to speed.
6 */
7
8#include <linux/compiler.h>
9#include <linux/module.h>
10#include <asm/checksum.h>
11
12static inline unsigned short from32to16(unsigned a)
13{
14 unsigned short b = a >> 16;
15 asm("addw %w2,%w0\n\t"
16 "adcw $0,%w0\n"
17 : "=r" (b)
18 : "0" (b), "r" (a));
19 return b;
20}
21
22/*
23 * Do a 64-bit checksum on an arbitrary memory area.
24 * Returns a 32bit checksum.
25 *
26 * This isn't as time critical as it used to be because many NICs
27 * do hardware checksumming these days.
28 *
29 * Things tried and found to not make it faster:
30 * Manual Prefetching
31 *   Unrolling to a 128 byte inner loop.
32 * Using interleaving with more registers to break the carry chains.
33 */
34static unsigned do_csum(const unsigned char *buff, unsigned len)
35{
36 unsigned odd, count;
37 unsigned long result = 0;
38
39 if (unlikely(len == 0))
40 return result;
41 odd = 1 & (unsigned long) buff;
42 if (unlikely(odd)) {
43 result = *buff << 8;
44 len--;
45 buff++;
46 }
47 count = len >> 1; /* nr of 16-bit words.. */
48 if (count) {
49 if (2 & (unsigned long) buff) {
50 result += *(unsigned short *)buff;
51 count--;
52 len -= 2;
53 buff += 2;
54 }
55 count >>= 1; /* nr of 32-bit words.. */
56 if (count) {
57 unsigned long zero;
58 unsigned count64;
59 if (4 & (unsigned long) buff) {
60 result += *(unsigned int *) buff;
61 count--;
62 len -= 4;
63 buff += 4;
64 }
65 count >>= 1; /* nr of 64-bit words.. */
66
67 /* main loop using 64byte blocks */
68 zero = 0;
69 count64 = count >> 3;
70 while (count64) {
71 asm("addq 0*8(%[src]),%[res]\n\t"
72 "adcq 1*8(%[src]),%[res]\n\t"
73 "adcq 2*8(%[src]),%[res]\n\t"
74 "adcq 3*8(%[src]),%[res]\n\t"
75 "adcq 4*8(%[src]),%[res]\n\t"
76 "adcq 5*8(%[src]),%[res]\n\t"
77 "adcq 6*8(%[src]),%[res]\n\t"
78 "adcq 7*8(%[src]),%[res]\n\t"
79 "adcq %[zero],%[res]"
80 : [res] "=r" (result)
81 : [src] "r" (buff), [zero] "r" (zero),
82 "[res]" (result));
83 buff += 64;
84 count64--;
85 }
86
87			/* last up to 7 8-byte blocks */
88 count %= 8;
89 while (count) {
90 asm("addq %1,%0\n\t"
91 "adcq %2,%0\n"
92 : "=r" (result)
93 : "m" (*(unsigned long *)buff),
94 "r" (zero), "0" (result));
95 --count;
96 buff += 8;
97 }
98 result = add32_with_carry(result>>32,
99 result&0xffffffff);
100
101 if (len & 4) {
102 result += *(unsigned int *) buff;
103 buff += 4;
104 }
105 }
106 if (len & 2) {
107 result += *(unsigned short *) buff;
108 buff += 2;
109 }
110 }
111 if (len & 1)
112 result += *buff;
113 result = add32_with_carry(result>>32, result & 0xffffffff);
114 if (unlikely(odd)) {
115 result = from32to16(result);
116 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
117 }
118 return result;
119}
120
121/*
122 * computes the checksum of a memory block at buff, length len,
123 * and adds in "sum" (32-bit)
124 *
125 * returns a 32-bit number suitable for feeding into itself
126 * or csum_tcpudp_magic
127 *
128 * this function must be called with even lengths, except
129 * for the last fragment, which may be odd
130 *
131 * it's best to have buff aligned on a 64-bit boundary
132 */
133__wsum csum_partial(const void *buff, int len, __wsum sum)
134{
135 return (__force __wsum)add32_with_carry(do_csum(buff, len),
136 (__force u32)sum);
137}
138
139EXPORT_SYMBOL(csum_partial);
140
141/*
142 * this routine is used for miscellaneous IP-like checksums, mainly
143 * in icmp.c
144 */
145__sum16 ip_compute_csum(const void *buff, int len)
146{
147 return csum_fold(csum_partial(buff,len,0));
148}
149EXPORT_SYMBOL(ip_compute_csum);
150
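A short usage sketch (fragment names and lengths are hypothetical); per the comment above, intermediate fragments should have even lengths:

	__wsum csum = 0;
	__sum16 check;

	csum = csum_partial(frag1, len1, csum);	/* sums can be chained...       */
	csum = csum_partial(frag2, len2, csum);	/* ...fragment by fragment      */
	check = csum_fold(csum);		/* final folded 16-bit checksum */

	/* or, for a one-shot IP-style header checksum: */
	check = ip_compute_csum(hdr, hdr_len);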
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
new file mode 100644
index 000000000000..fd42a4a095fc
--- /dev/null
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -0,0 +1,135 @@
1/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v.2
3 *
4 * Wrappers of assembly checksum functions for x86-64.
5 */
6
7#include <asm/checksum.h>
8#include <linux/module.h>
9
10/**
11 * csum_partial_copy_from_user - Copy and checksum from user space.
12 * @src: source address (user space)
13 * @dst: destination address
14 * @len: number of bytes to be copied.
15 * @isum: initial sum that is added into the result (32bit unfolded)
16 * @errp: set to -EFAULT for a bad source address.
17 *
18 * Returns a 32bit unfolded checksum of the buffer.
19 * src and dst are best aligned to 64bits.
20 */
21__wsum
22csum_partial_copy_from_user(const void __user *src, void *dst,
23 int len, __wsum isum, int *errp)
24{
25 might_sleep();
26 *errp = 0;
27 if (likely(access_ok(VERIFY_READ,src, len))) {
28		/* Why 6, not 7? To handle odd addresses we would need to do
29		   considerable complications to fix the checksum, which is
30		   defined as a 16bit accumulator. The alignment fixup code
31		   is primarily here for performance compatibility with the
32		   32bit version, and that path handles odd addresses
33		   slowly too. */
34 if (unlikely((unsigned long)src & 6)) {
35 while (((unsigned long)src & 6) && len >= 2) {
36 __u16 val16;
37 *errp = __get_user(val16, (const __u16 __user *)src);
38 if (*errp)
39 return isum;
40 *(__u16 *)dst = val16;
41 isum = (__force __wsum)add32_with_carry(
42 (__force unsigned)isum, val16);
43 src += 2;
44 dst += 2;
45 len -= 2;
46 }
47 }
48 isum = csum_partial_copy_generic((__force const void *)src,
49 dst, len, isum, errp, NULL);
50 if (likely(*errp == 0))
51 return isum;
52 }
53 *errp = -EFAULT;
54 memset(dst,0,len);
55 return isum;
56}
57
58EXPORT_SYMBOL(csum_partial_copy_from_user);
59
60/**
61 * csum_partial_copy_to_user - Copy and checksum to user space.
62 * @src: source address
63 * @dst: destination address (user space)
64 * @len: number of bytes to be copied.
65 * @isum: initial sum that is added into the result (32bit unfolded)
66 * @errp: set to -EFAULT for a bad destination address.
67 *
68 * Returns a 32bit unfolded checksum of the buffer.
69 * src and dst are best aligned to 64bits.
70 */
71__wsum
72csum_partial_copy_to_user(const void *src, void __user *dst,
73 int len, __wsum isum, int *errp)
74{
75 might_sleep();
76 if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
77 *errp = -EFAULT;
78 return 0;
79 }
80
81 if (unlikely((unsigned long)dst & 6)) {
82 while (((unsigned long)dst & 6) && len >= 2) {
83 __u16 val16 = *(__u16 *)src;
84 isum = (__force __wsum)add32_with_carry(
85 (__force unsigned)isum, val16);
86 *errp = __put_user(val16, (__u16 __user *)dst);
87 if (*errp)
88 return isum;
89 src += 2;
90 dst += 2;
91 len -= 2;
92 }
93 }
94
95 *errp = 0;
96 return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp);
97}
98
99EXPORT_SYMBOL(csum_partial_copy_to_user);
100
101/**
102 * csum_partial_copy_nocheck - Copy and checksum.
103 * @src: source address
104 * @dst: destination address
105 * @len: number of bytes to be copied.
106 * @isum: initial sum that is added into the result (32bit unfolded)
107 *
108 * Returns a 32bit unfolded checksum of the buffer.
109 */
110__wsum
111csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
112{
113 return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL);
114}
115EXPORT_SYMBOL(csum_partial_copy_nocheck);
116
117__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
118 const struct in6_addr *daddr,
119 __u32 len, unsigned short proto, __wsum sum)
120{
121 __u64 rest, sum64;
122
123 rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
124 (__force __u64)sum;
125 asm(" addq (%[saddr]),%[sum]\n"
126 " adcq 8(%[saddr]),%[sum]\n"
127 " adcq (%[daddr]),%[sum]\n"
128 " adcq 8(%[daddr]),%[sum]\n"
129 " adcq $0,%[sum]\n"
130 : [sum] "=r" (sum64)
131 : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
132 return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
133}
134
135EXPORT_SYMBOL(csum_ipv6_magic);
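A hedged caller sketch of the from-user wrapper (buffer names are illustrative):

	int err;
	__wsum csum;

	csum = csum_partial_copy_from_user(user_src, kbuf, len,
					   0 /* initial sum */, &err);
	if (err)	/* -EFAULT: kbuf has been zeroed, csum is unusable */
		return err;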
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c
new file mode 100644
index 000000000000..f6edb11364df
--- /dev/null
+++ b/arch/x86/lib/delay_32.c
@@ -0,0 +1,103 @@
1/*
2 * Precise Delay Loops for i386
3 *
4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 *
7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors. The additional
9 * jump magic is needed to get the timing stable on all the CPU's
10 * we have to worry about.
11 */
12
13#include <linux/module.h>
14#include <linux/sched.h>
15#include <linux/delay.h>
16
17#include <asm/processor.h>
18#include <asm/delay.h>
19#include <asm/timer.h>
20
21#ifdef CONFIG_SMP
22# include <asm/smp.h>
23#endif
24
25/* simple loop based delay: */
26static void delay_loop(unsigned long loops)
27{
28 int d0;
29
30 __asm__ __volatile__(
31 "\tjmp 1f\n"
32 ".align 16\n"
33 "1:\tjmp 2f\n"
34 ".align 16\n"
35 "2:\tdecl %0\n\tjns 2b"
36 :"=&a" (d0)
37 :"0" (loops));
38}
39
40/* TSC based delay: */
41static void delay_tsc(unsigned long loops)
42{
43 unsigned long bclock, now;
44
45 rdtscl(bclock);
46 do {
47 rep_nop();
48 rdtscl(now);
49 } while ((now-bclock) < loops);
50}
51
52/*
53 * Since we calibrate only once at boot, this
54 * function should be set once at boot and not changed
55 */
56static void (*delay_fn)(unsigned long) = delay_loop;
57
58void use_tsc_delay(void)
59{
60 delay_fn = delay_tsc;
61}
62
63int read_current_timer(unsigned long *timer_val)
64{
65 if (delay_fn == delay_tsc) {
66 rdtscl(*timer_val);
67 return 0;
68 }
69 return -1;
70}
71
72void __delay(unsigned long loops)
73{
74 delay_fn(loops);
75}
76
77inline void __const_udelay(unsigned long xloops)
78{
79 int d0;
80
81 xloops *= 4;
82 __asm__("mull %0"
83 :"=d" (xloops), "=&a" (d0)
84 :"1" (xloops), "0"
85 (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
86
87 __delay(++xloops);
88}
89
90void __udelay(unsigned long usecs)
91{
92 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
93}
94
95void __ndelay(unsigned long nsecs)
96{
97 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
98}
99
100EXPORT_SYMBOL(__delay);
101EXPORT_SYMBOL(__const_udelay);
102EXPORT_SYMBOL(__udelay);
103EXPORT_SYMBOL(__ndelay);
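A rough check of the arithmetic behind __const_udelay()/__udelay() above, as a comment-only sketch:

	/*
	 * 0x10c7 = 4295 ~= 2^32 / 10^6, rounded up.  __const_udelay()
	 * multiplies by loops_per_jiffy * HZ (split as *4 and HZ/4 to keep
	 * the factors in 32 bits) and keeps only the high 32 bits of the
	 * 32x32->64 product, i.e. an implicit >> 32 that cancels the 2^32:
	 *
	 *   loops ~= usecs * (2^32/10^6) * loops_per_jiffy * HZ / 2^32
	 *         ~= usecs * loops_per_second / 10^6
	 *
	 * The same scheme with 0x5 ~= 2^32 / 10^9 gives __ndelay().
	 */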
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
new file mode 100644
index 000000000000..2dbebd308347
--- /dev/null
+++ b/arch/x86/lib/delay_64.c
@@ -0,0 +1,57 @@
1/*
2 * Precise Delay Loops for x86-64
3 *
4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 *
7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors.
9 */
10
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/delay.h>
14#include <asm/delay.h>
15#include <asm/msr.h>
16
17#ifdef CONFIG_SMP
18#include <asm/smp.h>
19#endif
20
21int read_current_timer(unsigned long *timer_value)
22{
23 rdtscll(*timer_value);
24 return 0;
25}
26
27void __delay(unsigned long loops)
28{
29 unsigned bclock, now;
30
31 rdtscl(bclock);
32 do
33 {
34 rep_nop();
35 rdtscl(now);
36 }
37 while((now-bclock) < loops);
38}
39EXPORT_SYMBOL(__delay);
40
41inline void __const_udelay(unsigned long xloops)
42{
43 __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1);
44}
45EXPORT_SYMBOL(__const_udelay);
46
47void __udelay(unsigned long usecs)
48{
49 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
50}
51EXPORT_SYMBOL(__udelay);
52
53void __ndelay(unsigned long nsecs)
54{
55 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
56}
57EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S
new file mode 100644
index 000000000000..6d84b53f12a2
--- /dev/null
+++ b/arch/x86/lib/getuser_32.S
@@ -0,0 +1,78 @@
1/*
2 * __get_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 *
6 * These functions have a non-standard call interface
7 * to make them more efficient, especially as they
8 * return an error value in addition to the "real"
9 * return value.
10 */
11#include <linux/linkage.h>
12#include <asm/dwarf2.h>
13#include <asm/thread_info.h>
14
15
16/*
17 * __get_user_X
18 *
19 * Inputs: %eax contains the address
20 *
21 * Outputs: %eax is error code (0 or -EFAULT)
22 * %edx contains zero-extended value
23 *
24 * These functions should not modify any other registers,
25 * as they get called from within inline assembly.
26 */
27
28.text
29ENTRY(__get_user_1)
30 CFI_STARTPROC
31 GET_THREAD_INFO(%edx)
32 cmpl TI_addr_limit(%edx),%eax
33 jae bad_get_user
341: movzbl (%eax),%edx
35 xorl %eax,%eax
36 ret
37 CFI_ENDPROC
38ENDPROC(__get_user_1)
39
40ENTRY(__get_user_2)
41 CFI_STARTPROC
42 addl $1,%eax
43 jc bad_get_user
44 GET_THREAD_INFO(%edx)
45 cmpl TI_addr_limit(%edx),%eax
46 jae bad_get_user
472: movzwl -1(%eax),%edx
48 xorl %eax,%eax
49 ret
50 CFI_ENDPROC
51ENDPROC(__get_user_2)
52
53ENTRY(__get_user_4)
54 CFI_STARTPROC
55 addl $3,%eax
56 jc bad_get_user
57 GET_THREAD_INFO(%edx)
58 cmpl TI_addr_limit(%edx),%eax
59 jae bad_get_user
603: movl -3(%eax),%edx
61 xorl %eax,%eax
62 ret
63 CFI_ENDPROC
64ENDPROC(__get_user_4)
65
66bad_get_user:
67 CFI_STARTPROC
68 xorl %edx,%edx
69 movl $-14,%eax
70 ret
71 CFI_ENDPROC
72END(bad_get_user)
73
74.section __ex_table,"a"
75 .long 1b,bad_get_user
76 .long 2b,bad_get_user
77 .long 3b,bad_get_user
78.previous
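These stubs back the get_user() macro; a minimal C-level usage sketch (variable names hypothetical):

	int val;

	/* For an int this expands to a call to __get_user_4: the user address
	 * goes in %eax, the value comes back zero-extended in %edx and the
	 * error code (0 or -EFAULT) in %eax. */
	if (get_user(val, (int __user *)uaddr))
		return -EFAULT;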
diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser_64.S
new file mode 100644
index 000000000000..5448876261f8
--- /dev/null
+++ b/arch/x86/lib/getuser_64.S
@@ -0,0 +1,109 @@
1/*
2 * __get_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __get_user_X
15 *
16 * Inputs: %rcx contains the address.
17 * The register is modified, but all changes are undone
18 * before returning because the C code doesn't know about it.
19 *
20 * Outputs: %rax is error code (0 or -EFAULT)
21 * %rdx contains zero-extended value
22 *
23 * %r8 is destroyed.
24 *
25 * These functions should not modify any other registers,
26 * as they get called from within inline assembly.
27 */
28
29#include <linux/linkage.h>
30#include <asm/dwarf2.h>
31#include <asm/page.h>
32#include <asm/errno.h>
33#include <asm/asm-offsets.h>
34#include <asm/thread_info.h>
35
36 .text
37ENTRY(__get_user_1)
38 CFI_STARTPROC
39 GET_THREAD_INFO(%r8)
40 cmpq threadinfo_addr_limit(%r8),%rcx
41 jae bad_get_user
421: movzb (%rcx),%edx
43 xorl %eax,%eax
44 ret
45 CFI_ENDPROC
46ENDPROC(__get_user_1)
47
48ENTRY(__get_user_2)
49 CFI_STARTPROC
50 GET_THREAD_INFO(%r8)
51 addq $1,%rcx
52 jc 20f
53 cmpq threadinfo_addr_limit(%r8),%rcx
54 jae 20f
55 decq %rcx
562: movzwl (%rcx),%edx
57 xorl %eax,%eax
58 ret
5920: decq %rcx
60 jmp bad_get_user
61 CFI_ENDPROC
62ENDPROC(__get_user_2)
63
64ENTRY(__get_user_4)
65 CFI_STARTPROC
66 GET_THREAD_INFO(%r8)
67 addq $3,%rcx
68 jc 30f
69 cmpq threadinfo_addr_limit(%r8),%rcx
70 jae 30f
71 subq $3,%rcx
723: movl (%rcx),%edx
73 xorl %eax,%eax
74 ret
7530: subq $3,%rcx
76 jmp bad_get_user
77 CFI_ENDPROC
78ENDPROC(__get_user_4)
79
80ENTRY(__get_user_8)
81 CFI_STARTPROC
82 GET_THREAD_INFO(%r8)
83 addq $7,%rcx
84 jc 40f
85 cmpq threadinfo_addr_limit(%r8),%rcx
86 jae 40f
87 subq $7,%rcx
884: movq (%rcx),%rdx
89 xorl %eax,%eax
90 ret
9140: subq $7,%rcx
92 jmp bad_get_user
93 CFI_ENDPROC
94ENDPROC(__get_user_8)
95
96bad_get_user:
97 CFI_STARTPROC
98 xorl %edx,%edx
99 movq $(-EFAULT),%rax
100 ret
101 CFI_ENDPROC
102END(bad_get_user)
103
104.section __ex_table,"a"
105 .quad 1b,bad_get_user
106 .quad 2b,bad_get_user
107 .quad 3b,bad_get_user
108 .quad 4b,bad_get_user
109.previous
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
new file mode 100644
index 000000000000..87b4a4e18039
--- /dev/null
+++ b/arch/x86/lib/io_64.c
@@ -0,0 +1,23 @@
1#include <linux/string.h>
2#include <asm/io.h>
3#include <linux/module.h>
4
5void __memcpy_toio(unsigned long dst,const void*src,unsigned len)
6{
7 __inline_memcpy((void *) dst,src,len);
8}
9EXPORT_SYMBOL(__memcpy_toio);
10
11void __memcpy_fromio(void *dst,unsigned long src,unsigned len)
12{
13 __inline_memcpy(dst,(const void *) src,len);
14}
15EXPORT_SYMBOL(__memcpy_fromio);
16
17void memset_io(volatile void __iomem *a, int b, size_t c)
18{
19 /* XXX: memset can mangle the IO patterns quite a bit.
20 perhaps it would be better to use a dumb one */
21 memset((void *)a,b,c);
22}
23EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
new file mode 100644
index 000000000000..05a95e713da8
--- /dev/null
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -0,0 +1,30 @@
1/*
2 * Copyright 2006 PathScale, Inc. All Rights Reserved.
3 *
4 * This file is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License
6 * as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
16 */
17
18#include <linux/linkage.h>
19#include <asm/dwarf2.h>
20
21/*
22 * override generic version in lib/iomap_copy.c
23 */
24ENTRY(__iowrite32_copy)
25 CFI_STARTPROC
26 movl %edx,%ecx
27 rep movsd
28 ret
29 CFI_ENDPROC
30ENDPROC(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
new file mode 100644
index 000000000000..8ac51b82a632
--- /dev/null
+++ b/arch/x86/lib/memcpy_32.c
@@ -0,0 +1,43 @@
1#include <linux/string.h>
2#include <linux/module.h>
3
4#undef memcpy
5#undef memset
6
7void *memcpy(void *to, const void *from, size_t n)
8{
9#ifdef CONFIG_X86_USE_3DNOW
10 return __memcpy3d(to, from, n);
11#else
12 return __memcpy(to, from, n);
13#endif
14}
15EXPORT_SYMBOL(memcpy);
16
17void *memset(void *s, int c, size_t count)
18{
19 return __memset(s, c, count);
20}
21EXPORT_SYMBOL(memset);
22
23void *memmove(void *dest, const void *src, size_t n)
24{
25 int d0, d1, d2;
26
27 if (dest < src) {
28 memcpy(dest,src,n);
29 } else {
30 __asm__ __volatile__(
31 "std\n\t"
32 "rep\n\t"
33 "movsb\n\t"
34 "cld"
35 : "=&c" (d0), "=&S" (d1), "=&D" (d2)
36 :"0" (n),
37 "1" (n-1+(const char *)src),
38 "2" (n-1+(char *)dest)
39 :"memory");
40 }
41 return dest;
42}
43EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
new file mode 100644
index 000000000000..c22981fa2f3a
--- /dev/null
+++ b/arch/x86/lib/memcpy_64.S
@@ -0,0 +1,131 @@
1/* Copyright 2002 Andi Kleen */
2
3#include <linux/linkage.h>
4#include <asm/dwarf2.h>
5#include <asm/cpufeature.h>
6
7/*
8 * memcpy - Copy a memory block.
9 *
10 * Input:
11 * rdi destination
12 * rsi source
13 * rdx count
14 *
15 * Output:
16 * rax original destination
17 */
18
19 ALIGN
20memcpy_c:
21 CFI_STARTPROC
22 movq %rdi,%rax
23 movl %edx,%ecx
24 shrl $3,%ecx
25 andl $7,%edx
26 rep movsq
27 movl %edx,%ecx
28 rep movsb
29 ret
30 CFI_ENDPROC
31ENDPROC(memcpy_c)
32
33ENTRY(__memcpy)
34ENTRY(memcpy)
35 CFI_STARTPROC
36 pushq %rbx
37 CFI_ADJUST_CFA_OFFSET 8
38 CFI_REL_OFFSET rbx, 0
39 movq %rdi,%rax
40
41 movl %edx,%ecx
42 shrl $6,%ecx
43 jz .Lhandle_tail
44
45 .p2align 4
46.Lloop_64:
47 decl %ecx
48
49 movq (%rsi),%r11
50 movq 8(%rsi),%r8
51
52 movq %r11,(%rdi)
53 movq %r8,1*8(%rdi)
54
55 movq 2*8(%rsi),%r9
56 movq 3*8(%rsi),%r10
57
58 movq %r9,2*8(%rdi)
59 movq %r10,3*8(%rdi)
60
61 movq 4*8(%rsi),%r11
62 movq 5*8(%rsi),%r8
63
64 movq %r11,4*8(%rdi)
65 movq %r8,5*8(%rdi)
66
67 movq 6*8(%rsi),%r9
68 movq 7*8(%rsi),%r10
69
70 movq %r9,6*8(%rdi)
71 movq %r10,7*8(%rdi)
72
73 leaq 64(%rsi),%rsi
74 leaq 64(%rdi),%rdi
75 jnz .Lloop_64
76
77.Lhandle_tail:
78 movl %edx,%ecx
79 andl $63,%ecx
80 shrl $3,%ecx
81 jz .Lhandle_7
82 .p2align 4
83.Lloop_8:
84 decl %ecx
85 movq (%rsi),%r8
86 movq %r8,(%rdi)
87 leaq 8(%rdi),%rdi
88 leaq 8(%rsi),%rsi
89 jnz .Lloop_8
90
91.Lhandle_7:
92 movl %edx,%ecx
93 andl $7,%ecx
94 jz .Lende
95 .p2align 4
96.Lloop_1:
97 movb (%rsi),%r8b
98 movb %r8b,(%rdi)
99 incq %rdi
100 incq %rsi
101 decl %ecx
102 jnz .Lloop_1
103
104.Lende:
105 popq %rbx
106 CFI_ADJUST_CFA_OFFSET -8
107 CFI_RESTORE rbx
108 ret
109.Lfinal:
110 CFI_ENDPROC
111ENDPROC(memcpy)
112ENDPROC(__memcpy)
113
114	/* Some CPUs run faster using the string copy instructions.
115	   They are also a lot simpler. Use them when possible */
116
117 .section .altinstr_replacement,"ax"
1181: .byte 0xeb /* jmp <disp8> */
119 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
1202:
121 .previous
122 .section .altinstructions,"a"
123 .align 8
124 .quad memcpy
125 .quad 1b
126 .byte X86_FEATURE_REP_GOOD
127	/* Replace only the beginning: memcpy is itself used to apply the
128	 * alternatives, so overwriting it with nops would only end in a reboot... */
129 .byte 2b - 1b
130 .byte 2b - 1b
131 .previous
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
new file mode 100644
index 000000000000..751ebae8ec42
--- /dev/null
+++ b/arch/x86/lib/memmove_64.c
@@ -0,0 +1,21 @@
1/* Normally compiler builtins are used, but sometimes the compiler calls out
2 of line code. Based on asm-i386/string.h.
3 */
4#define _STRING_C
5#include <linux/string.h>
6#include <linux/module.h>
7
8#undef memmove
9void *memmove(void * dest,const void *src,size_t count)
10{
11 if (dest < src) {
12 return memcpy(dest,src,count);
13 } else {
14 char *p = (char *) dest + count;
15 char *s = (char *) src + count;
16 while (count--)
17 *--p = *--s;
18 }
19 return dest;
20}
21EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
new file mode 100644
index 000000000000..2c5948116bd2
--- /dev/null
+++ b/arch/x86/lib/memset_64.S
@@ -0,0 +1,133 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs */
2
3#include <linux/linkage.h>
4#include <asm/dwarf2.h>
5
6/*
7 * ISO C memset - set a memory block to a byte value.
8 *
9 * rdi destination
10 * rsi value (char)
11 * rdx count (bytes)
12 *
13 * rax original destination
14 */
15 ALIGN
16memset_c:
17 CFI_STARTPROC
18 movq %rdi,%r9
19 movl %edx,%r8d
20 andl $7,%r8d
21 movl %edx,%ecx
22 shrl $3,%ecx
23 /* expand byte value */
24 movzbl %sil,%esi
25 movabs $0x0101010101010101,%rax
26 mulq %rsi /* with rax, clobbers rdx */
27 rep stosq
28 movl %r8d,%ecx
29 rep stosb
30 movq %r9,%rax
31 ret
32 CFI_ENDPROC
33ENDPROC(memset_c)
34
35ENTRY(memset)
36ENTRY(__memset)
37 CFI_STARTPROC
38 movq %rdi,%r10
39 movq %rdx,%r11
40
41 /* expand byte value */
42 movzbl %sil,%ecx
43 movabs $0x0101010101010101,%rax
44 mul %rcx /* with rax, clobbers rdx */
45
46 /* align dst */
47 movl %edi,%r9d
48 andl $7,%r9d
49 jnz .Lbad_alignment
50 CFI_REMEMBER_STATE
51.Lafter_bad_alignment:
52
53 movl %r11d,%ecx
54 shrl $6,%ecx
55 jz .Lhandle_tail
56
57 .p2align 4
58.Lloop_64:
59 decl %ecx
60 movq %rax,(%rdi)
61 movq %rax,8(%rdi)
62 movq %rax,16(%rdi)
63 movq %rax,24(%rdi)
64 movq %rax,32(%rdi)
65 movq %rax,40(%rdi)
66 movq %rax,48(%rdi)
67 movq %rax,56(%rdi)
68 leaq 64(%rdi),%rdi
69 jnz .Lloop_64
70
71 /* Handle tail in loops. The loops should be faster than hard
72 to predict jump tables. */
73 .p2align 4
74.Lhandle_tail:
75 movl %r11d,%ecx
76 andl $63&(~7),%ecx
77 jz .Lhandle_7
78 shrl $3,%ecx
79 .p2align 4
80.Lloop_8:
81 decl %ecx
82 movq %rax,(%rdi)
83 leaq 8(%rdi),%rdi
84 jnz .Lloop_8
85
86.Lhandle_7:
87 movl %r11d,%ecx
88 andl $7,%ecx
89 jz .Lende
90 .p2align 4
91.Lloop_1:
92 decl %ecx
93 movb %al,(%rdi)
94 leaq 1(%rdi),%rdi
95 jnz .Lloop_1
96
97.Lende:
98 movq %r10,%rax
99 ret
100
101 CFI_RESTORE_STATE
102.Lbad_alignment:
103 cmpq $7,%r11
104 jbe .Lhandle_7
105 movq %rax,(%rdi) /* unaligned store */
106 movq $8,%r8
107 subq %r9,%r8
108 addq %r8,%rdi
109 subq %r8,%r11
110 jmp .Lafter_bad_alignment
111.Lfinal:
112 CFI_ENDPROC
113ENDPROC(memset)
114ENDPROC(__memset)
115
116	/* Some CPUs run faster using the string instructions.
117	   They are also a lot simpler. Use them when possible */
118
119#include <asm/cpufeature.h>
120
121 .section .altinstr_replacement,"ax"
1221: .byte 0xeb /* jmp <disp8> */
123 .byte (memset_c - memset) - (2f - 1b) /* offset */
1242:
125 .previous
126 .section .altinstructions,"a"
127 .align 8
128 .quad memset
129 .quad 1b
130 .byte X86_FEATURE_REP_GOOD
131 .byte .Lfinal - memset
132 .byte 2b - 1b
133 .previous
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
new file mode 100644
index 000000000000..28084d2e8dd4
--- /dev/null
+++ b/arch/x86/lib/mmx_32.c
@@ -0,0 +1,403 @@
1#include <linux/types.h>
2#include <linux/string.h>
3#include <linux/sched.h>
4#include <linux/hardirq.h>
5#include <linux/module.h>
6
7#include <asm/i387.h>
8
9
10/*
11 * MMX 3DNow! library helper functions
12 *
13 * To do:
14 * We can use MMX just for prefetch in IRQ's. This may be a win.
15 * (reported so on K6-III)
16 *	We should use a better code-neutral filler for the short jump:
17 *	leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
18 * We also want to clobber the filler register so we don't get any
19 * register forwarding stalls on the filler.
20 *
21 * Add *user handling. Checksums are not a win with MMX on any CPU
22 *	tested so far for any MMX approach tried.
23 *
24 * 22/09/2000 - Arjan van de Ven
25 *		Improved for non-engineering-sample Athlons
26 *
27 */
28
29void *_mmx_memcpy(void *to, const void *from, size_t len)
30{
31 void *p;
32 int i;
33
34 if (unlikely(in_interrupt()))
35 return __memcpy(to, from, len);
36
37 p = to;
38 i = len >> 6; /* len/64 */
39
40 kernel_fpu_begin();
41
42 __asm__ __volatile__ (
43 "1: prefetch (%0)\n" /* This set is 28 bytes */
44 " prefetch 64(%0)\n"
45 " prefetch 128(%0)\n"
46 " prefetch 192(%0)\n"
47 " prefetch 256(%0)\n"
48 "2: \n"
49 ".section .fixup, \"ax\"\n"
50 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
51 " jmp 2b\n"
52 ".previous\n"
53 ".section __ex_table,\"a\"\n"
54 " .align 4\n"
55 " .long 1b, 3b\n"
56 ".previous"
57 : : "r" (from) );
58
59
60 for(; i>5; i--)
61 {
62 __asm__ __volatile__ (
63 "1: prefetch 320(%0)\n"
64 "2: movq (%0), %%mm0\n"
65 " movq 8(%0), %%mm1\n"
66 " movq 16(%0), %%mm2\n"
67 " movq 24(%0), %%mm3\n"
68 " movq %%mm0, (%1)\n"
69 " movq %%mm1, 8(%1)\n"
70 " movq %%mm2, 16(%1)\n"
71 " movq %%mm3, 24(%1)\n"
72 " movq 32(%0), %%mm0\n"
73 " movq 40(%0), %%mm1\n"
74 " movq 48(%0), %%mm2\n"
75 " movq 56(%0), %%mm3\n"
76 " movq %%mm0, 32(%1)\n"
77 " movq %%mm1, 40(%1)\n"
78 " movq %%mm2, 48(%1)\n"
79 " movq %%mm3, 56(%1)\n"
80 ".section .fixup, \"ax\"\n"
81 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
82 " jmp 2b\n"
83 ".previous\n"
84 ".section __ex_table,\"a\"\n"
85 " .align 4\n"
86 " .long 1b, 3b\n"
87 ".previous"
88 : : "r" (from), "r" (to) : "memory");
89 from+=64;
90 to+=64;
91 }
92
93 for(; i>0; i--)
94 {
95 __asm__ __volatile__ (
96 " movq (%0), %%mm0\n"
97 " movq 8(%0), %%mm1\n"
98 " movq 16(%0), %%mm2\n"
99 " movq 24(%0), %%mm3\n"
100 " movq %%mm0, (%1)\n"
101 " movq %%mm1, 8(%1)\n"
102 " movq %%mm2, 16(%1)\n"
103 " movq %%mm3, 24(%1)\n"
104 " movq 32(%0), %%mm0\n"
105 " movq 40(%0), %%mm1\n"
106 " movq 48(%0), %%mm2\n"
107 " movq 56(%0), %%mm3\n"
108 " movq %%mm0, 32(%1)\n"
109 " movq %%mm1, 40(%1)\n"
110 " movq %%mm2, 48(%1)\n"
111 " movq %%mm3, 56(%1)\n"
112 : : "r" (from), "r" (to) : "memory");
113 from+=64;
114 to+=64;
115 }
116 /*
117 * Now do the tail of the block
118 */
119 __memcpy(to, from, len&63);
120 kernel_fpu_end();
121 return p;
122}
123
124#ifdef CONFIG_MK7
125
126/*
127 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
128 * other MMX using processors do not.
129 */
130
131static void fast_clear_page(void *page)
132{
133 int i;
134
135 kernel_fpu_begin();
136
137 __asm__ __volatile__ (
138 " pxor %%mm0, %%mm0\n" : :
139 );
140
141 for(i=0;i<4096/64;i++)
142 {
143 __asm__ __volatile__ (
144 " movntq %%mm0, (%0)\n"
145 " movntq %%mm0, 8(%0)\n"
146 " movntq %%mm0, 16(%0)\n"
147 " movntq %%mm0, 24(%0)\n"
148 " movntq %%mm0, 32(%0)\n"
149 " movntq %%mm0, 40(%0)\n"
150 " movntq %%mm0, 48(%0)\n"
151 " movntq %%mm0, 56(%0)\n"
152 : : "r" (page) : "memory");
153 page+=64;
154 }
155 /* since movntq is weakly-ordered, a "sfence" is needed to become
156 * ordered again.
157 */
158 __asm__ __volatile__ (
159 " sfence \n" : :
160 );
161 kernel_fpu_end();
162}
163
164static void fast_copy_page(void *to, void *from)
165{
166 int i;
167
168 kernel_fpu_begin();
169
170 /* maybe the prefetch stuff can go before the expensive fnsave...
171 * but that is for later. -AV
172 */
173 __asm__ __volatile__ (
174 "1: prefetch (%0)\n"
175 " prefetch 64(%0)\n"
176 " prefetch 128(%0)\n"
177 " prefetch 192(%0)\n"
178 " prefetch 256(%0)\n"
179 "2: \n"
180 ".section .fixup, \"ax\"\n"
181 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
182 " jmp 2b\n"
183 ".previous\n"
184 ".section __ex_table,\"a\"\n"
185 " .align 4\n"
186 " .long 1b, 3b\n"
187 ".previous"
188 : : "r" (from) );
189
190 for(i=0; i<(4096-320)/64; i++)
191 {
192 __asm__ __volatile__ (
193 "1: prefetch 320(%0)\n"
194 "2: movq (%0), %%mm0\n"
195 " movntq %%mm0, (%1)\n"
196 " movq 8(%0), %%mm1\n"
197 " movntq %%mm1, 8(%1)\n"
198 " movq 16(%0), %%mm2\n"
199 " movntq %%mm2, 16(%1)\n"
200 " movq 24(%0), %%mm3\n"
201 " movntq %%mm3, 24(%1)\n"
202 " movq 32(%0), %%mm4\n"
203 " movntq %%mm4, 32(%1)\n"
204 " movq 40(%0), %%mm5\n"
205 " movntq %%mm5, 40(%1)\n"
206 " movq 48(%0), %%mm6\n"
207 " movntq %%mm6, 48(%1)\n"
208 " movq 56(%0), %%mm7\n"
209 " movntq %%mm7, 56(%1)\n"
210 ".section .fixup, \"ax\"\n"
211 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
212 " jmp 2b\n"
213 ".previous\n"
214 ".section __ex_table,\"a\"\n"
215 " .align 4\n"
216 " .long 1b, 3b\n"
217 ".previous"
218 : : "r" (from), "r" (to) : "memory");
219 from+=64;
220 to+=64;
221 }
222 for(i=(4096-320)/64; i<4096/64; i++)
223 {
224 __asm__ __volatile__ (
225 "2: movq (%0), %%mm0\n"
226 " movntq %%mm0, (%1)\n"
227 " movq 8(%0), %%mm1\n"
228 " movntq %%mm1, 8(%1)\n"
229 " movq 16(%0), %%mm2\n"
230 " movntq %%mm2, 16(%1)\n"
231 " movq 24(%0), %%mm3\n"
232 " movntq %%mm3, 24(%1)\n"
233 " movq 32(%0), %%mm4\n"
234 " movntq %%mm4, 32(%1)\n"
235 " movq 40(%0), %%mm5\n"
236 " movntq %%mm5, 40(%1)\n"
237 " movq 48(%0), %%mm6\n"
238 " movntq %%mm6, 48(%1)\n"
239 " movq 56(%0), %%mm7\n"
240 " movntq %%mm7, 56(%1)\n"
241 : : "r" (from), "r" (to) : "memory");
242 from+=64;
243 to+=64;
244 }
245 /* since movntq is weakly-ordered, a "sfence" is needed to become
246 * ordered again.
247 */
248 __asm__ __volatile__ (
249 " sfence \n" : :
250 );
251 kernel_fpu_end();
252}
253
254#else
255
256/*
257 * Generic MMX implementation without K7 specific streaming
258 */
259
260static void fast_clear_page(void *page)
261{
262 int i;
263
264 kernel_fpu_begin();
265
266 __asm__ __volatile__ (
267 " pxor %%mm0, %%mm0\n" : :
268 );
269
270 for(i=0;i<4096/128;i++)
271 {
272 __asm__ __volatile__ (
273 " movq %%mm0, (%0)\n"
274 " movq %%mm0, 8(%0)\n"
275 " movq %%mm0, 16(%0)\n"
276 " movq %%mm0, 24(%0)\n"
277 " movq %%mm0, 32(%0)\n"
278 " movq %%mm0, 40(%0)\n"
279 " movq %%mm0, 48(%0)\n"
280 " movq %%mm0, 56(%0)\n"
281 " movq %%mm0, 64(%0)\n"
282 " movq %%mm0, 72(%0)\n"
283 " movq %%mm0, 80(%0)\n"
284 " movq %%mm0, 88(%0)\n"
285 " movq %%mm0, 96(%0)\n"
286 " movq %%mm0, 104(%0)\n"
287 " movq %%mm0, 112(%0)\n"
288 " movq %%mm0, 120(%0)\n"
289 : : "r" (page) : "memory");
290 page+=128;
291 }
292
293 kernel_fpu_end();
294}
295
296static void fast_copy_page(void *to, void *from)
297{
298 int i;
299
300
301 kernel_fpu_begin();
302
303 __asm__ __volatile__ (
304 "1: prefetch (%0)\n"
305 " prefetch 64(%0)\n"
306 " prefetch 128(%0)\n"
307 " prefetch 192(%0)\n"
308 " prefetch 256(%0)\n"
309 "2: \n"
310 ".section .fixup, \"ax\"\n"
311 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
312 " jmp 2b\n"
313 ".previous\n"
314 ".section __ex_table,\"a\"\n"
315 " .align 4\n"
316 " .long 1b, 3b\n"
317 ".previous"
318 : : "r" (from) );
319
320 for(i=0; i<4096/64; i++)
321 {
322 __asm__ __volatile__ (
323 "1: prefetch 320(%0)\n"
324 "2: movq (%0), %%mm0\n"
325 " movq 8(%0), %%mm1\n"
326 " movq 16(%0), %%mm2\n"
327 " movq 24(%0), %%mm3\n"
328 " movq %%mm0, (%1)\n"
329 " movq %%mm1, 8(%1)\n"
330 " movq %%mm2, 16(%1)\n"
331 " movq %%mm3, 24(%1)\n"
332 " movq 32(%0), %%mm0\n"
333 " movq 40(%0), %%mm1\n"
334 " movq 48(%0), %%mm2\n"
335 " movq 56(%0), %%mm3\n"
336 " movq %%mm0, 32(%1)\n"
337 " movq %%mm1, 40(%1)\n"
338 " movq %%mm2, 48(%1)\n"
339 " movq %%mm3, 56(%1)\n"
340 ".section .fixup, \"ax\"\n"
341 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
342 " jmp 2b\n"
343 ".previous\n"
344 ".section __ex_table,\"a\"\n"
345 " .align 4\n"
346 " .long 1b, 3b\n"
347 ".previous"
348 : : "r" (from), "r" (to) : "memory");
349 from+=64;
350 to+=64;
351 }
352 kernel_fpu_end();
353}
354
355
356#endif
357
358/*
359 * Favour MMX for page clear and copy.
360 */
361
362static void slow_zero_page(void * page)
363{
364 int d0, d1;
365 __asm__ __volatile__( \
366 "cld\n\t" \
367 "rep ; stosl" \
368 : "=&c" (d0), "=&D" (d1)
369 :"a" (0),"1" (page),"0" (1024)
370 :"memory");
371}
372
373void mmx_clear_page(void * page)
374{
375 if(unlikely(in_interrupt()))
376 slow_zero_page(page);
377 else
378 fast_clear_page(page);
379}
380
381static void slow_copy_page(void *to, void *from)
382{
383 int d0, d1, d2;
384 __asm__ __volatile__( \
385 "cld\n\t" \
386 "rep ; movsl" \
387 : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
388 : "0" (1024),"1" ((long) to),"2" ((long) from) \
389 : "memory");
390}
391
392
393void mmx_copy_page(void *to, void *from)
394{
395 if(unlikely(in_interrupt()))
396 slow_copy_page(to, from);
397 else
398 fast_copy_page(to, from);
399}
400
401EXPORT_SYMBOL(_mmx_memcpy);
402EXPORT_SYMBOL(mmx_clear_page);
403EXPORT_SYMBOL(mmx_copy_page);
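The movw fixups above are worth a second look: instead of retrying, a faulting prefetch is patched into a short jump over itself. An illustrative reading of the magic constants (the byte counts are taken from the source's own comments):

	unsigned short patch = 0x1AEB;	/* stored little endian: bytes EB 1A */

	/* 0xEB is the opcode of a short JMP rel8, and 0x1A (26) skips the rest
	 * of the 28-byte block of prefetch instructions (the jmp replaces its
	 * first two bytes), landing on label 2.  0x05EB likewise turns the
	 * single "prefetch 320(%0)" into a jump over its remaining bytes.  So
	 * a CPU whose prefetch faults takes the fault once, gets patched, and
	 * never executes that prefetch again. */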
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
new file mode 100644
index 000000000000..7767962f25d3
--- /dev/null
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -0,0 +1,119 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6struct msr_info {
7 u32 msr_no;
8 u32 l, h;
9 int err;
10};
11
12static void __rdmsr_on_cpu(void *info)
13{
14 struct msr_info *rv = info;
15
16 rdmsr(rv->msr_no, rv->l, rv->h);
17}
18
19static void __rdmsr_safe_on_cpu(void *info)
20{
21 struct msr_info *rv = info;
22
23 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
24}
25
26static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
27{
28 int err = 0;
29 preempt_disable();
30 if (smp_processor_id() == cpu)
31 if (safe)
32 err = rdmsr_safe(msr_no, l, h);
33 else
34 rdmsr(msr_no, *l, *h);
35 else {
36 struct msr_info rv;
37
38 rv.msr_no = msr_no;
39 if (safe) {
40 smp_call_function_single(cpu, __rdmsr_safe_on_cpu,
41 &rv, 0, 1);
42 err = rv.err;
43 } else {
44 smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1);
45 }
46 *l = rv.l;
47 *h = rv.h;
48 }
49 preempt_enable();
50 return err;
51}
52
53static void __wrmsr_on_cpu(void *info)
54{
55 struct msr_info *rv = info;
56
57 wrmsr(rv->msr_no, rv->l, rv->h);
58}
59
60static void __wrmsr_safe_on_cpu(void *info)
61{
62 struct msr_info *rv = info;
63
64 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
65}
66
67static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
68{
69 int err = 0;
70 preempt_disable();
71 if (smp_processor_id() == cpu)
72 if (safe)
73 err = wrmsr_safe(msr_no, l, h);
74 else
75 wrmsr(msr_no, l, h);
76 else {
77 struct msr_info rv;
78
79 rv.msr_no = msr_no;
80 rv.l = l;
81 rv.h = h;
82 if (safe) {
83 smp_call_function_single(cpu, __wrmsr_safe_on_cpu,
84 &rv, 0, 1);
85 err = rv.err;
86 } else {
87 smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1);
88 }
89 }
90 preempt_enable();
91 return err;
92}
93
94void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
95{
96 _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
97}
98
99void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
100{
101 _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
102}
103
104/* These "safe" variants are slower and should be used when the target MSR
105 may not actually exist. */
106int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
107{
108 return _wrmsr_on_cpu(cpu, msr_no, l, h, 1);
109}
110
111int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
112{
113 return _rdmsr_on_cpu(cpu, msr_no, l, h, 1);
114}
115
116EXPORT_SYMBOL(rdmsr_on_cpu);
117EXPORT_SYMBOL(wrmsr_on_cpu);
118EXPORT_SYMBOL(rdmsr_safe_on_cpu);
119EXPORT_SYMBOL(wrmsr_safe_on_cpu);
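A short usage sketch; the MSR chosen here is only an example:

	u32 lo, hi;

	/* Read IA32_APIC_BASE as seen by CPU 2, regardless of where we run. */
	rdmsr_on_cpu(2, MSR_IA32_APICBASE, &lo, &hi);

	/* The _safe variants return an error instead of faulting when the MSR
	 * may not exist on that CPU (msr_no here is a hypothetical MSR). */
	if (rdmsr_safe_on_cpu(2, msr_no, &lo, &hi))
		return -EIO;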
diff --git a/arch/x86/lib/putuser_32.S b/arch/x86/lib/putuser_32.S
new file mode 100644
index 000000000000..f58fba109d18
--- /dev/null
+++ b/arch/x86/lib/putuser_32.S
@@ -0,0 +1,98 @@
1/*
2 * __put_user functions.
3 *
4 * (C) Copyright 2005 Linus Torvalds
5 *
6 * These functions have a non-standard call interface
7 * to make them more efficient, especially as they
8 * return an error value in addition to the "real"
9 * return value.
10 */
11#include <linux/linkage.h>
12#include <asm/dwarf2.h>
13#include <asm/thread_info.h>
14
15
16/*
17 * __put_user_X
18 *
19 * Inputs: %eax[:%edx] contains the data
20 * %ecx contains the address
21 *
22 * Outputs: %eax is error code (0 or -EFAULT)
23 *
24 * These functions should not modify any other registers,
25 * as they get called from within inline assembly.
26 */
27
28#define ENTER CFI_STARTPROC ; \
29 pushl %ebx ; \
30 CFI_ADJUST_CFA_OFFSET 4 ; \
31 CFI_REL_OFFSET ebx, 0 ; \
32 GET_THREAD_INFO(%ebx)
33#define EXIT popl %ebx ; \
34 CFI_ADJUST_CFA_OFFSET -4 ; \
35 CFI_RESTORE ebx ; \
36 ret ; \
37 CFI_ENDPROC
38
39.text
40ENTRY(__put_user_1)
41 ENTER
42 cmpl TI_addr_limit(%ebx),%ecx
43 jae bad_put_user
441: movb %al,(%ecx)
45 xorl %eax,%eax
46 EXIT
47ENDPROC(__put_user_1)
48
49ENTRY(__put_user_2)
50 ENTER
51 movl TI_addr_limit(%ebx),%ebx
52 subl $1,%ebx
53 cmpl %ebx,%ecx
54 jae bad_put_user
552: movw %ax,(%ecx)
56 xorl %eax,%eax
57 EXIT
58ENDPROC(__put_user_2)
59
60ENTRY(__put_user_4)
61 ENTER
62 movl TI_addr_limit(%ebx),%ebx
63 subl $3,%ebx
64 cmpl %ebx,%ecx
65 jae bad_put_user
663: movl %eax,(%ecx)
67 xorl %eax,%eax
68 EXIT
69ENDPROC(__put_user_4)
70
71ENTRY(__put_user_8)
72 ENTER
73 movl TI_addr_limit(%ebx),%ebx
74 subl $7,%ebx
75 cmpl %ebx,%ecx
76 jae bad_put_user
774: movl %eax,(%ecx)
785: movl %edx,4(%ecx)
79 xorl %eax,%eax
80 EXIT
81ENDPROC(__put_user_8)
82
83bad_put_user:
84 CFI_STARTPROC simple
85 CFI_DEF_CFA esp, 2*4
86 CFI_OFFSET eip, -1*4
87 CFI_OFFSET ebx, -2*4
88 movl $-14,%eax
89 EXIT
90END(bad_put_user)
91
92.section __ex_table,"a"
93 .long 1b,bad_put_user
94 .long 2b,bad_put_user
95 .long 3b,bad_put_user
96 .long 4b,bad_put_user
97 .long 5b,bad_put_user
98.previous
diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S
new file mode 100644
index 000000000000..4989f5a8fa9b
--- /dev/null
+++ b/arch/x86/lib/putuser_64.S
@@ -0,0 +1,106 @@
1/*
2 * __put_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __put_user_X
15 *
16 * Inputs: %rcx contains the address
17 * %rdx contains new value
18 *
19 * Outputs: %rax is error code (0 or -EFAULT)
20 *
21 * %r8 is destroyed.
22 *
23 * These functions should not modify any other registers,
24 * as they get called from within inline assembly.
25 */
26
27#include <linux/linkage.h>
28#include <asm/dwarf2.h>
29#include <asm/page.h>
30#include <asm/errno.h>
31#include <asm/asm-offsets.h>
32#include <asm/thread_info.h>
33
34 .text
35ENTRY(__put_user_1)
36 CFI_STARTPROC
37 GET_THREAD_INFO(%r8)
38 cmpq threadinfo_addr_limit(%r8),%rcx
39 jae bad_put_user
401: movb %dl,(%rcx)
41 xorl %eax,%eax
42 ret
43 CFI_ENDPROC
44ENDPROC(__put_user_1)
45
46ENTRY(__put_user_2)
47 CFI_STARTPROC
48 GET_THREAD_INFO(%r8)
49 addq $1,%rcx
50 jc 20f
51 cmpq threadinfo_addr_limit(%r8),%rcx
52 jae 20f
53 decq %rcx
542: movw %dx,(%rcx)
55 xorl %eax,%eax
56 ret
5720: decq %rcx
58 jmp bad_put_user
59 CFI_ENDPROC
60ENDPROC(__put_user_2)
61
62ENTRY(__put_user_4)
63 CFI_STARTPROC
64 GET_THREAD_INFO(%r8)
65 addq $3,%rcx
66 jc 30f
67 cmpq threadinfo_addr_limit(%r8),%rcx
68 jae 30f
69 subq $3,%rcx
703: movl %edx,(%rcx)
71 xorl %eax,%eax
72 ret
7330: subq $3,%rcx
74 jmp bad_put_user
75 CFI_ENDPROC
76ENDPROC(__put_user_4)
77
78ENTRY(__put_user_8)
79 CFI_STARTPROC
80 GET_THREAD_INFO(%r8)
81 addq $7,%rcx
82 jc 40f
83 cmpq threadinfo_addr_limit(%r8),%rcx
84 jae 40f
85 subq $7,%rcx
864: movq %rdx,(%rcx)
87 xorl %eax,%eax
88 ret
8940: subq $7,%rcx
90 jmp bad_put_user
91 CFI_ENDPROC
92ENDPROC(__put_user_8)
93
94bad_put_user:
95 CFI_STARTPROC
96 movq $(-EFAULT),%rax
97 ret
98 CFI_ENDPROC
99END(bad_put_user)
100
101.section __ex_table,"a"
102 .quad 1b,bad_put_user
103 .quad 2b,bad_put_user
104 .quad 3b,bad_put_user
105 .quad 4b,bad_put_user
106.previous
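The __ex_table sections emitted at the end of both files pair the address of each store that may fault (the numeric local labels) with bad_put_user. Conceptually, the page-fault handler looks the faulting instruction pointer up in this table and resumes at the fixup instead of oopsing; the sketch below only illustrates that lookup and is not the kernel's actual exception-table code:

    /* Conceptual mirror of one __ex_table entry and its lookup (illustrative). */
    struct example_exentry {
            unsigned long insn;     /* address of the faulting mov (labels 1..5) */
            unsigned long fixup;    /* where to resume: bad_put_user             */
    };

    static unsigned long example_find_fixup(const struct example_exentry *tbl,
                                            unsigned long nr, unsigned long fault_ip)
    {
            unsigned long i;

            for (i = 0; i < nr; i++)        /* linear scan, for illustration only */
                    if (tbl[i].insn == fault_ip)
                            return tbl[i].fixup;
            return 0;                       /* no fixup found: fault is fatal */
    }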
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
new file mode 100644
index 000000000000..0cde1f807314
--- /dev/null
+++ b/arch/x86/lib/rwlock_64.S
@@ -0,0 +1,38 @@
1/* Slow paths of read/write spinlocks. */
2
3#include <linux/linkage.h>
4#include <asm/rwlock.h>
5#include <asm/alternative-asm.i>
6#include <asm/dwarf2.h>
7
8/* rdi: pointer to rwlock_t */
9ENTRY(__write_lock_failed)
10 CFI_STARTPROC
11 LOCK_PREFIX
12 addl $RW_LOCK_BIAS,(%rdi)
131: rep
14 nop
15 cmpl $RW_LOCK_BIAS,(%rdi)
16 jne 1b
17 LOCK_PREFIX
18 subl $RW_LOCK_BIAS,(%rdi)
19 jnz __write_lock_failed
20 ret
21 CFI_ENDPROC
22END(__write_lock_failed)
23
24/* rdi: pointer to rwlock_t */
25ENTRY(__read_lock_failed)
26 CFI_STARTPROC
27 LOCK_PREFIX
28 incl (%rdi)
291: rep
30 nop
31 cmpl $1,(%rdi)
32 js 1b
33 LOCK_PREFIX
34 decl (%rdi)
35 js __read_lock_failed
36 ret
37 CFI_ENDPROC
38END(__read_lock_failed)
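These are only the contended slow paths; the inline fast path relies on the RW_LOCK_BIAS counter: the lock word starts at the bias, each reader subtracts 1, and a writer subtracts the whole bias and only succeeds if the result is zero. A rough C rendering of that arithmetic, with the locked x86 instructions replaced by GCC atomic builtins purely for illustration (these helpers are not the kernel's rwlock API):

    #define EXAMPLE_RW_LOCK_BIAS 0x01000000     /* assumed value of RW_LOCK_BIAS */

    static int example_write_trylock(int *lock)
    {
            /* writer claims the whole bias; anything left over means it is held */
            if (__sync_sub_and_fetch(lock, EXAMPLE_RW_LOCK_BIAS) == 0)
                    return 1;
            __sync_fetch_and_add(lock, EXAMPLE_RW_LOCK_BIAS);   /* undo, as __write_lock_failed does */
            return 0;
    }

    static int example_read_trylock(int *lock)
    {
            /* each reader takes one unit; a negative result means a writer holds it */
            if (__sync_sub_and_fetch(lock, 1) >= 0)
                    return 1;
            __sync_fetch_and_add(lock, 1);                      /* undo, as __read_lock_failed does */
            return 0;
    }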
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
new file mode 100644
index 000000000000..c01eb39c0b43
--- /dev/null
+++ b/arch/x86/lib/semaphore_32.S
@@ -0,0 +1,219 @@
1/*
2 * i386 semaphore implementation.
3 *
4 * (C) Copyright 1999 Linus Torvalds
5 *
6 * Portions Copyright 1999 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
14 */
15
16#include <linux/linkage.h>
17#include <asm/rwlock.h>
18#include <asm/alternative-asm.i>
19#include <asm/frame.i>
20#include <asm/dwarf2.h>
21
22/*
23 * The semaphore operations have a special calling sequence that
24 * allow us to do a simpler in-line version of them. These routines
25 * need to convert that sequence back into the C sequence when
26 * there is contention on the semaphore.
27 *
28 * %eax contains the semaphore pointer on entry. Save the C-clobbered
29 * registers (%eax, %edx and %ecx) except %eax, which is either a return
30 * value or just clobbered.
31 */
32 .section .sched.text
33ENTRY(__down_failed)
34 CFI_STARTPROC
35 FRAME
36 pushl %edx
37 CFI_ADJUST_CFA_OFFSET 4
38 CFI_REL_OFFSET edx,0
39 pushl %ecx
40 CFI_ADJUST_CFA_OFFSET 4
41 CFI_REL_OFFSET ecx,0
42 call __down
43 popl %ecx
44 CFI_ADJUST_CFA_OFFSET -4
45 CFI_RESTORE ecx
46 popl %edx
47 CFI_ADJUST_CFA_OFFSET -4
48 CFI_RESTORE edx
49 ENDFRAME
50 ret
51 CFI_ENDPROC
52 END(__down_failed)
53
54ENTRY(__down_failed_interruptible)
55 CFI_STARTPROC
56 FRAME
57 pushl %edx
58 CFI_ADJUST_CFA_OFFSET 4
59 CFI_REL_OFFSET edx,0
60 pushl %ecx
61 CFI_ADJUST_CFA_OFFSET 4
62 CFI_REL_OFFSET ecx,0
63 call __down_interruptible
64 popl %ecx
65 CFI_ADJUST_CFA_OFFSET -4
66 CFI_RESTORE ecx
67 popl %edx
68 CFI_ADJUST_CFA_OFFSET -4
69 CFI_RESTORE edx
70 ENDFRAME
71 ret
72 CFI_ENDPROC
73 END(__down_failed_interruptible)
74
75ENTRY(__down_failed_trylock)
76 CFI_STARTPROC
77 FRAME
78 pushl %edx
79 CFI_ADJUST_CFA_OFFSET 4
80 CFI_REL_OFFSET edx,0
81 pushl %ecx
82 CFI_ADJUST_CFA_OFFSET 4
83 CFI_REL_OFFSET ecx,0
84 call __down_trylock
85 popl %ecx
86 CFI_ADJUST_CFA_OFFSET -4
87 CFI_RESTORE ecx
88 popl %edx
89 CFI_ADJUST_CFA_OFFSET -4
90 CFI_RESTORE edx
91 ENDFRAME
92 ret
93 CFI_ENDPROC
94 END(__down_failed_trylock)
95
96ENTRY(__up_wakeup)
97 CFI_STARTPROC
98 FRAME
99 pushl %edx
100 CFI_ADJUST_CFA_OFFSET 4
101 CFI_REL_OFFSET edx,0
102 pushl %ecx
103 CFI_ADJUST_CFA_OFFSET 4
104 CFI_REL_OFFSET ecx,0
105 call __up
106 popl %ecx
107 CFI_ADJUST_CFA_OFFSET -4
108 CFI_RESTORE ecx
109 popl %edx
110 CFI_ADJUST_CFA_OFFSET -4
111 CFI_RESTORE edx
112 ENDFRAME
113 ret
114 CFI_ENDPROC
115 END(__up_wakeup)
116
117/*
118 * rw spinlock fallbacks
119 */
120#ifdef CONFIG_SMP
121ENTRY(__write_lock_failed)
122 CFI_STARTPROC simple
123 FRAME
1242: LOCK_PREFIX
125 addl $ RW_LOCK_BIAS,(%eax)
1261: rep; nop
127 cmpl $ RW_LOCK_BIAS,(%eax)
128 jne 1b
129 LOCK_PREFIX
130 subl $ RW_LOCK_BIAS,(%eax)
131 jnz 2b
132 ENDFRAME
133 ret
134 CFI_ENDPROC
135 END(__write_lock_failed)
136
137ENTRY(__read_lock_failed)
138 CFI_STARTPROC
139 FRAME
1402: LOCK_PREFIX
141 incl (%eax)
1421: rep; nop
143 cmpl $1,(%eax)
144 js 1b
145 LOCK_PREFIX
146 decl (%eax)
147 js 2b
148 ENDFRAME
149 ret
150 CFI_ENDPROC
151 END(__read_lock_failed)
152
153#endif
154
155#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
156
157/* Fix up special calling conventions */
158ENTRY(call_rwsem_down_read_failed)
159 CFI_STARTPROC
160 push %ecx
161 CFI_ADJUST_CFA_OFFSET 4
162 CFI_REL_OFFSET ecx,0
163 push %edx
164 CFI_ADJUST_CFA_OFFSET 4
165 CFI_REL_OFFSET edx,0
166 call rwsem_down_read_failed
167 pop %edx
168 CFI_ADJUST_CFA_OFFSET -4
169 pop %ecx
170 CFI_ADJUST_CFA_OFFSET -4
171 ret
172 CFI_ENDPROC
173 END(call_rwsem_down_read_failed)
174
175ENTRY(call_rwsem_down_write_failed)
176 CFI_STARTPROC
177 push %ecx
178 CFI_ADJUST_CFA_OFFSET 4
179 CFI_REL_OFFSET ecx,0
180 calll rwsem_down_write_failed
181 pop %ecx
182 CFI_ADJUST_CFA_OFFSET -4
183 ret
184 CFI_ENDPROC
185 END(call_rwsem_down_write_failed)
186
187ENTRY(call_rwsem_wake)
188 CFI_STARTPROC
189 decw %dx /* do nothing if still outstanding active readers */
190 jnz 1f
191 push %ecx
192 CFI_ADJUST_CFA_OFFSET 4
193 CFI_REL_OFFSET ecx,0
194 call rwsem_wake
195 pop %ecx
196 CFI_ADJUST_CFA_OFFSET -4
1971: ret
198 CFI_ENDPROC
199 END(call_rwsem_wake)
200
201/* Fix up special calling conventions */
202ENTRY(call_rwsem_downgrade_wake)
203 CFI_STARTPROC
204 push %ecx
205 CFI_ADJUST_CFA_OFFSET 4
206 CFI_REL_OFFSET ecx,0
207 push %edx
208 CFI_ADJUST_CFA_OFFSET 4
209 CFI_REL_OFFSET edx,0
210 call rwsem_downgrade_wake
211 pop %edx
212 CFI_ADJUST_CFA_OFFSET -4
213 pop %ecx
214 CFI_ADJUST_CFA_OFFSET -4
215 ret
216 CFI_ENDPROC
217 END(call_rwsem_downgrade_wake)
218
219#endif
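The __down_failed/__up_wakeup stubs exist because the semaphore fast path is emitted inline with the semaphore pointer in %eax and must not disturb %ecx/%edx; only on contention does it call into these stubs, which save those registers and hand off to the C routines (__down, __up, ...). A rough C view of that fast path, using a GCC builtin in place of the real inline "lock decl" just to show the control flow (names are hypothetical):

    struct example_semaphore {
            int count;                      /* > 0 free, <= 0 held or has waiters */
    };

    /* hypothetical stand-in for the contended path reached via __down_failed */
    extern void example_down_slow(struct example_semaphore *sem);

    static inline void example_down(struct example_semaphore *sem)
    {
            if (__sync_sub_and_fetch(&sem->count, 1) < 0)
                    example_down_slow(sem); /* asm stub saves %ecx/%edx, calls __down() */
    }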
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
new file mode 100644
index 000000000000..2c773fefa3dd
--- /dev/null
+++ b/arch/x86/lib/string_32.c
@@ -0,0 +1,257 @@
1/*
2 * Most of the string-functions are rather heavily hand-optimized,
3 * see especially strsep,strstr,str[c]spn. They should work, but are not
4 * very easy to understand. Everything is done entirely within the register
5 * set, making the functions fast and clean. String instructions have been
6 * used through-out, making for "slightly" unclear code :-)
7 *
8 * AK: On P4 and K7, using non-string-instruction implementations might be faster
9 * for large memory blocks. But most of them are unlikely to be used on large
10 * strings.
11 */
12
13#include <linux/string.h>
14#include <linux/module.h>
15
16#ifdef __HAVE_ARCH_STRCPY
17char *strcpy(char * dest,const char *src)
18{
19 int d0, d1, d2;
20 asm volatile( "1:\tlodsb\n\t"
21 "stosb\n\t"
22 "testb %%al,%%al\n\t"
23 "jne 1b"
24 : "=&S" (d0), "=&D" (d1), "=&a" (d2)
25 :"0" (src),"1" (dest) : "memory");
26 return dest;
27}
28EXPORT_SYMBOL(strcpy);
29#endif
30
31#ifdef __HAVE_ARCH_STRNCPY
32char *strncpy(char * dest,const char *src,size_t count)
33{
34 int d0, d1, d2, d3;
35 asm volatile( "1:\tdecl %2\n\t"
36 "js 2f\n\t"
37 "lodsb\n\t"
38 "stosb\n\t"
39 "testb %%al,%%al\n\t"
40 "jne 1b\n\t"
41 "rep\n\t"
42 "stosb\n"
43 "2:"
44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
45 :"0" (src),"1" (dest),"2" (count) : "memory");
46 return dest;
47}
48EXPORT_SYMBOL(strncpy);
49#endif
50
51#ifdef __HAVE_ARCH_STRCAT
52char *strcat(char * dest,const char * src)
53{
54 int d0, d1, d2, d3;
55 asm volatile( "repne\n\t"
56 "scasb\n\t"
57 "decl %1\n"
58 "1:\tlodsb\n\t"
59 "stosb\n\t"
60 "testb %%al,%%al\n\t"
61 "jne 1b"
62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory");
64 return dest;
65}
66EXPORT_SYMBOL(strcat);
67#endif
68
69#ifdef __HAVE_ARCH_STRNCAT
70char *strncat(char * dest,const char * src,size_t count)
71{
72 int d0, d1, d2, d3;
73 asm volatile( "repne\n\t"
74 "scasb\n\t"
75 "decl %1\n\t"
76 "movl %8,%3\n"
77 "1:\tdecl %3\n\t"
78 "js 2f\n\t"
79 "lodsb\n\t"
80 "stosb\n\t"
81 "testb %%al,%%al\n\t"
82 "jne 1b\n"
83 "2:\txorl %2,%2\n\t"
84 "stosb"
85 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
86 : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count)
87 : "memory");
88 return dest;
89}
90EXPORT_SYMBOL(strncat);
91#endif
92
93#ifdef __HAVE_ARCH_STRCMP
94int strcmp(const char * cs,const char * ct)
95{
96 int d0, d1;
97 int res;
98 asm volatile( "1:\tlodsb\n\t"
99 "scasb\n\t"
100 "jne 2f\n\t"
101 "testb %%al,%%al\n\t"
102 "jne 1b\n\t"
103 "xorl %%eax,%%eax\n\t"
104 "jmp 3f\n"
105 "2:\tsbbl %%eax,%%eax\n\t"
106 "orb $1,%%al\n"
107 "3:"
108 :"=a" (res), "=&S" (d0), "=&D" (d1)
109 :"1" (cs),"2" (ct)
110 :"memory");
111 return res;
112}
113EXPORT_SYMBOL(strcmp);
114#endif
115
116#ifdef __HAVE_ARCH_STRNCMP
117int strncmp(const char * cs,const char * ct,size_t count)
118{
119 int res;
120 int d0, d1, d2;
121 asm volatile( "1:\tdecl %3\n\t"
122 "js 2f\n\t"
123 "lodsb\n\t"
124 "scasb\n\t"
125 "jne 3f\n\t"
126 "testb %%al,%%al\n\t"
127 "jne 1b\n"
128 "2:\txorl %%eax,%%eax\n\t"
129 "jmp 4f\n"
130 "3:\tsbbl %%eax,%%eax\n\t"
131 "orb $1,%%al\n"
132 "4:"
133 :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
134 :"1" (cs),"2" (ct),"3" (count)
135 :"memory");
136 return res;
137}
138EXPORT_SYMBOL(strncmp);
139#endif
140
141#ifdef __HAVE_ARCH_STRCHR
142char *strchr(const char * s, int c)
143{
144 int d0;
145 char * res;
146 asm volatile( "movb %%al,%%ah\n"
147 "1:\tlodsb\n\t"
148 "cmpb %%ah,%%al\n\t"
149 "je 2f\n\t"
150 "testb %%al,%%al\n\t"
151 "jne 1b\n\t"
152 "movl $1,%1\n"
153 "2:\tmovl %1,%0\n\t"
154 "decl %0"
155 :"=a" (res), "=&S" (d0)
156 :"1" (s),"0" (c)
157 :"memory");
158 return res;
159}
160EXPORT_SYMBOL(strchr);
161#endif
162
163#ifdef __HAVE_ARCH_STRRCHR
164char *strrchr(const char * s, int c)
165{
166 int d0, d1;
167 char * res;
168 asm volatile( "movb %%al,%%ah\n"
169 "1:\tlodsb\n\t"
170 "cmpb %%ah,%%al\n\t"
171 "jne 2f\n\t"
172 "leal -1(%%esi),%0\n"
173 "2:\ttestb %%al,%%al\n\t"
174 "jne 1b"
175 :"=g" (res), "=&S" (d0), "=&a" (d1)
176 :"0" (0),"1" (s),"2" (c)
177 :"memory");
178 return res;
179}
180EXPORT_SYMBOL(strrchr);
181#endif
182
183#ifdef __HAVE_ARCH_STRLEN
184size_t strlen(const char * s)
185{
186 int d0;
187 int res;
188 asm volatile( "repne\n\t"
189 "scasb\n\t"
190 "notl %0\n\t"
191 "decl %0"
192 :"=c" (res), "=&D" (d0)
193 :"1" (s),"a" (0), "0" (0xffffffffu)
194 :"memory");
195 return res;
196}
197EXPORT_SYMBOL(strlen);
198#endif
199
200#ifdef __HAVE_ARCH_MEMCHR
201void *memchr(const void *cs,int c,size_t count)
202{
203 int d0;
204 void *res;
205 if (!count)
206 return NULL;
207 asm volatile( "repne\n\t"
208 "scasb\n\t"
209 "je 1f\n\t"
210 "movl $1,%0\n"
211 "1:\tdecl %0"
212 :"=D" (res), "=&c" (d0)
213 :"a" (c),"0" (cs),"1" (count)
214 :"memory");
215 return res;
216}
217EXPORT_SYMBOL(memchr);
218#endif
219
220#ifdef __HAVE_ARCH_MEMSCAN
221void *memscan(void * addr, int c, size_t size)
222{
223 if (!size)
224 return addr;
225 asm volatile("repnz; scasb\n\t"
226 "jnz 1f\n\t"
227 "dec %%edi\n"
228 "1:"
229 : "=D" (addr), "=c" (size)
230 : "0" (addr), "1" (size), "a" (c)
231 : "memory");
232 return addr;
233}
234EXPORT_SYMBOL(memscan);
235#endif
236
237#ifdef __HAVE_ARCH_STRNLEN
238size_t strnlen(const char *s, size_t count)
239{
240 int d0;
241 int res;
242 asm volatile( "movl %2,%0\n\t"
243 "jmp 2f\n"
244 "1:\tcmpb $0,(%0)\n\t"
245 "je 3f\n\t"
246 "incl %0\n"
247 "2:\tdecl %1\n\t"
248 "cmpl $-1,%1\n\t"
249 "jne 1b\n"
250 "3:\tsubl %2,%0"
251 :"=a" (res), "=&d" (d0)
252 :"c" (s),"1" (count)
253 :"memory");
254 return res;
255}
256EXPORT_SYMBOL(strnlen);
257#endif
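Every function in this file is wrapped in an __HAVE_ARCH_* guard: the architecture's <asm/string.h> defines the macro and declares the prototype, which in turn tells <linux/string.h> not to provide its generic fallback, so the hand-written version above is the one that gets built and exported. A simplified sketch of the header side of that convention (not the literal asm-i386 header):

    /* somewhere in the architecture's asm/string.h (simplified sketch) */
    #define __HAVE_ARCH_STRCPY      /* arch provides its own strcpy */
    extern char *strcpy(char *dest, const char *src);

    /* linux/string.h then does roughly: */
    #ifndef __HAVE_ARCH_STRCPY
    extern char *strcpy(char *, const char *);     /* generic fallback used instead */
    #endif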
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
new file mode 100644
index 000000000000..a3dafbf59dae
--- /dev/null
+++ b/arch/x86/lib/strstr_32.c
@@ -0,0 +1,31 @@
1#include <linux/string.h>
2
3char * strstr(const char * cs,const char * ct)
4{
5	int d0, d1;
6	register char *__res;
7	__asm__ __volatile__(
8 "movl %6,%%edi\n\t"
9 "repne\n\t"
10 "scasb\n\t"
11 "notl %%ecx\n\t"
12 "decl %%ecx\n\t" /* NOTE! This also sets Z if searchstring='' */
13 "movl %%ecx,%%edx\n"
14 "1:\tmovl %6,%%edi\n\t"
15 "movl %%esi,%%eax\n\t"
16 "movl %%edx,%%ecx\n\t"
17 "repe\n\t"
18 "cmpsb\n\t"
19 "je 2f\n\t" /* also works for empty string, see above */
20 "xchgl %%eax,%%esi\n\t"
21 "incl %%esi\n\t"
22 "cmpb $0,-1(%%eax)\n\t"
23 "jne 1b\n\t"
24 "xorl %%eax,%%eax\n\t"
25 "2:"
26 :"=a" (__res), "=&c" (d0), "=&S" (d1)
27 :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
28 :"dx", "di");
29	return __res;
30}
31
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
new file mode 100644
index 000000000000..55e586d352d3
--- /dev/null
+++ b/arch/x86/lib/thunk_64.S
@@ -0,0 +1,67 @@
1/*
2 * Save registers before calling assembly functions. This avoids
3 * disturbance of register allocation in some inline assembly constructs.
4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
5 * Subject to the GNU public license, v.2. No warranty of any kind.
6 */
7
8 #include <linux/linkage.h>
9 #include <asm/dwarf2.h>
10 #include <asm/calling.h>
11 #include <asm/rwlock.h>
12
13 /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
14 .macro thunk name,func
15 .globl \name
16\name:
17 CFI_STARTPROC
18 SAVE_ARGS
19 call \func
20 jmp restore
21 CFI_ENDPROC
22 .endm
23
24 /* rdi: arg1 ... normal C conventions. rax is passed from C. */
25 .macro thunk_retrax name,func
26 .globl \name
27\name:
28 CFI_STARTPROC
29 SAVE_ARGS
30 call \func
31 jmp restore_norax
32 CFI_ENDPROC
33 .endm
34
35
36 .section .sched.text
37#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
38 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
39 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
40 thunk rwsem_wake_thunk,rwsem_wake
41 thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
42#endif
43
44 thunk __down_failed,__down
45 thunk_retrax __down_failed_interruptible,__down_interruptible
46 thunk_retrax __down_failed_trylock,__down_trylock
47 thunk __up_wakeup,__up
48
49#ifdef CONFIG_TRACE_IRQFLAGS
50 thunk trace_hardirqs_on_thunk,trace_hardirqs_on
51 thunk trace_hardirqs_off_thunk,trace_hardirqs_off
52#endif
53
54 /* SAVE_ARGS below is used only for the .cfi directives it contains. */
55 CFI_STARTPROC
56 SAVE_ARGS
57restore:
58 RESTORE_ARGS
59 ret
60 CFI_ENDPROC
61
62 CFI_STARTPROC
63 SAVE_ARGS
64restore_norax:
65 RESTORE_ARGS 1
66 ret
67 CFI_ENDPROC
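The thunks matter at call sites that must not perturb register allocation, typically entry code or inline assembly: because SAVE_ARGS/RESTORE_ARGS preserve the C-clobbered argument registers around the call, the caller does not have to list them as clobbers. A hedged, illustrative caller (the real users are the x86-64 entry and irq-flags paths, not ordinary C code):

    /* Illustrative only: calling the thunk from inline asm without declaring
     * %rdi/%rsi/%rdx/%rcx/%rax/%r8-%r11 clobbered, since the thunk saves them. */
    static inline void example_trace_irqs_on(void)
    {
            asm volatile("call trace_hardirqs_on_thunk" ::: "memory");
    }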
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
new file mode 100644
index 000000000000..9f38b12b4af1
--- /dev/null
+++ b/arch/x86/lib/usercopy_32.c
@@ -0,0 +1,882 @@
1/*
2 * User address space access functions.
3 * The non-inlined parts of asm-i386/uaccess.h are here.
4 *
5 * Copyright 1997 Andi Kleen <ak@muc.de>
6 * Copyright 1997 Linus Torvalds
7 */
8#include <linux/mm.h>
9#include <linux/highmem.h>
10#include <linux/blkdev.h>
11#include <linux/module.h>
12#include <linux/backing-dev.h>
13#include <linux/interrupt.h>
14#include <asm/uaccess.h>
15#include <asm/mmx.h>
16
17static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
18{
19#ifdef CONFIG_X86_INTEL_USERCOPY
20 if (n >= 64 && ((a1 ^ a2) & movsl_mask.mask))
21 return 0;
22#endif
23 return 1;
24}
25#define movsl_is_ok(a1,a2,n) \
26 __movsl_is_ok((unsigned long)(a1),(unsigned long)(a2),(n))
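In other words: for copies of 64 bytes or more on CPUs where movsl_mask.mask is set, the rep;movsl fast path is skipped whenever source and destination disagree in the masked low address bits. A small worked example of the test (assuming mask == 7, the value the CPU setup code is understood to use on the affected Intel parts):

    /*   a1 = 0x1003 (dst), a2 = 0x2000 (src), n = 128
     *   (a1 ^ a2)     = 0x3003
     *   (a1 ^ a2) & 7 = 3          -> non-zero, and n >= 64
     *   => movsl_is_ok() returns 0, so the callers below use the
     *      unrolled __copy_user_intel()/__copy_user_zeroing_intel() paths. */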
27
28/*
29 * Copy a null terminated string from userspace.
30 */
31
32#define __do_strncpy_from_user(dst,src,count,res) \
33do { \
34 int __d0, __d1, __d2; \
35 might_sleep(); \
36 __asm__ __volatile__( \
37 " testl %1,%1\n" \
38 " jz 2f\n" \
39 "0: lodsb\n" \
40 " stosb\n" \
41 " testb %%al,%%al\n" \
42 " jz 1f\n" \
43 " decl %1\n" \
44 " jnz 0b\n" \
45 "1: subl %1,%0\n" \
46 "2:\n" \
47 ".section .fixup,\"ax\"\n" \
48 "3: movl %5,%0\n" \
49 " jmp 2b\n" \
50 ".previous\n" \
51 ".section __ex_table,\"a\"\n" \
52 " .align 4\n" \
53 " .long 0b,3b\n" \
54 ".previous" \
55 : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
56 "=&D" (__d2) \
57 : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
58 : "memory"); \
59} while (0)
60
61/**
62 * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking.
63 * @dst: Destination address, in kernel space. This buffer must be at
64 * least @count bytes long.
65 * @src: Source address, in user space.
66 * @count: Maximum number of bytes to copy, including the trailing NUL.
67 *
68 * Copies a NUL-terminated string from userspace to kernel space.
69 * Caller must check the specified block with access_ok() before calling
70 * this function.
71 *
72 * On success, returns the length of the string (not including the trailing
73 * NUL).
74 *
75 * If access to userspace fails, returns -EFAULT (some data may have been
76 * copied).
77 *
78 * If @count is smaller than the length of the string, copies @count bytes
79 * and returns @count.
80 */
81long
82__strncpy_from_user(char *dst, const char __user *src, long count)
83{
84 long res;
85 __do_strncpy_from_user(dst, src, count, res);
86 return res;
87}
88EXPORT_SYMBOL(__strncpy_from_user);
89
90/**
91 * strncpy_from_user: - Copy a NUL terminated string from userspace.
92 * @dst: Destination address, in kernel space. This buffer must be at
93 * least @count bytes long.
94 * @src: Source address, in user space.
95 * @count: Maximum number of bytes to copy, including the trailing NUL.
96 *
97 * Copies a NUL-terminated string from userspace to kernel space.
98 *
99 * On success, returns the length of the string (not including the trailing
100 * NUL).
101 *
102 * If access to userspace fails, returns -EFAULT (some data may have been
103 * copied).
104 *
105 * If @count is smaller than the length of the string, copies @count bytes
106 * and returns @count.
107 */
108long
109strncpy_from_user(char *dst, const char __user *src, long count)
110{
111 long res = -EFAULT;
112 if (access_ok(VERIFY_READ, src, 1))
113 __do_strncpy_from_user(dst, src, count, res);
114 return res;
115}
116EXPORT_SYMBOL(strncpy_from_user);
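A typical caller copies a user-supplied name into a fixed-size kernel buffer and has to distinguish the three documented outcomes: a fault, a fitting string, or truncation at @count. A hedged usage sketch with hypothetical names and a 32-byte buffer:

    static int example_copy_name(char *kbuf /* >= 32 bytes */,
                                 const char __user *uname)
    {
            long len = strncpy_from_user(kbuf, uname, 32);

            if (len < 0)
                    return len;             /* -EFAULT: bad user pointer        */
            if (len == 32)
                    return -ENAMETOOLONG;   /* hit @count: string was truncated */
            return 0;                       /* len == strlen(kbuf), buffer is NUL-terminated */
    }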
117
118/*
119 * Zero Userspace
120 */
121
122#define __do_clear_user(addr,size) \
123do { \
124 int __d0; \
125 might_sleep(); \
126 __asm__ __volatile__( \
127 "0: rep; stosl\n" \
128 " movl %2,%0\n" \
129 "1: rep; stosb\n" \
130 "2:\n" \
131 ".section .fixup,\"ax\"\n" \
132 "3: lea 0(%2,%0,4),%0\n" \
133 " jmp 2b\n" \
134 ".previous\n" \
135 ".section __ex_table,\"a\"\n" \
136 " .align 4\n" \
137 " .long 0b,3b\n" \
138 " .long 1b,2b\n" \
139 ".previous" \
140 : "=&c"(size), "=&D" (__d0) \
141 : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \
142} while (0)
143
144/**
145 * clear_user: - Zero a block of memory in user space.
146 * @to: Destination address, in user space.
147 * @n: Number of bytes to zero.
148 *
149 * Zero a block of memory in user space.
150 *
151 * Returns number of bytes that could not be cleared.
152 * On success, this will be zero.
153 */
154unsigned long
155clear_user(void __user *to, unsigned long n)
156{
157 might_sleep();
158 if (access_ok(VERIFY_WRITE, to, n))
159 __do_clear_user(to, n);
160 return n;
161}
162EXPORT_SYMBOL(clear_user);
163
164/**
165 * __clear_user: - Zero a block of memory in user space, with less checking.
166 * @to: Destination address, in user space.
167 * @n: Number of bytes to zero.
168 *
169 * Zero a block of memory in user space. Caller must check
170 * the specified block with access_ok() before calling this function.
171 *
172 * Returns number of bytes that could not be cleared.
173 * On success, this will be zero.
174 */
175unsigned long
176__clear_user(void __user *to, unsigned long n)
177{
178 __do_clear_user(to, n);
179 return n;
180}
181EXPORT_SYMBOL(__clear_user);
182
183/**
184 * strnlen_user: - Get the size of a string in user space.
185 * @s: The string to measure.
186 * @n: The maximum valid length
187 *
188 * Get the size of a NUL-terminated string in user space.
189 *
190 * Returns the size of the string INCLUDING the terminating NUL.
191 * On exception, returns 0.
192 * If the string is too long, returns a value greater than @n.
193 */
194long strnlen_user(const char __user *s, long n)
195{
196 unsigned long mask = -__addr_ok(s);
197 unsigned long res, tmp;
198
199 might_sleep();
200
201 __asm__ __volatile__(
202 " testl %0, %0\n"
203 " jz 3f\n"
204 " andl %0,%%ecx\n"
205 "0: repne; scasb\n"
206 " setne %%al\n"
207 " subl %%ecx,%0\n"
208 " addl %0,%%eax\n"
209 "1:\n"
210 ".section .fixup,\"ax\"\n"
211 "2: xorl %%eax,%%eax\n"
212 " jmp 1b\n"
213 "3: movb $1,%%al\n"
214 " jmp 1b\n"
215 ".previous\n"
216 ".section __ex_table,\"a\"\n"
217 " .align 4\n"
218 " .long 0b,2b\n"
219 ".previous"
220 :"=r" (n), "=D" (s), "=a" (res), "=c" (tmp)
221 :"0" (n), "1" (s), "2" (0), "3" (mask)
222 :"cc");
223 return res & mask;
224}
225EXPORT_SYMBOL(strnlen_user);
226
227#ifdef CONFIG_X86_INTEL_USERCOPY
228static unsigned long
229__copy_user_intel(void __user *to, const void *from, unsigned long size)
230{
231 int d0, d1;
232 __asm__ __volatile__(
233 " .align 2,0x90\n"
234 "1: movl 32(%4), %%eax\n"
235 " cmpl $67, %0\n"
236 " jbe 3f\n"
237 "2: movl 64(%4), %%eax\n"
238 " .align 2,0x90\n"
239 "3: movl 0(%4), %%eax\n"
240 "4: movl 4(%4), %%edx\n"
241 "5: movl %%eax, 0(%3)\n"
242 "6: movl %%edx, 4(%3)\n"
243 "7: movl 8(%4), %%eax\n"
244 "8: movl 12(%4),%%edx\n"
245 "9: movl %%eax, 8(%3)\n"
246 "10: movl %%edx, 12(%3)\n"
247 "11: movl 16(%4), %%eax\n"
248 "12: movl 20(%4), %%edx\n"
249 "13: movl %%eax, 16(%3)\n"
250 "14: movl %%edx, 20(%3)\n"
251 "15: movl 24(%4), %%eax\n"
252 "16: movl 28(%4), %%edx\n"
253 "17: movl %%eax, 24(%3)\n"
254 "18: movl %%edx, 28(%3)\n"
255 "19: movl 32(%4), %%eax\n"
256 "20: movl 36(%4), %%edx\n"
257 "21: movl %%eax, 32(%3)\n"
258 "22: movl %%edx, 36(%3)\n"
259 "23: movl 40(%4), %%eax\n"
260 "24: movl 44(%4), %%edx\n"
261 "25: movl %%eax, 40(%3)\n"
262 "26: movl %%edx, 44(%3)\n"
263 "27: movl 48(%4), %%eax\n"
264 "28: movl 52(%4), %%edx\n"
265 "29: movl %%eax, 48(%3)\n"
266 "30: movl %%edx, 52(%3)\n"
267 "31: movl 56(%4), %%eax\n"
268 "32: movl 60(%4), %%edx\n"
269 "33: movl %%eax, 56(%3)\n"
270 "34: movl %%edx, 60(%3)\n"
271 " addl $-64, %0\n"
272 " addl $64, %4\n"
273 " addl $64, %3\n"
274 " cmpl $63, %0\n"
275 " ja 1b\n"
276 "35: movl %0, %%eax\n"
277 " shrl $2, %0\n"
278 " andl $3, %%eax\n"
279 " cld\n"
280 "99: rep; movsl\n"
281 "36: movl %%eax, %0\n"
282 "37: rep; movsb\n"
283 "100:\n"
284 ".section .fixup,\"ax\"\n"
285 "101: lea 0(%%eax,%0,4),%0\n"
286 " jmp 100b\n"
287 ".previous\n"
288 ".section __ex_table,\"a\"\n"
289 " .align 4\n"
290 " .long 1b,100b\n"
291 " .long 2b,100b\n"
292 " .long 3b,100b\n"
293 " .long 4b,100b\n"
294 " .long 5b,100b\n"
295 " .long 6b,100b\n"
296 " .long 7b,100b\n"
297 " .long 8b,100b\n"
298 " .long 9b,100b\n"
299 " .long 10b,100b\n"
300 " .long 11b,100b\n"
301 " .long 12b,100b\n"
302 " .long 13b,100b\n"
303 " .long 14b,100b\n"
304 " .long 15b,100b\n"
305 " .long 16b,100b\n"
306 " .long 17b,100b\n"
307 " .long 18b,100b\n"
308 " .long 19b,100b\n"
309 " .long 20b,100b\n"
310 " .long 21b,100b\n"
311 " .long 22b,100b\n"
312 " .long 23b,100b\n"
313 " .long 24b,100b\n"
314 " .long 25b,100b\n"
315 " .long 26b,100b\n"
316 " .long 27b,100b\n"
317 " .long 28b,100b\n"
318 " .long 29b,100b\n"
319 " .long 30b,100b\n"
320 " .long 31b,100b\n"
321 " .long 32b,100b\n"
322 " .long 33b,100b\n"
323 " .long 34b,100b\n"
324 " .long 35b,100b\n"
325 " .long 36b,100b\n"
326 " .long 37b,100b\n"
327 " .long 99b,101b\n"
328 ".previous"
329 : "=&c"(size), "=&D" (d0), "=&S" (d1)
330 : "1"(to), "2"(from), "0"(size)
331 : "eax", "edx", "memory");
332 return size;
333}
334
335static unsigned long
336__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
337{
338 int d0, d1;
339 __asm__ __volatile__(
340 " .align 2,0x90\n"
341 "0: movl 32(%4), %%eax\n"
342 " cmpl $67, %0\n"
343 " jbe 2f\n"
344 "1: movl 64(%4), %%eax\n"
345 " .align 2,0x90\n"
346 "2: movl 0(%4), %%eax\n"
347 "21: movl 4(%4), %%edx\n"
348 " movl %%eax, 0(%3)\n"
349 " movl %%edx, 4(%3)\n"
350 "3: movl 8(%4), %%eax\n"
351 "31: movl 12(%4),%%edx\n"
352 " movl %%eax, 8(%3)\n"
353 " movl %%edx, 12(%3)\n"
354 "4: movl 16(%4), %%eax\n"
355 "41: movl 20(%4), %%edx\n"
356 " movl %%eax, 16(%3)\n"
357 " movl %%edx, 20(%3)\n"
358 "10: movl 24(%4), %%eax\n"
359 "51: movl 28(%4), %%edx\n"
360 " movl %%eax, 24(%3)\n"
361 " movl %%edx, 28(%3)\n"
362 "11: movl 32(%4), %%eax\n"
363 "61: movl 36(%4), %%edx\n"
364 " movl %%eax, 32(%3)\n"
365 " movl %%edx, 36(%3)\n"
366 "12: movl 40(%4), %%eax\n"
367 "71: movl 44(%4), %%edx\n"
368 " movl %%eax, 40(%3)\n"
369 " movl %%edx, 44(%3)\n"
370 "13: movl 48(%4), %%eax\n"
371 "81: movl 52(%4), %%edx\n"
372 " movl %%eax, 48(%3)\n"
373 " movl %%edx, 52(%3)\n"
374 "14: movl 56(%4), %%eax\n"
375 "91: movl 60(%4), %%edx\n"
376 " movl %%eax, 56(%3)\n"
377 " movl %%edx, 60(%3)\n"
378 " addl $-64, %0\n"
379 " addl $64, %4\n"
380 " addl $64, %3\n"
381 " cmpl $63, %0\n"
382 " ja 0b\n"
383 "5: movl %0, %%eax\n"
384 " shrl $2, %0\n"
385 " andl $3, %%eax\n"
386 " cld\n"
387 "6: rep; movsl\n"
388 " movl %%eax,%0\n"
389 "7: rep; movsb\n"
390 "8:\n"
391 ".section .fixup,\"ax\"\n"
392 "9: lea 0(%%eax,%0,4),%0\n"
393 "16: pushl %0\n"
394 " pushl %%eax\n"
395 " xorl %%eax,%%eax\n"
396 " rep; stosb\n"
397 " popl %%eax\n"
398 " popl %0\n"
399 " jmp 8b\n"
400 ".previous\n"
401 ".section __ex_table,\"a\"\n"
402 " .align 4\n"
403 " .long 0b,16b\n"
404 " .long 1b,16b\n"
405 " .long 2b,16b\n"
406 " .long 21b,16b\n"
407 " .long 3b,16b\n"
408 " .long 31b,16b\n"
409 " .long 4b,16b\n"
410 " .long 41b,16b\n"
411 " .long 10b,16b\n"
412 " .long 51b,16b\n"
413 " .long 11b,16b\n"
414 " .long 61b,16b\n"
415 " .long 12b,16b\n"
416 " .long 71b,16b\n"
417 " .long 13b,16b\n"
418 " .long 81b,16b\n"
419 " .long 14b,16b\n"
420 " .long 91b,16b\n"
421 " .long 6b,9b\n"
422 " .long 7b,16b\n"
423 ".previous"
424 : "=&c"(size), "=&D" (d0), "=&S" (d1)
425 : "1"(to), "2"(from), "0"(size)
426 : "eax", "edx", "memory");
427 return size;
428}
429
430/*
431 * Non Temporal Hint version of __copy_user_zeroing_intel. It is cache aware.
432 * hyoshiok@miraclelinux.com
433 */
434
435static unsigned long __copy_user_zeroing_intel_nocache(void *to,
436 const void __user *from, unsigned long size)
437{
438 int d0, d1;
439
440 __asm__ __volatile__(
441 " .align 2,0x90\n"
442 "0: movl 32(%4), %%eax\n"
443 " cmpl $67, %0\n"
444 " jbe 2f\n"
445 "1: movl 64(%4), %%eax\n"
446 " .align 2,0x90\n"
447 "2: movl 0(%4), %%eax\n"
448 "21: movl 4(%4), %%edx\n"
449 " movnti %%eax, 0(%3)\n"
450 " movnti %%edx, 4(%3)\n"
451 "3: movl 8(%4), %%eax\n"
452 "31: movl 12(%4),%%edx\n"
453 " movnti %%eax, 8(%3)\n"
454 " movnti %%edx, 12(%3)\n"
455 "4: movl 16(%4), %%eax\n"
456 "41: movl 20(%4), %%edx\n"
457 " movnti %%eax, 16(%3)\n"
458 " movnti %%edx, 20(%3)\n"
459 "10: movl 24(%4), %%eax\n"
460 "51: movl 28(%4), %%edx\n"
461 " movnti %%eax, 24(%3)\n"
462 " movnti %%edx, 28(%3)\n"
463 "11: movl 32(%4), %%eax\n"
464 "61: movl 36(%4), %%edx\n"
465 " movnti %%eax, 32(%3)\n"
466 " movnti %%edx, 36(%3)\n"
467 "12: movl 40(%4), %%eax\n"
468 "71: movl 44(%4), %%edx\n"
469 " movnti %%eax, 40(%3)\n"
470 " movnti %%edx, 44(%3)\n"
471 "13: movl 48(%4), %%eax\n"
472 "81: movl 52(%4), %%edx\n"
473 " movnti %%eax, 48(%3)\n"
474 " movnti %%edx, 52(%3)\n"
475 "14: movl 56(%4), %%eax\n"
476 "91: movl 60(%4), %%edx\n"
477 " movnti %%eax, 56(%3)\n"
478 " movnti %%edx, 60(%3)\n"
479 " addl $-64, %0\n"
480 " addl $64, %4\n"
481 " addl $64, %3\n"
482 " cmpl $63, %0\n"
483 " ja 0b\n"
484 " sfence \n"
485 "5: movl %0, %%eax\n"
486 " shrl $2, %0\n"
487 " andl $3, %%eax\n"
488 " cld\n"
489 "6: rep; movsl\n"
490 " movl %%eax,%0\n"
491 "7: rep; movsb\n"
492 "8:\n"
493 ".section .fixup,\"ax\"\n"
494 "9: lea 0(%%eax,%0,4),%0\n"
495 "16: pushl %0\n"
496 " pushl %%eax\n"
497 " xorl %%eax,%%eax\n"
498 " rep; stosb\n"
499 " popl %%eax\n"
500 " popl %0\n"
501 " jmp 8b\n"
502 ".previous\n"
503 ".section __ex_table,\"a\"\n"
504 " .align 4\n"
505 " .long 0b,16b\n"
506 " .long 1b,16b\n"
507 " .long 2b,16b\n"
508 " .long 21b,16b\n"
509 " .long 3b,16b\n"
510 " .long 31b,16b\n"
511 " .long 4b,16b\n"
512 " .long 41b,16b\n"
513 " .long 10b,16b\n"
514 " .long 51b,16b\n"
515 " .long 11b,16b\n"
516 " .long 61b,16b\n"
517 " .long 12b,16b\n"
518 " .long 71b,16b\n"
519 " .long 13b,16b\n"
520 " .long 81b,16b\n"
521 " .long 14b,16b\n"
522 " .long 91b,16b\n"
523 " .long 6b,9b\n"
524 " .long 7b,16b\n"
525 ".previous"
526 : "=&c"(size), "=&D" (d0), "=&S" (d1)
527 : "1"(to), "2"(from), "0"(size)
528 : "eax", "edx", "memory");
529 return size;
530}
531
532static unsigned long __copy_user_intel_nocache(void *to,
533 const void __user *from, unsigned long size)
534{
535 int d0, d1;
536
537 __asm__ __volatile__(
538 " .align 2,0x90\n"
539 "0: movl 32(%4), %%eax\n"
540 " cmpl $67, %0\n"
541 " jbe 2f\n"
542 "1: movl 64(%4), %%eax\n"
543 " .align 2,0x90\n"
544 "2: movl 0(%4), %%eax\n"
545 "21: movl 4(%4), %%edx\n"
546 " movnti %%eax, 0(%3)\n"
547 " movnti %%edx, 4(%3)\n"
548 "3: movl 8(%4), %%eax\n"
549 "31: movl 12(%4),%%edx\n"
550 " movnti %%eax, 8(%3)\n"
551 " movnti %%edx, 12(%3)\n"
552 "4: movl 16(%4), %%eax\n"
553 "41: movl 20(%4), %%edx\n"
554 " movnti %%eax, 16(%3)\n"
555 " movnti %%edx, 20(%3)\n"
556 "10: movl 24(%4), %%eax\n"
557 "51: movl 28(%4), %%edx\n"
558 " movnti %%eax, 24(%3)\n"
559 " movnti %%edx, 28(%3)\n"
560 "11: movl 32(%4), %%eax\n"
561 "61: movl 36(%4), %%edx\n"
562 " movnti %%eax, 32(%3)\n"
563 " movnti %%edx, 36(%3)\n"
564 "12: movl 40(%4), %%eax\n"
565 "71: movl 44(%4), %%edx\n"
566 " movnti %%eax, 40(%3)\n"
567 " movnti %%edx, 44(%3)\n"
568 "13: movl 48(%4), %%eax\n"
569 "81: movl 52(%4), %%edx\n"
570 " movnti %%eax, 48(%3)\n"
571 " movnti %%edx, 52(%3)\n"
572 "14: movl 56(%4), %%eax\n"
573 "91: movl 60(%4), %%edx\n"
574 " movnti %%eax, 56(%3)\n"
575 " movnti %%edx, 60(%3)\n"
576 " addl $-64, %0\n"
577 " addl $64, %4\n"
578 " addl $64, %3\n"
579 " cmpl $63, %0\n"
580 " ja 0b\n"
581 " sfence \n"
582 "5: movl %0, %%eax\n"
583 " shrl $2, %0\n"
584 " andl $3, %%eax\n"
585 " cld\n"
586 "6: rep; movsl\n"
587 " movl %%eax,%0\n"
588 "7: rep; movsb\n"
589 "8:\n"
590 ".section .fixup,\"ax\"\n"
591 "9: lea 0(%%eax,%0,4),%0\n"
592 "16: jmp 8b\n"
593 ".previous\n"
594 ".section __ex_table,\"a\"\n"
595 " .align 4\n"
596 " .long 0b,16b\n"
597 " .long 1b,16b\n"
598 " .long 2b,16b\n"
599 " .long 21b,16b\n"
600 " .long 3b,16b\n"
601 " .long 31b,16b\n"
602 " .long 4b,16b\n"
603 " .long 41b,16b\n"
604 " .long 10b,16b\n"
605 " .long 51b,16b\n"
606 " .long 11b,16b\n"
607 " .long 61b,16b\n"
608 " .long 12b,16b\n"
609 " .long 71b,16b\n"
610 " .long 13b,16b\n"
611 " .long 81b,16b\n"
612 " .long 14b,16b\n"
613 " .long 91b,16b\n"
614 " .long 6b,9b\n"
615 " .long 7b,16b\n"
616 ".previous"
617 : "=&c"(size), "=&D" (d0), "=&S" (d1)
618 : "1"(to), "2"(from), "0"(size)
619 : "eax", "edx", "memory");
620 return size;
621}
622
623#else
624
625/*
626 * Leave these declared but undefined. There should not be any references to
627 * them.
628 */
629unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
630 unsigned long size);
631unsigned long __copy_user_intel(void __user *to, const void *from,
632 unsigned long size);
633unsigned long __copy_user_zeroing_intel_nocache(void *to,
634 const void __user *from, unsigned long size);
635#endif /* CONFIG_X86_INTEL_USERCOPY */
636
637/* Generic arbitrary sized copy. */
638#define __copy_user(to,from,size) \
639do { \
640 int __d0, __d1, __d2; \
641 __asm__ __volatile__( \
642 " cmp $7,%0\n" \
643 " jbe 1f\n" \
644 " movl %1,%0\n" \
645 " negl %0\n" \
646 " andl $7,%0\n" \
647 " subl %0,%3\n" \
648 "4: rep; movsb\n" \
649 " movl %3,%0\n" \
650 " shrl $2,%0\n" \
651 " andl $3,%3\n" \
652 " .align 2,0x90\n" \
653 "0: rep; movsl\n" \
654 " movl %3,%0\n" \
655 "1: rep; movsb\n" \
656 "2:\n" \
657 ".section .fixup,\"ax\"\n" \
658 "5: addl %3,%0\n" \
659 " jmp 2b\n" \
660 "3: lea 0(%3,%0,4),%0\n" \
661 " jmp 2b\n" \
662 ".previous\n" \
663 ".section __ex_table,\"a\"\n" \
664 " .align 4\n" \
665 " .long 4b,5b\n" \
666 " .long 0b,3b\n" \
667 " .long 1b,2b\n" \
668 ".previous" \
669 : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
670 : "3"(size), "0"(size), "1"(to), "2"(from) \
671 : "memory"); \
672} while (0)
673
674#define __copy_user_zeroing(to,from,size) \
675do { \
676 int __d0, __d1, __d2; \
677 __asm__ __volatile__( \
678 " cmp $7,%0\n" \
679 " jbe 1f\n" \
680 " movl %1,%0\n" \
681 " negl %0\n" \
682 " andl $7,%0\n" \
683 " subl %0,%3\n" \
684 "4: rep; movsb\n" \
685 " movl %3,%0\n" \
686 " shrl $2,%0\n" \
687 " andl $3,%3\n" \
688 " .align 2,0x90\n" \
689 "0: rep; movsl\n" \
690 " movl %3,%0\n" \
691 "1: rep; movsb\n" \
692 "2:\n" \
693 ".section .fixup,\"ax\"\n" \
694 "5: addl %3,%0\n" \
695 " jmp 6f\n" \
696 "3: lea 0(%3,%0,4),%0\n" \
697 "6: pushl %0\n" \
698 " pushl %%eax\n" \
699 " xorl %%eax,%%eax\n" \
700 " rep; stosb\n" \
701 " popl %%eax\n" \
702 " popl %0\n" \
703 " jmp 2b\n" \
704 ".previous\n" \
705 ".section __ex_table,\"a\"\n" \
706 " .align 4\n" \
707 " .long 4b,5b\n" \
708 " .long 0b,3b\n" \
709 " .long 1b,6b\n" \
710 ".previous" \
711 : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
712 : "3"(size), "0"(size), "1"(to), "2"(from) \
713 : "memory"); \
714} while (0)
715
716unsigned long __copy_to_user_ll(void __user *to, const void *from,
717 unsigned long n)
718{
719#ifndef CONFIG_X86_WP_WORKS_OK
720 if (unlikely(boot_cpu_data.wp_works_ok == 0) &&
721 ((unsigned long )to) < TASK_SIZE) {
722 /*
723 * When we are in an atomic section (see
724 * mm/filemap.c:file_read_actor), return the full
725 * length to take the slow path.
726 */
727 if (in_atomic())
728 return n;
729
730 /*
731 * CPU does not honor the WP bit when writing
732 * from supervisory mode, and due to preemption or SMP,
733 * the page tables can change at any time.
734 * Do it manually. Manfred <manfred@colorfullife.com>
735 */
736 while (n) {
737 unsigned long offset = ((unsigned long)to)%PAGE_SIZE;
738 unsigned long len = PAGE_SIZE - offset;
739 int retval;
740 struct page *pg;
741 void *maddr;
742
743 if (len > n)
744 len = n;
745
746survive:
747 down_read(&current->mm->mmap_sem);
748 retval = get_user_pages(current, current->mm,
749 (unsigned long )to, 1, 1, 0, &pg, NULL);
750
751 if (retval == -ENOMEM && is_init(current)) {
752 up_read(&current->mm->mmap_sem);
753 congestion_wait(WRITE, HZ/50);
754 goto survive;
755 }
756
757 if (retval != 1) {
758 up_read(&current->mm->mmap_sem);
759 break;
760 }
761
762 maddr = kmap_atomic(pg, KM_USER0);
763 memcpy(maddr + offset, from, len);
764 kunmap_atomic(maddr, KM_USER0);
765 set_page_dirty_lock(pg);
766 put_page(pg);
767 up_read(&current->mm->mmap_sem);
768
769 from += len;
770 to += len;
771 n -= len;
772 }
773 return n;
774 }
775#endif
776 if (movsl_is_ok(to, from, n))
777 __copy_user(to, from, n);
778 else
779 n = __copy_user_intel(to, from, n);
780 return n;
781}
782EXPORT_SYMBOL(__copy_to_user_ll);
783
784unsigned long __copy_from_user_ll(void *to, const void __user *from,
785 unsigned long n)
786{
787 if (movsl_is_ok(to, from, n))
788 __copy_user_zeroing(to, from, n);
789 else
790 n = __copy_user_zeroing_intel(to, from, n);
791 return n;
792}
793EXPORT_SYMBOL(__copy_from_user_ll);
794
795unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
796 unsigned long n)
797{
798 if (movsl_is_ok(to, from, n))
799 __copy_user(to, from, n);
800 else
801 n = __copy_user_intel((void __user *)to,
802 (const void *)from, n);
803 return n;
804}
805EXPORT_SYMBOL(__copy_from_user_ll_nozero);
806
807unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
808 unsigned long n)
809{
810#ifdef CONFIG_X86_INTEL_USERCOPY
811 if ( n > 64 && cpu_has_xmm2)
812 n = __copy_user_zeroing_intel_nocache(to, from, n);
813 else
814 __copy_user_zeroing(to, from, n);
815#else
816 __copy_user_zeroing(to, from, n);
817#endif
818 return n;
819}
820
821unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
822 unsigned long n)
823{
824#ifdef CONFIG_X86_INTEL_USERCOPY
825 if ( n > 64 && cpu_has_xmm2)
826 n = __copy_user_intel_nocache(to, from, n);
827 else
828 __copy_user(to, from, n);
829#else
830 __copy_user(to, from, n);
831#endif
832 return n;
833}
834
835/**
836 * copy_to_user: - Copy a block of data into user space.
837 * @to: Destination address, in user space.
838 * @from: Source address, in kernel space.
839 * @n: Number of bytes to copy.
840 *
841 * Context: User context only. This function may sleep.
842 *
843 * Copy data from kernel space to user space.
844 *
845 * Returns number of bytes that could not be copied.
846 * On success, this will be zero.
847 */
848unsigned long
849copy_to_user(void __user *to, const void *from, unsigned long n)
850{
851 if (access_ok(VERIFY_WRITE, to, n))
852 n = __copy_to_user(to, from, n);
853 return n;
854}
855EXPORT_SYMBOL(copy_to_user);
856
857/**
858 * copy_from_user: - Copy a block of data from user space.
859 * @to: Destination address, in kernel space.
860 * @from: Source address, in user space.
861 * @n: Number of bytes to copy.
862 *
863 * Context: User context only. This function may sleep.
864 *
865 * Copy data from user space to kernel space.
866 *
867 * Returns number of bytes that could not be copied.
868 * On success, this will be zero.
869 *
870 * If some data could not be copied, this function will pad the copied
871 * data to the requested size using zero bytes.
872 */
873unsigned long
874copy_from_user(void *to, const void __user *from, unsigned long n)
875{
876 if (access_ok(VERIFY_READ, from, n))
877 n = __copy_from_user(to, from, n);
878 else
879 memset(to, 0, n);
880 return n;
881}
882EXPORT_SYMBOL(copy_from_user);
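Note that both helpers return the number of bytes that could not be copied, not an errno, so callers conventionally map any non-zero result to -EFAULT. A short usage sketch with a hypothetical argument structure:

    struct example_args {
            unsigned int flags;
            unsigned long addr;
    };

    static int example_handler(const void __user *uarg)
    {
            struct example_args args;

            if (copy_from_user(&args, uarg, sizeof(args)))
                    return -EFAULT;         /* some bytes were left uncopied */
            /* ... use args.flags / args.addr ... */
            return 0;
    }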
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
new file mode 100644
index 000000000000..893d43f838cc
--- /dev/null
+++ b/arch/x86/lib/usercopy_64.c
@@ -0,0 +1,166 @@
1/*
2 * User address space access functions.
3 *
4 * Copyright 1997 Andi Kleen <ak@muc.de>
5 * Copyright 1997 Linus Torvalds
6 * Copyright 2002 Andi Kleen <ak@suse.de>
7 */
8#include <linux/module.h>
9#include <asm/uaccess.h>
10
11/*
12 * Copy a null terminated string from userspace.
13 */
14
15#define __do_strncpy_from_user(dst,src,count,res) \
16do { \
17 long __d0, __d1, __d2; \
18 might_sleep(); \
19 __asm__ __volatile__( \
20 " testq %1,%1\n" \
21 " jz 2f\n" \
22 "0: lodsb\n" \
23 " stosb\n" \
24 " testb %%al,%%al\n" \
25 " jz 1f\n" \
26 " decq %1\n" \
27 " jnz 0b\n" \
28 "1: subq %1,%0\n" \
29 "2:\n" \
30 ".section .fixup,\"ax\"\n" \
31 "3: movq %5,%0\n" \
32 " jmp 2b\n" \
33 ".previous\n" \
34 ".section __ex_table,\"a\"\n" \
35 " .align 8\n" \
36 " .quad 0b,3b\n" \
37 ".previous" \
38 : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
39 "=&D" (__d2) \
40 : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
41 : "memory"); \
42} while (0)
43
44long
45__strncpy_from_user(char *dst, const char __user *src, long count)
46{
47 long res;
48 __do_strncpy_from_user(dst, src, count, res);
49 return res;
50}
51EXPORT_SYMBOL(__strncpy_from_user);
52
53long
54strncpy_from_user(char *dst, const char __user *src, long count)
55{
56 long res = -EFAULT;
57 if (access_ok(VERIFY_READ, src, 1))
58 return __strncpy_from_user(dst, src, count);
59 return res;
60}
61EXPORT_SYMBOL(strncpy_from_user);
62
63/*
64 * Zero Userspace
65 */
66
67unsigned long __clear_user(void __user *addr, unsigned long size)
68{
69 long __d0;
70 might_sleep();
71 /* no memory constraint because it doesn't change any memory gcc knows
72 about */
73 asm volatile(
74 " testq %[size8],%[size8]\n"
75 " jz 4f\n"
76 "0: movq %[zero],(%[dst])\n"
77 " addq %[eight],%[dst]\n"
78 " decl %%ecx ; jnz 0b\n"
79 "4: movq %[size1],%%rcx\n"
80 " testl %%ecx,%%ecx\n"
81 " jz 2f\n"
82 "1: movb %b[zero],(%[dst])\n"
83 " incq %[dst]\n"
84 " decl %%ecx ; jnz 1b\n"
85 "2:\n"
86 ".section .fixup,\"ax\"\n"
87 "3: lea 0(%[size1],%[size8],8),%[size8]\n"
88 " jmp 2b\n"
89 ".previous\n"
90 ".section __ex_table,\"a\"\n"
91 " .align 8\n"
92 " .quad 0b,3b\n"
93 " .quad 1b,2b\n"
94 ".previous"
95 : [size8] "=c"(size), [dst] "=&D" (__d0)
96 : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
97 [zero] "r" (0UL), [eight] "r" (8UL));
98 return size;
99}
100EXPORT_SYMBOL(__clear_user);
101
102unsigned long clear_user(void __user *to, unsigned long n)
103{
104 if (access_ok(VERIFY_WRITE, to, n))
105 return __clear_user(to, n);
106 return n;
107}
108EXPORT_SYMBOL(clear_user);
109
110/*
111 * Return the size of a string (including the ending 0)
112 *
113 * Return 0 on exception, a value greater than N if too long
114 */
115
116long __strnlen_user(const char __user *s, long n)
117{
118 long res = 0;
119 char c;
120
121 while (1) {
122 if (res>n)
123 return n+1;
124 if (__get_user(c, s))
125 return 0;
126 if (!c)
127 return res+1;
128 res++;
129 s++;
130 }
131}
132EXPORT_SYMBOL(__strnlen_user);
133
134long strnlen_user(const char __user *s, long n)
135{
136 if (!access_ok(VERIFY_READ, s, n))
137 return 0;
138 return __strnlen_user(s, n);
139}
140EXPORT_SYMBOL(strnlen_user);
141
142long strlen_user(const char __user *s)
143{
144 long res = 0;
145 char c;
146
147 for (;;) {
148 if (get_user(c, s))
149 return 0;
150 if (!c)
151 return res+1;
152 res++;
153 s++;
154 }
155}
156EXPORT_SYMBOL(strlen_user);
157
158unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
159{
160 if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) {
161 return copy_user_generic((__force void *)to, (__force void *)from, len);
162 }
163 return len;
164}
165EXPORT_SYMBOL(copy_in_user);
166
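copy_in_user() is the odd one out here: it moves data between two user buffers (useful in compat/32-bit emulation glue) and, like the other copy helpers in this file, reports the number of bytes left uncopied. A hedged usage sketch:

    static long example_user_to_user(void __user *dst,
                                     const void __user *src, unsigned len)
    {
            if (copy_in_user(dst, src, len))
                    return -EFAULT;         /* part of either range was inaccessible */
            return 0;
    }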
diff --git a/arch/x86/mach-default/Makefile b/arch/x86/mach-default/Makefile
new file mode 100644
index 000000000000..012fe34459e6
--- /dev/null
+++ b/arch/x86/mach-default/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-y := setup.o
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
new file mode 100644
index 000000000000..7f635c7a2381
--- /dev/null
+++ b/arch/x86/mach-default/setup.c
@@ -0,0 +1,180 @@
1/*
2 * Machine specific setup for generic
3 */
4
5#include <linux/smp.h>
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <asm/acpi.h>
9#include <asm/arch_hooks.h>
10#include <asm/e820.h>
11#include <asm/setup.h>
12
13#ifdef CONFIG_HOTPLUG_CPU
14#define DEFAULT_SEND_IPI (1)
15#else
16#define DEFAULT_SEND_IPI (0)
17#endif
18
19int no_broadcast=DEFAULT_SEND_IPI;
20
21/**
22 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
23 *
24 * Description:
25 * Perform any necessary interrupt initialisation prior to setting up
26 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
27 * interrupts should be initialised here if the machine emulates a PC
28 * in any way.
29 **/
30void __init pre_intr_init_hook(void)
31{
32 init_ISA_irqs();
33}
34
35/*
36 * IRQ2 is the cascade interrupt to the second interrupt controller
37 */
38static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
39
40/**
41 * intr_init_hook - post gate setup interrupt initialisation
42 *
43 * Description:
44 * Fill in any interrupts that may have been left out by the general
45 * init_IRQ() routine. Interrupts having to do with the machine rather
46 * than the devices on the I/O bus (like APIC interrupts in Intel MP
47 * systems) are started here.
48 **/
49void __init intr_init_hook(void)
50{
51#ifdef CONFIG_X86_LOCAL_APIC
52 apic_intr_init();
53#endif
54
55 if (!acpi_ioapic)
56 setup_irq(2, &irq2);
57}
58
59/**
60 * pre_setup_arch_hook - hook called prior to any setup_arch() execution
61 *
62 * Description:
63 * Generally used to activate any machine-specific identification
64 * routines that may be needed before setup_arch() runs. On VISWS
65 * this is used to get the board revision and type.
66 **/
67void __init pre_setup_arch_hook(void)
68{
69}
70
71/**
72 * trap_init_hook - initialise system specific traps
73 *
74 * Description:
75 * Called as the final act of trap_init(). Used in VISWS to initialise
76 * the various board specific APIC traps.
77 **/
78void __init trap_init_hook(void)
79{
80}
81
82static struct irqaction irq0 = {
83 .handler = timer_interrupt,
84 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
85 .mask = CPU_MASK_NONE,
86 .name = "timer"
87};
88
89/**
90 * time_init_hook - do any specific initialisations for the system timer.
91 *
92 * Description:
93 * Must plug the system timer interrupt source at HZ into the IRQ listed
94 * in irq_vectors.h:TIMER_IRQ
95 **/
96void __init time_init_hook(void)
97{
98 irq0.mask = cpumask_of_cpu(0);
99 setup_irq(0, &irq0);
100}
101
102#ifdef CONFIG_MCA
103/**
104 * mca_nmi_hook - hook into MCA specific NMI chain
105 *
106 * Description:
107 * The MCA (Microchannel Architecture) has an NMI chain for NMI sources
108 * along the MCA bus. Use this to hook into that chain if you will need
109 * it.
110 **/
111void mca_nmi_hook(void)
112{
113 /* If I recall correctly, there's a whole bunch of other things that
114 * we can do to check for NMI problems, but that's all I know about
115 * at the moment.
116 */
117
118 printk("NMI generated from unknown source!\n");
119}
120#endif
121
122static __init int no_ipi_broadcast(char *str)
123{
124 get_option(&str, &no_broadcast);
125 printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
126 "IPI Broadcast");
127 return 1;
128}
129
130__setup("no_ipi_broadcast", no_ipi_broadcast);
131
132static int __init print_ipi_mode(void)
133{
134 printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
135 "Shortcut");
136 return 0;
137}
138
139late_initcall(print_ipi_mode);
140
141/**
142 * machine_specific_memory_setup - Hook for machine specific memory setup.
143 *
144 * Description:
145 * This is included late in kernel/setup.c so that it can make
146 * use of all of the static functions.
147 **/
148
149char * __init machine_specific_memory_setup(void)
150{
151 char *who;
152
153
154 who = "BIOS-e820";
155
156 /*
157 * Try to copy the BIOS-supplied E820-map.
158 *
159 * Otherwise fake a memory map; one section from 0k->640k,
160 * the next section from 1mb->appropriate_mem_k
161 */
162 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
163 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
164 unsigned long mem_size;
165
166 /* compare results from other methods and take the greater */
167 if (ALT_MEM_K < EXT_MEM_K) {
168 mem_size = EXT_MEM_K;
169 who = "BIOS-88";
170 } else {
171 mem_size = ALT_MEM_K;
172 who = "BIOS-e801";
173 }
174
175 e820.nr_map = 0;
176 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
177 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
178 }
179 return who;
180}
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
new file mode 100644
index 000000000000..69dd4da218dc
--- /dev/null
+++ b/arch/x86/mach-es7000/Makefile
@@ -0,0 +1,6 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-$(CONFIG_X86_ES7000) := es7000plat.o
6obj-$(CONFIG_X86_GENERICARCH) := es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h
new file mode 100644
index 000000000000..c8d5aa132fa0
--- /dev/null
+++ b/arch/x86/mach-es7000/es7000.h
@@ -0,0 +1,114 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27/*
28 * ES7000 chipsets
29 */
30
31#define NON_UNISYS 0
32#define ES7000_CLASSIC 1
33#define ES7000_ZORRO 2
34
35
36#define MIP_REG 1
37#define MIP_PSAI_REG 4
38
39#define MIP_BUSY 1
40#define MIP_SPIN 0xf0000
41#define MIP_VALID 0x0100000000000000ULL
42#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
43
44#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
45
46struct mip_reg_info {
47 unsigned long long mip_info;
48 unsigned long long delivery_info;
49 unsigned long long host_reg;
50 unsigned long long mip_reg;
51};
52
53struct part_info {
54 unsigned char type;
55 unsigned char length;
56 unsigned char part_id;
57 unsigned char apic_mode;
58 unsigned long snum;
59 char ptype[16];
60 char sname[64];
61 char pname[64];
62};
63
64struct psai {
65 unsigned long long entry_type;
66 unsigned long long addr;
67 unsigned long long bep_addr;
68};
69
70struct es7000_mem_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char resv[6];
74 unsigned long long start;
75 unsigned long long size;
76};
77
78struct es7000_oem_table {
79 unsigned long long hdr;
80 struct mip_reg_info mip;
81 struct part_info pif;
82 struct es7000_mem_info shm;
83 struct psai psai;
84};
85
86#ifdef CONFIG_ACPI
87
88struct oem_table {
89 struct acpi_table_header Header;
90 u32 OEMTableAddr;
91 u32 OEMTableSize;
92};
93
94extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
95#endif
96
97struct mip_reg {
98 unsigned long long off_0;
99 unsigned long long off_8;
100 unsigned long long off_10;
101 unsigned long long off_18;
102 unsigned long long off_20;
103 unsigned long long off_28;
104 unsigned long long off_30;
105 unsigned long long off_38;
106};
107
108#define MIP_SW_APIC 0x1020b
109#define MIP_FUNC(VALUE) (VALUE & 0xff)
110
111extern int parse_unisys_oem (char *oemptr);
112extern void setup_unisys(void);
113extern int es7000_start_cpu(int cpu, unsigned long eip);
114extern void es7000_sw_apic(void);
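MIP_PORT() and MIP_RD_LO() just slice fixed fields out of the 64-bit words stored in struct mip_reg_info; parse_unisys_oem() (in es7000plat.c, later in this diff) uses them to recover the I/O port and the physical addresses of host_reg/mip_reg. A worked example with a made-up value:

    /*   v = 0x0000123400001000ULL
     *   MIP_PORT(v)  = (v >> 32) & 0xffff = 0x1234   -> port passed to outb()
     *   MIP_RD_LO(v) = v & 0xffffffff     = 0x1000   -> low 32 bits: a physical
     *                                                   address later mapped with __va() */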
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c
new file mode 100644
index 000000000000..ab99072d3f9a
--- /dev/null
+++ b/arch/x86/mach-es7000/es7000plat.c
@@ -0,0 +1,327 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27#include <linux/module.h>
28#include <linux/types.h>
29#include <linux/kernel.h>
30#include <linux/smp.h>
31#include <linux/string.h>
32#include <linux/spinlock.h>
33#include <linux/errno.h>
34#include <linux/notifier.h>
35#include <linux/reboot.h>
36#include <linux/init.h>
37#include <linux/acpi.h>
38#include <asm/io.h>
39#include <asm/nmi.h>
40#include <asm/smp.h>
41#include <asm/apicdef.h>
42#include "es7000.h"
43#include <mach_mpparse.h>
44
45/*
46 * ES7000 Globals
47 */
48
49volatile unsigned long *psai = NULL;
50struct mip_reg *mip_reg;
51struct mip_reg *host_reg;
52int mip_port;
53unsigned long mip_addr, host_addr;
54
55/*
56 * GSI override for ES7000 platforms.
57 */
58
59static unsigned int base;
60
61static int
62es7000_rename_gsi(int ioapic, int gsi)
63{
64 if (es7000_plat == ES7000_ZORRO)
65 return gsi;
66
67 if (!base) {
68 int i;
69 for (i = 0; i < nr_ioapics; i++)
70 base += nr_ioapic_registers[i];
71 }
72
73 if (!ioapic && (gsi < 16))
74 gsi += base;
75 return gsi;
76}
77
78void __init
79setup_unisys(void)
80{
81 /*
82 * Determine the generation of the ES7000 currently running.
83 *
84 * es7000_plat = 1 if the machine is a 5xx ES7000 box
85 * es7000_plat = 2 if the machine is a x86_64 ES7000 box
86 *
87 */
88 if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
89 es7000_plat = ES7000_ZORRO;
90 else
91 es7000_plat = ES7000_CLASSIC;
92 ioapic_renumber_irq = es7000_rename_gsi;
93}
94
95/*
96 * Parse the OEM Table
97 */
98
99int __init
100parse_unisys_oem (char *oemptr)
101{
102 int i;
103 int success = 0;
104 unsigned char type, size;
105 unsigned long val;
106 char *tp = NULL;
107 struct psai *psaip = NULL;
108 struct mip_reg_info *mi;
109 struct mip_reg *host, *mip;
110
111 tp = oemptr;
112
113 tp += 8;
114
115 for (i=0; i <= 6; i++) {
116 type = *tp++;
117 size = *tp++;
118 tp -= 2;
119 switch (type) {
120 case MIP_REG:
121 mi = (struct mip_reg_info *)tp;
122 val = MIP_RD_LO(mi->host_reg);
123 host_addr = val;
124 host = (struct mip_reg *)val;
125 host_reg = __va(host);
126 val = MIP_RD_LO(mi->mip_reg);
127 mip_port = MIP_PORT(mi->mip_info);
128 mip_addr = val;
129 mip = (struct mip_reg *)val;
130 mip_reg = __va(mip);
131 Dprintk("es7000_mipcfg: host_reg = 0x%lx \n",
132 (unsigned long)host_reg);
133 Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n",
134 (unsigned long)mip_reg);
135 success++;
136 break;
137 case MIP_PSAI_REG:
138 psaip = (struct psai *)tp;
139 if (tp != NULL) {
140 if (psaip->addr)
141 psai = __va(psaip->addr);
142 else
143 psai = NULL;
144 success++;
145 }
146 break;
147 default:
148 break;
149 }
150 tp += size;
151 }
152
153 if (success < 2) {
154 es7000_plat = NON_UNISYS;
155 } else
156 setup_unisys();
157 return es7000_plat;
158}
159
160#ifdef CONFIG_ACPI
161int __init
162find_unisys_acpi_oem_table(unsigned long *oem_addr)
163{
164 struct acpi_table_header *header = NULL;
165 int i = 0;
166 while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) {
167 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
168 struct oem_table *t = (struct oem_table *)header;
169 *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr,
170 t->OEMTableSize);
171 return 0;
172 }
173 }
174 return -1;
175}
176#endif
177
178/*
179 * This file also gets compiled if CONFIG_X86_GENERICARCH is set. Generic
180 * arch already has got following function definitions (asm-generic/es7000.c)
181 * hence no need to define these for that case.
182 */
183#ifndef CONFIG_X86_GENERICARCH
184void es7000_sw_apic(void);
185void __init enable_apic_mode(void)
186{
187 es7000_sw_apic();
188 return;
189}
190
191__init int mps_oem_check(struct mp_config_table *mpc, char *oem,
192 char *productid)
193{
194 if (mpc->mpc_oemptr) {
195 struct mp_config_oemtable *oem_table =
196 (struct mp_config_oemtable *)mpc->mpc_oemptr;
197 if (!strncmp(oem, "UNISYS", 6))
198 return parse_unisys_oem((char *)oem_table);
199 }
200 return 0;
201}
202#ifdef CONFIG_ACPI
203/* Hook from generic ACPI tables.c */
204int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
205{
206 unsigned long oem_addr;
207 if (!find_unisys_acpi_oem_table(&oem_addr)) {
208 if (es7000_check_dsdt())
209 return parse_unisys_oem((char *)oem_addr);
210 else {
211 setup_unisys();
212 return 1;
213 }
214 }
215 return 0;
216}
217#else
218int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
219{
220 return 0;
221}
222#endif
223#endif /* CONFIG_X86_GENERICARCH */
224
225static void
226es7000_spin(int n)
227{
228 int i = 0;
229
230 while (i++ < n)
231 rep_nop();
232}
233
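/*
 * Sketch of the MIP mailbox handshake implemented below, as inferred from
 * the code (not from firmware documentation):
 *
 *   1. spin until the MIP_VALID bit in host_reg->off_38 clears
 *   2. copy the command block into host_reg and poke mip_port
 *   3. spin until MIP_VALID is set in mip_reg->off_38
 *   4. pull the status field out of mip_reg->off_0 and clear MIP_VALID
 */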
234static int __init
235es7000_mip_write(struct mip_reg *mip_reg)
236{
237 int status = 0;
238 int spin;
239
240 spin = MIP_SPIN;
241 while (((unsigned long long)host_reg->off_38 &
242 (unsigned long long)MIP_VALID) != 0) {
243 if (--spin <= 0) {
244 printk("es7000_mip_write: Timeout waiting for Host Valid Flag");
245 return -1;
246 }
247 es7000_spin(MIP_SPIN);
248 }
249
250 memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
251 outb(1, mip_port);
252
253 spin = MIP_SPIN;
254
255 while (((unsigned long long)mip_reg->off_38 &
256 (unsigned long long)MIP_VALID) == 0) {
257 if (--spin <= 0) {
258 printk("es7000_mip_write: Timeout waiting for MIP Valid Flag");
259 return -1;
260 }
261 es7000_spin(MIP_SPIN);
262 }
263
264 status = ((unsigned long long)mip_reg->off_0 &
265 (unsigned long long)0xffff0000000000ULL) >> 48;
266 mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 &
267 (unsigned long long)~MIP_VALID);
268 return status;
269}
270
271int
272es7000_start_cpu(int cpu, unsigned long eip)
273{
274 unsigned long vect = 0, psaival = 0;
275
276 if (psai == NULL)
277 return -1;
278
279 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
280 psaival = (0x1000000 | vect | cpu);
281
282 while (*psai & 0x1000000)
283 ;
284
285 *psai = psaival;
286
287 return 0;
288
289}
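/*
 * Illustrative layout of the PSAI command word used above, inferred from
 * es7000_start_cpu()/es7000_stop_cpu() rather than firmware documentation:
 *
 *	bit  24      command pending/busy flag (0x1000000)
 *	bits 16-23   4K page number of the start EIP
 *	low bits     target cpu
 *
 * e.g. starting cpu 3 from a trampoline at physical 0x9000 writes
 * 0x1000000 | (9 << 16) | 3 == 0x1090003 into *psai.
 */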
290
291int
292es7000_stop_cpu(int cpu)
293{
294 int startup;
295
296 if (psai == NULL)
297 return -1;
298
299 startup= (0x1000000 | cpu);
300
301 while ((*psai & 0xff00ffff) != startup)
302 ;
303
304 startup = (*psai & 0xff0000) >> 16;
305 *psai &= 0xffffff;
306
307 return 0;
308
309}
310
311void __init
312es7000_sw_apic()
313{
314 if (es7000_plat) {
315 int mip_status;
316 struct mip_reg es7000_mip_reg;
317
318 printk("ES7000: Enabling APIC mode.\n");
319 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
320 es7000_mip_reg.off_0 = MIP_SW_APIC;
321 es7000_mip_reg.off_38 = (MIP_VALID);
322 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
323 printk("es7000_sw_apic: command failed, status = %x\n",
324 mip_status);
325 return;
326 }
327}
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
new file mode 100644
index 000000000000..19d6d407737b
--- /dev/null
+++ b/arch/x86/mach-generic/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for the generic architecture
3#
4
5EXTRA_CFLAGS := -Iarch/x86/kernel
6
7obj-y := probe.o summit.o bigsmp.o es7000.o default.o
8obj-y += ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
new file mode 100644
index 000000000000..292a225edabe
--- /dev/null
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -0,0 +1,57 @@
1/*
2 * APIC driver for "bigsmp" XAPIC machines with more than 8 virtual CPUs.
3 * Drives the local APIC in "clustered mode".
4 */
5#define APIC_DEFINITION 1
6#include <linux/threads.h>
7#include <linux/cpumask.h>
8#include <asm/smp.h>
9#include <asm/mpspec.h>
10#include <asm/genapic.h>
11#include <asm/fixmap.h>
12#include <asm/apicdef.h>
13#include <linux/kernel.h>
14#include <linux/smp.h>
15#include <linux/init.h>
16#include <linux/dmi.h>
17#include <asm/mach-bigsmp/mach_apic.h>
18#include <asm/mach-bigsmp/mach_apicdef.h>
19#include <asm/mach-bigsmp/mach_ipi.h>
20#include <asm/mach-default/mach_mpparse.h>
21
22static int dmi_bigsmp; /* can be set by dmi scanners */
23
24static int hp_ht_bigsmp(const struct dmi_system_id *d)
25{
26#ifdef CONFIG_X86_GENERICARCH
27 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
28 dmi_bigsmp = 1;
29#endif
30 return 0;
31}
32
33
34static const struct dmi_system_id bigsmp_dmi_table[] = {
35 { hp_ht_bigsmp, "HP ProLiant DL760 G2", {
36 DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
37 DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
38 }},
39
40 { hp_ht_bigsmp, "HP ProLiant DL740", {
41 DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
42 DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
43 }},
44 { }
45};
46
47
48static int probe_bigsmp(void)
49{
50 if (def_to_bigsmp)
51 dmi_bigsmp = 1;
52 else
53 dmi_check_system(bigsmp_dmi_table);
54 return dmi_bigsmp;
55}
56
57struct genapic apic_bigsmp = APIC_INIT("bigsmp", probe_bigsmp);
diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c
new file mode 100644
index 000000000000..8685208d8512
--- /dev/null
+++ b/arch/x86/mach-generic/default.c
@@ -0,0 +1,26 @@
1/*
2 * Default generic APIC driver. This handles up to 8 CPUs.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/mpspec.h>
8#include <asm/mach-default/mach_apicdef.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h>
16#include <asm/mach-default/mach_apic.h>
17#include <asm/mach-default/mach_ipi.h>
18#include <asm/mach-default/mach_mpparse.h>
19
20/* should be called last. */
21static int probe_default(void)
22{
23 return 1;
24}
25
26struct genapic apic_default = APIC_INIT("default", probe_default);
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
new file mode 100644
index 000000000000..4742626f08c4
--- /dev/null
+++ b/arch/x86/mach-generic/es7000.c
@@ -0,0 +1,69 @@
1/*
2 * APIC driver for the Unisys ES7000 chipset.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h>
16#include <asm/mach-es7000/mach_apicdef.h>
17#include <asm/mach-es7000/mach_apic.h>
18#include <asm/mach-es7000/mach_ipi.h>
19#include <asm/mach-es7000/mach_mpparse.h>
20#include <asm/mach-es7000/mach_wakecpu.h>
21
22static int probe_es7000(void)
23{
24 /* probed later in mptable/ACPI hooks */
25 return 0;
26}
27
28extern void es7000_sw_apic(void);
29static void __init enable_apic_mode(void)
30{
31 es7000_sw_apic();
32 return;
33}
34
35static __init int mps_oem_check(struct mp_config_table *mpc, char *oem,
36 char *productid)
37{
38 if (mpc->mpc_oemptr) {
39 struct mp_config_oemtable *oem_table =
40 (struct mp_config_oemtable *)mpc->mpc_oemptr;
41 if (!strncmp(oem, "UNISYS", 6))
42 return parse_unisys_oem((char *)oem_table);
43 }
44 return 0;
45}
46
47#ifdef CONFIG_ACPI
48/* Hook from generic ACPI tables.c */
49static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{
51 unsigned long oem_addr;
52 if (!find_unisys_acpi_oem_table(&oem_addr)) {
53 if (es7000_check_dsdt())
54 return parse_unisys_oem((char *)oem_addr);
55 else {
56 setup_unisys();
57 return 1;
58 }
59 }
60 return 0;
61}
62#else
63static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
64{
65 return 0;
66}
67#endif
68
69struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
new file mode 100644
index 000000000000..74f3da634423
--- /dev/null
+++ b/arch/x86/mach-generic/probe.c
@@ -0,0 +1,125 @@
1/* Copyright 2003 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License, v.2
3 *
4 * Generic x86 APIC driver probe layer.
5 */
6#include <linux/threads.h>
7#include <linux/cpumask.h>
8#include <linux/string.h>
9#include <linux/kernel.h>
10#include <linux/ctype.h>
11#include <linux/init.h>
12#include <linux/errno.h>
13#include <asm/fixmap.h>
14#include <asm/mpspec.h>
15#include <asm/apicdef.h>
16#include <asm/genapic.h>
17
18extern struct genapic apic_summit;
19extern struct genapic apic_bigsmp;
20extern struct genapic apic_es7000;
21extern struct genapic apic_default;
22
23struct genapic *genapic = &apic_default;
24
25struct genapic *apic_probe[] __initdata = {
26 &apic_summit,
27 &apic_bigsmp,
28 &apic_es7000,
29 &apic_default, /* must be last */
30 NULL,
31};
32
33static int cmdline_apic __initdata;
34static int __init parse_apic(char *arg)
35{
36 int i;
37
38 if (!arg)
39 return -EINVAL;
40
41 for (i = 0; apic_probe[i]; i++) {
42 if (!strcmp(apic_probe[i]->name, arg)) {
43 genapic = apic_probe[i];
44 cmdline_apic = 1;
45 return 0;
46 }
47 }
48
49 /* Parsed again by __setup for debug/verbose */
50 return 0;
51}
52early_param("apic", parse_apic);
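/*
 * Usage note: the "apic=" boot parameter selects one of the drivers in
 * apic_probe[] by name, e.g. "apic=bigsmp" or "apic=es7000"; any other
 * value falls through to the normal probe order below.
 */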
53
54void __init generic_bigsmp_probe(void)
55{
56 /*
57 * This routine is used to switch to bigsmp mode when
58 * - There is no apic= option specified by the user
59 * - generic_apic_probe() has chosen apic_default as the sub_arch
60 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
61 */
62
63 if (!cmdline_apic && genapic == &apic_default)
64 if (apic_bigsmp.probe()) {
65 genapic = &apic_bigsmp;
66 printk(KERN_INFO "Overriding APIC driver with %s\n",
67 genapic->name);
68 }
69}
70
71void __init generic_apic_probe(void)
72{
73 if (!cmdline_apic) {
74 int i;
75 for (i = 0; apic_probe[i]; i++) {
76 if (apic_probe[i]->probe()) {
77 genapic = apic_probe[i];
78 break;
79 }
80 }
81 /* Not visible without early console */
82 if (!apic_probe[i])
83 panic("Didn't find an APIC driver");
84 }
85 printk(KERN_INFO "Using APIC driver %s\n", genapic->name);
86}
87
88/* These functions can switch the APIC even after the initial ->probe() */
89
90int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid)
91{
92 int i;
93 for (i = 0; apic_probe[i]; ++i) {
94 if (apic_probe[i]->mps_oem_check(mpc,oem,productid)) {
95 if (!cmdline_apic) {
96 genapic = apic_probe[i];
97 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
98 genapic->name);
99 }
100 return 1;
101 }
102 }
103 return 0;
104}
105
106int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
107{
108 int i;
109 for (i = 0; apic_probe[i]; ++i) {
110 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
111 if (!cmdline_apic) {
112 genapic = apic_probe[i];
113 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
114 genapic->name);
115 }
116 return 1;
117 }
118 }
119 return 0;
120}
121
122int hard_smp_processor_id(void)
123{
124 return genapic->get_apic_id(*(unsigned long *)(APIC_BASE+APIC_ID));
125}
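/*
 * Illustrative sketch only (not part of this commit): a further sub-arch
 * driver would follow the same pattern as summit/bigsmp/es7000 -- its own
 * mach-generic/<name>.c built with that sub-arch's mach_apic.h headers, a
 * probe routine, and an entry in apic_probe[] above, ahead of apic_default.
 * The "myarch" names here are hypothetical.
 */
#if 0
#define APIC_DEFINITION 1
#include <asm/mach-myarch/mach_apic.h>
#include <asm/genapic.h>

static int probe_myarch(void)
{
	return 0;	/* selected later via the MPS/ACPI OEM hooks */
}

struct genapic apic_myarch = APIC_INIT("myarch", probe_myarch);
#endif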
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
new file mode 100644
index 000000000000..74883ccb8f73
--- /dev/null
+++ b/arch/x86/mach-generic/summit.c
@@ -0,0 +1,27 @@
1/*
2 * APIC driver for the IBM "Summit" chipset.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h>
16#include <asm/mach-summit/mach_apic.h>
17#include <asm/mach-summit/mach_apicdef.h>
18#include <asm/mach-summit/mach_ipi.h>
19#include <asm/mach-summit/mach_mpparse.h>
20
21static int probe_summit(void)
22{
23 /* probed later in mptable/ACPI hooks */
24 return 0;
25}
26
27struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-visws/Makefile b/arch/x86/mach-visws/Makefile
new file mode 100644
index 000000000000..835fd96ad768
--- /dev/null
+++ b/arch/x86/mach-visws/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-y := setup.o traps.o reboot.o
6
7obj-$(CONFIG_X86_VISWS_APIC) += visws_apic.o
8obj-$(CONFIG_X86_LOCAL_APIC) += mpparse.o
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
new file mode 100644
index 000000000000..f3c74fab8b95
--- /dev/null
+++ b/arch/x86/mach-visws/mpparse.c
@@ -0,0 +1,101 @@
1
2#include <linux/init.h>
3#include <linux/smp.h>
4
5#include <asm/smp.h>
6#include <asm/io.h>
7
8#include "cobalt.h"
9#include "mach_apic.h"
10
11/* Have we found an MP table */
12int smp_found_config;
13
14/*
15 * Various Linux-internal data structures created from the
16 * MP-table.
17 */
18int apic_version [MAX_APICS];
19
20int pic_mode;
21unsigned long mp_lapic_addr;
22
23/* Processor that is doing the boot up */
24unsigned int boot_cpu_physical_apicid = -1U;
25
26/* Bitmask of physically existing CPUs */
27physid_mask_t phys_cpu_present_map;
28
29unsigned int __initdata maxcpus = NR_CPUS;
30
31/*
32 * The Visual Workstation is Intel MP compliant in the hardware
33 * sense, but it doesn't have a BIOS(-configuration table).
34 * No problem for Linux.
35 */
36
37static void __init MP_processor_info (struct mpc_config_processor *m)
38{
39 int ver, logical_apicid;
40 physid_mask_t apic_cpus;
41
42 if (!(m->mpc_cpuflag & CPU_ENABLED))
43 return;
44
45 logical_apicid = m->mpc_apicid;
46 printk(KERN_INFO "%sCPU #%d %ld:%ld APIC version %d\n",
47 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
48 m->mpc_apicid,
49 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
50 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
51 m->mpc_apicver);
52
53 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
54 boot_cpu_physical_apicid = m->mpc_apicid;
55
56 ver = m->mpc_apicver;
57 if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
58 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
59 m->mpc_apicid, MAX_APICS);
60 return;
61 }
62
63 apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
64 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
65 /*
66 * Validate version
67 */
68 if (ver == 0x0) {
69 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
70 "fixing up to 0x10. (tell your hw vendor)\n",
71 m->mpc_apicid);
72 ver = 0x10;
73 }
74 apic_version[m->mpc_apicid] = ver;
75}
76
77void __init find_smp_config(void)
78{
79 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
80 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
81
82 if (ncpus > CO_CPU_MAX) {
83 printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
84 ncpus, mp);
85
86 ncpus = CO_CPU_MAX;
87 }
88
89 if (ncpus > maxcpus)
90 ncpus = maxcpus;
91
92 smp_found_config = 1;
93 while (ncpus--)
94 MP_processor_info(mp++);
95
96 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
97}
98
99void __init get_smp_config (void)
100{
101}
diff --git a/arch/x86/mach-visws/reboot.c b/arch/x86/mach-visws/reboot.c
new file mode 100644
index 000000000000..99332abfad42
--- /dev/null
+++ b/arch/x86/mach-visws/reboot.c
@@ -0,0 +1,55 @@
1#include <linux/module.h>
2#include <linux/smp.h>
3#include <linux/delay.h>
4
5#include <asm/io.h>
6#include "piix4.h"
7
8void (*pm_power_off)(void);
9EXPORT_SYMBOL(pm_power_off);
10
11void machine_shutdown(void)
12{
13#ifdef CONFIG_SMP
14 smp_send_stop();
15#endif
16}
17
18void machine_emergency_restart(void)
19{
20 /*
21 * Visual Workstations restart after this
22 * register is poked on the PIIX4
23 */
24 outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
25}
26
27void machine_restart(char * __unused)
28{
29 machine_shutdown();
30 machine_emergency_restart();
31}
32
33void machine_power_off(void)
34{
35 unsigned short pm_status;
36 extern unsigned int pci_bus0;
37
38 while ((pm_status = inw(PMSTS_PORT)) & 0x100)
39 outw(pm_status, PMSTS_PORT);
40
41 outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
42
43 mdelay(10);
44
45#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
46 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
47
48 outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8);
49 outl(PIIX_SPECIAL_STOP, 0xCFC);
50}
51
52void machine_halt(void)
53{
54}
55
diff --git a/arch/x86/mach-visws/setup.c b/arch/x86/mach-visws/setup.c
new file mode 100644
index 000000000000..1f81f10e03a0
--- /dev/null
+++ b/arch/x86/mach-visws/setup.c
@@ -0,0 +1,183 @@
1/*
2 * Unmaintained SGI Visual Workstation support.
3 * Split out from setup.c by davej@suse.de
4 */
5
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/interrupt.h>
9#include <linux/module.h>
10
11#include <asm/fixmap.h>
12#include <asm/arch_hooks.h>
13#include <asm/io.h>
14#include <asm/e820.h>
15#include <asm/setup.h>
16#include "cobalt.h"
17#include "piix4.h"
18
19int no_broadcast;
20
21char visws_board_type = -1;
22char visws_board_rev = -1;
23
24void __init visws_get_board_type_and_rev(void)
25{
26 int raw;
27
28 visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
29 >> PIIX_GPI_BD_SHIFT;
30 /*
31 * Get Board rev.
32 * First, we have to initialize the 307 part to allow us access
33 * to the GPIO registers. Let's map them at 0x0fc0 which is right
34 * after the PIIX4 PM section.
35 */
36 outb_p(SIO_DEV_SEL, SIO_INDEX);
37 outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
38
39 outb_p(SIO_DEV_MSB, SIO_INDEX);
40 outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
41
42 outb_p(SIO_DEV_LSB, SIO_INDEX);
43 outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
44
45 outb_p(SIO_DEV_ENB, SIO_INDEX);
46 outb_p(1, SIO_DATA); /* Enable GPIO registers. */
47
48 /*
49 * Now, we have to map the power management section to write
50 * a bit which enables access to the GPIO registers.
51 * What lunatic came up with this shit?
52 */
53 outb_p(SIO_DEV_SEL, SIO_INDEX);
54 outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to PM regs. */
55
56 outb_p(SIO_DEV_MSB, SIO_INDEX);
57 outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
58
59 outb_p(SIO_DEV_LSB, SIO_INDEX);
60 outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
61
62 outb_p(SIO_DEV_ENB, SIO_INDEX);
63 outb_p(1, SIO_DATA); /* Enable PM registers. */
64
65 /*
66 * Now, write the PM register which enables the GPIO registers.
67 */
68 outb_p(SIO_PM_FER2, SIO_PM_INDEX);
69 outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
70
71 /*
72 * Now, initialize the GPIO registers.
73 * We want them all to be inputs which is the
74 * power on default, so let's leave them alone.
75 * So, let's just read the board rev!
76 */
77 raw = inb_p(SIO_GP_DATA1);
78 raw &= 0x7f; /* 7 bits of valid board revision ID. */
79
80 if (visws_board_type == VISWS_320) {
81 if (raw < 0x6) {
82 visws_board_rev = 4;
83 } else if (raw < 0xc) {
84 visws_board_rev = 5;
85 } else {
86 visws_board_rev = 6;
87 }
88 } else if (visws_board_type == VISWS_540) {
89 visws_board_rev = 2;
90 } else {
91 visws_board_rev = raw;
92 }
93
94 printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
95 (visws_board_type == VISWS_320 ? "320" :
96 (visws_board_type == VISWS_540 ? "540" :
97 "unknown")), visws_board_rev);
98}
99
100void __init pre_intr_init_hook(void)
101{
102 init_VISWS_APIC_irqs();
103}
104
105void __init intr_init_hook(void)
106{
107#ifdef CONFIG_X86_LOCAL_APIC
108 apic_intr_init();
109#endif
110}
111
112void __init pre_setup_arch_hook()
113{
114 visws_get_board_type_and_rev();
115}
116
117static struct irqaction irq0 = {
118 .handler = timer_interrupt,
119 .flags = IRQF_DISABLED | IRQF_IRQPOLL,
120 .name = "timer",
121};
122
123void __init time_init_hook(void)
124{
125 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
126
127 /* Set the countdown value */
128 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
129
130 /* Start the timer */
131 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
132
133 /* Enable (unmask) the timer interrupt */
134 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
135
136 /* Wire cpu IDT entry to s/w handler (and Cobalt APIC to IDT) */
137 setup_irq(0, &irq0);
138}
139
140/* Hook for machine specific memory setup. */
141
142#define MB (1024 * 1024)
143
144unsigned long sgivwfb_mem_phys;
145unsigned long sgivwfb_mem_size;
146EXPORT_SYMBOL(sgivwfb_mem_phys);
147EXPORT_SYMBOL(sgivwfb_mem_size);
148
149long long mem_size __initdata = 0;
150
151char * __init machine_specific_memory_setup(void)
152{
153 long long gfx_mem_size = 8 * MB;
154
155 mem_size = ALT_MEM_K;
156
157 if (!mem_size) {
158 printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
159 mem_size = 128 * MB;
160 }
161
162 /*
163 * this hardcodes the graphics memory to 8 MB
164 * it really should be sized dynamically (or at least
165 * set as a boot param)
166 */
167 if (!sgivwfb_mem_size) {
168 printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
169 sgivwfb_mem_size = 8 * MB;
170 }
171
172 /*
173 * Trim to nearest MB
174 */
175 sgivwfb_mem_size &= ~((1 << 20) - 1);
176 sgivwfb_mem_phys = mem_size - gfx_mem_size;
177
178 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
179 add_memory_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
180 add_memory_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
181
182 return "PROM";
183}
diff --git a/arch/x86/mach-visws/traps.c b/arch/x86/mach-visws/traps.c
new file mode 100644
index 000000000000..843b67acf43b
--- /dev/null
+++ b/arch/x86/mach-visws/traps.c
@@ -0,0 +1,68 @@
1/* VISWS traps */
2
3#include <linux/sched.h>
4#include <linux/kernel.h>
5#include <linux/init.h>
6#include <linux/pci.h>
7#include <linux/pci_ids.h>
8
9#include <asm/io.h>
10#include <asm/arch_hooks.h>
11#include <asm/apic.h>
12#include "cobalt.h"
13#include "lithium.h"
14
15
16#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
17#define BCD (LI_INTB | LI_INTC | LI_INTD)
18#define ALLDEVS (A01234 | BCD)
19
20static __init void lithium_init(void)
21{
22 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
23 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
24
25 if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
26 (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
27 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
28 panic("This machine is not SGI Visual Workstation 320/540");
29 }
30
31 if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
32 (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
33 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
34 panic("This machine is not SGI Visual Workstation 320/540");
35 }
36
37 li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
38 li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
39}
40
41static __init void cobalt_init(void)
42{
43 /*
44 * On normal SMP PC this is used only with SMP, but we have to
45 * use it and set it up here to start the Cobalt clock
46 */
47 set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
48 setup_local_APIC();
49 printk(KERN_INFO "Local APIC Version %#lx, ID %#lx\n",
50 apic_read(APIC_LVR), apic_read(APIC_ID));
51
52 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
53 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
54 printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
55 co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
56
57 /* Enable Cobalt APIC being careful to NOT change the ID! */
58 co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
59
60 printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
61 co_apic_read(CO_APIC_ID));
62}
63
64void __init trap_init_hook(void)
65{
66 lithium_init();
67 cobalt_init();
68}
diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c
new file mode 100644
index 000000000000..710faf71a650
--- /dev/null
+++ b/arch/x86/mach-visws/visws_apic.c
@@ -0,0 +1,299 @@
1/*
2 * linux/arch/i386/mach-visws/visws_apic.c
3 *
4 * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
5 *
6 * SGI Visual Workstation interrupt controller
7 *
8 * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
9 * which serves as the main interrupt controller in the system. Non-legacy
10 * hardware in the system uses this controller directly. Legacy devices
11 * are connected to the PIIX4, which in turn has its 8259(s) connected to
12 * one of the Cobalt APIC entries.
13 *
14 * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
15 *
16 * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
17 */
18
19#include <linux/kernel_stat.h>
20#include <linux/interrupt.h>
21#include <linux/init.h>
22
23#include <asm/io.h>
24#include <asm/apic.h>
25#include <asm/i8259.h>
26
27#include "cobalt.h"
28#include "irq_vectors.h"
29
30
31static DEFINE_SPINLOCK(cobalt_lock);
32
33/*
34 * Set the given Cobalt APIC Redirection Table entry to point
35 * to the given IDT vector/index.
36 */
37static inline void co_apic_set(int entry, int irq)
38{
39 co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
40 co_apic_write(CO_APIC_HI(entry), 0);
41}
42
43/*
44 * Cobalt (IO)-APIC functions to handle PCI devices.
45 */
46static inline int co_apic_ide0_hack(void)
47{
48 extern char visws_board_type;
49 extern char visws_board_rev;
50
51 if (visws_board_type == VISWS_320 && visws_board_rev == 5)
52 return 5;
53 return CO_APIC_IDE0;
54}
55
56static int is_co_apic(unsigned int irq)
57{
58 if (IS_CO_APIC(irq))
59 return CO_APIC(irq);
60
61 switch (irq) {
62 case 0: return CO_APIC_CPU;
63 case CO_IRQ_IDE0: return co_apic_ide0_hack();
64 case CO_IRQ_IDE1: return CO_APIC_IDE1;
65 default: return -1;
66 }
67}
68
69
70/*
71 * This is the SGI Cobalt (IO-)APIC:
72 */
73
74static void enable_cobalt_irq(unsigned int irq)
75{
76 co_apic_set(is_co_apic(irq), irq);
77}
78
79static void disable_cobalt_irq(unsigned int irq)
80{
81 int entry = is_co_apic(irq);
82
83 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
84 co_apic_read(CO_APIC_LO(entry));
85}
86
87/*
88 * "irq" really just serves to identify the device. Here is where we
89 * map this to the Cobalt APIC entry where it's physically wired.
90 * This is called via request_irq -> setup_irq -> irq_desc->startup()
91 */
92static unsigned int startup_cobalt_irq(unsigned int irq)
93{
94 unsigned long flags;
95
96 spin_lock_irqsave(&cobalt_lock, flags);
97 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
98 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
99 enable_cobalt_irq(irq);
100 spin_unlock_irqrestore(&cobalt_lock, flags);
101 return 0;
102}
103
104static void ack_cobalt_irq(unsigned int irq)
105{
106 unsigned long flags;
107
108 spin_lock_irqsave(&cobalt_lock, flags);
109 disable_cobalt_irq(irq);
110 apic_write(APIC_EOI, APIC_EIO_ACK);
111 spin_unlock_irqrestore(&cobalt_lock, flags);
112}
113
114static void end_cobalt_irq(unsigned int irq)
115{
116 unsigned long flags;
117
118 spin_lock_irqsave(&cobalt_lock, flags);
119 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
120 enable_cobalt_irq(irq);
121 spin_unlock_irqrestore(&cobalt_lock, flags);
122}
123
124static struct irq_chip cobalt_irq_type = {
125 .typename = "Cobalt-APIC",
126 .startup = startup_cobalt_irq,
127 .shutdown = disable_cobalt_irq,
128 .enable = enable_cobalt_irq,
129 .disable = disable_cobalt_irq,
130 .ack = ack_cobalt_irq,
131 .end = end_cobalt_irq,
132};
133
134
135/*
136 * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
137 * -- not the manner expected by the code in i8259.c.
138 *
139 * there is a 'master' physical interrupt source that gets sent to
140 * the CPU. But in the chipset there are various 'virtual' interrupts
141 * waiting to be handled. We represent this to Linux through a 'master'
142 * interrupt controller type, and through a special virtual interrupt-
143 * controller. Device drivers only see the virtual interrupt sources.
144 */
145static unsigned int startup_piix4_master_irq(unsigned int irq)
146{
147 init_8259A(0);
148
149 return startup_cobalt_irq(irq);
150}
151
152static void end_piix4_master_irq(unsigned int irq)
153{
154 unsigned long flags;
155
156 spin_lock_irqsave(&cobalt_lock, flags);
157 enable_cobalt_irq(irq);
158 spin_unlock_irqrestore(&cobalt_lock, flags);
159}
160
161static struct irq_chip piix4_master_irq_type = {
162 .typename = "PIIX4-master",
163 .startup = startup_piix4_master_irq,
164 .ack = ack_cobalt_irq,
165 .end = end_piix4_master_irq,
166};
167
168
169static struct irq_chip piix4_virtual_irq_type = {
170 .typename = "PIIX4-virtual",
171 .shutdown = disable_8259A_irq,
172 .enable = enable_8259A_irq,
173 .disable = disable_8259A_irq,
174};
175
176
177/*
178 * PIIX4-8259 master/virtual functions to handle interrupt requests
179 * from legacy devices: floppy, parallel, serial, rtc.
180 *
181 * None of these get Cobalt APIC entries, neither do they have IDT
182 * entries. These interrupts are purely virtual and distributed from
183 * the 'master' interrupt source: CO_IRQ_8259.
184 *
185 * When the 8259 interrupts, its handler figures out which of these
186 * devices is interrupting and dispatches to its handler.
187 *
188 * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
189 * enable_irq gets the right irq. This 'master' irq is never directly
190 * manipulated by any driver.
191 */
192static irqreturn_t piix4_master_intr(int irq, void *dev_id)
193{
194 int realirq;
195 irq_desc_t *desc;
196 unsigned long flags;
197
198 spin_lock_irqsave(&i8259A_lock, flags);
199
200 /* Find out what's interrupting in the PIIX4 master 8259 */
201 outb(0x0c, 0x20); /* OCW3 Poll command */
202 realirq = inb(0x20);
203
204 /*
205 * Bit 7 == 0 means invalid/spurious
206 */
207 if (unlikely(!(realirq & 0x80)))
208 goto out_unlock;
209
210 realirq &= 7;
211
212 if (unlikely(realirq == 2)) {
213 outb(0x0c, 0xa0);
214 realirq = inb(0xa0);
215
216 if (unlikely(!(realirq & 0x80)))
217 goto out_unlock;
218
219 realirq = (realirq & 7) + 8;
220 }
221
222 /* mask and ack interrupt */
223 cached_irq_mask |= 1 << realirq;
224 if (unlikely(realirq > 7)) {
225 inb(0xa1);
226 outb(cached_slave_mask, 0xa1);
227 outb(0x60 + (realirq & 7), 0xa0);
228 outb(0x60 + 2, 0x20);
229 } else {
230 inb(0x21);
231 outb(cached_master_mask, 0x21);
232 outb(0x60 + realirq, 0x20);
233 }
234
235 spin_unlock_irqrestore(&i8259A_lock, flags);
236
237 desc = irq_desc + realirq;
238
239 /*
240 * handle this 'virtual interrupt' as a Cobalt one now.
241 */
242 kstat_cpu(smp_processor_id()).irqs[realirq]++;
243
244 if (likely(desc->action != NULL))
245 handle_IRQ_event(realirq, desc->action);
246
247 if (!(desc->status & IRQ_DISABLED))
248 enable_8259A_irq(realirq);
249
250 return IRQ_HANDLED;
251
252out_unlock:
253 spin_unlock_irqrestore(&i8259A_lock, flags);
254 return IRQ_NONE;
255}
256
257static struct irqaction master_action = {
258 .handler = piix4_master_intr,
259 .name = "PIIX4-8259",
260};
261
262static struct irqaction cascade_action = {
263 .handler = no_action,
264 .name = "cascade",
265};
266
267
268void init_VISWS_APIC_irqs(void)
269{
270 int i;
271
272 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
273 irq_desc[i].status = IRQ_DISABLED;
274 irq_desc[i].action = 0;
275 irq_desc[i].depth = 1;
276
277 if (i == 0) {
278 irq_desc[i].chip = &cobalt_irq_type;
279 }
280 else if (i == CO_IRQ_IDE0) {
281 irq_desc[i].chip = &cobalt_irq_type;
282 }
283 else if (i == CO_IRQ_IDE1) {
284 irq_desc[i].chip = &cobalt_irq_type;
285 }
286 else if (i == CO_IRQ_8259) {
287 irq_desc[i].chip = &piix4_master_irq_type;
288 }
289 else if (i < CO_IRQ_APIC0) {
290 irq_desc[i].chip = &piix4_virtual_irq_type;
291 }
292 else if (IS_CO_APIC(i)) {
293 irq_desc[i].chip = &cobalt_irq_type;
294 }
295 }
296
297 setup_irq(CO_IRQ_8259, &master_action);
298 setup_irq(2, &cascade_action);
299}
diff --git a/arch/x86/mach-voyager/Makefile b/arch/x86/mach-voyager/Makefile
new file mode 100644
index 000000000000..15c250b371d3
--- /dev/null
+++ b/arch/x86/mach-voyager/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for the linux kernel.
3#
4
5EXTRA_CFLAGS := -Iarch/x86/kernel
6obj-y := setup.o voyager_basic.o voyager_thread.o
7
8obj-$(CONFIG_SMP) += voyager_smp.o voyager_cat.o
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
new file mode 100644
index 000000000000..2b55694e6400
--- /dev/null
+++ b/arch/x86/mach-voyager/setup.c
@@ -0,0 +1,125 @@
1/*
2 * Machine specific setup for generic
3 */
4
5#include <linux/init.h>
6#include <linux/interrupt.h>
7#include <asm/arch_hooks.h>
8#include <asm/voyager.h>
9#include <asm/e820.h>
10#include <asm/io.h>
11#include <asm/setup.h>
12
13void __init pre_intr_init_hook(void)
14{
15 init_ISA_irqs();
16}
17
18/*
19 * IRQ2 is cascade interrupt to second interrupt controller
20 */
21static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
22
23void __init intr_init_hook(void)
24{
25#ifdef CONFIG_SMP
26 smp_intr_init();
27#endif
28
29 setup_irq(2, &irq2);
30}
31
32void __init pre_setup_arch_hook(void)
33{
34 /* Voyagers run their CPUs from independent clocks, so disable
35 * the TSC code because we can't sync them */
36 tsc_disable = 1;
37}
38
39void __init trap_init_hook(void)
40{
41}
42
43static struct irqaction irq0 = {
44 .handler = timer_interrupt,
45 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
46 .mask = CPU_MASK_NONE,
47 .name = "timer"
48};
49
50void __init time_init_hook(void)
51{
52 irq0.mask = cpumask_of_cpu(safe_smp_processor_id());
53 setup_irq(0, &irq0);
54}
55
56/* Hook for machine specific memory setup. */
57
58char * __init machine_specific_memory_setup(void)
59{
60 char *who;
61
62 who = "NOT VOYAGER";
63
64 if(voyager_level == 5) {
65 __u32 addr, length;
66 int i;
67
68 who = "Voyager-SUS";
69
70 e820.nr_map = 0;
71 for(i=0; voyager_memory_detect(i, &addr, &length); i++) {
72 add_memory_region(addr, length, E820_RAM);
73 }
74 return who;
75 } else if(voyager_level == 4) {
76 __u32 tom;
77 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8;
78 /* select the DINO config space */
79 outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT);
80 /* Read DINO top of memory register */
81 tom = ((inb(catbase + 0x4) & 0xf0) << 16)
82 + ((inb(catbase + 0x5) & 0x7f) << 24);
83
84 if(inb(catbase) != VOYAGER_DINO) {
85 printk(KERN_ERR "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n");
86 tom = (EXT_MEM_K)<<10;
87 }
88 who = "Voyager-TOM";
89 add_memory_region(0, 0x9f000, E820_RAM);
90 /* map from 1M to top of memory */
91 add_memory_region(1*1024*1024, tom - 1*1024*1024, E820_RAM);
92 /* FIXME: Should check the ASICs to see if I need to
93 * take out the 8M window. Just do it at the moment
94 * */
95 add_memory_region(8*1024*1024, 8*1024*1024, E820_RESERVED);
96 return who;
97 }
98
99 who = "BIOS-e820";
100
101 /*
102 * Try to copy the BIOS-supplied E820-map.
103 *
104 * Otherwise fake a memory map; one section from 0k->640k,
105 * the next section from 1mb->appropriate_mem_k
106 */
107 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
108 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
109 unsigned long mem_size;
110
111 /* compare results from other methods and take the greater */
112 if (ALT_MEM_K < EXT_MEM_K) {
113 mem_size = EXT_MEM_K;
114 who = "BIOS-88";
115 } else {
116 mem_size = ALT_MEM_K;
117 who = "BIOS-e801";
118 }
119
120 e820.nr_map = 0;
121 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
122 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
123 }
124 return who;
125}
diff --git a/arch/x86/mach-voyager/voyager_basic.c b/arch/x86/mach-voyager/voyager_basic.c
new file mode 100644
index 000000000000..9b77b39b71a6
--- /dev/null
+++ b/arch/x86/mach-voyager/voyager_basic.c
@@ -0,0 +1,331 @@
1/* Copyright (C) 1999,2001
2 *
3 * Author: J.E.J.Bottomley@HansenPartnership.com
4 *
5 * linux/arch/i386/kernel/voyager.c
6 *
7 * This file contains all the voyager specific routines for getting
8 * initialisation of the architecture to function. For additional
9 * features see:
10 *
11 * voyager_cat.c - Voyager CAT bus interface
12 * voyager_smp.c - Voyager SMP hal (emulates linux smp.c)
13 */
14
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/sched.h>
18#include <linux/ptrace.h>
19#include <linux/ioport.h>
20#include <linux/interrupt.h>
21#include <linux/init.h>
22#include <linux/delay.h>
23#include <linux/reboot.h>
24#include <linux/sysrq.h>
25#include <linux/smp.h>
26#include <linux/nodemask.h>
27#include <asm/io.h>
28#include <asm/voyager.h>
29#include <asm/vic.h>
30#include <linux/pm.h>
31#include <asm/tlbflush.h>
32#include <asm/arch_hooks.h>
33#include <asm/i8253.h>
34
35/*
36 * Power off function, if any
37 */
38void (*pm_power_off)(void);
39EXPORT_SYMBOL(pm_power_off);
40
41int voyager_level = 0;
42
43struct voyager_SUS *voyager_SUS = NULL;
44
45#ifdef CONFIG_SMP
46static void
47voyager_dump(int dummy1, struct tty_struct *dummy3)
48{
49 /* get here via a sysrq */
50 voyager_smp_dump();
51}
52
53static struct sysrq_key_op sysrq_voyager_dump_op = {
54 .handler = voyager_dump,
55 .help_msg = "Voyager",
56 .action_msg = "Dump Voyager Status",
57};
58#endif
59
60void
61voyager_detect(struct voyager_bios_info *bios)
62{
63 if(bios->len != 0xff) {
64 int class = (bios->class_1 << 8)
65 | (bios->class_2 & 0xff);
66
67 printk("Voyager System detected.\n"
68 " Class %x, Revision %d.%d\n",
69 class, bios->major, bios->minor);
70 if(class == VOYAGER_LEVEL4)
71 voyager_level = 4;
72 else if(class < VOYAGER_LEVEL5_AND_ABOVE)
73 voyager_level = 3;
74 else
75 voyager_level = 5;
76 printk(" Architecture Level %d\n", voyager_level);
77 if(voyager_level < 4)
78 printk("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n");
79 /* install the power off handler */
80 pm_power_off = voyager_power_off;
81#ifdef CONFIG_SMP
82 register_sysrq_key('v', &sysrq_voyager_dump_op);
83#endif
84 } else {
85 printk("\n\n**WARNING**: No Voyager Subsystem Found\n");
86 }
87}
88
89void
90voyager_system_interrupt(int cpl, void *dev_id)
91{
92 printk("Voyager: detected system interrupt\n");
93}
94
95/* Routine to read information from the extended CMOS area */
96__u8
97voyager_extended_cmos_read(__u16 addr)
98{
99 outb(addr & 0xff, 0x74);
100 outb((addr >> 8) & 0xff, 0x75);
101 return inb(0x76);
102}
103
104/* internal definitions for the SUS Click Map of memory */
105
106#define CLICK_ENTRIES 16
107#define CLICK_SIZE 4096 /* click to byte conversion for Length */
108
109typedef struct ClickMap {
110 struct Entry {
111 __u32 Address;
112 __u32 Length;
113 } Entry[CLICK_ENTRIES];
114} ClickMap_t;
115
116
117/* This routine is pretty much an awful hack to read the bios clickmap by
118 * mapping it into page 0. There are usually three regions in the map:
119 * Base Memory
120 * Extended Memory
121 * zero length marker for end of map
122 *
123 * Returns 0 on failure and 1 on success in extracting the region.
124 */
125int __init
126voyager_memory_detect(int region, __u32 *start, __u32 *length)
127{
128 int i;
129 int retval = 0;
130 __u8 cmos[4];
131 ClickMap_t *map;
132 unsigned long map_addr;
133 unsigned long old;
134
135 if(region >= CLICK_ENTRIES) {
136 printk("Voyager: Illegal ClickMap region %d\n", region);
137 return 0;
138 }
139
140 for(i = 0; i < sizeof(cmos); i++)
141 cmos[i] = voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i);
142
143 map_addr = *(unsigned long *)cmos;
144
145 /* steal page 0 for this */
146 old = pg0[0];
147 pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
148 local_flush_tlb();
149 /* now clear everything out but page 0 */
150 map = (ClickMap_t *)(map_addr & (~PAGE_MASK));
151
152 /* zero length is the end of the clickmap */
153 if(map->Entry[region].Length != 0) {
154 *length = map->Entry[region].Length * CLICK_SIZE;
155 *start = map->Entry[region].Address;
156 retval = 1;
157 }
158
159 /* replace the mapping */
160 pg0[0] = old;
161 local_flush_tlb();
162 return retval;
163}
164
165/* voyager specific handling code for timer interrupts. Used to hand
166 * off the timer tick to the SMP code, since the VIC doesn't have an
167 * internal timer (The QIC does, but that's another story). */
168void
169voyager_timer_interrupt(void)
170{
171 if((jiffies & 0x3ff) == 0) {
172
173 /* There seems to be something flaky in either
174 * hardware or software that is resetting the timer 0
175 * count to something much higher than it should be
176 * This seems to occur in the boot sequence, just
177 * before root is mounted. Therefore, every 10
178 * seconds or so, we sanity check the timer zero count
179 * and kick it back to where it should be.
180 *
181 * FIXME: This is the most awful hack yet seen. I
182 * should work out exactly what is interfering with
183 * the timer count settings early in the boot sequence
184 * and swiftly introduce it to something sharp and
185 * pointy. */
186 __u16 val;
187
188 spin_lock(&i8253_lock);
189
190 outb_p(0x00, 0x43);
191 val = inb_p(0x40);
192 val |= inb(0x40) << 8;
193 spin_unlock(&i8253_lock);
194
195 if(val > LATCH) {
196 printk("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", val);
197 spin_lock(&i8253_lock);
198 outb(0x34,0x43);
199 outb_p(LATCH & 0xff , 0x40); /* LSB */
200 outb(LATCH >> 8 , 0x40); /* MSB */
201 spin_unlock(&i8253_lock);
202 }
203 }
204#ifdef CONFIG_SMP
205 smp_vic_timer_interrupt();
206#endif
207}
208
209void
210voyager_power_off(void)
211{
212 printk("VOYAGER Power Off\n");
213
214 if(voyager_level == 5) {
215 voyager_cat_power_off();
216 } else if(voyager_level == 4) {
217 /* This doesn't apparently work on most L4 machines,
218 * but the specs say to do this to get automatic power
219 * off. Unfortunately, if it doesn't power off the
220 * machine, it ends up doing a cold restart, which
221 * isn't really intended, so comment out the code */
222#if 0
223 int port;
224
225
226 /* enable the voyager Configuration Space */
227 outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8,
228 VOYAGER_MC_SETUP);
229 /* the port for the power off flag is an offset from the
230 floating base */
231 port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21;
232 /* set the power off flag */
233 outb(inb(port) | 0x1, port);
234#endif
235 }
236 /* and wait for it to happen */
237 local_irq_disable();
238 for(;;)
239 halt();
240}
241
242/* copied from process.c */
243static inline void
244kb_wait(void)
245{
246 int i;
247
248 for (i=0; i<0x10000; i++)
249 if ((inb_p(0x64) & 0x02) == 0)
250 break;
251}
252
253void
254machine_shutdown(void)
255{
256 /* Architecture specific shutdown needed before a kexec */
257}
258
259void
260machine_restart(char *cmd)
261{
262 printk("Voyager Warm Restart\n");
263 kb_wait();
264
265 if(voyager_level == 5) {
266 /* write magic values to the RTC to inform system that
267 * shutdown is beginning */
268 outb(0x8f, 0x70);
269 outb(0x5 , 0x71);
270
271 udelay(50);
272 outb(0xfe,0x64); /* pull reset low */
273 } else if(voyager_level == 4) {
274 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8;
275 __u8 basebd = inb(VOYAGER_MC_SETUP);
276
277 outb(basebd | 0x08, VOYAGER_MC_SETUP);
278 outb(0x02, catbase + 0x21);
279 }
280 local_irq_disable();
281 for(;;)
282 halt();
283}
284
285void
286machine_emergency_restart(void)
287{
288 /* for now, just hook this to a warm restart */
289 machine_restart(NULL);
290}
291
292void
293mca_nmi_hook(void)
294{
295 __u8 dumpval __maybe_unused = inb(0xf823);
296 __u8 swnmi __maybe_unused = inb(0xf813);
297
298 /* FIXME: assume dump switch pressed */
299 /* check to see if the dump switch was pressed */
300 VDEBUG(("VOYAGER: dumpval = 0x%x, swnmi = 0x%x\n", dumpval, swnmi));
301 /* clear swnmi */
302 outb(0xff, 0xf813);
303 /* tell SUS to ignore dump */
304 if(voyager_level == 5 && voyager_SUS != NULL) {
305 if(voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) {
306 voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND;
307 voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS;
308 udelay(1000);
309 voyager_SUS->kernel_mbox = VOYAGER_IGNORE_DUMP;
310 voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS;
311 }
312 }
313 printk(KERN_ERR "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", smp_processor_id());
314 show_stack(NULL, NULL);
315 show_state();
316}
317
318
319
320void
321machine_halt(void)
322{
323 /* treat a halt like a power off */
324 machine_power_off();
325}
326
327void machine_power_off(void)
328{
329 if (pm_power_off)
330 pm_power_off();
331}
diff --git a/arch/x86/mach-voyager/voyager_cat.c b/arch/x86/mach-voyager/voyager_cat.c
new file mode 100644
index 000000000000..26a2d4c54b68
--- /dev/null
+++ b/arch/x86/mach-voyager/voyager_cat.c
@@ -0,0 +1,1180 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 1999,2001
4 *
5 * Author: J.E.J.Bottomley@HansenPartnership.com
6 *
7 * linux/arch/i386/kernel/voyager_cat.c
8 *
9 * This file contains all the logic for manipulating the CAT bus
10 * in a level 5 machine.
11 *
12 * The CAT bus is a serial configuration and test bus. Its primary
13 * uses are to probe the initial configuration of the system and to
14 * diagnose error conditions when a system interrupt occurs. The low
15 * level interface is fairly primitive, so most of this file consists
16 * of bit shift manipulations to send and receive packets on the
17 * serial bus */
18
19#include <linux/types.h>
20#include <linux/completion.h>
21#include <linux/sched.h>
22#include <asm/voyager.h>
23#include <asm/vic.h>
24#include <linux/ioport.h>
25#include <linux/init.h>
26#include <linux/slab.h>
27#include <linux/delay.h>
28#include <asm/io.h>
29
30#ifdef VOYAGER_CAT_DEBUG
31#define CDEBUG(x) printk x
32#else
33#define CDEBUG(x)
34#endif
35
36/* the CAT command port */
37#define CAT_CMD (sspb + 0xe)
38/* the CAT data port */
39#define CAT_DATA (sspb + 0xd)
40
41/* the internal cat functions */
42static void cat_pack(__u8 *msg, __u16 start_bit, __u8 *data,
43 __u16 num_bits);
44static void cat_unpack(__u8 *msg, __u16 start_bit, __u8 *data,
45 __u16 num_bits);
46static void cat_build_header(__u8 *header, const __u16 len,
47 const __u16 smallest_reg_bits,
48 const __u16 longest_reg_bits);
49static int cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp,
50 __u8 reg, __u8 op);
51static int cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp,
52 __u8 reg, __u8 *value);
53static int cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes,
54 __u8 pad_bits);
55static int cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
56 __u8 value);
57static int cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
58 __u8 *value);
59static int cat_subread(voyager_module_t *modp, voyager_asic_t *asicp,
60 __u16 offset, __u16 len, void *buf);
61static int cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp,
62 __u8 reg, __u8 value);
63static int cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp);
64static int cat_connect(voyager_module_t *modp, voyager_asic_t *asicp);
65
66static inline const char *
67cat_module_name(int module_id)
68{
69 switch(module_id) {
70 case 0x10:
71 return "Processor Slot 0";
72 case 0x11:
73 return "Processor Slot 1";
74 case 0x12:
75 return "Processor Slot 2";
76 case 0x13:
77 return "Processor Slot 4";
78 case 0x14:
79 return "Memory Slot 0";
80 case 0x15:
81 return "Memory Slot 1";
82 case 0x18:
83 return "Primary Microchannel";
84 case 0x19:
85 return "Secondary Microchannel";
86 case 0x1a:
87 return "Power Supply Interface";
88 case 0x1c:
89 return "Processor Slot 5";
90 case 0x1d:
91 return "Processor Slot 6";
92 case 0x1e:
93 return "Processor Slot 7";
94 case 0x1f:
95 return "Processor Slot 8";
96 default:
97 return "Unknown Module";
98 }
99}
100
101static int sspb = 0; /* stores the super port location */
102int voyager_8slot = 0; /* set to true if a 51xx monster */
103
104voyager_module_t *voyager_cat_list;
105
106/* the I/O port assignments for the VIC and QIC */
107static struct resource vic_res = {
108 .name = "Voyager Interrupt Controller",
109 .start = 0xFC00,
110 .end = 0xFC6F
111};
112static struct resource qic_res = {
113 .name = "Quad Interrupt Controller",
114 .start = 0xFC70,
115 .end = 0xFCFF
116};
117
118/* This function is used to pack a data bit stream inside a message.
119 * It writes num_bits of the data buffer in msg starting at start_bit.
120 * Note: This function assumes that any unused bit in the data stream
121 * is set to zero so that the ors will work correctly */
122static void
123cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
124{
125 /* compute initial shift needed */
126 const __u16 offset = start_bit % BITS_PER_BYTE;
127 __u16 len = num_bits / BITS_PER_BYTE;
128 __u16 byte = start_bit / BITS_PER_BYTE;
129 __u16 residue = (num_bits % BITS_PER_BYTE) + offset;
130 int i;
131
132 /* adjust if we have more than a byte of residue */
133 if(residue >= BITS_PER_BYTE) {
134 residue -= BITS_PER_BYTE;
135 len++;
136 }
137
138 /* clear out the bits. We assume here that if len==0 then
139 * residue >= offset. This is always true for the catbus
140 * operations */
141 msg[byte] &= 0xff << (BITS_PER_BYTE - offset);
142 msg[byte++] |= data[0] >> offset;
143 if(len == 0)
144 return;
145 for(i = 1; i < len; i++)
146 msg[byte++] = (data[i-1] << (BITS_PER_BYTE - offset))
147 | (data[i] >> offset);
148 if(residue != 0) {
149 __u8 mask = 0xff >> residue;
150 __u8 last_byte = data[i-1] << (BITS_PER_BYTE - offset)
151 | (data[i] >> offset);
152
153 last_byte &= ~mask;
154 msg[byte] &= mask;
155 msg[byte] |= last_byte;
156 }
157 return;
158}
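/*
 * Worked example (sketch): cat_pack(msg, 10, data, 4) takes the four
 * most-significant bits of data[0] (the rest of data must be zero) and
 * lands them at bit positions 10..13 of msg, counting bit 0 as the MSB
 * of msg[0].
 */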
159/* unpack the data again (same arguments as cat_pack()). data buffer
160 * must be zero-filled.
161 *
162 * Function: given a message string move to start_bit and copy num_bits into
163 * data (starting at bit 0 in data).
164 */
165static void
166cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
167{
168 /* compute initial shift needed */
169 const __u16 offset = start_bit % BITS_PER_BYTE;
170 __u16 len = num_bits / BITS_PER_BYTE;
171 const __u8 last_bits = num_bits % BITS_PER_BYTE;
172 __u16 byte = start_bit / BITS_PER_BYTE;
173 int i;
174
175 if(last_bits != 0)
176 len++;
177
178 /* special case: want < 8 bits from msg and we can get it from
179 * a single byte of the msg */
180 if(len == 0 && BITS_PER_BYTE - offset >= num_bits) {
181 data[0] = msg[byte] << offset;
182 data[0] &= 0xff >> (BITS_PER_BYTE - num_bits);
183 return;
184 }
185 for(i = 0; i < len; i++) {
186 /* this annoying if has to be done just in case a read of
187 * msg one beyond the array causes a panic */
188 if(offset != 0) {
189 data[i] = msg[byte++] << offset;
190 data[i] |= msg[byte] >> (BITS_PER_BYTE - offset);
191 }
192 else {
193 data[i] = msg[byte++];
194 }
195 }
196 /* do we need to truncate the final byte */
197 if(last_bits != 0) {
198 data[i-1] &= 0xff << (BITS_PER_BYTE - last_bits);
199 }
200 return;
201}
202
203static void
204cat_build_header(__u8 *header, const __u16 len, const __u16 smallest_reg_bits,
205 const __u16 longest_reg_bits)
206{
207 int i;
208 __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE;
209 __u8 *last_byte = &header[len - 1];
210
211 if(start_bit == 0)
212 start_bit = 1; /* must have at least one bit in the hdr */
213
214 for(i=0; i < len; i++)
215 header[i] = 0;
216
217 for(i = start_bit; i > 0; i--)
218 *last_byte = ((*last_byte) << 1) + 1;
219
220}
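/*
 * Worked example (sketch): cat_build_header(hdr, 2, 5, ...) zeroes both
 * header bytes and then sets (5 - 1) % 8 = 4 trailing '1' bits in the
 * last one, leaving hdr[] == { 0x00, 0x0f }.
 */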
221
222static int
223cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op)
224{
225 __u8 parity, inst, inst_buf[4] = { 0 };
226 __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE];
227 __u16 ibytes, hbytes, padbits;
228 int i;
229
230 /*
231 * Parity is the parity of the register number + 1 (READ_REGISTER
232 * and WRITE_REGISTER always add one to the count of '1' bits)
233 */
234 parity = (__u8)(1 + (reg & 0x01) +
235 ((__u8)(reg & 0x02) >> 1) +
236 ((__u8)(reg & 0x04) >> 2) +
237 ((__u8)(reg & 0x08) >> 3)) % 2;
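	/*
	 * e.g. reg 0x05 has two '1' bits, so parity = (1 + 2) % 2 = 1 and
	 * bit 7 of the instruction byte below will be set.
	 */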
238
239 inst = ((parity << 7) | (reg << 2) | op);
240
241 outb(VOYAGER_CAT_IRCYC, CAT_CMD);
242 if(!modp->scan_path_connected) {
243 if(asicp->asic_id != VOYAGER_CAT_ID) {
244 printk("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n");
245 return 1;
246 }
247 outb(VOYAGER_CAT_HEADER, CAT_DATA);
248 outb(inst, CAT_DATA);
249 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
250 CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n"));
251 return 1;
252 }
253 return 0;
254 }
255 ibytes = modp->inst_bits / BITS_PER_BYTE;
256 if((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) {
257 padbits = BITS_PER_BYTE - padbits;
258 ibytes++;
259 }
260 hbytes = modp->largest_reg / BITS_PER_BYTE;
261 if(modp->largest_reg % BITS_PER_BYTE)
262 hbytes++;
263 CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes));
264 /* initialise the instruction sequence to 0xff */
265 for(i=0; i < ibytes + hbytes; i++)
266 iseq[i] = 0xff;
267 cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg);
268 cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE);
269 inst_buf[0] = inst;
270 inst_buf[1] = 0xFF >> (modp->largest_reg % BITS_PER_BYTE);
271 cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length);
272#ifdef VOYAGER_CAT_DEBUG
273 printk("ins = 0x%x, iseq: ", inst);
274 for(i=0; i< ibytes + hbytes; i++)
275 printk("0x%x ", iseq[i]);
276 printk("\n");
277#endif
278 if(cat_shiftout(iseq, ibytes, hbytes, padbits)) {
279 CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n"));
280 return 1;
281 }
282 CDEBUG(("CAT SHIFTOUT DONE\n"));
283 return 0;
284}
285
286static int
287cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
288 __u8 *value)
289{
290 if(!modp->scan_path_connected) {
291 if(asicp->asic_id != VOYAGER_CAT_ID) {
292 CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n"));
293 return 1;
294 }
295 if(reg > VOYAGER_SUBADDRHI)
296 outb(VOYAGER_CAT_RUN, CAT_CMD);
297 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
298 outb(VOYAGER_CAT_HEADER, CAT_DATA);
299 *value = inb(CAT_DATA);
300 outb(0xAA, CAT_DATA);
301 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
302 CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n"));
303 return 1;
304 }
305 return 0;
306 }
307 else {
308 __u16 sbits = modp->num_asics -1 + asicp->ireg_length;
309 __u16 sbytes = sbits / BITS_PER_BYTE;
310 __u16 tbytes;
311 __u8 string[VOYAGER_MAX_SCAN_PATH], trailer[VOYAGER_MAX_REG_SIZE];
312 __u8 padbits;
313 int i;
314
315 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
316
317 if((padbits = sbits % BITS_PER_BYTE) != 0) {
318 padbits = BITS_PER_BYTE - padbits;
319 sbytes++;
320 }
321 tbytes = asicp->ireg_length / BITS_PER_BYTE;
322 if(asicp->ireg_length % BITS_PER_BYTE)
323 tbytes++;
324 CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n",
325 tbytes, sbytes, padbits));
326 cat_build_header(trailer, tbytes, 1, asicp->ireg_length);
327
328
329 for(i = tbytes - 1; i >= 0; i--) {
330 outb(trailer[i], CAT_DATA);
331 string[sbytes + i] = inb(CAT_DATA);
332 }
333
334 for(i = sbytes - 1; i >= 0; i--) {
335 outb(0xaa, CAT_DATA);
336 string[i] = inb(CAT_DATA);
337 }
338 *value = 0;
339 cat_unpack(string, padbits + (tbytes * BITS_PER_BYTE) + asicp->asic_location, value, asicp->ireg_length);
340#ifdef VOYAGER_CAT_DEBUG
341 printk("value=0x%x, string: ", *value);
342 for(i=0; i< tbytes+sbytes; i++)
343 printk("0x%x ", string[i]);
344 printk("\n");
345#endif
346
347 /* sanity check the rest of the return */
348 for(i=0; i < tbytes; i++) {
349 __u8 input = 0;
350
351 cat_unpack(string, padbits + (i * BITS_PER_BYTE), &input, BITS_PER_BYTE);
352 if(trailer[i] != input) {
353 CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i]));
354 return 1;
355 }
356 }
357 CDEBUG(("cat_getdata DONE\n"));
358 return 0;
359 }
360}
361
362static int
363cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
364{
365 int i;
366
367 for(i = data_bytes + header_bytes - 1; i >= header_bytes; i--)
368 outb(data[i], CAT_DATA);
369
370 for(i = header_bytes - 1; i >= 0; i--) {
371 __u8 header = 0;
372 __u8 input;
373
374 outb(data[i], CAT_DATA);
375 input = inb(CAT_DATA);
376 CDEBUG(("cat_shiftout: returned 0x%x\n", input));
377 cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits,
378 &header, BITS_PER_BYTE);
379 if(input != header) {
380 CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header));
381 return 1;
382 }
383 }
384 return 0;
385}
386
387static int
388cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp,
389 __u8 reg, __u8 value)
390{
391 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
392 if(!modp->scan_path_connected) {
393 if(asicp->asic_id != VOYAGER_CAT_ID) {
394 CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n"));
395 return 1;
396 }
397 outb(VOYAGER_CAT_HEADER, CAT_DATA);
398 outb(value, CAT_DATA);
399 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
400 CDEBUG(("cat_senddata: failed to get correct header response to sent data\n"));
401 return 1;
402 }
403 if(reg > VOYAGER_SUBADDRHI) {
404 outb(VOYAGER_CAT_RUN, CAT_CMD);
405 outb(VOYAGER_CAT_END, CAT_CMD);
406 outb(VOYAGER_CAT_RUN, CAT_CMD);
407 }
408
409 return 0;
410 }
411 else {
412 __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE;
413 __u16 dbytes = (modp->num_asics - 1 + asicp->ireg_length)/BITS_PER_BYTE;
414 __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH],
415 hseq[VOYAGER_MAX_REG_SIZE];
416 int i;
417
418 if((padbits = (modp->num_asics - 1
419 + asicp->ireg_length) % BITS_PER_BYTE) != 0) {
420 padbits = BITS_PER_BYTE - padbits;
421 dbytes++;
422 }
423 if(asicp->ireg_length % BITS_PER_BYTE)
424 hbytes++;
425
426 cat_build_header(hseq, hbytes, 1, asicp->ireg_length);
427
428 for(i = 0; i < dbytes + hbytes; i++)
429 dseq[i] = 0xff;
430 CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n",
431 dbytes, hbytes, padbits));
432 cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length,
433 hseq, hbytes * BITS_PER_BYTE);
434 cat_pack(dseq, asicp->asic_location, &value,
435 asicp->ireg_length);
436#ifdef VOYAGER_CAT_DEBUG
437 printk("dseq ");
438 for(i=0; i<hbytes+dbytes; i++) {
439 printk("0x%x ", dseq[i]);
440 }
441 printk("\n");
442#endif
443 return cat_shiftout(dseq, dbytes, hbytes, padbits);
444 }
445}
446
447static int
448cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
449 __u8 value)
450{
451 if(cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG))
452 return 1;
453 return cat_senddata(modp, asicp, reg, value);
454}
455
456static int
457cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
458 __u8 *value)
459{
460 if(cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG))
461 return 1;
462 return cat_getdata(modp, asicp, reg, value);
463}
464
465static int
466cat_subaddrsetup(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset,
467 __u16 len)
468{
469 __u8 val;
470
471 if(len > 1) {
472 /* set auto increment */
473 __u8 newval;
474
475 if(cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) {
476 CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n"));
477 return 1;
478 }
479 CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", val));
480 newval = val | VOYAGER_AUTO_INC;
481 if(newval != val) {
482			if(cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, newval)) {
483 CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n"));
484 return 1;
485 }
486 }
487 }
488	if(cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8)(offset & 0xff))) {
489 CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n"));
490 return 1;
491 }
492 if(asicp->subaddr > VOYAGER_SUBADDR_LO) {
493 if(cat_write(modp, asicp, VOYAGER_SUBADDRHI, (__u8)(offset >> 8))) {
494 CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n"));
495 return 1;
496 }
497 cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val);
498 CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, val));
499 }
500 cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val);
501 CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val));
502 return 0;
503}
504
505static int
506cat_subwrite(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset,
507 __u16 len, void *buf)
508{
509 int i, retval;
510
511 /* FIXME: need special actions for VOYAGER_CAT_ID here */
512 if(asicp->asic_id == VOYAGER_CAT_ID) {
513 CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n"));
514 /* FIXME -- This is supposed to be handled better
515 * There is a problem writing to the cat asic in the
516 * PSI. The 30us delay seems to work, though */
517 udelay(30);
518 }
519
520 if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
521 printk("cat_subwrite: cat_subaddrsetup FAILED\n");
522 return retval;
523 }
524
525 if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) {
526 printk("cat_subwrite: cat_sendinst FAILED\n");
527 return 1;
528 }
529 for(i = 0; i < len; i++) {
530 if(cat_senddata(modp, asicp, 0xFF, ((__u8 *)buf)[i])) {
531			printk("cat_subwrite: cat_senddata element at %d FAILED\n", i);
532 return 1;
533 }
534 }
535 return 0;
536}
537static int
538cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset,
539 __u16 len, void *buf)
540{
541 int i, retval;
542
543 if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
544 CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n"));
545 return retval;
546 }
547
548 if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) {
549 CDEBUG(("cat_subread: cat_sendinst failed\n"));
550 return 1;
551 }
552 for(i = 0; i < len; i++) {
553 if(cat_getdata(modp, asicp, 0xFF,
554 &((__u8 *)buf)[i])) {
555 CDEBUG(("cat_subread: cat_getdata element %d failed\n", i));
556 return 1;
557 }
558 }
559 return 0;
560}
561
562
563/* buffer for storing EPROM data read in during initialisation */
564static __initdata __u8 eprom_buf[0xFFFF];
565static voyager_module_t *voyager_initial_module;
566
567/* Initialise the cat bus components. We assume this is called by the
568 * boot cpu *after* all memory initialisation has been done (so we can
569 * use kmalloc) but before smp initialisation, so we can probe the SMP
570 * configuration and pick up necessary information. */
571void
572voyager_cat_init(void)
573{
574 voyager_module_t **modpp = &voyager_initial_module;
575 voyager_asic_t **asicpp;
576 voyager_asic_t *qabc_asic = NULL;
577 int i, j;
578 unsigned long qic_addr = 0;
579 __u8 qabc_data[0x20];
580 __u8 num_submodules, val;
581 voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *)&eprom_buf[0];
582
583 __u8 cmos[4];
584 unsigned long addr;
585
586	/* initialise the SUS mailbox */
587 for(i=0; i<sizeof(cmos); i++)
588 cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i);
589 addr = *(unsigned long *)cmos;
590 if((addr & 0xff000000) != 0xff000000) {
591		printk(KERN_ERR "Voyager failed to get SUS mailbox (addr = 0x%lx)\n", addr);
592 } else {
593 static struct resource res;
594
595 res.name = "voyager SUS";
596 res.start = addr;
597 res.end = addr+0x3ff;
598
599 request_resource(&iomem_resource, &res);
600 voyager_SUS = (struct voyager_SUS *)
601 ioremap(addr, 0x400);
602 printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n",
603 voyager_SUS->SUS_version);
604 voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION;
605 voyager_SUS->kernel_flags = VOYAGER_OS_HAS_SYSINT;
606 }
607
608 /* clear the processor counts */
609 voyager_extended_vic_processors = 0;
610 voyager_quad_processors = 0;
611
612
613
614 printk("VOYAGER: beginning CAT bus probe\n");
615 /* set up the SuperSet Port Block which tells us where the
616 * CAT communication port is */
617 sspb = inb(VOYAGER_SSPB_RELOCATION_PORT) * 0x100;
618 VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb));
619
620	/* now find out if we're 8 slot or normal */
621 if((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER)
622 == EIGHT_SLOT_IDENTIFIER) {
623 voyager_8slot = 1;
624 printk(KERN_NOTICE "Voyager: Eight slot 51xx configuration detected\n");
625 }
626
627 for(i = VOYAGER_MIN_MODULE;
628 i <= VOYAGER_MAX_MODULE; i++) {
629 __u8 input;
630 int asic;
631 __u16 eprom_size;
632 __u16 sp_offset;
633
634 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
635 outb(i, VOYAGER_CAT_CONFIG_PORT);
636
637 /* check the presence of the module */
638 outb(VOYAGER_CAT_RUN, CAT_CMD);
639 outb(VOYAGER_CAT_IRCYC, CAT_CMD);
640 outb(VOYAGER_CAT_HEADER, CAT_DATA);
641		/* stream a series of alternating 1s and 0s to stimulate
642		 * a response */
643 outb(0xAA, CAT_DATA);
644 input = inb(CAT_DATA);
645 outb(VOYAGER_CAT_END, CAT_CMD);
646 if(input != VOYAGER_CAT_HEADER) {
647 continue;
648 }
649 CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i,
650 cat_module_name(i)));
651 *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++];*/
652 if(*modpp == NULL) {
653 printk("**WARNING** kmalloc failure in cat_init\n");
654 continue;
655 }
656 memset(*modpp, 0, sizeof(voyager_module_t));
657 /* need temporary asic for cat_subread. It will be
658 * filled in correctly later */
659 (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count];*/
660 if((*modpp)->asic == NULL) {
661 printk("**WARNING** kmalloc failure in cat_init\n");
662 continue;
663 }
664 memset((*modpp)->asic, 0, sizeof(voyager_asic_t));
665 (*modpp)->asic->asic_id = VOYAGER_CAT_ID;
666 (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI;
667 (*modpp)->module_addr = i;
668 (*modpp)->scan_path_connected = 0;
669 if(i == VOYAGER_PSI) {
670 /* Exception leg for modules with no EEPROM */
671 printk("Module \"%s\"\n", cat_module_name(i));
672 continue;
673 }
674
675 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
676 outb(VOYAGER_CAT_RUN, CAT_CMD);
677 cat_disconnect(*modpp, (*modpp)->asic);
678 if(cat_subread(*modpp, (*modpp)->asic,
679 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
680 &eprom_size)) {
681 printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i);
682 outb(VOYAGER_CAT_END, CAT_CMD);
683 continue;
684 }
685 if(eprom_size > sizeof(eprom_buf)) {
686 printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size);
687 outb(VOYAGER_CAT_END, CAT_CMD);
688 continue;
689 }
690 outb(VOYAGER_CAT_END, CAT_CMD);
691 outb(VOYAGER_CAT_RUN, CAT_CMD);
692 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size));
693 if(cat_subread(*modpp, (*modpp)->asic, 0,
694 eprom_size, eprom_buf)) {
695 outb(VOYAGER_CAT_END, CAT_CMD);
696 continue;
697 }
698 outb(VOYAGER_CAT_END, CAT_CMD);
699 printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n",
700 cat_module_name(i), eprom_hdr->version_id,
701 *((__u32 *)eprom_hdr->tracer), eprom_hdr->num_asics);
702 (*modpp)->ee_size = eprom_hdr->ee_size;
703 (*modpp)->num_asics = eprom_hdr->num_asics;
704 asicpp = &((*modpp)->asic);
705 sp_offset = eprom_hdr->scan_path_offset;
706 /* All we really care about are the Quad cards. We
707 * identify them because they are in a processor slot
708 * and have only four asics */
709 if((i < 0x10 || (i>=0x14 && i < 0x1c) || i>0x1f)) {
710 modpp = &((*modpp)->next);
711 continue;
712 }
713 /* Now we know it's in a processor slot, does it have
714		 * a quad baseboard submodule? */
715 outb(VOYAGER_CAT_RUN, CAT_CMD);
716 cat_read(*modpp, (*modpp)->asic, VOYAGER_SUBMODPRESENT,
717 &num_submodules);
718 /* lowest two bits, active low */
719 num_submodules = ~(0xfc | num_submodules);
720 CDEBUG(("VOYAGER CAT: %d submodules present\n", num_submodules));
721 if(num_submodules == 0) {
722 /* fill in the dyadic extended processors */
723 __u8 cpu = i & 0x07;
724
725 printk("Module \"%s\": Dyadic Processor Card\n",
726 cat_module_name(i));
727 voyager_extended_vic_processors |= (1<<cpu);
728 cpu += 4;
729 voyager_extended_vic_processors |= (1<<cpu);
730 outb(VOYAGER_CAT_END, CAT_CMD);
731 continue;
732 }
733
734 /* now we want to read the asics on the first submodule,
735 * which should be the quad base board */
736
737 cat_read(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, &val);
738 CDEBUG(("cat_init: SUBMODSELECT value = 0x%x\n", val));
739 val = (val & 0x7c) | VOYAGER_QUAD_BASEBOARD;
740 cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val);
741
742 outb(VOYAGER_CAT_END, CAT_CMD);
743
744
745 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
746 outb(VOYAGER_CAT_RUN, CAT_CMD);
747 cat_disconnect(*modpp, (*modpp)->asic);
748 if(cat_subread(*modpp, (*modpp)->asic,
749 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
750 &eprom_size)) {
751 printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i);
752 outb(VOYAGER_CAT_END, CAT_CMD);
753 continue;
754 }
755 if(eprom_size > sizeof(eprom_buf)) {
756 printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size);
757 outb(VOYAGER_CAT_END, CAT_CMD);
758 continue;
759 }
760 outb(VOYAGER_CAT_END, CAT_CMD);
761 outb(VOYAGER_CAT_RUN, CAT_CMD);
762 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size));
763 if(cat_subread(*modpp, (*modpp)->asic, 0,
764 eprom_size, eprom_buf)) {
765 outb(VOYAGER_CAT_END, CAT_CMD);
766 continue;
767 }
768 outb(VOYAGER_CAT_END, CAT_CMD);
769 /* Now do everything for the QBB submodule 1 */
770 (*modpp)->ee_size = eprom_hdr->ee_size;
771 (*modpp)->num_asics = eprom_hdr->num_asics;
772 asicpp = &((*modpp)->asic);
773 sp_offset = eprom_hdr->scan_path_offset;
774 /* get rid of the dummy CAT asic and read the real one */
775 kfree((*modpp)->asic);
776 for(asic=0; asic < (*modpp)->num_asics; asic++) {
777 int j;
778 voyager_asic_t *asicp = *asicpp
779 = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/
780 voyager_sp_table_t *sp_table;
781 voyager_at_t *asic_table;
782 voyager_jtt_t *jtag_table;
783
784 if(asicp == NULL) {
785 printk("**WARNING** kmalloc failure in cat_init\n");
786 continue;
787 }
788 asicpp = &(asicp->next);
789 asicp->asic_location = asic;
790 sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset);
791 asicp->asic_id = sp_table->asic_id;
792 asic_table = (voyager_at_t *)(eprom_buf + sp_table->asic_data_offset);
793 for(j=0; j<4; j++)
794 asicp->jtag_id[j] = asic_table->jtag_id[j];
795 jtag_table = (voyager_jtt_t *)(eprom_buf + asic_table->jtag_offset);
796 asicp->ireg_length = jtag_table->ireg_len;
797 asicp->bit_location = (*modpp)->inst_bits;
798 (*modpp)->inst_bits += asicp->ireg_length;
799 if(asicp->ireg_length > (*modpp)->largest_reg)
800 (*modpp)->largest_reg = asicp->ireg_length;
801 if (asicp->ireg_length < (*modpp)->smallest_reg ||
802 (*modpp)->smallest_reg == 0)
803 (*modpp)->smallest_reg = asicp->ireg_length;
804 CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n",
805 asicp->asic_id, asicp->ireg_length,
806 asicp->bit_location));
807 if(asicp->asic_id == VOYAGER_QUAD_QABC) {
808 CDEBUG(("VOYAGER CAT: QABC ASIC found\n"));
809 qabc_asic = asicp;
810 }
811 sp_offset += sizeof(voyager_sp_table_t);
812 }
813 CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n",
814 (*modpp)->inst_bits, (*modpp)->largest_reg,
815 (*modpp)->smallest_reg));
816		/* OK, now we have the QUAD ASICs set up, use them.
817		 * We need to:
818		 *
819		 * 1. Find the Memory area for the Quad CPIs.
820		 * 2. Find the Extended VIC processor.
821		 * 3. Configure a second extended VIC processor (this
822		 *    cannot be done for the 51xx).
823		 */
824 outb(VOYAGER_CAT_RUN, CAT_CMD);
825 cat_connect(*modpp, (*modpp)->asic);
826 CDEBUG(("CAT CONNECTED!!\n"));
827 cat_subread(*modpp, qabc_asic, 0, sizeof(qabc_data), qabc_data);
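		/* bytes 5-7 of the QABC data hold the top 24 bits of the
		 * Quad CPI area address, most significant byte first */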
828 qic_addr = qabc_data[5] << 8;
829 qic_addr = (qic_addr | qabc_data[6]) << 8;
830 qic_addr = (qic_addr | qabc_data[7]) << 8;
831 printk("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n",
832 cat_module_name(i), qic_addr, qabc_data[8]);
833#if 0 /* plumbing fails---FIXME */
834 if((qabc_data[8] & 0xf0) == 0) {
835 /* FIXME: 32 way 8 CPU slot monster cannot be
836 * plumbed this way---need to check for it */
837
838 printk("Plumbing second Extended Quad Processor\n");
839 /* second VIC line hardwired to Quad CPU 1 */
840 qabc_data[8] |= 0x20;
841 cat_subwrite(*modpp, qabc_asic, 8, 1, &qabc_data[8]);
842#ifdef VOYAGER_CAT_DEBUG
843 /* verify plumbing */
844 cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]);
845 if((qabc_data[8] & 0xf0) == 0) {
846 CDEBUG(("PLUMBING FAILED: 0x%x\n", qabc_data[8]));
847 }
848#endif
849 }
850#endif
851
852 {
853 struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL);
854 res->name = kmalloc(128, GFP_KERNEL);
855 sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i));
856 res->start = qic_addr;
857 res->end = qic_addr + 0x3ff;
858 request_resource(&iomem_resource, res);
859 }
860
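		/* map the Quad CPI area; qic_addr is reused to hold the
		 * kernel virtual address of the mapping */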
861 qic_addr = (unsigned long)ioremap(qic_addr, 0x400);
862
863 for(j = 0; j < 4; j++) {
864 __u8 cpu;
865
866 if(voyager_8slot) {
867 /* 8 slot has a different mapping,
868 * each slot has only one vic line, so
869 * 1 cpu in each slot must be < 8 */
870 cpu = (i & 0x07) + j*8;
871 } else {
872 cpu = (i & 0x03) + j*4;
873 }
874 if( (qabc_data[8] & (1<<j))) {
875 voyager_extended_vic_processors |= (1<<cpu);
876 }
877 if(qabc_data[8] & (1<<(j+4)) ) {
878 /* Second SET register plumbed: Quad
879 * card has two VIC connected CPUs.
880 * Secondary cannot be booted as a VIC
881 * CPU */
882 voyager_extended_vic_processors |= (1<<cpu);
883 voyager_allowed_boot_processors &= (~(1<<cpu));
884 }
885
886 voyager_quad_processors |= (1<<cpu);
887 voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *)
888 (qic_addr+(j<<8));
889 CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu,
890 (unsigned long)voyager_quad_cpi_addr[cpu]));
891 }
892 outb(VOYAGER_CAT_END, CAT_CMD);
893
894
895
896 *asicpp = NULL;
897 modpp = &((*modpp)->next);
898 }
899 *modpp = NULL;
900 printk("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", voyager_extended_vic_processors, voyager_quad_processors, voyager_allowed_boot_processors);
901 request_resource(&ioport_resource, &vic_res);
902 if(voyager_quad_processors)
903 request_resource(&ioport_resource, &qic_res);
904 /* set up the front power switch */
905}
906
907int
908voyager_cat_readb(__u8 module, __u8 asic, int reg)
909{
910 return 0;
911}
912
913static int
914cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp)
915{
916 __u8 val;
917 int err = 0;
918
919 if(!modp->scan_path_connected)
920 return 0;
921 if(asicp->asic_id != VOYAGER_CAT_ID) {
922 CDEBUG(("cat_disconnect: ASIC is not CAT\n"));
923 return 1;
924 }
925 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
926 if(err) {
927 CDEBUG(("cat_disconnect: failed to read SCANPATH\n"));
928 return err;
929 }
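	/* mask out the connect bit so this asic drops off the scan path */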
930 val &= VOYAGER_DISCONNECT_ASIC;
931 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
932 if(err) {
933 CDEBUG(("cat_disconnect: failed to write SCANPATH\n"));
934 return err;
935 }
936 outb(VOYAGER_CAT_END, CAT_CMD);
937 outb(VOYAGER_CAT_RUN, CAT_CMD);
938 modp->scan_path_connected = 0;
939
940 return 0;
941}
942
943static int
944cat_connect(voyager_module_t *modp, voyager_asic_t *asicp)
945{
946 __u8 val;
947 int err = 0;
948
949 if(modp->scan_path_connected)
950 return 0;
951 if(asicp->asic_id != VOYAGER_CAT_ID) {
952 CDEBUG(("cat_connect: ASIC is not CAT\n"));
953 return 1;
954 }
955
956 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
957 if(err) {
958 CDEBUG(("cat_connect: failed to read SCANPATH\n"));
959 return err;
960 }
961 val |= VOYAGER_CONNECT_ASIC;
962 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
963 if(err) {
964 CDEBUG(("cat_connect: failed to write SCANPATH\n"));
965 return err;
966 }
967 outb(VOYAGER_CAT_END, CAT_CMD);
968 outb(VOYAGER_CAT_RUN, CAT_CMD);
969 modp->scan_path_connected = 1;
970
971 return 0;
972}
973
974void
975voyager_cat_power_off(void)
976{
977 /* Power the machine off by writing to the PSI over the CAT
978 * bus */
979 __u8 data;
980 voyager_module_t psi = { 0 };
981 voyager_asic_t psi_asic = { 0 };
982
983 psi.asic = &psi_asic;
984 psi.asic->asic_id = VOYAGER_CAT_ID;
985 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
986 psi.module_addr = VOYAGER_PSI;
987 psi.scan_path_connected = 0;
988
989 outb(VOYAGER_CAT_END, CAT_CMD);
990 /* Connect the PSI to the CAT Bus */
991 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
992 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
993 outb(VOYAGER_CAT_RUN, CAT_CMD);
994 cat_disconnect(&psi, &psi_asic);
995 /* Read the status */
996 cat_subread(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data);
997 outb(VOYAGER_CAT_END, CAT_CMD);
998 CDEBUG(("PSI STATUS 0x%x\n", data));
999 /* These two writes are power off prep and perform */
1000 data = PSI_CLEAR;
1001 outb(VOYAGER_CAT_RUN, CAT_CMD);
1002 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data);
1003 outb(VOYAGER_CAT_END, CAT_CMD);
1004 data = PSI_POWER_DOWN;
1005 outb(VOYAGER_CAT_RUN, CAT_CMD);
1006 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data);
1007 outb(VOYAGER_CAT_END, CAT_CMD);
1008}
1009
1010struct voyager_status voyager_status = { 0 };
1011
1012void
1013voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
1014{
1015 voyager_module_t psi = { 0 };
1016 voyager_asic_t psi_asic = { 0 };
1017
1018 psi.asic = &psi_asic;
1019 psi.asic->asic_id = VOYAGER_CAT_ID;
1020 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
1021 psi.module_addr = VOYAGER_PSI;
1022 psi.scan_path_connected = 0;
1023
1024 outb(VOYAGER_CAT_END, CAT_CMD);
1025 /* Connect the PSI to the CAT Bus */
1026 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
1027 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
1028 outb(VOYAGER_CAT_RUN, CAT_CMD);
1029 cat_disconnect(&psi, &psi_asic);
1030 switch(cmd) {
1031 case VOYAGER_PSI_READ:
1032 cat_read(&psi, &psi_asic, reg, data);
1033 break;
1034 case VOYAGER_PSI_WRITE:
1035 cat_write(&psi, &psi_asic, reg, *data);
1036 break;
1037 case VOYAGER_PSI_SUBREAD:
1038 cat_subread(&psi, &psi_asic, reg, 1, data);
1039 break;
1040 case VOYAGER_PSI_SUBWRITE:
1041 cat_subwrite(&psi, &psi_asic, reg, 1, data);
1042 break;
1043 default:
1044 printk(KERN_ERR "Voyager PSI, unrecognised command %d\n", cmd);
1045 break;
1046 }
1047 outb(VOYAGER_CAT_END, CAT_CMD);
1048}
1049
1050void
1051voyager_cat_do_common_interrupt(void)
1052{
1053 /* This is caused either by a memory parity error or something
1054 * in the PSI */
1055 __u8 data;
1056 voyager_module_t psi = { 0 };
1057 voyager_asic_t psi_asic = { 0 };
1058 struct voyager_psi psi_reg;
1059 int i;
1060 re_read:
1061 psi.asic = &psi_asic;
1062 psi.asic->asic_id = VOYAGER_CAT_ID;
1063 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
1064 psi.module_addr = VOYAGER_PSI;
1065 psi.scan_path_connected = 0;
1066
1067 outb(VOYAGER_CAT_END, CAT_CMD);
1068 /* Connect the PSI to the CAT Bus */
1069 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
1070 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
1071 outb(VOYAGER_CAT_RUN, CAT_CMD);
1072 cat_disconnect(&psi, &psi_asic);
1073 /* Read the status. NOTE: Need to read *all* the PSI regs here
1074 * otherwise the cmn int will be reasserted */
1075 for(i = 0; i < sizeof(psi_reg.regs); i++) {
1076 cat_read(&psi, &psi_asic, i, &((__u8 *)&psi_reg.regs)[i]);
1077 }
1078 outb(VOYAGER_CAT_END, CAT_CMD);
1079 if((psi_reg.regs.checkbit & 0x02) == 0) {
1080 psi_reg.regs.checkbit |= 0x02;
1081 cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit);
1082 printk("VOYAGER RE-READ PSI\n");
1083 goto re_read;
1084 }
1085 outb(VOYAGER_CAT_RUN, CAT_CMD);
1086 for(i = 0; i < sizeof(psi_reg.subregs); i++) {
1087 /* This looks strange, but the PSI doesn't do auto increment
1088 * correctly */
1089 cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i,
1090 1, &((__u8 *)&psi_reg.subregs)[i]);
1091 }
1092 outb(VOYAGER_CAT_END, CAT_CMD);
1093#ifdef VOYAGER_CAT_DEBUG
1094 printk("VOYAGER PSI: ");
1095 for(i=0; i<sizeof(psi_reg.regs); i++)
1096 printk("%02x ", ((__u8 *)&psi_reg.regs)[i]);
1097 printk("\n ");
1098 for(i=0; i<sizeof(psi_reg.subregs); i++)
1099 printk("%02x ", ((__u8 *)&psi_reg.subregs)[i]);
1100 printk("\n");
1101#endif
1102 if(psi_reg.regs.intstatus & PSI_MON) {
1103 /* switch off or power fail */
1104
1105 if(psi_reg.subregs.supply & PSI_SWITCH_OFF) {
1106 if(voyager_status.switch_off) {
1107 printk(KERN_ERR "Voyager front panel switch turned off again---Immediate power off!\n");
1108 voyager_cat_power_off();
1109 /* not reached */
1110 } else {
1111 printk(KERN_ERR "Voyager front panel switch turned off\n");
1112 voyager_status.switch_off = 1;
1113 voyager_status.request_from_kernel = 1;
1114 wake_up_process(voyager_thread);
1115 }
1116			/* Tell the hardware we're taking care of the
1117			 * shutdown, otherwise it will power the box off
1118			 * within 3 seconds of the switch being pressed
1119			 * and, more importantly for us, continue to
1120			 * assert the common interrupt */
1121 data = PSI_CLR_SWITCH_OFF;
1122 outb(VOYAGER_CAT_RUN, CAT_CMD);
1123 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG,
1124 1, &data);
1125 outb(VOYAGER_CAT_END, CAT_CMD);
1126 } else {
1127
1128 VDEBUG(("Voyager ac fail reg 0x%x\n",
1129 psi_reg.subregs.ACfail));
1130 if((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) {
1131 /* No further update */
1132 return;
1133 }
1134#if 0
1135 /* Don't bother trying to find out who failed.
1136 * FIXME: This probably makes the code incorrect on
1137 * anything other than a 345x */
1138 for(i=0; i< 5; i++) {
1139 if( psi_reg.subregs.ACfail &(1<<i)) {
1140 break;
1141 }
1142 }
1143 printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i);
1144#endif
1145 /* DON'T do this: it shuts down the AC PSI
1146 outb(VOYAGER_CAT_RUN, CAT_CMD);
1147 data = PSI_MASK_MASK | i;
1148 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK,
1149 1, &data);
1150 outb(VOYAGER_CAT_END, CAT_CMD);
1151 */
1152 printk(KERN_ERR "Voyager AC power failure\n");
1153 outb(VOYAGER_CAT_RUN, CAT_CMD);
1154 data = PSI_COLD_START;
1155 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG,
1156 1, &data);
1157 outb(VOYAGER_CAT_END, CAT_CMD);
1158 voyager_status.power_fail = 1;
1159 voyager_status.request_from_kernel = 1;
1160 wake_up_process(voyager_thread);
1161 }
1162
1163
1164 } else if(psi_reg.regs.intstatus & PSI_FAULT) {
1165 /* Major fault! */
1166 printk(KERN_ERR "Voyager PSI Detected major fault, immediate power off!\n");
1167 voyager_cat_power_off();
1168 /* not reached */
1169 } else if(psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM
1170 | PSI_CURRENT | PSI_DVM
1171 | PSI_PSCFAULT | PSI_STAT_CHG)) {
1172 /* other psi fault */
1173
1174		printk(KERN_WARNING "Voyager PSI status 0x%x\n", psi_reg.regs.intstatus);
1175 /* clear the PSI fault */
1176 outb(VOYAGER_CAT_RUN, CAT_CMD);
1177 cat_write(&psi, &psi_asic, VOYAGER_PSI_STATUS_REG, 0);
1178 outb(VOYAGER_CAT_END, CAT_CMD);
1179 }
1180}
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
new file mode 100644
index 000000000000..b87f8548e75a
--- /dev/null
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -0,0 +1,1952 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 1999,2001
4 *
5 * Author: J.E.J.Bottomley@HansenPartnership.com
6 *
7 * linux/arch/i386/kernel/voyager_smp.c
8 *
9 * This file provides all the same external entries as smp.c but uses
10 * the voyager hal to provide the functionality
11 */
12#include <linux/module.h>
13#include <linux/mm.h>
14#include <linux/kernel_stat.h>
15#include <linux/delay.h>
16#include <linux/mc146818rtc.h>
17#include <linux/cache.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/bootmem.h>
22#include <linux/completion.h>
23#include <asm/desc.h>
24#include <asm/voyager.h>
25#include <asm/vic.h>
26#include <asm/mtrr.h>
27#include <asm/pgalloc.h>
28#include <asm/tlbflush.h>
29#include <asm/arch_hooks.h>
30
31/* TLB state -- visible externally, indexed physically */
32DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 };
33
34/* CPU IRQ affinity -- set to all ones initially */
35static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = ~0UL };
36
37/* per CPU data structure (for /proc/cpuinfo et al), visible externally
38 * indexed physically */
39struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
40EXPORT_SYMBOL(cpu_data);
41
42/* physical ID of the CPU used to boot the system */
43unsigned char boot_cpu_id;
44
45/* The memory line addresses for the Quad CPIs */
46struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS] __cacheline_aligned;
47
48/* The masks for the Extended VIC processors, filled in by cat_init */
49__u32 voyager_extended_vic_processors = 0;
50
51/* Masks for the extended Quad processors which cannot be VIC booted */
52__u32 voyager_allowed_boot_processors = 0;
53
54/* The mask for the Quad Processors (both extended and non-extended) */
55__u32 voyager_quad_processors = 0;
56
57/* Total count of live CPUs, used in process.c to display
58 * the CPU information and in irq.c for the per CPU irq
59 * activity count. Finally exported by i386_ksyms.c */
60static int voyager_extended_cpus = 1;
61
62/* Have we found an SMP box - used by time.c to do the profiling
63 interrupt for timeslicing; do not set to 1 until the per CPU timer
64 interrupt is active */
65int smp_found_config = 0;
66
67/* Used for the invalidate map that's also checked in the spinlock */
68static volatile unsigned long smp_invalidate_needed;
69
70/* Bitmask of currently online CPUs - used by setup.c for
71 /proc/cpuinfo, visible externally but still physical */
72cpumask_t cpu_online_map = CPU_MASK_NONE;
73EXPORT_SYMBOL(cpu_online_map);
74
75/* Bitmask of CPUs present in the system - exported by i386_ksyms.c, used
76 * by scheduler but indexed physically */
77cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
78
79
80/* The internal functions */
81static void send_CPI(__u32 cpuset, __u8 cpi);
82static void ack_CPI(__u8 cpi);
83static int ack_QIC_CPI(__u8 cpi);
84static void ack_special_QIC_CPI(__u8 cpi);
85static void ack_VIC_CPI(__u8 cpi);
86static void send_CPI_allbutself(__u8 cpi);
87static void mask_vic_irq(unsigned int irq);
88static void unmask_vic_irq(unsigned int irq);
89static unsigned int startup_vic_irq(unsigned int irq);
90static void enable_local_vic_irq(unsigned int irq);
91static void disable_local_vic_irq(unsigned int irq);
92static void before_handle_vic_irq(unsigned int irq);
93static void after_handle_vic_irq(unsigned int irq);
94static void set_vic_irq_affinity(unsigned int irq, cpumask_t mask);
95static void ack_vic_irq(unsigned int irq);
96static void vic_enable_cpi(void);
97static void do_boot_cpu(__u8 cpuid);
98static void do_quad_bootstrap(void);
99
100int hard_smp_processor_id(void);
101int safe_smp_processor_id(void);
102
103/* Inline functions */
104static inline void
105send_one_QIC_CPI(__u8 cpu, __u8 cpi)
106{
107 voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi =
108 (smp_processor_id() << 16) + cpi;
109}
110
111static inline void
112send_QIC_CPI(__u32 cpuset, __u8 cpi)
113{
114 int cpu;
115
116 for_each_online_cpu(cpu) {
117 if(cpuset & (1<<cpu)) {
118#ifdef VOYAGER_DEBUG
119 if(!cpu_isset(cpu, cpu_online_map))
120 VDEBUG(("CPU%d sending cpi %d to CPU%d not in cpu_online_map\n", hard_smp_processor_id(), cpi, cpu));
121#endif
122 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
123 }
124 }
125}
126
127static inline void
128wrapper_smp_local_timer_interrupt(void)
129{
130 irq_enter();
131 smp_local_timer_interrupt();
132 irq_exit();
133}
134
135static inline void
136send_one_CPI(__u8 cpu, __u8 cpi)
137{
138 if(voyager_quad_processors & (1<<cpu))
139 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
140 else
141 send_CPI(1<<cpu, cpi);
142}
143
144static inline void
145send_CPI_allbutself(__u8 cpi)
146{
147 __u8 cpu = smp_processor_id();
148 __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu);
149 send_CPI(mask, cpi);
150}
151
152static inline int
153is_cpu_quad(void)
154{
155 __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
156 return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER);
157}
158
159static inline int
160is_cpu_extended(void)
161{
162 __u8 cpu = hard_smp_processor_id();
163
164 return(voyager_extended_vic_processors & (1<<cpu));
165}
166
167static inline int
168is_cpu_vic_boot(void)
169{
170 __u8 cpu = hard_smp_processor_id();
171
172 return(voyager_extended_vic_processors
173 & voyager_allowed_boot_processors & (1<<cpu));
174}
175
176
177static inline void
178ack_CPI(__u8 cpi)
179{
180 switch(cpi) {
181 case VIC_CPU_BOOT_CPI:
182 if(is_cpu_quad() && !is_cpu_vic_boot())
183 ack_QIC_CPI(cpi);
184 else
185 ack_VIC_CPI(cpi);
186 break;
187 case VIC_SYS_INT:
188 case VIC_CMN_INT:
189 /* These are slightly strange. Even on the Quad card,
190		 * they are vectored as VIC CPIs */
191 if(is_cpu_quad())
192 ack_special_QIC_CPI(cpi);
193 else
194 ack_VIC_CPI(cpi);
195 break;
196 default:
197 printk("VOYAGER ERROR: CPI%d is in common CPI code\n", cpi);
198 break;
199 }
200}
201
202/* local variables */
203
204/* The VIC IRQ descriptors -- these look almost identical to the
205 * 8259 IRQs except that masks and things must be kept per processor
206 */
207static struct irq_chip vic_chip = {
208 .name = "VIC",
209 .startup = startup_vic_irq,
210 .mask = mask_vic_irq,
211 .unmask = unmask_vic_irq,
212 .set_affinity = set_vic_irq_affinity,
213};
214
215/* used to count up as CPUs are brought on line (starts at 0) */
216static int cpucount = 0;
217
218/* steal a page from the bottom of memory for the trampoline and
219 * squirrel its address away here. This will be in kernel virtual
220 * space */
221static __u32 trampoline_base;
222
223/* The per cpu profile stuff - used in smp_local_timer_interrupt */
224static DEFINE_PER_CPU(int, prof_multiplier) = 1;
225static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
226static DEFINE_PER_CPU(int, prof_counter) = 1;
227
228/* the map used to check if a CPU has booted */
229static __u32 cpu_booted_map;
230
231/* the synchronize flag used to hold all secondary CPUs spinning in
232 * a tight loop until the boot sequence is ready for them */
233static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
234
235/* This is for the new dynamic CPU boot code */
236cpumask_t cpu_callin_map = CPU_MASK_NONE;
237cpumask_t cpu_callout_map = CPU_MASK_NONE;
238EXPORT_SYMBOL(cpu_callout_map);
239cpumask_t cpu_possible_map = CPU_MASK_NONE;
240EXPORT_SYMBOL(cpu_possible_map);
241
242/* The per processor IRQ masks (these are usually kept in sync) */
243static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
244
245/* the list of IRQs to be enabled by the VIC_ENABLE_IRQ_CPI */
246static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 };
247
248/* Lock for enable/disable of VIC interrupts */
249static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock);
250
251/* The boot processor is correctly set up in PC mode when it
252 * comes up, but the secondaries need their master/slave 8259
253 * pairs initializing correctly */
254
255/* Interrupt counters (per cpu) and total - used to try to
256 * even up the interrupt handling routines */
257static long vic_intr_total = 0;
258static long vic_intr_count[NR_CPUS] __cacheline_aligned = { 0 };
259static unsigned long vic_tick[NR_CPUS] __cacheline_aligned = { 0 };
260
261/* Since we can only use CPI0, we fake all the other CPIs */
262static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned;
263
264/* debugging routine to read the isr of the cpu's pic */
265static inline __u16
266vic_read_isr(void)
267{
268 __u16 isr;
269
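	/* OCW3 0x0b selects the in-service register; read it from the
	 * slave (0xa0) and then the master (0x20) 8259 */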
270 outb(0x0b, 0xa0);
271 isr = inb(0xa0) << 8;
272 outb(0x0b, 0x20);
273 isr |= inb(0x20);
274
275 return isr;
276}
277
278static __init void
279qic_setup(void)
280{
281 if(!is_cpu_quad()) {
282 /* not a quad, no setup */
283 return;
284 }
285 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
286 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
287
288 if(is_cpu_extended()) {
289 /* the QIC duplicate of the VIC base register */
290 outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER);
291 outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER);
292
293 /* FIXME: should set up the QIC timer and memory parity
294 * error vectors here */
295 }
296}
297
298static __init void
299vic_setup_pic(void)
300{
301 outb(1, VIC_REDIRECT_REGISTER_1);
302 /* clear the claim registers for dynamic routing */
303 outb(0, VIC_CLAIM_REGISTER_0);
304 outb(0, VIC_CLAIM_REGISTER_1);
305
306 outb(0, VIC_PRIORITY_REGISTER);
307 /* Set the Primary and Secondary Microchannel vector
308 * bases to be the same as the ordinary interrupts
309 *
310 * FIXME: This would be more efficient using separate
311 * vectors. */
312 outb(FIRST_EXTERNAL_VECTOR, VIC_PRIMARY_MC_BASE);
313 outb(FIRST_EXTERNAL_VECTOR, VIC_SECONDARY_MC_BASE);
314	/* Now initialise the master PIC belonging to this CPU by
315 * sending the four ICWs */
316
317 /* ICW1: level triggered, ICW4 needed */
318 outb(0x19, 0x20);
319
320 /* ICW2: vector base */
321 outb(FIRST_EXTERNAL_VECTOR, 0x21);
322
323 /* ICW3: slave at line 2 */
324 outb(0x04, 0x21);
325
326 /* ICW4: 8086 mode */
327 outb(0x01, 0x21);
328
329 /* now the same for the slave PIC */
330
331 /* ICW1: level trigger, ICW4 needed */
332 outb(0x19, 0xA0);
333
334 /* ICW2: slave vector base */
335 outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1);
336
337 /* ICW3: slave ID */
338 outb(0x02, 0xA1);
339
340 /* ICW4: 8086 mode */
341 outb(0x01, 0xA1);
342}
343
344static void
345do_quad_bootstrap(void)
346{
347 if(is_cpu_quad() && is_cpu_vic_boot()) {
348 int i;
349 unsigned long flags;
350 __u8 cpuid = hard_smp_processor_id();
351
352 local_irq_save(flags);
353
354 for(i = 0; i<4; i++) {
355 /* FIXME: this would be >>3 &0x7 on the 32 way */
356 if(((cpuid >> 2) & 0x03) == i)
357 /* don't lower our own mask! */
358 continue;
359
360 /* masquerade as local Quad CPU */
361 outb(QIC_CPUID_ENABLE | i, QIC_PROCESSOR_ID);
362 /* enable the startup CPI */
363 outb(QIC_BOOT_CPI_MASK, QIC_MASK_REGISTER1);
364 /* restore cpu id */
365 outb(0, QIC_PROCESSOR_ID);
366 }
367 local_irq_restore(flags);
368 }
369}
370
371
372/* Set up all the basic stuff: read the SMP config and make all the
373 * SMP information reflect only the boot cpu. All others will be
374 * brought on-line later. */
375void __init
376find_smp_config(void)
377{
378 int i;
379
380 boot_cpu_id = hard_smp_processor_id();
381
382 printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id);
383
384 /* initialize the CPU structures (moved from smp_boot_cpus) */
385 for(i=0; i<NR_CPUS; i++) {
386 cpu_irq_affinity[i] = ~0;
387 }
388 cpu_online_map = cpumask_of_cpu(boot_cpu_id);
389
390 /* The boot CPU must be extended */
391 voyager_extended_vic_processors = 1<<boot_cpu_id;
392	/* initially, all of the first 8 cpus can boot */
393 voyager_allowed_boot_processors = 0xff;
394 /* set up everything for just this CPU, we can alter
395 * this as we start the other CPUs later */
396 /* now get the CPU disposition from the extended CMOS */
397 cpus_addr(phys_cpu_present_map)[0] = voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK);
398 cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8;
399 cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 2) << 16;
400 cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 3) << 24;
401 cpu_possible_map = phys_cpu_present_map;
402 printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n", cpus_addr(phys_cpu_present_map)[0]);
403 /* Here we set up the VIC to enable SMP */
404 /* enable the CPIs by writing the base vector to their register */
405 outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER);
406 outb(1, VIC_REDIRECT_REGISTER_1);
407 /* set the claim registers for static routing --- Boot CPU gets
408	 * all interrupts until all the other CPUs are started */
409 outb(0xff, VIC_CLAIM_REGISTER_0);
410 outb(0xff, VIC_CLAIM_REGISTER_1);
411 /* Set the Primary and Secondary Microchannel vector
412 * bases to be the same as the ordinary interrupts
413 *
414 * FIXME: This would be more efficient using separate
415 * vectors. */
416 outb(FIRST_EXTERNAL_VECTOR, VIC_PRIMARY_MC_BASE);
417 outb(FIRST_EXTERNAL_VECTOR, VIC_SECONDARY_MC_BASE);
418
419 /* Finally tell the firmware that we're driving */
420 outb(inb(VOYAGER_SUS_IN_CONTROL_PORT) | VOYAGER_IN_CONTROL_FLAG,
421 VOYAGER_SUS_IN_CONTROL_PORT);
422
423 current_thread_info()->cpu = boot_cpu_id;
424 x86_write_percpu(cpu_number, boot_cpu_id);
425}
426
427/*
428 * The bootstrap kernel entry code has set these up. Save them
429 * for a given CPU, id is physical */
430void __init
431smp_store_cpu_info(int id)
432{
433 struct cpuinfo_x86 *c=&cpu_data[id];
434
435 *c = boot_cpu_data;
436
437 identify_secondary_cpu(c);
438}
439
440/* set up the trampoline and return the physical address of the code */
441static __u32 __init
442setup_trampoline(void)
443{
444 /* these two are global symbols in trampoline.S */
445 extern __u8 trampoline_end[];
446 extern __u8 trampoline_data[];
447
448 memcpy((__u8 *)trampoline_base, trampoline_data,
449 trampoline_end - trampoline_data);
450 return virt_to_phys((__u8 *)trampoline_base);
451}
452
453/* Routine initially called when a non-boot CPU is brought online */
454static void __init
455start_secondary(void *unused)
456{
457 __u8 cpuid = hard_smp_processor_id();
458 /* external functions not defined in the headers */
459 extern void calibrate_delay(void);
460
461 cpu_init();
462
463 /* OK, we're in the routine */
464 ack_CPI(VIC_CPU_BOOT_CPI);
465
466	/* set up the 8259 master/slave pair belonging to this CPU ---
467	 * we won't actually receive any interrupts until the boot CPU
468	 * relinquishes its static routing mask */
469 vic_setup_pic();
470
471 qic_setup();
472
473 if(is_cpu_quad() && !is_cpu_vic_boot()) {
474 /* clear the boot CPI */
475 __u8 dummy;
476
477 dummy = voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi;
478 printk("read dummy %d\n", dummy);
479 }
480
481 /* lower the mask to receive CPIs */
482 vic_enable_cpi();
483
484 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
485
486 /* enable interrupts */
487 local_irq_enable();
488
489 /* get our bogomips */
490 calibrate_delay();
491
492 /* save our processor parameters */
493 smp_store_cpu_info(cpuid);
494
495 /* if we're a quad, we may need to bootstrap other CPUs */
496 do_quad_bootstrap();
497
498 /* FIXME: this is rather a poor hack to prevent the CPU
499 * activating softirqs while it's supposed to be waiting for
500 * permission to proceed. Without this, the new per CPU stuff
501 * in the softirqs will fail */
502 local_irq_disable();
503 cpu_set(cpuid, cpu_callin_map);
504
505 /* signal that we're done */
506 cpu_booted_map = 1;
507
508 while (!cpu_isset(cpuid, smp_commenced_mask))
509 rep_nop();
510 local_irq_enable();
511
512 local_flush_tlb();
513
514 cpu_set(cpuid, cpu_online_map);
515 wmb();
516 cpu_idle();
517}
518
519
520/* Routine to kick start the given CPU and wait for it to report ready
521 * (or timeout in startup). When this routine returns, the requested
522 * CPU is either fully running and configured or known to be dead.
523 *
524 * We call this routine sequentially 1 CPU at a time, so no need for
525 * locking */
526
527static void __init
528do_boot_cpu(__u8 cpu)
529{
530 struct task_struct *idle;
531 int timeout;
532 unsigned long flags;
533 int quad_boot = (1<<cpu) & voyager_quad_processors
534 & ~( voyager_extended_vic_processors
535 & voyager_allowed_boot_processors);
536
537 /* This is an area in head.S which was used to set up the
538 * initial kernel stack. We need to alter this to give the
539 * booting CPU a new stack (taken from its idle process) */
540 extern struct {
541 __u8 *esp;
542 unsigned short ss;
543 } stack_start;
544 /* This is the format of the CPI IDT gate (in real mode) which
545 * we're hijacking to boot the CPU */
546 union IDTFormat {
547 struct seg {
548 __u16 Offset;
549 __u16 Segment;
550 } idt;
551 __u32 val;
552 } hijack_source;
553
554 __u32 *hijack_vector;
555 __u32 start_phys_address = setup_trampoline();
556
557 /* There's a clever trick to this: The linux trampoline is
558 * compiled to begin at absolute location zero, so make the
559 * address zero but have the data segment selector compensate
560 * for the actual address */
561 hijack_source.idt.Offset = start_phys_address & 0x000F;
562 hijack_source.idt.Segment = (start_phys_address >> 4) & 0xFFFF;
563
564 cpucount++;
565 alternatives_smp_switch(1);
566
567 idle = fork_idle(cpu);
568 if(IS_ERR(idle))
569 panic("failed fork for CPU%d", cpu);
570 idle->thread.eip = (unsigned long) start_secondary;
571 /* init_tasks (in sched.c) is indexed logically */
572 stack_start.esp = (void *) idle->thread.esp;
573
574 init_gdt(cpu);
575 per_cpu(current_task, cpu) = idle;
576 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
577 irq_ctx_init(cpu);
578
579 /* Note: Don't modify initial ss override */
580 VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu,
581 (unsigned long)hijack_source.val, hijack_source.idt.Segment,
582 hijack_source.idt.Offset, stack_start.esp));
583
584 /* init lowmem identity mapping */
585 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
586 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
587 flush_tlb_all();
588
589 if(quad_boot) {
590 printk("CPU %d: non extended Quad boot\n", cpu);
591 hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE)*4);
592 *hijack_vector = hijack_source.val;
593 } else {
594 printk("CPU%d: extended VIC boot\n", cpu);
595 hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE)*4);
596 *hijack_vector = hijack_source.val;
597 /* VIC errata, may also receive interrupt at this address */
598 hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + VIC_DEFAULT_CPI_BASE)*4);
599 *hijack_vector = hijack_source.val;
600 }
601 /* All non-boot CPUs start with interrupts fully masked. Need
602 * to lower the mask of the CPI we're about to send. We do
603 * this in the VIC by masquerading as the processor we're
604 * about to boot and lowering its interrupt mask */
605 local_irq_save(flags);
606 if(quad_boot) {
607 send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI);
608 } else {
609 outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID);
610 /* here we're altering registers belonging to `cpu' */
611
612 outb(VIC_BOOT_INTERRUPT_MASK, 0x21);
613 /* now go back to our original identity */
614 outb(boot_cpu_id, VIC_PROCESSOR_ID);
615
616 /* and boot the CPU */
617
618 send_CPI((1<<cpu), VIC_CPU_BOOT_CPI);
619 }
620 cpu_booted_map = 0;
621 local_irq_restore(flags);
622
623 /* now wait for it to become ready (or timeout) */
624 for(timeout = 0; timeout < 50000; timeout++) {
625 if(cpu_booted_map)
626 break;
627 udelay(100);
628 }
629 /* reset the page table */
630 zap_low_mappings();
631
632 if (cpu_booted_map) {
633 VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n",
634 cpu, smp_processor_id()));
635
636 printk("CPU%d: ", cpu);
637 print_cpu_info(&cpu_data[cpu]);
638 wmb();
639 cpu_set(cpu, cpu_callout_map);
640 cpu_set(cpu, cpu_present_map);
641 }
642 else {
643 printk("CPU%d FAILED TO BOOT: ", cpu);
644 if (*((volatile unsigned char *)phys_to_virt(start_phys_address))==0xA5)
645 printk("Stuck.\n");
646 else
647 printk("Not responding.\n");
648
649 cpucount--;
650 }
651}
652
653void __init
654smp_boot_cpus(void)
655{
656 int i;
657
658	/* CAT BUS initialisation must be done after memory is initialised */
659 /* FIXME: The L4 has a catbus too, it just needs to be
660 * accessed in a totally different way */
661 if(voyager_level == 5) {
662 voyager_cat_init();
663
664 /* now that the cat has probed the Voyager System Bus, sanity
665 * check the cpu map */
666 if( ((voyager_quad_processors | voyager_extended_vic_processors)
667 & cpus_addr(phys_cpu_present_map)[0]) != cpus_addr(phys_cpu_present_map)[0]) {
668 /* should panic */
669 printk("\n\n***WARNING*** Sanity check of CPU present map FAILED\n");
670 }
671 } else if(voyager_level == 4)
672 voyager_extended_vic_processors = cpus_addr(phys_cpu_present_map)[0];
673
674 /* this sets up the idle task to run on the current cpu */
675 voyager_extended_cpus = 1;
676 /* Remove the global_irq_holder setting, it triggers a BUG() on
677 * schedule at the moment */
678 //global_irq_holder = boot_cpu_id;
679
680 /* FIXME: Need to do something about this but currently only works
681 * on CPUs with a tsc which none of mine have.
682 smp_tune_scheduling();
683 */
684 smp_store_cpu_info(boot_cpu_id);
685 printk("CPU%d: ", boot_cpu_id);
686 print_cpu_info(&cpu_data[boot_cpu_id]);
687
688 if(is_cpu_quad()) {
689 /* booting on a Quad CPU */
690 printk("VOYAGER SMP: Boot CPU is Quad\n");
691 qic_setup();
692 do_quad_bootstrap();
693 }
694
695 /* enable our own CPIs */
696 vic_enable_cpi();
697
698 cpu_set(boot_cpu_id, cpu_online_map);
699 cpu_set(boot_cpu_id, cpu_callout_map);
700
701 /* loop over all the extended VIC CPUs and boot them. The
702 * Quad CPUs must be bootstrapped by their extended VIC cpu */
703 for(i = 0; i < NR_CPUS; i++) {
704 if(i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
705 continue;
706 do_boot_cpu(i);
707		/* This udelay seems to be needed for the Quad boots;
708 * don't remove unless you know what you're doing */
709 udelay(1000);
710 }
711	/* we could compute the total bogomips here, but why bother?
712 * Code added from smpboot.c */
713 {
714 unsigned long bogosum = 0;
715 for (i = 0; i < NR_CPUS; i++)
716 if (cpu_isset(i, cpu_online_map))
717 bogosum += cpu_data[i].loops_per_jiffy;
718 printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
719 cpucount+1,
720 bogosum/(500000/HZ),
721 (bogosum/(5000/HZ))%100);
722 }
723 voyager_extended_cpus = hweight32(voyager_extended_vic_processors);
724 printk("VOYAGER: Extended (interrupt handling CPUs): %d, non-extended: %d\n", voyager_extended_cpus, num_booting_cpus() - voyager_extended_cpus);
725 /* that's it, switch to symmetric mode */
726 outb(0, VIC_PRIORITY_REGISTER);
727 outb(0, VIC_CLAIM_REGISTER_0);
728 outb(0, VIC_CLAIM_REGISTER_1);
729
730 VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus()));
731}
732
733/* Reload the secondary CPU's task structure (this function does not
734 * return) */
735void __init
736initialize_secondary(void)
737{
738#if 0
739 // AC kernels only
740 set_current(hard_get_current());
741#endif
742
743 /*
744 * We don't actually need to load the full TSS,
745 * basically just the stack pointer and the eip.
746 */
747
748 asm volatile(
749 "movl %0,%%esp\n\t"
750 "jmp *%1"
751 :
752 :"r" (current->thread.esp),"r" (current->thread.eip));
753}
754
755/* handle a Voyager SYS_INT -- If we don't, the base board will
756 * panic the system.
757 *
758 * System interrupts occur because some problem was detected on the
759 * various busses. To find out what, you have to probe all the
760 * hardware via the CAT bus. FIXME: At the moment we do nothing. */
761fastcall void
762smp_vic_sys_interrupt(struct pt_regs *regs)
763{
764 ack_CPI(VIC_SYS_INT);
765 printk("Voyager SYSTEM INTERRUPT\n");
766}
767
768/* Handle a Voyager CMN_INT; these interrupts occur either because of
769 * a system status change or because a single bit memory error
770 * occurred. FIXME: At the moment, ignore all this. */
771fastcall void
772smp_vic_cmn_interrupt(struct pt_regs *regs)
773{
774 static __u8 in_cmn_int = 0;
775 static DEFINE_SPINLOCK(cmn_int_lock);
776
777 /* common ints are broadcast, so make sure we only do this once */
778 _raw_spin_lock(&cmn_int_lock);
779 if(in_cmn_int)
780 goto unlock_end;
781
782 in_cmn_int++;
783 _raw_spin_unlock(&cmn_int_lock);
784
785 VDEBUG(("Voyager COMMON INTERRUPT\n"));
786
787 if(voyager_level == 5)
788 voyager_cat_do_common_interrupt();
789
790 _raw_spin_lock(&cmn_int_lock);
791 in_cmn_int = 0;
792 unlock_end:
793 _raw_spin_unlock(&cmn_int_lock);
794 ack_CPI(VIC_CMN_INT);
795}
796
797/*
798 * Reschedule call back. Nothing to do, all the work is done
799 * automatically when we return from the interrupt. */
800static void
801smp_reschedule_interrupt(void)
802{
803 /* do nothing */
804}
805
806static struct mm_struct * flush_mm;
807static unsigned long flush_va;
808static DEFINE_SPINLOCK(tlbstate_lock);
809#define FLUSH_ALL 0xffffffff
810
811/*
812 * We cannot call mmdrop() because we are in interrupt context,
813 * instead update mm->cpu_vm_mask.
814 *
815 * We need to reload %cr3 since the page tables may be going
816 * away from under us..
817 */
818static inline void
819leave_mm (unsigned long cpu)
820{
821 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
822 BUG();
823 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
824 load_cr3(swapper_pg_dir);
825}
826
827
828/*
829 * Invalidate call-back
830 */
831static void
832smp_invalidate_interrupt(void)
833{
834 __u8 cpu = smp_processor_id();
835
836 if (!test_bit(cpu, &smp_invalidate_needed))
837 return;
838 /* This will flood messages. Don't uncomment unless you see
839	 * problems with cross cpu invalidation
840 VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n",
841 smp_processor_id()));
842 */
843
844 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
845 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
846 if (flush_va == FLUSH_ALL)
847 local_flush_tlb();
848 else
849 __flush_tlb_one(flush_va);
850 } else
851 leave_mm(cpu);
852 }
853 smp_mb__before_clear_bit();
854 clear_bit(cpu, &smp_invalidate_needed);
855 smp_mb__after_clear_bit();
856}
857
858/* All the new flush operations for 2.4 */
859
860
861/* This routine is called with a physical cpu mask */
862static void
863voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
864 unsigned long va)
865{
866 int stuck = 50000;
867
868 if (!cpumask)
869 BUG();
870 if ((cpumask & cpus_addr(cpu_online_map)[0]) != cpumask)
871 BUG();
872 if (cpumask & (1 << smp_processor_id()))
873 BUG();
874 if (!mm)
875 BUG();
876
877 spin_lock(&tlbstate_lock);
878
879 flush_mm = mm;
880 flush_va = va;
881 atomic_set_mask(cpumask, &smp_invalidate_needed);
882 /*
883 * We have to send the CPI only to
884 * CPUs affected.
885 */
886 send_CPI(cpumask, VIC_INVALIDATE_CPI);
887
888 while (smp_invalidate_needed) {
889 mb();
890 if(--stuck == 0) {
891 printk("***WARNING*** Stuck doing invalidate CPI (CPU%d)\n", smp_processor_id());
892 break;
893 }
894 }
895
896 /* Uncomment only to debug invalidation problems
897 VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu));
898 */
899
900 flush_mm = NULL;
901 flush_va = 0;
902 spin_unlock(&tlbstate_lock);
903}
904
905void
906flush_tlb_current_task(void)
907{
908 struct mm_struct *mm = current->mm;
909 unsigned long cpu_mask;
910
911 preempt_disable();
912
913 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
914 local_flush_tlb();
915 if (cpu_mask)
916 voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
917
918 preempt_enable();
919}
920
921
922void
923flush_tlb_mm (struct mm_struct * mm)
924{
925 unsigned long cpu_mask;
926
927 preempt_disable();
928
929 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
930
931 if (current->active_mm == mm) {
932 if (current->mm)
933 local_flush_tlb();
934 else
935 leave_mm(smp_processor_id());
936 }
937 if (cpu_mask)
938 voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
939
940 preempt_enable();
941}
942
943void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
944{
945 struct mm_struct *mm = vma->vm_mm;
946 unsigned long cpu_mask;
947
948 preempt_disable();
949
950 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
951 if (current->active_mm == mm) {
952 if(current->mm)
953 __flush_tlb_one(va);
954 else
955 leave_mm(smp_processor_id());
956 }
957
958 if (cpu_mask)
959 voyager_flush_tlb_others(cpu_mask, mm, va);
960
961 preempt_enable();
962}
963EXPORT_SYMBOL(flush_tlb_page);
964
965/* enable the requested IRQs */
966static void
967smp_enable_irq_interrupt(void)
968{
969 __u8 irq;
970 __u8 cpu = get_cpu();
971
972 VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu,
973 vic_irq_enable_mask[cpu]));
974
975 spin_lock(&vic_irq_lock);
976 for(irq = 0; irq < 16; irq++) {
977 if(vic_irq_enable_mask[cpu] & (1<<irq))
978 enable_local_vic_irq(irq);
979 }
980 vic_irq_enable_mask[cpu] = 0;
981 spin_unlock(&vic_irq_lock);
982
983 put_cpu_no_resched();
984}
985
986/*
987 * CPU halt call-back
988 */
989static void
990smp_stop_cpu_function(void *dummy)
991{
992 VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id()));
993 cpu_clear(smp_processor_id(), cpu_online_map);
994 local_irq_disable();
995 for(;;)
996 halt();
997}
998
999static DEFINE_SPINLOCK(call_lock);
1000
1001struct call_data_struct {
1002 void (*func) (void *info);
1003 void *info;
1004 volatile unsigned long started;
1005 volatile unsigned long finished;
1006 int wait;
1007};
1008
1009static struct call_data_struct * call_data;
1010
1011/* execute a thread on a new CPU. The function to be called must be
1012 * previously set up. This is used to schedule a function for
1013 * execution on all CPUs - set up the function, then broadcast a
1014 * function_interrupt CPI to come here on each CPU */
1015static void
1016smp_call_function_interrupt(void)
1017{
1018 void (*func) (void *info) = call_data->func;
1019 void *info = call_data->info;
1020 /* must take copy of wait because call_data may be replaced
1021 * unless the function is waiting for us to finish */
1022 int wait = call_data->wait;
1023 __u8 cpu = smp_processor_id();
1024
1025 /*
1026 * Notify initiating CPU that I've grabbed the data and am
1027 * about to execute the function
1028 */
1029 mb();
1030 if(!test_and_clear_bit(cpu, &call_data->started)) {
1031 /* If the bit wasn't set, this could be a replay */
1032		printk(KERN_WARNING "VOYAGER SMP: CPU %d received call function with no call pending\n", cpu);
1033 return;
1034 }
1035 /*
1036 * At this point the info structure may be out of scope unless wait==1
1037 */
1038 irq_enter();
1039 (*func)(info);
1040 irq_exit();
1041 if (wait) {
1042 mb();
1043 clear_bit(cpu, &call_data->finished);
1044 }
1045}
1046
1047static int
1048voyager_smp_call_function_mask (cpumask_t cpumask,
1049 void (*func) (void *info), void *info,
1050 int wait)
1051{
1052 struct call_data_struct data;
1053 u32 mask = cpus_addr(cpumask)[0];
1054
1055 mask &= ~(1<<smp_processor_id());
1056
1057 if (!mask)
1058 return 0;
1059
1060 /* Can deadlock when called with interrupts disabled */
1061 WARN_ON(irqs_disabled());
1062
1063 data.func = func;
1064 data.info = info;
1065 data.started = mask;
1066 data.wait = wait;
1067 if (wait)
1068 data.finished = mask;
1069
1070 spin_lock(&call_lock);
1071 call_data = &data;
1072 wmb();
1073 /* Send a message to all other CPUs and wait for them to respond */
1074 send_CPI(mask, VIC_CALL_FUNCTION_CPI);
1075
1076 /* Wait for response */
1077 while (data.started)
1078 barrier();
1079
1080 if (wait)
1081 while (data.finished)
1082 barrier();
1083
1084 spin_unlock(&call_lock);
1085
1086 return 0;
1087}
1088
1089/* Sorry about the name. In an APIC based system, the APICs
1090 * themselves are programmed to send a timer interrupt. This is used
1091 * by linux to reschedule the processor. Voyager doesn't have this,
1092 * so we use the system clock to interrupt one processor, which in
1093 * turn, broadcasts a timer CPI to all the others --- we receive that
1094 * CPI here. We don't use this actually for counting so losing
1095 * CPI here. We don't actually use this for counting, so losing
1096 *
1097 * FIXME: For those CPU's which actually have a local APIC, we could
1098 * try to use it to trigger this interrupt instead of having to
1099 * broadcast the timer tick. Unfortunately, all my pentium DYADs have
1100 * no local APIC, so I can't do this
1101 *
1102 * This function is currently a placeholder and is unused in the code */
1103fastcall void
1104smp_apic_timer_interrupt(struct pt_regs *regs)
1105{
1106 struct pt_regs *old_regs = set_irq_regs(regs);
1107 wrapper_smp_local_timer_interrupt();
1108 set_irq_regs(old_regs);
1109}
1110
1111/* All of the QUAD interrupt GATES */
1112fastcall void
1113smp_qic_timer_interrupt(struct pt_regs *regs)
1114{
1115 struct pt_regs *old_regs = set_irq_regs(regs);
1116 ack_QIC_CPI(QIC_TIMER_CPI);
1117 wrapper_smp_local_timer_interrupt();
1118 set_irq_regs(old_regs);
1119}
1120
1121fastcall void
1122smp_qic_invalidate_interrupt(struct pt_regs *regs)
1123{
1124 ack_QIC_CPI(QIC_INVALIDATE_CPI);
1125 smp_invalidate_interrupt();
1126}
1127
1128fastcall void
1129smp_qic_reschedule_interrupt(struct pt_regs *regs)
1130{
1131 ack_QIC_CPI(QIC_RESCHEDULE_CPI);
1132 smp_reschedule_interrupt();
1133}
1134
1135fastcall void
1136smp_qic_enable_irq_interrupt(struct pt_regs *regs)
1137{
1138 ack_QIC_CPI(QIC_ENABLE_IRQ_CPI);
1139 smp_enable_irq_interrupt();
1140}
1141
1142fastcall void
1143smp_qic_call_function_interrupt(struct pt_regs *regs)
1144{
1145 ack_QIC_CPI(QIC_CALL_FUNCTION_CPI);
1146 smp_call_function_interrupt();
1147}
1148
1149fastcall void
1150smp_vic_cpi_interrupt(struct pt_regs *regs)
1151{
1152 struct pt_regs *old_regs = set_irq_regs(regs);
1153 __u8 cpu = smp_processor_id();
1154
1155 if(is_cpu_quad())
1156 ack_QIC_CPI(VIC_CPI_LEVEL0);
1157 else
1158 ack_VIC_CPI(VIC_CPI_LEVEL0);
1159
1160 if(test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu]))
1161 wrapper_smp_local_timer_interrupt();
1162 if(test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu]))
1163 smp_invalidate_interrupt();
1164 if(test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu]))
1165 smp_reschedule_interrupt();
1166 if(test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu]))
1167 smp_enable_irq_interrupt();
1168 if(test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu]))
1169 smp_call_function_interrupt();
1170 set_irq_regs(old_regs);
1171}
1172
1173static void
1174do_flush_tlb_all(void* info)
1175{
1176 unsigned long cpu = smp_processor_id();
1177
1178 __flush_tlb_all();
1179 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
1180 leave_mm(cpu);
1181}
1182
1183
1184/* flush the TLB of every active CPU in the system */
1185void
1186flush_tlb_all(void)
1187{
1188 on_each_cpu(do_flush_tlb_all, 0, 1, 1);
1189}
1190
1191/* used to set up the trampoline for other CPUs when the memory manager
1192 * is sorted out */
1193void __init
1194smp_alloc_memory(void)
1195{
1196 trampoline_base = (__u32)alloc_bootmem_low_pages(PAGE_SIZE);
1197 if(__pa(trampoline_base) >= 0x93000)
1198 BUG();
1199}
1200
1201/* send a reschedule CPI to one CPU by physical CPU number*/
1202static void
1203voyager_smp_send_reschedule(int cpu)
1204{
1205 send_one_CPI(cpu, VIC_RESCHEDULE_CPI);
1206}
1207
1208
1209int
1210hard_smp_processor_id(void)
1211{
1212 __u8 i;
1213 __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
1214 if((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER)
1215 return cpumask & 0x1F;
1216
1217 for(i = 0; i < 8; i++) {
1218 if(cpumask & (1<<i))
1219 return i;
1220 }
1221 printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask);
1222 return 0;
1223}
1224
1225int
1226safe_smp_processor_id(void)
1227{
1228 return hard_smp_processor_id();
1229}
1230
1231/* broadcast a halt to all other CPUs */
1232static void
1233voyager_smp_send_stop(void)
1234{
1235 smp_call_function(smp_stop_cpu_function, NULL, 1, 1);
1236}
1237
1238/* this function is triggered in time.c when a clock tick fires
1239 * we need to re-broadcast the tick to all CPUs */
1240void
1241smp_vic_timer_interrupt(void)
1242{
1243 send_CPI_allbutself(VIC_TIMER_CPI);
1244 smp_local_timer_interrupt();
1245}
1246
1247/* local (per CPU) timer interrupt. It does both profiling and
1248 * process statistics/rescheduling.
1249 *
1250 * We do profiling in every local tick, statistics/rescheduling
1251 * happen only every 'profiling multiplier' ticks. The default
1252 * multiplier is 1 and it can be changed by writing the new multiplier
1253 * value into /proc/profile.
1254 */
1255void
1256smp_local_timer_interrupt(void)
1257{
1258 int cpu = smp_processor_id();
1259 long weight;
1260
1261 profile_tick(CPU_PROFILING);
1262 if (--per_cpu(prof_counter, cpu) <= 0) {
1263 /*
1264 * The multiplier may have changed since the last time we got
1265 * to this point as a result of the user writing to
1266 * /proc/profile. In this case we need to adjust the APIC
1267 * timer accordingly.
1268 *
1269 * Interrupts are already masked off at this point.
1270 */
1271 per_cpu(prof_counter,cpu) = per_cpu(prof_multiplier, cpu);
1272 if (per_cpu(prof_counter, cpu) !=
1273 per_cpu(prof_old_multiplier, cpu)) {
1274 /* FIXME: need to update the vic timer tick here */
1275 per_cpu(prof_old_multiplier, cpu) =
1276 per_cpu(prof_counter, cpu);
1277 }
1278
1279 update_process_times(user_mode_vm(get_irq_regs()));
1280 }
1281
1282 if( ((1<<cpu) & voyager_extended_vic_processors) == 0)
1283 /* only extended VIC processors participate in
1284 * interrupt distribution */
1285 return;
1286
1287 /*
1288 * We take the 'long' return path, and there every subsystem
1289	 * grabs the appropriate locks (kernel lock/ irq lock).
1290 *
1291 * we might want to decouple profiling from the 'long path',
1292 * and do the profiling totally in assembly.
1293 *
1294 * Currently this isn't too much of an issue (performance wise),
1295 * we can take more than 100K local irqs per second on a 100 MHz P5.
1296 */
1297
1298 if((++vic_tick[cpu] & 0x7) != 0)
1299 return;
1300 /* get here every 16 ticks (about every 1/6 of a second) */
1301
1302 /* Change our priority to give someone else a chance at getting
1303 * the IRQ. The algorithm goes like this:
1304 *
1305 * In the VIC, the dynamically routed interrupt is always
1306 * handled by the lowest priority eligible (i.e. receiving
1307 * interrupts) CPU. If >1 eligible CPUs are equal lowest, the
1308 * lowest processor number gets it.
1309 *
1310 * The priority of a CPU is controlled by a special per-CPU
1311	 * VIC priority register which is 3 bits wide, 0 being the lowest
1312	 * and 7 the highest priority.
1313 *
1314 * Therefore we subtract the average number of interrupts from
1315 * the number we've fielded. If this number is negative, we
1316 * lower the activity count and if it is positive, we raise
1317 * it.
1318 *
1319 * I'm afraid this still leads to odd looking interrupt counts:
1320 * the totals are all roughly equal, but the individual ones
1321 * look rather skewed.
1322 *
1323 * FIXME: This algorithm is total crap when mixed with SMP
1324 * affinity code since we now try to even up the interrupt
1325 * counts when an affinity binding is keeping them on a
1326 * particular CPU*/
1327 weight = (vic_intr_count[cpu]*voyager_extended_cpus
1328 - vic_intr_total) >> 4;
1329 weight += 4;
1330 if(weight > 7)
1331 weight = 7;
1332 if(weight < 0)
1333 weight = 0;
1334
1335 outb((__u8)weight, VIC_PRIORITY_REGISTER);
1336
1337#ifdef VOYAGER_DEBUG
1338 if((vic_tick[cpu] & 0xFFF) == 0) {
1339 /* print this message roughly every 25 secs */
1340 printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n",
1341 cpu, vic_tick[cpu], weight);
1342 }
1343#endif
1344}
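/* Editor's worked example of the weight computation above (made-up
 * numbers, purely illustrative, not from any real trace): on a machine
 * with 4 extended CPUs, if this CPU has fielded 30 interrupts while
 * vic_intr_total is 80, then (30*4 - 80) >> 4 = 40 >> 4 = 2, and adding
 * the bias of 4 gives a priority value of 6.  Because the VIC hands a
 * dynamically routed interrupt to the lowest-priority eligible CPU, a
 * busier-than-average CPU ends up with a larger priority value and is
 * steered away from the next interrupt. */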
1345
1346/* setup the profiling timer */
1347int
1348setup_profiling_timer(unsigned int multiplier)
1349{
1350 int i;
1351
1352 if ( (!multiplier))
1353 return -EINVAL;
1354
1355 /*
1356 * Set the new multiplier for each CPU. CPUs don't start using the
1357 * new values until the next timer interrupt in which they do process
1358 * accounting.
1359 */
1360 for (i = 0; i < NR_CPUS; ++i)
1361 per_cpu(prof_multiplier, i) = multiplier;
1362
1363 return 0;
1364}
1365
1366/* This is a bit of a mess, but forced on us by the genirq changes:
1367 * there's no genirq handler that really does what voyager wants,
1368 * so hack it up with the simple IRQ handler */
1369static void fastcall
1370handle_vic_irq(unsigned int irq, struct irq_desc *desc)
1371{
1372 before_handle_vic_irq(irq);
1373 handle_simple_irq(irq, desc);
1374 after_handle_vic_irq(irq);
1375}
1376
1377
1378/* The CPIs are handled in the per cpu 8259s, so they must be
1379 * enabled to be received: FIX: enabling the CPIs in the early
1380 * boot sequence interferes with bug checking; enable them later
1381 * on in smp_init */
1382#define VIC_SET_GATE(cpi, vector) \
1383 set_intr_gate((cpi) + VIC_DEFAULT_CPI_BASE, (vector))
1384#define QIC_SET_GATE(cpi, vector) \
1385 set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector))
1386
1387void __init
1388smp_intr_init(void)
1389{
1390 int i;
1391
1392 /* initialize the per cpu irq mask to all disabled */
1393 for(i = 0; i < NR_CPUS; i++)
1394 vic_irq_mask[i] = 0xFFFF;
1395
1396 VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt);
1397
1398 VIC_SET_GATE(VIC_SYS_INT, vic_sys_interrupt);
1399 VIC_SET_GATE(VIC_CMN_INT, vic_cmn_interrupt);
1400
1401 QIC_SET_GATE(QIC_TIMER_CPI, qic_timer_interrupt);
1402 QIC_SET_GATE(QIC_INVALIDATE_CPI, qic_invalidate_interrupt);
1403 QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt);
1404 QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt);
1405 QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt);
1406
1407
1408 /* now put the VIC descriptor into the first 48 IRQs
1409 *
1410 * This is for later: first 16 correspond to PC IRQs; next 16
1411 * are Primary MC IRQs and final 16 are Secondary MC IRQs */
1412 for(i = 0; i < 48; i++)
1413 set_irq_chip_and_handler(i, &vic_chip, handle_vic_irq);
1414}
1415
1416/* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per
1417 * processor to receive the CPI) */
1418static void
1419send_CPI(__u32 cpuset, __u8 cpi)
1420{
1421 int cpu;
1422 __u32 quad_cpuset = (cpuset & voyager_quad_processors);
1423
1424 if(cpi < VIC_START_FAKE_CPI) {
1425 /* fake CPI are only used for booting, so send to the
1426 * extended quads as well---Quads must be VIC booted */
1427 outb((__u8)(cpuset), VIC_CPI_Registers[cpi]);
1428 return;
1429 }
1430 if(quad_cpuset)
1431 send_QIC_CPI(quad_cpuset, cpi);
1432 cpuset &= ~quad_cpuset;
1433	cpuset &= 0xff;		/* only first 8 CPUs valid for VIC CPI */
1434 if(cpuset == 0)
1435 return;
1436 for_each_online_cpu(cpu) {
1437 if(cpuset & (1<<cpu))
1438 set_bit(cpi, &vic_cpi_mailbox[cpu]);
1439 }
1440 if(cpuset)
1441 outb((__u8)cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]);
1442}
1443
1444/* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and
1445 * set the cache line to shared by reading it.
1446 *
1447 * DON'T make this inline otherwise the cache line read will be
1448 * optimised away
1449 * */
1450static int
1451ack_QIC_CPI(__u8 cpi) {
1452 __u8 cpu = hard_smp_processor_id();
1453
1454 cpi &= 7;
1455
1456 outb(1<<cpi, QIC_INTERRUPT_CLEAR1);
1457 return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi;
1458}
1459
1460static void
1461ack_special_QIC_CPI(__u8 cpi)
1462{
1463 switch(cpi) {
1464 case VIC_CMN_INT:
1465 outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0);
1466 break;
1467 case VIC_SYS_INT:
1468 outb(QIC_SYS_INT, QIC_INTERRUPT_CLEAR0);
1469 break;
1470 }
1471 /* also clear at the VIC, just in case (nop for non-extended proc) */
1472 ack_VIC_CPI(cpi);
1473}
1474
1475/* Acknowledge receipt of CPI in the VIC (essentially an EOI) */
1476static void
1477ack_VIC_CPI(__u8 cpi)
1478{
1479#ifdef VOYAGER_DEBUG
1480 unsigned long flags;
1481 __u16 isr;
1482 __u8 cpu = smp_processor_id();
1483
1484 local_irq_save(flags);
1485 isr = vic_read_isr();
1486 if((isr & (1<<(cpi &7))) == 0) {
1487 printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi);
1488 }
1489#endif
1490 /* send specific EOI; the two system interrupts have
1491 * bit 4 set for a separate vector but behave as the
1492 * corresponding 3 bit intr */
1493 outb_p(0x60|(cpi & 7),0x20);
1494
1495#ifdef VOYAGER_DEBUG
1496 if((vic_read_isr() & (1<<(cpi &7))) != 0) {
1497 printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi);
1498 }
1499 local_irq_restore(flags);
1500#endif
1501}
1502
1503/* cribbed with thanks from irq.c */
1504#define __byte(x,y) (((unsigned char *)&(y))[x])
1505#define cached_21(cpu) (__byte(0,vic_irq_mask[cpu]))
1506#define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu]))
1507
1508static unsigned int
1509startup_vic_irq(unsigned int irq)
1510{
1511 unmask_vic_irq(irq);
1512
1513 return 0;
1514}
1515
1516/* The enable and disable routines. This is where we run into
1517 * conflicting architectural philosophy. Fundamentally, the voyager
1518 * architecture does not expect to have to disable interrupts globally
1519 * (the IRQ controllers belong to each CPU). The processor masquerade
1520 * which is used to start the system shouldn't be used in a running OS
1521 * since it will cause great confusion if two separate CPUs drive to
1522 * the same IRQ controller (I know, I've tried it).
1523 *
1524 * The solution is a variant on the NCR lazy SPL design:
1525 *
1526 * 1) To disable an interrupt, do nothing (other than set the
1527 * IRQ_DISABLED flag). This dares the interrupt actually to arrive.
1528 *
1529 * 2) If the interrupt dares to come in, raise the local mask against
1530 * it (this will result in all the CPU masks being raised
1531 * eventually).
1532 *
1533 * 3) To enable the interrupt, lower the mask on the local CPU and
1534 * broadcast an Interrupt enable CPI which causes all other CPUs to
1535 * adjust their masks accordingly. */
1536
1537static void
1538unmask_vic_irq(unsigned int irq)
1539{
1540	/* linux doesn't do processor-irq affinity, so enable on
1541 * all CPUs we know about */
1542 int cpu = smp_processor_id(), real_cpu;
1543 __u16 mask = (1<<irq);
1544 __u32 processorList = 0;
1545 unsigned long flags;
1546
1547 VDEBUG(("VOYAGER: unmask_vic_irq(%d) CPU%d affinity 0x%lx\n",
1548 irq, cpu, cpu_irq_affinity[cpu]));
1549 spin_lock_irqsave(&vic_irq_lock, flags);
1550 for_each_online_cpu(real_cpu) {
1551 if(!(voyager_extended_vic_processors & (1<<real_cpu)))
1552 continue;
1553 if(!(cpu_irq_affinity[real_cpu] & mask)) {
1554 /* irq has no affinity for this CPU, ignore */
1555 continue;
1556 }
1557 if(real_cpu == cpu) {
1558 enable_local_vic_irq(irq);
1559 }
1560 else if(vic_irq_mask[real_cpu] & mask) {
1561 vic_irq_enable_mask[real_cpu] |= mask;
1562 processorList |= (1<<real_cpu);
1563 }
1564 }
1565 spin_unlock_irqrestore(&vic_irq_lock, flags);
1566 if(processorList)
1567 send_CPI(processorList, VIC_ENABLE_IRQ_CPI);
1568}
1569
1570static void
1571mask_vic_irq(unsigned int irq)
1572{
1573 /* lazy disable, do nothing */
1574}
1575
1576static void
1577enable_local_vic_irq(unsigned int irq)
1578{
1579 __u8 cpu = smp_processor_id();
1580 __u16 mask = ~(1 << irq);
1581 __u16 old_mask = vic_irq_mask[cpu];
1582
1583 vic_irq_mask[cpu] &= mask;
1584 if(vic_irq_mask[cpu] == old_mask)
1585 return;
1586
1587 VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n",
1588 irq, cpu));
1589
1590 if (irq & 8) {
1591 outb_p(cached_A1(cpu),0xA1);
1592 (void)inb_p(0xA1);
1593 }
1594 else {
1595 outb_p(cached_21(cpu),0x21);
1596 (void)inb_p(0x21);
1597 }
1598}
1599
1600static void
1601disable_local_vic_irq(unsigned int irq)
1602{
1603 __u8 cpu = smp_processor_id();
1604 __u16 mask = (1 << irq);
1605 __u16 old_mask = vic_irq_mask[cpu];
1606
1607 if(irq == 7)
1608 return;
1609
1610 vic_irq_mask[cpu] |= mask;
1611 if(old_mask == vic_irq_mask[cpu])
1612 return;
1613
1614 VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n",
1615 irq, cpu));
1616
1617 if (irq & 8) {
1618 outb_p(cached_A1(cpu),0xA1);
1619 (void)inb_p(0xA1);
1620 }
1621 else {
1622 outb_p(cached_21(cpu),0x21);
1623 (void)inb_p(0x21);
1624 }
1625}
1626
1627/* The VIC is level triggered, so the ack can only be issued after the
1628 * interrupt completes. However, we do Voyager lazy interrupt
1629 * handling here: It is an extremely expensive operation to mask an
1630 * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If
1631 * this interrupt actually comes in, then we mask and ack here to push
1632 * the interrupt off to another CPU */
1633static void
1634before_handle_vic_irq(unsigned int irq)
1635{
1636 irq_desc_t *desc = irq_desc + irq;
1637 __u8 cpu = smp_processor_id();
1638
1639 _raw_spin_lock(&vic_irq_lock);
1640 vic_intr_total++;
1641 vic_intr_count[cpu]++;
1642
1643 if(!(cpu_irq_affinity[cpu] & (1<<irq))) {
1644 /* The irq is not in our affinity mask, push it off
1645 * onto another CPU */
1646 VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d on cpu %d\n",
1647 irq, cpu));
1648 disable_local_vic_irq(irq);
1649 /* set IRQ_INPROGRESS to prevent the handler in irq.c from
1650 * actually calling the interrupt routine */
1651 desc->status |= IRQ_REPLAY | IRQ_INPROGRESS;
1652 } else if(desc->status & IRQ_DISABLED) {
1653 /* Damn, the interrupt actually arrived, do the lazy
1654 * disable thing. The interrupt routine in irq.c will
1655		 * not handle an IRQ_DISABLED interrupt, so nothing more
1656 * need be done here */
1657 VDEBUG(("VOYAGER DEBUG: lazy disable of irq %d on CPU %d\n",
1658 irq, cpu));
1659 disable_local_vic_irq(irq);
1660 desc->status |= IRQ_REPLAY;
1661 } else {
1662 desc->status &= ~IRQ_REPLAY;
1663 }
1664
1665 _raw_spin_unlock(&vic_irq_lock);
1666}
1667
1668/* Finish the VIC interrupt: basically mask */
1669static void
1670after_handle_vic_irq(unsigned int irq)
1671{
1672 irq_desc_t *desc = irq_desc + irq;
1673
1674 _raw_spin_lock(&vic_irq_lock);
1675 {
1676 unsigned int status = desc->status & ~IRQ_INPROGRESS;
1677#ifdef VOYAGER_DEBUG
1678 __u16 isr;
1679#endif
1680
1681 desc->status = status;
1682 if ((status & IRQ_DISABLED))
1683 disable_local_vic_irq(irq);
1684#ifdef VOYAGER_DEBUG
1685 /* DEBUG: before we ack, check what's in progress */
1686 isr = vic_read_isr();
1687 if((isr & (1<<irq) && !(status & IRQ_REPLAY)) == 0) {
1688 int i;
1689 __u8 cpu = smp_processor_id();
1690 __u8 real_cpu;
1691 int mask; /* Um... initialize me??? --RR */
1692
1693 printk("VOYAGER SMP: CPU%d lost interrupt %d\n",
1694 cpu, irq);
1695 for_each_possible_cpu(real_cpu, mask) {
1696
1697 outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu,
1698 VIC_PROCESSOR_ID);
1699 isr = vic_read_isr();
1700 if(isr & (1<<irq)) {
1701 printk("VOYAGER SMP: CPU%d ack irq %d\n",
1702 real_cpu, irq);
1703 ack_vic_irq(irq);
1704 }
1705 outb(cpu, VIC_PROCESSOR_ID);
1706 }
1707 }
1708#endif /* VOYAGER_DEBUG */
1709 /* as soon as we ack, the interrupt is eligible for
1710 * receipt by another CPU so everything must be in
1711 * order here */
1712 ack_vic_irq(irq);
1713 if(status & IRQ_REPLAY) {
1714 /* replay is set if we disable the interrupt
1715 * in the before_handle_vic_irq() routine, so
1716 * clear the in progress bit here to allow the
1717 * next CPU to handle this correctly */
1718 desc->status &= ~(IRQ_REPLAY | IRQ_INPROGRESS);
1719 }
1720#ifdef VOYAGER_DEBUG
1721 isr = vic_read_isr();
1722 if((isr & (1<<irq)) != 0)
1723 printk("VOYAGER SMP: after_handle_vic_irq() after ack irq=%d, isr=0x%x\n",
1724 irq, isr);
1725#endif /* VOYAGER_DEBUG */
1726 }
1727 _raw_spin_unlock(&vic_irq_lock);
1728
1729 /* All code after this point is out of the main path - the IRQ
1730 * may be intercepted by another CPU if reasserted */
1731}
1732
1733
1734/* Linux processor - interrupt affinity manipulations.
1735 *
1736 * For each processor, we maintain a 32 bit irq affinity mask.
1737 * Initially it is set to all 1's so every processor accepts every
1738 * interrupt. In this call, we change the processor's affinity mask:
1739 *
1740 * Change from enable to disable:
1741 *
1742 * If the interrupt ever comes in to the processor, we will disable it
1743 * and ack it to push it off to another CPU, so just accept the mask here.
1744 *
1745 * Change from disable to enable:
1746 *
1747 * change the mask and then do an interrupt enable CPI to re-enable on
1748 * the selected processors */
1749
1750void
1751set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1752{
1753 /* Only extended processors handle interrupts */
1754 unsigned long real_mask;
1755 unsigned long irq_mask = 1 << irq;
1756 int cpu;
1757
1758 real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors;
1759
1760 if(cpus_addr(mask)[0] == 0)
1761 /* can't have no cpu's to accept the interrupt -- extremely
1762 * bad things will happen */
1763 return;
1764
1765 if(irq == 0)
1766 /* can't change the affinity of the timer IRQ. This
1767 * is due to the constraint in the voyager
1768		 * architecture that the CPI also comes in on an IRQ
1769 * line and we have chosen IRQ0 for this. If you
1770 * raise the mask on this interrupt, the processor
1771		 * will no longer be able to accept VIC CPIs */
1772 return;
1773
1774 if(irq >= 32)
1775 /* You can only have 32 interrupts in a voyager system
1776 * (and 32 only if you have a secondary microchannel
1777 * bus) */
1778 return;
1779
1780 for_each_online_cpu(cpu) {
1781 unsigned long cpu_mask = 1 << cpu;
1782
1783 if(cpu_mask & real_mask) {
1784 /* enable the interrupt for this cpu */
1785 cpu_irq_affinity[cpu] |= irq_mask;
1786 } else {
1787 /* disable the interrupt for this cpu */
1788 cpu_irq_affinity[cpu] &= ~irq_mask;
1789 }
1790 }
1791 /* this is magic, we now have the correct affinity maps, so
1792 * enable the interrupt. This will send an enable CPI to
1793 * those cpu's who need to enable it in their local masks,
1794	 * causing them to correct for the new affinity. If the
1795 * interrupt is currently globally disabled, it will simply be
1796 * disabled again as it comes in (voyager lazy disable). If
1797 * the affinity map is tightened to disable the interrupt on a
1798 * cpu, it will be pushed off when it comes in */
1799 unmask_vic_irq(irq);
1800}
1801
1802static void
1803ack_vic_irq(unsigned int irq)
1804{
1805 if (irq & 8) {
1806 outb(0x62,0x20); /* Specific EOI to cascade */
1807 outb(0x60|(irq & 7),0xA0);
1808 } else {
1809 outb(0x60 | (irq & 7),0x20);
1810 }
1811}
1812
1813/* enable the CPIs. In the VIC, the CPIs are delivered by the 8259
1814 * but are not vectored by it. This means that the 8259 mask must be
1815 * lowered to receive them */
1816static __init void
1817vic_enable_cpi(void)
1818{
1819 __u8 cpu = smp_processor_id();
1820
1821 /* just take a copy of the current mask (nop for boot cpu) */
1822 vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id];
1823
1824 enable_local_vic_irq(VIC_CPI_LEVEL0);
1825 enable_local_vic_irq(VIC_CPI_LEVEL1);
1826 /* for sys int and cmn int */
1827 enable_local_vic_irq(7);
1828
1829 if(is_cpu_quad()) {
1830 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
1831 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
1832 VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n",
1833 cpu, QIC_CPI_ENABLE));
1834 }
1835
1836 VDEBUG(("VOYAGER SMP: ENABLE CPI: CPU%d: MASK 0x%x\n",
1837 cpu, vic_irq_mask[cpu]));
1838}
1839
1840void
1841voyager_smp_dump()
1842{
1843 int old_cpu = smp_processor_id(), cpu;
1844
1845 /* dump the interrupt masks of each processor */
1846 for_each_online_cpu(cpu) {
1847 __u16 imr, isr, irr;
1848 unsigned long flags;
1849
1850 local_irq_save(flags);
1851 outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID);
1852 imr = (inb(0xa1) << 8) | inb(0x21);
1853 outb(0x0a, 0xa0);
1854 irr = inb(0xa0) << 8;
1855 outb(0x0a, 0x20);
1856 irr |= inb(0x20);
1857 outb(0x0b, 0xa0);
1858 isr = inb(0xa0) << 8;
1859 outb(0x0b, 0x20);
1860 isr |= inb(0x20);
1861 outb(old_cpu, VIC_PROCESSOR_ID);
1862 local_irq_restore(flags);
1863 printk("\tCPU%d: mask=0x%x, IMR=0x%x, IRR=0x%x, ISR=0x%x\n",
1864 cpu, vic_irq_mask[cpu], imr, irr, isr);
1865#if 0
1866		/* These lines are put in to try to unstick an un-ack'd irq */
1867 if(isr != 0) {
1868 int irq;
1869 for(irq=0; irq<16; irq++) {
1870 if(isr & (1<<irq)) {
1871 printk("\tCPU%d: ack irq %d\n",
1872 cpu, irq);
1873 local_irq_save(flags);
1874 outb(VIC_CPU_MASQUERADE_ENABLE | cpu,
1875 VIC_PROCESSOR_ID);
1876 ack_vic_irq(irq);
1877 outb(old_cpu, VIC_PROCESSOR_ID);
1878 local_irq_restore(flags);
1879 }
1880 }
1881 }
1882#endif
1883 }
1884}
1885
1886void
1887smp_voyager_power_off(void *dummy)
1888{
1889 if(smp_processor_id() == boot_cpu_id)
1890 voyager_power_off();
1891 else
1892 smp_stop_cpu_function(NULL);
1893}
1894
1895static void __init
1896voyager_smp_prepare_cpus(unsigned int max_cpus)
1897{
1898 /* FIXME: ignore max_cpus for now */
1899 smp_boot_cpus();
1900}
1901
1902static void __devinit voyager_smp_prepare_boot_cpu(void)
1903{
1904 init_gdt(smp_processor_id());
1905 switch_to_new_gdt();
1906
1907 cpu_set(smp_processor_id(), cpu_online_map);
1908 cpu_set(smp_processor_id(), cpu_callout_map);
1909 cpu_set(smp_processor_id(), cpu_possible_map);
1910 cpu_set(smp_processor_id(), cpu_present_map);
1911}
1912
1913static int __devinit
1914voyager_cpu_up(unsigned int cpu)
1915{
1916 /* This only works at boot for x86. See "rewrite" above. */
1917 if (cpu_isset(cpu, smp_commenced_mask))
1918 return -ENOSYS;
1919
1920 /* In case one didn't come up */
1921 if (!cpu_isset(cpu, cpu_callin_map))
1922 return -EIO;
1923 /* Unleash the CPU! */
1924 cpu_set(cpu, smp_commenced_mask);
1925 while (!cpu_isset(cpu, cpu_online_map))
1926 mb();
1927 return 0;
1928}
1929
1930static void __init
1931voyager_smp_cpus_done(unsigned int max_cpus)
1932{
1933 zap_low_mappings();
1934}
1935
1936void __init
1937smp_setup_processor_id(void)
1938{
1939 current_thread_info()->cpu = hard_smp_processor_id();
1940 x86_write_percpu(cpu_number, hard_smp_processor_id());
1941}
1942
1943struct smp_ops smp_ops = {
1944 .smp_prepare_boot_cpu = voyager_smp_prepare_boot_cpu,
1945 .smp_prepare_cpus = voyager_smp_prepare_cpus,
1946 .cpu_up = voyager_cpu_up,
1947 .smp_cpus_done = voyager_smp_cpus_done,
1948
1949 .smp_send_stop = voyager_smp_send_stop,
1950 .smp_send_reschedule = voyager_smp_send_reschedule,
1951 .smp_call_function_mask = voyager_smp_call_function_mask,
1952};
diff --git a/arch/x86/mach-voyager/voyager_thread.c b/arch/x86/mach-voyager/voyager_thread.c
new file mode 100644
index 000000000000..f9d595338159
--- /dev/null
+++ b/arch/x86/mach-voyager/voyager_thread.c
@@ -0,0 +1,134 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 2001
4 *
5 * Author: J.E.J.Bottomley@HansenPartnership.com
6 *
7 * linux/arch/i386/kernel/voyager_thread.c
8 *
9 * This module provides the machine status monitor thread for the
10 * voyager architecture. This allows us to monitor the machine
11 * environment (temp, voltage, fan function) and the front panel and
12 * internal UPS. If a fault is detected, this thread takes corrective
13 * action (usually just informing init)
14 * */
15
16#include <linux/module.h>
17#include <linux/mm.h>
18#include <linux/kernel_stat.h>
19#include <linux/delay.h>
20#include <linux/mc146818rtc.h>
21#include <linux/init.h>
22#include <linux/bootmem.h>
23#include <linux/kmod.h>
24#include <linux/completion.h>
25#include <linux/sched.h>
26#include <linux/kthread.h>
27#include <asm/desc.h>
28#include <asm/voyager.h>
29#include <asm/vic.h>
30#include <asm/mtrr.h>
31#include <asm/msr.h>
32
33
34struct task_struct *voyager_thread;
35static __u8 set_timeout;
36
37static int
38execute(const char *string)
39{
40 int ret;
41
42 char *envp[] = {
43 "HOME=/",
44 "TERM=linux",
45 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
46 NULL,
47 };
48 char *argv[] = {
49 "/bin/bash",
50 "-c",
51 (char *)string,
52 NULL,
53 };
54
55 if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
56 printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
57 string, ret);
58 }
59 return ret;
60}
61
62static void
63check_from_kernel(void)
64{
65 if(voyager_status.switch_off) {
66
67		/* FIXME: This should be configurable via proc */
68 execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1");
69 } else if(voyager_status.power_fail) {
70 VDEBUG(("Voyager daemon detected AC power failure\n"));
71
72		/* FIXME: This should be configurable via proc */
73 execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1");
74 set_timeout = 1;
75 }
76}
77
78static void
79check_continuing_condition(void)
80{
81 if(voyager_status.power_fail) {
82 __u8 data;
83 voyager_cat_psi(VOYAGER_PSI_SUBREAD,
84 VOYAGER_PSI_AC_FAIL_REG, &data);
85 if((data & 0x1f) == 0) {
86 /* all power restored */
87 printk(KERN_NOTICE "VOYAGER AC power restored, cancelling shutdown\n");
88			/* FIXME: should be user configurable */
89 execute("umask 600; echo O > /etc/powerstatus; kill -PWR 1");
90 set_timeout = 0;
91 }
92 }
93}
94
95static int
96thread(void *unused)
97{
98 printk(KERN_NOTICE "Voyager starting monitor thread\n");
99
100 for (;;) {
101 set_current_state(TASK_INTERRUPTIBLE);
102 schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT);
103
104 VDEBUG(("Voyager Daemon awoken\n"));
105 if(voyager_status.request_from_kernel == 0) {
106 /* probably awoken from timeout */
107 check_continuing_condition();
108 } else {
109 check_from_kernel();
110 voyager_status.request_from_kernel = 0;
111 }
112 }
113}
114
115static int __init
116voyager_thread_start(void)
117{
118 voyager_thread = kthread_run(thread, NULL, "kvoyagerd");
119 if (IS_ERR(voyager_thread)) {
120 printk(KERN_ERR "Voyager: Failed to create system monitor thread.\n");
121 return PTR_ERR(voyager_thread);
122 }
123 return 0;
124}
125
126
127static void __exit
128voyager_thread_stop(void)
129{
130 kthread_stop(voyager_thread);
131}
132
133module_init(voyager_thread_start);
134module_exit(voyager_thread_stop);
diff --git a/arch/x86/math-emu/Makefile b/arch/x86/math-emu/Makefile
new file mode 100644
index 000000000000..9c943fa6ce6b
--- /dev/null
+++ b/arch/x86/math-emu/Makefile
@@ -0,0 +1,30 @@
1#
2# Makefile for wm-FPU-emu
3#
4
5#DEBUG = -DDEBUGGING
6DEBUG =
7PARANOID = -DPARANOID
8CFLAGS := $(CFLAGS) $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION)
9
10EXTRA_AFLAGS := $(PARANOID)
11
12# From 'C' language sources:
13C_OBJS =fpu_entry.o errors.o \
14 fpu_arith.o fpu_aux.o fpu_etc.o fpu_tags.o fpu_trig.o \
15 load_store.o get_address.o \
16 poly_atan.o poly_l2.o poly_2xm1.o poly_sin.o poly_tan.o \
17 reg_add_sub.o reg_compare.o reg_constant.o reg_convert.o \
18 reg_ld_str.o reg_divide.o reg_mul.o
19
20# From 80x86 assembler sources:
21A_OBJS =reg_u_add.o reg_u_div.o reg_u_mul.o reg_u_sub.o \
22 div_small.o reg_norm.o reg_round.o \
23 wm_shrx.o wm_sqrt.o \
24 div_Xsig.o polynom_Xsig.o round_Xsig.o \
25 shr_Xsig.o mul_Xsig.o
26
27obj-y =$(C_OBJS) $(A_OBJS)
28
29proto:
30 cproto -e -DMAKING_PROTO *.c >fpu_proto.h
diff --git a/arch/x86/math-emu/README b/arch/x86/math-emu/README
new file mode 100644
index 000000000000..e6235491d6eb
--- /dev/null
+++ b/arch/x86/math-emu/README
@@ -0,0 +1,427 @@
1 +---------------------------------------------------------------------------+
2 | wm-FPU-emu an FPU emulator for 80386 and 80486SX microprocessors. |
3 | |
4 | Copyright (C) 1992,1993,1994,1995,1996,1997,1999 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@melbpc.org.au |
7 | |
8 | This program is free software; you can redistribute it and/or modify |
9 | it under the terms of the GNU General Public License version 2 as |
10 | published by the Free Software Foundation. |
11 | |
12 | This program is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | GNU General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU General Public License |
18 | along with this program; if not, write to the Free Software |
19 | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
20 | |
21 +---------------------------------------------------------------------------+
22
23
24
25wm-FPU-emu is an FPU emulator for Linux. It is derived from wm-emu387
26which was my 80387 emulator for early versions of djgpp (gcc under
27msdos); wm-emu387 was in turn based upon emu387 which was written by
28DJ Delorie for djgpp. The interface to the Linux kernel is based upon
29the original Linux math emulator by Linus Torvalds.
30
31My target FPU for wm-FPU-emu is that described in the Intel486
32Programmer's Reference Manual (1992 edition). Unfortunately, numerous
33facets of the functioning of the FPU are not well covered in the
34Reference Manual. The information in the manual has been supplemented
35with measurements on real 80486's. Unfortunately, it is simply not
36possible to be sure that all of the peculiarities of the 80486 have
37been discovered, so there are always likely to be obscure differences
38in the detailed behaviour of the emulator and a real 80486.
39
40wm-FPU-emu does not implement all of the behaviour of the 80486 FPU,
41but is very close. See "Limitations" later in this file for a list of
42some differences.
43
44Please report bugs, etc to me at:
45 billm@melbpc.org.au
46or b.metzenthen@medoto.unimelb.edu.au
47
48For more information on the emulator and on floating point topics, see
49my web pages, currently at http://www.suburbia.net/~billm/
50
51
52--Bill Metzenthen
53 December 1999
54
55
56----------------------- Internals of wm-FPU-emu -----------------------
57
58Numeric algorithms:
59(1) Add, subtract, and multiply. Nothing remarkable in these.
60(2) Divide has been tuned to get reasonable performance. The algorithm
61 is not the obvious one which most people seem to use, but is designed
62 to take advantage of the characteristics of the 80386. I expect that
63 it has been invented many times before I discovered it, but I have not
64 seen it. It is based upon one of those ideas which one carries around
65 for years without ever bothering to check it out.
66(3) The sqrt function has been tuned to get good performance. It is based
67 upon Newton's classic method. Performance was improved by capitalizing
68 upon the properties of Newton's method, and the code is once again
69   structured taking account of the 80386 characteristics (a sketch of the basic iteration follows this list).
70(4) The trig, log, and exp functions are based in each case upon quasi-
71 "optimal" polynomial approximations. My definition of "optimal" was
72 based upon getting good accuracy with reasonable speed.
73(5) The argument reducing code for the trig function effectively uses
74 a value of pi which is accurate to more than 128 bits. As a consequence,
75 the reduced argument is accurate to more than 64 bits for arguments up
76 to a few pi, and accurate to more than 64 bits for most arguments,
77 even for arguments approaching 2^63. This is far superior to an
78 80486, which uses a value of pi which is accurate to 66 bits.
79
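As a rough illustration of item (3), here is the classical Newton iteration
that the sqrt code is built around. This is only a sketch in ordinary C
doubles (the function name and the use of double are mine, purely for
illustration); the emulator itself works on fixed-point significands and is
structured around 80386 instruction timings.

#include <stdio.h>

/* Illustrative only: classical Newton-Raphson iteration for sqrt(a),
 * not the emulator's fixed-point implementation. */
static double newton_sqrt(double a)
{
	double x = a > 1.0 ? a : 1.0;		/* crude initial guess */
	int i;

	for (i = 0; i < 60; i++) {
		double next = 0.5 * (x + a / x);	/* Newton step */
		if (next == x)
			break;				/* converged */
		x = next;
	}
	return x;
}

int main(void)
{
	printf("%.17g\n", newton_sqrt(2.0));	/* prints ~1.4142135623730951 */
	return 0;
}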
80The code of the emulator is complicated slightly by the need to
81account for a limited form of re-entrancy. Normally, the emulator will
82emulate each FPU instruction to completion without interruption.
83However, it may happen that when the emulator is accessing the user
84memory space, swapping may be needed. In this case the emulator may be
85temporarily suspended while disk i/o takes place. During this time
86another process may use the emulator, thereby perhaps changing static
87variables. The code which accesses user memory is confined to five
88files:
89 fpu_entry.c
90 reg_ld_str.c
91 load_store.c
92 get_address.c
93 errors.c
94As from version 1.12 of the emulator, no static variables are used
95(apart from those in the kernel's per-process tables). The emulator is
96therefore now fully re-entrant, rather than having just the restricted
97form of re-entrancy which is required by the Linux kernel.
98
99----------------------- Limitations of wm-FPU-emu -----------------------
100
101There are a number of differences between the current wm-FPU-emu
102(version 2.01) and the 80486 FPU (apart from bugs). The differences
103are fewer than those which applied to the 1.xx series of the emulator.
104Some of the more important differences are listed below:
105
106The Roundup flag does not have much meaning for the transcendental
107functions and its 80486 value with these functions is likely to differ
108from its emulator value.
109
110In a few rare cases the Underflow flag obtained with the emulator will
111be different from that obtained with an 80486. This occurs when the
112following conditions apply simultaneously:
113(a) the operands have a higher precision than the current setting of the
114 precision control (PC) flags.
115(b) the underflow exception is masked.
116(c) the magnitude of the exact result (before rounding) is less than 2^-16382.
117(d) the magnitude of the final result (after rounding) is exactly 2^-16382.
118(e) the magnitude of the exact result would be exactly 2^-16382 if the
119 operands were rounded to the current precision before the arithmetic
120 operation was performed.
121If all of these apply, the emulator will set the Underflow flag but a real
12280486 will not.
123
124NOTE: Certain formats of Extended Real are UNSUPPORTED. They are
125unsupported by the 80486. They are the Pseudo-NaNs, Pseudoinfinities,
126and Unnormals. None of these will be generated by an 80486 or by the
127emulator. Do not use them. The emulator treats them differently in
128detail from the way an 80486 does.
129
130Self modifying code can cause the emulator to fail. An example of such
131code is:
132 movl %esp,[%ebx]
133 fld1
134The FPU instruction may be (usually will be) loaded into the pre-fetch
135queue of the CPU before the mov instruction is executed. If the
136destination of the 'movl' overlaps the FPU instruction then the bytes
137in the prefetch queue and memory will be inconsistent when the FPU
138instruction is executed. The emulator will be invoked but will not be
139able to find the instruction which caused the device-not-present
140exception. For this case, the emulator cannot emulate the behaviour of
141an 80486DX.
142
143Handling of the address size override prefix byte (0x67) has not been
144extensively tested yet. A major problem exists because using it in
145vm86 mode can cause a general protection fault. Address offsets
146greater than 0xffff appear to be illegal in vm86 mode but are quite
147acceptable (and work) in real mode. A small test program developed to
148check the addressing, and which runs successfully in real mode,
149crashes dosemu under Linux and also brings Windows down with a general
150protection fault message when run under the MS-DOS prompt of Windows
1513.1. (The program simply reads data from a valid address).
152
153The emulator supports 16-bit protected mode, with one difference from
154an 80486DX. A 80486DX will allow some floating point instructions to
155write a few bytes below the lowest address of the stack. The emulator
156will not allow this in 16-bit protected mode: no instructions are
157allowed to write outside the bounds set by the protection.
158
159----------------------- Performance of wm-FPU-emu -----------------------
160
161Speed.
162-----
163
164The speed of floating point computation with the emulator will depend
165upon instruction mix. Relative performance is best for the instructions
166which require most computation. The simple instructions are adversely
167affected by the FPU instruction trap overhead.
168
169
170Timing: Some simple timing tests have been made on the emulator functions.
171The times include load/store instructions. All times are in microseconds
172measured on a 33MHz 386 with 64k cache. The Turbo C tests were under
173ms-dos, the next two columns are for emulators running with the djgpp
174ms-dos extender. The final column is for wm-FPU-emu in Linux 0.97,
175using libm4.0 (hard).
176
177function Turbo C djgpp 1.06 WM-emu387 wm-FPU-emu
178
179 + 60.5 154.8 76.5 139.4
180 - 61.1-65.5 157.3-160.8 76.2-79.5 142.9-144.7
181 * 71.0 190.8 79.6 146.6
182 / 61.2-75.0 261.4-266.9 75.3-91.6 142.2-158.1
183
184 sin() 310.8 4692.0 319.0 398.5
185 cos() 284.4 4855.2 308.0 388.7
186 tan() 495.0 8807.1 394.9 504.7
187 atan() 328.9 4866.4 601.1 419.5-491.9
188
189 sqrt() 128.7 crashed 145.2 227.0
190 log() 413.1-419.1 5103.4-5354.21 254.7-282.2 409.4-437.1
191 exp() 479.1 6619.2 469.1 850.8
192
193
194The performance under Linux is improved by the use of look-ahead code.
195The following results show the improvement which is obtained under
196Linux due to the look-ahead code. Also given are the times for the
197original Linux emulator with the 4.1 'soft' lib.
198
199 [ Linus' note: I changed look-ahead to be the default under linux, as
200 there was no reason not to use it after I had edited it to be
201 disabled during tracing ]
202
203 wm-FPU-emu w original w
204 look-ahead 'soft' lib
205 + 106.4 190.2
206 - 108.6-111.6 192.4-216.2
207 * 113.4 193.1
208 / 108.8-124.4 700.1-706.2
209
210 sin() 390.5 2642.0
211 cos() 381.5 2767.4
212 tan() 496.5 3153.3
213 atan() 367.2-435.5 2439.4-3396.8
214
215 sqrt() 195.1 4732.5
216 log() 358.0-387.5 3359.2-3390.3
217 exp() 619.3 4046.4
218
219
220These figures are now somewhat out-of-date. The emulator has become
221progressively slower for most functions as more of the 80486 features
222have been implemented.
223
224
225----------------------- Accuracy of wm-FPU-emu -----------------------
226
227
228The accuracy of the emulator is in almost all cases equal to or better
229than that of an Intel 80486 FPU.
230
231The results of the basic arithmetic functions (+,-,*,/), and fsqrt
232match those of an 80486 FPU. They are the best possible; the error for
233these never exceeds 1/2 an lsb. The fprem and fprem1 instructions
234return exact results; they have no error.
235
236
237The following table compares the emulator accuracy for the sqrt(),
238trig and log functions against the Turbo C "emulator". For this table,
239each function was tested at about 400 points. Ideal worst-case results
240would be 64 bits. The reduced Turbo C accuracy of cos() and tan() for
241arguments greater than pi/4 can be thought of as being related to the
242precision of the argument x; e.g. an argument of pi/2-(1e-10) which is
243accurate to 64 bits can result in a relative accuracy in cos() of
244about 64 + log2(cos(x)) = 31 bits.
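(To make that arithmetic concrete, using rough numbers purely as an
illustration: with x = pi/2 - 1e-10 we have cos(x) = sin(1e-10), which is
about 1e-10; log2(1e-10) is about -33.2, so 64 - 33 leaves roughly 31 bits,
consistent with the 31.9 figures in the Turbo C column below.)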
245
246
247Function Tested x range Worst result Turbo C
248 (relative bits)
249
250sqrt(x) 1 .. 2 64.1 63.2
251atan(x) 1e-10 .. 200 64.2 62.8
252cos(x) 0 .. pi/2-(1e-10) 64.4 (x <= pi/4) 62.4
253 64.1 (x = pi/2-(1e-10)) 31.9
254sin(x) 1e-10 .. pi/2 64.0 62.8
255tan(x) 1e-10 .. pi/2-(1e-10) 64.0 (x <= pi/4) 62.1
256 64.1 (x = pi/2-(1e-10)) 31.9
257exp(x) 0 .. 1 63.1 ** 62.9
258log(x) 1+1e-6 .. 2 63.8 ** 62.1
259
260** The accuracy for exp() and log() is low because the FPU (emulator)
261does not compute them directly; two operations are required.
262
263
264The emulator passes the "paranoia" tests (compiled with gcc 2.3.3 or
265later) for 'float' variables (24 bit precision numbers) when precision
266control is set to 24, 53 or 64 bits, and for 'double' variables (53
267bit precision numbers) when precision control is set to 53 bits (a
268properly performing FPU cannot pass the 'paranoia' tests for 'double'
269variables when precision control is set to 64 bits).
270
271The code for reducing the argument for the trig functions (fsin, fcos,
272fptan and fsincos) has been improved and now effectively uses a value
273for pi which is accurate to more than 128 bits precision. As a
274consequence, the accuracy of these functions for large arguments has
275been dramatically improved (and is now very much better than an 80486
276FPU). There is also now no degradation of accuracy for fcos and fptan
277for operands close to pi/2. Measured results are (note that the
278definition of accuracy has changed slightly from that used for the
279above table):
280
281Function Tested x range Worst result
282 (absolute bits)
283
284cos(x) 0 .. 9.22e+18 62.0
285sin(x) 1e-16 .. 9.22e+18 62.1
286tan(x) 1e-16 .. 9.22e+18 61.8
287
288It is possible with some effort to find very large arguments which
289give much degraded precision. For example, the integer number
290 8227740058411162616.0
291is within about 10e-7 of a multiple of pi. To find the tan (for
292example) of this number to 64 bits precision it would be necessary to
293have a value of pi which had about 150 bits precision. The FPU
294emulator computes the result to about 42.6 bits precision (the correct
295result is about -9.739715e-8). On the other hand, an 80486 FPU returns
2960.01059, which in relative terms is hopelessly inaccurate.
297
298For arguments close to critical angles (which occur at multiples of
299pi/2) the emulator is more accurate than an 80486 FPU. For very large
300arguments, the emulator is far more accurate.
301
302
303Prior to version 1.20 of the emulator, the accuracy of the results for
304the transcendental functions (in their principal range) was not as
305good as the results from an 80486 FPU. From version 1.20, the accuracy
306has been considerably improved and these functions now give measured
307worst-case results which are better than the worst-case results given
308by an 80486 FPU.
309
310The following table gives the measured results for the emulator. The
311number of randomly selected arguments in each case is about half a
312million. The group of three columns gives the frequency of the given
313accuracy in number of times per million, thus the second of these
314columns shows that an accuracy of between 63.80 and 63.89 bits was
315found at a rate of 133 times per one million measurements for fsin.
316The results show that the fsin, fcos and fptan instructions return
317results which are in error (i.e. less accurate than the best possible
318result (which is 64 bits)) for about one per cent of all arguments
319between -pi/2 and +pi/2. The other instructions have a lower
320frequency of results which are in error. The last two columns give
321the worst accuracy which was found (in bits) and the approximate value
322of the argument which produced it.
323
324 frequency (per M)
325 ------------------- ---------------
326instr arg range # tests 63.7 63.8 63.9 worst at arg
327 bits bits bits bits
328----- ------------ ------- ---- ---- ----- ----- --------
329fsin (0,pi/2) 547756 0 133 10673 63.89 0.451317
330fcos (0,pi/2) 547563 0 126 10532 63.85 0.700801
331fptan (0,pi/2) 536274 11 267 10059 63.74 0.784876
332fpatan 4 quadrants 517087 0 8 1855 63.88 0.435121 (4q)
333fyl2x (0,20) 541861 0 0 1323 63.94 1.40923 (x)
334fyl2xp1 (-.293,.414) 520256 0 0 5678 63.93 0.408542 (x)
335f2xm1 (-1,1) 538847 4 481 6488 63.79 0.167709
336
337
338Tests performed on an 80486 FPU showed results of lower accuracy. The
339following table gives the results which were obtained with an AMD
340486DX2/66 (other tests indicate that an Intel 486DX produces
341identical results). The tests were basically the same as those used
342to measure the emulator (the values, being random, were in general not
343the same). The total number of tests for each instruction is given
344at the end of the table; in each case about 100k tests were performed.
345Another line of figures at the end of the table shows that most of the
346instructions return results which are in error for more than 10
347percent of the arguments tested.
348
349The numbers in the body of the table give the approx number of times a
350result of the given accuracy in bits (given in the left-most column)
351was obtained per one million arguments. For three of the instructions,
352two columns of results are given: * The second column for f2xm1 gives
353the number of cases where the results of the first column were for a
354positive argument, this shows that this instruction gives better
355results for positive arguments than it does for negative. * In the
356cases of fcos and fptan, the first column gives the results obtained when all
357cases with arguments greater than 1.5 are removed from the results
358given in the second column. Unlike the emulator, an 80486 FPU returns
359results of relatively poor accuracy for these instructions when the
360argument approaches pi/2. The table does not show those cases when the
361accuracy of the results was less than 62 bits, which occurs quite
362often for fsin and fptan when the argument approaches pi/2. This poor
363accuracy is discussed above in relation to the Turbo C "emulator", and
364the accuracy of the value of pi.
365
366
367bits f2xm1 f2xm1 fpatan fcos fcos fyl2x fyl2xp1 fsin fptan fptan
36862.0 0 0 0 0 437 0 0 0 0 925
36962.1 0 0 10 0 894 0 0 0 0 1023
37062.2 14 0 0 0 1033 0 0 0 0 945
37162.3 57 0 0 0 1202 0 0 0 0 1023
37262.4 385 0 0 10 1292 0 23 0 0 1178
37362.5 1140 0 0 119 1649 0 39 0 0 1149
37462.6 2037 0 0 189 1620 0 16 0 0 1169
37562.7 5086 14 0 646 2315 10 101 35 39 1402
37662.8 8818 86 0 984 3050 59 287 131 224 2036
37762.9 11340 1355 0 2126 4153 79 605 357 321 1948
37863.0 15557 4750 0 3319 5376 246 1281 862 808 2688
37963.1 20016 8288 0 4620 6628 511 2569 1723 1510 3302
38063.2 24945 11127 10 6588 8098 1120 4470 2968 2990 4724
38163.3 25686 12382 69 8774 10682 1906 6775 4482 5474 7236
38263.4 29219 14722 79 11109 12311 3094 9414 7259 8912 10587
38363.5 30458 14936 393 13802 15014 5874 12666 9609 13762 15262
38463.6 32439 16448 1277 17945 19028 10226 15537 14657 19158 20346
38563.7 35031 16805 4067 23003 23947 18910 20116 21333 25001 26209
38663.8 33251 15820 7673 24781 25675 24617 25354 24440 29433 30329
38763.9 33293 16833 18529 28318 29233 31267 31470 27748 29676 30601
388
389Per cent with error:
390 30.9 3.2 18.5 9.8 13.1 11.6 17.4
391Total arguments tested:
392 70194 70099 101784 100641 100641 101799 128853 114893 102675 102675
393
394
395------------------------- Contributors -------------------------------
396
397A number of people have contributed to the development of the
398emulator, often by just reporting bugs, sometimes with suggested
399fixes, and a few kind people have provided me with access in one way
400or another to an 80486 machine. Contributors include (to those people
401who I may have forgotten, please forgive me):
402
403Linus Torvalds
404Tommy.Thorn@daimi.aau.dk
405Andrew.Tridgell@anu.edu.au
406Nick Holloway, alfie@dcs.warwick.ac.uk
407Hermano Moura, moura@dcs.gla.ac.uk
408Jon Jagger, J.Jagger@scp.ac.uk
409Lennart Benschop
410Brian Gallew, geek+@CMU.EDU
411Thomas Staniszewski, ts3v+@andrew.cmu.edu
412Martin Howell, mph@plasma.apana.org.au
413M Saggaf, alsaggaf@athena.mit.edu
414Peter Barker, PETER@socpsy.sci.fau.edu
415tom@vlsivie.tuwien.ac.at
416Dan Russel, russed@rpi.edu
417Daniel Carosone, danielce@ee.mu.oz.au
418cae@jpmorgan.com
419Hamish Coleman, t933093@minyos.xx.rmit.oz.au
420Bruce Evans, bde@kralizec.zeta.org.au
421Timo Korvola, Timo.Korvola@hut.fi
422Rick Lyons, rick@razorback.brisnet.org.au
423Rick, jrs@world.std.com
424
425...and numerous others who responded to my request for help with
426a real 80486.
427
diff --git a/arch/x86/math-emu/control_w.h b/arch/x86/math-emu/control_w.h
new file mode 100644
index 000000000000..ae2274dbd305
--- /dev/null
+++ b/arch/x86/math-emu/control_w.h
@@ -0,0 +1,45 @@
1/*---------------------------------------------------------------------------+
2 | control_w.h |
3 | |
4 | Copyright (C) 1992,1993 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@vaxc.cc.monash.edu.au |
7 | |
8 +---------------------------------------------------------------------------*/
9
10#ifndef _CONTROLW_H_
11#define _CONTROLW_H_
12
13#ifdef __ASSEMBLY__
14#define _Const_(x) $##x
15#else
16#define _Const_(x) x
17#endif
18
19#define CW_RC _Const_(0x0C00) /* rounding control */
20#define CW_PC _Const_(0x0300) /* precision control */
21
22#define CW_Precision	_Const_(0x0020)	/* loss of precision mask */
23#define CW_Underflow	_Const_(0x0010)	/* underflow mask */
24#define CW_Overflow	_Const_(0x0008)	/* overflow mask */
25#define CW_ZeroDiv	_Const_(0x0004)	/* divide by zero mask */
26#define CW_Denormal	_Const_(0x0002)	/* denormalized operand mask */
27#define CW_Invalid	_Const_(0x0001)	/* invalid operation mask */
28
29#define CW_Exceptions _Const_(0x003f) /* all masks */
30
31#define RC_RND _Const_(0x0000)
32#define RC_DOWN _Const_(0x0400)
33#define RC_UP _Const_(0x0800)
34#define RC_CHOP _Const_(0x0C00)
35
36/* p 15-5: Precision control bits affect only the following:
37 ADD, SUB(R), MUL, DIV(R), and SQRT */
38#define PR_24_BITS _Const_(0x000)
39#define PR_53_BITS _Const_(0x200)
40#define PR_64_BITS _Const_(0x300)
41#define PR_RESERVED_BITS _Const_(0x100)
42/* FULL_PRECISION simulates all exceptions masked */
43#define FULL_PRECISION (PR_64_BITS | RC_RND | 0x3f)
44
45#endif /* _CONTROLW_H_ */
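As a small usage sketch of how these control-word masks are meant to be
combined and tested (this example is mine, not part of the header; the
standalone redefinitions below just mirror the values above so the snippet
compiles on its own, and 0x037f is the conventional x87 power-on control
word):

#include <stdint.h>
#include <stdio.h>

/* Values copied from control_w.h so this sketch is self-contained. */
#define CW_RC	0x0C00		/* rounding control field */
#define RC_RND	0x0000
#define RC_DOWN	0x0400
#define RC_UP	0x0800
#define RC_CHOP	0x0C00

/* Decode the rounding-control field of an x87 control word. */
static const char *rc_name(uint16_t cw)
{
	switch (cw & CW_RC) {
	case RC_RND:	return "round to nearest";
	case RC_DOWN:	return "round toward -infinity";
	case RC_UP:	return "round toward +infinity";
	default:	return "truncate (chop)";
	}
}

int main(void)
{
	uint16_t cw = 0x037f;	/* typical x87 default control word */

	printf("rounding mode: %s\n", rc_name(cw));
	return 0;
}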
diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S
new file mode 100644
index 000000000000..f77ba3058b31
--- /dev/null
+++ b/arch/x86/math-emu/div_Xsig.S
@@ -0,0 +1,365 @@
1 .file "div_Xsig.S"
2/*---------------------------------------------------------------------------+
3 | div_Xsig.S |
4 | |
5 | Division subroutine for 96 bit quantities |
6 | |
7 | Copyright (C) 1994,1995 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | Divide the 96 bit quantity pointed to by a, by that pointed to by b, and |
16 | put the 96 bit result at the location d. |
17 | |
18 | The result may not be accurate to 96 bits. It is intended for use where |
19 | a result better than 64 bits is required. The result should usually be |
20 | good to at least 94 bits. |
21 | The returned result is actually divided by two. This is done to           |
22 | prevent overflow. |
23 | |
24 | .aaaaaaaaaaaaaa / .bbbbbbbbbbbbb -> .dddddddddddd |
25 | |
26 | void div_Xsig(Xsig *a, Xsig *b, Xsig *dest) |
27 | |
28 +---------------------------------------------------------------------------*/
29
30#include "exception.h"
31#include "fpu_emu.h"
32
33
34#define XsigLL(x) (x)
35#define XsigL(x) 4(x)
36#define XsigH(x) 8(x)
37
38
39#ifndef NON_REENTRANT_FPU
40/*
41 Local storage on the stack:
42 Accumulator: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
43 */
44#define FPU_accum_3 -4(%ebp)
45#define FPU_accum_2 -8(%ebp)
46#define FPU_accum_1 -12(%ebp)
47#define FPU_accum_0 -16(%ebp)
48#define FPU_result_3 -20(%ebp)
49#define FPU_result_2 -24(%ebp)
50#define FPU_result_1 -28(%ebp)
51
52#else
53.data
54/*
55 Local storage in a static area:
56 Accumulator: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
57 */
58 .align 4,0
59FPU_accum_3:
60 .long 0
61FPU_accum_2:
62 .long 0
63FPU_accum_1:
64 .long 0
65FPU_accum_0:
66 .long 0
67FPU_result_3:
68 .long 0
69FPU_result_2:
70 .long 0
71FPU_result_1:
72 .long 0
73#endif /* NON_REENTRANT_FPU */
74
75
76.text
77ENTRY(div_Xsig)
78 pushl %ebp
79 movl %esp,%ebp
80#ifndef NON_REENTRANT_FPU
81 subl $28,%esp
82#endif /* NON_REENTRANT_FPU */
83
84 pushl %esi
85 pushl %edi
86 pushl %ebx
87
88 movl PARAM1,%esi /* pointer to num */
89 movl PARAM2,%ebx /* pointer to denom */
90
91#ifdef PARANOID
92 testl $0x80000000, XsigH(%ebx) /* Divisor */
93 je L_bugged
94#endif /* PARANOID */
95
96
97/*---------------------------------------------------------------------------+
98 | Divide: Return arg1/arg2 to arg3. |
99 | |
100 | The maximum returned value is (ignoring exponents) |
101 | .ffffffff ffffffff |
102 | ------------------ = 1.ffffffff fffffffe |
103 | .80000000 00000000 |
104 | and the minimum is |
105 | .80000000 00000000 |
106 | ------------------ = .80000000 00000001 (rounded) |
107 | .ffffffff ffffffff |
108 | |
109 +---------------------------------------------------------------------------*/
110
111 /* Save extended dividend in local register */
112
113 /* Divide by 2 to prevent overflow */
114 clc
115 movl XsigH(%esi),%eax
116 rcrl %eax
117 movl %eax,FPU_accum_3
118 movl XsigL(%esi),%eax
119 rcrl %eax
120 movl %eax,FPU_accum_2
121 movl XsigLL(%esi),%eax
122 rcrl %eax
123 movl %eax,FPU_accum_1
124 movl $0,%eax
125 rcrl %eax
126 movl %eax,FPU_accum_0
127
128 movl FPU_accum_2,%eax /* Get the current num */
129 movl FPU_accum_3,%edx
130
131/*----------------------------------------------------------------------*/
132/* Initialization done.
133 Do the first 32 bits. */
134
135 /* We will divide by a number which is too large */
136 movl XsigH(%ebx),%ecx
137 addl $1,%ecx
138 jnc LFirst_div_not_1
139
140 /* here we need to divide by 100000000h,
141 i.e., no division at all.. */
142 mov %edx,%eax
143 jmp LFirst_div_done
144
145LFirst_div_not_1:
146 divl %ecx /* Divide the numerator by the augmented
147 denom ms dw */
148
149LFirst_div_done:
150 movl %eax,FPU_result_3 /* Put the result in the answer */
151
152 mull XsigH(%ebx) /* mul by the ms dw of the denom */
153
154 subl %eax,FPU_accum_2 /* Subtract from the num local reg */
155 sbbl %edx,FPU_accum_3
156
157 movl FPU_result_3,%eax /* Get the result back */
158 mull XsigL(%ebx) /* now mul the ls dw of the denom */
159
160 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
161 sbbl %edx,FPU_accum_2
162 sbbl $0,FPU_accum_3
163 je LDo_2nd_32_bits /* Must check for non-zero result here */
164
165#ifdef PARANOID
166 jb L_bugged_1
167#endif /* PARANOID */
168
 169 /* need to subtract the denom one more time */
170 incl FPU_result_3 /* Correct the answer */
171
172 movl XsigL(%ebx),%eax
173 movl XsigH(%ebx),%edx
174 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
175 sbbl %edx,FPU_accum_2
176
177#ifdef PARANOID
178 sbbl $0,FPU_accum_3
179 jne L_bugged_1 /* Must check for non-zero result here */
180#endif /* PARANOID */
181
182/*----------------------------------------------------------------------*/
183/* Half of the main problem is done, there is just a reduced numerator
184 to handle now.
185 Work with the second 32 bits, FPU_accum_0 not used from now on */
186LDo_2nd_32_bits:
187 movl FPU_accum_2,%edx /* get the reduced num */
188 movl FPU_accum_1,%eax
189
190 /* need to check for possible subsequent overflow */
191 cmpl XsigH(%ebx),%edx
192 jb LDo_2nd_div
193 ja LPrevent_2nd_overflow
194
195 cmpl XsigL(%ebx),%eax
196 jb LDo_2nd_div
197
198LPrevent_2nd_overflow:
199/* The numerator is greater or equal, would cause overflow */
200 /* prevent overflow */
201 subl XsigL(%ebx),%eax
202 sbbl XsigH(%ebx),%edx
203 movl %edx,FPU_accum_2
204 movl %eax,FPU_accum_1
205
206 incl FPU_result_3 /* Reflect the subtraction in the answer */
207
208#ifdef PARANOID
209 je L_bugged_2 /* Can't bump the result to 1.0 */
210#endif /* PARANOID */
211
212LDo_2nd_div:
213 cmpl $0,%ecx /* augmented denom msw */
214 jnz LSecond_div_not_1
215
216 /* %ecx == 0, we are dividing by 1.0 */
217 mov %edx,%eax
218 jmp LSecond_div_done
219
220LSecond_div_not_1:
221 divl %ecx /* Divide the numerator by the denom ms dw */
222
223LSecond_div_done:
224 movl %eax,FPU_result_2 /* Put the result in the answer */
225
226 mull XsigH(%ebx) /* mul by the ms dw of the denom */
227
228 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
229 sbbl %edx,FPU_accum_2
230
231#ifdef PARANOID
232 jc L_bugged_2
233#endif /* PARANOID */
234
235 movl FPU_result_2,%eax /* Get the result back */
236 mull XsigL(%ebx) /* now mul the ls dw of the denom */
237
238 subl %eax,FPU_accum_0 /* Subtract from the num local reg */
239 sbbl %edx,FPU_accum_1 /* Subtract from the num local reg */
240 sbbl $0,FPU_accum_2
241
242#ifdef PARANOID
243 jc L_bugged_2
244#endif /* PARANOID */
245
246 jz LDo_3rd_32_bits
247
248#ifdef PARANOID
249 cmpl $1,FPU_accum_2
250 jne L_bugged_2
251#endif /* PARANOID */
252
 253 /* need to subtract the denom one more time */
254 movl XsigL(%ebx),%eax
255 movl XsigH(%ebx),%edx
256 subl %eax,FPU_accum_0 /* Subtract from the num local reg */
257 sbbl %edx,FPU_accum_1
258 sbbl $0,FPU_accum_2
259
260#ifdef PARANOID
261 jc L_bugged_2
262 jne L_bugged_2
263#endif /* PARANOID */
264
265 addl $1,FPU_result_2 /* Correct the answer */
266 adcl $0,FPU_result_3
267
268#ifdef PARANOID
269 jc L_bugged_2 /* Must check for non-zero result here */
270#endif /* PARANOID */
271
272/*----------------------------------------------------------------------*/
273/* The division is essentially finished here, we just need to perform
274 tidying operations.
275 Deal with the 3rd 32 bits */
276LDo_3rd_32_bits:
277 /* We use an approximation for the third 32 bits.
278 To take account of the 3rd 32 bits of the divisor
279 (call them del), we subtract del * (a/b) */
280
281 movl FPU_result_3,%eax /* a/b */
282 mull XsigLL(%ebx) /* del */
283
284 subl %edx,FPU_accum_1
285
286 /* A borrow indicates that the result is negative */
287 jnb LTest_over
288
289 movl XsigH(%ebx),%edx
290 addl %edx,FPU_accum_1
291
292 subl $1,FPU_result_2 /* Adjust the answer */
293 sbbl $0,FPU_result_3
294
295 /* The above addition might not have been enough, check again. */
296 movl FPU_accum_1,%edx /* get the reduced num */
297 cmpl XsigH(%ebx),%edx /* denom */
298 jb LDo_3rd_div
299
300 movl XsigH(%ebx),%edx
301 addl %edx,FPU_accum_1
302
303 subl $1,FPU_result_2 /* Adjust the answer */
304 sbbl $0,FPU_result_3
305 jmp LDo_3rd_div
306
307LTest_over:
308 movl FPU_accum_1,%edx /* get the reduced num */
309
310 /* need to check for possible subsequent overflow */
311 cmpl XsigH(%ebx),%edx /* denom */
312 jb LDo_3rd_div
313
314 /* prevent overflow */
315 subl XsigH(%ebx),%edx
316 movl %edx,FPU_accum_1
317
318 addl $1,FPU_result_2 /* Reflect the subtraction in the answer */
319 adcl $0,FPU_result_3
320
321LDo_3rd_div:
322 movl FPU_accum_0,%eax
323 movl FPU_accum_1,%edx
324 divl XsigH(%ebx)
325
326 movl %eax,FPU_result_1 /* Rough estimate of third word */
327
328 movl PARAM3,%esi /* pointer to answer */
329
330 movl FPU_result_1,%eax
331 movl %eax,XsigLL(%esi)
332 movl FPU_result_2,%eax
333 movl %eax,XsigL(%esi)
334 movl FPU_result_3,%eax
335 movl %eax,XsigH(%esi)
336
337L_exit:
338 popl %ebx
339 popl %edi
340 popl %esi
341
342 leave
343 ret
344
345
346#ifdef PARANOID
347/* The logic is wrong if we got here */
348L_bugged:
349 pushl EX_INTERNAL|0x240
350 call EXCEPTION
351 pop %ebx
352 jmp L_exit
353
354L_bugged_1:
355 pushl EX_INTERNAL|0x241
356 call EXCEPTION
357 pop %ebx
358 jmp L_exit
359
360L_bugged_2:
361 pushl EX_INTERNAL|0x242
362 call EXCEPTION
363 pop %ebx
364 jmp L_exit
365#endif /* PARANOID */
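
The first stage above estimates a quotient word by dividing the top 64 bits of the (halved) numerator by XsigH(denom)+1 -- an estimate that can never be too large -- and then fixes it up with at most one extra subtraction, with PARANOID trapping the case where that is not enough. The same estimate-then-correct idea is easier to follow in C. A minimal user-space sketch, assuming a GCC/Clang unsigned __int128; the function name is made up and this is not part of the patch:

#include <stdint.h>

/* One quotient-digit step in the style of LFirst_div_not_1/LFirst_div_done:
   compute q = floor((n << 32) / d) for n < d, where d has its top bit set.
   Dividing by (high word of d) + 1 cannot overflow and never yields a value
   larger than the true digit, so a short correction loop finishes the job. */
static uint32_t quot_digit(uint64_t n, uint64_t d)
{
        uint32_t d_hi = (uint32_t)(d >> 32);
        uint64_t q;
        unsigned __int128 rem;

        if (d_hi == 0xffffffffu)                /* augmented divisor is 2^32, */
                q = n >> 32;                    /* so the divide is just a shift */
        else
                q = n / ((uint64_t)d_hi + 1);   /* estimate, never too large */

        rem = ((unsigned __int128)n << 32) - (unsigned __int128)q * d;
        while (rem >= d) {                      /* correct the estimate upwards */
                rem -= d;
                q++;
        }
        return (uint32_t)q;
}
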
diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S
new file mode 100644
index 000000000000..47099628fa4c
--- /dev/null
+++ b/arch/x86/math-emu/div_small.S
@@ -0,0 +1,47 @@
1 .file "div_small.S"
2/*---------------------------------------------------------------------------+
3 | div_small.S |
4 | |
5 | Divide a 64 bit integer by a 32 bit integer & return remainder. |
6 | |
7 | Copyright (C) 1992,1995 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | unsigned long FPU_div_small(unsigned long long *x, unsigned long y) |
16 +---------------------------------------------------------------------------*/
17
18#include "fpu_emu.h"
19
20.text
21ENTRY(FPU_div_small)
22 pushl %ebp
23 movl %esp,%ebp
24
25 pushl %esi
26
27 movl PARAM1,%esi /* pointer to num */
28 movl PARAM2,%ecx /* The denominator */
29
30 movl 4(%esi),%eax /* Get the current num msw */
31 xorl %edx,%edx
32 divl %ecx
33
34 movl %eax,4(%esi)
35
36 movl (%esi),%eax /* Get the num lsw */
37 divl %ecx
38
39 movl %eax,(%esi)
40
41 movl %edx,%eax /* Return the remainder in eax */
42
43 popl %esi
44
45 leave
46 ret
47
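
The two chained divl instructions above perform an ordinary 64-by-32 long division, most significant word first, with the partial remainder carried across in %edx. For reference, a hedged C equivalent (a sketch only; the emulator uses the assembly version, presumably so the 32-bit kernel avoids pulling in a libgcc 64-bit division helper):

/* Reference behaviour of FPU_div_small(): divide *x in place by y and
   return the remainder.  Hypothetical stand-alone version, not the one
   built into the emulator. */
unsigned long FPU_div_small_ref(unsigned long long *x, unsigned long y)
{
        unsigned long long rem = *x % y;

        *x /= y;
        return (unsigned long)rem;
}
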
diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c
new file mode 100644
index 000000000000..a1b0d22f6978
--- /dev/null
+++ b/arch/x86/math-emu/errors.c
@@ -0,0 +1,739 @@
1/*---------------------------------------------------------------------------+
2 | errors.c |
3 | |
4 | The error handling functions for wm-FPU-emu |
5 | |
6 | Copyright (C) 1992,1993,1994,1996 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@jacobi.maths.monash.edu.au |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13/*---------------------------------------------------------------------------+
14 | Note: |
15 | The file contains code which accesses user memory. |
16 | Emulator static data may change when user memory is accessed, due to |
17 | other processes using the emulator while swapping is in progress. |
18 +---------------------------------------------------------------------------*/
19
20#include <linux/signal.h>
21
22#include <asm/uaccess.h>
23
24#include "fpu_emu.h"
25#include "fpu_system.h"
26#include "exception.h"
27#include "status_w.h"
28#include "control_w.h"
29#include "reg_constant.h"
30#include "version.h"
31
32/* */
33#undef PRINT_MESSAGES
34/* */
35
36
37#if 0
38void Un_impl(void)
39{
40 u_char byte1, FPU_modrm;
41 unsigned long address = FPU_ORIG_EIP;
42
43 RE_ENTRANT_CHECK_OFF;
44 /* No need to check access_ok(), we have previously fetched these bytes. */
45 printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *) address);
46 if ( FPU_CS == __USER_CS )
47 {
48 while ( 1 )
49 {
50 FPU_get_user(byte1, (u_char __user *) address);
51 if ( (byte1 & 0xf8) == 0xd8 ) break;
52 printk("[%02x]", byte1);
53 address++;
54 }
55 printk("%02x ", byte1);
56 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
57
58 if (FPU_modrm >= 0300)
59 printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7);
60 else
61 printk("/%d\n", (FPU_modrm >> 3) & 7);
62 }
63 else
64 {
65 printk("cs selector = %04x\n", FPU_CS);
66 }
67
68 RE_ENTRANT_CHECK_ON;
69
70 EXCEPTION(EX_Invalid);
71
72}
73#endif /* 0 */
74
75
76/*
77 Called for opcodes which are illegal and which are known to result in a
78 SIGILL with a real 80486.
79 */
80void FPU_illegal(void)
81{
82 math_abort(FPU_info,SIGILL);
83}
84
85
86
87void FPU_printall(void)
88{
89 int i;
90 static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty",
91 "DeNorm", "Inf", "NaN" };
92 u_char byte1, FPU_modrm;
93 unsigned long address = FPU_ORIG_EIP;
94
95 RE_ENTRANT_CHECK_OFF;
96 /* No need to check access_ok(), we have previously fetched these bytes. */
97 printk("At %p:", (void *) address);
98 if ( FPU_CS == __USER_CS )
99 {
100#define MAX_PRINTED_BYTES 20
101 for ( i = 0; i < MAX_PRINTED_BYTES; i++ )
102 {
103 FPU_get_user(byte1, (u_char __user *) address);
104 if ( (byte1 & 0xf8) == 0xd8 )
105 {
106 printk(" %02x", byte1);
107 break;
108 }
109 printk(" [%02x]", byte1);
110 address++;
111 }
112 if ( i == MAX_PRINTED_BYTES )
113 printk(" [more..]\n");
114 else
115 {
116 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
117
118 if (FPU_modrm >= 0300)
119 printk(" %02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7);
120 else
121 printk(" /%d, mod=%d rm=%d\n",
122 (FPU_modrm >> 3) & 7, (FPU_modrm >> 6) & 3, FPU_modrm & 7);
123 }
124 }
125 else
126 {
127 printk("%04x\n", FPU_CS);
128 }
129
130 partial_status = status_word();
131
132#ifdef DEBUGGING
133if ( partial_status & SW_Backward ) printk("SW: backward compatibility\n");
134if ( partial_status & SW_C3 ) printk("SW: condition bit 3\n");
135if ( partial_status & SW_C2 ) printk("SW: condition bit 2\n");
136if ( partial_status & SW_C1 ) printk("SW: condition bit 1\n");
137if ( partial_status & SW_C0 ) printk("SW: condition bit 0\n");
138if ( partial_status & SW_Summary ) printk("SW: exception summary\n");
139if ( partial_status & SW_Stack_Fault ) printk("SW: stack fault\n");
140if ( partial_status & SW_Precision ) printk("SW: loss of precision\n");
141if ( partial_status & SW_Underflow ) printk("SW: underflow\n");
142if ( partial_status & SW_Overflow ) printk("SW: overflow\n");
143if ( partial_status & SW_Zero_Div ) printk("SW: divide by zero\n");
144if ( partial_status & SW_Denorm_Op ) printk("SW: denormalized operand\n");
145if ( partial_status & SW_Invalid ) printk("SW: invalid operation\n");
146#endif /* DEBUGGING */
147
148 printk(" SW: b=%d st=%ld es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n",
149 partial_status & 0x8000 ? 1 : 0, /* busy */
150 (partial_status & 0x3800) >> 11, /* stack top pointer */
151 partial_status & 0x80 ? 1 : 0, /* Error summary status */
152 partial_status & 0x40 ? 1 : 0, /* Stack flag */
153 partial_status & SW_C3?1:0, partial_status & SW_C2?1:0, /* cc */
154 partial_status & SW_C1?1:0, partial_status & SW_C0?1:0, /* cc */
155 partial_status & SW_Precision?1:0, partial_status & SW_Underflow?1:0,
156 partial_status & SW_Overflow?1:0, partial_status & SW_Zero_Div?1:0,
157 partial_status & SW_Denorm_Op?1:0, partial_status & SW_Invalid?1:0);
158
159printk(" CW: ic=%d rc=%ld%ld pc=%ld%ld iem=%d ef=%d%d%d%d%d%d\n",
160 control_word & 0x1000 ? 1 : 0,
161 (control_word & 0x800) >> 11, (control_word & 0x400) >> 10,
162 (control_word & 0x200) >> 9, (control_word & 0x100) >> 8,
163 control_word & 0x80 ? 1 : 0,
164 control_word & SW_Precision?1:0, control_word & SW_Underflow?1:0,
165 control_word & SW_Overflow?1:0, control_word & SW_Zero_Div?1:0,
166 control_word & SW_Denorm_Op?1:0, control_word & SW_Invalid?1:0);
167
168 for ( i = 0; i < 8; i++ )
169 {
170 FPU_REG *r = &st(i);
171 u_char tagi = FPU_gettagi(i);
172 switch (tagi)
173 {
174 case TAG_Empty:
175 continue;
176 break;
177 case TAG_Zero:
178 case TAG_Special:
179 tagi = FPU_Special(r);
180 case TAG_Valid:
181 printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i,
182 getsign(r) ? '-' : '+',
183 (long)(r->sigh >> 16),
184 (long)(r->sigh & 0xFFFF),
185 (long)(r->sigl >> 16),
186 (long)(r->sigl & 0xFFFF),
187 exponent(r) - EXP_BIAS + 1);
188 break;
189 default:
190 printk("Whoops! Error in errors.c: tag%d is %d ", i, tagi);
191 continue;
192 break;
193 }
194 printk("%s\n", tag_desc[(int) (unsigned) tagi]);
195 }
196
197 RE_ENTRANT_CHECK_ON;
198
199}
200
201static struct {
202 int type;
203 const char *name;
204} exception_names[] = {
205 { EX_StackOver, "stack overflow" },
206 { EX_StackUnder, "stack underflow" },
207 { EX_Precision, "loss of precision" },
208 { EX_Underflow, "underflow" },
209 { EX_Overflow, "overflow" },
210 { EX_ZeroDiv, "divide by zero" },
211 { EX_Denormal, "denormalized operand" },
212 { EX_Invalid, "invalid operation" },
213 { EX_INTERNAL, "INTERNAL BUG in "FPU_VERSION },
214 { 0, NULL }
215};
216
217/*
218 EX_INTERNAL is always given with a code which indicates where the
219 error was detected.
220
221 Internal error types:
222 0x14 in fpu_etc.c
223 0x1nn in a *.c file:
224 0x101 in reg_add_sub.c
225 0x102 in reg_mul.c
226 0x104 in poly_atan.c
227 0x105 in reg_mul.c
228 0x107 in fpu_trig.c
229 0x108 in reg_compare.c
230 0x109 in reg_compare.c
231 0x110 in reg_add_sub.c
232 0x111 in fpe_entry.c
233 0x112 in fpu_trig.c
234 0x113 in errors.c
235 0x115 in fpu_trig.c
236 0x116 in fpu_trig.c
237 0x117 in fpu_trig.c
238 0x118 in fpu_trig.c
239 0x119 in fpu_trig.c
240 0x120 in poly_atan.c
241 0x121 in reg_compare.c
242 0x122 in reg_compare.c
243 0x123 in reg_compare.c
244 0x125 in fpu_trig.c
245 0x126 in fpu_entry.c
246 0x127 in poly_2xm1.c
247 0x128 in fpu_entry.c
248 0x129 in fpu_entry.c
249 0x130 in get_address.c
250 0x131 in get_address.c
251 0x132 in get_address.c
252 0x133 in get_address.c
253 0x140 in load_store.c
254 0x141 in load_store.c
255 0x150 in poly_sin.c
256 0x151 in poly_sin.c
257 0x160 in reg_ld_str.c
258 0x161 in reg_ld_str.c
259 0x162 in reg_ld_str.c
260 0x163 in reg_ld_str.c
261 0x164 in reg_ld_str.c
262 0x170 in fpu_tags.c
263 0x171 in fpu_tags.c
264 0x172 in fpu_tags.c
265 0x180 in reg_convert.c
266 0x2nn in an *.S file:
267 0x201 in reg_u_add.S
268 0x202 in reg_u_div.S
269 0x203 in reg_u_div.S
270 0x204 in reg_u_div.S
271 0x205 in reg_u_mul.S
272 0x206 in reg_u_sub.S
273 0x207 in wm_sqrt.S
274 0x208 in reg_div.S
275 0x209 in reg_u_sub.S
276 0x210 in reg_u_sub.S
277 0x211 in reg_u_sub.S
278 0x212 in reg_u_sub.S
279 0x213 in wm_sqrt.S
280 0x214 in wm_sqrt.S
281 0x215 in wm_sqrt.S
282 0x220 in reg_norm.S
283 0x221 in reg_norm.S
284 0x230 in reg_round.S
285 0x231 in reg_round.S
286 0x232 in reg_round.S
287 0x233 in reg_round.S
288 0x234 in reg_round.S
289 0x235 in reg_round.S
290 0x236 in reg_round.S
291 0x240 in div_Xsig.S
292 0x241 in div_Xsig.S
293 0x242 in div_Xsig.S
294 */
295
296asmlinkage void FPU_exception(int n)
297{
298 int i, int_type;
299
300 int_type = 0; /* Needed only to stop compiler warnings */
301 if ( n & EX_INTERNAL )
302 {
303 int_type = n - EX_INTERNAL;
304 n = EX_INTERNAL;
305 /* Set lots of exception bits! */
306 partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward);
307 }
308 else
309 {
310 /* Extract only the bits which we use to set the status word */
311 n &= (SW_Exc_Mask);
312 /* Set the corresponding exception bit */
313 partial_status |= n;
314 /* Set summary bits iff exception isn't masked */
315 if ( partial_status & ~control_word & CW_Exceptions )
316 partial_status |= (SW_Summary | SW_Backward);
317 if ( n & (SW_Stack_Fault | EX_Precision) )
318 {
319 if ( !(n & SW_C1) )
320 /* This bit distinguishes over- from underflow for a stack fault,
321 and roundup from round-down for precision loss. */
322 partial_status &= ~SW_C1;
323 }
324 }
325
326 RE_ENTRANT_CHECK_OFF;
327 if ( (~control_word & n & CW_Exceptions) || (n == EX_INTERNAL) )
328 {
329#ifdef PRINT_MESSAGES
330 /* My message from the sponsor */
331 printk(FPU_VERSION" "__DATE__" (C) W. Metzenthen.\n");
332#endif /* PRINT_MESSAGES */
333
334 /* Get a name string for error reporting */
335 for (i=0; exception_names[i].type; i++)
336 if ( (exception_names[i].type & n) == exception_names[i].type )
337 break;
338
339 if (exception_names[i].type)
340 {
341#ifdef PRINT_MESSAGES
342 printk("FP Exception: %s!\n", exception_names[i].name);
343#endif /* PRINT_MESSAGES */
344 }
345 else
346 printk("FPU emulator: Unknown Exception: 0x%04x!\n", n);
347
348 if ( n == EX_INTERNAL )
349 {
350 printk("FPU emulator: Internal error type 0x%04x\n", int_type);
351 FPU_printall();
352 }
353#ifdef PRINT_MESSAGES
354 else
355 FPU_printall();
356#endif /* PRINT_MESSAGES */
357
358 /*
359 * The 80486 generates an interrupt on the next non-control FPU
360 * instruction. So we need some means of flagging it.
361 * We use the ES (Error Summary) bit for this.
362 */
363 }
364 RE_ENTRANT_CHECK_ON;
365
366#ifdef __DEBUG__
367 math_abort(FPU_info,SIGFPE);
368#endif /* __DEBUG__ */
369
370}
371
372
373/* Real operation attempted on a NaN. */
374/* Returns < 0 if the exception is unmasked */
375int real_1op_NaN(FPU_REG *a)
376{
377 int signalling, isNaN;
378
379 isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000);
380
381 /* The default result for the case of two "equal" NaNs (signs may
382 differ) is chosen to reproduce 80486 behaviour */
383 signalling = isNaN && !(a->sigh & 0x40000000);
384
385 if ( !signalling )
386 {
387 if ( !isNaN ) /* pseudo-NaN, or other unsupported? */
388 {
389 if ( control_word & CW_Invalid )
390 {
391 /* Masked response */
392 reg_copy(&CONST_QNaN, a);
393 }
394 EXCEPTION(EX_Invalid);
395 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
396 }
397 return TAG_Special;
398 }
399
400 if ( control_word & CW_Invalid )
401 {
402 /* The masked response */
403 if ( !(a->sigh & 0x80000000) ) /* pseudo-NaN ? */
404 {
405 reg_copy(&CONST_QNaN, a);
406 }
407 /* ensure a Quiet NaN */
408 a->sigh |= 0x40000000;
409 }
410
411 EXCEPTION(EX_Invalid);
412
413 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
414}
415
416
417/* Real operation attempted on two operands, one a NaN. */
418/* Returns < 0 if the exception is unmasked */
419int real_2op_NaN(FPU_REG const *b, u_char tagb,
420 int deststnr,
421 FPU_REG const *defaultNaN)
422{
423 FPU_REG *dest = &st(deststnr);
424 FPU_REG const *a = dest;
425 u_char taga = FPU_gettagi(deststnr);
426 FPU_REG const *x;
427 int signalling, unsupported;
428
429 if ( taga == TAG_Special )
430 taga = FPU_Special(a);
431 if ( tagb == TAG_Special )
432 tagb = FPU_Special(b);
433
434 /* TW_NaN is also used for unsupported data types. */
435 unsupported = ((taga == TW_NaN)
436 && !((exponent(a) == EXP_OVER) && (a->sigh & 0x80000000)))
437 || ((tagb == TW_NaN)
438 && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000)));
439 if ( unsupported )
440 {
441 if ( control_word & CW_Invalid )
442 {
443 /* Masked response */
444 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
445 }
446 EXCEPTION(EX_Invalid);
447 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
448 }
449
450 if (taga == TW_NaN)
451 {
452 x = a;
453 if (tagb == TW_NaN)
454 {
455 signalling = !(a->sigh & b->sigh & 0x40000000);
456 if ( significand(b) > significand(a) )
457 x = b;
458 else if ( significand(b) == significand(a) )
459 {
460 /* The default result for the case of two "equal" NaNs (signs may
461 differ) is chosen to reproduce 80486 behaviour */
462 x = defaultNaN;
463 }
464 }
465 else
466 {
467 /* return the quiet version of the NaN in a */
468 signalling = !(a->sigh & 0x40000000);
469 }
470 }
471 else
472#ifdef PARANOID
473 if (tagb == TW_NaN)
474#endif /* PARANOID */
475 {
476 signalling = !(b->sigh & 0x40000000);
477 x = b;
478 }
479#ifdef PARANOID
480 else
481 {
482 signalling = 0;
483 EXCEPTION(EX_INTERNAL|0x113);
484 x = &CONST_QNaN;
485 }
486#endif /* PARANOID */
487
488 if ( (!signalling) || (control_word & CW_Invalid) )
489 {
490 if ( ! x )
491 x = b;
492
493 if ( !(x->sigh & 0x80000000) ) /* pseudo-NaN ? */
494 x = &CONST_QNaN;
495
496 FPU_copy_to_regi(x, TAG_Special, deststnr);
497
498 if ( !signalling )
499 return TAG_Special;
500
501 /* ensure a Quiet NaN */
502 dest->sigh |= 0x40000000;
503 }
504
505 EXCEPTION(EX_Invalid);
506
507 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
508}
509
510
511/* Invalid arith operation on Valid registers */
512/* Returns < 0 if the exception is unmasked */
513asmlinkage int arith_invalid(int deststnr)
514{
515
516 EXCEPTION(EX_Invalid);
517
518 if ( control_word & CW_Invalid )
519 {
520 /* The masked response */
521 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
522 }
523
524 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid;
525
526}
527
528
529/* Divide a finite number by zero */
530asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign)
531{
532 FPU_REG *dest = &st(deststnr);
533 int tag = TAG_Valid;
534
535 if ( control_word & CW_ZeroDiv )
536 {
537 /* The masked response */
538 FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr);
539 setsign(dest, sign);
540 tag = TAG_Special;
541 }
542
543 EXCEPTION(EX_ZeroDiv);
544
545 return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag;
546
547}
548
549
550/* This may be called often, so keep it lean */
551int set_precision_flag(int flags)
552{
553 if ( control_word & CW_Precision )
554 {
555 partial_status &= ~(SW_C1 & flags);
556 partial_status |= flags; /* The masked response */
557 return 0;
558 }
559 else
560 {
561 EXCEPTION(flags);
562 return 1;
563 }
564}
565
566
567/* This may be called often, so keep it lean */
568asmlinkage void set_precision_flag_up(void)
569{
570 if ( control_word & CW_Precision )
571 partial_status |= (SW_Precision | SW_C1); /* The masked response */
572 else
573 EXCEPTION(EX_Precision | SW_C1);
574}
575
576
577/* This may be called often, so keep it lean */
578asmlinkage void set_precision_flag_down(void)
579{
580 if ( control_word & CW_Precision )
581 { /* The masked response */
582 partial_status &= ~SW_C1;
583 partial_status |= SW_Precision;
584 }
585 else
586 EXCEPTION(EX_Precision);
587}
588
589
590asmlinkage int denormal_operand(void)
591{
592 if ( control_word & CW_Denormal )
593 { /* The masked response */
594 partial_status |= SW_Denorm_Op;
595 return TAG_Special;
596 }
597 else
598 {
599 EXCEPTION(EX_Denormal);
600 return TAG_Special | FPU_Exception;
601 }
602}
603
604
605asmlinkage int arith_overflow(FPU_REG *dest)
606{
607 int tag = TAG_Valid;
608
609 if ( control_word & CW_Overflow )
610 {
611 /* The masked response */
612/* ###### The response here depends upon the rounding mode */
613 reg_copy(&CONST_INF, dest);
614 tag = TAG_Special;
615 }
616 else
617 {
618 /* Subtract the magic number from the exponent */
619 addexponent(dest, (-3 * (1 << 13)));
620 }
621
622 EXCEPTION(EX_Overflow);
623 if ( control_word & CW_Overflow )
624 {
625 /* The overflow exception is masked. */
626 /* By definition, precision is lost.
627 The roundup bit (C1) is also set because we have
628 "rounded" upwards to Infinity. */
629 EXCEPTION(EX_Precision | SW_C1);
630 return tag;
631 }
632
633 return tag;
634
635}
636
637
638asmlinkage int arith_underflow(FPU_REG *dest)
639{
640 int tag = TAG_Valid;
641
642 if ( control_word & CW_Underflow )
643 {
644 /* The masked response */
645 if ( exponent16(dest) <= EXP_UNDER - 63 )
646 {
647 reg_copy(&CONST_Z, dest);
648 partial_status &= ~SW_C1; /* Round down. */
649 tag = TAG_Zero;
650 }
651 else
652 {
653 stdexp(dest);
654 }
655 }
656 else
657 {
658 /* Add the magic number to the exponent. */
659 addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias);
660 }
661
662 EXCEPTION(EX_Underflow);
663 if ( control_word & CW_Underflow )
664 {
665 /* The underflow exception is masked. */
666 EXCEPTION(EX_Precision);
667 return tag;
668 }
669
670 return tag;
671
672}
673
674
675void FPU_stack_overflow(void)
676{
677
678 if ( control_word & CW_Invalid )
679 {
680 /* The masked response */
681 top--;
682 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
683 }
684
685 EXCEPTION(EX_StackOver);
686
687 return;
688
689}
690
691
692void FPU_stack_underflow(void)
693{
694
695 if ( control_word & CW_Invalid )
696 {
697 /* The masked response */
698 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
699 }
700
701 EXCEPTION(EX_StackUnder);
702
703 return;
704
705}
706
707
708void FPU_stack_underflow_i(int i)
709{
710
711 if ( control_word & CW_Invalid )
712 {
713 /* The masked response */
714 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
715 }
716
717 EXCEPTION(EX_StackUnder);
718
719 return;
720
721}
722
723
724void FPU_stack_underflow_pop(int i)
725{
726
727 if ( control_word & CW_Invalid )
728 {
729 /* The masked response */
730 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
731 FPU_pop();
732 }
733
734 EXCEPTION(EX_StackUnder);
735
736 return;
737
738}
739
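
Most of the helpers above (real_1op_NaN(), arith_invalid(), FPU_divide_by_zero(), the stack-fault routines) share one return convention: the tag of the result, with FPU_Exception OR'd in when the fault was unmasked, which is why their comments say "Returns < 0 if the exception is unmasked". A small stand-alone illustration of that encoding, with hypothetical names and the FPU_Exception/TAG_Special values taken from fpu_emu.h:

#include <stdio.h>

#define FPU_Exception 0x80000000        /* added to tag returns (fpu_emu.h) */
#define TAG_Special   2

/* Mimics the shape of the returns in errors.c: tag in the low bits,
   sign bit set only when the fault is unmasked. */
static int fake_invalid_op(int masked)
{
        return (masked ? 0 : FPU_Exception) | TAG_Special;
}

int main(void)
{
        int tag = fake_invalid_op(/* masked = */ 0);

        if (tag < 0)            /* FPU_Exception set: caller must back out */
                printf("unmasked fault, tag bits = %d\n",
                       (int)(tag & ~FPU_Exception));
        else
                printf("masked fault, carry on with tag %d\n", tag);
        return 0;
}
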
diff --git a/arch/x86/math-emu/exception.h b/arch/x86/math-emu/exception.h
new file mode 100644
index 000000000000..b463f21a811e
--- /dev/null
+++ b/arch/x86/math-emu/exception.h
@@ -0,0 +1,53 @@
1/*---------------------------------------------------------------------------+
2 | exception.h |
3 | |
4 | Copyright (C) 1992 W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
5 | Australia. E-mail billm@vaxc.cc.monash.edu.au |
6 | |
7 +---------------------------------------------------------------------------*/
8
9#ifndef _EXCEPTION_H_
10#define _EXCEPTION_H_
11
12
13#ifdef __ASSEMBLY__
14#define Const_(x) $##x
15#else
16#define Const_(x) x
17#endif
18
19#ifndef SW_C1
20#include "fpu_emu.h"
21#endif /* SW_C1 */
22
23#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */
24#define EX_ErrorSummary Const_(0x0080) /* Error summary status */
25/* Special exceptions: */
26#define EX_INTERNAL Const_(0x8000) /* Internal error in wm-FPU-emu */
27#define EX_StackOver Const_(0x0041|SW_C1) /* stack overflow */
28#define EX_StackUnder Const_(0x0041) /* stack underflow */
29/* Exception flags: */
30#define EX_Precision Const_(0x0020) /* loss of precision */
31#define EX_Underflow Const_(0x0010) /* underflow */
32#define EX_Overflow Const_(0x0008) /* overflow */
33#define EX_ZeroDiv Const_(0x0004) /* divide by zero */
34#define EX_Denormal Const_(0x0002) /* denormalized operand */
35#define EX_Invalid Const_(0x0001) /* invalid operation */
36
37
38#define PRECISION_LOST_UP Const_((EX_Precision | SW_C1))
39#define PRECISION_LOST_DOWN Const_(EX_Precision)
40
41
42#ifndef __ASSEMBLY__
43
44#ifdef DEBUG
45#define EXCEPTION(x) { printk("exception in %s at line %d\n", \
46 __FILE__, __LINE__); FPU_exception(x); }
47#else
48#define EXCEPTION(x) FPU_exception(x)
49#endif
50
51#endif /* __ASSEMBLY__ */
52
53#endif /* _EXCEPTION_H_ */
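
Note that EX_StackOver and EX_StackUnder differ only in SW_C1: both set the invalid-operation and stack-fault bits, and C1 is what later distinguishes a stack overflow from an underflow (the same role it plays for round-up versus round-down on precision loss in errors.c). A tiny stand-alone check, assuming the usual status-word layout where SW_C1 is bit 9; the real value lives in status_w.h, which is not part of this hunk:

#include <stdio.h>

#define SW_C1           0x0200                  /* assumed: bit 9, see status_w.h */
#define EX_StackUnder   0x0041                  /* invalid op + stack fault */
#define EX_StackOver    (0x0041 | SW_C1)        /* same bits, plus C1 = "overflow" */

int main(void)
{
        printf("under=0x%04x over=0x%04x differ only in 0x%04x\n",
               EX_StackUnder, EX_StackOver, EX_StackUnder ^ EX_StackOver);
        return 0;
}
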
diff --git a/arch/x86/math-emu/fpu_arith.c b/arch/x86/math-emu/fpu_arith.c
new file mode 100644
index 000000000000..6972dec01af6
--- /dev/null
+++ b/arch/x86/math-emu/fpu_arith.c
@@ -0,0 +1,174 @@
1/*---------------------------------------------------------------------------+
2 | fpu_arith.c |
3 | |
4 | Code to implement the FPU register/register arithmetic instructions |
5 | |
6 | Copyright (C) 1992,1993,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_system.h"
14#include "fpu_emu.h"
15#include "control_w.h"
16#include "status_w.h"
17
18
19void fadd__(void)
20{
21 /* fadd st,st(i) */
22 int i = FPU_rm;
23 clear_C1();
24 FPU_add(&st(i), FPU_gettagi(i), 0, control_word);
25}
26
27
28void fmul__(void)
29{
30 /* fmul st,st(i) */
31 int i = FPU_rm;
32 clear_C1();
33 FPU_mul(&st(i), FPU_gettagi(i), 0, control_word);
34}
35
36
37
38void fsub__(void)
39{
40 /* fsub st,st(i) */
41 clear_C1();
42 FPU_sub(0, FPU_rm, control_word);
43}
44
45
46void fsubr_(void)
47{
48 /* fsubr st,st(i) */
49 clear_C1();
50 FPU_sub(REV, FPU_rm, control_word);
51}
52
53
54void fdiv__(void)
55{
56 /* fdiv st,st(i) */
57 clear_C1();
58 FPU_div(0, FPU_rm, control_word);
59}
60
61
62void fdivr_(void)
63{
64 /* fdivr st,st(i) */
65 clear_C1();
66 FPU_div(REV, FPU_rm, control_word);
67}
68
69
70
71void fadd_i(void)
72{
73 /* fadd st(i),st */
74 int i = FPU_rm;
75 clear_C1();
76 FPU_add(&st(i), FPU_gettagi(i), i, control_word);
77}
78
79
80void fmul_i(void)
81{
82 /* fmul st(i),st */
83 clear_C1();
84 FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word);
85}
86
87
88void fsubri(void)
89{
90 /* fsubr st(i),st */
91 clear_C1();
92 FPU_sub(DEST_RM, FPU_rm, control_word);
93}
94
95
96void fsub_i(void)
97{
98 /* fsub st(i),st */
99 clear_C1();
100 FPU_sub(REV|DEST_RM, FPU_rm, control_word);
101}
102
103
104void fdivri(void)
105{
106 /* fdivr st(i),st */
107 clear_C1();
108 FPU_div(DEST_RM, FPU_rm, control_word);
109}
110
111
112void fdiv_i(void)
113{
114 /* fdiv st(i),st */
115 clear_C1();
116 FPU_div(REV|DEST_RM, FPU_rm, control_word);
117}
118
119
120
121void faddp_(void)
122{
123 /* faddp st(i),st */
124 int i = FPU_rm;
125 clear_C1();
126 if ( FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0 )
127 FPU_pop();
128}
129
130
131void fmulp_(void)
132{
133 /* fmulp st(i),st */
134 clear_C1();
135 if ( FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0 )
136 FPU_pop();
137}
138
139
140
141void fsubrp(void)
142{
143 /* fsubrp st(i),st */
144 clear_C1();
145 if ( FPU_sub(DEST_RM, FPU_rm, control_word) >= 0 )
146 FPU_pop();
147}
148
149
150void fsubp_(void)
151{
152 /* fsubp st(i),st */
153 clear_C1();
154 if ( FPU_sub(REV|DEST_RM, FPU_rm, control_word) >= 0 )
155 FPU_pop();
156}
157
158
159void fdivrp(void)
160{
161 /* fdivrp st(i),st */
162 clear_C1();
163 if ( FPU_div(DEST_RM, FPU_rm, control_word) >= 0 )
164 FPU_pop();
165}
166
167
168void fdivp_(void)
169{
170 /* fdivp st(i),st */
171 clear_C1();
172 if ( FPU_div(REV|DEST_RM, FPU_rm, control_word) >= 0 )
173 FPU_pop();
174}
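
The register/register entry points above funnel into FPU_sub()/FPU_div() with two flag bits from fpu_emu.h: as the call sites and instruction comments suggest, REV reverses the operand order and DEST_RM sends the result to st(i) instead of st(0). A throwaway decoder makes the mapping explicit (hypothetical helper, not kernel code):

#include <stdio.h>

#define REV     0x10            /* operands reversed: st(i) OP st(0) */
#define DEST_RM 0x20            /* result goes to st(i), not st(0) */

static void describe(const char *insn, int flags)
{
        printf("%-16s -> %s = %s - %s\n", insn,
               (flags & DEST_RM) ? "st(i)" : "st(0)",
               (flags & REV)     ? "st(i)" : "st(0)",
               (flags & REV)     ? "st(0)" : "st(i)");
}

int main(void)
{
        describe("fsub st,st(i)",  0);                  /* fsub__  */
        describe("fsubr st,st(i)", REV);                /* fsubr_  */
        describe("fsubr st(i),st", DEST_RM);            /* fsubri  */
        describe("fsub st(i),st",  REV | DEST_RM);      /* fsub_i  */
        return 0;
}
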
diff --git a/arch/x86/math-emu/fpu_asm.h b/arch/x86/math-emu/fpu_asm.h
new file mode 100644
index 000000000000..9ba12416df12
--- /dev/null
+++ b/arch/x86/math-emu/fpu_asm.h
@@ -0,0 +1,32 @@
1/*---------------------------------------------------------------------------+
2 | fpu_asm.h |
3 | |
4 | Copyright (C) 1992,1995,1997 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@suburbia.net |
7 | |
8 +---------------------------------------------------------------------------*/
9
10#ifndef _FPU_ASM_H_
11#define _FPU_ASM_H_
12
13#include <linux/linkage.h>
14
15#define EXCEPTION FPU_exception
16
17
18#define PARAM1 8(%ebp)
19#define PARAM2 12(%ebp)
20#define PARAM3 16(%ebp)
21#define PARAM4 20(%ebp)
22#define PARAM5 24(%ebp)
23#define PARAM6 28(%ebp)
24#define PARAM7 32(%ebp)
25
26#define SIGL_OFFSET 0
27#define EXP(x) 8(x)
28#define SIG(x) SIGL_OFFSET##(x)
29#define SIGL(x) SIGL_OFFSET##(x)
30#define SIGH(x) 4(x)
31
32#endif /* _FPU_ASM_H_ */
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
new file mode 100644
index 000000000000..20886cfb9f76
--- /dev/null
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -0,0 +1,204 @@
1/*---------------------------------------------------------------------------+
2 | fpu_aux.c |
3 | |
4 | Code to implement some of the FPU auxiliary instructions. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_system.h"
14#include "exception.h"
15#include "fpu_emu.h"
16#include "status_w.h"
17#include "control_w.h"
18
19
20static void fnop(void)
21{
22}
23
24static void fclex(void)
25{
26 partial_status &= ~(SW_Backward|SW_Summary|SW_Stack_Fault|SW_Precision|
27 SW_Underflow|SW_Overflow|SW_Zero_Div|SW_Denorm_Op|
28 SW_Invalid);
29 no_ip_update = 1;
30}
31
32/* Needs to be externally visible */
33void finit(void)
34{
35 control_word = 0x037f;
36 partial_status = 0;
37 top = 0; /* We don't keep top in the status word internally. */
38 fpu_tag_word = 0xffff;
39 /* The behaviour is different from that detailed in
40 Section 15.1.6 of the Intel manual */
41 operand_address.offset = 0;
42 operand_address.selector = 0;
43 instruction_address.offset = 0;
44 instruction_address.selector = 0;
45 instruction_address.opcode = 0;
46 no_ip_update = 1;
47}
48
49/*
50 * These are nops on the i387..
51 */
52#define feni fnop
53#define fdisi fnop
54#define fsetpm fnop
55
56static FUNC const finit_table[] = {
57 feni, fdisi, fclex, finit,
58 fsetpm, FPU_illegal, FPU_illegal, FPU_illegal
59};
60
61void finit_(void)
62{
63 (finit_table[FPU_rm])();
64}
65
66
67static void fstsw_ax(void)
68{
69 *(short *) &FPU_EAX = status_word();
70 no_ip_update = 1;
71}
72
73static FUNC const fstsw_table[] = {
74 fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal,
75 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
76};
77
78void fstsw_(void)
79{
80 (fstsw_table[FPU_rm])();
81}
82
83
84static FUNC const fp_nop_table[] = {
85 fnop, FPU_illegal, FPU_illegal, FPU_illegal,
86 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
87};
88
89void fp_nop(void)
90{
91 (fp_nop_table[FPU_rm])();
92}
93
94
95void fld_i_(void)
96{
97 FPU_REG *st_new_ptr;
98 int i;
99 u_char tag;
100
101 if ( STACK_OVERFLOW )
102 { FPU_stack_overflow(); return; }
103
104 /* fld st(i) */
105 i = FPU_rm;
106 if ( NOT_EMPTY(i) )
107 {
108 reg_copy(&st(i), st_new_ptr);
109 tag = FPU_gettagi(i);
110 push();
111 FPU_settag0(tag);
112 }
113 else
114 {
115 if ( control_word & CW_Invalid )
116 {
117 /* The masked response */
118 FPU_stack_underflow();
119 }
120 else
121 EXCEPTION(EX_StackUnder);
122 }
123
124}
125
126
127void fxch_i(void)
128{
129 /* fxch st(i) */
130 FPU_REG t;
131 int i = FPU_rm;
132 FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i);
133 long tag_word = fpu_tag_word;
134 int regnr = top & 7, regnri = ((regnr + i) & 7);
135 u_char st0_tag = (tag_word >> (regnr*2)) & 3;
136 u_char sti_tag = (tag_word >> (regnri*2)) & 3;
137
138 if ( st0_tag == TAG_Empty )
139 {
140 if ( sti_tag == TAG_Empty )
141 {
142 FPU_stack_underflow();
143 FPU_stack_underflow_i(i);
144 return;
145 }
146 if ( control_word & CW_Invalid )
147 {
148 /* Masked response */
149 FPU_copy_to_reg0(sti_ptr, sti_tag);
150 }
151 FPU_stack_underflow_i(i);
152 return;
153 }
154 if ( sti_tag == TAG_Empty )
155 {
156 if ( control_word & CW_Invalid )
157 {
158 /* Masked response */
159 FPU_copy_to_regi(st0_ptr, st0_tag, i);
160 }
161 FPU_stack_underflow();
162 return;
163 }
164 clear_C1();
165
166 reg_copy(st0_ptr, &t);
167 reg_copy(sti_ptr, st0_ptr);
168 reg_copy(&t, sti_ptr);
169
170 tag_word &= ~(3 << (regnr*2)) & ~(3 << (regnri*2));
171 tag_word |= (sti_tag << (regnr*2)) | (st0_tag << (regnri*2));
172 fpu_tag_word = tag_word;
173}
174
175
176void ffree_(void)
177{
178 /* ffree st(i) */
179 FPU_settagi(FPU_rm, TAG_Empty);
180}
181
182
183void ffreep(void)
184{
185 /* ffree st(i) + pop - unofficial code */
186 FPU_settagi(FPU_rm, TAG_Empty);
187 FPU_pop();
188}
189
190
191void fst_i_(void)
192{
193 /* fst st(i) */
194 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
195}
196
197
198void fstp_i(void)
199{
200 /* fstp st(i) */
201 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
202 FPU_pop();
203}
204
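
fxch_i() above shows the tag-word layout: two bits per physical register, with the physical slot obtained by adding the stack offset to top modulo 8, and finit() initialising the whole word to 0xffff (all TAG_Empty). A compact stand-alone model of those bit manipulations, using hypothetical helpers with the same layout:

#include <stdio.h>

#define TAG_Valid 0
#define TAG_Empty 3

/* Two tag bits per physical register; st(i) lives in slot (top + i) & 7. */
static unsigned get_tag(unsigned tag_word, int top, int i)
{
        int slot = (top + i) & 7;
        return (tag_word >> (slot * 2)) & 3;
}

static unsigned set_tag(unsigned tag_word, int top, int i, unsigned tag)
{
        int slot = (top + i) & 7;

        tag_word &= ~(3u << (slot * 2));
        return tag_word | (tag << (slot * 2));
}

int main(void)
{
        unsigned tw = 0xffff;           /* finit(): everything empty */
        int top = 6;                    /* arbitrary top-of-stack value */

        tw = set_tag(tw, top, 0, TAG_Valid);
        printf("st(0)=%u st(1)=%u\n", get_tag(tw, top, 0), get_tag(tw, top, 1));
        return 0;
}
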
diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h
new file mode 100644
index 000000000000..65120f523853
--- /dev/null
+++ b/arch/x86/math-emu/fpu_emu.h
@@ -0,0 +1,218 @@
1/*---------------------------------------------------------------------------+
2 | fpu_emu.h |
3 | |
4 | Copyright (C) 1992,1993,1994,1997 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@suburbia.net |
7 | |
8 +---------------------------------------------------------------------------*/
9
10
11#ifndef _FPU_EMU_H_
12#define _FPU_EMU_H_
13
14/*
15 * Define PECULIAR_486 to get a closer approximation to 80486 behaviour,
16 * rather than behaviour which appears to be cleaner.
17 * This is a matter of opinion: for all I know, the 80486 may simply
18 * be complying with the IEEE spec. Maybe one day I'll get to see the
19 * spec...
20 */
21#define PECULIAR_486
22
23#ifdef __ASSEMBLY__
24#include "fpu_asm.h"
25#define Const(x) $##x
26#else
27#define Const(x) x
28#endif
29
30#define EXP_BIAS Const(0)
31#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */
32#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */
33#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but
34 still a 16 bit nr. */
35#define EXP_Infinity EXP_OVER
36#define EXP_NaN EXP_OVER
37
38#define EXTENDED_Ebias Const(0x3fff)
39#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */
40
41#define SIGN_POS Const(0)
42#define SIGN_NEG Const(0x80)
43
44#define SIGN_Positive Const(0)
45#define SIGN_Negative Const(0x8000)
46
47
48/* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */
49/* The following fold to 2 (Special) in the Tag Word */
50#define TW_Denormal Const(4) /* De-normal */
51#define TW_Infinity Const(5) /* + or - infinity */
52#define TW_NaN Const(6) /* Not a Number */
53#define TW_Unsupported Const(7) /* Not supported by an 80486 */
54
55#define TAG_Valid Const(0) /* valid */
56#define TAG_Zero Const(1) /* zero */
57#define TAG_Special Const(2) /* De-normal, + or - infinity,
58 or Not a Number */
59#define TAG_Empty Const(3) /* empty */
60#define TAG_Error Const(0x80) /* probably need to abort */
61
62#define LOADED_DATA Const(10101) /* Special st() number to identify
63 loaded data (not on stack). */
64
65/* A few flags (must be >= 0x10). */
66#define REV 0x10
67#define DEST_RM 0x20
68#define LOADED 0x40
69
70#define FPU_Exception Const(0x80000000) /* Added to tag returns. */
71
72
73#ifndef __ASSEMBLY__
74
75#include "fpu_system.h"
76
77#include <asm/sigcontext.h> /* for struct _fpstate */
78#include <asm/math_emu.h>
79#include <linux/linkage.h>
80
81/*
82#define RE_ENTRANT_CHECKING
83 */
84
85#ifdef RE_ENTRANT_CHECKING
86extern u_char emulating;
87# define RE_ENTRANT_CHECK_OFF emulating = 0
88# define RE_ENTRANT_CHECK_ON emulating = 1
89#else
90# define RE_ENTRANT_CHECK_OFF
91# define RE_ENTRANT_CHECK_ON
92#endif /* RE_ENTRANT_CHECKING */
93
94#define FWAIT_OPCODE 0x9b
95#define OP_SIZE_PREFIX 0x66
96#define ADDR_SIZE_PREFIX 0x67
97#define PREFIX_CS 0x2e
98#define PREFIX_DS 0x3e
99#define PREFIX_ES 0x26
100#define PREFIX_SS 0x36
101#define PREFIX_FS 0x64
102#define PREFIX_GS 0x65
103#define PREFIX_REPE 0xf3
104#define PREFIX_REPNE 0xf2
105#define PREFIX_LOCK 0xf0
106#define PREFIX_CS_ 1
107#define PREFIX_DS_ 2
108#define PREFIX_ES_ 3
109#define PREFIX_FS_ 4
110#define PREFIX_GS_ 5
111#define PREFIX_SS_ 6
112#define PREFIX_DEFAULT 7
113
114struct address {
115 unsigned int offset;
116 unsigned int selector:16;
117 unsigned int opcode:11;
118 unsigned int empty:5;
119};
120struct fpu__reg {
121 unsigned sigl;
122 unsigned sigh;
123 short exp;
124};
125
126typedef void (*FUNC)(void);
127typedef struct fpu__reg FPU_REG;
128typedef void (*FUNC_ST0)(FPU_REG *st0_ptr, u_char st0_tag);
129typedef struct { u_char address_size, operand_size, segment; }
130 overrides;
131/* This structure is 32 bits: */
132typedef struct { overrides override;
133 u_char default_mode; } fpu_addr_modes;
134/* PROTECTED has a restricted meaning in the emulator; it is used
135 to signal that the emulator needs to do special things to ensure
136 that protection is respected in a segmented model. */
137#define PROTECTED 4
138#define SIXTEEN 1 /* We rely upon this being 1 (true) */
139#define VM86 SIXTEEN
140#define PM16 (SIXTEEN | PROTECTED)
141#define SEG32 PROTECTED
142extern u_char const data_sizes_16[32];
143
144#define register_base ((u_char *) registers )
145#define fpu_register(x) ( * ((FPU_REG *)( register_base + 10 * (x & 7) )) )
146#define st(x) ( * ((FPU_REG *)( register_base + 10 * ((top+x) & 7) )) )
147
148#define STACK_OVERFLOW (FPU_stackoverflow(&st_new_ptr))
149#define NOT_EMPTY(i) (!FPU_empty_i(i))
150
151#define NOT_EMPTY_ST0 (st0_tag ^ TAG_Empty)
152
153#define poppop() { FPU_pop(); FPU_pop(); }
154
155/* push() does not affect the tags */
156#define push() { top--; }
157
158#define signbyte(a) (((u_char *)(a))[9])
159#define getsign(a) (signbyte(a) & 0x80)
160#define setsign(a,b) { if (b) signbyte(a) |= 0x80; else signbyte(a) &= 0x7f; }
161#define copysign(a,b) { if (getsign(a)) signbyte(b) |= 0x80; \
162 else signbyte(b) &= 0x7f; }
163#define changesign(a) { signbyte(a) ^= 0x80; }
164#define setpositive(a) { signbyte(a) &= 0x7f; }
165#define setnegative(a) { signbyte(a) |= 0x80; }
166#define signpositive(a) ( (signbyte(a) & 0x80) == 0 )
167#define signnegative(a) (signbyte(a) & 0x80)
168
169static inline void reg_copy(FPU_REG const *x, FPU_REG *y)
170{
171 *(short *)&(y->exp) = *(const short *)&(x->exp);
172 *(long long *)&(y->sigl) = *(const long long *)&(x->sigl);
173}
174
175#define exponent(x) (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias)
176#define setexponentpos(x,y) { (*(short *)&((x)->exp)) = \
177 ((y) + EXTENDED_Ebias) & 0x7fff; }
178#define exponent16(x) (*(short *)&((x)->exp))
179#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (y); }
180#define addexponent(x,y) { (*(short *)&((x)->exp)) += (y); }
181#define stdexp(x) { (*(short *)&((x)->exp)) += EXTENDED_Ebias; }
182
183#define isdenormal(ptr) (exponent(ptr) == EXP_BIAS+EXP_UNDER)
184
185#define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] )
186
187
188/*----- Prototypes for functions written in assembler -----*/
189/* extern void reg_move(FPU_REG *a, FPU_REG *b); */
190
191asmlinkage int FPU_normalize(FPU_REG *x);
192asmlinkage int FPU_normalize_nuo(FPU_REG *x);
193asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2,
194 FPU_REG *answ, unsigned int control_w, u_char sign,
195 int expa, int expb);
196asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2,
197 FPU_REG *answ, unsigned int control_w, u_char sign,
198 int expon);
199asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2,
200 FPU_REG *answ, unsigned int control_w, u_char sign);
201asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2,
202 FPU_REG *answ, unsigned int control_w, u_char sign,
203 int expa, int expb);
204asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2,
205 unsigned int control_w, u_char sign);
206asmlinkage unsigned FPU_shrx(void *l, unsigned x);
207asmlinkage unsigned FPU_shrxs(void *v, unsigned x);
208asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y);
209asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy,
210 unsigned int control_w, u_char sign);
211
212#ifndef MAKING_PROTO
213#include "fpu_proto.h"
214#endif
215
216#endif /* __ASSEMBLY__ */
217
218#endif /* _FPU_EMU_H_ */
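
The accessor macros above treat the 10-byte register as sigl/sigh plus a 16-bit exp field whose top bit is the sign (signbyte() reads byte 9, i.e. the high byte of exp) and whose low 15 bits hold the exponent biased by EXTENDED_Ebias. A stand-alone sketch of that decoding, mirroring exponent() and getsign() on a made-up value:

#include <stdio.h>
#include <stdint.h>

#define EXTENDED_Ebias 0x3fff

int main(void)
{
        /* A made-up register: negative sign, unbiased exponent +3. */
        uint16_t exp_field = 0x8000 | (EXTENDED_Ebias + 3);

        int sign = (exp_field >> 15) & 1;                       /* getsign(): top bit of byte 9 */
        int expo = (exp_field & 0x7fff) - EXTENDED_Ebias;       /* exponent() */

        printf("sign=%d exponent=%d\n", sign, expo);
        return 0;
}
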
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
new file mode 100644
index 000000000000..1853524c8b57
--- /dev/null
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -0,0 +1,761 @@
1/*---------------------------------------------------------------------------+
2 | fpu_entry.c |
3 | |
4 | The entry functions for wm-FPU-emu |
5 | |
6 | Copyright (C) 1992,1993,1994,1996,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | See the files "README" and "COPYING" for further copyright and warranty |
11 | information. |
12 | |
13 +---------------------------------------------------------------------------*/
14
15/*---------------------------------------------------------------------------+
16 | Note: |
17 | The file contains code which accesses user memory. |
18 | Emulator static data may change when user memory is accessed, due to |
19 | other processes using the emulator while swapping is in progress. |
20 +---------------------------------------------------------------------------*/
21
22/*---------------------------------------------------------------------------+
23 | math_emulate(), restore_i387_soft() and save_i387_soft() are the only |
24 | entry points for wm-FPU-emu. |
25 +---------------------------------------------------------------------------*/
26
27#include <linux/signal.h>
28#include <linux/ptrace.h>
29
30#include <asm/uaccess.h>
31#include <asm/desc.h>
32
33#include "fpu_system.h"
34#include "fpu_emu.h"
35#include "exception.h"
36#include "control_w.h"
37#include "status_w.h"
38
39#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */
40
41#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */
42
43/* WARNING: These codes are not documented by Intel in their 80486 manual
44 and may not work on FPU clones or later Intel FPUs. */
45
46/* Changes to support the un-doc codes provided by Linus Torvalds. */
47
48#define _d9_d8_ fstp_i /* unofficial code (19) */
49#define _dc_d0_ fcom_st /* unofficial code (14) */
50#define _dc_d8_ fcompst /* unofficial code (1c) */
51#define _dd_c8_ fxch_i /* unofficial code (0d) */
52#define _de_d0_ fcompst /* unofficial code (16) */
53#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */
54#define _df_c8_ fxch_i /* unofficial code (0f) */
55#define _df_d0_ fstp_i /* unofficial code (17) */
56#define _df_d8_ fstp_i /* unofficial code (1f) */
57
58static FUNC const st_instr_table[64] = {
59 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_,
60 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_,
61 fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_,
62 fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_,
63 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
64 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
65 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
66 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
67};
68
69#else /* Support only documented FPU op-codes */
70
71static FUNC const st_instr_table[64] = {
72 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__,
73 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__,
74 fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__,
75 fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__,
76 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
77 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
78 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
79 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
80};
81
82#endif /* NO_UNDOC_CODE */
83
84
85#define _NONE_ 0 /* Take no special action */
86#define _REG0_ 1 /* Need to check for not empty st(0) */
87#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */
88#define _REGi_ 0 /* Uses st(rm) */
89#define _PUSH_ 3 /* Need to check for space to push onto stack */
90#define _null_ 4 /* Function illegal or not implemented */
91#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */
92#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */
93#define _REGIc 0 /* Compare st(0) and st(rm) */
94#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */
95
96#ifndef NO_UNDOC_CODE
97
98/* Un-documented FPU op-codes supported by default. (see above) */
99
100static u_char const type_table[64] = {
101 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_,
102 _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_,
103 _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
104 _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
105 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
106 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
107 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
108 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
109};
110
111#else /* Support only documented FPU op-codes */
112
113static u_char const type_table[64] = {
114 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_,
115 _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
116 _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_,
117 _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_,
118 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
119 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
120 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
121 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
122};
123
124#endif /* NO_UNDOC_CODE */
125
126
127#ifdef RE_ENTRANT_CHECKING
128u_char emulating=0;
129#endif /* RE_ENTRANT_CHECKING */
130
131static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
132 overrides *override);
133
134asmlinkage void math_emulate(long arg)
135{
136 u_char FPU_modrm, byte1;
137 unsigned short code;
138 fpu_addr_modes addr_modes;
139 int unmasked;
140 FPU_REG loaded_data;
141 FPU_REG *st0_ptr;
142 u_char loaded_tag, st0_tag;
143 void __user *data_address;
144 struct address data_sel_off;
145 struct address entry_sel_off;
146 unsigned long code_base = 0;
147 unsigned long code_limit = 0; /* Initialized to stop compiler warnings */
148 struct desc_struct code_descriptor;
149
150#ifdef RE_ENTRANT_CHECKING
151 if ( emulating )
152 {
153 printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n");
154 }
155 RE_ENTRANT_CHECK_ON;
156#endif /* RE_ENTRANT_CHECKING */
157
158 if (!used_math())
159 {
160 finit();
161 set_used_math();
162 }
163
164 SETUP_DATA_AREA(arg);
165
166 FPU_ORIG_EIP = FPU_EIP;
167
168 if ( (FPU_EFLAGS & 0x00020000) != 0 )
169 {
170 /* Virtual 8086 mode */
171 addr_modes.default_mode = VM86;
172 FPU_EIP += code_base = FPU_CS << 4;
173 code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */
174 }
175 else if ( FPU_CS == __USER_CS && FPU_DS == __USER_DS )
176 {
177 addr_modes.default_mode = 0;
178 }
179 else if ( FPU_CS == __KERNEL_CS )
180 {
181 printk("math_emulate: %04x:%08lx\n",FPU_CS,FPU_EIP);
182 panic("Math emulation needed in kernel");
183 }
184 else
185 {
186
187 if ( (FPU_CS & 4) != 4 ) /* Must be in the LDT */
188 {
189 /* Can only handle segmented addressing via the LDT
190 for now, and it must be 16 bit */
191 printk("FPU emulator: Unsupported addressing mode\n");
192 math_abort(FPU_info, SIGILL);
193 }
194
195 code_descriptor = LDT_DESCRIPTOR(FPU_CS);
196 if ( SEG_D_SIZE(code_descriptor) )
197 {
198 /* The above test may be wrong, the book is not clear */
199 /* Segmented 32 bit protected mode */
200 addr_modes.default_mode = SEG32;
201 }
202 else
203 {
204 /* 16 bit protected mode */
205 addr_modes.default_mode = PM16;
206 }
207 FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor);
208 code_limit = code_base
209 + (SEG_LIMIT(code_descriptor)+1) * SEG_GRANULARITY(code_descriptor)
210 - 1;
211 if ( code_limit < code_base ) code_limit = 0xffffffff;
212 }
213
214 FPU_lookahead = 1;
215 if (current->ptrace & PT_PTRACED)
216 FPU_lookahead = 0;
217
218 if ( !valid_prefix(&byte1, (u_char __user **)&FPU_EIP,
219 &addr_modes.override) )
220 {
221 RE_ENTRANT_CHECK_OFF;
222 printk("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n"
223 "FPU emulator: self-modifying code! (emulation impossible)\n",
224 byte1);
225 RE_ENTRANT_CHECK_ON;
226 EXCEPTION(EX_INTERNAL|0x126);
227 math_abort(FPU_info,SIGILL);
228 }
229
230do_another_FPU_instruction:
231
232 no_ip_update = 0;
233
234 FPU_EIP++; /* We have fetched the prefix and first code bytes. */
235
236 if ( addr_modes.default_mode )
237 {
238 /* This checks for the minimum instruction bytes.
239 We also need to check any extra (address mode) code access. */
240 if ( FPU_EIP > code_limit )
241 math_abort(FPU_info,SIGSEGV);
242 }
243
244 if ( (byte1 & 0xf8) != 0xd8 )
245 {
246 if ( byte1 == FWAIT_OPCODE )
247 {
248 if (partial_status & SW_Summary)
249 goto do_the_FPU_interrupt;
250 else
251 goto FPU_fwait_done;
252 }
253#ifdef PARANOID
254 EXCEPTION(EX_INTERNAL|0x128);
255 math_abort(FPU_info,SIGILL);
256#endif /* PARANOID */
257 }
258
259 RE_ENTRANT_CHECK_OFF;
260 FPU_code_access_ok(1);
261 FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP);
262 RE_ENTRANT_CHECK_ON;
263 FPU_EIP++;
264
265 if (partial_status & SW_Summary)
266 {
267 /* Ignore the error for now if the current instruction is a no-wait
268 control instruction */
269 /* The 80486 manual contradicts itself on this topic,
270 but a real 80486 uses the following instructions:
271 fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex.
272 */
273 code = (FPU_modrm << 8) | byte1;
274 if ( ! ( (((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */
275 (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv,
276 fnstsw */
277 ((code & 0xc000) != 0xc000))) ) )
278 {
279 /*
280 * We need to simulate the action of the kernel to FPU
281 * interrupts here.
282 */
283 do_the_FPU_interrupt:
284
285 FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */
286
287 RE_ENTRANT_CHECK_OFF;
288 current->thread.trap_no = 16;
289 current->thread.error_code = 0;
290 send_sig(SIGFPE, current, 1);
291 return;
292 }
293 }
294
295 entry_sel_off.offset = FPU_ORIG_EIP;
296 entry_sel_off.selector = FPU_CS;
297 entry_sel_off.opcode = (byte1 << 8) | FPU_modrm;
298
299 FPU_rm = FPU_modrm & 7;
300
301 if ( FPU_modrm < 0300 )
302 {
303 /* All of these instructions use the mod/rm byte to get a data address */
304
305 if ( (addr_modes.default_mode & SIXTEEN)
306 ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX) )
307 data_address = FPU_get_address_16(FPU_modrm, &FPU_EIP, &data_sel_off,
308 addr_modes);
309 else
310 data_address = FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off,
311 addr_modes);
312
313 if ( addr_modes.default_mode )
314 {
315 if ( FPU_EIP-1 > code_limit )
316 math_abort(FPU_info,SIGSEGV);
317 }
318
319 if ( !(byte1 & 1) )
320 {
321 unsigned short status1 = partial_status;
322
323 st0_ptr = &st(0);
324 st0_tag = FPU_gettag0();
325
326 /* Stack underflow has priority */
327 if ( NOT_EMPTY_ST0 )
328 {
329 if ( addr_modes.default_mode & PROTECTED )
330 {
331 /* This table works for 16 and 32 bit protected mode */
332 if ( access_limit < data_sizes_16[(byte1 >> 1) & 3] )
333 math_abort(FPU_info,SIGSEGV);
334 }
335
336 unmasked = 0; /* Do this here to stop compiler warnings. */
337 switch ( (byte1 >> 1) & 3 )
338 {
339 case 0:
340 unmasked = FPU_load_single((float __user *)data_address,
341 &loaded_data);
342 loaded_tag = unmasked & 0xff;
343 unmasked &= ~0xff;
344 break;
345 case 1:
346 loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data);
347 break;
348 case 2:
349 unmasked = FPU_load_double((double __user *)data_address,
350 &loaded_data);
351 loaded_tag = unmasked & 0xff;
352 unmasked &= ~0xff;
353 break;
354 case 3:
355 default: /* Used here to suppress gcc warnings. */
356 loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data);
357 break;
358 }
359
360 /* No more access to user memory, it is safe
361 to use static data now */
362
363 /* NaN operands have the next priority. */
364 /* We have to delay looking at st(0) until after
365 loading the data, because that data might contain an SNaN */
366 if ( ((st0_tag == TAG_Special) && isNaN(st0_ptr)) ||
367 ((loaded_tag == TAG_Special) && isNaN(&loaded_data)) )
368 {
369 /* Restore the status word; we might have loaded a
370 denormal. */
371 partial_status = status1;
372 if ( (FPU_modrm & 0x30) == 0x10 )
373 {
374 /* fcom or fcomp */
375 EXCEPTION(EX_Invalid);
376 setcc(SW_C3 | SW_C2 | SW_C0);
377 if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) )
378 FPU_pop(); /* fcomp, masked, so we pop. */
379 }
380 else
381 {
382 if ( loaded_tag == TAG_Special )
383 loaded_tag = FPU_Special(&loaded_data);
384#ifdef PECULIAR_486
385 /* This is not really needed, but gives behaviour
386 identical to an 80486 */
387 if ( (FPU_modrm & 0x28) == 0x20 )
388 /* fdiv or fsub */
389 real_2op_NaN(&loaded_data, loaded_tag, 0, &loaded_data);
390 else
391#endif /* PECULIAR_486 */
392 /* fadd, fdivr, fmul, or fsubr */
393 real_2op_NaN(&loaded_data, loaded_tag, 0, st0_ptr);
394 }
395 goto reg_mem_instr_done;
396 }
397
398 if ( unmasked && !((FPU_modrm & 0x30) == 0x10) )
399 {
400 /* Is not a comparison instruction. */
401 if ( (FPU_modrm & 0x38) == 0x38 )
402 {
403 /* fdivr */
404 if ( (st0_tag == TAG_Zero) &&
405 ((loaded_tag == TAG_Valid)
406 || (loaded_tag == TAG_Special
407 && isdenormal(&loaded_data))) )
408 {
409 if ( FPU_divide_by_zero(0, getsign(&loaded_data))
410 < 0 )
411 {
412 /* We use the fact here that the unmasked
413 exception in the loaded data was for a
414 denormal operand */
415 /* Restore the state of the denormal op bit */
416 partial_status &= ~SW_Denorm_Op;
417 partial_status |= status1 & SW_Denorm_Op;
418 }
419 else
420 setsign(st0_ptr, getsign(&loaded_data));
421 }
422 }
423 goto reg_mem_instr_done;
424 }
425
426 switch ( (FPU_modrm >> 3) & 7 )
427 {
428 case 0: /* fadd */
429 clear_C1();
430 FPU_add(&loaded_data, loaded_tag, 0, control_word);
431 break;
432 case 1: /* fmul */
433 clear_C1();
434 FPU_mul(&loaded_data, loaded_tag, 0, control_word);
435 break;
436 case 2: /* fcom */
437 FPU_compare_st_data(&loaded_data, loaded_tag);
438 break;
439 case 3: /* fcomp */
440 if ( !FPU_compare_st_data(&loaded_data, loaded_tag)
441 && !unmasked )
442 FPU_pop();
443 break;
444 case 4: /* fsub */
445 clear_C1();
446 FPU_sub(LOADED|loaded_tag, (int)&loaded_data, control_word);
447 break;
448 case 5: /* fsubr */
449 clear_C1();
450 FPU_sub(REV|LOADED|loaded_tag, (int)&loaded_data, control_word);
451 break;
452 case 6: /* fdiv */
453 clear_C1();
454 FPU_div(LOADED|loaded_tag, (int)&loaded_data, control_word);
455 break;
456 case 7: /* fdivr */
457 clear_C1();
458 if ( st0_tag == TAG_Zero )
459 partial_status = status1; /* Undo any denorm tag,
460 zero-divide has priority. */
461 FPU_div(REV|LOADED|loaded_tag, (int)&loaded_data, control_word);
462 break;
463 }
464 }
465 else
466 {
467 if ( (FPU_modrm & 0x30) == 0x10 )
468 {
469 /* The instruction is fcom or fcomp */
470 EXCEPTION(EX_StackUnder);
471 setcc(SW_C3 | SW_C2 | SW_C0);
472 if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) )
473 FPU_pop(); /* fcomp */
474 }
475 else
476 FPU_stack_underflow();
477 }
478 reg_mem_instr_done:
479 operand_address = data_sel_off;
480 }
481 else
482 {
483 if ( !(no_ip_update =
484 FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6)) >> 1,
485 addr_modes, data_address)) )
486 {
487 operand_address = data_sel_off;
488 }
489 }
490
491 }
492 else
493 {
494 /* None of these instructions access user memory */
495 u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7);
496
497#ifdef PECULIAR_486
498 /* This is supposed to be undefined, but a real 80486 seems
499 to do this: */
500 operand_address.offset = 0;
501 operand_address.selector = FPU_DS;
502#endif /* PECULIAR_486 */
503
504 st0_ptr = &st(0);
505 st0_tag = FPU_gettag0();
506 switch ( type_table[(int) instr_index] )
507 {
508 case _NONE_: /* also _REGIc: _REGIn */
509 break;
510 case _REG0_:
511 if ( !NOT_EMPTY_ST0 )
512 {
513 FPU_stack_underflow();
514 goto FPU_instruction_done;
515 }
516 break;
517 case _REGIi:
518 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) )
519 {
520 FPU_stack_underflow_i(FPU_rm);
521 goto FPU_instruction_done;
522 }
523 break;
524 case _REGIp:
525 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) )
526 {
527 FPU_stack_underflow_pop(FPU_rm);
528 goto FPU_instruction_done;
529 }
530 break;
531 case _REGI_:
532 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) )
533 {
534 FPU_stack_underflow();
535 goto FPU_instruction_done;
536 }
537 break;
538 case _PUSH_: /* Only used by the fld st(i) instruction */
539 break;
540 case _null_:
541 FPU_illegal();
542 goto FPU_instruction_done;
543 default:
544 EXCEPTION(EX_INTERNAL|0x111);
545 goto FPU_instruction_done;
546 }
547 (*st_instr_table[(int) instr_index])();
548
549FPU_instruction_done:
550 ;
551 }
552
553 if ( ! no_ip_update )
554 instruction_address = entry_sel_off;
555
556FPU_fwait_done:
557
558#ifdef DEBUG
559 RE_ENTRANT_CHECK_OFF;
560 FPU_printall();
561 RE_ENTRANT_CHECK_ON;
562#endif /* DEBUG */
563
564 if (FPU_lookahead && !need_resched())
565 {
566 FPU_ORIG_EIP = FPU_EIP - code_base;
567 if ( valid_prefix(&byte1, (u_char __user **)&FPU_EIP,
568 &addr_modes.override) )
569 goto do_another_FPU_instruction;
570 }
571
572 if ( addr_modes.default_mode )
573 FPU_EIP -= code_base;
574
575 RE_ENTRANT_CHECK_OFF;
576}
577
578
579/* Support for prefix bytes is not yet complete. To properly handle
580 all prefix bytes, further changes are needed in the emulator code
581 which accesses user address space. Access to separate segments is
582 important for msdos emulation. */
583static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
584 overrides *override)
585{
586 u_char byte;
587 u_char __user *ip = *fpu_eip;
588
589 *override = (overrides) { 0, 0, PREFIX_DEFAULT }; /* defaults */
590
591 RE_ENTRANT_CHECK_OFF;
592 FPU_code_access_ok(1);
593 FPU_get_user(byte, ip);
594 RE_ENTRANT_CHECK_ON;
595
596 while ( 1 )
597 {
598 switch ( byte )
599 {
600 case ADDR_SIZE_PREFIX:
601 override->address_size = ADDR_SIZE_PREFIX;
602 goto do_next_byte;
603
604 case OP_SIZE_PREFIX:
605 override->operand_size = OP_SIZE_PREFIX;
606 goto do_next_byte;
607
608 case PREFIX_CS:
609 override->segment = PREFIX_CS_;
610 goto do_next_byte;
611 case PREFIX_ES:
612 override->segment = PREFIX_ES_;
613 goto do_next_byte;
614 case PREFIX_SS:
615 override->segment = PREFIX_SS_;
616 goto do_next_byte;
617 case PREFIX_FS:
618 override->segment = PREFIX_FS_;
619 goto do_next_byte;
620 case PREFIX_GS:
621 override->segment = PREFIX_GS_;
622 goto do_next_byte;
623 case PREFIX_DS:
624 override->segment = PREFIX_DS_;
625 goto do_next_byte;
626
627/* lock is not a valid prefix for FPU instructions,
628 let the cpu handle it to generate a SIGILL. */
629/* case PREFIX_LOCK: */
630
631 /* rep.. prefixes have no meaning for FPU instructions */
632 case PREFIX_REPE:
633 case PREFIX_REPNE:
634
635 do_next_byte:
636 ip++;
637 RE_ENTRANT_CHECK_OFF;
638 FPU_code_access_ok(1);
639 FPU_get_user(byte, ip);
640 RE_ENTRANT_CHECK_ON;
641 break;
642 case FWAIT_OPCODE:
643 *Byte = byte;
644 return 1;
645 default:
646 if ( (byte & 0xf8) == 0xd8 )
647 {
648 *Byte = byte;
649 *fpu_eip = ip;
650 return 1;
651 }
652 else
653 {
654 /* Not a valid sequence of prefix bytes followed by
655 an FPU instruction. */
656 *Byte = byte; /* Needed for error message. */
657 return 0;
658 }
659 }
660 }
661}
662
663
664void math_abort(struct info * info, unsigned int signal)
665{
666 FPU_EIP = FPU_ORIG_EIP;
667 current->thread.trap_no = 16;
668 current->thread.error_code = 0;
669 send_sig(signal,current,1);
670 RE_ENTRANT_CHECK_OFF;
671 __asm__("movl %0,%%esp ; ret": :"g" (((long) info)-4));
672#ifdef PARANOID
673 printk("ERROR: wm-FPU-emu math_abort failed!\n");
674#endif /* PARANOID */
675}
676
677
678
679#define S387 ((struct i387_soft_struct *)s387)
680#define sstatus_word() \
681 ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top))
682
683int restore_i387_soft(void *s387, struct _fpstate __user *buf)
684{
685 u_char __user *d = (u_char __user *)buf;
686 int offset, other, i, tags, regnr, tag, newtop;
687
688 RE_ENTRANT_CHECK_OFF;
689 FPU_access_ok(VERIFY_READ, d, 7*4 + 8*10);
690 if (__copy_from_user(&S387->cwd, d, 7*4))
691 return -1;
692 RE_ENTRANT_CHECK_ON;
693
694 d += 7*4;
695
696 S387->ftop = (S387->swd >> SW_Top_Shift) & 7;
697 offset = (S387->ftop & 7) * 10;
698 other = 80 - offset;
699
700 RE_ENTRANT_CHECK_OFF;
701 /* Copy all registers in stack order. */
702 if (__copy_from_user(((u_char *)&S387->st_space)+offset, d, other))
703 return -1;
704 if ( offset )
705 if (__copy_from_user((u_char *)&S387->st_space, d+other, offset))
706 return -1;
707 RE_ENTRANT_CHECK_ON;
708
709 /* The tags may need to be corrected now. */
710 tags = S387->twd;
711 newtop = S387->ftop;
712 for ( i = 0; i < 8; i++ )
713 {
714 regnr = (i+newtop) & 7;
715 if ( ((tags >> ((regnr & 7)*2)) & 3) != TAG_Empty )
716 {
	  717	      /* The loaded data overrides all other cases. */
718 tag = FPU_tagof((FPU_REG *)((u_char *)S387->st_space + 10*regnr));
719 tags &= ~(3 << (regnr*2));
720 tags |= (tag & 3) << (regnr*2);
721 }
722 }
723 S387->twd = tags;
724
725 return 0;
726}
727
728
729int save_i387_soft(void *s387, struct _fpstate __user * buf)
730{
731 u_char __user *d = (u_char __user *)buf;
732 int offset = (S387->ftop & 7) * 10, other = 80 - offset;
733
734 RE_ENTRANT_CHECK_OFF;
735 FPU_access_ok(VERIFY_WRITE, d, 7*4 + 8*10);
736#ifdef PECULIAR_486
737 S387->cwd &= ~0xe080;
738 /* An 80486 sets nearly all of the reserved bits to 1. */
739 S387->cwd |= 0xffff0040;
740 S387->swd = sstatus_word() | 0xffff0000;
741 S387->twd |= 0xffff0000;
742 S387->fcs &= ~0xf8000000;
743 S387->fos |= 0xffff0000;
744#endif /* PECULIAR_486 */
745 if (__copy_to_user(d, &S387->cwd, 7*4))
746 return -1;
747 RE_ENTRANT_CHECK_ON;
748
749 d += 7*4;
750
751 RE_ENTRANT_CHECK_OFF;
752 /* Copy all registers in stack order. */
753 if (__copy_to_user(d, ((u_char *)&S387->st_space)+offset, other))
754 return -1;
755 if ( offset )
756 if (__copy_to_user(d+other, (u_char *)&S387->st_space, offset))
757 return -1;
758 RE_ENTRANT_CHECK_ON;
759
760 return 1;
761}
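
Both context-switch helpers above walk the 80 bytes of st_space in stack order: first the slice from the physical register holding st(0) up to the end of the array, then the wrapped-around remainder. A minimal sketch of that split, using plain memcpy() on an ordinary buffer instead of the __copy_to_user()/__copy_from_user() calls; the helper name and flat destination buffer are illustrative only, not part of the emulator.

#include <string.h>

/* Illustrative only: copy the eight 10-byte registers into a flat
 * buffer in stack order (st(0) first), mirroring the offset/other
 * split used by save_i387_soft() above. */
static void copy_regs_stack_order(unsigned char dst[80],
				  const unsigned char st_space[80],
				  unsigned int ftop)
{
	unsigned int offset = (ftop & 7) * 10;	/* bytes preceding st(0) */
	unsigned int other = 80 - offset;	/* bytes from st(0) to the end */

	memcpy(dst, st_space + offset, other);	/* st(0) .. last physical reg */
	if (offset)
		memcpy(dst + other, st_space, offset);	/* wrap-around part */
}
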
diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c
new file mode 100644
index 000000000000..e3b5d465587f
--- /dev/null
+++ b/arch/x86/math-emu/fpu_etc.c
@@ -0,0 +1,143 @@
1/*---------------------------------------------------------------------------+
2 | fpu_etc.c |
3 | |
4 | Implement a few FPU instructions. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_system.h"
14#include "exception.h"
15#include "fpu_emu.h"
16#include "status_w.h"
17#include "reg_constant.h"
18
19
20static void fchs(FPU_REG *st0_ptr, u_char st0tag)
21{
22 if ( st0tag ^ TAG_Empty )
23 {
24 signbyte(st0_ptr) ^= SIGN_NEG;
25 clear_C1();
26 }
27 else
28 FPU_stack_underflow();
29}
30
31
32static void fabs(FPU_REG *st0_ptr, u_char st0tag)
33{
34 if ( st0tag ^ TAG_Empty )
35 {
36 setpositive(st0_ptr);
37 clear_C1();
38 }
39 else
40 FPU_stack_underflow();
41}
42
43
44static void ftst_(FPU_REG *st0_ptr, u_char st0tag)
45{
46 switch (st0tag)
47 {
48 case TAG_Zero:
49 setcc(SW_C3);
50 break;
51 case TAG_Valid:
52 if (getsign(st0_ptr) == SIGN_POS)
53 setcc(0);
54 else
55 setcc(SW_C0);
56 break;
57 case TAG_Special:
58 switch ( FPU_Special(st0_ptr) )
59 {
60 case TW_Denormal:
61 if (getsign(st0_ptr) == SIGN_POS)
62 setcc(0);
63 else
64 setcc(SW_C0);
65 if ( denormal_operand() < 0 )
66 {
67#ifdef PECULIAR_486
68 /* This is weird! */
69 if (getsign(st0_ptr) == SIGN_POS)
70 setcc(SW_C3);
71#endif /* PECULIAR_486 */
72 return;
73 }
74 break;
75 case TW_NaN:
76 setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */
77 EXCEPTION(EX_Invalid);
78 break;
79 case TW_Infinity:
80 if (getsign(st0_ptr) == SIGN_POS)
81 setcc(0);
82 else
83 setcc(SW_C0);
84 break;
85 default:
86 setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */
87 EXCEPTION(EX_INTERNAL|0x14);
88 break;
89 }
90 break;
91 case TAG_Empty:
92 setcc(SW_C0|SW_C2|SW_C3);
93 EXCEPTION(EX_StackUnder);
94 break;
95 }
96}
97
98
99static void fxam(FPU_REG *st0_ptr, u_char st0tag)
100{
101 int c = 0;
102 switch (st0tag)
103 {
104 case TAG_Empty:
105 c = SW_C3|SW_C0;
106 break;
107 case TAG_Zero:
108 c = SW_C3;
109 break;
110 case TAG_Valid:
111 c = SW_C2;
112 break;
113 case TAG_Special:
114 switch ( FPU_Special(st0_ptr) )
115 {
116 case TW_Denormal:
117 c = SW_C2|SW_C3; /* Denormal */
118 break;
119 case TW_NaN:
120 /* We also use NaN for unsupported types. */
121 if ( (st0_ptr->sigh & 0x80000000) && (exponent(st0_ptr) == EXP_OVER) )
122 c = SW_C0;
123 break;
124 case TW_Infinity:
125 c = SW_C2|SW_C0;
126 break;
127 }
128 }
129 if ( getsign(st0_ptr) == SIGN_NEG )
130 c |= SW_C1;
131 setcc(c);
132}
133
134
135static FUNC_ST0 const fp_etc_table[] = {
136 fchs, fabs, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal,
137 ftst_, fxam, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal
138};
139
140void FPU_etc(void)
141{
142 (fp_etc_table[FPU_rm])(&st(0), FPU_gettag0());
143}
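
fxam() above reports the class of st(0) through the C3/C2/C0 condition-code bits and the sign through C1. A short decoder for those bits, assuming the conventional x87 status-word layout (C0 = bit 8, C1 = bit 9, C2 = bit 10, C3 = bit 14); the function and the returned strings are illustrative, not part of the emulator.

static const char *fxam_class(unsigned short swd)
{
	int c3 = (swd >> 14) & 1, c2 = (swd >> 10) & 1, c0 = (swd >> 8) & 1;

	if (c3 && c0)
		return "empty";		/* SW_C3|SW_C0 */
	if (c3 && c2)
		return "denormal";	/* SW_C2|SW_C3 */
	if (c3)
		return "zero";		/* SW_C3 */
	if (c2 && c0)
		return "infinity";	/* SW_C2|SW_C0 */
	if (c2)
		return "valid finite";	/* SW_C2 */
	if (c0)
		return "NaN";		/* SW_C0 */
	return "unsupported";		/* no class bit set */
}
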
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
new file mode 100644
index 000000000000..37a8a7fe7e2b
--- /dev/null
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -0,0 +1,140 @@
1#ifndef _FPU_PROTO_H
2#define _FPU_PROTO_H
3
4/* errors.c */
5extern void FPU_illegal(void);
6extern void FPU_printall(void);
7asmlinkage void FPU_exception(int n);
8extern int real_1op_NaN(FPU_REG *a);
9extern int real_2op_NaN(FPU_REG const *b, u_char tagb, int deststnr,
10 FPU_REG const *defaultNaN);
11asmlinkage int arith_invalid(int deststnr);
12asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign);
13extern int set_precision_flag(int flags);
14asmlinkage void set_precision_flag_up(void);
15asmlinkage void set_precision_flag_down(void);
16asmlinkage int denormal_operand(void);
17asmlinkage int arith_overflow(FPU_REG *dest);
18asmlinkage int arith_underflow(FPU_REG *dest);
19extern void FPU_stack_overflow(void);
20extern void FPU_stack_underflow(void);
21extern void FPU_stack_underflow_i(int i);
22extern void FPU_stack_underflow_pop(int i);
23/* fpu_arith.c */
24extern void fadd__(void);
25extern void fmul__(void);
26extern void fsub__(void);
27extern void fsubr_(void);
28extern void fdiv__(void);
29extern void fdivr_(void);
30extern void fadd_i(void);
31extern void fmul_i(void);
32extern void fsubri(void);
33extern void fsub_i(void);
34extern void fdivri(void);
35extern void fdiv_i(void);
36extern void faddp_(void);
37extern void fmulp_(void);
38extern void fsubrp(void);
39extern void fsubp_(void);
40extern void fdivrp(void);
41extern void fdivp_(void);
42/* fpu_aux.c */
43extern void finit(void);
44extern void finit_(void);
45extern void fstsw_(void);
46extern void fp_nop(void);
47extern void fld_i_(void);
48extern void fxch_i(void);
49extern void ffree_(void);
50extern void ffreep(void);
51extern void fst_i_(void);
52extern void fstp_i(void);
53/* fpu_entry.c */
54asmlinkage extern void math_emulate(long arg);
55extern void math_abort(struct info *info, unsigned int signal);
56/* fpu_etc.c */
57extern void FPU_etc(void);
58/* fpu_tags.c */
59extern int FPU_gettag0(void);
60extern int FPU_gettagi(int stnr);
61extern int FPU_gettag(int regnr);
62extern void FPU_settag0(int tag);
63extern void FPU_settagi(int stnr, int tag);
64extern void FPU_settag(int regnr, int tag);
65extern int FPU_Special(FPU_REG const *ptr);
66extern int isNaN(FPU_REG const *ptr);
67extern void FPU_pop(void);
68extern int FPU_empty_i(int stnr);
69extern int FPU_stackoverflow(FPU_REG **st_new_ptr);
70extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr);
71extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag);
72extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag);
73/* fpu_trig.c */
74extern void FPU_triga(void);
75extern void FPU_trigb(void);
76/* get_address.c */
77extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
78 struct address *addr, fpu_addr_modes addr_modes);
79extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
80 struct address *addr, fpu_addr_modes addr_modes);
81/* load_store.c */
82extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
83 void __user *data_address);
84/* poly_2xm1.c */
85extern int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result);
86/* poly_atan.c */
87extern void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, FPU_REG *st1_ptr,
88 u_char st1_tag);
89/* poly_l2.c */
90extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign);
91extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1,
92 FPU_REG *d);
93/* poly_sin.c */
94extern void poly_sine(FPU_REG *st0_ptr);
95extern void poly_cos(FPU_REG *st0_ptr);
96/* poly_tan.c */
97extern void poly_tan(FPU_REG *st0_ptr);
98/* reg_add_sub.c */
99extern int FPU_add(FPU_REG const *b, u_char tagb, int destrnr, int control_w);
100extern int FPU_sub(int flags, int rm, int control_w);
101/* reg_compare.c */
102extern int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag);
103extern void fcom_st(void);
104extern void fcompst(void);
105extern void fcompp(void);
106extern void fucom_(void);
107extern void fucomp(void);
108extern void fucompp(void);
109/* reg_constant.c */
110extern void fconst(void);
111/* reg_ld_str.c */
112extern int FPU_load_extended(long double __user *s, int stnr);
113extern int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data);
114extern int FPU_load_single(float __user *single, FPU_REG *loaded_data);
115extern int FPU_load_int64(long long __user *_s);
116extern int FPU_load_int32(long __user *_s, FPU_REG *loaded_data);
117extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data);
118extern int FPU_load_bcd(u_char __user *s);
119extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
120 long double __user *d);
121extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat);
122extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single);
123extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d);
124extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d);
125extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d);
126extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d);
127extern int FPU_round_to_int(FPU_REG *r, u_char tag);
128extern u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s);
129extern void frstor(fpu_addr_modes addr_modes, u_char __user *data_address);
130extern u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d);
131extern void fsave(fpu_addr_modes addr_modes, u_char __user *data_address);
132extern int FPU_tagof(FPU_REG *ptr);
133/* reg_mul.c */
134extern int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w);
135
136extern int FPU_div(int flags, int regrm, int control_w);
137/* reg_convert.c */
138extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x);
139#endif /* _FPU_PROTO_H */
140
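
The FPU_sub() and FPU_div() prototypes above take an int where an operand would be expected: callers either pass a stack register number, or set the LOADED flag and pass a pointer cast to int with the operand's tag in the low bits, as in FPU_sub(LOADED|loaded_tag, (int)&loaded_data, control_word) in fpu_entry.c. A self-contained sketch of that convention; DEMO_LOADED, the struct and the helper are stand-ins for the real fpu_emu.h definitions, and the int-to-pointer round trip assumes the emulator's 32-bit environment.

#define DEMO_LOADED 0x4000		/* stand-in for the real LOADED flag */

struct demo_reg { unsigned int sigl, sigh; short exp; };

/* Resolve the second operand of a two-operand instruction: either an
 * already-loaded value smuggled through 'rm', or stack register st(rm). */
static const struct demo_reg *demo_pick_operand(int flags, int rm,
						const struct demo_reg regs[8],
						const unsigned char tags[8],
						int top, int *tag)
{
	if (flags & DEMO_LOADED) {
		*tag = flags & 0x0f;	/* low bits carry the operand's tag */
		return (const struct demo_reg *)(unsigned long)rm;
	}
	*tag = tags[(top + rm) & 7];	/* ordinary st(rm) operand */
	return &regs[(top + rm) & 7];
}
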
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
new file mode 100644
index 000000000000..a3ae28c49ddd
--- /dev/null
+++ b/arch/x86/math-emu/fpu_system.h
@@ -0,0 +1,90 @@
1/*---------------------------------------------------------------------------+
2 | fpu_system.h |
3 | |
4 | Copyright (C) 1992,1994,1997 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@suburbia.net |
7 | |
8 +---------------------------------------------------------------------------*/
9
10#ifndef _FPU_SYSTEM_H
11#define _FPU_SYSTEM_H
12
13/* system dependent definitions */
14
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/mm.h>
18
19/* This sets the pointer FPU_info to point to the argument part
20 of the stack frame of math_emulate() */
21#define SETUP_DATA_AREA(arg) FPU_info = (struct info *) &arg
22
23/* s is always from a cpu register, and the cpu does bounds checking
24 * during register load --> no further bounds checks needed */
25#define LDT_DESCRIPTOR(s) (((struct desc_struct *)current->mm->context.ldt)[(s) >> 3])
26#define SEG_D_SIZE(x) ((x).b & (3 << 21))
27#define SEG_G_BIT(x) ((x).b & (1 << 23))
28#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1)
29#define SEG_286_MODE(x) ((x).b & ( 0xff000000 | 0xf0000 | (1 << 23)))
30#define SEG_BASE_ADDR(s) (((s).b & 0xff000000) \
31 | (((s).b & 0xff) << 16) | ((s).a >> 16))
32#define SEG_LIMIT(s) (((s).b & 0xff0000) | ((s).a & 0xffff))
33#define SEG_EXECUTE_ONLY(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 11))
34#define SEG_WRITE_PERM(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9))
35#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \
36 == (1 << 10))
37
38#define I387 (current->thread.i387)
39#define FPU_info (I387.soft.info)
40
41#define FPU_CS (*(unsigned short *) &(FPU_info->___cs))
42#define FPU_SS (*(unsigned short *) &(FPU_info->___ss))
43#define FPU_DS (*(unsigned short *) &(FPU_info->___ds))
44#define FPU_EAX (FPU_info->___eax)
45#define FPU_EFLAGS (FPU_info->___eflags)
46#define FPU_EIP (FPU_info->___eip)
47#define FPU_ORIG_EIP (FPU_info->___orig_eip)
48
49#define FPU_lookahead (I387.soft.lookahead)
50
51/* nz if ip_offset and cs_selector are not to be set for the current
52 instruction. */
53#define no_ip_update (*(u_char *)&(I387.soft.no_update))
54#define FPU_rm (*(u_char *)&(I387.soft.rm))
55
56/* Number of bytes of data which can be legally accessed by the current
57 instruction. This only needs to hold a number <= 108, so a byte will do. */
58#define access_limit (*(u_char *)&(I387.soft.alimit))
59
60#define partial_status (I387.soft.swd)
61#define control_word (I387.soft.cwd)
62#define fpu_tag_word (I387.soft.twd)
63#define registers (I387.soft.st_space)
64#define top (I387.soft.ftop)
65
66#define instruction_address (*(struct address *)&I387.soft.fip)
67#define operand_address (*(struct address *)&I387.soft.foo)
68
69#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \
70 math_abort(FPU_info,SIGSEGV)
71#define FPU_abort math_abort(FPU_info, SIGSEGV)
72
73#undef FPU_IGNORE_CODE_SEGV
74#ifdef FPU_IGNORE_CODE_SEGV
75/* access_ok() is very expensive, and causes the emulator to run
76 about 20% slower if applied to the code. Anyway, errors due to bad
77 code addresses should be much rarer than errors due to bad data
78 addresses. */
79#define FPU_code_access_ok(z)
80#else
81/* A simpler test than access_ok() can probably be done for
82 FPU_code_access_ok() because the only possible error is to step
83 past the upper boundary of a legal code area. */
84#define FPU_code_access_ok(z) FPU_access_ok(VERIFY_READ,(void __user *)FPU_EIP,z)
85#endif
86
87#define FPU_get_user(x,y) get_user((x),(y))
88#define FPU_put_user(x,y) put_user((x),(y))
89
90#endif
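
The SEG_* macros above pick the base and limit fields out of the two 32-bit words of an LDT descriptor. A self-contained restatement of the two extractions with stdint types; the struct and function names are illustrative, and granularity scaling (SEG_GRANULARITY) is not applied here.

#include <stdint.h>

struct demo_desc { uint32_t a, b; };	/* low and high descriptor dwords */

static uint32_t demo_seg_base(struct demo_desc d)
{
	/* base 31..24 and 23..16 live in b, base 15..0 in the top of a */
	return (d.b & 0xff000000) | ((d.b & 0xff) << 16) | (d.a >> 16);
}

static uint32_t demo_seg_limit(struct demo_desc d)
{
	/* limit 15..0 come from a; the macro keeps the whole byte 23..16
	 * of b, which holds limit 19..16 plus the descriptor flag bits */
	return (d.b & 0xff0000) | (d.a & 0xffff);
}
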
diff --git a/arch/x86/math-emu/fpu_tags.c b/arch/x86/math-emu/fpu_tags.c
new file mode 100644
index 000000000000..cb436fe20e4c
--- /dev/null
+++ b/arch/x86/math-emu/fpu_tags.c
@@ -0,0 +1,127 @@
1/*---------------------------------------------------------------------------+
2 | fpu_tags.c |
3 | |
4 | Set FPU register tags. |
5 | |
6 | Copyright (C) 1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@jacobi.maths.monash.edu.au |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_emu.h"
14#include "fpu_system.h"
15#include "exception.h"
16
17
18void FPU_pop(void)
19{
20 fpu_tag_word |= 3 << ((top & 7)*2);
21 top++;
22}
23
24
25int FPU_gettag0(void)
26{
27 return (fpu_tag_word >> ((top & 7)*2)) & 3;
28}
29
30
31int FPU_gettagi(int stnr)
32{
33 return (fpu_tag_word >> (((top+stnr) & 7)*2)) & 3;
34}
35
36
37int FPU_gettag(int regnr)
38{
39 return (fpu_tag_word >> ((regnr & 7)*2)) & 3;
40}
41
42
43void FPU_settag0(int tag)
44{
45 int regnr = top;
46 regnr &= 7;
47 fpu_tag_word &= ~(3 << (regnr*2));
48 fpu_tag_word |= (tag & 3) << (regnr*2);
49}
50
51
52void FPU_settagi(int stnr, int tag)
53{
54 int regnr = stnr+top;
55 regnr &= 7;
56 fpu_tag_word &= ~(3 << (regnr*2));
57 fpu_tag_word |= (tag & 3) << (regnr*2);
58}
59
60
61void FPU_settag(int regnr, int tag)
62{
63 regnr &= 7;
64 fpu_tag_word &= ~(3 << (regnr*2));
65 fpu_tag_word |= (tag & 3) << (regnr*2);
66}
67
68
69int FPU_Special(FPU_REG const *ptr)
70{
71 int exp = exponent(ptr);
72
73 if ( exp == EXP_BIAS+EXP_UNDER )
74 return TW_Denormal;
75 else if ( exp != EXP_BIAS+EXP_OVER )
76 return TW_NaN;
77 else if ( (ptr->sigh == 0x80000000) && (ptr->sigl == 0) )
78 return TW_Infinity;
79 return TW_NaN;
80}
81
82
83int isNaN(FPU_REG const *ptr)
84{
85 return ( (exponent(ptr) == EXP_BIAS+EXP_OVER)
86 && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)) );
87}
88
89
90int FPU_empty_i(int stnr)
91{
92 int regnr = (top+stnr) & 7;
93
94 return ((fpu_tag_word >> (regnr*2)) & 3) == TAG_Empty;
95}
96
97
98int FPU_stackoverflow(FPU_REG **st_new_ptr)
99{
100 *st_new_ptr = &st(-1);
101
102 return ((fpu_tag_word >> (((top - 1) & 7)*2)) & 3) != TAG_Empty;
103}
104
105
106void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr)
107{
108 reg_copy(r, &st(stnr));
109 FPU_settagi(stnr, tag);
110}
111
112void FPU_copy_to_reg1(FPU_REG const *r, u_char tag)
113{
114 reg_copy(r, &st(1));
115 FPU_settagi(1, tag);
116}
117
118void FPU_copy_to_reg0(FPU_REG const *r, u_char tag)
119{
120 int regnr = top;
121 regnr &= 7;
122
123 reg_copy(r, &st(0));
124
125 fpu_tag_word &= ~(3 << (regnr*2));
126 fpu_tag_word |= (tag & 3) << (regnr*2);
127}
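
The helpers above keep one two-bit tag per physical register and address registers relative to the current top of stack. A minimal standalone model of that bookkeeping, assuming the usual tag values (Valid = 0, Zero = 1, Special = 2, Empty = 3); the demo_* names are not part of the emulator.

static int demo_top;			/* models I387.soft.ftop */
static unsigned int demo_tag_word;	/* two bits per physical register */

static int demo_gettagi(int stnr)	/* tag of st(stnr) */
{
	return (demo_tag_word >> (((demo_top + stnr) & 7) * 2)) & 3;
}

static void demo_settagi(int stnr, int tag)
{
	int regnr = (demo_top + stnr) & 7;

	demo_tag_word &= ~(3 << (regnr * 2));
	demo_tag_word |= (tag & 3) << (regnr * 2);
}

static void demo_pop(void)		/* mark old st(0) empty, then bump top */
{
	demo_tag_word |= 3 << ((demo_top & 7) * 2);
	demo_top++;
}
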
diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c
new file mode 100644
index 000000000000..403cbde1d425
--- /dev/null
+++ b/arch/x86/math-emu/fpu_trig.c
@@ -0,0 +1,1845 @@
1/*---------------------------------------------------------------------------+
2 | fpu_trig.c |
3 | |
4 | Implementation of the FPU "transcendental" functions. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997,1999 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@melbpc.org.au |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_system.h"
14#include "exception.h"
15#include "fpu_emu.h"
16#include "status_w.h"
17#include "control_w.h"
18#include "reg_constant.h"
19
20static void rem_kernel(unsigned long long st0, unsigned long long *y,
21 unsigned long long st1,
22 unsigned long long q, int n);
23
24#define BETTER_THAN_486
25
26#define FCOS 4
27
28/* Used only by fptan, fsin, fcos, and fsincos. */
29/* This routine produces very accurate results, similar to
30 using a value of pi with more than 128 bits precision. */
31/* Limited measurements show no results worse than 64 bit precision
32 except for the results for arguments close to 2^63, where the
33 precision of the result sometimes degrades to about 63.9 bits */
34static int trig_arg(FPU_REG *st0_ptr, int even)
35{
36 FPU_REG tmp;
37 u_char tmptag;
38 unsigned long long q;
39 int old_cw = control_word, saved_status = partial_status;
40 int tag, st0_tag = TAG_Valid;
41
42 if ( exponent(st0_ptr) >= 63 )
43 {
44 partial_status |= SW_C2; /* Reduction incomplete. */
45 return -1;
46 }
47
48 control_word &= ~CW_RC;
49 control_word |= RC_CHOP;
50
51 setpositive(st0_ptr);
52 tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
53 SIGN_POS);
54
55 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow
56 to 2^64 */
57 q = significand(&tmp);
58 if ( q )
59 {
60 rem_kernel(significand(st0_ptr),
61 &significand(&tmp),
62 significand(&CONST_PI2),
63 q, exponent(st0_ptr) - exponent(&CONST_PI2));
64 setexponent16(&tmp, exponent(&CONST_PI2));
65 st0_tag = FPU_normalize(&tmp);
66 FPU_copy_to_reg0(&tmp, st0_tag);
67 }
68
69 if ( (even && !(q & 1)) || (!even && (q & 1)) )
70 {
71 st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, FULL_PRECISION);
72
73#ifdef BETTER_THAN_486
74 /* So far, the results are exact but based upon a 64 bit
75 precision approximation to pi/2. The technique used
76 now is equivalent to using an approximation to pi/2 which
77 is accurate to about 128 bits. */
78 if ( (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64) || (q > 1) )
79 {
80 /* This code gives the effect of having pi/2 to better than
81 128 bits precision. */
82
83 significand(&tmp) = q + 1;
84 setexponent16(&tmp, 63);
85 FPU_normalize(&tmp);
86 tmptag =
87 FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, SIGN_POS,
88 exponent(&CONST_PI2extra) + exponent(&tmp));
89 setsign(&tmp, getsign(&CONST_PI2extra));
90 st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION);
91 if ( signnegative(st0_ptr) )
92 {
93 /* CONST_PI2extra is negative, so the result of the addition
94 can be negative. This means that the argument is actually
95 in a different quadrant. The correction is always < pi/2,
96 so it can't overflow into yet another quadrant. */
97 setpositive(st0_ptr);
98 q++;
99 }
100 }
101#endif /* BETTER_THAN_486 */
102 }
103#ifdef BETTER_THAN_486
104 else
105 {
106 /* So far, the results are exact but based upon a 64 bit
107 precision approximation to pi/2. The technique used
108 now is equivalent to using an approximation to pi/2 which
109 is accurate to about 128 bits. */
110 if ( ((q > 0) && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64))
111 || (q > 1) )
112 {
	  113	    /* This code gives the effect of having pi/2 to better than
114 128 bits precision. */
115
116 significand(&tmp) = q;
117 setexponent16(&tmp, 63);
118 FPU_normalize(&tmp); /* This must return TAG_Valid */
119 tmptag = FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION,
120 SIGN_POS,
121 exponent(&CONST_PI2extra) + exponent(&tmp));
122 setsign(&tmp, getsign(&CONST_PI2extra));
123 st0_tag = FPU_sub(LOADED|(tmptag & 0x0f), (int)&tmp,
124 FULL_PRECISION);
125 if ( (exponent(st0_ptr) == exponent(&CONST_PI2)) &&
126 ((st0_ptr->sigh > CONST_PI2.sigh)
127 || ((st0_ptr->sigh == CONST_PI2.sigh)
128 && (st0_ptr->sigl > CONST_PI2.sigl))) )
129 {
130 /* CONST_PI2extra is negative, so the result of the
131 subtraction can be larger than pi/2. This means
132 that the argument is actually in a different quadrant.
133 The correction is always < pi/2, so it can't overflow
134 into yet another quadrant. */
135 st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2,
136 FULL_PRECISION);
137 q++;
138 }
139 }
140 }
141#endif /* BETTER_THAN_486 */
142
143 FPU_settag0(st0_tag);
144 control_word = old_cw;
145 partial_status = saved_status & ~SW_C2; /* Reduction complete. */
146
147 return (q & 3) | even;
148}
149
150
151/* Convert a long to register */
152static void convert_l2reg(long const *arg, int deststnr)
153{
154 int tag;
155 long num = *arg;
156 u_char sign;
157 FPU_REG *dest = &st(deststnr);
158
159 if (num == 0)
160 {
161 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
162 return;
163 }
164
165 if (num > 0)
166 { sign = SIGN_POS; }
167 else
168 { num = -num; sign = SIGN_NEG; }
169
170 dest->sigh = num;
171 dest->sigl = 0;
172 setexponent16(dest, 31);
173 tag = FPU_normalize(dest);
174 FPU_settagi(deststnr, tag);
175 setsign(dest, sign);
176 return;
177}
178
179
180static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag)
181{
182 if ( st0_tag == TAG_Empty )
183 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
184 else if ( st0_tag == TW_NaN )
185 real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */
186#ifdef PARANOID
187 else
188 EXCEPTION(EX_INTERNAL|0x0112);
189#endif /* PARANOID */
190}
191
192
193static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag)
194{
195 int isNaN;
196
197 switch ( st0_tag )
198 {
199 case TW_NaN:
200 isNaN = (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000);
201 if ( isNaN && !(st0_ptr->sigh & 0x40000000) ) /* Signaling ? */
202 {
203 EXCEPTION(EX_Invalid);
204 if ( control_word & CW_Invalid )
205 {
206 /* The masked response */
207 /* Convert to a QNaN */
208 st0_ptr->sigh |= 0x40000000;
209 push();
210 FPU_copy_to_reg0(st0_ptr, TAG_Special);
211 }
212 }
213 else if ( isNaN )
214 {
215 /* A QNaN */
216 push();
217 FPU_copy_to_reg0(st0_ptr, TAG_Special);
218 }
219 else
220 {
221 /* pseudoNaN or other unsupported */
222 EXCEPTION(EX_Invalid);
223 if ( control_word & CW_Invalid )
224 {
225 /* The masked response */
226 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
227 push();
228 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
229 }
230 }
231 break; /* return with a NaN in st(0) */
232#ifdef PARANOID
233 default:
234 EXCEPTION(EX_INTERNAL|0x0112);
235#endif /* PARANOID */
236 }
237}
238
239
240/*---------------------------------------------------------------------------*/
241
242static void f2xm1(FPU_REG *st0_ptr, u_char tag)
243{
244 FPU_REG a;
245
246 clear_C1();
247
248 if ( tag == TAG_Valid )
249 {
250 /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */
251 if ( exponent(st0_ptr) < 0 )
252 {
253 denormal_arg:
254
255 FPU_to_exp16(st0_ptr, &a);
256
257 /* poly_2xm1(x) requires 0 < st(0) < 1. */
258 poly_2xm1(getsign(st0_ptr), &a, st0_ptr);
259 }
260 set_precision_flag_up(); /* 80486 appears to always do this */
261 return;
262 }
263
264 if ( tag == TAG_Zero )
265 return;
266
267 if ( tag == TAG_Special )
268 tag = FPU_Special(st0_ptr);
269
270 switch ( tag )
271 {
272 case TW_Denormal:
273 if ( denormal_operand() < 0 )
274 return;
275 goto denormal_arg;
276 case TW_Infinity:
277 if ( signnegative(st0_ptr) )
278 {
279 /* -infinity gives -1 (p16-10) */
280 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
281 setnegative(st0_ptr);
282 }
283 return;
284 default:
285 single_arg_error(st0_ptr, tag);
286 }
287}
288
289
290static void fptan(FPU_REG *st0_ptr, u_char st0_tag)
291{
292 FPU_REG *st_new_ptr;
293 int q;
294 u_char arg_sign = getsign(st0_ptr);
295
296 /* Stack underflow has higher priority */
297 if ( st0_tag == TAG_Empty )
298 {
299 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
300 if ( control_word & CW_Invalid )
301 {
302 st_new_ptr = &st(-1);
303 push();
304 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */
305 }
306 return;
307 }
308
309 if ( STACK_OVERFLOW )
310 { FPU_stack_overflow(); return; }
311
312 if ( st0_tag == TAG_Valid )
313 {
314 if ( exponent(st0_ptr) > -40 )
315 {
316 if ( (q = trig_arg(st0_ptr, 0)) == -1 )
317 {
318 /* Operand is out of range */
319 return;
320 }
321
322 poly_tan(st0_ptr);
323 setsign(st0_ptr, (q & 1) ^ (arg_sign != 0));
324 set_precision_flag_up(); /* We do not really know if up or down */
325 }
326 else
327 {
328 /* For a small arg, the result == the argument */
329 /* Underflow may happen */
330
331 denormal_arg:
332
333 FPU_to_exp16(st0_ptr, st0_ptr);
334
335 st0_tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
336 FPU_settag0(st0_tag);
337 }
338 push();
339 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
340 return;
341 }
342
343 if ( st0_tag == TAG_Zero )
344 {
345 push();
346 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
347 setcc(0);
348 return;
349 }
350
351 if ( st0_tag == TAG_Special )
352 st0_tag = FPU_Special(st0_ptr);
353
354 if ( st0_tag == TW_Denormal )
355 {
356 if ( denormal_operand() < 0 )
357 return;
358
359 goto denormal_arg;
360 }
361
362 if ( st0_tag == TW_Infinity )
363 {
364 /* The 80486 treats infinity as an invalid operand */
365 if ( arith_invalid(0) >= 0 )
366 {
367 st_new_ptr = &st(-1);
368 push();
369 arith_invalid(0);
370 }
371 return;
372 }
373
374 single_arg_2_error(st0_ptr, st0_tag);
375}
376
377
378static void fxtract(FPU_REG *st0_ptr, u_char st0_tag)
379{
380 FPU_REG *st_new_ptr;
381 u_char sign;
382 register FPU_REG *st1_ptr = st0_ptr; /* anticipate */
383
384 if ( STACK_OVERFLOW )
385 { FPU_stack_overflow(); return; }
386
387 clear_C1();
388
389 if ( st0_tag == TAG_Valid )
390 {
391 long e;
392
393 push();
394 sign = getsign(st1_ptr);
395 reg_copy(st1_ptr, st_new_ptr);
396 setexponent16(st_new_ptr, exponent(st_new_ptr));
397
398 denormal_arg:
399
400 e = exponent16(st_new_ptr);
401 convert_l2reg(&e, 1);
402 setexponentpos(st_new_ptr, 0);
403 setsign(st_new_ptr, sign);
404 FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */
405 return;
406 }
407 else if ( st0_tag == TAG_Zero )
408 {
409 sign = getsign(st0_ptr);
410
411 if ( FPU_divide_by_zero(0, SIGN_NEG) < 0 )
412 return;
413
414 push();
415 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
416 setsign(st_new_ptr, sign);
417 return;
418 }
419
420 if ( st0_tag == TAG_Special )
421 st0_tag = FPU_Special(st0_ptr);
422
423 if ( st0_tag == TW_Denormal )
424 {
425 if (denormal_operand() < 0 )
426 return;
427
428 push();
429 sign = getsign(st1_ptr);
430 FPU_to_exp16(st1_ptr, st_new_ptr);
431 goto denormal_arg;
432 }
433 else if ( st0_tag == TW_Infinity )
434 {
435 sign = getsign(st0_ptr);
436 setpositive(st0_ptr);
437 push();
438 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
439 setsign(st_new_ptr, sign);
440 return;
441 }
442 else if ( st0_tag == TW_NaN )
443 {
444 if ( real_1op_NaN(st0_ptr) < 0 )
445 return;
446
447 push();
448 FPU_copy_to_reg0(st0_ptr, TAG_Special);
449 return;
450 }
451 else if ( st0_tag == TAG_Empty )
452 {
453 /* Is this the correct behaviour? */
454 if ( control_word & EX_Invalid )
455 {
456 FPU_stack_underflow();
457 push();
458 FPU_stack_underflow();
459 }
460 else
461 EXCEPTION(EX_StackUnder);
462 }
463#ifdef PARANOID
464 else
465 EXCEPTION(EX_INTERNAL | 0x119);
466#endif /* PARANOID */
467}
468
469
470static void fdecstp(void)
471{
472 clear_C1();
473 top--;
474}
475
476static void fincstp(void)
477{
478 clear_C1();
479 top++;
480}
481
482
483static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag)
484{
485 int expon;
486
487 clear_C1();
488
489 if ( st0_tag == TAG_Valid )
490 {
491 u_char tag;
492
493 if (signnegative(st0_ptr))
494 {
495 arith_invalid(0); /* sqrt(negative) is invalid */
496 return;
497 }
498
499 /* make st(0) in [1.0 .. 4.0) */
500 expon = exponent(st0_ptr);
501
502 denormal_arg:
503
504 setexponent16(st0_ptr, (expon & 1));
505
506 /* Do the computation, the sign of the result will be positive. */
507 tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS);
508 addexponent(st0_ptr, expon >> 1);
509 FPU_settag0(tag);
510 return;
511 }
512
513 if ( st0_tag == TAG_Zero )
514 return;
515
516 if ( st0_tag == TAG_Special )
517 st0_tag = FPU_Special(st0_ptr);
518
519 if ( st0_tag == TW_Infinity )
520 {
521 if ( signnegative(st0_ptr) )
522 arith_invalid(0); /* sqrt(-Infinity) is invalid */
523 return;
524 }
525 else if ( st0_tag == TW_Denormal )
526 {
527 if (signnegative(st0_ptr))
528 {
529 arith_invalid(0); /* sqrt(negative) is invalid */
530 return;
531 }
532
533 if ( denormal_operand() < 0 )
534 return;
535
536 FPU_to_exp16(st0_ptr, st0_ptr);
537
538 expon = exponent16(st0_ptr);
539
540 goto denormal_arg;
541 }
542
543 single_arg_error(st0_ptr, st0_tag);
544
545}
546
547
548static void frndint_(FPU_REG *st0_ptr, u_char st0_tag)
549{
550 int flags, tag;
551
552 if ( st0_tag == TAG_Valid )
553 {
554 u_char sign;
555
556 denormal_arg:
557
558 sign = getsign(st0_ptr);
559
560 if (exponent(st0_ptr) > 63)
561 return;
562
563 if ( st0_tag == TW_Denormal )
564 {
565 if (denormal_operand() < 0 )
566 return;
567 }
568
569 /* Fortunately, this can't overflow to 2^64 */
570 if ( (flags = FPU_round_to_int(st0_ptr, st0_tag)) )
571 set_precision_flag(flags);
572
573 setexponent16(st0_ptr, 63);
574 tag = FPU_normalize(st0_ptr);
575 setsign(st0_ptr, sign);
576 FPU_settag0(tag);
577 return;
578 }
579
580 if ( st0_tag == TAG_Zero )
581 return;
582
583 if ( st0_tag == TAG_Special )
584 st0_tag = FPU_Special(st0_ptr);
585
586 if ( st0_tag == TW_Denormal )
587 goto denormal_arg;
588 else if ( st0_tag == TW_Infinity )
589 return;
590 else
591 single_arg_error(st0_ptr, st0_tag);
592}
593
594
595static int fsin(FPU_REG *st0_ptr, u_char tag)
596{
597 u_char arg_sign = getsign(st0_ptr);
598
599 if ( tag == TAG_Valid )
600 {
601 int q;
602
603 if ( exponent(st0_ptr) > -40 )
604 {
605 if ( (q = trig_arg(st0_ptr, 0)) == -1 )
606 {
607 /* Operand is out of range */
608 return 1;
609 }
610
611 poly_sine(st0_ptr);
612
613 if (q & 2)
614 changesign(st0_ptr);
615
616 setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign);
617
618 /* We do not really know if up or down */
619 set_precision_flag_up();
620 return 0;
621 }
622 else
623 {
624 /* For a small arg, the result == the argument */
625 set_precision_flag_up(); /* Must be up. */
626 return 0;
627 }
628 }
629
630 if ( tag == TAG_Zero )
631 {
632 setcc(0);
633 return 0;
634 }
635
636 if ( tag == TAG_Special )
637 tag = FPU_Special(st0_ptr);
638
639 if ( tag == TW_Denormal )
640 {
641 if ( denormal_operand() < 0 )
642 return 1;
643
644 /* For a small arg, the result == the argument */
645 /* Underflow may happen */
646 FPU_to_exp16(st0_ptr, st0_ptr);
647
648 tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
649
650 FPU_settag0(tag);
651
652 return 0;
653 }
654 else if ( tag == TW_Infinity )
655 {
656 /* The 80486 treats infinity as an invalid operand */
657 arith_invalid(0);
658 return 1;
659 }
660 else
661 {
662 single_arg_error(st0_ptr, tag);
663 return 1;
664 }
665}
666
667
668static int f_cos(FPU_REG *st0_ptr, u_char tag)
669{
670 u_char st0_sign;
671
672 st0_sign = getsign(st0_ptr);
673
674 if ( tag == TAG_Valid )
675 {
676 int q;
677
678 if ( exponent(st0_ptr) > -40 )
679 {
680 if ( (exponent(st0_ptr) < 0)
681 || ((exponent(st0_ptr) == 0)
682 && (significand(st0_ptr) <= 0xc90fdaa22168c234LL)) )
683 {
684 poly_cos(st0_ptr);
685
686 /* We do not really know if up or down */
687 set_precision_flag_down();
688
689 return 0;
690 }
691 else if ( (q = trig_arg(st0_ptr, FCOS)) != -1 )
692 {
693 poly_sine(st0_ptr);
694
695 if ((q+1) & 2)
696 changesign(st0_ptr);
697
698 /* We do not really know if up or down */
699 set_precision_flag_down();
700
701 return 0;
702 }
703 else
704 {
705 /* Operand is out of range */
706 return 1;
707 }
708 }
709 else
710 {
711 denormal_arg:
712
713 setcc(0);
714 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
715#ifdef PECULIAR_486
716 set_precision_flag_down(); /* 80486 appears to do this. */
717#else
718 set_precision_flag_up(); /* Must be up. */
719#endif /* PECULIAR_486 */
720 return 0;
721 }
722 }
723 else if ( tag == TAG_Zero )
724 {
725 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
726 setcc(0);
727 return 0;
728 }
729
730 if ( tag == TAG_Special )
731 tag = FPU_Special(st0_ptr);
732
733 if ( tag == TW_Denormal )
734 {
735 if ( denormal_operand() < 0 )
736 return 1;
737
738 goto denormal_arg;
739 }
740 else if ( tag == TW_Infinity )
741 {
742 /* The 80486 treats infinity as an invalid operand */
743 arith_invalid(0);
744 return 1;
745 }
746 else
747 {
748 single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */
749 return 1;
750 }
751}
752
753
754static void fcos(FPU_REG *st0_ptr, u_char st0_tag)
755{
756 f_cos(st0_ptr, st0_tag);
757}
758
759
760static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
761{
762 FPU_REG *st_new_ptr;
763 FPU_REG arg;
764 u_char tag;
765
766 /* Stack underflow has higher priority */
767 if ( st0_tag == TAG_Empty )
768 {
769 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
770 if ( control_word & CW_Invalid )
771 {
772 st_new_ptr = &st(-1);
773 push();
774 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */
775 }
776 return;
777 }
778
779 if ( STACK_OVERFLOW )
780 { FPU_stack_overflow(); return; }
781
782 if ( st0_tag == TAG_Special )
783 tag = FPU_Special(st0_ptr);
784 else
785 tag = st0_tag;
786
787 if ( tag == TW_NaN )
788 {
789 single_arg_2_error(st0_ptr, TW_NaN);
790 return;
791 }
792 else if ( tag == TW_Infinity )
793 {
794 /* The 80486 treats infinity as an invalid operand */
795 if ( arith_invalid(0) >= 0 )
796 {
797 /* Masked response */
798 push();
799 arith_invalid(0);
800 }
801 return;
802 }
803
804 reg_copy(st0_ptr, &arg);
805 if ( !fsin(st0_ptr, st0_tag) )
806 {
807 push();
808 FPU_copy_to_reg0(&arg, st0_tag);
809 f_cos(&st(0), st0_tag);
810 }
811 else
812 {
813 /* An error, so restore st(0) */
814 FPU_copy_to_reg0(&arg, st0_tag);
815 }
816}
817
818
819/*---------------------------------------------------------------------------*/
820/* The following all require two arguments: st(0) and st(1) */
821
822/* A lean, mean kernel for the fprem instructions. This relies upon
823 the division and rounding to an integer in do_fprem giving an
824 exact result. Because of this, rem_kernel() needs to deal only with
825 the least significant 64 bits, the more significant bits of the
826 result must be zero.
827 */
828static void rem_kernel(unsigned long long st0, unsigned long long *y,
829 unsigned long long st1,
830 unsigned long long q, int n)
831{
832 int dummy;
833 unsigned long long x;
834
835 x = st0 << n;
836
837 /* Do the required multiplication and subtraction in the one operation */
838
839 /* lsw x -= lsw st1 * lsw q */
840 asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1"
841 :"=m" (((unsigned *)&x)[0]), "=m" (((unsigned *)&x)[1]),
842 "=a" (dummy)
843 :"2" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[0])
844 :"%dx");
845 /* msw x -= msw st1 * lsw q */
846 asm volatile ("mull %3; subl %%eax,%0"
847 :"=m" (((unsigned *)&x)[1]), "=a" (dummy)
848 :"1" (((unsigned *)&st1)[1]), "m" (((unsigned *)&q)[0])
849 :"%dx");
850 /* msw x -= lsw st1 * msw q */
851 asm volatile ("mull %3; subl %%eax,%0"
852 :"=m" (((unsigned *)&x)[1]), "=a" (dummy)
853 :"1" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[1])
854 :"%dx");
855
856 *y = x;
857}
858
859
860/* Remainder of st(0) / st(1) */
861/* This routine produces exact results, i.e. there is never any
862 rounding or truncation, etc of the result. */
863static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round)
864{
865 FPU_REG *st1_ptr = &st(1);
866 u_char st1_tag = FPU_gettagi(1);
867
868 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
869 {
870 FPU_REG tmp, st0, st1;
871 u_char st0_sign, st1_sign;
872 u_char tmptag;
873 int tag;
874 int old_cw;
875 int expdif;
876 long long q;
877 unsigned short saved_status;
878 int cc;
879
880 fprem_valid:
881 /* Convert registers for internal use. */
882 st0_sign = FPU_to_exp16(st0_ptr, &st0);
883 st1_sign = FPU_to_exp16(st1_ptr, &st1);
884 expdif = exponent16(&st0) - exponent16(&st1);
885
886 old_cw = control_word;
887 cc = 0;
888
889 /* We want the status following the denorm tests, but don't want
890 the status changed by the arithmetic operations. */
891 saved_status = partial_status;
892 control_word &= ~CW_RC;
893 control_word |= RC_CHOP;
894
895 if ( expdif < 64 )
896 {
897 /* This should be the most common case */
898
899 if ( expdif > -2 )
900 {
901 u_char sign = st0_sign ^ st1_sign;
902 tag = FPU_u_div(&st0, &st1, &tmp,
903 PR_64_BITS | RC_CHOP | 0x3f,
904 sign);
905 setsign(&tmp, sign);
906
907 if ( exponent(&tmp) >= 0 )
908 {
909 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't
910 overflow to 2^64 */
911 q = significand(&tmp);
912
913 rem_kernel(significand(&st0),
914 &significand(&tmp),
915 significand(&st1),
916 q, expdif);
917
918 setexponent16(&tmp, exponent16(&st1));
919 }
920 else
921 {
922 reg_copy(&st0, &tmp);
923 q = 0;
924 }
925
926 if ( (round == RC_RND) && (tmp.sigh & 0xc0000000) )
927 {
928 /* We may need to subtract st(1) once more,
929 to get a result <= 1/2 of st(1). */
930 unsigned long long x;
931 expdif = exponent16(&st1) - exponent16(&tmp);
932 if ( expdif <= 1 )
933 {
934 if ( expdif == 0 )
935 x = significand(&st1) - significand(&tmp);
936 else /* expdif is 1 */
937 x = (significand(&st1) << 1) - significand(&tmp);
938 if ( (x < significand(&tmp)) ||
939 /* or equi-distant (from 0 & st(1)) and q is odd */
940 ((x == significand(&tmp)) && (q & 1) ) )
941 {
942 st0_sign = ! st0_sign;
943 significand(&tmp) = x;
944 q++;
945 }
946 }
947 }
948
949 if (q & 4) cc |= SW_C0;
950 if (q & 2) cc |= SW_C3;
951 if (q & 1) cc |= SW_C1;
952 }
953 else
954 {
955 control_word = old_cw;
956 setcc(0);
957 return;
958 }
959 }
960 else
961 {
962 /* There is a large exponent difference ( >= 64 ) */
963 /* To make much sense, the code in this section should
964 be done at high precision. */
965 int exp_1, N;
966 u_char sign;
967
968 /* prevent overflow here */
969 /* N is 'a number between 32 and 63' (p26-113) */
970 reg_copy(&st0, &tmp);
971 tmptag = st0_tag;
972 N = (expdif & 0x0000001f) + 32; /* This choice gives results
973 identical to an AMD 486 */
974 setexponent16(&tmp, N);
975 exp_1 = exponent16(&st1);
976 setexponent16(&st1, 0);
977 expdif -= N;
978
979 sign = getsign(&tmp) ^ st1_sign;
980 tag = FPU_u_div(&tmp, &st1, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
981 sign);
982 setsign(&tmp, sign);
983
984 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't
985 overflow to 2^64 */
986
987 rem_kernel(significand(&st0),
988 &significand(&tmp),
989 significand(&st1),
990 significand(&tmp),
991 exponent(&tmp)
992 );
993 setexponent16(&tmp, exp_1 + expdif);
994
995 /* It is possible for the operation to be complete here.
996 What does the IEEE standard say? The Intel 80486 manual
997 implies that the operation will never be completed at this
998 point, and the behaviour of a real 80486 confirms this.
999 */
1000 if ( !(tmp.sigh | tmp.sigl) )
1001 {
1002 /* The result is zero */
1003 control_word = old_cw;
1004 partial_status = saved_status;
1005 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1006 setsign(&st0, st0_sign);
1007#ifdef PECULIAR_486
1008 setcc(SW_C2);
1009#else
1010 setcc(0);
1011#endif /* PECULIAR_486 */
1012 return;
1013 }
1014 cc = SW_C2;
1015 }
1016
1017 control_word = old_cw;
1018 partial_status = saved_status;
1019 tag = FPU_normalize_nuo(&tmp);
1020 reg_copy(&tmp, st0_ptr);
1021
1022 /* The only condition to be looked for is underflow,
1023 and it can occur here only if underflow is unmasked. */
1024 if ( (exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero)
1025 && !(control_word & CW_Underflow) )
1026 {
1027 setcc(cc);
1028 tag = arith_underflow(st0_ptr);
1029 setsign(st0_ptr, st0_sign);
1030 FPU_settag0(tag);
1031 return;
1032 }
1033 else if ( (exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero) )
1034 {
1035 stdexp(st0_ptr);
1036 setsign(st0_ptr, st0_sign);
1037 }
1038 else
1039 {
1040 tag = FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign);
1041 }
1042 FPU_settag0(tag);
1043 setcc(cc);
1044
1045 return;
1046 }
1047
1048 if ( st0_tag == TAG_Special )
1049 st0_tag = FPU_Special(st0_ptr);
1050 if ( st1_tag == TAG_Special )
1051 st1_tag = FPU_Special(st1_ptr);
1052
1053 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1054 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1055 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) )
1056 {
1057 if ( denormal_operand() < 0 )
1058 return;
1059 goto fprem_valid;
1060 }
1061 else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
1062 {
1063 FPU_stack_underflow();
1064 return;
1065 }
1066 else if ( st0_tag == TAG_Zero )
1067 {
1068 if ( st1_tag == TAG_Valid )
1069 {
1070 setcc(0); return;
1071 }
1072 else if ( st1_tag == TW_Denormal )
1073 {
1074 if ( denormal_operand() < 0 )
1075 return;
1076 setcc(0); return;
1077 }
1078 else if ( st1_tag == TAG_Zero )
1079 { arith_invalid(0); return; } /* fprem(?,0) always invalid */
1080 else if ( st1_tag == TW_Infinity )
1081 { setcc(0); return; }
1082 }
1083 else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
1084 {
1085 if ( st1_tag == TAG_Zero )
1086 {
1087 arith_invalid(0); /* fprem(Valid,Zero) is invalid */
1088 return;
1089 }
1090 else if ( st1_tag != TW_NaN )
1091 {
1092 if ( ((st0_tag == TW_Denormal) || (st1_tag == TW_Denormal))
1093 && (denormal_operand() < 0) )
1094 return;
1095
1096 if ( st1_tag == TW_Infinity )
1097 {
1098 /* fprem(Valid,Infinity) is o.k. */
1099 setcc(0); return;
1100 }
1101 }
1102 }
1103 else if ( st0_tag == TW_Infinity )
1104 {
1105 if ( st1_tag != TW_NaN )
1106 {
1107 arith_invalid(0); /* fprem(Infinity,?) is invalid */
1108 return;
1109 }
1110 }
1111
1112 /* One of the registers must contain a NaN if we got here. */
1113
1114#ifdef PARANOID
1115 if ( (st0_tag != TW_NaN) && (st1_tag != TW_NaN) )
1116 EXCEPTION(EX_INTERNAL | 0x118);
1117#endif /* PARANOID */
1118
1119 real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr);
1120
1121}
1122
1123
	 1124/* ST(1) <- ST(1) * log2 ST(0); pop ST */
1125static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag)
1126{
1127 FPU_REG *st1_ptr = &st(1), exponent;
1128 u_char st1_tag = FPU_gettagi(1);
1129 u_char sign;
1130 int e, tag;
1131
1132 clear_C1();
1133
1134 if ( (st0_tag == TAG_Valid) && (st1_tag == TAG_Valid) )
1135 {
1136 both_valid:
1137 /* Both regs are Valid or Denormal */
1138 if ( signpositive(st0_ptr) )
1139 {
1140 if ( st0_tag == TW_Denormal )
1141 FPU_to_exp16(st0_ptr, st0_ptr);
1142 else
1143 /* Convert st(0) for internal use. */
1144 setexponent16(st0_ptr, exponent(st0_ptr));
1145
1146 if ( (st0_ptr->sigh == 0x80000000) && (st0_ptr->sigl == 0) )
1147 {
1148 /* Special case. The result can be precise. */
1149 u_char esign;
1150 e = exponent16(st0_ptr);
1151 if ( e >= 0 )
1152 {
1153 exponent.sigh = e;
1154 esign = SIGN_POS;
1155 }
1156 else
1157 {
1158 exponent.sigh = -e;
1159 esign = SIGN_NEG;
1160 }
1161 exponent.sigl = 0;
1162 setexponent16(&exponent, 31);
1163 tag = FPU_normalize_nuo(&exponent);
1164 stdexp(&exponent);
1165 setsign(&exponent, esign);
1166 tag = FPU_mul(&exponent, tag, 1, FULL_PRECISION);
1167 if ( tag >= 0 )
1168 FPU_settagi(1, tag);
1169 }
1170 else
1171 {
1172 /* The usual case */
1173 sign = getsign(st1_ptr);
1174 if ( st1_tag == TW_Denormal )
1175 FPU_to_exp16(st1_ptr, st1_ptr);
1176 else
1177 /* Convert st(1) for internal use. */
1178 setexponent16(st1_ptr, exponent(st1_ptr));
1179 poly_l2(st0_ptr, st1_ptr, sign);
1180 }
1181 }
1182 else
1183 {
1184 /* negative */
1185 if ( arith_invalid(1) < 0 )
1186 return;
1187 }
1188
1189 FPU_pop();
1190
1191 return;
1192 }
1193
1194 if ( st0_tag == TAG_Special )
1195 st0_tag = FPU_Special(st0_ptr);
1196 if ( st1_tag == TAG_Special )
1197 st1_tag = FPU_Special(st1_ptr);
1198
1199 if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
1200 {
1201 FPU_stack_underflow_pop(1);
1202 return;
1203 }
1204 else if ( (st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal) )
1205 {
1206 if ( st0_tag == TAG_Zero )
1207 {
1208 if ( st1_tag == TAG_Zero )
1209 {
1210 /* Both args zero is invalid */
1211 if ( arith_invalid(1) < 0 )
1212 return;
1213 }
1214 else
1215 {
1216 u_char sign;
1217 sign = getsign(st1_ptr)^SIGN_NEG;
1218 if ( FPU_divide_by_zero(1, sign) < 0 )
1219 return;
1220
1221 setsign(st1_ptr, sign);
1222 }
1223 }
1224 else if ( st1_tag == TAG_Zero )
1225 {
1226 /* st(1) contains zero, st(0) valid <> 0 */
1227 /* Zero is the valid answer */
1228 sign = getsign(st1_ptr);
1229
1230 if ( signnegative(st0_ptr) )
1231 {
1232 /* log(negative) */
1233 if ( arith_invalid(1) < 0 )
1234 return;
1235 }
1236 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1237 return;
1238 else
1239 {
1240 if ( exponent(st0_ptr) < 0 )
1241 sign ^= SIGN_NEG;
1242
1243 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1244 setsign(st1_ptr, sign);
1245 }
1246 }
1247 else
1248 {
1249 /* One or both operands are denormals. */
1250 if ( denormal_operand() < 0 )
1251 return;
1252 goto both_valid;
1253 }
1254 }
1255 else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) )
1256 {
1257 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1258 return;
1259 }
1260 /* One or both arg must be an infinity */
1261 else if ( st0_tag == TW_Infinity )
1262 {
1263 if ( (signnegative(st0_ptr)) || (st1_tag == TAG_Zero) )
1264 {
1265 /* log(-infinity) or 0*log(infinity) */
1266 if ( arith_invalid(1) < 0 )
1267 return;
1268 }
1269 else
1270 {
1271 u_char sign = getsign(st1_ptr);
1272
1273 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1274 return;
1275
1276 FPU_copy_to_reg1(&CONST_INF, TAG_Special);
1277 setsign(st1_ptr, sign);
1278 }
1279 }
1280 /* st(1) must be infinity here */
1281 else if ( ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal))
1282 && ( signpositive(st0_ptr) ) )
1283 {
1284 if ( exponent(st0_ptr) >= 0 )
1285 {
1286 if ( (exponent(st0_ptr) == 0) &&
1287 (st0_ptr->sigh == 0x80000000) &&
1288 (st0_ptr->sigl == 0) )
1289 {
1290 /* st(0) holds 1.0 */
1291 /* infinity*log(1) */
1292 if ( arith_invalid(1) < 0 )
1293 return;
1294 }
1295 /* else st(0) is positive and > 1.0 */
1296 }
1297 else
1298 {
1299 /* st(0) is positive and < 1.0 */
1300
1301 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1302 return;
1303
1304 changesign(st1_ptr);
1305 }
1306 }
1307 else
1308 {
1309 /* st(0) must be zero or negative */
1310 if ( st0_tag == TAG_Zero )
1311 {
1312 /* This should be invalid, but a real 80486 is happy with it. */
1313
1314#ifndef PECULIAR_486
1315 sign = getsign(st1_ptr);
1316 if ( FPU_divide_by_zero(1, sign) < 0 )
1317 return;
1318#endif /* PECULIAR_486 */
1319
1320 changesign(st1_ptr);
1321 }
1322 else if ( arith_invalid(1) < 0 ) /* log(negative) */
1323 return;
1324 }
1325
1326 FPU_pop();
1327}
1328
1329
1330static void fpatan(FPU_REG *st0_ptr, u_char st0_tag)
1331{
1332 FPU_REG *st1_ptr = &st(1);
1333 u_char st1_tag = FPU_gettagi(1);
1334 int tag;
1335
1336 clear_C1();
1337 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
1338 {
1339 valid_atan:
1340
1341 poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag);
1342
1343 FPU_pop();
1344
1345 return;
1346 }
1347
1348 if ( st0_tag == TAG_Special )
1349 st0_tag = FPU_Special(st0_ptr);
1350 if ( st1_tag == TAG_Special )
1351 st1_tag = FPU_Special(st1_ptr);
1352
1353 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1354 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1355 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) )
1356 {
1357 if ( denormal_operand() < 0 )
1358 return;
1359
1360 goto valid_atan;
1361 }
1362 else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
1363 {
1364 FPU_stack_underflow_pop(1);
1365 return;
1366 }
1367 else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) )
1368 {
1369 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0 )
1370 FPU_pop();
1371 return;
1372 }
1373 else if ( (st0_tag == TW_Infinity) || (st1_tag == TW_Infinity) )
1374 {
1375 u_char sign = getsign(st1_ptr);
1376 if ( st0_tag == TW_Infinity )
1377 {
1378 if ( st1_tag == TW_Infinity )
1379 {
1380 if ( signpositive(st0_ptr) )
1381 {
1382 FPU_copy_to_reg1(&CONST_PI4, TAG_Valid);
1383 }
1384 else
1385 {
1386 setpositive(st1_ptr);
1387 tag = FPU_u_add(&CONST_PI4, &CONST_PI2, st1_ptr,
1388 FULL_PRECISION, SIGN_POS,
1389 exponent(&CONST_PI4), exponent(&CONST_PI2));
1390 if ( tag >= 0 )
1391 FPU_settagi(1, tag);
1392 }
1393 }
1394 else
1395 {
1396 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1397 return;
1398
1399 if ( signpositive(st0_ptr) )
1400 {
1401 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1402 setsign(st1_ptr, sign); /* An 80486 preserves the sign */
1403 FPU_pop();
1404 return;
1405 }
1406 else
1407 {
1408 FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
1409 }
1410 }
1411 }
1412 else
1413 {
1414 /* st(1) is infinity, st(0) not infinity */
1415 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1416 return;
1417
1418 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
1419 }
1420 setsign(st1_ptr, sign);
1421 }
1422 else if ( st1_tag == TAG_Zero )
1423 {
1424 /* st(0) must be valid or zero */
1425 u_char sign = getsign(st1_ptr);
1426
1427 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1428 return;
1429
1430 if ( signpositive(st0_ptr) )
1431 {
1432 /* An 80486 preserves the sign */
1433 FPU_pop();
1434 return;
1435 }
1436
1437 FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
1438 setsign(st1_ptr, sign);
1439 }
1440 else if ( st0_tag == TAG_Zero )
1441 {
1442 /* st(1) must be TAG_Valid here */
1443 u_char sign = getsign(st1_ptr);
1444
1445 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1446 return;
1447
1448 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
1449 setsign(st1_ptr, sign);
1450 }
1451#ifdef PARANOID
1452 else
1453 EXCEPTION(EX_INTERNAL | 0x125);
1454#endif /* PARANOID */
1455
1456 FPU_pop();
1457 set_precision_flag_up(); /* We do not really know if up or down */
1458}
1459
1460
1461static void fprem(FPU_REG *st0_ptr, u_char st0_tag)
1462{
1463 do_fprem(st0_ptr, st0_tag, RC_CHOP);
1464}
1465
1466
1467static void fprem1(FPU_REG *st0_ptr, u_char st0_tag)
1468{
1469 do_fprem(st0_ptr, st0_tag, RC_RND);
1470}
1471
1472
1473static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag)
1474{
1475 u_char sign, sign1;
1476 FPU_REG *st1_ptr = &st(1), a, b;
1477 u_char st1_tag = FPU_gettagi(1);
1478
1479 clear_C1();
1480 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
1481 {
1482 valid_yl2xp1:
1483
1484 sign = getsign(st0_ptr);
1485 sign1 = getsign(st1_ptr);
1486
1487 FPU_to_exp16(st0_ptr, &a);
1488 FPU_to_exp16(st1_ptr, &b);
1489
1490 if ( poly_l2p1(sign, sign1, &a, &b, st1_ptr) )
1491 return;
1492
1493 FPU_pop();
1494 return;
1495 }
1496
1497 if ( st0_tag == TAG_Special )
1498 st0_tag = FPU_Special(st0_ptr);
1499 if ( st1_tag == TAG_Special )
1500 st1_tag = FPU_Special(st1_ptr);
1501
1502 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1503 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1504 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) )
1505 {
1506 if ( denormal_operand() < 0 )
1507 return;
1508
1509 goto valid_yl2xp1;
1510 }
1511 else if ( (st0_tag == TAG_Empty) | (st1_tag == TAG_Empty) )
1512 {
1513 FPU_stack_underflow_pop(1);
1514 return;
1515 }
1516 else if ( st0_tag == TAG_Zero )
1517 {
1518 switch ( st1_tag )
1519 {
1520 case TW_Denormal:
1521 if ( denormal_operand() < 0 )
1522 return;
1523
1524 case TAG_Zero:
1525 case TAG_Valid:
1526 setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr));
1527 FPU_copy_to_reg1(st0_ptr, st0_tag);
1528 break;
1529
1530 case TW_Infinity:
1531 /* Infinity*log(1) */
1532 if ( arith_invalid(1) < 0 )
1533 return;
1534 break;
1535
1536 case TW_NaN:
1537 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1538 return;
1539 break;
1540
1541 default:
1542#ifdef PARANOID
1543 EXCEPTION(EX_INTERNAL | 0x116);
1544 return;
1545#endif /* PARANOID */
1546 break;
1547 }
1548 }
1549 else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
1550 {
1551 switch ( st1_tag )
1552 {
1553 case TAG_Zero:
1554 if ( signnegative(st0_ptr) )
1555 {
1556 if ( exponent(st0_ptr) >= 0 )
1557 {
1558 /* st(0) holds <= -1.0 */
1559#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1560 changesign(st1_ptr);
1561#else
1562 if ( arith_invalid(1) < 0 )
1563 return;
1564#endif /* PECULIAR_486 */
1565 }
1566 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1567 return;
1568 else
1569 changesign(st1_ptr);
1570 }
1571 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1572 return;
1573 break;
1574
1575 case TW_Infinity:
1576 if ( signnegative(st0_ptr) )
1577 {
1578 if ( (exponent(st0_ptr) >= 0) &&
1579 !((st0_ptr->sigh == 0x80000000) &&
1580 (st0_ptr->sigl == 0)) )
1581 {
1582 /* st(0) holds < -1.0 */
1583#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1584 changesign(st1_ptr);
1585#else
1586 if ( arith_invalid(1) < 0 ) return;
1587#endif /* PECULIAR_486 */
1588 }
1589 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1590 return;
1591 else
1592 changesign(st1_ptr);
1593 }
1594 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1595 return;
1596 break;
1597
1598 case TW_NaN:
1599 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1600 return;
1601 }
1602
1603 }
1604 else if ( st0_tag == TW_NaN )
1605 {
1606 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1607 return;
1608 }
1609 else if ( st0_tag == TW_Infinity )
1610 {
1611 if ( st1_tag == TW_NaN )
1612 {
1613 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1614 return;
1615 }
1616 else if ( signnegative(st0_ptr) )
1617 {
1618#ifndef PECULIAR_486
1619 /* This should have higher priority than denormals, but... */
1620 if ( arith_invalid(1) < 0 ) /* log(-infinity) */
1621 return;
1622#endif /* PECULIAR_486 */
1623 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1624 return;
1625#ifdef PECULIAR_486
1626 /* Denormal operands actually get higher priority */
1627 if ( arith_invalid(1) < 0 ) /* log(-infinity) */
1628 return;
1629#endif /* PECULIAR_486 */
1630 }
1631 else if ( st1_tag == TAG_Zero )
1632 {
1633 /* log(infinity) */
1634 if ( arith_invalid(1) < 0 )
1635 return;
1636 }
1637
1638 /* st(1) must be valid here. */
1639
1640 else if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1641 return;
1642
1643 /* The Manual says that log(Infinity) is invalid, but a real
1644 80486 sensibly says that it is o.k. */
1645 else
1646 {
1647 u_char sign = getsign(st1_ptr);
1648 FPU_copy_to_reg1(&CONST_INF, TAG_Special);
1649 setsign(st1_ptr, sign);
1650 }
1651 }
1652#ifdef PARANOID
1653 else
1654 {
1655 EXCEPTION(EX_INTERNAL | 0x117);
1656 return;
1657 }
1658#endif /* PARANOID */
1659
1660 FPU_pop();
1661 return;
1662
1663}
1664
1665
1666static void fscale(FPU_REG *st0_ptr, u_char st0_tag)
1667{
1668 FPU_REG *st1_ptr = &st(1);
1669 u_char st1_tag = FPU_gettagi(1);
1670 int old_cw = control_word;
1671 u_char sign = getsign(st0_ptr);
1672
1673 clear_C1();
1674 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
1675 {
1676 long scale;
1677 FPU_REG tmp;
1678
1679 /* Convert register for internal use. */
1680 setexponent16(st0_ptr, exponent(st0_ptr));
1681
1682 valid_scale:
1683
1684 if ( exponent(st1_ptr) > 30 )
1685 {
1686 /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */
1687
1688 if ( signpositive(st1_ptr) )
1689 {
1690 EXCEPTION(EX_Overflow);
1691 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
1692 }
1693 else
1694 {
1695 EXCEPTION(EX_Underflow);
1696 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1697 }
1698 setsign(st0_ptr, sign);
1699 return;
1700 }
1701
1702 control_word &= ~CW_RC;
1703 control_word |= RC_CHOP;
1704 reg_copy(st1_ptr, &tmp);
1705 FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */
1706 control_word = old_cw;
1707 scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl;
1708 scale += exponent16(st0_ptr);
1709
1710 setexponent16(st0_ptr, scale);
1711
1712 /* Use FPU_round() to properly detect under/overflow etc */
1713 FPU_round(st0_ptr, 0, 0, control_word, sign);
1714
1715 return;
1716 }
1717
1718 if ( st0_tag == TAG_Special )
1719 st0_tag = FPU_Special(st0_ptr);
1720 if ( st1_tag == TAG_Special )
1721 st1_tag = FPU_Special(st1_ptr);
1722
1723 if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
1724 {
1725 switch ( st1_tag )
1726 {
1727 case TAG_Valid:
1728 /* st(0) must be a denormal */
1729 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1730 return;
1731
1732 FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */
1733 goto valid_scale;
1734
1735 case TAG_Zero:
1736 if ( st0_tag == TW_Denormal )
1737 denormal_operand();
1738 return;
1739
1740 case TW_Denormal:
1741 denormal_operand();
1742 return;
1743
1744 case TW_Infinity:
1745 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1746 return;
1747
1748 if ( signpositive(st1_ptr) )
1749 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
1750 else
1751 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1752 setsign(st0_ptr, sign);
1753 return;
1754
1755 case TW_NaN:
1756 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1757 return;
1758 }
1759 }
1760 else if ( st0_tag == TAG_Zero )
1761 {
1762 switch ( st1_tag )
1763 {
1764 case TAG_Valid:
1765 case TAG_Zero:
1766 return;
1767
1768 case TW_Denormal:
1769 denormal_operand();
1770 return;
1771
1772 case TW_Infinity:
1773 if ( signpositive(st1_ptr) )
1774 arith_invalid(0); /* Zero scaled by +Infinity */
1775 return;
1776
1777 case TW_NaN:
1778 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1779 return;
1780 }
1781 }
1782 else if ( st0_tag == TW_Infinity )
1783 {
1784 switch ( st1_tag )
1785 {
1786 case TAG_Valid:
1787 case TAG_Zero:
1788 return;
1789
1790 case TW_Denormal:
1791 denormal_operand();
1792 return;
1793
1794 case TW_Infinity:
1795 if ( signnegative(st1_ptr) )
1796 arith_invalid(0); /* Infinity scaled by -Infinity */
1797 return;
1798
1799 case TW_NaN:
1800 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1801 return;
1802 }
1803 }
1804 else if ( st0_tag == TW_NaN )
1805 {
1806 if ( st1_tag != TAG_Empty )
1807 { real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); return; }
1808 }
1809
1810#ifdef PARANOID
1811 if ( !((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) )
1812 {
1813 EXCEPTION(EX_INTERNAL | 0x115);
1814 return;
1815 }
1816#endif
1817
1818 /* At least one of st(0), st(1) must be empty */
1819 FPU_stack_underflow();
1820
1821}
1822
1823
1824/*---------------------------------------------------------------------------*/
1825
1826static FUNC_ST0 const trig_table_a[] = {
1827 f2xm1, fyl2x, fptan, fpatan,
1828 fxtract, fprem1, (FUNC_ST0)fdecstp, (FUNC_ST0)fincstp
1829};
1830
1831void FPU_triga(void)
1832{
1833 (trig_table_a[FPU_rm])(&st(0), FPU_gettag0());
1834}
1835
1836
1837static FUNC_ST0 const trig_table_b[] =
1838 {
1839 fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0)fsin, fcos
1840 };
1841
1842void FPU_trigb(void)
1843{
1844 (trig_table_b[FPU_rm])(&st(0), FPU_gettag0());
1845}
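
For reference, the two 8-entry tables above dispatch on FPU_rm, the low three bits of the instruction's ModR/M byte, so each opcode in the group selects one handler. A minimal, self-contained C sketch of the same table-dispatch pattern; the names and handlers below are illustrative only and are not part of the kernel source:

#include <stdio.h>

typedef void (*op_fn)(int reg);            /* stand-in for FUNC_ST0 */

static void demo_f2xm1(int reg)  { printf("f2xm1  st(%d)\n", reg); }
static void demo_fyl2x(int reg)  { printf("fyl2x  st(%d)\n", reg); }
static void demo_fptan(int reg)  { printf("fptan  st(%d)\n", reg); }
static void demo_fpatan(int reg) { printf("fpatan st(%d)\n", reg); }

static op_fn const demo_table[8] = {
	demo_f2xm1, demo_fyl2x, demo_fptan, demo_fpatan,
	demo_f2xm1, demo_fyl2x, demo_fptan, demo_fpatan
};

static void demo_dispatch(unsigned char modrm)
{
	unsigned rm = modrm & 7;           /* low three bits pick the handler */
	demo_table[rm](0);
}

int main(void)
{
	demo_dispatch(0xf1);               /* rm == 1 -> demo_fyl2x */
	return 0;
}
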
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
new file mode 100644
index 000000000000..2e2c51a8bd3a
--- /dev/null
+++ b/arch/x86/math-emu/get_address.c
@@ -0,0 +1,438 @@
1/*---------------------------------------------------------------------------+
2 | get_address.c |
3 | |
4 | Get the effective address from an FPU instruction. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13/*---------------------------------------------------------------------------+
14 | Note: |
15 | The file contains code which accesses user memory. |
16 | Emulator static data may change when user memory is accessed, due to |
17 | other processes using the emulator while swapping is in progress. |
18 +---------------------------------------------------------------------------*/
19
20
21#include <linux/stddef.h>
22
23#include <asm/uaccess.h>
24#include <asm/desc.h>
25
26#include "fpu_system.h"
27#include "exception.h"
28#include "fpu_emu.h"
29
30
31#define FPU_WRITE_BIT 0x10
32
33static int reg_offset[] = {
34 offsetof(struct info,___eax),
35 offsetof(struct info,___ecx),
36 offsetof(struct info,___edx),
37 offsetof(struct info,___ebx),
38 offsetof(struct info,___esp),
39 offsetof(struct info,___ebp),
40 offsetof(struct info,___esi),
41 offsetof(struct info,___edi)
42};
43
44#define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info))
45
46static int reg_offset_vm86[] = {
47 offsetof(struct info,___cs),
48 offsetof(struct info,___vm86_ds),
49 offsetof(struct info,___vm86_es),
50 offsetof(struct info,___vm86_fs),
51 offsetof(struct info,___vm86_gs),
52 offsetof(struct info,___ss),
53 offsetof(struct info,___vm86_ds)
54 };
55
56#define VM86_REG_(x) (*(unsigned short *) \
57 (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info))
58
59/* ___GS is a dummy; gs itself is not saved on the stack. */
60#define ___GS ___ds
61
62static int reg_offset_pm[] = {
63 offsetof(struct info,___cs),
64 offsetof(struct info,___ds),
65 offsetof(struct info,___es),
66 offsetof(struct info,___fs),
67 offsetof(struct info,___GS),
68 offsetof(struct info,___ss),
69 offsetof(struct info,___ds)
70 };
71
72#define PM_REG_(x) (*(unsigned short *) \
73 (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info))
74
75
76/* Decode the SIB byte. This function assumes mod != 0 */
77static int sib(int mod, unsigned long *fpu_eip)
78{
79 u_char ss,index,base;
80 long offset;
81
82 RE_ENTRANT_CHECK_OFF;
83 FPU_code_access_ok(1);
84 FPU_get_user(base, (u_char __user *) (*fpu_eip)); /* The SIB byte */
85 RE_ENTRANT_CHECK_ON;
86 (*fpu_eip)++;
87 ss = base >> 6;
88 index = (base >> 3) & 7;
89 base &= 7;
90
91 if ((mod == 0) && (base == 5))
92 offset = 0; /* No base register */
93 else
94 offset = REG_(base);
95
96 if (index == 4)
97 {
98 /* No index register */
99 /* A non-zero ss is illegal */
100 if ( ss )
101 EXCEPTION(EX_Invalid);
102 }
103 else
104 {
105 offset += (REG_(index)) << ss;
106 }
107
108 if (mod == 1)
109 {
110 /* 8 bit signed displacement */
111 long displacement;
112 RE_ENTRANT_CHECK_OFF;
113 FPU_code_access_ok(1);
114 FPU_get_user(displacement, (signed char __user *) (*fpu_eip));
115 offset += displacement;
116 RE_ENTRANT_CHECK_ON;
117 (*fpu_eip)++;
118 }
119 else if (mod == 2 || base == 5) /* The second condition also has mod==0 */
120 {
121 /* 32 bit displacement */
122 long displacement;
123 RE_ENTRANT_CHECK_OFF;
124 FPU_code_access_ok(4);
125 FPU_get_user(displacement, (long __user *) (*fpu_eip));
126 offset += displacement;
127 RE_ENTRANT_CHECK_ON;
128 (*fpu_eip) += 4;
129 }
130
131 return offset;
132}
133
134
135static unsigned long vm86_segment(u_char segment,
136 struct address *addr)
137{
138 segment--;
139#ifdef PARANOID
140 if ( segment > PREFIX_SS_ )
141 {
142 EXCEPTION(EX_INTERNAL|0x130);
143 math_abort(FPU_info,SIGSEGV);
144 }
145#endif /* PARANOID */
146 addr->selector = VM86_REG_(segment);
147 return (unsigned long)VM86_REG_(segment) << 4;
148}
149
150
151/* This should work for 16 and 32 bit protected mode. */
152static long pm_address(u_char FPU_modrm, u_char segment,
153 struct address *addr, long offset)
154{
155 struct desc_struct descriptor;
156 unsigned long base_address, limit, address, seg_top;
157
158 segment--;
159
160#ifdef PARANOID
161 /* segment is unsigned, so this also detects if segment was 0: */
162 if ( segment > PREFIX_SS_ )
163 {
164 EXCEPTION(EX_INTERNAL|0x132);
165 math_abort(FPU_info,SIGSEGV);
166 }
167#endif /* PARANOID */
168
169 switch ( segment )
170 {
171 /* gs isn't used by the kernel, so it still has its
172 user-space value. */
173 case PREFIX_GS_-1:
174 /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
175 savesegment(gs, addr->selector);
176 break;
177 default:
178 addr->selector = PM_REG_(segment);
179 }
180
181 descriptor = LDT_DESCRIPTOR(PM_REG_(segment));
182 base_address = SEG_BASE_ADDR(descriptor);
183 address = base_address + offset;
184 limit = base_address
185 + (SEG_LIMIT(descriptor)+1) * SEG_GRANULARITY(descriptor) - 1;
186 if ( limit < base_address ) limit = 0xffffffff;
187
188 if ( SEG_EXPAND_DOWN(descriptor) )
189 {
190 if ( SEG_G_BIT(descriptor) )
191 seg_top = 0xffffffff;
192 else
193 {
194 seg_top = base_address + (1 << 20);
195 if ( seg_top < base_address ) seg_top = 0xffffffff;
196 }
197 access_limit =
198 (address <= limit) || (address >= seg_top) ? 0 :
199 ((seg_top-address) >= 255 ? 255 : seg_top-address);
200 }
201 else
202 {
203 access_limit =
204 (address > limit) || (address < base_address) ? 0 :
205 ((limit-address) >= 254 ? 255 : limit-address+1);
206 }
207 if ( SEG_EXECUTE_ONLY(descriptor) ||
208 (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT)) )
209 {
210 access_limit = 0;
211 }
212 return address;
213}
214
215
216/*
217 MOD R/M byte: MOD == 3 has a special use for the FPU
218 SIB byte used iff R/M = 100b
219
220 7 6 5 4 3 2 1 0
221 ..... ......... .........
222 MOD OPCODE(2) R/M
223
224
225 SIB byte
226
227 7 6 5 4 3 2 1 0
228 ..... ......... .........
229 SS INDEX BASE
230
231*/
232
233void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
234 struct address *addr,
235 fpu_addr_modes addr_modes)
236{
237 u_char mod;
238 unsigned rm = FPU_modrm & 7;
239 long *cpu_reg_ptr;
240 int address = 0; /* Initialized just to stop compiler warnings. */
241
242 /* Memory accessed via the cs selector is write protected
243 in `non-segmented' 32 bit protected mode. */
244 if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
245 && (addr_modes.override.segment == PREFIX_CS_) )
246 {
247 math_abort(FPU_info,SIGSEGV);
248 }
249
250 addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */
251
252 mod = (FPU_modrm >> 6) & 3;
253
254 if (rm == 4 && mod != 3)
255 {
256 address = sib(mod, fpu_eip);
257 }
258 else
259 {
260 cpu_reg_ptr = & REG_(rm);
261 switch (mod)
262 {
263 case 0:
264 if (rm == 5)
265 {
266 /* Special case: disp32 */
267 RE_ENTRANT_CHECK_OFF;
268 FPU_code_access_ok(4);
269 FPU_get_user(address, (unsigned long __user *) (*fpu_eip));
270 (*fpu_eip) += 4;
271 RE_ENTRANT_CHECK_ON;
272 addr->offset = address;
273 return (void __user *) address;
274 }
275 else
276 {
277 address = *cpu_reg_ptr; /* Just return the contents
278 of the cpu register */
279 addr->offset = address;
280 return (void __user *) address;
281 }
282 case 1:
283 /* 8 bit signed displacement */
284 RE_ENTRANT_CHECK_OFF;
285 FPU_code_access_ok(1);
286 FPU_get_user(address, (signed char __user *) (*fpu_eip));
287 RE_ENTRANT_CHECK_ON;
288 (*fpu_eip)++;
289 break;
290 case 2:
291 /* 32 bit displacement */
292 RE_ENTRANT_CHECK_OFF;
293 FPU_code_access_ok(4);
294 FPU_get_user(address, (long __user *) (*fpu_eip));
295 (*fpu_eip) += 4;
296 RE_ENTRANT_CHECK_ON;
297 break;
298 case 3:
299 /* Not legal for the FPU */
300 EXCEPTION(EX_Invalid);
301 }
302 address += *cpu_reg_ptr;
303 }
304
305 addr->offset = address;
306
307 switch ( addr_modes.default_mode )
308 {
309 case 0:
310 break;
311 case VM86:
312 address += vm86_segment(addr_modes.override.segment, addr);
313 break;
314 case PM16:
315 case SEG32:
316 address = pm_address(FPU_modrm, addr_modes.override.segment,
317 addr, address);
318 break;
319 default:
320 EXCEPTION(EX_INTERNAL|0x133);
321 }
322
323 return (void __user *)address;
324}
325
326
327void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
328 struct address *addr,
329 fpu_addr_modes addr_modes)
330{
331 u_char mod;
332 unsigned rm = FPU_modrm & 7;
333 int address = 0; /* Default used for mod == 0 */
334
335 /* Memory accessed via the cs selector is write protected
336 in `non-segmented' 32 bit protected mode. */
337 if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
338 && (addr_modes.override.segment == PREFIX_CS_) )
339 {
340 math_abort(FPU_info,SIGSEGV);
341 }
342
343 addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */
344
345 mod = (FPU_modrm >> 6) & 3;
346
347 switch (mod)
348 {
349 case 0:
350 if (rm == 6)
351 {
352 /* Special case: disp16 */
353 RE_ENTRANT_CHECK_OFF;
354 FPU_code_access_ok(2);
355 FPU_get_user(address, (unsigned short __user *) (*fpu_eip));
356 (*fpu_eip) += 2;
357 RE_ENTRANT_CHECK_ON;
358 goto add_segment;
359 }
360 break;
361 case 1:
362 /* 8 bit signed displacement */
363 RE_ENTRANT_CHECK_OFF;
364 FPU_code_access_ok(1);
365 FPU_get_user(address, (signed char __user *) (*fpu_eip));
366 RE_ENTRANT_CHECK_ON;
367 (*fpu_eip)++;
368 break;
369 case 2:
370 /* 16 bit displacement */
371 RE_ENTRANT_CHECK_OFF;
372 FPU_code_access_ok(2);
373 FPU_get_user(address, (unsigned short __user *) (*fpu_eip));
374 (*fpu_eip) += 2;
375 RE_ENTRANT_CHECK_ON;
376 break;
377 case 3:
378 /* Not legal for the FPU */
379 EXCEPTION(EX_Invalid);
380 break;
381 }
382 switch ( rm )
383 {
384 case 0:
385 address += FPU_info->___ebx + FPU_info->___esi;
386 break;
387 case 1:
388 address += FPU_info->___ebx + FPU_info->___edi;
389 break;
390 case 2:
391 address += FPU_info->___ebp + FPU_info->___esi;
392 if ( addr_modes.override.segment == PREFIX_DEFAULT )
393 addr_modes.override.segment = PREFIX_SS_;
394 break;
395 case 3:
396 address += FPU_info->___ebp + FPU_info->___edi;
397 if ( addr_modes.override.segment == PREFIX_DEFAULT )
398 addr_modes.override.segment = PREFIX_SS_;
399 break;
400 case 4:
401 address += FPU_info->___esi;
402 break;
403 case 5:
404 address += FPU_info->___edi;
405 break;
406 case 6:
407 address += FPU_info->___ebp;
408 if ( addr_modes.override.segment == PREFIX_DEFAULT )
409 addr_modes.override.segment = PREFIX_SS_;
410 break;
411 case 7:
412 address += FPU_info->___ebx;
413 break;
414 }
415
416 add_segment:
417 address &= 0xffff;
418
419 addr->offset = address;
420
421 switch ( addr_modes.default_mode )
422 {
423 case 0:
424 break;
425 case VM86:
426 address += vm86_segment(addr_modes.override.segment, addr);
427 break;
428 case PM16:
429 case SEG32:
430 address = pm_address(FPU_modrm, addr_modes.override.segment,
431 addr, address);
432 break;
433 default:
434 EXCEPTION(EX_INTERNAL|0x131);
435 }
436
437 return (void __user *)address ;
438}
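
FPU_get_address() above decodes the ModR/M byte (mod/opcode/rm) and, when rm == 100b, a following SIB byte (ss/index/base), exactly as laid out in the comment diagram. A small standalone illustration of that field split; the example byte values are arbitrary:

#include <stdio.h>

int main(void)
{
	unsigned char modrm = 0x94;        /* mod=10, opcode=010, rm=100 (SIB follows) */
	unsigned char sib   = 0x88;        /* ss=10, index=001, base=000 */

	unsigned mod    = (modrm >> 6) & 3;
	unsigned opcode = (modrm >> 3) & 7;
	unsigned rm     =  modrm       & 7;

	unsigned ss    = (sib >> 6) & 3;
	unsigned index = (sib >> 3) & 7;
	unsigned base  =  sib       & 7;

	printf("mod=%u opcode=%u rm=%u\n", mod, opcode, rm);
	printf("ss=%u index=%u base=%u (scale=%u)\n", ss, index, base, 1u << ss);
	return 0;
}
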
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
new file mode 100644
index 000000000000..eebd6fb1c8a8
--- /dev/null
+++ b/arch/x86/math-emu/load_store.c
@@ -0,0 +1,272 @@
1/*---------------------------------------------------------------------------+
2 | load_store.c |
3 | |
4 | This file contains most of the code to interpret the FPU instructions |
5 | which load and store from user memory. |
6 | |
7 | Copyright (C) 1992,1993,1994,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@suburbia.net |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | Note: |
16 | The file contains code which accesses user memory. |
17 | Emulator static data may change when user memory is accessed, due to |
18 | other processes using the emulator while swapping is in progress. |
19 +---------------------------------------------------------------------------*/
20
21#include <asm/uaccess.h>
22
23#include "fpu_system.h"
24#include "exception.h"
25#include "fpu_emu.h"
26#include "status_w.h"
27#include "control_w.h"
28
29
30#define _NONE_ 0 /* st0_ptr etc not needed */
31#define _REG0_ 1 /* Will be storing st(0) */
32#define _PUSH_ 3 /* Need to check for space to push onto stack */
33#define _null_ 4 /* Function illegal or not implemented */
34
35#define pop_0() { FPU_settag0(TAG_Empty); top++; }
36
37
38static u_char const type_table[32] = {
39 _PUSH_, _PUSH_, _PUSH_, _PUSH_,
40 _null_, _null_, _null_, _null_,
41 _REG0_, _REG0_, _REG0_, _REG0_,
42 _REG0_, _REG0_, _REG0_, _REG0_,
43 _NONE_, _null_, _NONE_, _PUSH_,
44 _NONE_, _PUSH_, _null_, _PUSH_,
45 _NONE_, _null_, _NONE_, _REG0_,
46 _NONE_, _REG0_, _NONE_, _REG0_
47 };
48
49u_char const data_sizes_16[32] = {
50 4, 4, 8, 2, 0, 0, 0, 0,
51 4, 4, 8, 2, 4, 4, 8, 2,
52 14, 0, 94, 10, 2, 10, 0, 8,
53 14, 0, 94, 10, 2, 10, 2, 8
54};
55
56static u_char const data_sizes_32[32] = {
57 4, 4, 8, 2, 0, 0, 0, 0,
58 4, 4, 8, 2, 4, 4, 8, 2,
59 28, 0,108, 10, 2, 10, 0, 8,
60 28, 0,108, 10, 2, 10, 2, 8
61};
62
63int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
64 void __user *data_address)
65{
66 FPU_REG loaded_data;
67 FPU_REG *st0_ptr;
68 u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */
69 u_char loaded_tag;
70
71 st0_ptr = NULL; /* Initialized just to stop compiler warnings. */
72
73 if ( addr_modes.default_mode & PROTECTED )
74 {
75 if ( addr_modes.default_mode == SEG32 )
76 {
77 if ( access_limit < data_sizes_32[type] )
78 math_abort(FPU_info,SIGSEGV);
79 }
80 else if ( addr_modes.default_mode == PM16 )
81 {
82 if ( access_limit < data_sizes_16[type] )
83 math_abort(FPU_info,SIGSEGV);
84 }
85#ifdef PARANOID
86 else
87 EXCEPTION(EX_INTERNAL|0x140);
88#endif /* PARANOID */
89 }
90
91 switch ( type_table[type] )
92 {
93 case _NONE_:
94 break;
95 case _REG0_:
96 st0_ptr = &st(0); /* Some of these instructions pop after
97 storing */
98 st0_tag = FPU_gettag0();
99 break;
100 case _PUSH_:
101 {
102 if ( FPU_gettagi(-1) != TAG_Empty )
103 { FPU_stack_overflow(); return 0; }
104 top--;
105 st0_ptr = &st(0);
106 }
107 break;
108 case _null_:
109 FPU_illegal();
110 return 0;
111#ifdef PARANOID
112 default:
113 EXCEPTION(EX_INTERNAL|0x141);
114 return 0;
115#endif /* PARANOID */
116 }
117
118 switch ( type )
119 {
120 case 000: /* fld m32real */
121 clear_C1();
122 loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data);
123 if ( (loaded_tag == TAG_Special)
124 && isNaN(&loaded_data)
125 && (real_1op_NaN(&loaded_data) < 0) )
126 {
127 top++;
128 break;
129 }
130 FPU_copy_to_reg0(&loaded_data, loaded_tag);
131 break;
132 case 001: /* fild m32int */
133 clear_C1();
134 loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data);
135 FPU_copy_to_reg0(&loaded_data, loaded_tag);
136 break;
137 case 002: /* fld m64real */
138 clear_C1();
139 loaded_tag = FPU_load_double((double __user *)data_address, &loaded_data);
140 if ( (loaded_tag == TAG_Special)
141 && isNaN(&loaded_data)
142 && (real_1op_NaN(&loaded_data) < 0) )
143 {
144 top++;
145 break;
146 }
147 FPU_copy_to_reg0(&loaded_data, loaded_tag);
148 break;
149 case 003: /* fild m16int */
150 clear_C1();
151 loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data);
152 FPU_copy_to_reg0(&loaded_data, loaded_tag);
153 break;
154 case 010: /* fst m32real */
155 clear_C1();
156 FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address);
157 break;
158 case 011: /* fist m32int */
159 clear_C1();
160 FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address);
161 break;
162 case 012: /* fst m64real */
163 clear_C1();
164 FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address);
165 break;
166 case 013: /* fist m16int */
167 clear_C1();
168 FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address);
169 break;
170 case 014: /* fstp m32real */
171 clear_C1();
172 if ( FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address) )
173 pop_0(); /* pop only if the number was actually stored
174 (see the 80486 manual p16-28) */
175 break;
176 case 015: /* fistp m32int */
177 clear_C1();
178 if ( FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address) )
179 pop_0(); /* pop only if the number was actually stored
180 (see the 80486 manual p16-28) */
181 break;
182 case 016: /* fstp m64real */
183 clear_C1();
184 if ( FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address) )
185 pop_0(); /* pop only if the number was actually stored
186 (see the 80486 manual p16-28) */
187 break;
188 case 017: /* fistp m16int */
189 clear_C1();
190 if ( FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address) )
191 pop_0(); /* pop only if the number was actually stored
192 (see the 80486 manual p16-28) */
193 break;
194 case 020: /* fldenv m14/28byte */
195 fldenv(addr_modes, (u_char __user *)data_address);
196 /* Ensure that the values just loaded are not changed by
197 fix-up operations. */
198 return 1;
199 case 022: /* frstor m94/108byte */
200 frstor(addr_modes, (u_char __user *)data_address);
201 /* Ensure that the values just loaded are not changed by
202 fix-up operations. */
203 return 1;
204 case 023: /* fbld m80dec */
205 clear_C1();
206 loaded_tag = FPU_load_bcd((u_char __user *)data_address);
207 FPU_settag0(loaded_tag);
208 break;
209 case 024: /* fldcw */
210 RE_ENTRANT_CHECK_OFF;
211 FPU_access_ok(VERIFY_READ, data_address, 2);
212 FPU_get_user(control_word, (unsigned short __user *) data_address);
213 RE_ENTRANT_CHECK_ON;
214 if ( partial_status & ~control_word & CW_Exceptions )
215 partial_status |= (SW_Summary | SW_Backward);
216 else
217 partial_status &= ~(SW_Summary | SW_Backward);
218#ifdef PECULIAR_486
219 control_word |= 0x40; /* An 80486 appears to always set this bit */
220#endif /* PECULIAR_486 */
221 return 1;
222 case 025: /* fld m80real */
223 clear_C1();
224 loaded_tag = FPU_load_extended((long double __user *)data_address, 0);
225 FPU_settag0(loaded_tag);
226 break;
227 case 027: /* fild m64int */
228 clear_C1();
229 loaded_tag = FPU_load_int64((long long __user *)data_address);
230 if (loaded_tag == TAG_Error)
231 return 0;
232 FPU_settag0(loaded_tag);
233 break;
234 case 030: /* fstenv m14/28byte */
235 fstenv(addr_modes, (u_char __user *)data_address);
236 return 1;
237 case 032: /* fsave */
238 fsave(addr_modes, (u_char __user *)data_address);
239 return 1;
240 case 033: /* fbstp m80dec */
241 clear_C1();
242 if ( FPU_store_bcd(st0_ptr, st0_tag, (u_char __user *)data_address) )
243 pop_0(); /* pop only if the number was actually stored
244 (see the 80486 manual p16-28) */
245 break;
246 case 034: /* fstcw m16int */
247 RE_ENTRANT_CHECK_OFF;
248 FPU_access_ok(VERIFY_WRITE,data_address,2);
249 FPU_put_user(control_word, (unsigned short __user *) data_address);
250 RE_ENTRANT_CHECK_ON;
251 return 1;
252 case 035: /* fstp m80real */
253 clear_C1();
254 if ( FPU_store_extended(st0_ptr, st0_tag, (long double __user *)data_address) )
255 pop_0(); /* pop only if the number was actually stored
256 (see the 80486 manual p16-28) */
257 break;
258 case 036: /* fstsw m2byte */
259 RE_ENTRANT_CHECK_OFF;
260 FPU_access_ok(VERIFY_WRITE,data_address,2);
261 FPU_put_user(status_word(),(unsigned short __user *) data_address);
262 RE_ENTRANT_CHECK_ON;
263 return 1;
264 case 037: /* fistp m64int */
265 clear_C1();
266 if ( FPU_store_int64(st0_ptr, st0_tag, (long long __user *)data_address) )
267 pop_0(); /* pop only if the number was actually stored
268 (see the 80486 manual p16-28) */
269 break;
270 }
271 return 0;
272}
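
FPU_load_store() above is table-driven: data_sizes_16/data_sizes_32 give the memory operand size for each of the 32 instruction types, and in protected mode that size is checked against access_limit before any user-memory access. A simplified sketch of that size check, reusing the 32-bit size table; the helper names are illustrative only:

#include <stdio.h>

static const unsigned char demo_sizes_32[32] = {
	 4, 4,  8,  2, 0,  0, 0, 8,
	 4, 4,  8,  2, 4,  4, 8, 2,
	28, 0, 108, 10, 2, 10, 0, 8,
	28, 0, 108, 10, 2, 10, 2, 8
};

/* Returns non-zero if the operand fits inside the remaining segment space. */
static int operand_fits(unsigned type, unsigned access_limit)
{
	return access_limit >= demo_sizes_32[type & 31];
}

int main(void)
{
	printf("fld m64real, 8 bytes left:   %s\n", operand_fits(002, 8)  ? "ok" : "fault");
	printf("frstor (108 bytes), 64 left: %s\n", operand_fits(022, 64) ? "ok" : "fault");
	return 0;
}

Note: the table above copies data_sizes_32 from the file for illustration except where noted; only the check logic is paraphrased.
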
diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S
new file mode 100644
index 000000000000..717785a53eb4
--- /dev/null
+++ b/arch/x86/math-emu/mul_Xsig.S
@@ -0,0 +1,176 @@
1/*---------------------------------------------------------------------------+
2 | mul_Xsig.S |
3 | |
4 | Multiply a 12 byte fixed point number by another fixed point number. |
5 | |
6 | Copyright (C) 1992,1994,1995 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
9 | |
10 | Call from C as: |
11 | void mul32_Xsig(Xsig *x, unsigned b) |
12 | |
13 | void mul64_Xsig(Xsig *x, unsigned long long *b) |
14 | |
15 | void mul_Xsig_Xsig(Xsig *x, unsigned *b) |
16 | |
17 | The result is neither rounded nor normalized, and the ls bit or so may |
18 | be wrong. |
19 | |
20 +---------------------------------------------------------------------------*/
21 .file "mul_Xsig.S"
22
23
24#include "fpu_emu.h"
25
26.text
27ENTRY(mul32_Xsig)
28 pushl %ebp
29 movl %esp,%ebp
30 subl $16,%esp
31 pushl %esi
32
33 movl PARAM1,%esi
34 movl PARAM2,%ecx
35
36 xor %eax,%eax
37 movl %eax,-4(%ebp)
38 movl %eax,-8(%ebp)
39
40 movl (%esi),%eax /* lsl of Xsig */
41 mull %ecx /* msl of b */
42 movl %edx,-12(%ebp)
43
44 movl 4(%esi),%eax /* midl of Xsig */
45 mull %ecx /* msl of b */
46 addl %eax,-12(%ebp)
47 adcl %edx,-8(%ebp)
48 adcl $0,-4(%ebp)
49
50 movl 8(%esi),%eax /* msl of Xsig */
51 mull %ecx /* msl of b */
52 addl %eax,-8(%ebp)
53 adcl %edx,-4(%ebp)
54
55 movl -12(%ebp),%eax
56 movl %eax,(%esi)
57 movl -8(%ebp),%eax
58 movl %eax,4(%esi)
59 movl -4(%ebp),%eax
60 movl %eax,8(%esi)
61
62 popl %esi
63 leave
64 ret
65
66
67ENTRY(mul64_Xsig)
68 pushl %ebp
69 movl %esp,%ebp
70 subl $16,%esp
71 pushl %esi
72
73 movl PARAM1,%esi
74 movl PARAM2,%ecx
75
76 xor %eax,%eax
77 movl %eax,-4(%ebp)
78 movl %eax,-8(%ebp)
79
80 movl (%esi),%eax /* lsl of Xsig */
81 mull 4(%ecx) /* msl of b */
82 movl %edx,-12(%ebp)
83
84 movl 4(%esi),%eax /* midl of Xsig */
85 mull (%ecx) /* lsl of b */
86 addl %edx,-12(%ebp)
87 adcl $0,-8(%ebp)
88 adcl $0,-4(%ebp)
89
90 movl 4(%esi),%eax /* midl of Xsig */
91 mull 4(%ecx) /* msl of b */
92 addl %eax,-12(%ebp)
93 adcl %edx,-8(%ebp)
94 adcl $0,-4(%ebp)
95
96 movl 8(%esi),%eax /* msl of Xsig */
97 mull (%ecx) /* lsl of b */
98 addl %eax,-12(%ebp)
99 adcl %edx,-8(%ebp)
100 adcl $0,-4(%ebp)
101
102 movl 8(%esi),%eax /* msl of Xsig */
103 mull 4(%ecx) /* msl of b */
104 addl %eax,-8(%ebp)
105 adcl %edx,-4(%ebp)
106
107 movl -12(%ebp),%eax
108 movl %eax,(%esi)
109 movl -8(%ebp),%eax
110 movl %eax,4(%esi)
111 movl -4(%ebp),%eax
112 movl %eax,8(%esi)
113
114 popl %esi
115 leave
116 ret
117
118
119
120ENTRY(mul_Xsig_Xsig)
121 pushl %ebp
122 movl %esp,%ebp
123 subl $16,%esp
124 pushl %esi
125
126 movl PARAM1,%esi
127 movl PARAM2,%ecx
128
129 xor %eax,%eax
130 movl %eax,-4(%ebp)
131 movl %eax,-8(%ebp)
132
133 movl (%esi),%eax /* lsl of Xsig */
134 mull 8(%ecx) /* msl of b */
135 movl %edx,-12(%ebp)
136
137 movl 4(%esi),%eax /* midl of Xsig */
138 mull 4(%ecx) /* midl of b */
139 addl %edx,-12(%ebp)
140 adcl $0,-8(%ebp)
141 adcl $0,-4(%ebp)
142
143 movl 8(%esi),%eax /* msl of Xsig */
144 mull (%ecx) /* lsl of b */
145 addl %edx,-12(%ebp)
146 adcl $0,-8(%ebp)
147 adcl $0,-4(%ebp)
148
149 movl 4(%esi),%eax /* midl of Xsig */
150 mull 8(%ecx) /* msl of b */
151 addl %eax,-12(%ebp)
152 adcl %edx,-8(%ebp)
153 adcl $0,-4(%ebp)
154
155 movl 8(%esi),%eax /* msl of Xsig */
156 mull 4(%ecx) /* midl of b */
157 addl %eax,-12(%ebp)
158 adcl %edx,-8(%ebp)
159 adcl $0,-4(%ebp)
160
161 movl 8(%esi),%eax /* msl of Xsig */
162 mull 8(%ecx) /* msl of b */
163 addl %eax,-8(%ebp)
164 adcl %edx,-4(%ebp)
165
166 movl -12(%ebp),%edx
167 movl %edx,(%esi)
168 movl -8(%ebp),%edx
169 movl %edx,4(%esi)
170 movl -4(%ebp),%edx
171 movl %edx,8(%esi)
172
173 popl %esi
174 leave
175 ret
176
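
mul32_Xsig above multiplies the 96-bit fixed-point significand (three 32-bit words) by a 32-bit value and keeps the top 96 bits of the 128-bit product. A portable C sketch of the same computation, which may help when following the register usage in the assembly; the struct and function names are illustrative only:

#include <stdio.h>
#include <stdint.h>

struct demo_xsig { uint32_t lsw, midw, msw; };

static void demo_mul32_xsig(struct demo_xsig *x, uint32_t b)
{
	uint64_t p0 = (uint64_t)x->lsw  * b;    /* contributes only a carry */
	uint64_t p1 = (uint64_t)x->midw * b;
	uint64_t p2 = (uint64_t)x->msw  * b;

	uint64_t lo  = (p0 >> 32) + (uint32_t)p1;
	uint64_t mid = (lo >> 32) + (p1 >> 32) + (uint32_t)p2;
	uint64_t hi  = (mid >> 32) + (p2 >> 32);

	x->lsw  = (uint32_t)lo;
	x->midw = (uint32_t)mid;
	x->msw  = (uint32_t)hi;
}

int main(void)
{
	struct demo_xsig x = { 0, 0, 0x80000000u };   /* 0.5 in fixed point */
	demo_mul32_xsig(&x, 0xc0000000u);             /* multiply by 0.75 */
	printf("%08x %08x %08x\n", x.msw, x.midw, x.lsw);  /* expect 60000000 00000000 00000000 */
	return 0;
}
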
diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h
new file mode 100644
index 000000000000..4db798114923
--- /dev/null
+++ b/arch/x86/math-emu/poly.h
@@ -0,0 +1,121 @@
1/*---------------------------------------------------------------------------+
2 | poly.h |
3 | |
4 | Header file for the FPU-emu poly*.c source files. |
5 | |
6 | Copyright (C) 1994,1999 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@melbpc.org.au |
9 | |
10 | Declarations and definitions for functions operating on Xsig (12-byte |
11 | extended-significand) quantities. |
12 | |
13 +---------------------------------------------------------------------------*/
14
15#ifndef _POLY_H
16#define _POLY_H
17
18/* This 12-byte structure is used to improve the accuracy of computation
19 of transcendental functions.
20 Intended to be used to get results better than 8-byte computation
21 allows. 9-byte would probably be sufficient.
22 */
23typedef struct {
24 unsigned long lsw;
25 unsigned long midw;
26 unsigned long msw;
27} Xsig;
28
29asmlinkage void mul64(unsigned long long const *a, unsigned long long const *b,
30 unsigned long long *result);
31asmlinkage void polynomial_Xsig(Xsig *, const unsigned long long *x,
32 const unsigned long long terms[], const int n);
33
34asmlinkage void mul32_Xsig(Xsig *, const unsigned long mult);
35asmlinkage void mul64_Xsig(Xsig *, const unsigned long long *mult);
36asmlinkage void mul_Xsig_Xsig(Xsig *dest, const Xsig *mult);
37
38asmlinkage void shr_Xsig(Xsig *, const int n);
39asmlinkage int round_Xsig(Xsig *);
40asmlinkage int norm_Xsig(Xsig *);
41asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
42
43/* Macro to extract the most significant 32 bits from a long long */
44#define LL_MSW(x) (((unsigned long *)&x)[1])
45
46/* Macro to initialize an Xsig struct */
47#define MK_XSIG(a,b,c) { c, b, a }
48
49/* Macro to access the 8 ms bytes of an Xsig as a long long */
50#define XSIG_LL(x) (*(unsigned long long *)&x.midw)
51
52
53/*
54 Need to run gcc with optimizations on to get these to
55 actually be in-line.
56 */
57
58/* Multiply two fixed-point 32 bit numbers, producing a 32 bit result.
59 The answer is the ms word of the product. */
60/* Some versions of gcc make it difficult to stop eax from being clobbered.
61 Merely specifying that it is used doesn't work...
62 */
63static inline unsigned long mul_32_32(const unsigned long arg1,
64 const unsigned long arg2)
65{
66 int retval;
67 asm volatile ("mull %2; movl %%edx,%%eax" \
68 :"=a" (retval) \
69 :"0" (arg1), "g" (arg2) \
70 :"dx");
71 return retval;
72}
73
74
75/* Add the 12 byte Xsig x2 to Xsig dest, with no checks for overflow. */
76static inline void add_Xsig_Xsig(Xsig *dest, const Xsig *x2)
77{
78 asm volatile ("movl %1,%%edi; movl %2,%%esi;\n"
79 "movl (%%esi),%%eax; addl %%eax,(%%edi);\n"
80 "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n"
81 "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n"
82 :"=g" (*dest):"g" (dest), "g" (x2)
83 :"ax","si","di");
84}
85
86
87/* Add the 12 byte Xsig x2 to Xsig dest, adjust exp if overflow occurs. */
88/* Note: the constraints in the asm statement didn't always work properly
89 with gcc 2.5.8. Changing from using edi to using ecx got around the
90 problem, but keep fingers crossed! */
91static inline void add_two_Xsig(Xsig *dest, const Xsig *x2, long int *exp)
92{
93 asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n"
94 "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n"
95 "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n"
96 "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n"
97 "jnc 0f;\n"
98 "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n"
99 "movl %4,%%ecx; incl (%%ecx)\n"
100 "movl $1,%%eax; jmp 1f;\n"
101 "0: xorl %%eax,%%eax;\n"
102 "1:\n"
103 :"=g" (*exp), "=g" (*dest)
104 :"g" (dest), "g" (x2), "g" (exp)
105 :"cx","si","ax");
106}
107
108
109/* Negate (subtract from 1.0) the 12 byte Xsig */
110/* This is faster in a loop on my 386 than using the "neg" instruction. */
111static inline void negate_Xsig(Xsig *x)
112{
113 asm volatile("movl %1,%%esi;\n"
114 "xorl %%ecx,%%ecx;\n"
115 "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n"
116 "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n"
117 "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n"
118 :"=g" (*x):"g" (x):"si","ax","cx");
119}
120
121#endif /* _POLY_H */
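
The Xsig structure above is a 96-bit fixed-point significand with the binary point at the left of msw, i.e. its value is (msw*2^64 + midw*2^32 + lsw) / 2^96, with any exponent carried separately. A small sketch that converts such a value to a double for inspection; the conversion and names are illustrative only, and the emulator itself never uses floating-point arithmetic:

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint32_t lsw, midw, msw;           /* same layout as Xsig, fixed 32-bit words */
} demo_xsig;

static double demo_xsig_value(const demo_xsig *x)
{
	const double w = 4294967296.0;     /* 2^32 */
	return x->msw / w + x->midw / (w * w) + x->lsw / (w * w * w);
}

int main(void)
{
	demo_xsig half    = { 0, 0, 0x80000000u };   /* 0.1000...b = 0.5  */
	demo_xsig quarter = { 0, 0, 0x40000000u };   /* 0.0100...b = 0.25 */

	printf("%f %f\n", demo_xsig_value(&half), demo_xsig_value(&quarter));
	return 0;
}
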
diff --git a/arch/x86/math-emu/poly_2xm1.c b/arch/x86/math-emu/poly_2xm1.c
new file mode 100644
index 000000000000..9766ad5e9743
--- /dev/null
+++ b/arch/x86/math-emu/poly_2xm1.c
@@ -0,0 +1,156 @@
1/*---------------------------------------------------------------------------+
2 | poly_2xm1.c |
3 | |
4 | Function to compute 2^x-1 by a polynomial approximation. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "exception.h"
14#include "reg_constant.h"
15#include "fpu_emu.h"
16#include "fpu_system.h"
17#include "control_w.h"
18#include "poly.h"
19
20
21#define HIPOWER 11
22static const unsigned long long lterms[HIPOWER] =
23{
24 0x0000000000000000LL, /* This term done separately as 12 bytes */
25 0xf5fdeffc162c7543LL,
26 0x1c6b08d704a0bfa6LL,
27 0x0276556df749cc21LL,
28 0x002bb0ffcf14f6b8LL,
29 0x0002861225ef751cLL,
30 0x00001ffcbfcd5422LL,
31 0x00000162c005d5f1LL,
32 0x0000000da96ccb1bLL,
33 0x0000000078d1b897LL,
34 0x000000000422b029LL
35};
36
37static const Xsig hiterm = MK_XSIG(0xb17217f7, 0xd1cf79ab, 0xc8a39194);
38
39/* Four slices: 0.0 : 0.25 : 0.50 : 0.75 : 1.0,
40 These numbers are 2^(1/4), 2^(1/2), and 2^(3/4)
41 */
42static const Xsig shiftterm0 = MK_XSIG(0, 0, 0);
43static const Xsig shiftterm1 = MK_XSIG(0x9837f051, 0x8db8a96f, 0x46ad2318);
44static const Xsig shiftterm2 = MK_XSIG(0xb504f333, 0xf9de6484, 0x597d89b3);
45static const Xsig shiftterm3 = MK_XSIG(0xd744fcca, 0xd69d6af4, 0x39a68bb9);
46
47static const Xsig *shiftterm[] = { &shiftterm0, &shiftterm1,
48 &shiftterm2, &shiftterm3 };
49
50
51/*--- poly_2xm1() -----------------------------------------------------------+
52 | Requires st(0) which is TAG_Valid and < 1. |
53 +---------------------------------------------------------------------------*/
54int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result)
55{
56 long int exponent, shift;
57 unsigned long long Xll;
58 Xsig accumulator, Denom, argSignif;
59 u_char tag;
60
61 exponent = exponent16(arg);
62
63#ifdef PARANOID
64 if ( exponent >= 0 ) /* Don't want a |number| >= 1.0 */
65 {
66 /* Number negative, too large, or not Valid. */
67 EXCEPTION(EX_INTERNAL|0x127);
68 return 1;
69 }
70#endif /* PARANOID */
71
72 argSignif.lsw = 0;
73 XSIG_LL(argSignif) = Xll = significand(arg);
74
75 if ( exponent == -1 )
76 {
77 shift = (argSignif.msw & 0x40000000) ? 3 : 2;
78 /* subtract 0.5 or 0.75 */
79 exponent -= 2;
80 XSIG_LL(argSignif) <<= 2;
81 Xll <<= 2;
82 }
83 else if ( exponent == -2 )
84 {
85 shift = 1;
86 /* subtract 0.25 */
87 exponent--;
88 XSIG_LL(argSignif) <<= 1;
89 Xll <<= 1;
90 }
91 else
92 shift = 0;
93
94 if ( exponent < -2 )
95 {
96 /* Shift the argument right by the required places. */
97 if ( FPU_shrx(&Xll, -2-exponent) >= 0x80000000U )
98 Xll++; /* round up */
99 }
100
101 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
102 polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER-1);
103 mul_Xsig_Xsig(&accumulator, &argSignif);
104 shr_Xsig(&accumulator, 3);
105
106 mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */
107 add_two_Xsig(&accumulator, &argSignif, &exponent);
108
109 if ( shift )
110 {
111 /* The argument is large, use the identity:
112 f(x+a) = f(a) * (f(x) + 1) - 1;
113 */
114 shr_Xsig(&accumulator, - exponent);
115 accumulator.msw |= 0x80000000; /* add 1.0 */
116 mul_Xsig_Xsig(&accumulator, shiftterm[shift]);
117 accumulator.msw &= 0x3fffffff; /* subtract 1.0 */
118 exponent = 1;
119 }
120
121 if ( sign != SIGN_POS )
122 {
123 /* The argument is negative, use the identity:
124 f(-x) = -f(x) / (1 + f(x))
125 */
126 Denom.lsw = accumulator.lsw;
127 XSIG_LL(Denom) = XSIG_LL(accumulator);
128 if ( exponent < 0 )
129 shr_Xsig(&Denom, - exponent);
130 else if ( exponent > 0 )
131 {
132 /* exponent must be 1 here */
133 XSIG_LL(Denom) <<= 1;
134 if ( Denom.lsw & 0x80000000 )
135 XSIG_LL(Denom) |= 1;
136 (Denom.lsw) <<= 1;
137 }
138 Denom.msw |= 0x80000000; /* add 1.0 */
139 div_Xsig(&accumulator, &Denom, &accumulator);
140 }
141
142 /* Convert to 64 bit signed-compatible */
143 exponent += round_Xsig(&accumulator);
144
145 result = &st(0);
146 significand(result) = XSIG_LL(accumulator);
147 setexponent16(result, exponent);
148
149 tag = FPU_round(result, 1, 0, FULL_PRECISION, sign);
150
151 setsign(result, sign);
152 FPU_settag0(tag);
153
154 return 0;
155
156}
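
poly_2xm1() above relies on two identities for f(x) = 2^x - 1: an argument shift by a = 1/4, 1/2 or 3/4 via f(x + a) = 2^a * (f(x) + 1) - 1 (the shiftterm multipliers are 2^a), and f(-x) = -f(x) / (f(x) + 1) for negative arguments. A standalone double-precision check of both identities, purely for illustration; the emulator does all of this in 96-bit fixed point:

#include <stdio.h>
#include <math.h>

static double f(double x) { return exp2(x) - 1.0; }   /* f(x) = 2^x - 1 */

int main(void)
{
	double x = 0.30, a = 0.25;

	printf("f(x+a) direct   : %.15f\n", f(x + a));
	printf("f(x+a) via shift: %.15f\n", exp2(a) * (f(x) + 1.0) - 1.0);

	printf("f(-x)  direct   : %.15f\n", f(-x));
	printf("f(-x)  via ident: %.15f\n", -f(x) / (f(x) + 1.0));
	return 0;
}
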
diff --git a/arch/x86/math-emu/poly_atan.c b/arch/x86/math-emu/poly_atan.c
new file mode 100644
index 000000000000..82f702952f69
--- /dev/null
+++ b/arch/x86/math-emu/poly_atan.c
@@ -0,0 +1,229 @@
1/*---------------------------------------------------------------------------+
2 | poly_atan.c |
3 | |
4 | Compute the arctan of a FPU_REG, using a polynomial approximation. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "exception.h"
14#include "reg_constant.h"
15#include "fpu_emu.h"
16#include "fpu_system.h"
17#include "status_w.h"
18#include "control_w.h"
19#include "poly.h"
20
21
22#define HIPOWERon 6 /* odd poly, negative terms */
23static const unsigned long long oddnegterms[HIPOWERon] =
24{
25 0x0000000000000000LL, /* Dummy (not for - 1.0) */
26 0x015328437f756467LL,
27 0x0005dda27b73dec6LL,
28 0x0000226bf2bfb91aLL,
29 0x000000ccc439c5f7LL,
30 0x0000000355438407LL
31} ;
32
33#define HIPOWERop 6 /* odd poly, positive terms */
34static const unsigned long long oddplterms[HIPOWERop] =
35{
36/* 0xaaaaaaaaaaaaaaabLL, transferred to fixedpterm[] */
37 0x0db55a71875c9ac2LL,
38 0x0029fce2d67880b0LL,
39 0x0000dfd3908b4596LL,
40 0x00000550fd61dab4LL,
41 0x0000001c9422b3f9LL,
42 0x000000003e3301e1LL
43};
44
45static const unsigned long long denomterm = 0xebd9b842c5c53a0eLL;
46
47static const Xsig fixedpterm = MK_XSIG(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa);
48
49static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b);
50
51
52/*--- poly_atan() -----------------------------------------------------------+
53 | |
54 +---------------------------------------------------------------------------*/
55void poly_atan(FPU_REG *st0_ptr, u_char st0_tag,
56 FPU_REG *st1_ptr, u_char st1_tag)
57{
58 u_char transformed, inverted,
59 sign1, sign2;
60 int exponent;
61 long int dummy_exp;
62 Xsig accumulator, Numer, Denom, accumulatore, argSignif,
63 argSq, argSqSq;
64 u_char tag;
65
66 sign1 = getsign(st0_ptr);
67 sign2 = getsign(st1_ptr);
68 if ( st0_tag == TAG_Valid )
69 {
70 exponent = exponent(st0_ptr);
71 }
72 else
73 {
74 /* This gives non-compatible stack contents... */
75 FPU_to_exp16(st0_ptr, st0_ptr);
76 exponent = exponent16(st0_ptr);
77 }
78 if ( st1_tag == TAG_Valid )
79 {
80 exponent -= exponent(st1_ptr);
81 }
82 else
83 {
84 /* This gives non-compatible stack contents... */
85 FPU_to_exp16(st1_ptr, st1_ptr);
86 exponent -= exponent16(st1_ptr);
87 }
88
89 if ( (exponent < 0) || ((exponent == 0) &&
90 ((st0_ptr->sigh < st1_ptr->sigh) ||
91 ((st0_ptr->sigh == st1_ptr->sigh) &&
92 (st0_ptr->sigl < st1_ptr->sigl))) ) )
93 {
94 inverted = 1;
95 Numer.lsw = Denom.lsw = 0;
96 XSIG_LL(Numer) = significand(st0_ptr);
97 XSIG_LL(Denom) = significand(st1_ptr);
98 }
99 else
100 {
101 inverted = 0;
102 exponent = -exponent;
103 Numer.lsw = Denom.lsw = 0;
104 XSIG_LL(Numer) = significand(st1_ptr);
105 XSIG_LL(Denom) = significand(st0_ptr);
106 }
107 div_Xsig(&Numer, &Denom, &argSignif);
108 exponent += norm_Xsig(&argSignif);
109
110 if ( (exponent >= -1)
111 || ((exponent == -2) && (argSignif.msw > 0xd413ccd0)) )
112 {
113 /* The argument is greater than sqrt(2)-1 (=0.414213562...) */
114 /* Convert the argument by an identity for atan */
115 transformed = 1;
116
117 if ( exponent >= 0 )
118 {
119#ifdef PARANOID
120 if ( !( (exponent == 0) &&
121 (argSignif.lsw == 0) && (argSignif.midw == 0) &&
122 (argSignif.msw == 0x80000000) ) )
123 {
124 EXCEPTION(EX_INTERNAL|0x104); /* There must be a logic error */
125 return;
126 }
127#endif /* PARANOID */
128 argSignif.msw = 0; /* Make the transformed arg -> 0.0 */
129 }
130 else
131 {
132 Numer.lsw = Denom.lsw = argSignif.lsw;
133 XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif);
134
135 if ( exponent < -1 )
136 shr_Xsig(&Numer, -1-exponent);
137 negate_Xsig(&Numer);
138
139 shr_Xsig(&Denom, -exponent);
140 Denom.msw |= 0x80000000;
141
142 div_Xsig(&Numer, &Denom, &argSignif);
143
144 exponent = -1 + norm_Xsig(&argSignif);
145 }
146 }
147 else
148 {
149 transformed = 0;
150 }
151
152 argSq.lsw = argSignif.lsw; argSq.midw = argSignif.midw;
153 argSq.msw = argSignif.msw;
154 mul_Xsig_Xsig(&argSq, &argSq);
155
156 argSqSq.lsw = argSq.lsw; argSqSq.midw = argSq.midw; argSqSq.msw = argSq.msw;
157 mul_Xsig_Xsig(&argSqSq, &argSqSq);
158
159 accumulatore.lsw = argSq.lsw;
160 XSIG_LL(accumulatore) = XSIG_LL(argSq);
161
162 shr_Xsig(&argSq, 2*(-1-exponent-1));
163 shr_Xsig(&argSqSq, 4*(-1-exponent-1));
164
165 /* Now have argSq etc with binary point at the left
166 .1xxxxxxxx */
167
168 /* Do the basic fixed point polynomial evaluation */
169 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
170 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq),
171 oddplterms, HIPOWERop-1);
172 mul64_Xsig(&accumulator, &XSIG_LL(argSq));
173 negate_Xsig(&accumulator);
174 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms, HIPOWERon-1);
175 negate_Xsig(&accumulator);
176 add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp);
177
178 mul64_Xsig(&accumulatore, &denomterm);
179 shr_Xsig(&accumulatore, 1 + 2*(-1-exponent));
180 accumulatore.msw |= 0x80000000;
181
182 div_Xsig(&accumulator, &accumulatore, &accumulator);
183
184 mul_Xsig_Xsig(&accumulator, &argSignif);
185 mul_Xsig_Xsig(&accumulator, &argSq);
186
187 shr_Xsig(&accumulator, 3);
188 negate_Xsig(&accumulator);
189 add_Xsig_Xsig(&accumulator, &argSignif);
190
191 if ( transformed )
192 {
193 /* compute pi/4 - accumulator */
194 shr_Xsig(&accumulator, -1-exponent);
195 negate_Xsig(&accumulator);
196 add_Xsig_Xsig(&accumulator, &pi_signif);
197 exponent = -1;
198 }
199
200 if ( inverted )
201 {
202 /* compute pi/2 - accumulator */
203 shr_Xsig(&accumulator, -exponent);
204 negate_Xsig(&accumulator);
205 add_Xsig_Xsig(&accumulator, &pi_signif);
206 exponent = 0;
207 }
208
209 if ( sign1 )
210 {
211 /* compute pi - accumulator */
212 shr_Xsig(&accumulator, 1 - exponent);
213 negate_Xsig(&accumulator);
214 add_Xsig_Xsig(&accumulator, &pi_signif);
215 exponent = 1;
216 }
217
218 exponent += round_Xsig(&accumulator);
219
220 significand(st1_ptr) = XSIG_LL(accumulator);
221 setexponent16(st1_ptr, exponent);
222
223 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2);
224 FPU_settagi(1, tag);
225
226 set_precision_flag_up(); /* We do not really know if up or down,
227 use this as the default. */
228
229}
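
poly_atan() above reduces its argument in two stages: the "inverted" branch works on the reciprocal ratio and later corrects with atan(r) = pi/2 - atan(1/r), and the "transformed" branch applies atan(t) = pi/4 - atan((1 - t)/(1 + t)) once t exceeds sqrt(2) - 1 (a further pi correction handles negative st(0)). A double-precision check of both identities, illustrative only since the emulator works in 96-bit fixed point:

#include <stdio.h>
#include <math.h>

int main(void)
{
	const double pi = acos(-1.0);
	double t = 0.9, r = 2.5;

	/* transformed branch: atan(t) = pi/4 - atan((1 - t)/(1 + t)) */
	printf("%.15f\n", atan(t));
	printf("%.15f\n", pi/4 - atan((1.0 - t)/(1.0 + t)));

	/* inverted branch: atan(r) = pi/2 - atan(1/r), r > 0 */
	printf("%.15f\n", atan(r));
	printf("%.15f\n", pi/2 - atan(1.0/r));
	return 0;
}
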
diff --git a/arch/x86/math-emu/poly_l2.c b/arch/x86/math-emu/poly_l2.c
new file mode 100644
index 000000000000..dd00e1d5b074
--- /dev/null
+++ b/arch/x86/math-emu/poly_l2.c
@@ -0,0 +1,272 @@
1/*---------------------------------------------------------------------------+
2 | poly_l2.c |
3 | |
4 | Compute the base 2 log of a FPU_REG, using a polynomial approximation. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13
14#include "exception.h"
15#include "reg_constant.h"
16#include "fpu_emu.h"
17#include "fpu_system.h"
18#include "control_w.h"
19#include "poly.h"
20
21
22static void log2_kernel(FPU_REG const *arg, u_char argsign,
23 Xsig *accum_result, long int *expon);
24
25
26/*--- poly_l2() -------------------------------------------------------------+
27 | Base 2 logarithm by a polynomial approximation. |
28 +---------------------------------------------------------------------------*/
29void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign)
30{
31 long int exponent, expon, expon_expon;
32 Xsig accumulator, expon_accum, yaccum;
33 u_char sign, argsign;
34 FPU_REG x;
35 int tag;
36
37 exponent = exponent16(st0_ptr);
38
39 /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */
40 if ( st0_ptr->sigh > (unsigned)0xb504f334 )
41 {
42 /* Treat as sqrt(2)/2 < st0_ptr < 1 */
43 significand(&x) = - significand(st0_ptr);
44 setexponent16(&x, -1);
45 exponent++;
46 argsign = SIGN_NEG;
47 }
48 else
49 {
50 /* Treat as 1 <= st0_ptr < sqrt(2) */
51 x.sigh = st0_ptr->sigh - 0x80000000;
52 x.sigl = st0_ptr->sigl;
53 setexponent16(&x, 0);
54 argsign = SIGN_POS;
55 }
56 tag = FPU_normalize_nuo(&x);
57
58 if ( tag == TAG_Zero )
59 {
60 expon = 0;
61 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
62 }
63 else
64 {
65 log2_kernel(&x, argsign, &accumulator, &expon);
66 }
67
68 if ( exponent < 0 )
69 {
70 sign = SIGN_NEG;
71 exponent = -exponent;
72 }
73 else
74 sign = SIGN_POS;
75 expon_accum.msw = exponent; expon_accum.midw = expon_accum.lsw = 0;
76 if ( exponent )
77 {
78 expon_expon = 31 + norm_Xsig(&expon_accum);
79 shr_Xsig(&accumulator, expon_expon - expon);
80
81 if ( sign ^ argsign )
82 negate_Xsig(&accumulator);
83 add_Xsig_Xsig(&accumulator, &expon_accum);
84 }
85 else
86 {
87 expon_expon = expon;
88 sign = argsign;
89 }
90
91 yaccum.lsw = 0; XSIG_LL(yaccum) = significand(st1_ptr);
92 mul_Xsig_Xsig(&accumulator, &yaccum);
93
94 expon_expon += round_Xsig(&accumulator);
95
96 if ( accumulator.msw == 0 )
97 {
98 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
99 return;
100 }
101
102 significand(st1_ptr) = XSIG_LL(accumulator);
103 setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1);
104
105 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign);
106 FPU_settagi(1, tag);
107
108 set_precision_flag_up(); /* 80486 appears to always do this */
109
110 return;
111
112}
113
114
115/*--- poly_l2p1() -----------------------------------------------------------+
116 | Base 2 logarithm by a polynomial approximation. |
117 | log2(x+1) |
118 +---------------------------------------------------------------------------*/
119int poly_l2p1(u_char sign0, u_char sign1,
120 FPU_REG *st0_ptr, FPU_REG *st1_ptr, FPU_REG *dest)
121{
122 u_char tag;
123 long int exponent;
124 Xsig accumulator, yaccum;
125
126 if ( exponent16(st0_ptr) < 0 )
127 {
128 log2_kernel(st0_ptr, sign0, &accumulator, &exponent);
129
130 yaccum.lsw = 0;
131 XSIG_LL(yaccum) = significand(st1_ptr);
132 mul_Xsig_Xsig(&accumulator, &yaccum);
133
134 exponent += round_Xsig(&accumulator);
135
136 exponent += exponent16(st1_ptr) + 1;
137 if ( exponent < EXP_WAY_UNDER ) exponent = EXP_WAY_UNDER;
138
139 significand(dest) = XSIG_LL(accumulator);
140 setexponent16(dest, exponent);
141
142 tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1);
143 FPU_settagi(1, tag);
144
145 if ( tag == TAG_Valid )
146 set_precision_flag_up(); /* 80486 appears to always do this */
147 }
148 else
149 {
150 /* The magnitude of st0_ptr is far too large. */
151
152 if ( sign0 != SIGN_POS )
153 {
154 /* Trying to get the log of a negative number. */
155#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
156 changesign(st1_ptr);
157#else
158 if ( arith_invalid(1) < 0 )
159 return 1;
160#endif /* PECULIAR_486 */
161 }
162
163 /* 80486 appears to do this */
164 if ( sign0 == SIGN_NEG )
165 set_precision_flag_down();
166 else
167 set_precision_flag_up();
168 }
169
170 if ( exponent(dest) <= EXP_UNDER )
171 EXCEPTION(EX_Underflow);
172
173 return 0;
174
175}
176
177
178
179
180#undef HIPOWER
181#define HIPOWER 10
182static const unsigned long long logterms[HIPOWER] =
183{
184 0x2a8eca5705fc2ef0LL,
185 0xf6384ee1d01febceLL,
186 0x093bb62877cdf642LL,
187 0x006985d8a9ec439bLL,
188 0x0005212c4f55a9c8LL,
189 0x00004326a16927f0LL,
190 0x0000038d1d80a0e7LL,
191 0x0000003141cc80c6LL,
192 0x00000002b1668c9fLL,
193 0x000000002c7a46aaLL
194};
195
196static const unsigned long leadterm = 0xb8000000;
197
198
199/*--- log2_kernel() ---------------------------------------------------------+
200 | Base 2 logarithm by a polynomial approximation. |
201 | log2(x+1) |
202 +---------------------------------------------------------------------------*/
203static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result,
204 long int *expon)
205{
206 long int exponent, adj;
207 unsigned long long Xsq;
208 Xsig accumulator, Numer, Denom, argSignif, arg_signif;
209
210 exponent = exponent16(arg);
211 Numer.lsw = Denom.lsw = 0;
212 XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg);
213 if ( argsign == SIGN_POS )
214 {
215 shr_Xsig(&Denom, 2 - (1 + exponent));
216 Denom.msw |= 0x80000000;
217 div_Xsig(&Numer, &Denom, &argSignif);
218 }
219 else
220 {
221 shr_Xsig(&Denom, 1 - (1 + exponent));
222 negate_Xsig(&Denom);
223 if ( Denom.msw & 0x80000000 )
224 {
225 div_Xsig(&Numer, &Denom, &argSignif);
226 exponent ++;
227 }
228 else
229 {
230 /* Denom must be 1.0 */
231 argSignif.lsw = Numer.lsw; argSignif.midw = Numer.midw;
232 argSignif.msw = Numer.msw;
233 }
234 }
235
236#ifndef PECULIAR_486
237 /* Should check here that |local_arg| is within the valid range */
238 if ( exponent >= -2 )
239 {
240 if ( (exponent > -2) ||
241 (argSignif.msw > (unsigned)0xafb0ccc0) )
242 {
243 /* The argument is too large */
244 }
245 }
246#endif /* PECULIAR_486 */
247
248 arg_signif.lsw = argSignif.lsw; XSIG_LL(arg_signif) = XSIG_LL(argSignif);
249 adj = norm_Xsig(&argSignif);
250 accumulator.lsw = argSignif.lsw; XSIG_LL(accumulator) = XSIG_LL(argSignif);
251 mul_Xsig_Xsig(&accumulator, &accumulator);
252 shr_Xsig(&accumulator, 2*(-1 - (1 + exponent + adj)));
253 Xsq = XSIG_LL(accumulator);
254 if ( accumulator.lsw & 0x80000000 )
255 Xsq++;
256
257 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
258 /* Do the basic fixed point polynomial evaluation */
259 polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER-1);
260
261 mul_Xsig_Xsig(&accumulator, &argSignif);
262 shr_Xsig(&accumulator, 6 - adj);
263
264 mul32_Xsig(&arg_signif, leadterm);
265 add_two_Xsig(&accumulator, &arg_signif, &exponent);
266
267 *expon = exponent + 1;
268 accum_result->lsw = accumulator.lsw;
269 accum_result->midw = accumulator.midw;
270 accum_result->msw = accumulator.msw;
271
272}
diff --git a/arch/x86/math-emu/poly_sin.c b/arch/x86/math-emu/poly_sin.c
new file mode 100644
index 000000000000..a36313fb06f1
--- /dev/null
+++ b/arch/x86/math-emu/poly_sin.c
@@ -0,0 +1,397 @@
1/*---------------------------------------------------------------------------+
2 | poly_sin.c |
3 | |
4 | Computation of an approximation of the sin function and the cosine |
5 | function by a polynomial. |
6 | |
7 | Copyright (C) 1992,1993,1994,1997,1999 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
9 | E-mail billm@melbpc.org.au |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14
15#include "exception.h"
16#include "reg_constant.h"
17#include "fpu_emu.h"
18#include "fpu_system.h"
19#include "control_w.h"
20#include "poly.h"
21
22
23#define N_COEFF_P 4
24#define N_COEFF_N 4
25
26static const unsigned long long pos_terms_l[N_COEFF_P] =
27{
28 0xaaaaaaaaaaaaaaabLL,
29 0x00d00d00d00cf906LL,
30 0x000006b99159a8bbLL,
31 0x000000000d7392e6LL
32};
33
34static const unsigned long long neg_terms_l[N_COEFF_N] =
35{
36 0x2222222222222167LL,
37 0x0002e3bc74aab624LL,
38 0x0000000b09229062LL,
39 0x00000000000c7973LL
40};
41
42
43
44#define N_COEFF_PH 4
45#define N_COEFF_NH 4
46static const unsigned long long pos_terms_h[N_COEFF_PH] =
47{
48 0x0000000000000000LL,
49 0x05b05b05b05b0406LL,
50 0x000049f93edd91a9LL,
51 0x00000000c9c9ed62LL
52};
53
54static const unsigned long long neg_terms_h[N_COEFF_NH] =
55{
56 0xaaaaaaaaaaaaaa98LL,
57 0x001a01a01a019064LL,
58 0x0000008f76c68a77LL,
59 0x0000000000d58f5eLL
60};
61
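/*
 * Illustrative sketch, not part of the original emulator: the tables
 * above hold (suitably scaled) magnitudes of the Taylor coefficients,
 * split into positive and negative groups so that the fixed-point code
 * below only ever adds unsigned quantities and applies the sign with a
 * single negate_Xsig().  The same splitting in plain floating point
 * (a hypothetical reference, truncated to the same number of terms):
 */
#if 0
static double sin_ref(double x)
{
	double x2 = x * x, x4 = x2 * x2;
	/* sin(x)/x = 1 - x^2/3! + x^4/5! - x^6/7! + x^8/9! */
	double pos = 1.0 + x4 * (1.0 / 120.0 + x4 / 362880.0);
	double neg = x2 * (1.0 / 6.0 + x4 / 5040.0);

	return x * (pos - neg);
}
#endif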
62
63/*--- poly_sine() -----------------------------------------------------------+
64 | |
65 +---------------------------------------------------------------------------*/
66void poly_sine(FPU_REG *st0_ptr)
67{
68 int exponent, echange;
69 Xsig accumulator, argSqrd, argTo4;
70 unsigned long fix_up, adj;
71 unsigned long long fixed_arg;
72 FPU_REG result;
73
74 exponent = exponent(st0_ptr);
75
76 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
77
 78 /* Split into two ranges, for smaller and larger arguments */
 79 /* The boundary between the two ranges is approx 0.88309101259 */
80 if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa)) )
81 {
82 /* The argument is <= 0.88309101259 */
83
84 argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; argSqrd.lsw = 0;
85 mul64_Xsig(&argSqrd, &significand(st0_ptr));
86 shr_Xsig(&argSqrd, 2*(-1-exponent));
87 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
88 argTo4.lsw = argSqrd.lsw;
89 mul_Xsig_Xsig(&argTo4, &argTo4);
90
91 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
92 N_COEFF_N-1);
93 mul_Xsig_Xsig(&accumulator, &argSqrd);
94 negate_Xsig(&accumulator);
95
96 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
97 N_COEFF_P-1);
98
99 shr_Xsig(&accumulator, 2); /* Divide by four */
100 accumulator.msw |= 0x80000000; /* Add 1.0 */
101
102 mul64_Xsig(&accumulator, &significand(st0_ptr));
103 mul64_Xsig(&accumulator, &significand(st0_ptr));
104 mul64_Xsig(&accumulator, &significand(st0_ptr));
105
106 /* Divide by four, FPU_REG compatible, etc */
107 exponent = 3*exponent;
108
109 /* The minimum exponent difference is 3 */
110 shr_Xsig(&accumulator, exponent(st0_ptr) - exponent);
111
112 negate_Xsig(&accumulator);
113 XSIG_LL(accumulator) += significand(st0_ptr);
114
115 echange = round_Xsig(&accumulator);
116
117 setexponentpos(&result, exponent(st0_ptr) + echange);
118 }
119 else
120 {
121 /* The argument is > 0.88309101259 */
122 /* We use sin(st(0)) = cos(pi/2-st(0)) */
123
124 fixed_arg = significand(st0_ptr);
125
126 if ( exponent == 0 )
127 {
128 /* The argument is >= 1.0 */
129
130 /* Put the binary point at the left. */
131 fixed_arg <<= 1;
132 }
133 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
134 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
 135 /* A special case arises here due to rounding; fix it up. */
136 if ( fixed_arg == 0xffffffffffffffffLL )
137 fixed_arg = 0;
138
139 XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0;
140 mul64_Xsig(&argSqrd, &fixed_arg);
141
142 XSIG_LL(argTo4) = XSIG_LL(argSqrd); argTo4.lsw = argSqrd.lsw;
143 mul_Xsig_Xsig(&argTo4, &argTo4);
144
145 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
146 N_COEFF_NH-1);
147 mul_Xsig_Xsig(&accumulator, &argSqrd);
148 negate_Xsig(&accumulator);
149
150 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
151 N_COEFF_PH-1);
152 negate_Xsig(&accumulator);
153
154 mul64_Xsig(&accumulator, &fixed_arg);
155 mul64_Xsig(&accumulator, &fixed_arg);
156
157 shr_Xsig(&accumulator, 3);
158 negate_Xsig(&accumulator);
159
160 add_Xsig_Xsig(&accumulator, &argSqrd);
161
162 shr_Xsig(&accumulator, 1);
163
164 accumulator.lsw |= 1; /* A zero accumulator here would cause problems */
165 negate_Xsig(&accumulator);
166
167 /* The basic computation is complete. Now fix the answer to
168 compensate for the error due to the approximation used for
169 pi/2
170 */
171
172 /* This has an exponent of -65 */
173 fix_up = 0x898cc517;
174 /* The fix-up needs to be improved for larger args */
175 if ( argSqrd.msw & 0xffc00000 )
176 {
177 /* Get about 32 bit precision in these: */
178 fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6;
179 }
180 fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg));
181
182 adj = accumulator.lsw; /* temp save */
183 accumulator.lsw -= fix_up;
184 if ( accumulator.lsw > adj )
185 XSIG_LL(accumulator) --;
186
187 echange = round_Xsig(&accumulator);
188
189 setexponentpos(&result, echange - 1);
190 }
191
192 significand(&result) = XSIG_LL(accumulator);
193 setsign(&result, getsign(st0_ptr));
194 FPU_copy_to_reg0(&result, TAG_Valid);
195
196#ifdef PARANOID
197 if ( (exponent(&result) >= 0)
198 && (significand(&result) > 0x8000000000000000LL) )
199 {
200 EXCEPTION(EX_INTERNAL|0x150);
201 }
202#endif /* PARANOID */
203
204}
205
206
207
208/*--- poly_cos() ------------------------------------------------------------+
209 | |
210 +---------------------------------------------------------------------------*/
211void poly_cos(FPU_REG *st0_ptr)
212{
213 FPU_REG result;
214 long int exponent, exp2, echange;
215 Xsig accumulator, argSqrd, fix_up, argTo4;
216 unsigned long long fixed_arg;
217
218#ifdef PARANOID
219 if ( (exponent(st0_ptr) > 0)
220 || ((exponent(st0_ptr) == 0)
221 && (significand(st0_ptr) > 0xc90fdaa22168c234LL)) )
222 {
223 EXCEPTION(EX_Invalid);
224 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
225 return;
226 }
227#endif /* PARANOID */
228
229 exponent = exponent(st0_ptr);
230
231 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
232
233 if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54)) )
234 {
235 /* arg is < 0.687705 */
236
237 argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl;
238 argSqrd.lsw = 0;
239 mul64_Xsig(&argSqrd, &significand(st0_ptr));
240
241 if ( exponent < -1 )
242 {
243 /* shift the argument right by the required places */
244 shr_Xsig(&argSqrd, 2*(-1-exponent));
245 }
246
247 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
248 argTo4.lsw = argSqrd.lsw;
249 mul_Xsig_Xsig(&argTo4, &argTo4);
250
251 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
252 N_COEFF_NH-1);
253 mul_Xsig_Xsig(&accumulator, &argSqrd);
254 negate_Xsig(&accumulator);
255
256 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
257 N_COEFF_PH-1);
258 negate_Xsig(&accumulator);
259
260 mul64_Xsig(&accumulator, &significand(st0_ptr));
261 mul64_Xsig(&accumulator, &significand(st0_ptr));
262 shr_Xsig(&accumulator, -2*(1+exponent));
263
264 shr_Xsig(&accumulator, 3);
265 negate_Xsig(&accumulator);
266
267 add_Xsig_Xsig(&accumulator, &argSqrd);
268
269 shr_Xsig(&accumulator, 1);
270
271 /* It doesn't matter if accumulator is all zero here, the
272 following code will work ok */
273 negate_Xsig(&accumulator);
274
275 if ( accumulator.lsw & 0x80000000 )
276 XSIG_LL(accumulator) ++;
277 if ( accumulator.msw == 0 )
278 {
279 /* The result is 1.0 */
280 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
281 return;
282 }
283 else
284 {
285 significand(&result) = XSIG_LL(accumulator);
286
 287 /* will be a valid positive number with exponent = -1 */
288 setexponentpos(&result, -1);
289 }
290 }
291 else
292 {
293 fixed_arg = significand(st0_ptr);
294
295 if ( exponent == 0 )
296 {
297 /* The argument is >= 1.0 */
298
299 /* Put the binary point at the left. */
300 fixed_arg <<= 1;
301 }
302 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
303 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
 304 /* A special case arises here due to rounding; fix it up. */
305 if ( fixed_arg == 0xffffffffffffffffLL )
306 fixed_arg = 0;
307
308 exponent = -1;
309 exp2 = -1;
310
311 /* A shift is needed here only for a narrow range of arguments,
312 i.e. for fixed_arg approx 2^-32, but we pick up more... */
313 if ( !(LL_MSW(fixed_arg) & 0xffff0000) )
314 {
315 fixed_arg <<= 16;
316 exponent -= 16;
317 exp2 -= 16;
318 }
319
320 XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0;
321 mul64_Xsig(&argSqrd, &fixed_arg);
322
323 if ( exponent < -1 )
324 {
325 /* shift the argument right by the required places */
326 shr_Xsig(&argSqrd, 2*(-1-exponent));
327 }
328
329 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
330 argTo4.lsw = argSqrd.lsw;
331 mul_Xsig_Xsig(&argTo4, &argTo4);
332
333 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
334 N_COEFF_N-1);
335 mul_Xsig_Xsig(&accumulator, &argSqrd);
336 negate_Xsig(&accumulator);
337
338 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
339 N_COEFF_P-1);
340
341 shr_Xsig(&accumulator, 2); /* Divide by four */
342 accumulator.msw |= 0x80000000; /* Add 1.0 */
343
344 mul64_Xsig(&accumulator, &fixed_arg);
345 mul64_Xsig(&accumulator, &fixed_arg);
346 mul64_Xsig(&accumulator, &fixed_arg);
347
348 /* Divide by four, FPU_REG compatible, etc */
349 exponent = 3*exponent;
350
351 /* The minimum exponent difference is 3 */
352 shr_Xsig(&accumulator, exp2 - exponent);
353
354 negate_Xsig(&accumulator);
355 XSIG_LL(accumulator) += fixed_arg;
356
357 /* The basic computation is complete. Now fix the answer to
358 compensate for the error due to the approximation used for
359 pi/2
360 */
361
362 /* This has an exponent of -65 */
363 XSIG_LL(fix_up) = 0x898cc51701b839a2ll;
364 fix_up.lsw = 0;
365
366 /* The fix-up needs to be improved for larger args */
367 if ( argSqrd.msw & 0xffc00000 )
368 {
369 /* Get about 32 bit precision in these: */
370 fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2;
371 fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24;
372 }
373
374 exp2 += norm_Xsig(&accumulator);
375 shr_Xsig(&accumulator, 1); /* Prevent overflow */
376 exp2++;
377 shr_Xsig(&fix_up, 65 + exp2);
378
379 add_Xsig_Xsig(&accumulator, &fix_up);
380
381 echange = round_Xsig(&accumulator);
382
383 setexponentpos(&result, exp2 + echange);
384 significand(&result) = XSIG_LL(accumulator);
385 }
386
387 FPU_copy_to_reg0(&result, TAG_Valid);
388
389#ifdef PARANOID
390 if ( (exponent(&result) >= 0)
391 && (significand(&result) > 0x8000000000000000LL) )
392 {
393 EXCEPTION(EX_INTERNAL|0x151);
394 }
395#endif /* PARANOID */
396
397}
diff --git a/arch/x86/math-emu/poly_tan.c b/arch/x86/math-emu/poly_tan.c
new file mode 100644
index 000000000000..8df3e03b6e6f
--- /dev/null
+++ b/arch/x86/math-emu/poly_tan.c
@@ -0,0 +1,222 @@
1/*---------------------------------------------------------------------------+
2 | poly_tan.c |
3 | |
 4 | Compute the tan of an FPU_REG, using a polynomial approximation.          |
5 | |
6 | Copyright (C) 1992,1993,1994,1997,1999 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@melbpc.org.au |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "exception.h"
14#include "reg_constant.h"
15#include "fpu_emu.h"
16#include "fpu_system.h"
17#include "control_w.h"
18#include "poly.h"
19
20
21#define HiPOWERop 3 /* odd poly, positive terms */
22static const unsigned long long oddplterm[HiPOWERop] =
23{
24 0x0000000000000000LL,
25 0x0051a1cf08fca228LL,
26 0x0000000071284ff7LL
27};
28
29#define HiPOWERon 2 /* odd poly, negative terms */
30static const unsigned long long oddnegterm[HiPOWERon] =
31{
32 0x1291a9a184244e80LL,
33 0x0000583245819c21LL
34};
35
36#define HiPOWERep 2 /* even poly, positive terms */
37static const unsigned long long evenplterm[HiPOWERep] =
38{
39 0x0e848884b539e888LL,
40 0x00003c7f18b887daLL
41};
42
43#define HiPOWERen 2 /* even poly, negative terms */
44static const unsigned long long evennegterm[HiPOWERen] =
45{
46 0xf1f0200fd51569ccLL,
47 0x003afb46105c4432LL
48};
49
50static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL;
51
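/*
 * Note (not stated in the original source): 'twothirds' is 2/3 scaled
 * by 2^64 (0xaaaa.../2^64 ~= 0.6667).  poly_tan() below builds
 * tan(arg) as arg plus a correction of roughly arg^3/3 times a ratio
 * of the odd and even polynomials above, and handles arguments larger
 * than about pi/4 through the identity
 *	tan(arg) = 1 / tan(pi/2 - arg).
 */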
52
53/*--- poly_tan() ------------------------------------------------------------+
54 | |
55 +---------------------------------------------------------------------------*/
56void poly_tan(FPU_REG *st0_ptr)
57{
58 long int exponent;
59 int invert;
60 Xsig argSq, argSqSq, accumulatoro, accumulatore, accum,
61 argSignif, fix_up;
62 unsigned long adj;
63
64 exponent = exponent(st0_ptr);
65
66#ifdef PARANOID
67 if ( signnegative(st0_ptr) ) /* Can't hack a number < 0.0 */
68 { arith_invalid(0); return; } /* Need a positive number */
69#endif /* PARANOID */
70
71 /* Split the problem into two domains, smaller and larger than pi/4 */
72 if ( (exponent == 0) || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2)) )
73 {
74 /* The argument is greater than (approx) pi/4 */
75 invert = 1;
76 accum.lsw = 0;
77 XSIG_LL(accum) = significand(st0_ptr);
78
79 if ( exponent == 0 )
80 {
81 /* The argument is >= 1.0 */
82 /* Put the binary point at the left. */
83 XSIG_LL(accum) <<= 1;
84 }
85 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
86 XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum);
87 /* This is a special case which arises due to rounding. */
88 if ( XSIG_LL(accum) == 0xffffffffffffffffLL )
89 {
90 FPU_settag0(TAG_Valid);
91 significand(st0_ptr) = 0x8a51e04daabda360LL;
92 setexponent16(st0_ptr, (0x41 + EXTENDED_Ebias) | SIGN_Negative);
93 return;
94 }
95
96 argSignif.lsw = accum.lsw;
97 XSIG_LL(argSignif) = XSIG_LL(accum);
98 exponent = -1 + norm_Xsig(&argSignif);
99 }
100 else
101 {
102 invert = 0;
103 argSignif.lsw = 0;
104 XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr);
105
106 if ( exponent < -1 )
107 {
108 /* shift the argument right by the required places */
109 if ( FPU_shrx(&XSIG_LL(accum), -1-exponent) >= 0x80000000U )
110 XSIG_LL(accum) ++; /* round up */
111 }
112 }
113
114 XSIG_LL(argSq) = XSIG_LL(accum); argSq.lsw = accum.lsw;
115 mul_Xsig_Xsig(&argSq, &argSq);
116 XSIG_LL(argSqSq) = XSIG_LL(argSq); argSqSq.lsw = argSq.lsw;
117 mul_Xsig_Xsig(&argSqSq, &argSqSq);
118
119 /* Compute the negative terms for the numerator polynomial */
120 accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0;
121 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm, HiPOWERon-1);
122 mul_Xsig_Xsig(&accumulatoro, &argSq);
123 negate_Xsig(&accumulatoro);
124 /* Add the positive terms */
125 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm, HiPOWERop-1);
126
127
128 /* Compute the positive terms for the denominator polynomial */
129 accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0;
130 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm, HiPOWERep-1);
131 mul_Xsig_Xsig(&accumulatore, &argSq);
132 negate_Xsig(&accumulatore);
133 /* Add the negative terms */
134 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm, HiPOWERen-1);
135 /* Multiply by arg^2 */
136 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
137 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
138 /* de-normalize and divide by 2 */
139 shr_Xsig(&accumulatore, -2*(1+exponent) + 1);
140 negate_Xsig(&accumulatore); /* This does 1 - accumulator */
141
142 /* Now find the ratio. */
143 if ( accumulatore.msw == 0 )
144 {
145 /* accumulatoro must contain 1.0 here, (actually, 0) but it
146 really doesn't matter what value we use because it will
147 have negligible effect in later calculations
148 */
149 XSIG_LL(accum) = 0x8000000000000000LL;
150 accum.lsw = 0;
151 }
152 else
153 {
154 div_Xsig(&accumulatoro, &accumulatore, &accum);
155 }
156
157 /* Multiply by 1/3 * arg^3 */
158 mul64_Xsig(&accum, &XSIG_LL(argSignif));
159 mul64_Xsig(&accum, &XSIG_LL(argSignif));
160 mul64_Xsig(&accum, &XSIG_LL(argSignif));
161 mul64_Xsig(&accum, &twothirds);
162 shr_Xsig(&accum, -2*(exponent+1));
163
164 /* tan(arg) = arg + accum */
165 add_two_Xsig(&accum, &argSignif, &exponent);
166
167 if ( invert )
168 {
169 /* We now have the value of tan(pi_2 - arg) where pi_2 is an
170 approximation for pi/2
171 */
172 /* The next step is to fix the answer to compensate for the
173 error due to the approximation used for pi/2
174 */
175
176 /* This is (approx) delta, the error in our approx for pi/2
177 (see above). It has an exponent of -65
178 */
179 XSIG_LL(fix_up) = 0x898cc51701b839a2LL;
180 fix_up.lsw = 0;
181
182 if ( exponent == 0 )
183 adj = 0xffffffff; /* We want approx 1.0 here, but
184 this is close enough. */
185 else if ( exponent > -30 )
186 {
187 adj = accum.msw >> -(exponent+1); /* tan */
188 adj = mul_32_32(adj, adj); /* tan^2 */
189 }
190 else
191 adj = 0;
192 adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */
193
194 fix_up.msw += adj;
195 if ( !(fix_up.msw & 0x80000000) ) /* did fix_up overflow ? */
196 {
197 /* Yes, we need to add an msb */
198 shr_Xsig(&fix_up, 1);
199 fix_up.msw |= 0x80000000;
200 shr_Xsig(&fix_up, 64 + exponent);
201 }
202 else
203 shr_Xsig(&fix_up, 65 + exponent);
204
205 add_two_Xsig(&accum, &fix_up, &exponent);
206
207 /* accum now contains tan(pi/2 - arg).
208 Use tan(arg) = 1.0 / tan(pi/2 - arg)
209 */
210 accumulatoro.lsw = accumulatoro.midw = 0;
211 accumulatoro.msw = 0x80000000;
212 div_Xsig(&accumulatoro, &accum, &accum);
213 exponent = - exponent - 1;
214 }
215
216 /* Transfer the result */
217 round_Xsig(&accum);
218 FPU_settag0(TAG_Valid);
219 significand(st0_ptr) = XSIG_LL(accum);
220 setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */
221
222}
diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S
new file mode 100644
index 000000000000..17315c89ff3d
--- /dev/null
+++ b/arch/x86/math-emu/polynom_Xsig.S
@@ -0,0 +1,135 @@
1/*---------------------------------------------------------------------------+
2 | polynomial_Xsig.S |
3 | |
4 | Fixed point arithmetic polynomial evaluation. |
5 | |
6 | Copyright (C) 1992,1993,1994,1995 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
9 | |
10 | Call from C as: |
 11 | void polynomial_Xsig(Xsig *accum, const unsigned long long *x,           |
12 | unsigned long long terms[], int n) |
13 | |
14 | Computes: |
 15 | terms[0] + (terms[1] + (terms[2] + ... + terms[n]*x)*x)*x                |
16 | and adds the result to the 12 byte Xsig. |
17 | The terms[] are each 8 bytes, but all computation is performed to 12 byte |
18 | precision. |
19 | |
20 | This function must be used carefully: most overflow of intermediate |
21 | results is controlled, but overflow of the result is not. |
22 | |
23 +---------------------------------------------------------------------------*/
24 .file "polynomial_Xsig.S"
25
26#include "fpu_emu.h"
27
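/*
 * Illustrative C model of the evaluation described above, not part of
 * the original emulator (the assembly below works in 96-bit fixed
 * point).  The callers pass the index of the highest coefficient
 * (HIPOWER-1 and the like), so 'n' is used that way here:
 *
 *	static double polynomial_ref(double accum, double x,
 *				     const double terms[], int n)
 *	{
 *		double sum = terms[n];
 *		int i;
 *
 *		for (i = n - 1; i >= 0; i--)
 *			sum = sum * x + terms[i];
 *		return accum + sum;
 *	}
 */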
28
29#define TERM_SIZE $8
30#define SUM_MS -20(%ebp) /* sum ms long */
31#define SUM_MIDDLE -24(%ebp) /* sum middle long */
32#define SUM_LS -28(%ebp) /* sum ls long */
33#define ACCUM_MS -4(%ebp) /* accum ms long */
34#define ACCUM_MIDDLE -8(%ebp) /* accum middle long */
35#define ACCUM_LS -12(%ebp) /* accum ls long */
36#define OVERFLOWED -16(%ebp) /* addition overflow flag */
37
38.text
39ENTRY(polynomial_Xsig)
40 pushl %ebp
41 movl %esp,%ebp
42 subl $32,%esp
43 pushl %esi
44 pushl %edi
45 pushl %ebx
46
47 movl PARAM2,%esi /* x */
48 movl PARAM3,%edi /* terms */
49
50 movl TERM_SIZE,%eax
51 mull PARAM4 /* n */
52 addl %eax,%edi
53
 54 movl 4(%edi),%edx /* terms[n] ms long */
55 movl %edx,SUM_MS
 56 movl (%edi),%edx /* terms[n] ls long */
57 movl %edx,SUM_MIDDLE
58 xor %eax,%eax
59 movl %eax,SUM_LS
60 movb %al,OVERFLOWED
61
62 subl TERM_SIZE,%edi
63 decl PARAM4
64 js L_accum_done
65
66L_accum_loop:
67 xor %eax,%eax
68 movl %eax,ACCUM_MS
69 movl %eax,ACCUM_MIDDLE
70
71 movl SUM_MIDDLE,%eax
72 mull (%esi) /* x ls long */
73 movl %edx,ACCUM_LS
74
75 movl SUM_MIDDLE,%eax
76 mull 4(%esi) /* x ms long */
77 addl %eax,ACCUM_LS
78 adcl %edx,ACCUM_MIDDLE
79 adcl $0,ACCUM_MS
80
81 movl SUM_MS,%eax
82 mull (%esi) /* x ls long */
83 addl %eax,ACCUM_LS
84 adcl %edx,ACCUM_MIDDLE
85 adcl $0,ACCUM_MS
86
87 movl SUM_MS,%eax
88 mull 4(%esi) /* x ms long */
89 addl %eax,ACCUM_MIDDLE
90 adcl %edx,ACCUM_MS
91
92 testb $0xff,OVERFLOWED
93 jz L_no_overflow
94
95 movl (%esi),%eax
96 addl %eax,ACCUM_MIDDLE
97 movl 4(%esi),%eax
98 adcl %eax,ACCUM_MS /* This could overflow too */
99
100L_no_overflow:
101
102/*
103 * Now put the sum of next term and the accumulator
104 * into the sum register
105 */
106 movl ACCUM_LS,%eax
107 addl (%edi),%eax /* term ls long */
108 movl %eax,SUM_LS
109 movl ACCUM_MIDDLE,%eax
110 adcl (%edi),%eax /* term ls long */
111 movl %eax,SUM_MIDDLE
112 movl ACCUM_MS,%eax
113 adcl 4(%edi),%eax /* term ms long */
114 movl %eax,SUM_MS
115 sbbb %al,%al
116 movb %al,OVERFLOWED /* Used in the next iteration */
117
118 subl TERM_SIZE,%edi
119 decl PARAM4
120 jns L_accum_loop
121
122L_accum_done:
123 movl PARAM1,%edi /* accum */
124 movl SUM_LS,%eax
125 addl %eax,(%edi)
126 movl SUM_MIDDLE,%eax
127 adcl %eax,4(%edi)
128 movl SUM_MS,%eax
129 adcl %eax,8(%edi)
130
131 popl %ebx
132 popl %edi
133 popl %esi
134 leave
135 ret
diff --git a/arch/x86/math-emu/reg_add_sub.c b/arch/x86/math-emu/reg_add_sub.c
new file mode 100644
index 000000000000..7cd3b37ac084
--- /dev/null
+++ b/arch/x86/math-emu/reg_add_sub.c
@@ -0,0 +1,374 @@
1/*---------------------------------------------------------------------------+
2 | reg_add_sub.c |
3 | |
4 | Functions to add or subtract two registers and put the result in a third. |
5 | |
6 | Copyright (C) 1992,1993,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13/*---------------------------------------------------------------------------+
14 | For each function, the destination may be any FPU_REG, including one of |
15 | the source FPU_REGs. |
 16 | Each function returns the tag of the answer, or-ed with FPU_Exception    |
 17 | if an exception was raised, or a negative value if an internal error     |
 18 | occurred.                                                                 |
19 +---------------------------------------------------------------------------*/
20
21#include "exception.h"
22#include "reg_constant.h"
23#include "fpu_emu.h"
24#include "control_w.h"
25#include "fpu_system.h"
26
27static
28int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
29 FPU_REG const *b, u_char tagb, u_char signb,
30 FPU_REG *dest, int deststnr, int control_w);
31
32/*
33 Operates on st(0) and st(n), or on st(0) and temporary data.
34 The destination must be one of the source st(x).
35 */
36int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
37{
38 FPU_REG *a = &st(0);
39 FPU_REG *dest = &st(deststnr);
40 u_char signb = getsign(b);
41 u_char taga = FPU_gettag0();
42 u_char signa = getsign(a);
43 u_char saved_sign = getsign(dest);
44 int diff, tag, expa, expb;
45
46 if ( !(taga | tagb) )
47 {
48 expa = exponent(a);
49 expb = exponent(b);
50
51 valid_add:
52 /* Both registers are valid */
53 if (!(signa ^ signb))
54 {
55 /* signs are the same */
56 tag = FPU_u_add(a, b, dest, control_w, signa, expa, expb);
57 }
58 else
59 {
60 /* The signs are different, so do a subtraction */
61 diff = expa - expb;
62 if (!diff)
63 {
64 diff = a->sigh - b->sigh; /* This works only if the ms bits
65 are identical. */
66 if (!diff)
67 {
68 diff = a->sigl > b->sigl;
69 if (!diff)
70 diff = -(a->sigl < b->sigl);
71 }
72 }
73
74 if (diff > 0)
75 {
76 tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb);
77 }
78 else if ( diff < 0 )
79 {
80 tag = FPU_u_sub(b, a, dest, control_w, signb, expb, expa);
81 }
82 else
83 {
84 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
85 /* sign depends upon rounding mode */
86 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
87 ? SIGN_POS : SIGN_NEG);
88 return TAG_Zero;
89 }
90 }
91
92 if ( tag < 0 )
93 {
94 setsign(dest, saved_sign);
95 return tag;
96 }
97 FPU_settagi(deststnr, tag);
98 return tag;
99 }
100
101 if ( taga == TAG_Special )
102 taga = FPU_Special(a);
103 if ( tagb == TAG_Special )
104 tagb = FPU_Special(b);
105
106 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
107 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
108 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
109 {
110 FPU_REG x, y;
111
112 if ( denormal_operand() < 0 )
113 return FPU_Exception;
114
115 FPU_to_exp16(a, &x);
116 FPU_to_exp16(b, &y);
117 a = &x;
118 b = &y;
119 expa = exponent16(a);
120 expb = exponent16(b);
121 goto valid_add;
122 }
123
124 if ( (taga == TW_NaN) || (tagb == TW_NaN) )
125 {
126 if ( deststnr == 0 )
127 return real_2op_NaN(b, tagb, deststnr, a);
128 else
129 return real_2op_NaN(a, taga, deststnr, a);
130 }
131
132 return add_sub_specials(a, taga, signa, b, tagb, signb,
133 dest, deststnr, control_w);
134}
135
136
137/* Subtract b from a. (a-b) -> dest */
138int FPU_sub(int flags, int rm, int control_w)
139{
140 FPU_REG const *a, *b;
141 FPU_REG *dest;
142 u_char taga, tagb, signa, signb, saved_sign, sign;
143 int diff, tag = 0, expa, expb, deststnr;
144
145 a = &st(0);
146 taga = FPU_gettag0();
147
148 deststnr = 0;
149 if ( flags & LOADED )
150 {
151 b = (FPU_REG *)rm;
152 tagb = flags & 0x0f;
153 }
154 else
155 {
156 b = &st(rm);
157 tagb = FPU_gettagi(rm);
158
159 if ( flags & DEST_RM )
160 deststnr = rm;
161 }
162
163 signa = getsign(a);
164 signb = getsign(b);
165
166 if ( flags & REV )
167 {
168 signa ^= SIGN_NEG;
169 signb ^= SIGN_NEG;
170 }
171
172 dest = &st(deststnr);
173 saved_sign = getsign(dest);
174
175 if ( !(taga | tagb) )
176 {
177 expa = exponent(a);
178 expb = exponent(b);
179
180 valid_subtract:
181 /* Both registers are valid */
182
183 diff = expa - expb;
184
185 if (!diff)
186 {
187 diff = a->sigh - b->sigh; /* Works only if ms bits are identical */
188 if (!diff)
189 {
190 diff = a->sigl > b->sigl;
191 if (!diff)
192 diff = -(a->sigl < b->sigl);
193 }
194 }
195
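	/* A note on the expression below (assuming SIGN_POS is 0, as the
	   sign arithmetic in this file requires): signa and signb are each
	   either SIGN_POS or SIGN_NEG, so dividing their weighted sum by
	   SIGN_NEG yields 0 for P-P, 1 for P-N, 2 for N-P and 3 for N-N. */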
196 switch ( (((int)signa)*2 + signb) / SIGN_NEG )
197 {
198 case 0: /* P - P */
199 case 3: /* N - N */
200 if (diff > 0)
201 {
202 /* |a| > |b| */
203 tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb);
204 }
205 else if ( diff == 0 )
206 {
207 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
208
209 /* sign depends upon rounding mode */
210 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
211 ? SIGN_POS : SIGN_NEG);
212 return TAG_Zero;
213 }
214 else
215 {
216 sign = signa ^ SIGN_NEG;
217 tag = FPU_u_sub(b, a, dest, control_w, sign, expb, expa);
218 }
219 break;
220 case 1: /* P - N */
221 tag = FPU_u_add(a, b, dest, control_w, SIGN_POS, expa, expb);
222 break;
223 case 2: /* N - P */
224 tag = FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa, expb);
225 break;
226#ifdef PARANOID
227 default:
228 EXCEPTION(EX_INTERNAL|0x111);
229 return -1;
230#endif
231 }
232 if ( tag < 0 )
233 {
234 setsign(dest, saved_sign);
235 return tag;
236 }
237 FPU_settagi(deststnr, tag);
238 return tag;
239 }
240
241 if ( taga == TAG_Special )
242 taga = FPU_Special(a);
243 if ( tagb == TAG_Special )
244 tagb = FPU_Special(b);
245
246 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
247 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
248 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
249 {
250 FPU_REG x, y;
251
252 if ( denormal_operand() < 0 )
253 return FPU_Exception;
254
255 FPU_to_exp16(a, &x);
256 FPU_to_exp16(b, &y);
257 a = &x;
258 b = &y;
259 expa = exponent16(a);
260 expb = exponent16(b);
261
262 goto valid_subtract;
263 }
264
265 if ( (taga == TW_NaN) || (tagb == TW_NaN) )
266 {
267 FPU_REG const *d1, *d2;
268 if ( flags & REV )
269 {
270 d1 = b;
271 d2 = a;
272 }
273 else
274 {
275 d1 = a;
276 d2 = b;
277 }
278 if ( flags & LOADED )
279 return real_2op_NaN(b, tagb, deststnr, d1);
280 if ( flags & DEST_RM )
281 return real_2op_NaN(a, taga, deststnr, d2);
282 else
283 return real_2op_NaN(b, tagb, deststnr, d2);
284 }
285
286 return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG,
287 dest, deststnr, control_w);
288}
289
290
291static
292int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
293 FPU_REG const *b, u_char tagb, u_char signb,
294 FPU_REG *dest, int deststnr, int control_w)
295{
296 if ( ((taga == TW_Denormal) || (tagb == TW_Denormal))
297 && (denormal_operand() < 0) )
298 return FPU_Exception;
299
300 if (taga == TAG_Zero)
301 {
302 if (tagb == TAG_Zero)
303 {
304 /* Both are zero, result will be zero. */
305 u_char different_signs = signa ^ signb;
306
307 FPU_copy_to_regi(a, TAG_Zero, deststnr);
308 if ( different_signs )
309 {
310 /* Signs are different. */
311 /* Sign of answer depends upon rounding mode. */
312 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
313 ? SIGN_POS : SIGN_NEG);
314 }
315 else
316 setsign(dest, signa); /* signa may differ from the sign of a. */
317 return TAG_Zero;
318 }
319 else
320 {
321 reg_copy(b, dest);
322 if ( (tagb == TW_Denormal) && (b->sigh & 0x80000000) )
323 {
324 /* A pseudoDenormal, convert it. */
325 addexponent(dest, 1);
326 tagb = TAG_Valid;
327 }
328 else if ( tagb > TAG_Empty )
329 tagb = TAG_Special;
330 setsign(dest, signb); /* signb may differ from the sign of b. */
331 FPU_settagi(deststnr, tagb);
332 return tagb;
333 }
334 }
335 else if (tagb == TAG_Zero)
336 {
337 reg_copy(a, dest);
338 if ( (taga == TW_Denormal) && (a->sigh & 0x80000000) )
339 {
340 /* A pseudoDenormal */
341 addexponent(dest, 1);
342 taga = TAG_Valid;
343 }
344 else if ( taga > TAG_Empty )
345 taga = TAG_Special;
346 setsign(dest, signa); /* signa may differ from the sign of a. */
347 FPU_settagi(deststnr, taga);
348 return taga;
349 }
350 else if (taga == TW_Infinity)
351 {
352 if ( (tagb != TW_Infinity) || (signa == signb) )
353 {
354 FPU_copy_to_regi(a, TAG_Special, deststnr);
355 setsign(dest, signa); /* signa may differ from the sign of a. */
356 return taga;
357 }
358 /* Infinity-Infinity is undefined. */
359 return arith_invalid(deststnr);
360 }
361 else if (tagb == TW_Infinity)
362 {
363 FPU_copy_to_regi(b, TAG_Special, deststnr);
364 setsign(dest, signb); /* signb may differ from the sign of b. */
365 return tagb;
366 }
367
368#ifdef PARANOID
369 EXCEPTION(EX_INTERNAL|0x101);
370#endif
371
372 return FPU_Exception;
373}
374
diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c
new file mode 100644
index 000000000000..f37c5b5a35ad
--- /dev/null
+++ b/arch/x86/math-emu/reg_compare.c
@@ -0,0 +1,381 @@
1/*---------------------------------------------------------------------------+
2 | reg_compare.c |
3 | |
4 | Compare two floating point registers |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13/*---------------------------------------------------------------------------+
14 | compare() is the core FPU_REG comparison function |
15 +---------------------------------------------------------------------------*/
16
17#include "fpu_system.h"
18#include "exception.h"
19#include "fpu_emu.h"
20#include "control_w.h"
21#include "status_w.h"
22
23
24static int compare(FPU_REG const *b, int tagb)
25{
26 int diff, exp0, expb;
27 u_char st0_tag;
28 FPU_REG *st0_ptr;
29 FPU_REG x, y;
30 u_char st0_sign, signb = getsign(b);
31
32 st0_ptr = &st(0);
33 st0_tag = FPU_gettag0();
34 st0_sign = getsign(st0_ptr);
35
36 if ( tagb == TAG_Special )
37 tagb = FPU_Special(b);
38 if ( st0_tag == TAG_Special )
39 st0_tag = FPU_Special(st0_ptr);
40
41 if ( ((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal))
42 || ((tagb != TAG_Valid) && (tagb != TW_Denormal)) )
43 {
44 if ( st0_tag == TAG_Zero )
45 {
46 if ( tagb == TAG_Zero ) return COMP_A_eq_B;
47 if ( tagb == TAG_Valid )
48 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
49 if ( tagb == TW_Denormal )
50 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
51 | COMP_Denormal;
52 }
53 else if ( tagb == TAG_Zero )
54 {
55 if ( st0_tag == TAG_Valid )
56 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
57 if ( st0_tag == TW_Denormal )
58 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
59 | COMP_Denormal;
60 }
61
62 if ( st0_tag == TW_Infinity )
63 {
64 if ( (tagb == TAG_Valid) || (tagb == TAG_Zero) )
65 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
66 else if ( tagb == TW_Denormal )
67 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
68 | COMP_Denormal;
69 else if ( tagb == TW_Infinity )
70 {
71 /* The 80486 book says that infinities can be equal! */
72 return (st0_sign == signb) ? COMP_A_eq_B :
73 ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
74 }
75 /* Fall through to the NaN code */
76 }
77 else if ( tagb == TW_Infinity )
78 {
79 if ( (st0_tag == TAG_Valid) || (st0_tag == TAG_Zero) )
80 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
81 if ( st0_tag == TW_Denormal )
82 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
83 | COMP_Denormal;
84 /* Fall through to the NaN code */
85 }
86
87 /* The only possibility now should be that one of the arguments
88 is a NaN */
89 if ( (st0_tag == TW_NaN) || (tagb == TW_NaN) )
90 {
91 int signalling = 0, unsupported = 0;
92 if ( st0_tag == TW_NaN )
93 {
94 signalling = (st0_ptr->sigh & 0xc0000000) == 0x80000000;
95 unsupported = !((exponent(st0_ptr) == EXP_OVER)
96 && (st0_ptr->sigh & 0x80000000));
97 }
98 if ( tagb == TW_NaN )
99 {
100 signalling |= (b->sigh & 0xc0000000) == 0x80000000;
101 unsupported |= !((exponent(b) == EXP_OVER)
102 && (b->sigh & 0x80000000));
103 }
104 if ( signalling || unsupported )
105 return COMP_No_Comp | COMP_SNaN | COMP_NaN;
106 else
107 /* Neither is a signaling NaN */
108 return COMP_No_Comp | COMP_NaN;
109 }
110
111 EXCEPTION(EX_Invalid);
112 }
113
114 if (st0_sign != signb)
115 {
116 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
117 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
118 COMP_Denormal : 0);
119 }
120
121 if ( (st0_tag == TW_Denormal) || (tagb == TW_Denormal) )
122 {
123 FPU_to_exp16(st0_ptr, &x);
124 FPU_to_exp16(b, &y);
125 st0_ptr = &x;
126 b = &y;
127 exp0 = exponent16(st0_ptr);
128 expb = exponent16(b);
129 }
130 else
131 {
132 exp0 = exponent(st0_ptr);
133 expb = exponent(b);
134 }
135
136#ifdef PARANOID
137 if (!(st0_ptr->sigh & 0x80000000)) EXCEPTION(EX_Invalid);
138 if (!(b->sigh & 0x80000000)) EXCEPTION(EX_Invalid);
139#endif /* PARANOID */
140
141 diff = exp0 - expb;
142 if ( diff == 0 )
143 {
144 diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are
145 identical */
146 if ( diff == 0 )
147 {
148 diff = st0_ptr->sigl > b->sigl;
149 if ( diff == 0 )
150 diff = -(st0_ptr->sigl < b->sigl);
151 }
152 }
153
154 if ( diff > 0 )
155 {
156 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
157 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
158 COMP_Denormal : 0);
159 }
160 if ( diff < 0 )
161 {
162 return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
163 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
164 COMP_Denormal : 0);
165 }
166
167 return COMP_A_eq_B
168 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
169 COMP_Denormal : 0);
170
171}
172
173
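/*
 * For reference (standard 80x87 behaviour, not restated in the original
 * file): the functions below report the comparison in the C3, C2 and C0
 * bits of the status word,
 *	C3 C2 C0 = 0 0 0   ST(0) > source
 *	           0 0 1   ST(0) < source
 *	           1 0 0   ST(0) = source
 *	           1 1 1   unordered (at least one NaN)
 * which is what the setcc() calls encode via SW_C3/SW_C2/SW_C0.
 */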
174/* This function requires that st(0) is not empty */
175int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag)
176{
177 int f = 0, c;
178
179 c = compare(loaded_data, loaded_tag);
180
181 if (c & COMP_NaN)
182 {
183 EXCEPTION(EX_Invalid);
184 f = SW_C3 | SW_C2 | SW_C0;
185 }
186 else
187 switch (c & 7)
188 {
189 case COMP_A_lt_B:
190 f = SW_C0;
191 break;
192 case COMP_A_eq_B:
193 f = SW_C3;
194 break;
195 case COMP_A_gt_B:
196 f = 0;
197 break;
198 case COMP_No_Comp:
199 f = SW_C3 | SW_C2 | SW_C0;
200 break;
201#ifdef PARANOID
202 default:
203 EXCEPTION(EX_INTERNAL|0x121);
204 f = SW_C3 | SW_C2 | SW_C0;
205 break;
206#endif /* PARANOID */
207 }
208 setcc(f);
209 if (c & COMP_Denormal)
210 {
211 return denormal_operand() < 0;
212 }
213 return 0;
214}
215
216
217static int compare_st_st(int nr)
218{
219 int f = 0, c;
220 FPU_REG *st_ptr;
221
222 if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) )
223 {
224 setcc(SW_C3 | SW_C2 | SW_C0);
225 /* Stack fault */
226 EXCEPTION(EX_StackUnder);
227 return !(control_word & CW_Invalid);
228 }
229
230 st_ptr = &st(nr);
231 c = compare(st_ptr, FPU_gettagi(nr));
232 if (c & COMP_NaN)
233 {
234 setcc(SW_C3 | SW_C2 | SW_C0);
235 EXCEPTION(EX_Invalid);
236 return !(control_word & CW_Invalid);
237 }
238 else
239 switch (c & 7)
240 {
241 case COMP_A_lt_B:
242 f = SW_C0;
243 break;
244 case COMP_A_eq_B:
245 f = SW_C3;
246 break;
247 case COMP_A_gt_B:
248 f = 0;
249 break;
250 case COMP_No_Comp:
251 f = SW_C3 | SW_C2 | SW_C0;
252 break;
253#ifdef PARANOID
254 default:
255 EXCEPTION(EX_INTERNAL|0x122);
256 f = SW_C3 | SW_C2 | SW_C0;
257 break;
258#endif /* PARANOID */
259 }
260 setcc(f);
261 if (c & COMP_Denormal)
262 {
263 return denormal_operand() < 0;
264 }
265 return 0;
266}
267
268
269static int compare_u_st_st(int nr)
270{
271 int f = 0, c;
272 FPU_REG *st_ptr;
273
274 if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) )
275 {
276 setcc(SW_C3 | SW_C2 | SW_C0);
277 /* Stack fault */
278 EXCEPTION(EX_StackUnder);
279 return !(control_word & CW_Invalid);
280 }
281
282 st_ptr = &st(nr);
283 c = compare(st_ptr, FPU_gettagi(nr));
284 if (c & COMP_NaN)
285 {
286 setcc(SW_C3 | SW_C2 | SW_C0);
287 if (c & COMP_SNaN) /* This is the only difference between
288 un-ordered and ordinary comparisons */
289 {
290 EXCEPTION(EX_Invalid);
291 return !(control_word & CW_Invalid);
292 }
293 return 0;
294 }
295 else
296 switch (c & 7)
297 {
298 case COMP_A_lt_B:
299 f = SW_C0;
300 break;
301 case COMP_A_eq_B:
302 f = SW_C3;
303 break;
304 case COMP_A_gt_B:
305 f = 0;
306 break;
307 case COMP_No_Comp:
308 f = SW_C3 | SW_C2 | SW_C0;
309 break;
310#ifdef PARANOID
311 default:
312 EXCEPTION(EX_INTERNAL|0x123);
313 f = SW_C3 | SW_C2 | SW_C0;
314 break;
315#endif /* PARANOID */
316 }
317 setcc(f);
318 if (c & COMP_Denormal)
319 {
320 return denormal_operand() < 0;
321 }
322 return 0;
323}
324
325/*---------------------------------------------------------------------------*/
326
327void fcom_st(void)
328{
329 /* fcom st(i) */
330 compare_st_st(FPU_rm);
331}
332
333
334void fcompst(void)
335{
336 /* fcomp st(i) */
337 if ( !compare_st_st(FPU_rm) )
338 FPU_pop();
339}
340
341
342void fcompp(void)
343{
344 /* fcompp */
345 if (FPU_rm != 1)
346 {
347 FPU_illegal();
348 return;
349 }
350 if ( !compare_st_st(1) )
351 poppop();
352}
353
354
355void fucom_(void)
356{
357 /* fucom st(i) */
358 compare_u_st_st(FPU_rm);
359
360}
361
362
363void fucomp(void)
364{
365 /* fucomp st(i) */
366 if ( !compare_u_st_st(FPU_rm) )
367 FPU_pop();
368}
369
370
371void fucompp(void)
372{
373 /* fucompp */
374 if (FPU_rm == 1)
375 {
376 if ( !compare_u_st_st(1) )
377 poppop();
378 }
379 else
380 FPU_illegal();
381}
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
new file mode 100644
index 000000000000..a85015801969
--- /dev/null
+++ b/arch/x86/math-emu/reg_constant.c
@@ -0,0 +1,120 @@
1/*---------------------------------------------------------------------------+
2 | reg_constant.c |
3 | |
4 | All of the constant FPU_REGs |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_system.h"
14#include "fpu_emu.h"
15#include "status_w.h"
16#include "reg_constant.h"
17#include "control_w.h"
18
19
20#define MAKE_REG(s,e,l,h) { l, h, \
21 ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) }
22
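/*
 * How to read these initialisers (a worked example, using the layout
 * implied by MAKE_REG: sigl, sigh, then a 16-bit biased exponent with
 * the sign in bit 15): CONST_PI below has significand 0xc90fdaa22168c235,
 * i.e. 0xc90fdaa2... / 2^63 ~= 1.5708, and an unbiased exponent of +1,
 * giving 1.5708 * 2 ~= 3.14159.
 */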
23FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000);
24#if 0
25FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000);
26FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000);
27#endif /* 0 */
28static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b);
29static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29);
30FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2);
31FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2);
32FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2);
33static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84);
34static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7);
35
36/* Extra bits to take pi/2 to more than 128 bits precision. */
37FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66,
38 0xfc8f8cbb, 0xece675d1);
39
40/* Only the sign (and tag) is used in internal zeroes */
41FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0);
42
43/* Only the sign and significand (and tag) are used in internal NaNs */
44/* The 80486 never generates one of these
45FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000);
46 */
47/* This is the real indefinite QNaN */
48FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000);
49
50/* Only the sign (and tag) is used in internal infinities */
51FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000);
52
53
54static void fld_const(FPU_REG const *c, int adj, u_char tag)
55{
56 FPU_REG *st_new_ptr;
57
58 if ( STACK_OVERFLOW )
59 {
60 FPU_stack_overflow();
61 return;
62 }
63 push();
64 reg_copy(c, st_new_ptr);
65 st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to
66 borrow or carry. */
67 FPU_settag0(tag);
68 clear_C1();
69}
70
71/* A fast way to find out whether x is one of RC_DOWN or RC_CHOP
72 (and not one of RC_RND or RC_UP).
73 */
74#define DOWN_OR_CHOP(x) (x & RC_DOWN)
75
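/*
 * Illustration, assuming the usual encoding in control_w.h (RC_RND=0x000,
 * RC_DOWN=0x400, RC_UP=0x800, RC_CHOP=0xC00): bit 10 is set exactly for
 * the two modes which round these positive constants downwards, hence the
 * single-bit test above:
 *
 *	DOWN_OR_CHOP(RC_RND)  == 0
 *	DOWN_OR_CHOP(RC_UP)   == 0
 *	DOWN_OR_CHOP(RC_DOWN) == RC_DOWN
 *	DOWN_OR_CHOP(RC_CHOP) == RC_DOWN
 */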
76static void fld1(int rc)
77{
78 fld_const(&CONST_1, 0, TAG_Valid);
79}
80
81static void fldl2t(int rc)
82{
83 fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid);
84}
85
86static void fldl2e(int rc)
87{
88 fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
89}
90
91static void fldpi(int rc)
92{
93 fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
94}
95
96static void fldlg2(int rc)
97{
98 fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
99}
100
101static void fldln2(int rc)
102{
103 fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
104}
105
106static void fldz(int rc)
107{
108 fld_const(&CONST_Z, 0, TAG_Zero);
109}
110
111typedef void (*FUNC_RC)(int);
112
113static FUNC_RC constants_table[] = {
114 fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC)FPU_illegal
115};
116
117void fconst(void)
118{
119 (constants_table[FPU_rm])(control_word & CW_RC);
120}
diff --git a/arch/x86/math-emu/reg_constant.h b/arch/x86/math-emu/reg_constant.h
new file mode 100644
index 000000000000..1bffaec3a134
--- /dev/null
+++ b/arch/x86/math-emu/reg_constant.h
@@ -0,0 +1,25 @@
1/*---------------------------------------------------------------------------+
2 | reg_constant.h |
3 | |
4 | Copyright (C) 1992 W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
5 | Australia. E-mail billm@vaxc.cc.monash.edu.au |
6 | |
7 +---------------------------------------------------------------------------*/
8
9#ifndef _REG_CONSTANT_H_
10#define _REG_CONSTANT_H_
11
12#include "fpu_emu.h"
13
14extern FPU_REG const CONST_1;
15extern FPU_REG const CONST_PI;
16extern FPU_REG const CONST_PI2;
17extern FPU_REG const CONST_PI2extra;
18extern FPU_REG const CONST_PI4;
19extern FPU_REG const CONST_Z;
20extern FPU_REG const CONST_PINF;
21extern FPU_REG const CONST_INF;
22extern FPU_REG const CONST_MINF;
23extern FPU_REG const CONST_QNaN;
24
25#endif /* _REG_CONSTANT_H_ */
diff --git a/arch/x86/math-emu/reg_convert.c b/arch/x86/math-emu/reg_convert.c
new file mode 100644
index 000000000000..45a258752703
--- /dev/null
+++ b/arch/x86/math-emu/reg_convert.c
@@ -0,0 +1,53 @@
1/*---------------------------------------------------------------------------+
2 | reg_convert.c |
3 | |
4 | Convert register representation. |
5 | |
6 | Copyright (C) 1992,1993,1994,1996,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "exception.h"
14#include "fpu_emu.h"
15
16
17int FPU_to_exp16(FPU_REG const *a, FPU_REG *x)
18{
19 int sign = getsign(a);
20
21 *(long long *)&(x->sigl) = *(const long long *)&(a->sigl);
22
23 /* Set up the exponent as a 16 bit quantity. */
24 setexponent16(x, exponent(a));
25
26 if ( exponent16(x) == EXP_UNDER )
27 {
28 /* The number is a de-normal or pseudodenormal. */
29 /* We only deal with the significand and exponent. */
30
31 if (x->sigh & 0x80000000)
32 {
33 /* Is a pseudodenormal. */
34 /* This is non-80486 behaviour because the number
35 loses its 'denormal' identity. */
36 addexponent(x, 1);
37 }
38 else
39 {
40 /* Is a denormal. */
41 addexponent(x, 1);
42 FPU_normalize_nuo(x);
43 }
44 }
45
46 if ( !(x->sigh & 0x80000000) )
47 {
48 EXCEPTION(EX_INTERNAL | 0x180);
49 }
50
51 return sign;
52}
53
diff --git a/arch/x86/math-emu/reg_divide.c b/arch/x86/math-emu/reg_divide.c
new file mode 100644
index 000000000000..5cee7ff920d9
--- /dev/null
+++ b/arch/x86/math-emu/reg_divide.c
@@ -0,0 +1,207 @@
1/*---------------------------------------------------------------------------+
2 | reg_divide.c |
3 | |
4 | Divide one FPU_REG by another and put the result in a destination FPU_REG.|
5 | |
6 | Copyright (C) 1996 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@jacobi.maths.monash.edu.au |
9 | |
10 | Return value is the tag of the answer, or-ed with FPU_Exception if |
11 | one was raised, or -1 on internal error. |
12 | |
13 +---------------------------------------------------------------------------*/
14
15/*---------------------------------------------------------------------------+
16 | The destination may be any FPU_REG, including one of the source FPU_REGs. |
17 +---------------------------------------------------------------------------*/
18
19#include "exception.h"
20#include "reg_constant.h"
21#include "fpu_emu.h"
22#include "fpu_system.h"
23
24/*
25 Divide one register by another and put the result into a third register.
26 */
27int FPU_div(int flags, int rm, int control_w)
28{
29 FPU_REG x, y;
30 FPU_REG const *a, *b, *st0_ptr, *st_ptr;
31 FPU_REG *dest;
32 u_char taga, tagb, signa, signb, sign, saved_sign;
33 int tag, deststnr;
34
35 if ( flags & DEST_RM )
36 deststnr = rm;
37 else
38 deststnr = 0;
39
40 if ( flags & REV )
41 {
42 b = &st(0);
43 st0_ptr = b;
44 tagb = FPU_gettag0();
45 if ( flags & LOADED )
46 {
47 a = (FPU_REG *)rm;
48 taga = flags & 0x0f;
49 }
50 else
51 {
52 a = &st(rm);
53 st_ptr = a;
54 taga = FPU_gettagi(rm);
55 }
56 }
57 else
58 {
59 a = &st(0);
60 st0_ptr = a;
61 taga = FPU_gettag0();
62 if ( flags & LOADED )
63 {
64 b = (FPU_REG *)rm;
65 tagb = flags & 0x0f;
66 }
67 else
68 {
69 b = &st(rm);
70 st_ptr = b;
71 tagb = FPU_gettagi(rm);
72 }
73 }
74
75 signa = getsign(a);
76 signb = getsign(b);
77
78 sign = signa ^ signb;
79
80 dest = &st(deststnr);
81 saved_sign = getsign(dest);
82
83 if ( !(taga | tagb) )
84 {
85 /* Both regs Valid, this should be the most common case. */
86 reg_copy(a, &x);
87 reg_copy(b, &y);
88 setpositive(&x);
89 setpositive(&y);
90 tag = FPU_u_div(&x, &y, dest, control_w, sign);
91
92 if ( tag < 0 )
93 return tag;
94
95 FPU_settagi(deststnr, tag);
96 return tag;
97 }
98
99 if ( taga == TAG_Special )
100 taga = FPU_Special(a);
101 if ( tagb == TAG_Special )
102 tagb = FPU_Special(b);
103
104 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
105 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
106 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
107 {
108 if ( denormal_operand() < 0 )
109 return FPU_Exception;
110
111 FPU_to_exp16(a, &x);
112 FPU_to_exp16(b, &y);
113 tag = FPU_u_div(&x, &y, dest, control_w, sign);
114 if ( tag < 0 )
115 return tag;
116
117 FPU_settagi(deststnr, tag);
118 return tag;
119 }
120 else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) )
121 {
122 if ( tagb != TAG_Zero )
123 {
 124 /* tagb is not Zero, so to reach here taga must be Zero: Zero/Valid. */
125 if ( tagb == TW_Denormal )
126 {
127 if ( denormal_operand() < 0 )
128 return FPU_Exception;
129 }
130
131 /* The result is zero. */
132 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
133 setsign(dest, sign);
134 return TAG_Zero;
135 }
136 /* We have an exception condition, either 0/0 or Valid/Zero. */
137 if ( taga == TAG_Zero )
138 {
139 /* 0/0 */
140 return arith_invalid(deststnr);
141 }
142 /* Valid/Zero */
143 return FPU_divide_by_zero(deststnr, sign);
144 }
145 /* Must have infinities, NaNs, etc */
146 else if ( (taga == TW_NaN) || (tagb == TW_NaN) )
147 {
148 if ( flags & LOADED )
149 return real_2op_NaN((FPU_REG *)rm, flags & 0x0f, 0, st0_ptr);
150
151 if ( flags & DEST_RM )
152 {
153 int tag;
154 tag = FPU_gettag0();
155 if ( tag == TAG_Special )
156 tag = FPU_Special(st0_ptr);
157 return real_2op_NaN(st0_ptr, tag, rm, (flags & REV) ? st0_ptr : &st(rm));
158 }
159 else
160 {
161 int tag;
162 tag = FPU_gettagi(rm);
163 if ( tag == TAG_Special )
164 tag = FPU_Special(&st(rm));
165 return real_2op_NaN(&st(rm), tag, 0, (flags & REV) ? st0_ptr : &st(rm));
166 }
167 }
168 else if (taga == TW_Infinity)
169 {
170 if (tagb == TW_Infinity)
171 {
172 /* infinity/infinity */
173 return arith_invalid(deststnr);
174 }
175 else
176 {
177 /* tagb must be Valid or Zero */
178 if ( (tagb == TW_Denormal) && (denormal_operand() < 0) )
179 return FPU_Exception;
180
181 /* Infinity divided by Zero or Valid does
 182 not raise an exception, but returns Infinity */
183 FPU_copy_to_regi(a, TAG_Special, deststnr);
184 setsign(dest, sign);
185 return taga;
186 }
187 }
188 else if (tagb == TW_Infinity)
189 {
190 if ( (taga == TW_Denormal) && (denormal_operand() < 0) )
191 return FPU_Exception;
192
193 /* The result is zero. */
194 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
195 setsign(dest, sign);
196 return TAG_Zero;
197 }
198#ifdef PARANOID
199 else
200 {
201 EXCEPTION(EX_INTERNAL|0x102);
202 return FPU_Exception;
203 }
204#endif /* PARANOID */
205
206 return 0;
207}
diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c
new file mode 100644
index 000000000000..e976caef6498
--- /dev/null
+++ b/arch/x86/math-emu/reg_ld_str.c
@@ -0,0 +1,1375 @@
1/*---------------------------------------------------------------------------+
2 | reg_ld_str.c |
3 | |
4 | All of the functions which transfer data between user memory and FPU_REGs.|
5 | |
6 | Copyright (C) 1992,1993,1994,1996,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13/*---------------------------------------------------------------------------+
14 | Note: |
15 | The file contains code which accesses user memory. |
16 | Emulator static data may change when user memory is accessed, due to |
17 | other processes using the emulator while swapping is in progress. |
18 +---------------------------------------------------------------------------*/
19
20#include "fpu_emu.h"
21
22#include <asm/uaccess.h>
23
24#include "fpu_system.h"
25#include "exception.h"
26#include "reg_constant.h"
27#include "control_w.h"
28#include "status_w.h"
29
30
31#define DOUBLE_Emax 1023 /* largest valid exponent */
32#define DOUBLE_Ebias 1023
33#define DOUBLE_Emin (-1022) /* smallest valid exponent */
34
35#define SINGLE_Emax 127 /* largest valid exponent */
36#define SINGLE_Ebias 127
37#define SINGLE_Emin (-126) /* smallest valid exponent */
38
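/*
 * The loaders below re-bias incoming exponents with expressions of the
 * form "field - XXX_Ebias + EXTENDED_Ebias", EXTENDED_Ebias being the
 * bias of the emulator's internal 80-bit format (0x3fff in fpu_emu.h).
 * A worked example, assuming that value: the double 1.0 has a stored
 * exponent field of 0x3ff (1023), so 1023 - 1023 + 16383 = 16383, i.e.
 * an unbiased exponent of zero.  Its 52-bit fraction is shifted left by
 * 11 bits and the explicit integer bit 0x80000000 is set in sigh to form
 * the 64-bit extended significand.
 */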
39
40static u_char normalize_no_excep(FPU_REG *r, int exp, int sign)
41{
42 u_char tag;
43
44 setexponent16(r, exp);
45
46 tag = FPU_normalize_nuo(r);
47 stdexp(r);
48 if ( sign )
49 setnegative(r);
50
51 return tag;
52}
53
54
55int FPU_tagof(FPU_REG *ptr)
56{
57 int exp;
58
59 exp = exponent16(ptr) & 0x7fff;
60 if ( exp == 0 )
61 {
62 if ( !(ptr->sigh | ptr->sigl) )
63 {
64 return TAG_Zero;
65 }
66 /* The number is a de-normal or pseudodenormal. */
67 return TAG_Special;
68 }
69
70 if ( exp == 0x7fff )
71 {
72 /* Is an Infinity, a NaN, or an unsupported data type. */
73 return TAG_Special;
74 }
75
76 if ( !(ptr->sigh & 0x80000000) )
77 {
78 /* Unsupported data type. */
79 /* Valid numbers have the ms bit set to 1. */
80 /* Unnormal. */
81 return TAG_Special;
82 }
83
84 return TAG_Valid;
85}
86
87
88/* Get a long double from user memory */
89int FPU_load_extended(long double __user *s, int stnr)
90{
91 FPU_REG *sti_ptr = &st(stnr);
92
93 RE_ENTRANT_CHECK_OFF;
94 FPU_access_ok(VERIFY_READ, s, 10);
95 __copy_from_user(sti_ptr, s, 10);
96 RE_ENTRANT_CHECK_ON;
97
98 return FPU_tagof(sti_ptr);
99}
100
101
102/* Get a double from user memory */
103int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data)
104{
105 int exp, tag, negative;
106 unsigned m64, l64;
107
108 RE_ENTRANT_CHECK_OFF;
109 FPU_access_ok(VERIFY_READ, dfloat, 8);
110 FPU_get_user(m64, 1 + (unsigned long __user *) dfloat);
111 FPU_get_user(l64, (unsigned long __user *) dfloat);
112 RE_ENTRANT_CHECK_ON;
113
114 negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
115 exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias;
116 m64 &= 0xfffff;
117 if ( exp > DOUBLE_Emax + EXTENDED_Ebias )
118 {
119 /* Infinity or NaN */
120 if ((m64 == 0) && (l64 == 0))
121 {
122 /* +- infinity */
123 loaded_data->sigh = 0x80000000;
124 loaded_data->sigl = 0x00000000;
125 exp = EXP_Infinity + EXTENDED_Ebias;
126 tag = TAG_Special;
127 }
128 else
129 {
130 /* Must be a signaling or quiet NaN */
131 exp = EXP_NaN + EXTENDED_Ebias;
132 loaded_data->sigh = (m64 << 11) | 0x80000000;
133 loaded_data->sigh |= l64 >> 21;
134 loaded_data->sigl = l64 << 11;
135 tag = TAG_Special; /* The calling function must look for NaNs */
136 }
137 }
138 else if ( exp < DOUBLE_Emin + EXTENDED_Ebias )
139 {
140 /* Zero or de-normal */
141 if ((m64 == 0) && (l64 == 0))
142 {
143 /* Zero */
144 reg_copy(&CONST_Z, loaded_data);
145 exp = 0;
146 tag = TAG_Zero;
147 }
148 else
149 {
150 /* De-normal */
151 loaded_data->sigh = m64 << 11;
152 loaded_data->sigh |= l64 >> 21;
153 loaded_data->sigl = l64 << 11;
154
155 return normalize_no_excep(loaded_data, DOUBLE_Emin, negative)
156 | (denormal_operand() < 0 ? FPU_Exception : 0);
157 }
158 }
159 else
160 {
161 loaded_data->sigh = (m64 << 11) | 0x80000000;
162 loaded_data->sigh |= l64 >> 21;
163 loaded_data->sigl = l64 << 11;
164
165 tag = TAG_Valid;
166 }
167
168 setexponent16(loaded_data, exp | negative);
169
170 return tag;
171}
172
173
174/* Get a float from user memory */
175int FPU_load_single(float __user *single, FPU_REG *loaded_data)
176{
177 unsigned m32;
178 int exp, tag, negative;
179
180 RE_ENTRANT_CHECK_OFF;
181 FPU_access_ok(VERIFY_READ, single, 4);
182 FPU_get_user(m32, (unsigned long __user *) single);
183 RE_ENTRANT_CHECK_ON;
184
185 negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
186
187 if (!(m32 & 0x7fffffff))
188 {
189 /* Zero */
190 reg_copy(&CONST_Z, loaded_data);
191 addexponent(loaded_data, negative);
192 return TAG_Zero;
193 }
194 exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias;
195 m32 = (m32 & 0x7fffff) << 8;
196 if ( exp < SINGLE_Emin + EXTENDED_Ebias )
197 {
198 /* De-normals */
199 loaded_data->sigh = m32;
200 loaded_data->sigl = 0;
201
202 return normalize_no_excep(loaded_data, SINGLE_Emin, negative)
203 | (denormal_operand() < 0 ? FPU_Exception : 0);
204 }
205 else if ( exp > SINGLE_Emax + EXTENDED_Ebias )
206 {
207 /* Infinity or NaN */
208 if ( m32 == 0 )
209 {
210 /* +- infinity */
211 loaded_data->sigh = 0x80000000;
212 loaded_data->sigl = 0x00000000;
213 exp = EXP_Infinity + EXTENDED_Ebias;
214 tag = TAG_Special;
215 }
216 else
217 {
218 /* Must be a signaling or quiet NaN */
219 exp = EXP_NaN + EXTENDED_Ebias;
220 loaded_data->sigh = m32 | 0x80000000;
221 loaded_data->sigl = 0;
222 tag = TAG_Special; /* The calling function must look for NaNs */
223 }
224 }
225 else
226 {
227 loaded_data->sigh = m32 | 0x80000000;
228 loaded_data->sigl = 0;
229 tag = TAG_Valid;
230 }
231
232 setexponent16(loaded_data, exp | negative); /* Set the sign. */
233
234 return tag;
235}
236
237
238/* Get a long long from user memory */
239int FPU_load_int64(long long __user *_s)
240{
241 long long s;
242 int sign;
243 FPU_REG *st0_ptr = &st(0);
244
245 RE_ENTRANT_CHECK_OFF;
246 FPU_access_ok(VERIFY_READ, _s, 8);
247 if (copy_from_user(&s,_s,8))
248 FPU_abort;
249 RE_ENTRANT_CHECK_ON;
250
251 if (s == 0)
252 {
253 reg_copy(&CONST_Z, st0_ptr);
254 return TAG_Zero;
255 }
256
257 if (s > 0)
258 sign = SIGN_Positive;
259 else
260 {
261 s = -s;
262 sign = SIGN_Negative;
263 }
264
265 significand(st0_ptr) = s;
266
267 return normalize_no_excep(st0_ptr, 63, sign);
268}
269
270
271/* Get a long from user memory */
272int FPU_load_int32(long __user *_s, FPU_REG *loaded_data)
273{
274 long s;
275 int negative;
276
277 RE_ENTRANT_CHECK_OFF;
278 FPU_access_ok(VERIFY_READ, _s, 4);
279 FPU_get_user(s, _s);
280 RE_ENTRANT_CHECK_ON;
281
282 if (s == 0)
283 { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; }
284
285 if (s > 0)
286 negative = SIGN_Positive;
287 else
288 {
289 s = -s;
290 negative = SIGN_Negative;
291 }
292
293 loaded_data->sigh = s;
294 loaded_data->sigl = 0;
295
296 return normalize_no_excep(loaded_data, 31, negative);
297}
298
299
300/* Get a short from user memory */
301int FPU_load_int16(short __user *_s, FPU_REG *loaded_data)
302{
303 int s, negative;
304
305 RE_ENTRANT_CHECK_OFF;
306 FPU_access_ok(VERIFY_READ, _s, 2);
307 /* Cast as short to get the sign extended. */
308 FPU_get_user(s, _s);
309 RE_ENTRANT_CHECK_ON;
310
311 if (s == 0)
312 { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; }
313
314 if (s > 0)
315 negative = SIGN_Positive;
316 else
317 {
318 s = -s;
319 negative = SIGN_Negative;
320 }
321
322 loaded_data->sigh = s << 16;
323 loaded_data->sigl = 0;
324
325 return normalize_no_excep(loaded_data, 15, negative);
326}
327
328
329/* Get a packed bcd array from user memory */
330int FPU_load_bcd(u_char __user *s)
331{
332 FPU_REG *st0_ptr = &st(0);
333 int pos;
334 u_char bcd;
335 long long l=0;
336 int sign;
337
338 RE_ENTRANT_CHECK_OFF;
339 FPU_access_ok(VERIFY_READ, s, 10);
340 RE_ENTRANT_CHECK_ON;
341 for ( pos = 8; pos >= 0; pos--)
342 {
343 l *= 10;
344 RE_ENTRANT_CHECK_OFF;
345 FPU_get_user(bcd, s+pos);
346 RE_ENTRANT_CHECK_ON;
347 l += bcd >> 4;
348 l *= 10;
349 l += bcd & 0x0f;
350 }
351
352 RE_ENTRANT_CHECK_OFF;
353 FPU_get_user(sign, s+9);
354 sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive;
355 RE_ENTRANT_CHECK_ON;
356
357 if ( l == 0 )
358 {
359 reg_copy(&CONST_Z, st0_ptr);
360 addexponent(st0_ptr, sign); /* Set the sign. */
361 return TAG_Zero;
362 }
363 else
364 {
365 significand(st0_ptr) = l;
366 return normalize_no_excep(st0_ptr, 63, sign);
367 }
368}
369
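FPU_load_bcd above walks the ten-byte packed-BCD operand from the most significant digit pair down. The following stand-alone user-space sketch (not part of the patch) decodes the same format; bcd_to_ll is an illustrative name and no range checking is done.

#include <stdio.h>

static long long bcd_to_ll(const unsigned char b[10])
{
    long long val = 0;
    int pos;

    for (pos = 8; pos >= 0; pos--) {
        val = val * 10 + (b[pos] >> 4);         /* high nibble: more significant digit */
        val = val * 10 + (b[pos] & 0x0f);       /* low nibble */
    }
    return (b[9] & 0x80) ? -val : val;          /* bit 7 of byte 9 is the sign */
}

int main(void)
{
    /* digits ...0001234 with sign byte 0x80: the value is -1234 */
    unsigned char b[10] = { 0x34, 0x12, 0, 0, 0, 0, 0, 0, 0, 0x80 };

    printf("%lld\n", bcd_to_ll(b));
    return 0;
}
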
370/*===========================================================================*/
371
372/* Put a long double into user memory */
373int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, long double __user *d)
374{
375 /*
376 The only exception raised by an attempt to store to an
377 extended format is the Invalid Stack exception, i.e.
378 attempting to store from an empty register.
379 */
380
381 if ( st0_tag != TAG_Empty )
382 {
383 RE_ENTRANT_CHECK_OFF;
384 FPU_access_ok(VERIFY_WRITE, d, 10);
385
386 FPU_put_user(st0_ptr->sigl, (unsigned long __user *) d);
387 FPU_put_user(st0_ptr->sigh, (unsigned long __user *) ((u_char __user *)d + 4));
388 FPU_put_user(exponent16(st0_ptr), (unsigned short __user *) ((u_char __user *)d + 8));
389 RE_ENTRANT_CHECK_ON;
390
391 return 1;
392 }
393
394 /* Empty register (stack underflow) */
395 EXCEPTION(EX_StackUnder);
396 if ( control_word & CW_Invalid )
397 {
398 /* The masked response */
399 /* Put out the QNaN indefinite */
400 RE_ENTRANT_CHECK_OFF;
401 FPU_access_ok(VERIFY_WRITE,d,10);
402 FPU_put_user(0, (unsigned long __user *) d);
403 FPU_put_user(0xc0000000, 1 + (unsigned long __user *) d);
404 FPU_put_user(0xffff, 4 + (short __user *) d);
405 RE_ENTRANT_CHECK_ON;
406 return 1;
407 }
408 else
409 return 0;
410
411}
412
413
414/* Put a double into user memory */
415int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
416{
417 unsigned long l[2];
418 unsigned long increment = 0; /* avoid gcc warnings */
419 int precision_loss;
420 int exp;
421 FPU_REG tmp;
422
423 if ( st0_tag == TAG_Valid )
424 {
425 reg_copy(st0_ptr, &tmp);
426 exp = exponent(&tmp);
427
428 if ( exp < DOUBLE_Emin ) /* It may be a denormal */
429 {
430 addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */
431
432 denormal_arg:
433
434 if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) )
435 {
436#ifdef PECULIAR_486
437 /* Did it round to a non-denormal ? */
438 /* This behaviour might be regarded as peculiar; it appears
439 that the 80486 rounds to the dest precision, then
440 converts to decide underflow. */
441 if ( !((tmp.sigh == 0x00100000) && (tmp.sigl == 0) &&
442 (st0_ptr->sigl & 0x000007ff)) )
443#endif /* PECULIAR_486 */
444 {
445 EXCEPTION(EX_Underflow);
446 /* This is a special case: see sec 16.2.5.1 of
447 the 80486 book */
448 if ( !(control_word & CW_Underflow) )
449 return 0;
450 }
451 EXCEPTION(precision_loss);
452 if ( !(control_word & CW_Precision) )
453 return 0;
454 }
455 l[0] = tmp.sigl;
456 l[1] = tmp.sigh;
457 }
458 else
459 {
460 if ( tmp.sigl & 0x000007ff )
461 {
462 precision_loss = 1;
463 switch (control_word & CW_RC)
464 {
465 case RC_RND:
466 /* Rounding can get a little messy.. */
467 increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */
468 ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */
469 break;
470 case RC_DOWN: /* towards -infinity */
471 increment = signpositive(&tmp) ? 0 : tmp.sigl & 0x7ff;
472 break;
473 case RC_UP: /* towards +infinity */
474 increment = signpositive(&tmp) ? tmp.sigl & 0x7ff : 0;
475 break;
476 case RC_CHOP:
477 increment = 0;
478 break;
479 }
480
481 /* Truncate the mantissa */
482 tmp.sigl &= 0xfffff800;
483
484 if ( increment )
485 {
486 if ( tmp.sigl >= 0xfffff800 )
487 {
488 /* the sigl part overflows */
489 if ( tmp.sigh == 0xffffffff )
490 {
491 /* The sigh part overflows */
492 tmp.sigh = 0x80000000;
493 exp++;
494 if (exp >= EXP_OVER)
495 goto overflow;
496 }
497 else
498 {
499 tmp.sigh ++;
500 }
501 tmp.sigl = 0x00000000;
502 }
503 else
504 {
505 /* We only need to increment sigl */
506 tmp.sigl += 0x00000800;
507 }
508 }
509 }
510 else
511 precision_loss = 0;
512
513 l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21);
514 l[1] = ((tmp.sigh >> 11) & 0xfffff);
515
516 if ( exp > DOUBLE_Emax )
517 {
518 overflow:
519 EXCEPTION(EX_Overflow);
520 if ( !(control_word & CW_Overflow) )
521 return 0;
522 set_precision_flag_up();
523 if ( !(control_word & CW_Precision) )
524 return 0;
525
526 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
527 /* Overflow to infinity */
528 l[0] = 0x00000000; /* Set to */
529 l[1] = 0x7ff00000; /* + INF */
530 }
531 else
532 {
533 if ( precision_loss )
534 {
535 if ( increment )
536 set_precision_flag_up();
537 else
538 set_precision_flag_down();
539 }
540 /* Add the exponent */
541 l[1] |= (((exp+DOUBLE_Ebias) & 0x7ff) << 20);
542 }
543 }
544 }
545 else if (st0_tag == TAG_Zero)
546 {
547 /* Number is zero */
548 l[0] = 0;
549 l[1] = 0;
550 }
551 else if ( st0_tag == TAG_Special )
552 {
553 st0_tag = FPU_Special(st0_ptr);
554 if ( st0_tag == TW_Denormal )
555 {
556 /* A denormal will always underflow. */
557#ifndef PECULIAR_486
558 /* An 80486 is supposed to be able to generate
559 a denormal exception here, but... */
560 /* Underflow has priority. */
561 if ( control_word & CW_Underflow )
562 denormal_operand();
563#endif /* PECULIAR_486 */
564 reg_copy(st0_ptr, &tmp);
565 goto denormal_arg;
566 }
567 else if (st0_tag == TW_Infinity)
568 {
569 l[0] = 0;
570 l[1] = 0x7ff00000;
571 }
572 else if (st0_tag == TW_NaN)
573 {
574 /* Is it really a NaN ? */
575 if ( (exponent(st0_ptr) == EXP_OVER)
576 && (st0_ptr->sigh & 0x80000000) )
577 {
578 /* See if we can get a valid NaN from the FPU_REG */
579 l[0] = (st0_ptr->sigl >> 11) | (st0_ptr->sigh << 21);
580 l[1] = ((st0_ptr->sigh >> 11) & 0xfffff);
581 if ( !(st0_ptr->sigh & 0x40000000) )
582 {
583 /* It is a signalling NaN */
584 EXCEPTION(EX_Invalid);
585 if ( !(control_word & CW_Invalid) )
586 return 0;
587 l[1] |= (0x40000000 >> 11);
588 }
589 l[1] |= 0x7ff00000;
590 }
591 else
592 {
593 /* It is an unsupported data type */
594 EXCEPTION(EX_Invalid);
595 if ( !(control_word & CW_Invalid) )
596 return 0;
597 l[0] = 0;
598 l[1] = 0xfff80000;
599 }
600 }
601 }
602 else if ( st0_tag == TAG_Empty )
603 {
604 /* Empty register (stack underflow) */
605 EXCEPTION(EX_StackUnder);
606 if ( control_word & CW_Invalid )
607 {
608 /* The masked response */
609 /* Put out the QNaN indefinite */
610 RE_ENTRANT_CHECK_OFF;
611 FPU_access_ok(VERIFY_WRITE,dfloat,8);
612 FPU_put_user(0, (unsigned long __user *) dfloat);
613 FPU_put_user(0xfff80000, 1 + (unsigned long __user *) dfloat);
614 RE_ENTRANT_CHECK_ON;
615 return 1;
616 }
617 else
618 return 0;
619 }
620 if ( getsign(st0_ptr) )
621 l[1] |= 0x80000000;
622
623 RE_ENTRANT_CHECK_OFF;
624 FPU_access_ok(VERIFY_WRITE,dfloat,8);
625 FPU_put_user(l[0], (unsigned long __user *)dfloat);
626 FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat);
627 RE_ENTRANT_CHECK_ON;
628
629 return 1;
630}
631
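FPU_store_double above builds the two 32-bit halves of the result and rounds the eleven discarded significand bits according to the RC field of the control word. For reference, this stand-alone sketch (not part of the patch) performs the inverse of the unpacking shown earlier, packing sign, exponent and a 64-bit significand into a double by simple truncation; pack_double is an illustrative name and no rounding or special-case handling is attempted.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double pack_double(int sign, int exp, uint64_t sig)
{
    uint64_t bits;
    double d;

    bits  = (uint64_t)(sign & 1) << 63;               /* sign bit */
    bits |= (uint64_t)((exp + 1023) & 0x7ff) << 52;   /* biased exponent */
    bits |= (sig >> 11) & 0x000fffffffffffffULL;      /* drop the implicit bit, truncate */
    memcpy(&d, &bits, sizeof(d));
    return d;
}

int main(void)
{
    /* 1.625 * 2^2 with a negative sign should give -6.5 */
    printf("%g\n", pack_double(1, 2, 0xd000000000000000ULL));
    return 0;
}
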
632
633/* Put a float into user memory */
634int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single)
635{
636 long templ = 0;
637 unsigned long increment = 0; /* avoid gcc warnings */
638 int precision_loss;
639 int exp;
640 FPU_REG tmp;
641
642 if ( st0_tag == TAG_Valid )
643 {
644
645 reg_copy(st0_ptr, &tmp);
646 exp = exponent(&tmp);
647
648 if ( exp < SINGLE_Emin )
649 {
650 addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */
651
652 denormal_arg:
653
654 if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) )
655 {
656#ifdef PECULIAR_486
657 /* Did it round to a non-denormal ? */
658 /* This behaviour might be regarded as peculiar; it appears
659 that the 80486 rounds to the dest precision, then
660 converts to decide underflow. */
661 if ( !((tmp.sigl == 0x00800000) &&
662 ((st0_ptr->sigh & 0x000000ff) || st0_ptr->sigl)) )
663#endif /* PECULIAR_486 */
664 {
665 EXCEPTION(EX_Underflow);
666 /* This is a special case: see sec 16.2.5.1 of
667 the 80486 book */
668 if ( !(control_word & CW_Underflow) )
669 return 0;
670 }
671 EXCEPTION(precision_loss);
672 if ( !(control_word & CW_Precision) )
673 return 0;
674 }
675 templ = tmp.sigl;
676 }
677 else
678 {
679 if ( tmp.sigl | (tmp.sigh & 0x000000ff) )
680 {
681 unsigned long sigh = tmp.sigh;
682 unsigned long sigl = tmp.sigl;
683
684 precision_loss = 1;
685 switch (control_word & CW_RC)
686 {
687 case RC_RND:
688 increment = ((sigh & 0xff) > 0x80) /* more than half */
689 || (((sigh & 0xff) == 0x80) && sigl) /* more than half */
690 || ((sigh & 0x180) == 0x180); /* round to even */
691 break;
692 case RC_DOWN: /* towards -infinity */
693 increment = signpositive(&tmp)
694 ? 0 : (sigl | (sigh & 0xff));
695 break;
696 case RC_UP: /* towards +infinity */
697 increment = signpositive(&tmp)
698 ? (sigl | (sigh & 0xff)) : 0;
699 break;
700 case RC_CHOP:
701 increment = 0;
702 break;
703 }
704
705 /* Truncate part of the mantissa */
706 tmp.sigl = 0;
707
708 if (increment)
709 {
710 if ( sigh >= 0xffffff00 )
711 {
712 /* The sigh part overflows */
713 tmp.sigh = 0x80000000;
714 exp++;
715 if ( exp >= EXP_OVER )
716 goto overflow;
717 }
718 else
719 {
720 tmp.sigh &= 0xffffff00;
721 tmp.sigh += 0x100;
722 }
723 }
724 else
725 {
726 tmp.sigh &= 0xffffff00; /* Finish the truncation */
727 }
728 }
729 else
730 precision_loss = 0;
731
732 templ = (tmp.sigh >> 8) & 0x007fffff;
733
734 if ( exp > SINGLE_Emax )
735 {
736 overflow:
737 EXCEPTION(EX_Overflow);
738 if ( !(control_word & CW_Overflow) )
739 return 0;
740 set_precision_flag_up();
741 if ( !(control_word & CW_Precision) )
742 return 0;
743
744 /* This is a special case: see sec 16.2.5.1 of the 80486 book. */
745 /* Masked response is overflow to infinity. */
746 templ = 0x7f800000;
747 }
748 else
749 {
750 if ( precision_loss )
751 {
752 if ( increment )
753 set_precision_flag_up();
754 else
755 set_precision_flag_down();
756 }
757 /* Add the exponent */
758 templ |= ((exp+SINGLE_Ebias) & 0xff) << 23;
759 }
760 }
761 }
762 else if (st0_tag == TAG_Zero)
763 {
764 templ = 0;
765 }
766 else if ( st0_tag == TAG_Special )
767 {
768 st0_tag = FPU_Special(st0_ptr);
769 if (st0_tag == TW_Denormal)
770 {
771 reg_copy(st0_ptr, &tmp);
772
773 /* A denormal will always underflow. */
774#ifndef PECULIAR_486
775 /* An 80486 is supposed to be able to generate
776 a denormal exception here, but... */
777 /* Underflow has priority. */
778 if ( control_word & CW_Underflow )
779 denormal_operand();
780#endif /* PECULIAR_486 */
781 goto denormal_arg;
782 }
783 else if (st0_tag == TW_Infinity)
784 {
785 templ = 0x7f800000;
786 }
787 else if (st0_tag == TW_NaN)
788 {
789 /* Is it really a NaN ? */
790 if ( (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000) )
791 {
792 /* See if we can get a valid NaN from the FPU_REG */
793 templ = st0_ptr->sigh >> 8;
794 if ( !(st0_ptr->sigh & 0x40000000) )
795 {
796 /* It is a signalling NaN */
797 EXCEPTION(EX_Invalid);
798 if ( !(control_word & CW_Invalid) )
799 return 0;
800 templ |= (0x40000000 >> 8);
801 }
802 templ |= 0x7f800000;
803 }
804 else
805 {
806 /* It is an unsupported data type */
807 EXCEPTION(EX_Invalid);
808 if ( !(control_word & CW_Invalid) )
809 return 0;
810 templ = 0xffc00000;
811 }
812 }
813#ifdef PARANOID
814 else
815 {
816 EXCEPTION(EX_INTERNAL|0x164);
817 return 0;
818 }
819#endif
820 }
821 else if ( st0_tag == TAG_Empty )
822 {
823 /* Empty register (stack underflow) */
824 EXCEPTION(EX_StackUnder);
825 if ( control_word & EX_Invalid )
826 {
827 /* The masked response */
828 /* Put out the QNaN indefinite */
829 RE_ENTRANT_CHECK_OFF;
830 FPU_access_ok(VERIFY_WRITE,single,4);
831 FPU_put_user(0xffc00000, (unsigned long __user *) single);
832 RE_ENTRANT_CHECK_ON;
833 return 1;
834 }
835 else
836 return 0;
837 }
838#ifdef PARANOID
839 else
840 {
841 EXCEPTION(EX_INTERNAL|0x163);
842 return 0;
843 }
844#endif
845 if ( getsign(st0_ptr) )
846 templ |= 0x80000000;
847
848 RE_ENTRANT_CHECK_OFF;
849 FPU_access_ok(VERIFY_WRITE,single,4);
850 FPU_put_user(templ,(unsigned long __user *) single);
851 RE_ENTRANT_CHECK_ON;
852
853 return 1;
854}
855
856
857/* Put a long long into user memory */
858int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
859{
860 FPU_REG t;
861 long long tll;
862 int precision_loss;
863
864 if ( st0_tag == TAG_Empty )
865 {
866 /* Empty register (stack underflow) */
867 EXCEPTION(EX_StackUnder);
868 goto invalid_operand;
869 }
870 else if ( st0_tag == TAG_Special )
871 {
872 st0_tag = FPU_Special(st0_ptr);
873 if ( (st0_tag == TW_Infinity) ||
874 (st0_tag == TW_NaN) )
875 {
876 EXCEPTION(EX_Invalid);
877 goto invalid_operand;
878 }
879 }
880
881 reg_copy(st0_ptr, &t);
882 precision_loss = FPU_round_to_int(&t, st0_tag);
883 ((long *)&tll)[0] = t.sigl;
884 ((long *)&tll)[1] = t.sigh;
885 if ( (precision_loss == 1) ||
886 ((t.sigh & 0x80000000) &&
887 !((t.sigh == 0x80000000) && (t.sigl == 0) &&
888 signnegative(&t))) )
889 {
890 EXCEPTION(EX_Invalid);
891 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
892 invalid_operand:
893 if ( control_word & EX_Invalid )
894 {
895 /* Produce something like QNaN "indefinite" */
896 tll = 0x8000000000000000LL;
897 }
898 else
899 return 0;
900 }
901 else
902 {
903 if ( precision_loss )
904 set_precision_flag(precision_loss);
905 if ( signnegative(&t) )
906 tll = - tll;
907 }
908
909 RE_ENTRANT_CHECK_OFF;
910 FPU_access_ok(VERIFY_WRITE,d,8);
911 if (copy_to_user(d, &tll, 8))
912 FPU_abort;
913 RE_ENTRANT_CHECK_ON;
914
915 return 1;
916}
917
918
919/* Put a long into user memory */
920int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d)
921{
922 FPU_REG t;
923 int precision_loss;
924
925 if ( st0_tag == TAG_Empty )
926 {
927 /* Empty register (stack underflow) */
928 EXCEPTION(EX_StackUnder);
929 goto invalid_operand;
930 }
931 else if ( st0_tag == TAG_Special )
932 {
933 st0_tag = FPU_Special(st0_ptr);
934 if ( (st0_tag == TW_Infinity) ||
935 (st0_tag == TW_NaN) )
936 {
937 EXCEPTION(EX_Invalid);
938 goto invalid_operand;
939 }
940 }
941
942 reg_copy(st0_ptr, &t);
943 precision_loss = FPU_round_to_int(&t, st0_tag);
944 if (t.sigh ||
945 ((t.sigl & 0x80000000) &&
946 !((t.sigl == 0x80000000) && signnegative(&t))) )
947 {
948 EXCEPTION(EX_Invalid);
949 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
950 invalid_operand:
951 if ( control_word & EX_Invalid )
952 {
953 /* Produce something like QNaN "indefinite" */
954 t.sigl = 0x80000000;
955 }
956 else
957 return 0;
958 }
959 else
960 {
961 if ( precision_loss )
962 set_precision_flag(precision_loss);
963 if ( signnegative(&t) )
964 t.sigl = -(long)t.sigl;
965 }
966
967 RE_ENTRANT_CHECK_OFF;
968 FPU_access_ok(VERIFY_WRITE,d,4);
969 FPU_put_user(t.sigl, (unsigned long __user *) d);
970 RE_ENTRANT_CHECK_ON;
971
972 return 1;
973}
974
975
976/* Put a short into user memory */
977int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d)
978{
979 FPU_REG t;
980 int precision_loss;
981
982 if ( st0_tag == TAG_Empty )
983 {
984 /* Empty register (stack underflow) */
985 EXCEPTION(EX_StackUnder);
986 goto invalid_operand;
987 }
988 else if ( st0_tag == TAG_Special )
989 {
990 st0_tag = FPU_Special(st0_ptr);
991 if ( (st0_tag == TW_Infinity) ||
992 (st0_tag == TW_NaN) )
993 {
994 EXCEPTION(EX_Invalid);
995 goto invalid_operand;
996 }
997 }
998
999 reg_copy(st0_ptr, &t);
1000 precision_loss = FPU_round_to_int(&t, st0_tag);
1001 if (t.sigh ||
1002 ((t.sigl & 0xffff8000) &&
1003 !((t.sigl == 0x8000) && signnegative(&t))) )
1004 {
1005 EXCEPTION(EX_Invalid);
1006 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
1007 invalid_operand:
1008 if ( control_word & EX_Invalid )
1009 {
1010 /* Produce something like QNaN "indefinite" */
1011 t.sigl = 0x8000;
1012 }
1013 else
1014 return 0;
1015 }
1016 else
1017 {
1018 if ( precision_loss )
1019 set_precision_flag(precision_loss);
1020 if ( signnegative(&t) )
1021 t.sigl = -t.sigl;
1022 }
1023
1024 RE_ENTRANT_CHECK_OFF;
1025 FPU_access_ok(VERIFY_WRITE,d,2);
1026 FPU_put_user((short)t.sigl, d);
1027 RE_ENTRANT_CHECK_ON;
1028
1029 return 1;
1030}
1031
1032
1033/* Put a packed bcd array into user memory */
1034int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
1035{
1036 FPU_REG t;
1037 unsigned long long ll;
1038 u_char b;
1039 int i, precision_loss;
1040 u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0;
1041
1042 if ( st0_tag == TAG_Empty )
1043 {
1044 /* Empty register (stack underflow) */
1045 EXCEPTION(EX_StackUnder);
1046 goto invalid_operand;
1047 }
1048 else if ( st0_tag == TAG_Special )
1049 {
1050 st0_tag = FPU_Special(st0_ptr);
1051 if ( (st0_tag == TW_Infinity) ||
1052 (st0_tag == TW_NaN) )
1053 {
1054 EXCEPTION(EX_Invalid);
1055 goto invalid_operand;
1056 }
1057 }
1058
1059 reg_copy(st0_ptr, &t);
1060 precision_loss = FPU_round_to_int(&t, st0_tag);
1061 ll = significand(&t);
1062
1063 /* Check for overflow, by comparing with 999999999999999999 decimal. */
1064 if ( (t.sigh > 0x0de0b6b3) ||
1065 ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff)) )
1066 {
1067 EXCEPTION(EX_Invalid);
1068 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
1069 invalid_operand:
1070 if ( control_word & CW_Invalid )
1071 {
1072 /* Produce the QNaN "indefinite" */
1073 RE_ENTRANT_CHECK_OFF;
1074 FPU_access_ok(VERIFY_WRITE,d,10);
1075 for ( i = 0; i < 7; i++)
1076 FPU_put_user(0, d+i); /* These bytes "undefined" */
1077 FPU_put_user(0xc0, d+7); /* This byte "undefined" */
1078 FPU_put_user(0xff, d+8);
1079 FPU_put_user(0xff, d+9);
1080 RE_ENTRANT_CHECK_ON;
1081 return 1;
1082 }
1083 else
1084 return 0;
1085 }
1086 else if ( precision_loss )
1087 {
1088 /* Precision loss doesn't stop the data transfer */
1089 set_precision_flag(precision_loss);
1090 }
1091
1092 RE_ENTRANT_CHECK_OFF;
1093 FPU_access_ok(VERIFY_WRITE,d,10);
1094 RE_ENTRANT_CHECK_ON;
1095 for ( i = 0; i < 9; i++)
1096 {
1097 b = FPU_div_small(&ll, 10);
1098 b |= (FPU_div_small(&ll, 10)) << 4;
1099 RE_ENTRANT_CHECK_OFF;
1100 FPU_put_user(b, d+i);
1101 RE_ENTRANT_CHECK_ON;
1102 }
1103 RE_ENTRANT_CHECK_OFF;
1104 FPU_put_user(sign, d+9);
1105 RE_ENTRANT_CHECK_ON;
1106
1107 return 1;
1108}
1109
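FPU_store_bcd above emits two decimal digits per byte using FPU_div_small. The sketch below (not part of the patch) produces the same ten-byte image with ordinary division by ten; ll_to_bcd is an illustrative name and the magnitude is assumed to fit in eighteen digits.

#include <stdio.h>

static void ll_to_bcd(long long val, unsigned char b[10])
{
    unsigned long long u = (val < 0) ? -(unsigned long long)val : (unsigned long long)val;
    unsigned char lo, hi;
    int i;

    for (i = 0; i < 9; i++) {
        lo = u % 10; u /= 10;                   /* low nibble digit */
        hi = u % 10; u /= 10;                   /* high nibble digit */
        b[i] = (unsigned char)((hi << 4) | lo);
    }
    b[9] = (val < 0) ? 0x80 : 0x00;             /* sign byte */
}

int main(void)
{
    unsigned char b[10];
    int i;

    ll_to_bcd(-1234, b);
    for (i = 9; i >= 0; i--)                    /* print most significant byte first */
        printf("%02x ", b[i]);
    printf("\n");
    return 0;
}
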
1110/*===========================================================================*/
1111
1112/* r gets mangled such that its significand holds the integer value and
1113 its sign is kept: it is NOT normalized */
1114/* The return value (in eax) is zero if the result is exact;
1115 if bits are changed due to rounding, truncation, etc, then
1116 a non-zero value is returned */
1117/* Overflow is signalled by a non-zero return value (in eax).
1118 In the case of overflow, the returned significand always has the
1119 largest possible value */
1120int FPU_round_to_int(FPU_REG *r, u_char tag)
1121{
1122 u_char very_big;
1123 unsigned eax;
1124
1125 if (tag == TAG_Zero)
1126 {
1127 /* Make sure that zero is returned */
1128 significand(r) = 0;
1129 return 0; /* o.k. */
1130 }
1131
1132 if (exponent(r) > 63)
1133 {
1134 r->sigl = r->sigh = ~0; /* The largest representable number */
1135 return 1; /* overflow */
1136 }
1137
1138 eax = FPU_shrxs(&r->sigl, 63 - exponent(r));
1139 very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */
1140#define half_or_more (eax & 0x80000000)
1141#define frac_part (eax)
1142#define more_than_half ((eax & 0x80000001) == 0x80000001)
1143 switch (control_word & CW_RC)
1144 {
1145 case RC_RND:
1146 if ( more_than_half /* nearest */
1147 || (half_or_more && (r->sigl & 1)) ) /* odd -> even */
1148 {
1149 if ( very_big ) return 1; /* overflow */
1150 significand(r) ++;
1151 return PRECISION_LOST_UP;
1152 }
1153 break;
1154 case RC_DOWN:
1155 if (frac_part && getsign(r))
1156 {
1157 if ( very_big ) return 1; /* overflow */
1158 significand(r) ++;
1159 return PRECISION_LOST_UP;
1160 }
1161 break;
1162 case RC_UP:
1163 if (frac_part && !getsign(r))
1164 {
1165 if ( very_big ) return 1; /* overflow */
1166 significand(r) ++;
1167 return PRECISION_LOST_UP;
1168 }
1169 break;
1170 case RC_CHOP:
1171 break;
1172 }
1173
1174 return eax ? PRECISION_LOST_DOWN : 0;
1175
1176}
1177
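FPU_round_to_int above decides whether to bump the magnitude from the shifted-out fraction and the rounding-control bits. The stand-alone sketch below (not part of the patch) expresses the same decision table in C for a magnitude given as an integer part plus a 32-bit binary fraction; the enum and function names are illustrative and the precision-lost bookkeeping is omitted.

#include <stdint.h>
#include <stdio.h>

enum rc { RC_RND, RC_DOWN, RC_UP, RC_CHOP };

static uint64_t round_mag(uint64_t ipart, uint32_t frac, int negative, enum rc mode)
{
    int increment = 0;

    switch (mode) {
    case RC_RND:   /* nearest, ties to even */
        increment = frac > 0x80000000u ||
                    (frac == 0x80000000u && (ipart & 1));
        break;
    case RC_DOWN:  /* toward -infinity: bump the magnitude only if negative */
        increment = frac && negative;
        break;
    case RC_UP:    /* toward +infinity: bump the magnitude only if positive */
        increment = frac && !negative;
        break;
    case RC_CHOP:  /* truncate */
        increment = 0;
        break;
    }
    return ipart + increment;
}

int main(void)
{
    /* 2.5 rounds to 2 (tie to even); -2.5 toward -infinity rounds to magnitude 3 */
    printf("%llu %llu\n",
           (unsigned long long)round_mag(2, 0x80000000u, 0, RC_RND),
           (unsigned long long)round_mag(2, 0x80000000u, 1, RC_DOWN));
    return 0;
}
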
1178/*===========================================================================*/
1179
1180u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s)
1181{
1182 unsigned short tag_word = 0;
1183 u_char tag;
1184 int i;
1185
1186 if ( (addr_modes.default_mode == VM86) ||
1187 ((addr_modes.default_mode == PM16)
1188 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) )
1189 {
1190 RE_ENTRANT_CHECK_OFF;
1191 FPU_access_ok(VERIFY_READ, s, 0x0e);
1192 FPU_get_user(control_word, (unsigned short __user *) s);
1193 FPU_get_user(partial_status, (unsigned short __user *) (s+2));
1194 FPU_get_user(tag_word, (unsigned short __user *) (s+4));
1195 FPU_get_user(instruction_address.offset, (unsigned short __user *) (s+6));
1196 FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+8));
1197 FPU_get_user(operand_address.offset, (unsigned short __user *) (s+0x0a));
1198 FPU_get_user(operand_address.selector, (unsigned short __user *) (s+0x0c));
1199 RE_ENTRANT_CHECK_ON;
1200 s += 0x0e;
1201 if ( addr_modes.default_mode == VM86 )
1202 {
1203 instruction_address.offset
1204 += (instruction_address.selector & 0xf000) << 4;
1205 operand_address.offset += (operand_address.selector & 0xf000) << 4;
1206 }
1207 }
1208 else
1209 {
1210 RE_ENTRANT_CHECK_OFF;
1211 FPU_access_ok(VERIFY_READ, s, 0x1c);
1212 FPU_get_user(control_word, (unsigned short __user *) s);
1213 FPU_get_user(partial_status, (unsigned short __user *) (s+4));
1214 FPU_get_user(tag_word, (unsigned short __user *) (s+8));
1215 FPU_get_user(instruction_address.offset, (unsigned long __user *) (s+0x0c));
1216 FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+0x10));
1217 FPU_get_user(instruction_address.opcode, (unsigned short __user *) (s+0x12));
1218 FPU_get_user(operand_address.offset, (unsigned long __user *) (s+0x14));
1219 FPU_get_user(operand_address.selector, (unsigned long __user *) (s+0x18));
1220 RE_ENTRANT_CHECK_ON;
1221 s += 0x1c;
1222 }
1223
1224#ifdef PECULIAR_486
1225 control_word &= ~0xe080;
1226#endif /* PECULIAR_486 */
1227
1228 top = (partial_status >> SW_Top_Shift) & 7;
1229
1230 if ( partial_status & ~control_word & CW_Exceptions )
1231 partial_status |= (SW_Summary | SW_Backward);
1232 else
1233 partial_status &= ~(SW_Summary | SW_Backward);
1234
1235 for ( i = 0; i < 8; i++ )
1236 {
1237 tag = tag_word & 3;
1238 tag_word >>= 2;
1239
1240 if ( tag == TAG_Empty )
1241 /* New tag is empty. Accept it */
1242 FPU_settag(i, TAG_Empty);
1243 else if ( FPU_gettag(i) == TAG_Empty )
1244 {
1245 /* Old tag is empty and new tag is not empty. New tag is determined
1246 by old reg contents */
1247 if ( exponent(&fpu_register(i)) == - EXTENDED_Ebias )
1248 {
1249 if ( !(fpu_register(i).sigl | fpu_register(i).sigh) )
1250 FPU_settag(i, TAG_Zero);
1251 else
1252 FPU_settag(i, TAG_Special);
1253 }
1254 else if ( exponent(&fpu_register(i)) == 0x7fff - EXTENDED_Ebias )
1255 {
1256 FPU_settag(i, TAG_Special);
1257 }
1258 else if ( fpu_register(i).sigh & 0x80000000 )
1259 FPU_settag(i, TAG_Valid);
1260 else
1261 FPU_settag(i, TAG_Special); /* An Un-normal */
1262 }
1263 /* Else old tag is not empty and new tag is not empty. Old tag
1264 remains correct */
1265 }
1266
1267 return s;
1268}
1269
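fldenv above parses the 28-byte 32-bit protected-mode environment image byte offset by byte offset. The sketch below (not part of the patch) writes the same layout as a C struct so the offsets can be checked at a glance; the struct and field names are illustrative, not kernel identifiers.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct fpu_env32 {
    uint16_t control_word;   uint16_t pad0;    /* offset 0x00 */
    uint16_t status_word;    uint16_t pad1;    /* offset 0x04 */
    uint16_t tag_word;       uint16_t pad2;    /* offset 0x08 */
    uint32_t ip_offset;                        /* offset 0x0c */
    uint16_t ip_selector;                      /* offset 0x10 */
    uint16_t opcode;                           /* offset 0x12 */
    uint32_t operand_offset;                   /* offset 0x14 */
    uint32_t operand_selector;                 /* offset 0x18 */
};

int main(void)
{
    printf("size=%zu ip_offset=%zu opcode=%zu operand_selector=%zu\n",
           sizeof(struct fpu_env32),
           offsetof(struct fpu_env32, ip_offset),
           offsetof(struct fpu_env32, opcode),
           offsetof(struct fpu_env32, operand_selector));
    return 0;
}
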
1270
1271void frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
1272{
1273 int i, regnr;
1274 u_char __user *s = fldenv(addr_modes, data_address);
1275 int offset = (top & 7) * 10, other = 80 - offset;
1276
1277 /* Copy all registers in stack order. */
1278 RE_ENTRANT_CHECK_OFF;
1279 FPU_access_ok(VERIFY_READ,s,80);
1280 __copy_from_user(register_base+offset, s, other);
1281 if ( offset )
1282 __copy_from_user(register_base, s+other, offset);
1283 RE_ENTRANT_CHECK_ON;
1284
1285 for ( i = 0; i < 8; i++ )
1286 {
1287 regnr = (i+top) & 7;
1288 if ( FPU_gettag(regnr) != TAG_Empty )
1289 /* The loaded data over-rides all other cases. */
1290 FPU_settag(regnr, FPU_tagof(&st(i)));
1291 }
1292
1293}
1294
1295
1296u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
1297{
1298 if ( (addr_modes.default_mode == VM86) ||
1299 ((addr_modes.default_mode == PM16)
1300 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) )
1301 {
1302 RE_ENTRANT_CHECK_OFF;
1303 FPU_access_ok(VERIFY_WRITE,d,14);
1304#ifdef PECULIAR_486
1305 FPU_put_user(control_word & ~0xe080, (unsigned long __user *) d);
1306#else
1307 FPU_put_user(control_word, (unsigned short __user *) d);
1308#endif /* PECULIAR_486 */
1309 FPU_put_user(status_word(), (unsigned short __user *) (d+2));
1310 FPU_put_user(fpu_tag_word, (unsigned short __user *) (d+4));
1311 FPU_put_user(instruction_address.offset, (unsigned short __user *) (d+6));
1312 FPU_put_user(operand_address.offset, (unsigned short __user *) (d+0x0a));
1313 if ( addr_modes.default_mode == VM86 )
1314 {
1315 FPU_put_user((instruction_address.offset & 0xf0000) >> 4,
1316 (unsigned short __user *) (d+8));
1317 FPU_put_user((operand_address.offset & 0xf0000) >> 4,
1318 (unsigned short __user *) (d+0x0c));
1319 }
1320 else
1321 {
1322 FPU_put_user(instruction_address.selector, (unsigned short __user *) (d+8));
1323 FPU_put_user(operand_address.selector, (unsigned short __user *) (d+0x0c));
1324 }
1325 RE_ENTRANT_CHECK_ON;
1326 d += 0x0e;
1327 }
1328 else
1329 {
1330 RE_ENTRANT_CHECK_OFF;
1331 FPU_access_ok(VERIFY_WRITE, d, 7*4);
1332#ifdef PECULIAR_486
1333 control_word &= ~0xe080;
1334 /* An 80486 sets nearly all of the reserved bits to 1. */
1335 control_word |= 0xffff0040;
1336 partial_status = status_word() | 0xffff0000;
1337 fpu_tag_word |= 0xffff0000;
1338 I387.soft.fcs &= ~0xf8000000;
1339 I387.soft.fos |= 0xffff0000;
1340#endif /* PECULIAR_486 */
1341 if (__copy_to_user(d, &control_word, 7*4))
1342 FPU_abort;
1343 RE_ENTRANT_CHECK_ON;
1344 d += 0x1c;
1345 }
1346
1347 control_word |= CW_Exceptions;
1348 partial_status &= ~(SW_Summary | SW_Backward);
1349
1350 return d;
1351}
1352
1353
1354void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
1355{
1356 u_char __user *d;
1357 int offset = (top & 7) * 10, other = 80 - offset;
1358
1359 d = fstenv(addr_modes, data_address);
1360
1361 RE_ENTRANT_CHECK_OFF;
1362 FPU_access_ok(VERIFY_WRITE,d,80);
1363
1364 /* Copy all registers in stack order. */
1365 if (__copy_to_user(d, register_base+offset, other))
1366 FPU_abort;
1367 if ( offset )
1368 if (__copy_to_user(d+other, register_base, offset))
1369 FPU_abort;
1370 RE_ENTRANT_CHECK_ON;
1371
1372 finit();
1373}
1374
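frstor and fsave above copy the 80-byte register area in two pieces because the image holds st(0)..st(7) in stack order, starting at 'top', while the emulator keeps the registers in physical order. A stand-alone sketch of that two-part copy (not part of the patch), with illustrative names:

#include <stdio.h>
#include <string.h>

#define REG_BYTES 10

static void save_stack_order(const unsigned char phys[8][REG_BYTES], int top,
                             unsigned char image[8 * REG_BYTES])
{
    int offset = (top & 7) * REG_BYTES;         /* where st(0) lives in phys[] */
    int other  = 8 * REG_BYTES - offset;

    memcpy(image, (const unsigned char *)phys + offset, other);
    if (offset)
        memcpy(image + other, phys, offset);
}

int main(void)
{
    unsigned char phys[8][REG_BYTES], image[8 * REG_BYTES];
    int i;

    for (i = 0; i < 8; i++)
        memset(phys[i], i, REG_BYTES);          /* tag each register with its index */

    save_stack_order(phys, 3, image);
    printf("st(0) came from physical register %d\n", image[0]);   /* prints 3 */
    return 0;
}
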
1375/*===========================================================================*/
diff --git a/arch/x86/math-emu/reg_mul.c b/arch/x86/math-emu/reg_mul.c
new file mode 100644
index 000000000000..40f50b61bc67
--- /dev/null
+++ b/arch/x86/math-emu/reg_mul.c
@@ -0,0 +1,132 @@
1/*---------------------------------------------------------------------------+
2 | reg_mul.c |
3 | |
4 | Multiply one FPU_REG by another, put the result in a destination FPU_REG. |
5 | |
6 | Copyright (C) 1992,1993,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | Returns the tag of the result if no exceptions or errors occurred. |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | The destination may be any FPU_REG, including one of the source FPU_REGs. |
16 +---------------------------------------------------------------------------*/
17
18#include "fpu_emu.h"
19#include "exception.h"
20#include "reg_constant.h"
21#include "fpu_system.h"
22
23
24/*
25 Multiply two registers to give a register result.
26 The sources are st(deststnr) and (b,tagb,signb).
27 The destination is st(deststnr).
28 */
29/* This routine must be called with non-empty source registers */
30int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
31{
32 FPU_REG *a = &st(deststnr);
33 FPU_REG *dest = a;
34 u_char taga = FPU_gettagi(deststnr);
35 u_char saved_sign = getsign(dest);
36 u_char sign = (getsign(a) ^ getsign(b));
37 int tag;
38
39
40 if ( !(taga | tagb) )
41 {
42 /* Both regs Valid, this should be the most common case. */
43
44 tag = FPU_u_mul(a, b, dest, control_w, sign, exponent(a) + exponent(b));
45 if ( tag < 0 )
46 {
47 setsign(dest, saved_sign);
48 return tag;
49 }
50 FPU_settagi(deststnr, tag);
51 return tag;
52 }
53
54 if ( taga == TAG_Special )
55 taga = FPU_Special(a);
56 if ( tagb == TAG_Special )
57 tagb = FPU_Special(b);
58
59 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
60 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
61 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
62 {
63 FPU_REG x, y;
64 if ( denormal_operand() < 0 )
65 return FPU_Exception;
66
67 FPU_to_exp16(a, &x);
68 FPU_to_exp16(b, &y);
69 tag = FPU_u_mul(&x, &y, dest, control_w, sign,
70 exponent16(&x) + exponent16(&y));
71 if ( tag < 0 )
72 {
73 setsign(dest, saved_sign);
74 return tag;
75 }
76 FPU_settagi(deststnr, tag);
77 return tag;
78 }
79 else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) )
80 {
81 if ( ((tagb == TW_Denormal) || (taga == TW_Denormal))
82 && (denormal_operand() < 0) )
83 return FPU_Exception;
84
85 /* Must have either both arguments == zero, or
86 one valid and the other zero.
87 The result is therefore zero. */
88 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
89 /* The 80486 book says that the answer is +0, but a real
90 80486 gives the signed zero produced here.
91 IEEE-754 apparently says it should be this way. */
92 setsign(dest, sign);
93 return TAG_Zero;
94 }
95 /* Must have infinities, NaNs, etc */
96 else if ( (taga == TW_NaN) || (tagb == TW_NaN) )
97 {
98 return real_2op_NaN(b, tagb, deststnr, &st(0));
99 }
100 else if ( ((taga == TW_Infinity) && (tagb == TAG_Zero))
101 || ((tagb == TW_Infinity) && (taga == TAG_Zero)) )
102 {
103 return arith_invalid(deststnr); /* Zero*Infinity is invalid */
104 }
105 else if ( ((taga == TW_Denormal) || (tagb == TW_Denormal))
106 && (denormal_operand() < 0) )
107 {
108 return FPU_Exception;
109 }
110 else if (taga == TW_Infinity)
111 {
112 FPU_copy_to_regi(a, TAG_Special, deststnr);
113 setsign(dest, sign);
114 return TAG_Special;
115 }
116 else if (tagb == TW_Infinity)
117 {
118 FPU_copy_to_regi(b, TAG_Special, deststnr);
119 setsign(dest, sign);
120 return TAG_Special;
121 }
122
123#ifdef PARANOID
124 else
125 {
126 EXCEPTION(EX_INTERNAL|0x102);
127 return FPU_Exception;
128 }
129#endif /* PARANOID */
130
131 return 0;
132}
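
FPU_mul above only sorts out tags, signs and special operands; the significand work is done by the FPU_u_mul helper added elsewhere in this series. As a rough picture of that step, here is a stand-alone sketch (not part of the patch) of an unsigned significand multiply: 128-bit product, exponents added, at most one renormalising shift. The names and the exponent convention (bit 63 of the significand carries weight 2^exp) are illustrative; unsigned __int128 is a gcc/clang extension.

#include <stdint.h>
#include <stdio.h>

static void u_mul(uint64_t a_sig, int a_exp, uint64_t b_sig, int b_exp,
                  uint64_t *r_sig, int *r_exp)
{
    unsigned __int128 prod = (unsigned __int128)a_sig * b_sig;

    *r_exp = a_exp + b_exp + 1;  /* bit 127 of the product has weight 2^(a_exp+b_exp+1) */
    if (!(prod >> 127)) {        /* the product of two [1,2) values lies in [1,4) */
        prod <<= 1;              /* renormalise: bring the ms bit back to bit 127 */
        (*r_exp)--;
    }
    *r_sig = (uint64_t)(prod >> 64);   /* keep the top 64 bits; lower bits feed rounding */
}

int main(void)
{
    uint64_t sig;
    int exp;

    /* 1.5 * 1.5 = 2.25, i.e. 1.125 * 2^1 */
    u_mul(0xc000000000000000ULL, 0, 0xc000000000000000ULL, 0, &sig, &exp);
    printf("sig=%016llx exp=%d\n", (unsigned long long)sig, exp);
    return 0;
}
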
diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S
new file mode 100644
index 000000000000..8b6352efceef
--- /dev/null
+++ b/arch/x86/math-emu/reg_norm.S
@@ -0,0 +1,147 @@
1/*---------------------------------------------------------------------------+
2 | reg_norm.S |
3 | |
4 | Copyright (C) 1992,1993,1994,1995,1997 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@suburbia.net |
7 | |
8 | Normalize the value in a FPU_REG. |
9 | |
10 | Call from C as: |
11 | int FPU_normalize(FPU_REG *n) |
12 | |
13 | int FPU_normalize_nuo(FPU_REG *n) |
14 | |
15 | Return value is the tag of the answer, or-ed with FPU_Exception if |
16 | one was raised, or -1 on internal error. |
17 | |
18 +---------------------------------------------------------------------------*/
19
20#include "fpu_emu.h"
21
22
23.text
24ENTRY(FPU_normalize)
25 pushl %ebp
26 movl %esp,%ebp
27 pushl %ebx
28
29 movl PARAM1,%ebx
30
31 movl SIGH(%ebx),%edx
32 movl SIGL(%ebx),%eax
33
34 orl %edx,%edx /* ms bits */
35 js L_done /* Already normalized */
36 jnz L_shift_1 /* Shift left 1 - 31 bits */
37
38 orl %eax,%eax
39 jz L_zero /* The contents are zero */
40
41 movl %eax,%edx
42 xorl %eax,%eax
43 subw $32,EXP(%ebx) /* This can cause an underflow */
44
45/* We need to shift left by 1 - 31 bits */
46L_shift_1:
47 bsrl %edx,%ecx /* get the required shift in %ecx */
48 subl $31,%ecx
49 negl %ecx
50 shld %cl,%eax,%edx
51 shl %cl,%eax
52 subw %cx,EXP(%ebx) /* This can cause an underflow */
53
54 movl %edx,SIGH(%ebx)
55 movl %eax,SIGL(%ebx)
56
57L_done:
58 cmpw EXP_OVER,EXP(%ebx)
59 jge L_overflow
60
61 cmpw EXP_UNDER,EXP(%ebx)
62 jle L_underflow
63
64L_exit_valid:
65 movl TAG_Valid,%eax
66
67 /* Convert the exponent to 80x87 form. */
68 addw EXTENDED_Ebias,EXP(%ebx)
69 andw $0x7fff,EXP(%ebx)
70
71L_exit:
72 popl %ebx
73 leave
74 ret
75
76
77L_zero:
78 movw $0,EXP(%ebx)
79 movl TAG_Zero,%eax
80 jmp L_exit
81
82L_underflow:
83 /* Convert the exponent to 80x87 form. */
84 addw EXTENDED_Ebias,EXP(%ebx)
85 push %ebx
86 call arith_underflow
87 pop %ebx
88 jmp L_exit
89
90L_overflow:
91 /* Convert the exponent to 80x87 form. */
92 addw EXTENDED_Ebias,EXP(%ebx)
93 push %ebx
94 call arith_overflow
95 pop %ebx
96 jmp L_exit
97
98
99
100/* Normalise without reporting underflow or overflow */
101ENTRY(FPU_normalize_nuo)
102 pushl %ebp
103 movl %esp,%ebp
104 pushl %ebx
105
106 movl PARAM1,%ebx
107
108 movl SIGH(%ebx),%edx
109 movl SIGL(%ebx),%eax
110
111 orl %edx,%edx /* ms bits */
112 js L_exit_nuo_valid /* Already normalized */
113 jnz L_nuo_shift_1 /* Shift left 1 - 31 bits */
114
115 orl %eax,%eax
116 jz L_exit_nuo_zero /* The contents are zero */
117
118 movl %eax,%edx
119 xorl %eax,%eax
120 subw $32,EXP(%ebx) /* This can cause an underflow */
121
122/* We need to shift left by 1 - 31 bits */
123L_nuo_shift_1:
124 bsrl %edx,%ecx /* get the required shift in %ecx */
125 subl $31,%ecx
126 negl %ecx
127 shld %cl,%eax,%edx
128 shl %cl,%eax
129 subw %cx,EXP(%ebx) /* This can cause an underflow */
130
131 movl %edx,SIGH(%ebx)
132 movl %eax,SIGL(%ebx)
133
134L_exit_nuo_valid:
135 movl TAG_Valid,%eax
136
137 popl %ebx
138 leave
139 ret
140
141L_exit_nuo_zero:
142 movl TAG_Zero,%eax
143 movw EXP_UNDER,EXP(%ebx)
144
145 popl %ebx
146 leave
147 ret
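
FPU_normalize above uses bsr and shld to shift the significand up in one step. Below is a plain-C rendering of the same operation (not part of the patch); the names and the exponent convention (the value is sig * 2^(exp - 63)) are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Returns 0 for a zero value, 1 for a normalized value. */
static int normalize(uint64_t *sig, int *exp)
{
    if (*sig == 0)
        return 0;

    while (!(*sig & 0x8000000000000000ULL)) {
        *sig <<= 1;      /* the assembly uses bsr + shld to do this in one step */
        (*exp)--;
    }
    return 1;
}

int main(void)
{
    uint64_t sig = 0x0000000000000005ULL;
    int exp = 0;

    /* value = sig * 2^(exp - 63); normalization preserves it */
    normalize(&sig, &exp);
    printf("sig=%016llx exp=%d\n", (unsigned long long)sig, exp);
    return 0;
}
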
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
new file mode 100644
index 000000000000..d1d4e48b4f67
--- /dev/null
+++ b/arch/x86/math-emu/reg_round.S
@@ -0,0 +1,708 @@
1 .file "reg_round.S"
2/*---------------------------------------------------------------------------+
3 | reg_round.S |
4 | |
5 | Rounding/truncation/etc for FPU basic arithmetic functions. |
6 | |
7 | Copyright (C) 1993,1995,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@suburbia.net |
10 | |
11 | This code has four possible entry points. |
12 | The following must be entered by a jmp instruction: |
13 | fpu_reg_round, fpu_reg_round_sqrt, and fpu_Arith_exit. |
14 | |
15 | The FPU_round entry point is intended to be used by C code. |
16 | From C, call as: |
17 | int FPU_round(FPU_REG *arg, unsigned int extent, unsigned int control_w) |
18 | |
19 | Return value is the tag of the answer, or-ed with FPU_Exception if |
20 | one was raised, or -1 on internal error. |
21 | |
22 | For correct "up" and "down" rounding, the argument must have the correct |
23 | sign. |
24 | |
25 +---------------------------------------------------------------------------*/
26
27/*---------------------------------------------------------------------------+
28 | Four entry points. |
29 | |
30 | Needed by both the fpu_reg_round and fpu_reg_round_sqrt entry points: |
31 | %eax:%ebx 64 bit significand |
32 | %edx 32 bit extension of the significand |
33 | %edi pointer to an FPU_REG for the result to be stored |
34 | stack calling function must have set up a C stack frame and |
35 | pushed %esi, %edi, and %ebx |
36 | |
37 | Needed just for the fpu_reg_round_sqrt entry point: |
38 | %cx A control word in the same format as the FPU control word. |
39 | Otherwise, PARAM4 must give such a value. |
40 | |
41 | |
42 | The significand and its extension are assumed to be exact in the |
43 | following sense: |
44 | If the significand by itself is the exact result then the significand |
45 | extension (%edx) must contain 0, otherwise the significand extension |
46 | must be non-zero. |
47 | If the significand extension is non-zero then the significand is |
48 | smaller than the magnitude of the correct exact result by an amount |
49 | greater than zero and less than one ls bit of the significand. |
50 | The significand extension is only required to have three possible |
51 | non-zero values: |
52 | less than 0x80000000 <=> the significand is less than 1/2 an ls |
53 | bit smaller than the magnitude of the |
54 | true exact result. |
55 | exactly 0x80000000 <=> the significand is exactly 1/2 an ls bit |
56 | smaller than the magnitude of the true |
57 | exact result. |
58 | greater than 0x80000000 <=> the significand is more than 1/2 an ls |
59 | bit smaller than the magnitude of the |
60 | true exact result. |
61 | |
62 +---------------------------------------------------------------------------*/
63
64/*---------------------------------------------------------------------------+
65 | The code in this module has become quite complex, but it should handle |
66 | all of the FPU flags which are set at this stage of the basic arithmetic |
67 | computations. |
68 | There are a few rare cases where the results are not set identically to |
69 | a real FPU. These require a bit more thought because at this stage the |
70 | results of the code here appear to be more consistent... |
71 | This may be changed in a future version. |
72 +---------------------------------------------------------------------------*/
73
74
75#include "fpu_emu.h"
76#include "exception.h"
77#include "control_w.h"
78
79/* Flags for FPU_bits_lost */
80#define LOST_DOWN $1
81#define LOST_UP $2
82
83/* Flags for FPU_denormal */
84#define DENORMAL $1
85#define UNMASKED_UNDERFLOW $2
86
87
88#ifndef NON_REENTRANT_FPU
89/* Make the code re-entrant by putting
90 local storage on the stack: */
91#define FPU_bits_lost (%esp)
92#define FPU_denormal 1(%esp)
93
94#else
95/* Not re-entrant, so we can gain speed by putting
96 local storage in a static area: */
97.data
98 .align 4,0
99FPU_bits_lost:
100 .byte 0
101FPU_denormal:
102 .byte 0
103#endif /* NON_REENTRANT_FPU */
104
105
106.text
107.globl fpu_reg_round
108.globl fpu_Arith_exit
109
110/* Entry point when called from C */
111ENTRY(FPU_round)
112 pushl %ebp
113 movl %esp,%ebp
114 pushl %esi
115 pushl %edi
116 pushl %ebx
117
118 movl PARAM1,%edi
119 movl SIGH(%edi),%eax
120 movl SIGL(%edi),%ebx
121 movl PARAM2,%edx
122
123fpu_reg_round: /* Normal entry point */
124 movl PARAM4,%ecx
125
126#ifndef NON_REENTRANT_FPU
127 pushl %ebx /* adjust the stack pointer */
128#endif /* NON_REENTRANT_FPU */
129
130#ifdef PARANOID
131/* Cannot use this here yet */
132/* orl %eax,%eax */
133/* jns L_entry_bugged */
134#endif /* PARANOID */
135
136 cmpw EXP_UNDER,EXP(%edi)
137 jle L_Make_denorm /* The number is a de-normal */
138
139 movb $0,FPU_denormal /* 0 -> not a de-normal */
140
141Denorm_done:
142 movb $0,FPU_bits_lost /* No bits yet lost in rounding */
143
144 movl %ecx,%esi
145 andl CW_PC,%ecx
146 cmpl PR_64_BITS,%ecx
147 je LRound_To_64
148
149 cmpl PR_53_BITS,%ecx
150 je LRound_To_53
151
152 cmpl PR_24_BITS,%ecx
153 je LRound_To_24
154
155#ifdef PECULIAR_486
156/* With the precision control bits set to 01 "(reserved)", a real 80486
157 behaves as if the precision control bits were set to 11 "64 bits" */
158 cmpl PR_RESERVED_BITS,%ecx
159 je LRound_To_64
160#ifdef PARANOID
161 jmp L_bugged_denorm_486
162#endif /* PARANOID */
163#else
164#ifdef PARANOID
165 jmp L_bugged_denorm /* There is no bug, just a bad control word */
166#endif /* PARANOID */
167#endif /* PECULIAR_486 */
168
169
170/* Round etc to 24 bit precision */
171LRound_To_24:
172 movl %esi,%ecx
173 andl CW_RC,%ecx
174 cmpl RC_RND,%ecx
175 je LRound_nearest_24
176
177 cmpl RC_CHOP,%ecx
178 je LCheck_truncate_24
179
180 cmpl RC_UP,%ecx /* Towards +infinity */
181 je LUp_24
182
183 cmpl RC_DOWN,%ecx /* Towards -infinity */
184 je LDown_24
185
186#ifdef PARANOID
187 jmp L_bugged_round24
188#endif /* PARANOID */
189
190LUp_24:
191 cmpb SIGN_POS,PARAM5
192 jne LCheck_truncate_24 /* If negative then up==truncate */
193
194 jmp LCheck_24_round_up
195
196LDown_24:
197 cmpb SIGN_POS,PARAM5
198 je LCheck_truncate_24 /* If positive then down==truncate */
199
200LCheck_24_round_up:
201 movl %eax,%ecx
202 andl $0x000000ff,%ecx
203 orl %ebx,%ecx
204 orl %edx,%ecx
205 jnz LDo_24_round_up
206 jmp L_Re_normalise
207
208LRound_nearest_24:
209 /* Do rounding of the 24th bit if needed (nearest or even) */
210 movl %eax,%ecx
211 andl $0x000000ff,%ecx
212 cmpl $0x00000080,%ecx
213 jc LCheck_truncate_24 /* less than half, no increment needed */
214
215 jne LGreater_Half_24 /* greater than half, increment needed */
216
217 /* Possibly half, we need to check the ls bits */
218 orl %ebx,%ebx
219 jnz LGreater_Half_24 /* greater than half, increment needed */
220
221 orl %edx,%edx
222 jnz LGreater_Half_24 /* greater than half, increment needed */
223
224 /* Exactly half, increment only if 24th bit is 1 (round to even) */
225 testl $0x00000100,%eax
226 jz LDo_truncate_24
227
228LGreater_Half_24: /* Rounding: increment at the 24th bit */
229LDo_24_round_up:
230 andl $0xffffff00,%eax /* Truncate to 24 bits */
231 xorl %ebx,%ebx
232 movb LOST_UP,FPU_bits_lost
233 addl $0x00000100,%eax
234 jmp LCheck_Round_Overflow
235
236LCheck_truncate_24:
237 movl %eax,%ecx
238 andl $0x000000ff,%ecx
239 orl %ebx,%ecx
240 orl %edx,%ecx
241 jz L_Re_normalise /* No truncation needed */
242
243LDo_truncate_24:
244 andl $0xffffff00,%eax /* Truncate to 24 bits */
245 xorl %ebx,%ebx
246 movb LOST_DOWN,FPU_bits_lost
247 jmp L_Re_normalise
248
249
250/* Round etc to 53 bit precision */
251LRound_To_53:
252 movl %esi,%ecx
253 andl CW_RC,%ecx
254 cmpl RC_RND,%ecx
255 je LRound_nearest_53
256
257 cmpl RC_CHOP,%ecx
258 je LCheck_truncate_53
259
260 cmpl RC_UP,%ecx /* Towards +infinity */
261 je LUp_53
262
263 cmpl RC_DOWN,%ecx /* Towards -infinity */
264 je LDown_53
265
266#ifdef PARANOID
267 jmp L_bugged_round53
268#endif /* PARANOID */
269
270LUp_53:
271 cmpb SIGN_POS,PARAM5
272 jne LCheck_truncate_53 /* If negative then up==truncate */
273
274 jmp LCheck_53_round_up
275
276LDown_53:
277 cmpb SIGN_POS,PARAM5
278 je LCheck_truncate_53 /* If positive then down==truncate */
279
280LCheck_53_round_up:
281 movl %ebx,%ecx
282 andl $0x000007ff,%ecx
283 orl %edx,%ecx
284 jnz LDo_53_round_up
285 jmp L_Re_normalise
286
287LRound_nearest_53:
288 /* Do rounding of the 53rd bit if needed (nearest or even) */
289 movl %ebx,%ecx
290 andl $0x000007ff,%ecx
291 cmpl $0x00000400,%ecx
292 jc LCheck_truncate_53 /* less than half, no increment needed */
293
294 jnz LGreater_Half_53 /* greater than half, increment needed */
295
296 /* Possibly half, we need to check the ls bits */
297 orl %edx,%edx
298 jnz LGreater_Half_53 /* greater than half, increment needed */
299
300 /* Exactly half, increment only if 53rd bit is 1 (round to even) */
301 testl $0x00000800,%ebx
302 jz LTruncate_53
303
304LGreater_Half_53: /* Rounding: increment at the 53rd bit */
305LDo_53_round_up:
306 movb LOST_UP,FPU_bits_lost
307 andl $0xfffff800,%ebx /* Truncate to 53 bits */
308 addl $0x00000800,%ebx
309 adcl $0,%eax
310 jmp LCheck_Round_Overflow
311
312LCheck_truncate_53:
313 movl %ebx,%ecx
314 andl $0x000007ff,%ecx
315 orl %edx,%ecx
316 jz L_Re_normalise
317
318LTruncate_53:
319 movb LOST_DOWN,FPU_bits_lost
320 andl $0xfffff800,%ebx /* Truncate to 53 bits */
321 jmp L_Re_normalise
322
323
324/* Round etc to 64 bit precision */
325LRound_To_64:
326 movl %esi,%ecx
327 andl CW_RC,%ecx
328 cmpl RC_RND,%ecx
329 je LRound_nearest_64
330
331 cmpl RC_CHOP,%ecx
332 je LCheck_truncate_64
333
334 cmpl RC_UP,%ecx /* Towards +infinity */
335 je LUp_64
336
337 cmpl RC_DOWN,%ecx /* Towards -infinity */
338 je LDown_64
339
340#ifdef PARANOID
341 jmp L_bugged_round64
342#endif /* PARANOID */
343
344LUp_64:
345 cmpb SIGN_POS,PARAM5
346 jne LCheck_truncate_64 /* If negative then up==truncate */
347
348 orl %edx,%edx
349 jnz LDo_64_round_up
350 jmp L_Re_normalise
351
352LDown_64:
353 cmpb SIGN_POS,PARAM5
354 je LCheck_truncate_64 /* If positive then down==truncate */
355
356 orl %edx,%edx
357 jnz LDo_64_round_up
358 jmp L_Re_normalise
359
360LRound_nearest_64:
361 cmpl $0x80000000,%edx
362 jc LCheck_truncate_64
363
364 jne LDo_64_round_up
365
366 /* Now test for round-to-even */
367 testb $1,%bl
368 jz LCheck_truncate_64
369
370LDo_64_round_up:
371 movb LOST_UP,FPU_bits_lost
372 addl $1,%ebx
373 adcl $0,%eax
374
375LCheck_Round_Overflow:
376 jnc L_Re_normalise
377
378 /* Overflow, adjust the result (significand to 1.0) */
379 rcrl $1,%eax
380 rcrl $1,%ebx
381 incw EXP(%edi)
382 jmp L_Re_normalise
383
384LCheck_truncate_64:
385 orl %edx,%edx
386 jz L_Re_normalise
387
388LTruncate_64:
389 movb LOST_DOWN,FPU_bits_lost
390
391L_Re_normalise:
392 testb $0xff,FPU_denormal
393 jnz Normalise_result
394
395L_Normalised:
396 movl TAG_Valid,%edx
397
398L_deNormalised:
399 cmpb LOST_UP,FPU_bits_lost
400 je L_precision_lost_up
401
402 cmpb LOST_DOWN,FPU_bits_lost
403 je L_precision_lost_down
404
405L_no_precision_loss:
406 /* store the result */
407
408L_Store_significand:
409 movl %eax,SIGH(%edi)
410 movl %ebx,SIGL(%edi)
411
412 cmpw EXP_OVER,EXP(%edi)
413 jge L_overflow
414
415 movl %edx,%eax
416
417 /* Convert the exponent to 80x87 form. */
418 addw EXTENDED_Ebias,EXP(%edi)
419 andw $0x7fff,EXP(%edi)
420
421fpu_reg_round_signed_special_exit:
422
423 cmpb SIGN_POS,PARAM5
424 je fpu_reg_round_special_exit
425
426 orw $0x8000,EXP(%edi) /* Negative sign for the result. */
427
428fpu_reg_round_special_exit:
429
430#ifndef NON_REENTRANT_FPU
431 popl %ebx /* adjust the stack pointer */
432#endif /* NON_REENTRANT_FPU */
433
434fpu_Arith_exit:
435 popl %ebx
436 popl %edi
437 popl %esi
438 leave
439 ret
440
441
442/*
443 * Set the FPU status flags to represent precision loss due to
444 * round-up.
445 */
446L_precision_lost_up:
447 push %edx
448 push %eax
449 call set_precision_flag_up
450 popl %eax
451 popl %edx
452 jmp L_no_precision_loss
453
454/*
455 * Set the FPU status flags to represent precision loss due to
456 * truncation.
457 */
458L_precision_lost_down:
459 push %edx
460 push %eax
461 call set_precision_flag_down
462 popl %eax
463 popl %edx
464 jmp L_no_precision_loss
465
466
467/*
468 * The number is a denormal (which might get rounded up to a normal)
469 * Shift the number right the required number of bits, which will
470 * have to be undone later...
471 */
472L_Make_denorm:
473 /* The action to be taken depends upon whether the underflow
474 exception is masked */
475 testb CW_Underflow,%cl /* Underflow mask. */
476 jz Unmasked_underflow /* Do not make a denormal. */
477
478 movb DENORMAL,FPU_denormal
479
480 pushl %ecx /* Save */
481 movw EXP_UNDER+1,%cx
482 subw EXP(%edi),%cx
483
484 cmpw $64,%cx /* shrd only works for 0..31 bits */
485 jnc Denorm_shift_more_than_63
486
487 cmpw $32,%cx /* shrd only works for 0..31 bits */
488 jnc Denorm_shift_more_than_32
489
490/*
491 * We got here without jumps by assuming that the most common requirement
492 * is for a small de-normalising shift.
493 * Shift by [1..31] bits
494 */
495 addw %cx,EXP(%edi)
496 orl %edx,%edx /* extension */
497 setne %ch /* Save whether %edx is non-zero */
498 xorl %edx,%edx
499 shrd %cl,%ebx,%edx
500 shrd %cl,%eax,%ebx
501 shr %cl,%eax
502 orb %ch,%dl
503 popl %ecx
504 jmp Denorm_done
505
506/* Shift by [32..63] bits */
507Denorm_shift_more_than_32:
508 addw %cx,EXP(%edi)
509 subb $32,%cl
510 orl %edx,%edx
511 setne %ch
512 orb %ch,%bl
513 xorl %edx,%edx
514 shrd %cl,%ebx,%edx
515 shrd %cl,%eax,%ebx
516 shr %cl,%eax
517 orl %edx,%edx /* test these 32 bits */
518 setne %cl
519 orb %ch,%bl
520 orb %cl,%bl
521 movl %ebx,%edx
522 movl %eax,%ebx
523 xorl %eax,%eax
524 popl %ecx
525 jmp Denorm_done
526
527/* Shift by [64..) bits */
528Denorm_shift_more_than_63:
529 cmpw $64,%cx
530 jne Denorm_shift_more_than_64
531
532/* Exactly 64 bit shift */
533 addw %cx,EXP(%edi)
534 xorl %ecx,%ecx
535 orl %edx,%edx
536 setne %cl
537 orl %ebx,%ebx
538 setne %ch
539 orb %ch,%cl
540 orb %cl,%al
541 movl %eax,%edx
542 xorl %eax,%eax
543 xorl %ebx,%ebx
544 popl %ecx
545 jmp Denorm_done
546
547Denorm_shift_more_than_64:
548 movw EXP_UNDER+1,EXP(%edi)
549/* This is easy, %eax must be non-zero, so.. */
550 movl $1,%edx
551 xorl %eax,%eax
552 xorl %ebx,%ebx
553 popl %ecx
554 jmp Denorm_done
555
556
557Unmasked_underflow:
558 movb UNMASKED_UNDERFLOW,FPU_denormal
559 jmp Denorm_done
560
561
562/* Undo the de-normalisation. */
563Normalise_result:
564 cmpb UNMASKED_UNDERFLOW,FPU_denormal
565 je Signal_underflow
566
567/* The number must be a denormal if we got here. */
568#ifdef PARANOID
569 /* But check it... just in case. */
570 cmpw EXP_UNDER+1,EXP(%edi)
571 jne L_norm_bugged
572#endif /* PARANOID */
573
574#ifdef PECULIAR_486
575 /*
576 * This implements a special feature of 80486 behaviour.
577 * Underflow will be signalled even if the number is
578 * not a denormal after rounding.
579 * This difference occurs only for masked underflow, and not
580 * in the unmasked case.
581 * Actual 80486 behaviour differs from this in some circumstances.
582 */
583 orl %eax,%eax /* ms bits */
584 js LPseudoDenormal /* Will be masked underflow */
585#else
586 orl %eax,%eax /* ms bits */
587 js L_Normalised /* No longer a denormal */
588#endif /* PECULIAR_486 */
589
590 jnz LDenormal_adj_exponent
591
592 orl %ebx,%ebx
593 jz L_underflow_to_zero /* The contents are zero */
594
595LDenormal_adj_exponent:
596 decw EXP(%edi)
597
598LPseudoDenormal:
599 testb $0xff,FPU_bits_lost /* bits lost == underflow */
600 movl TAG_Special,%edx
601 jz L_deNormalised
602
603 /* There must be a masked underflow */
604 push %eax
605 pushl EX_Underflow
606 call EXCEPTION
607 popl %eax
608 popl %eax
609 movl TAG_Special,%edx
610 jmp L_deNormalised
611
612
613/*
614 * The operations resulted in a number too small to represent.
615 * Masked response.
616 */
617L_underflow_to_zero:
618 push %eax
619 call set_precision_flag_down
620 popl %eax
621
622 push %eax
623 pushl EX_Underflow
624 call EXCEPTION
625 popl %eax
626 popl %eax
627
628/* Reduce the exponent to EXP_UNDER */
629 movw EXP_UNDER,EXP(%edi)
630 movl TAG_Zero,%edx
631 jmp L_Store_significand
632
633
634/* The operations resulted in a number too large to represent. */
635L_overflow:
636 addw EXTENDED_Ebias,EXP(%edi) /* Set for unmasked response. */
637 push %edi
638 call arith_overflow
639 pop %edi
640 jmp fpu_reg_round_signed_special_exit
641
642
643Signal_underflow:
644 /* The number may have been changed to a non-denormal */
645 /* by the rounding operations. */
646 cmpw EXP_UNDER,EXP(%edi)
647 jle Do_unmasked_underflow
648
649 jmp L_Normalised
650
651Do_unmasked_underflow:
652 /* Increase the exponent by the magic number */
653 addw $(3*(1<<13)),EXP(%edi)
654 push %eax
655 pushl EX_Underflow
656 call EXCEPTION
657 popl %eax
658 popl %eax
659 jmp L_Normalised
660
661
662#ifdef PARANOID
663#ifdef PECULIAR_486
664L_bugged_denorm_486:
665 pushl EX_INTERNAL|0x236
666 call EXCEPTION
667 popl %ebx
668 jmp L_exception_exit
669#else
670L_bugged_denorm:
671 pushl EX_INTERNAL|0x230
672 call EXCEPTION
673 popl %ebx
674 jmp L_exception_exit
675#endif /* PECULIAR_486 */
676
677L_bugged_round24:
678 pushl EX_INTERNAL|0x231
679 call EXCEPTION
680 popl %ebx
681 jmp L_exception_exit
682
683L_bugged_round53:
684 pushl EX_INTERNAL|0x232
685 call EXCEPTION
686 popl %ebx
687 jmp L_exception_exit
688
689L_bugged_round64:
690 pushl EX_INTERNAL|0x233
691 call EXCEPTION
692 popl %ebx
693 jmp L_exception_exit
694
695L_norm_bugged:
696 pushl EX_INTERNAL|0x234
697 call EXCEPTION
698 popl %ebx
699 jmp L_exception_exit
700
701L_entry_bugged:
702 pushl EX_INTERNAL|0x235
703 call EXCEPTION
704 popl %ebx
705L_exception_exit:
706 mov $-1,%eax
707 jmp fpu_reg_round_special_exit
708#endif /* PARANOID */
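
For readers following the 53-bit path above (LRound_nearest_53 and friends), here is a stand-alone C sketch (not part of the patch) of the same round-to-nearest-even decision on a 64-bit significand with a 32-bit sticky extension; the names are illustrative and the bits-lost bookkeeping is omitted.

#include <stdint.h>
#include <stdio.h>

static uint64_t round53(uint64_t sig, uint32_t ext, int *exp)
{
    uint64_t low = sig & 0x7ff;          /* the 11 bits to be discarded */
    int up = 0;

    if (low > 0x400)
        up = 1;                          /* clearly more than half */
    else if (low == 0x400 && ext)
        up = 1;                          /* half plus sticky bits: more than half */
    else if (low == 0x400 && !ext)
        up = (sig >> 11) & 1;            /* exact tie: round to even */

    sig &= ~(uint64_t)0x7ff;             /* truncate to 53 bits */
    if (up) {
        uint64_t before = sig;

        sig += 0x800;
        if (sig < before) {              /* carry out of bit 63 */
            sig = 0x8000000000000000ULL;
            (*exp)++;
        }
    }
    return sig;
}

int main(void)
{
    int exp = 0;

    /* all ones rounds up and overflows the significand: the exponent goes up by one */
    printf("%016llx exp=%d\n",
           (unsigned long long)round53(0xffffffffffffffffULL, 0, &exp), exp);
    return 0;
}
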
diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S
new file mode 100644
index 000000000000..47c4c2434d85
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_add.S
@@ -0,0 +1,167 @@
1 .file "reg_u_add.S"
2/*---------------------------------------------------------------------------+
3 | reg_u_add.S |
4 | |
5 | Add two valid (TAG_Valid) FPU_REG numbers, of the same sign, and put the |
6 | result in a destination FPU_REG. |
7 | |
8 | Copyright (C) 1992,1993,1995,1997 |
9 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
10 | E-mail billm@suburbia.net |
11 | |
12 | Call from C as: |
13 | int FPU_u_add(FPU_REG *arg1, FPU_REG *arg2, FPU_REG *answ, |
14 | int control_w) |
15 | Return value is the tag of the answer, or-ed with FPU_Exception if |
16 | one was raised, or -1 on internal error. |
17 | |
18 +---------------------------------------------------------------------------*/
19
20/*
21 | Kernel addition routine FPU_u_add(reg *arg1, reg *arg2, reg *answ).
22 | Takes two valid reg f.p. numbers (TAG_Valid), which are
23 | treated as unsigned numbers,
24 | and returns their sum as a TAG_Valid or TAG_Special f.p. number.
25 | The returned number is normalized.
26 | Basic checks are performed if PARANOID is defined.
27 */
28
29#include "exception.h"
30#include "fpu_emu.h"
31#include "control_w.h"
32
33.text
34ENTRY(FPU_u_add)
35 pushl %ebp
36 movl %esp,%ebp
37 pushl %esi
38 pushl %edi
39 pushl %ebx
40
41 movl PARAM1,%esi /* source 1 */
42 movl PARAM2,%edi /* source 2 */
43
44 movl PARAM6,%ecx
45 movl %ecx,%edx
46 subl PARAM7,%ecx /* exp1 - exp2 */
47 jge L_arg1_larger
48
49 /* num1 is smaller */
50 movl SIGL(%esi),%ebx
51 movl SIGH(%esi),%eax
52
53 movl %edi,%esi
54 movl PARAM7,%edx
55 negw %cx
56 jmp L_accum_loaded
57
58L_arg1_larger:
59 /* num1 has larger or equal exponent */
60 movl SIGL(%edi),%ebx
61 movl SIGH(%edi),%eax
62
63L_accum_loaded:
64 movl PARAM3,%edi /* destination */
65 movw %dx,EXP(%edi) /* Copy exponent to destination */
66
67 xorl %edx,%edx /* clear the extension */
68
69#ifdef PARANOID
70 testl $0x80000000,%eax
71 je L_bugged
72
73 testl $0x80000000,SIGH(%esi)
74 je L_bugged
75#endif /* PARANOID */
76
77/* The number to be shifted is in %eax:%ebx:%edx */
78 cmpw $32,%cx /* shrd only works for 0..31 bits */
79 jnc L_more_than_31
80
81/* less than 32 bits */
82 shrd %cl,%ebx,%edx
83 shrd %cl,%eax,%ebx
84 shr %cl,%eax
85 jmp L_shift_done
86
87L_more_than_31:
88 cmpw $64,%cx
89 jnc L_more_than_63
90
91 subb $32,%cl
92 jz L_exactly_32
93
94 shrd %cl,%eax,%edx
95 shr %cl,%eax
96 orl %ebx,%ebx
97 jz L_more_31_no_low /* none of the lowest bits is set */
98
99 orl $1,%edx /* record the fact in the extension */
100
101L_more_31_no_low:
102 movl %eax,%ebx
103 xorl %eax,%eax
104 jmp L_shift_done
105
106L_exactly_32:
107 movl %ebx,%edx
108 movl %eax,%ebx
109 xorl %eax,%eax
110 jmp L_shift_done
111
112L_more_than_63:
113 cmpw $65,%cx
114 jnc L_more_than_64
115
116 movl %eax,%edx
117 orl %ebx,%ebx
118 jz L_more_63_no_low
119
120 orl $1,%edx
121 jmp L_more_63_no_low
122
123L_more_than_64:
124	movl	$1,%edx	/* The shifted nr always has at least one '1' */
125
126L_more_63_no_low:
127 xorl %ebx,%ebx
128 xorl %eax,%eax
129
130L_shift_done:
131 /* Now do the addition */
132 addl SIGL(%esi),%ebx
133 adcl SIGH(%esi),%eax
134 jnc L_round_the_result
135
136 /* Overflow, adjust the result */
137 rcrl $1,%eax
138 rcrl $1,%ebx
139 rcrl $1,%edx
140 jnc L_no_bit_lost
141
142 orl $1,%edx
143
144L_no_bit_lost:
145 incw EXP(%edi)
146
147L_round_the_result:
148 jmp fpu_reg_round /* Round the result */
149
150
151
152#ifdef PARANOID
153/* If we ever get here then we have problems! */
154L_bugged:
155 pushl EX_INTERNAL|0x201
156 call EXCEPTION
157 pop %ebx
158 movl $-1,%eax
159 jmp L_exit
160
161L_exit:
162 popl %ebx
163 popl %edi
164 popl %esi
165 leave
166 ret
167#endif /* PARANOID */
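
The reg_u_add.S routine above aligns the smaller-exponent significand by shifting it right into a 32-bit "extension" word with a sticky low bit, adds, and absorbs any carry with a one-bit right rotate plus an exponent increment. The following is a minimal userspace C sketch of that sequence (an illustration, not the emulator code; names such as u_add_sketch are invented here, and the 64/65-bit shift cases are collapsed into the sticky bit):

#include <stdint.h>
#include <stdio.h>

struct usum { uint64_t sig; uint32_t ext; int exp; };

/* 'big' is assumed to carry the larger (or equal) exponent. */
static struct usum u_add_sketch(uint64_t big, int exp_big,
                                uint64_t small, int exp_small)
{
    struct usum r = { big, 0, exp_big };
    unsigned shift = (unsigned)(exp_big - exp_small);

    if (shift == 0) {
        r.ext = 0;
    } else if (shift < 64) {
        uint64_t lost = small << (64 - shift);   /* shifted-out bits, msb-aligned */
        r.ext = (uint32_t)(lost >> 32);
        if ((uint32_t)lost)
            r.ext |= 1;                          /* sticky bit */
        small >>= shift;
    } else {
        /* The assembly keeps more detail for shifts of 64 and 65 bits;
         * the sketch collapses everything into the sticky bit. */
        r.ext = small ? 1 : 0;
        small = 0;
    }

    r.sig = big + small;
    if (r.sig < big) {                           /* carry out of bit 63 */
        uint32_t out = r.ext & 1;                /* bit rotated out of the extension */
        r.ext = (r.ext >> 1) | ((uint32_t)(r.sig & 1) << 31) | out;
        r.sig = (r.sig >> 1) | (1ULL << 63);
        r.exp++;                                 /* incw EXP(%edi) in the assembly */
    }
    return r;
}

int main(void)
{
    struct usum s = u_add_sketch(0xc000000000000000ULL, 3,
                                 0x8000000000000001ULL, 1);
    printf("sig=%016llx ext=%08x exp=%d\n",
           (unsigned long long)s.sig, s.ext, s.exp);
    return 0;
}

The real routine then jumps to fpu_reg_round, which consumes the extension word to produce the final rounded 53- or 64-bit result.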
diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S
new file mode 100644
index 000000000000..cc00654b6f9a
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_div.S
@@ -0,0 +1,471 @@
1 .file "reg_u_div.S"
2/*---------------------------------------------------------------------------+
3 | reg_u_div.S |
4 | |
5 | Divide one FPU_REG by another and put the result in a destination FPU_REG.|
6 | |
7 | Copyright (C) 1992,1993,1995,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
9 | E-mail billm@suburbia.net |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | Call from C as: |
16 | int FPU_u_div(FPU_REG *a, FPU_REG *b, FPU_REG *dest, |
17 | unsigned int control_word, char *sign) |
18 | |
19 | Does not compute the destination exponent, but does adjust it. |
20 | |
21 | Return value is the tag of the answer, or-ed with FPU_Exception if |
22 | one was raised, or -1 on internal error. |
23 +---------------------------------------------------------------------------*/
24
25#include "exception.h"
26#include "fpu_emu.h"
27#include "control_w.h"
28
29
30/* #define dSIGL(x) (x) */
31/* #define dSIGH(x) 4(x) */
32
33
34#ifndef NON_REENTRANT_FPU
35/*
36 Local storage on the stack:
37 Result: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
38 Overflow flag: ovfl_flag
39 */
40#define FPU_accum_3 -4(%ebp)
41#define FPU_accum_2 -8(%ebp)
42#define FPU_accum_1 -12(%ebp)
43#define FPU_accum_0 -16(%ebp)
44#define FPU_result_1 -20(%ebp)
45#define FPU_result_2 -24(%ebp)
46#define FPU_ovfl_flag -28(%ebp)
47
48#else
49.data
50/*
51 Local storage in a static area:
52 Result: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
53 Overflow flag: ovfl_flag
54 */
55 .align 4,0
56FPU_accum_3:
57 .long 0
58FPU_accum_2:
59 .long 0
60FPU_accum_1:
61 .long 0
62FPU_accum_0:
63 .long 0
64FPU_result_1:
65 .long 0
66FPU_result_2:
67 .long 0
68FPU_ovfl_flag:
69 .byte 0
70#endif /* NON_REENTRANT_FPU */
71
72#define REGA PARAM1
73#define REGB PARAM2
74#define DEST PARAM3
75
76.text
77ENTRY(FPU_u_div)
78 pushl %ebp
79 movl %esp,%ebp
80#ifndef NON_REENTRANT_FPU
81 subl $28,%esp
82#endif /* NON_REENTRANT_FPU */
83
84 pushl %esi
85 pushl %edi
86 pushl %ebx
87
88 movl REGA,%esi
89 movl REGB,%ebx
90 movl DEST,%edi
91
92 movswl EXP(%esi),%edx
93 movswl EXP(%ebx),%eax
94 subl %eax,%edx
95 addl EXP_BIAS,%edx
96
97 /* A denormal and a large number can cause an exponent underflow */
98 cmpl EXP_WAY_UNDER,%edx
99 jg xExp_not_underflow
100
101	 /* Set to a really low value to allow correct handling */
102 movl EXP_WAY_UNDER,%edx
103
104xExp_not_underflow:
105
106 movw %dx,EXP(%edi)
107
108#ifdef PARANOID
109/* testl $0x80000000, SIGH(%esi) // Dividend */
110/* je L_bugged */
111 testl $0x80000000, SIGH(%ebx) /* Divisor */
112 je L_bugged
113#endif /* PARANOID */
114
115/* Check if the divisor can be treated as having just 32 bits */
116 cmpl $0,SIGL(%ebx)
117 jnz L_Full_Division /* Can't do a quick divide */
118
119/* We should be able to zip through the division here */
120 movl SIGH(%ebx),%ecx /* The divisor */
121 movl SIGH(%esi),%edx /* Dividend */
122 movl SIGL(%esi),%eax /* Dividend */
123
124 cmpl %ecx,%edx
125 setaeb FPU_ovfl_flag /* Keep a record */
126 jb L_no_adjust
127
128 subl %ecx,%edx /* Prevent the overflow */
129
130L_no_adjust:
131 /* Divide the 64 bit number by the 32 bit denominator */
132 divl %ecx
133 movl %eax,FPU_result_2
134
135 /* Work on the remainder of the first division */
136 xorl %eax,%eax
137 divl %ecx
138 movl %eax,FPU_result_1
139
140 /* Work on the remainder of the 64 bit division */
141 xorl %eax,%eax
142 divl %ecx
143
144 testb $255,FPU_ovfl_flag /* was the num > denom ? */
145 je L_no_overflow
146
147 /* Do the shifting here */
148 /* increase the exponent */
149 incw EXP(%edi)
150
151 /* shift the mantissa right one bit */
152 stc /* To set the ms bit */
153 rcrl FPU_result_2
154 rcrl FPU_result_1
155 rcrl %eax
156
157L_no_overflow:
158 jmp LRound_precision /* Do the rounding as required */
159
160
161/*---------------------------------------------------------------------------+
162 | Divide: Return arg1/arg2 to arg3. |
163 | |
164 | This routine does not use the exponents of arg1 and arg2, but does |
165 | adjust the exponent of arg3. |
166 | |
167 | The maximum returned value is (ignoring exponents) |
168 | .ffffffff ffffffff |
169 | ------------------ = 1.ffffffff fffffffe |
170 | .80000000 00000000 |
171 | and the minimum is |
172 | .80000000 00000000 |
173 | ------------------ = .80000000 00000001 (rounded) |
174 | .ffffffff ffffffff |
175 | |
176 +---------------------------------------------------------------------------*/
177
178
179L_Full_Division:
180 /* Save extended dividend in local register */
181 movl SIGL(%esi),%eax
182 movl %eax,FPU_accum_2
183 movl SIGH(%esi),%eax
184 movl %eax,FPU_accum_3
185 xorl %eax,%eax
186 movl %eax,FPU_accum_1 /* zero the extension */
187 movl %eax,FPU_accum_0 /* zero the extension */
188
189 movl SIGL(%esi),%eax /* Get the current num */
190 movl SIGH(%esi),%edx
191
192/*----------------------------------------------------------------------*/
193/* Initialization done.
194 Do the first 32 bits. */
195
196 movb $0,FPU_ovfl_flag
197 cmpl SIGH(%ebx),%edx /* Test for imminent overflow */
198 jb LLess_than_1
199 ja LGreater_than_1
200
201 cmpl SIGL(%ebx),%eax
202 jb LLess_than_1
203
204LGreater_than_1:
205/* The dividend is greater or equal, would cause overflow */
206 setaeb FPU_ovfl_flag /* Keep a record */
207
208 subl SIGL(%ebx),%eax
209 sbbl SIGH(%ebx),%edx /* Prevent the overflow */
210 movl %eax,FPU_accum_2
211 movl %edx,FPU_accum_3
212
213LLess_than_1:
214/* At this point, we have a dividend < divisor, with a record of
215 adjustment in FPU_ovfl_flag */
216
217 /* We will divide by a number which is too large */
218 movl SIGH(%ebx),%ecx
219 addl $1,%ecx
220 jnc LFirst_div_not_1
221
222 /* here we need to divide by 100000000h,
223 i.e., no division at all.. */
224 mov %edx,%eax
225 jmp LFirst_div_done
226
227LFirst_div_not_1:
228 divl %ecx /* Divide the numerator by the augmented
229 denom ms dw */
230
231LFirst_div_done:
232 movl %eax,FPU_result_2 /* Put the result in the answer */
233
234 mull SIGH(%ebx) /* mul by the ms dw of the denom */
235
236 subl %eax,FPU_accum_2 /* Subtract from the num local reg */
237 sbbl %edx,FPU_accum_3
238
239 movl FPU_result_2,%eax /* Get the result back */
240 mull SIGL(%ebx) /* now mul the ls dw of the denom */
241
242 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
243 sbbl %edx,FPU_accum_2
244 sbbl $0,FPU_accum_3
245 je LDo_2nd_32_bits /* Must check for non-zero result here */
246
247#ifdef PARANOID
248 jb L_bugged_1
249#endif /* PARANOID */
250
251	 /* need to subtract the denom once more */
252 incl FPU_result_2 /* Correct the answer */
253
254 movl SIGL(%ebx),%eax
255 movl SIGH(%ebx),%edx
256 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
257 sbbl %edx,FPU_accum_2
258
259#ifdef PARANOID
260 sbbl $0,FPU_accum_3
261 jne L_bugged_1 /* Must check for non-zero result here */
262#endif /* PARANOID */
263
264/*----------------------------------------------------------------------*/
265/* Half of the main problem is done, there is just a reduced numerator
266 to handle now.
267 Work with the second 32 bits, FPU_accum_0 not used from now on */
268LDo_2nd_32_bits:
269 movl FPU_accum_2,%edx /* get the reduced num */
270 movl FPU_accum_1,%eax
271
272 /* need to check for possible subsequent overflow */
273 cmpl SIGH(%ebx),%edx
274 jb LDo_2nd_div
275 ja LPrevent_2nd_overflow
276
277 cmpl SIGL(%ebx),%eax
278 jb LDo_2nd_div
279
280LPrevent_2nd_overflow:
281/* The numerator is greater or equal, would cause overflow */
282 /* prevent overflow */
283 subl SIGL(%ebx),%eax
284 sbbl SIGH(%ebx),%edx
285 movl %edx,FPU_accum_2
286 movl %eax,FPU_accum_1
287
288 incl FPU_result_2 /* Reflect the subtraction in the answer */
289
290#ifdef PARANOID
291 je L_bugged_2 /* Can't bump the result to 1.0 */
292#endif /* PARANOID */
293
294LDo_2nd_div:
295 cmpl $0,%ecx /* augmented denom msw */
296 jnz LSecond_div_not_1
297
298 /* %ecx == 0, we are dividing by 1.0 */
299 mov %edx,%eax
300 jmp LSecond_div_done
301
302LSecond_div_not_1:
303 divl %ecx /* Divide the numerator by the denom ms dw */
304
305LSecond_div_done:
306 movl %eax,FPU_result_1 /* Put the result in the answer */
307
308 mull SIGH(%ebx) /* mul by the ms dw of the denom */
309
310 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
311 sbbl %edx,FPU_accum_2
312
313#ifdef PARANOID
314 jc L_bugged_2
315#endif /* PARANOID */
316
317 movl FPU_result_1,%eax /* Get the result back */
318 mull SIGL(%ebx) /* now mul the ls dw of the denom */
319
320 subl %eax,FPU_accum_0 /* Subtract from the num local reg */
321 sbbl %edx,FPU_accum_1 /* Subtract from the num local reg */
322 sbbl $0,FPU_accum_2
323
324#ifdef PARANOID
325 jc L_bugged_2
326#endif /* PARANOID */
327
328 jz LDo_3rd_32_bits
329
330#ifdef PARANOID
331 cmpl $1,FPU_accum_2
332 jne L_bugged_2
333#endif /* PARANOID */
334
335	 /* need to subtract the denom once more */
336 movl SIGL(%ebx),%eax
337 movl SIGH(%ebx),%edx
338 subl %eax,FPU_accum_0 /* Subtract from the num local reg */
339 sbbl %edx,FPU_accum_1
340 sbbl $0,FPU_accum_2
341
342#ifdef PARANOID
343 jc L_bugged_2
344 jne L_bugged_2
345#endif /* PARANOID */
346
347 addl $1,FPU_result_1 /* Correct the answer */
348 adcl $0,FPU_result_2
349
350#ifdef PARANOID
351 jc L_bugged_2 /* Must check for non-zero result here */
352#endif /* PARANOID */
353
354/*----------------------------------------------------------------------*/
355/* The division is essentially finished here, we just need to perform
356 tidying operations.
357 Deal with the 3rd 32 bits */
358LDo_3rd_32_bits:
359 movl FPU_accum_1,%edx /* get the reduced num */
360 movl FPU_accum_0,%eax
361
362 /* need to check for possible subsequent overflow */
363 cmpl SIGH(%ebx),%edx /* denom */
364 jb LRound_prep
365 ja LPrevent_3rd_overflow
366
367 cmpl SIGL(%ebx),%eax /* denom */
368 jb LRound_prep
369
370LPrevent_3rd_overflow:
371 /* prevent overflow */
372 subl SIGL(%ebx),%eax
373 sbbl SIGH(%ebx),%edx
374 movl %edx,FPU_accum_1
375 movl %eax,FPU_accum_0
376
377 addl $1,FPU_result_1 /* Reflect the subtraction in the answer */
378 adcl $0,FPU_result_2
379 jne LRound_prep
380 jnc LRound_prep
381
382 /* This is a tricky spot, there is an overflow of the answer */
383 movb $255,FPU_ovfl_flag /* Overflow -> 1.000 */
384
385LRound_prep:
386/*
387 * Prepare for rounding.
388 * To test for rounding, we just need to compare 2*accum with the
389 * denom.
390 */
391 movl FPU_accum_0,%ecx
392 movl FPU_accum_1,%edx
393 movl %ecx,%eax
394 orl %edx,%eax
395 jz LRound_ovfl /* The accumulator contains zero. */
396
397 /* Multiply by 2 */
398 clc
399 rcll $1,%ecx
400 rcll $1,%edx
401 jc LRound_large /* No need to compare, denom smaller */
402
403 subl SIGL(%ebx),%ecx
404 sbbl SIGH(%ebx),%edx
405 jnc LRound_not_small
406
407 movl $0x70000000,%eax /* Denom was larger */
408 jmp LRound_ovfl
409
410LRound_not_small:
411 jnz LRound_large
412
413 movl $0x80000000,%eax /* Remainder was exactly 1/2 denom */
414 jmp LRound_ovfl
415
416LRound_large:
417 movl $0xff000000,%eax /* Denom was smaller */
418
419LRound_ovfl:
420/* We are now ready to deal with rounding, but first we must get
421 the bits properly aligned */
422 testb $255,FPU_ovfl_flag /* was the num > denom ? */
423 je LRound_precision
424
425 incw EXP(%edi)
426
427 /* shift the mantissa right one bit */
428 stc /* Will set the ms bit */
429 rcrl FPU_result_2
430 rcrl FPU_result_1
431 rcrl %eax
432
433/* Round the result as required */
434LRound_precision:
435 decw EXP(%edi) /* binary point between 1st & 2nd bits */
436
437 movl %eax,%edx
438 movl FPU_result_1,%ebx
439 movl FPU_result_2,%eax
440 jmp fpu_reg_round
441
442
443#ifdef PARANOID
444/* The logic is wrong if we got here */
445L_bugged:
446 pushl EX_INTERNAL|0x202
447 call EXCEPTION
448 pop %ebx
449 jmp L_exit
450
451L_bugged_1:
452 pushl EX_INTERNAL|0x203
453 call EXCEPTION
454 pop %ebx
455 jmp L_exit
456
457L_bugged_2:
458 pushl EX_INTERNAL|0x204
459 call EXCEPTION
460 pop %ebx
461 jmp L_exit
462
463L_exit:
464 movl $-1,%eax
465 popl %ebx
466 popl %edi
467 popl %esi
468
469 leave
470 ret
471#endif /* PARANOID */
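
The quick path of FPU_u_div above (taken when the divisor's low 32 bits are zero) chains three 64-by-32 divisions to get a 64-bit quotient plus a 32-bit rounding word, pre-subtracting the divisor once when the dividend is not smaller so that divl cannot overflow. A small illustrative C sketch of that path follows (names like quick_div_sketch are invented for the example; the full 64-bit-divisor path in the assembly is more involved):

#include <stdint.h>
#include <stdio.h>

struct quot { uint64_t q; uint32_t round; int exp_adjust; };

static struct quot quick_div_sketch(uint64_t num, uint32_t den_hi)
{
    struct quot r = { 0, 0, 0 };
    uint32_t n_hi = (uint32_t)(num >> 32);
    uint32_t n_lo = (uint32_t)num;
    uint64_t rem;

    /* If num >= den, pre-subtract so the first divl cannot overflow and
     * remember to shift the result right by one bit at the end. */
    int adjust = n_hi >= den_hi;
    if (adjust)
        n_hi -= den_hi;

    uint64_t d1 = ((uint64_t)n_hi << 32) | n_lo;     /* first divl */
    uint32_t q_hi = (uint32_t)(d1 / den_hi);
    rem = d1 % den_hi;

    uint64_t d2 = rem << 32;                          /* second divl */
    uint32_t q_lo = (uint32_t)(d2 / den_hi);
    rem = d2 % den_hi;

    uint64_t d3 = rem << 32;                          /* third divl: rounding word */
    r.round = (uint32_t)(d3 / den_hi);

    r.q = ((uint64_t)q_hi << 32) | q_lo;

    if (adjust) {
        /* num was >= den: shift right one bit, setting the new msb,
         * and bump the exponent (stc/rcr + incw EXP in the assembly). */
        r.round = (r.round >> 1) | ((uint32_t)r.q << 31);
        r.q = (r.q >> 1) | (1ULL << 63);
        r.exp_adjust = 1;
    }
    return r;
}

int main(void)
{
    /* 0.75 / 0.5: quotient significand 0xC000... with the exponent bumped */
    struct quot r = quick_div_sketch(0xc000000000000000ULL, 0x80000000u);
    printf("q=%016llx round=%08x exp+=%d\n",
           (unsigned long long)r.q, r.round, r.exp_adjust);
    return 0;
}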
diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S
new file mode 100644
index 000000000000..973f12af97df
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_mul.S
@@ -0,0 +1,148 @@
1 .file "reg_u_mul.S"
2/*---------------------------------------------------------------------------+
3 | reg_u_mul.S |
4 | |
5 | Core multiplication routine |
6 | |
7 | Copyright (C) 1992,1993,1995,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
9 | E-mail billm@suburbia.net |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | Basic multiplication routine. |
16 | Does not check the resulting exponent for overflow/underflow |
17 | |
18 | FPU_u_mul(FPU_REG *a, FPU_REG *b, FPU_REG *c, unsigned int cw); |
19 | |
20 | Internal working is at approx 128 bits. |
21 | Result is rounded to nearest 53 or 64 bits, using "nearest or even". |
22 +---------------------------------------------------------------------------*/
23
24#include "exception.h"
25#include "fpu_emu.h"
26#include "control_w.h"
27
28
29
30#ifndef NON_REENTRANT_FPU
31/* Local storage on the stack: */
32#define FPU_accum_0 -4(%ebp) /* ms word */
33#define FPU_accum_1 -8(%ebp)
34
35#else
36/* Local storage in a static area: */
37.data
38 .align 4,0
39FPU_accum_0:
40 .long 0
41FPU_accum_1:
42 .long 0
43#endif /* NON_REENTRANT_FPU */
44
45
46.text
47ENTRY(FPU_u_mul)
48 pushl %ebp
49 movl %esp,%ebp
50#ifndef NON_REENTRANT_FPU
51 subl $8,%esp
52#endif /* NON_REENTRANT_FPU */
53
54 pushl %esi
55 pushl %edi
56 pushl %ebx
57
58 movl PARAM1,%esi
59 movl PARAM2,%edi
60
61#ifdef PARANOID
62 testl $0x80000000,SIGH(%esi)
63 jz L_bugged
64 testl $0x80000000,SIGH(%edi)
65 jz L_bugged
66#endif /* PARANOID */
67
68 xorl %ecx,%ecx
69 xorl %ebx,%ebx
70
71 movl SIGL(%esi),%eax
72 mull SIGL(%edi)
73 movl %eax,FPU_accum_0
74 movl %edx,FPU_accum_1
75
76 movl SIGL(%esi),%eax
77 mull SIGH(%edi)
78 addl %eax,FPU_accum_1
79 adcl %edx,%ebx
80/* adcl $0,%ecx // overflow here is not possible */
81
82 movl SIGH(%esi),%eax
83 mull SIGL(%edi)
84 addl %eax,FPU_accum_1
85 adcl %edx,%ebx
86 adcl $0,%ecx
87
88 movl SIGH(%esi),%eax
89 mull SIGH(%edi)
90 addl %eax,%ebx
91 adcl %edx,%ecx
92
93 /* Get the sum of the exponents. */
94 movl PARAM6,%eax
95 subl EXP_BIAS-1,%eax
96
97 /* Two denormals can cause an exponent underflow */
98 cmpl EXP_WAY_UNDER,%eax
99 jg Exp_not_underflow
100
101	 /* Set to a really low value to allow correct handling */
102 movl EXP_WAY_UNDER,%eax
103
104Exp_not_underflow:
105
106/* Have now finished with the sources */
107 movl PARAM3,%edi /* Point to the destination */
108 movw %ax,EXP(%edi)
109
110/* Now make sure that the result is normalized */
111 testl $0x80000000,%ecx
112 jnz LResult_Normalised
113
114 /* Normalize by shifting left one bit */
115 shll $1,FPU_accum_0
116 rcll $1,FPU_accum_1
117 rcll $1,%ebx
118 rcll $1,%ecx
119 decw EXP(%edi)
120
121LResult_Normalised:
122 movl FPU_accum_0,%eax
123 movl FPU_accum_1,%edx
124 orl %eax,%eax
125 jz L_extent_zero
126
127 orl $1,%edx
128
129L_extent_zero:
130 movl %ecx,%eax
131 jmp fpu_reg_round
132
133
134#ifdef PARANOID
135L_bugged:
136 pushl EX_INTERNAL|0x205
137 call EXCEPTION
138 pop %ebx
139 jmp L_exit
140
141L_exit:
142 popl %ebx
143 popl %edi
144 popl %esi
145 leave
146 ret
147#endif /* PARANOID */
148
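The four mull instructions in FPU_u_mul implement the textbook 64x64 -> 128 bit product from four 32x32 -> 64 partial products; the lowest 32 bits of the product are then folded into a sticky bit before rounding. A self-contained C sketch of the multiply itself (illustrative only; the helper name is invented here):

#include <stdint.h>
#include <stdio.h>

static void mul64x64_sketch(uint64_t a, uint64_t b,
                            uint64_t *hi, uint64_t *lo)
{
    uint32_t a_lo = (uint32_t)a, a_hi = (uint32_t)(a >> 32);
    uint32_t b_lo = (uint32_t)b, b_hi = (uint32_t)(b >> 32);

    uint64_t p0 = (uint64_t)a_lo * b_lo;   /* bits   0..63  */
    uint64_t p1 = (uint64_t)a_lo * b_hi;   /* bits  32..95  */
    uint64_t p2 = (uint64_t)a_hi * b_lo;   /* bits  32..95  */
    uint64_t p3 = (uint64_t)a_hi * b_hi;   /* bits  64..127 */

    uint64_t mid = (p0 >> 32) + (uint32_t)p1 + (uint32_t)p2;

    *lo = (uint32_t)p0 | (mid << 32);
    *hi = p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32);
}

int main(void)
{
    uint64_t hi, lo;

    /* 2^63 * 2^63 == 2^126:  hi = 0x4000..., lo = 0 */
    mul64x64_sketch(0x8000000000000000ULL, 0x8000000000000000ULL, &hi, &lo);
    printf("hi=%016llx lo=%016llx\n",
           (unsigned long long)hi, (unsigned long long)lo);
    return 0;
}

Because both inputs are normalized (msb set), the 128-bit product has its top bit in position 126 or 127; the single conditional left shift after LResult_Normalised is all the renormalization that is ever needed.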
diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S
new file mode 100644
index 000000000000..1b6c24801d22
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_sub.S
@@ -0,0 +1,272 @@
1 .file "reg_u_sub.S"
2/*---------------------------------------------------------------------------+
3 | reg_u_sub.S |
4 | |
5 | Core floating point subtraction routine. |
6 | |
7 | Copyright (C) 1992,1993,1995,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
9 | E-mail billm@suburbia.net |
10 | |
11 | Call from C as: |
12 | int FPU_u_sub(FPU_REG *arg1, FPU_REG *arg2, FPU_REG *answ, |
13 | int control_w) |
14 | Return value is the tag of the answer, or-ed with FPU_Exception if |
15 | one was raised, or -1 on internal error. |
16 | |
17 +---------------------------------------------------------------------------*/
18
19/*
20 | Kernel subtraction routine FPU_u_sub(reg *arg1, reg *arg2, reg *answ).
21 | Takes two valid reg f.p. numbers (TAG_Valid), which are
22 | treated as unsigned numbers,
23 | and returns their difference as a TAG_Valid or TAG_Zero f.p.
24 | number.
25 | The first number (arg1) must be the larger.
26 | The returned number is normalized.
27 | Basic checks are performed if PARANOID is defined.
28 */
29
30#include "exception.h"
31#include "fpu_emu.h"
32#include "control_w.h"
33
34.text
35ENTRY(FPU_u_sub)
36 pushl %ebp
37 movl %esp,%ebp
38 pushl %esi
39 pushl %edi
40 pushl %ebx
41
42 movl PARAM1,%esi /* source 1 */
43 movl PARAM2,%edi /* source 2 */
44
45 movl PARAM6,%ecx
46 subl PARAM7,%ecx /* exp1 - exp2 */
47
48#ifdef PARANOID
49 /* source 2 is always smaller than source 1 */
50 js L_bugged_1
51
52	testl	$0x80000000,SIGH(%edi)	/* The args are assumed to be normalized */
53 je L_bugged_2
54
55 testl $0x80000000,SIGH(%esi)
56 je L_bugged_2
57#endif /* PARANOID */
58
59/*--------------------------------------+
60 | Form a register holding the |
61 | smaller number |
62 +--------------------------------------*/
63 movl SIGH(%edi),%eax /* register ms word */
64 movl SIGL(%edi),%ebx /* register ls word */
65
66 movl PARAM3,%edi /* destination */
67 movl PARAM6,%edx
68 movw %dx,EXP(%edi) /* Copy exponent to destination */
69
70 xorl %edx,%edx /* register extension */
71
72/*--------------------------------------+
73 | Shift the temporary register |
74 | right the required number of |
75 | places. |
76 +--------------------------------------*/
77
78 cmpw $32,%cx /* shrd only works for 0..31 bits */
79 jnc L_more_than_31
80
81/* less than 32 bits */
82 shrd %cl,%ebx,%edx
83 shrd %cl,%eax,%ebx
84 shr %cl,%eax
85 jmp L_shift_done
86
87L_more_than_31:
88 cmpw $64,%cx
89 jnc L_more_than_63
90
91 subb $32,%cl
92 jz L_exactly_32
93
94 shrd %cl,%eax,%edx
95 shr %cl,%eax
96 orl %ebx,%ebx
97 jz L_more_31_no_low /* none of the lowest bits is set */
98
99 orl $1,%edx /* record the fact in the extension */
100
101L_more_31_no_low:
102 movl %eax,%ebx
103 xorl %eax,%eax
104 jmp L_shift_done
105
106L_exactly_32:
107 movl %ebx,%edx
108 movl %eax,%ebx
109 xorl %eax,%eax
110 jmp L_shift_done
111
112L_more_than_63:
113 cmpw $65,%cx
114 jnc L_more_than_64
115
116 /* Shift right by 64 bits */
117 movl %eax,%edx
118 orl %ebx,%ebx
119 jz L_more_63_no_low
120
121 orl $1,%edx
122 jmp L_more_63_no_low
123
124L_more_than_64:
125 jne L_more_than_65
126
127 /* Shift right by 65 bits */
128 /* Carry is clear if we get here */
129 movl %eax,%edx
130 rcrl %edx
131 jnc L_shift_65_nc
132
133 orl $1,%edx
134 jmp L_more_63_no_low
135
136L_shift_65_nc:
137 orl %ebx,%ebx
138 jz L_more_63_no_low
139
140 orl $1,%edx
141 jmp L_more_63_no_low
142
143L_more_than_65:
144	movl	$1,%edx	/* The shifted nr always has at least one '1' */
145
146L_more_63_no_low:
147 xorl %ebx,%ebx
148 xorl %eax,%eax
149
150L_shift_done:
151L_subtr:
152/*------------------------------+
153 | Do the subtraction |
154 +------------------------------*/
155 xorl %ecx,%ecx
156 subl %edx,%ecx
157 movl %ecx,%edx
158 movl SIGL(%esi),%ecx
159 sbbl %ebx,%ecx
160 movl %ecx,%ebx
161 movl SIGH(%esi),%ecx
162 sbbl %eax,%ecx
163 movl %ecx,%eax
164
165#ifdef PARANOID
166 /* We can never get a borrow */
167 jc L_bugged
168#endif /* PARANOID */
169
170/*--------------------------------------+
171 | Normalize the result |
172 +--------------------------------------*/
173 testl $0x80000000,%eax
174 jnz L_round /* no shifting needed */
175
176 orl %eax,%eax
177 jnz L_shift_1 /* shift left 1 - 31 bits */
178
179 orl %ebx,%ebx
180 jnz L_shift_32 /* shift left 32 - 63 bits */
181
182/*
183 * A rare case, the only one which is non-zero if we got here
184 * is: 1000000 .... 0000
185 * -0111111 .... 1111 1
186 * --------------------
187 * 0000000 .... 0000 1
188 */
189
190 cmpl $0x80000000,%edx
191 jnz L_must_be_zero
192
193 /* Shift left 64 bits */
194 subw $64,EXP(%edi)
195 xchg %edx,%eax
196 jmp fpu_reg_round
197
198L_must_be_zero:
199#ifdef PARANOID
200 orl %edx,%edx
201 jnz L_bugged_3
202#endif /* PARANOID */
203
204 /* The result is zero */
205 movw $0,EXP(%edi) /* exponent */
206 movl $0,SIGL(%edi)
207 movl $0,SIGH(%edi)
208 movl TAG_Zero,%eax
209 jmp L_exit
210
211L_shift_32:
212 movl %ebx,%eax
213 movl %edx,%ebx
214 movl $0,%edx
215 subw $32,EXP(%edi) /* Can get underflow here */
216
217/* We need to shift left by 1 - 31 bits */
218L_shift_1:
219 bsrl %eax,%ecx /* get the required shift in %ecx */
220 subl $31,%ecx
221 negl %ecx
222 shld %cl,%ebx,%eax
223 shld %cl,%edx,%ebx
224 shl %cl,%edx
225 subw %cx,EXP(%edi) /* Can get underflow here */
226
227L_round:
228 jmp fpu_reg_round /* Round the result */
229
230
231#ifdef PARANOID
232L_bugged_1:
233 pushl EX_INTERNAL|0x206
234 call EXCEPTION
235 pop %ebx
236 jmp L_error_exit
237
238L_bugged_2:
239 pushl EX_INTERNAL|0x209
240 call EXCEPTION
241 pop %ebx
242 jmp L_error_exit
243
244L_bugged_3:
245 pushl EX_INTERNAL|0x210
246 call EXCEPTION
247 pop %ebx
248 jmp L_error_exit
249
250L_bugged_4:
251 pushl EX_INTERNAL|0x211
252 call EXCEPTION
253 pop %ebx
254 jmp L_error_exit
255
256L_bugged:
257 pushl EX_INTERNAL|0x212
258 call EXCEPTION
259 pop %ebx
260 jmp L_error_exit
261
262L_error_exit:
263 movl $-1,%eax
264
265#endif /* PARANOID */
266
267L_exit:
268 popl %ebx
269 popl %edi
270 popl %esi
271 leave
272 ret
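
After the borrow-free subtraction, reg_u_sub.S renormalizes the 96-bit result (high:low:extension) by word moves for 32- and 64-bit shifts and a bsrl-computed shift of 1..31 bits, lowering the exponent accordingly. A hedged C sketch of just that normalization step (the function name is invented; __builtin_clzll stands in for bsrl):

#include <stdint.h>
#include <stdio.h>

struct nres { uint64_t sig; uint32_t ext; int exp; };

static struct nres normalize_sketch(uint64_t sig, uint32_t ext, int exp)
{
    struct nres r = { sig, ext, exp };

    if (r.sig >> 63)
        return r;                                /* already normalized */
    if (r.sig == 0 && r.ext == 0)
        return r;                                /* true zero, tagged by the caller */

    /* Word-sized moves first (the L_shift_32 and 64-bit cases) ... */
    while (!(r.sig >> 32)) {
        r.sig = (r.sig << 32) | r.ext;
        r.ext = 0;
        r.exp -= 32;
    }

    /* ... then a bsrl-style shift of 1..31 bits. */
    unsigned n = (unsigned)__builtin_clzll(r.sig);
    if (n) {
        r.sig = (r.sig << n) | (r.ext >> (32 - n));
        r.ext <<= n;
        r.exp -= (int)n;
    }
    return r;
}

int main(void)
{
    /* e.g. 1.0 - 0.5: the raw difference 0x4000... needs one left shift */
    struct nres r = normalize_sketch(0x4000000000000000ULL, 0, 0);
    printf("sig=%016llx ext=%08x exp=%d\n",
           (unsigned long long)r.sig, r.ext, r.exp);
    return 0;
}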
diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S
new file mode 100644
index 000000000000..bbe0e87718e4
--- /dev/null
+++ b/arch/x86/math-emu/round_Xsig.S
@@ -0,0 +1,141 @@
1/*---------------------------------------------------------------------------+
2 | round_Xsig.S |
3 | |
4 | Copyright (C) 1992,1993,1994,1995 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
7 | |
8 | Normalize and round a 12 byte quantity. |
9 | Call from C as: |
10 | int round_Xsig(Xsig *n) |
11 | |
12 | Normalize a 12 byte quantity. |
13 | Call from C as: |
14 | int norm_Xsig(Xsig *n) |
15 | |
16 | Each function returns the size of the shift (nr of bits). |
17 | |
18 +---------------------------------------------------------------------------*/
19 .file "round_Xsig.S"
20
21#include "fpu_emu.h"
22
23
24.text
25ENTRY(round_Xsig)
26 pushl %ebp
27 movl %esp,%ebp
28 pushl %ebx /* Reserve some space */
29 pushl %ebx
30 pushl %esi
31
32 movl PARAM1,%esi
33
34 movl 8(%esi),%edx
35 movl 4(%esi),%ebx
36 movl (%esi),%eax
37
38 movl $0,-4(%ebp)
39
40 orl %edx,%edx /* ms bits */
41 js L_round /* Already normalized */
42 jnz L_shift_1 /* Shift left 1 - 31 bits */
43
44 movl %ebx,%edx
45 movl %eax,%ebx
46 xorl %eax,%eax
47 movl $-32,-4(%ebp)
48
49/* We need to shift left by 1 - 31 bits */
50L_shift_1:
51 bsrl %edx,%ecx /* get the required shift in %ecx */
52 subl $31,%ecx
53 negl %ecx
54 subl %ecx,-4(%ebp)
55 shld %cl,%ebx,%edx
56 shld %cl,%eax,%ebx
57 shl %cl,%eax
58
59L_round:
60 testl $0x80000000,%eax
61 jz L_exit
62
63 addl $1,%ebx
64 adcl $0,%edx
65 jnz L_exit
66
67 movl $0x80000000,%edx
68 incl -4(%ebp)
69
70L_exit:
71 movl %edx,8(%esi)
72 movl %ebx,4(%esi)
73 movl %eax,(%esi)
74
75 movl -4(%ebp),%eax
76
77 popl %esi
78 popl %ebx
79 leave
80 ret
81
82
83
84
85ENTRY(norm_Xsig)
86 pushl %ebp
87 movl %esp,%ebp
88 pushl %ebx /* Reserve some space */
89 pushl %ebx
90 pushl %esi
91
92 movl PARAM1,%esi
93
94 movl 8(%esi),%edx
95 movl 4(%esi),%ebx
96 movl (%esi),%eax
97
98 movl $0,-4(%ebp)
99
100 orl %edx,%edx /* ms bits */
101 js L_n_exit /* Already normalized */
102 jnz L_n_shift_1 /* Shift left 1 - 31 bits */
103
104 movl %ebx,%edx
105 movl %eax,%ebx
106 xorl %eax,%eax
107 movl $-32,-4(%ebp)
108
109 orl %edx,%edx /* ms bits */
110 js L_n_exit /* Normalized now */
111 jnz L_n_shift_1 /* Shift left 1 - 31 bits */
112
113 movl %ebx,%edx
114 movl %eax,%ebx
115 xorl %eax,%eax
116 addl $-32,-4(%ebp)
117 jmp L_n_exit /* Might not be normalized,
118 but shift no more. */
119
120/* We need to shift left by 1 - 31 bits */
121L_n_shift_1:
122 bsrl %edx,%ecx /* get the required shift in %ecx */
123 subl $31,%ecx
124 negl %ecx
125 subl %ecx,-4(%ebp)
126 shld %cl,%ebx,%edx
127 shld %cl,%eax,%ebx
128 shl %cl,%eax
129
130L_n_exit:
131 movl %edx,8(%esi)
132 movl %ebx,4(%esi)
133 movl %eax,(%esi)
134
135 movl -4(%ebp),%eax
136
137 popl %esi
138 popl %ebx
139 leave
140 ret
141
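
round_Xsig normalizes a 96-bit quantity and then rounds it to 64 bits on bit 31 of the low word, returning the (negative) number of left-shift positions; a carry out of the top bit turns the value into 0x8000...0 and bumps that count by one. An illustrative C sketch (not the kernel routine; it normalizes bit by bit for brevity where the assembly uses word moves plus bsrl/shld):

#include <stdint.h>
#include <stdio.h>

struct xsig { uint32_t lsw, midw, msw; };

static int round_xsig_sketch(struct xsig *n)
{
    int shift = 0;
    uint64_t hi = ((uint64_t)n->msw << 32) | n->midw;
    uint32_t lo = n->lsw;

    /* Normalize: bring a 1 into bit 95 of the 96-bit quantity. */
    while (!(hi >> 63) && (hi || lo)) {
        hi = (hi << 1) | (lo >> 31);
        lo <<= 1;
        shift--;
    }

    /* Round to nearest on bit 31 of the low word. */
    if (lo & 0x80000000u) {
        hi++;
        if (hi == 0) {              /* carry ran off the top */
            hi = 1ULL << 63;
            shift++;
        }
    }

    n->msw = (uint32_t)(hi >> 32);
    n->midw = (uint32_t)hi;
    n->lsw = lo;
    return shift;                   /* negative for left shifts, as the asm counts */
}

int main(void)
{
    struct xsig x = { 0x80000000u, 0xffffffffu, 0xffffffffu };
    int s = round_xsig_sketch(&x);
    printf("msw=%08x midw=%08x shift=%d\n", x.msw, x.midw, s);
    return 0;
}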
diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S
new file mode 100644
index 000000000000..31cdd118e918
--- /dev/null
+++ b/arch/x86/math-emu/shr_Xsig.S
@@ -0,0 +1,87 @@
1 .file "shr_Xsig.S"
2/*---------------------------------------------------------------------------+
3 | shr_Xsig.S |
4 | |
5 | 12 byte right shift function |
6 | |
7 | Copyright (C) 1992,1994,1995 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
10 | |
11 | Call from C as: |
12 | void shr_Xsig(Xsig *arg, unsigned nr) |
13 | |
14 | Extended shift right function. |
15 | Fastest for small shifts. |
16 | Shifts the 12 byte quantity pointed to by the first arg (arg) |
17 | right by the number of bits specified by the second arg (nr). |
18 | |
19 +---------------------------------------------------------------------------*/
20
21#include "fpu_emu.h"
22
23.text
24ENTRY(shr_Xsig)
25 push %ebp
26 movl %esp,%ebp
27 pushl %esi
28 movl PARAM2,%ecx
29 movl PARAM1,%esi
30 cmpl $32,%ecx /* shrd only works for 0..31 bits */
31 jnc L_more_than_31
32
33/* less than 32 bits */
34 pushl %ebx
35 movl (%esi),%eax /* lsl */
36 movl 4(%esi),%ebx /* midl */
37 movl 8(%esi),%edx /* msl */
38 shrd %cl,%ebx,%eax
39 shrd %cl,%edx,%ebx
40 shr %cl,%edx
41 movl %eax,(%esi)
42 movl %ebx,4(%esi)
43 movl %edx,8(%esi)
44 popl %ebx
45 popl %esi
46 leave
47 ret
48
49L_more_than_31:
50 cmpl $64,%ecx
51 jnc L_more_than_63
52
53 subb $32,%cl
54 movl 4(%esi),%eax /* midl */
55 movl 8(%esi),%edx /* msl */
56 shrd %cl,%edx,%eax
57 shr %cl,%edx
58 movl %eax,(%esi)
59 movl %edx,4(%esi)
60 movl $0,8(%esi)
61 popl %esi
62 leave
63 ret
64
65L_more_than_63:
66 cmpl $96,%ecx
67 jnc L_more_than_95
68
69 subb $64,%cl
70 movl 8(%esi),%eax /* msl */
71 shr %cl,%eax
72 xorl %edx,%edx
73 movl %eax,(%esi)
74 movl %edx,4(%esi)
75 movl %edx,8(%esi)
76 popl %esi
77 leave
78 ret
79
80L_more_than_95:
81 xorl %eax,%eax
82 movl %eax,(%esi)
83 movl %eax,4(%esi)
84 movl %eax,8(%esi)
85 popl %esi
86 leave
87 ret
diff --git a/arch/x86/math-emu/status_w.h b/arch/x86/math-emu/status_w.h
new file mode 100644
index 000000000000..59e73302aa60
--- /dev/null
+++ b/arch/x86/math-emu/status_w.h
@@ -0,0 +1,67 @@
1/*---------------------------------------------------------------------------+
2 | status_w.h |
3 | |
4 | Copyright (C) 1992,1993 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@vaxc.cc.monash.edu.au |
7 | |
8 +---------------------------------------------------------------------------*/
9
10#ifndef _STATUS_H_
11#define _STATUS_H_
12
13#include "fpu_emu.h" /* for definition of PECULIAR_486 */
14
15#ifdef __ASSEMBLY__
16#define Const__(x) $##x
17#else
18#define Const__(x) x
19#endif
20
21#define SW_Backward Const__(0x8000) /* backward compatibility */
22#define SW_C3 Const__(0x4000) /* condition bit 3 */
23#define SW_Top Const__(0x3800) /* top of stack */
24#define SW_Top_Shift Const__(11) /* shift for top of stack bits */
25#define SW_C2 Const__(0x0400) /* condition bit 2 */
26#define SW_C1 Const__(0x0200) /* condition bit 1 */
27#define SW_C0 Const__(0x0100) /* condition bit 0 */
28#define SW_Summary Const__(0x0080) /* exception summary */
29#define SW_Stack_Fault Const__(0x0040) /* stack fault */
30#define SW_Precision Const__(0x0020) /* loss of precision */
31#define SW_Underflow Const__(0x0010) /* underflow */
32#define SW_Overflow Const__(0x0008) /* overflow */
33#define SW_Zero_Div Const__(0x0004) /* divide by zero */
34#define SW_Denorm_Op Const__(0x0002) /* denormalized operand */
35#define SW_Invalid Const__(0x0001) /* invalid operation */
36
37#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */
38
39#ifndef __ASSEMBLY__
40
41#define COMP_A_gt_B 1
42#define COMP_A_eq_B 2
43#define COMP_A_lt_B 3
44#define COMP_No_Comp 4
45#define COMP_Denormal 0x20
46#define COMP_NaN 0x40
47#define COMP_SNaN 0x80
48
49#define status_word() \
50 ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top))
51static inline void setcc(int cc)
52{
53 partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3);
54 partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3);
55}
56
57#ifdef PECULIAR_486
58 /* Default, this conveys no information, but an 80486 does it. */
59 /* Clear the SW_C1 bit, "other bits undefined". */
60# define clear_C1() { partial_status &= ~SW_C1; }
61# else
62# define clear_C1()
63#endif /* PECULIAR_486 */
64
65#endif /* __ASSEMBLY__ */
66
67#endif /* _STATUS_H_ */
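
The setcc()/status_word() pair above is how the emulator reports comparison results through the x87 condition bits. A hedged, self-contained illustration follows; the real partial_status and top variables live elsewhere in the emulator, so stand-ins are declared here, and the FCOM-style encodings (C3 for equal, C3|C2|C0 for unordered) are standard x87 behaviour rather than anything defined in this header:

#include <stdio.h>

#define SW_C3 0x4000
#define SW_C2 0x0400
#define SW_C1 0x0200
#define SW_C0 0x0100
#define SW_Top 0x3800
#define SW_Top_Shift 11

static unsigned short partial_status;   /* stand-in for the emulator global */
static int top;                          /* stand-in for the stack-top index */

static void setcc(int cc)
{
    partial_status &= ~(SW_C0 | SW_C1 | SW_C2 | SW_C3);
    partial_status |= cc & (SW_C0 | SW_C1 | SW_C2 | SW_C3);
}

static unsigned short status_word(void)
{
    return (partial_status & ~SW_Top & 0xffff) |
           ((top << SW_Top_Shift) & SW_Top);
}

int main(void)
{
    top = 2;
    setcc(SW_C3);                        /* "equal" after a comparison */
    printf("status word = %#x\n", status_word());
    setcc(SW_C0 | SW_C2 | SW_C3);        /* "unordered" (a NaN was involved) */
    printf("status word = %#x\n", status_word());
    return 0;
}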
diff --git a/arch/x86/math-emu/version.h b/arch/x86/math-emu/version.h
new file mode 100644
index 000000000000..a0d73a1d2b67
--- /dev/null
+++ b/arch/x86/math-emu/version.h
@@ -0,0 +1,12 @@
1/*---------------------------------------------------------------------------+
2 | version.h |
3 | |
4 | |
5 | Copyright (C) 1992,1993,1994,1996,1997,1999 |
6 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
7 | E-mail billm@melbpc.org.au |
8 | |
9 | |
10 +---------------------------------------------------------------------------*/
11
12#define FPU_VERSION "wm-FPU-emu version 2.01"
diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S
new file mode 100644
index 000000000000..518428317985
--- /dev/null
+++ b/arch/x86/math-emu/wm_shrx.S
@@ -0,0 +1,204 @@
1 .file "wm_shrx.S"
2/*---------------------------------------------------------------------------+
3 | wm_shrx.S |
4 | |
5 | 64 bit right shift functions |
6 | |
7 | Copyright (C) 1992,1995 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
10 | |
11 | Call from C as: |
12 | unsigned FPU_shrx(void *arg1, unsigned arg2) |
13 | and |
14 | unsigned FPU_shrxs(void *arg1, unsigned arg2) |
15 | |
16 +---------------------------------------------------------------------------*/
17
18#include "fpu_emu.h"
19
20.text
21/*---------------------------------------------------------------------------+
22 | unsigned FPU_shrx(void *arg1, unsigned arg2) |
23 | |
24 | Extended shift right function. |
25 | Fastest for small shifts. |
26 | Shifts the 64 bit quantity pointed to by the first arg (arg1) |
27 | right by the number of bits specified by the second arg (arg2). |
28 | Forms a 96 bit quantity from the 64 bit arg and eax: |
29 | [ 64 bit arg ][ eax ] |
30 | shift right ---------> |
31 | The eax register is initialized to 0 before the shifting. |
32 | Results returned in the 64 bit arg and eax. |
33 +---------------------------------------------------------------------------*/
34
35ENTRY(FPU_shrx)
36 push %ebp
37 movl %esp,%ebp
38 pushl %esi
39 movl PARAM2,%ecx
40 movl PARAM1,%esi
41 cmpl $32,%ecx /* shrd only works for 0..31 bits */
42 jnc L_more_than_31
43
44/* less than 32 bits */
45 pushl %ebx
46 movl (%esi),%ebx /* lsl */
47 movl 4(%esi),%edx /* msl */
48 xorl %eax,%eax /* extension */
49 shrd %cl,%ebx,%eax
50 shrd %cl,%edx,%ebx
51 shr %cl,%edx
52 movl %ebx,(%esi)
53 movl %edx,4(%esi)
54 popl %ebx
55 popl %esi
56 leave
57 ret
58
59L_more_than_31:
60 cmpl $64,%ecx
61 jnc L_more_than_63
62
63 subb $32,%cl
64 movl (%esi),%eax /* lsl */
65 movl 4(%esi),%edx /* msl */
66 shrd %cl,%edx,%eax
67 shr %cl,%edx
68 movl %edx,(%esi)
69 movl $0,4(%esi)
70 popl %esi
71 leave
72 ret
73
74L_more_than_63:
75 cmpl $96,%ecx
76 jnc L_more_than_95
77
78 subb $64,%cl
79 movl 4(%esi),%eax /* msl */
80 shr %cl,%eax
81 xorl %edx,%edx
82 movl %edx,(%esi)
83 movl %edx,4(%esi)
84 popl %esi
85 leave
86 ret
87
88L_more_than_95:
89 xorl %eax,%eax
90 movl %eax,(%esi)
91 movl %eax,4(%esi)
92 popl %esi
93 leave
94 ret
95
96
97/*---------------------------------------------------------------------------+
98 | unsigned FPU_shrxs(void *arg1, unsigned arg2) |
99 | |
100 | Extended shift right function (optimized for small floating point |
101 | integers). |
102 | Shifts the 64 bit quantity pointed to by the first arg (arg1) |
103 | right by the number of bits specified by the second arg (arg2). |
104 | Forms a 96 bit quantity from the 64 bit arg and eax: |
105 | [ 64 bit arg ][ eax ] |
106 | shift right ---------> |
107 | The eax register is initialized to 0 before the shifting. |
108 | The lower 8 bits of eax are lost and replaced by a flag which is |
109 | set (to 0x01) if any bit, apart from the first one, is set in the |
110 | part which has been shifted out of the arg. |
111 | Results returned in the 64 bit arg and eax. |
112 +---------------------------------------------------------------------------*/
113ENTRY(FPU_shrxs)
114 push %ebp
115 movl %esp,%ebp
116 pushl %esi
117 pushl %ebx
118 movl PARAM2,%ecx
119 movl PARAM1,%esi
120	cmpl	$64,%ecx	/* shifts of 64 or more are handled separately */
121 jnc Ls_more_than_63
122
123 cmpl $32,%ecx /* shrd only works for 0..31 bits */
124 jc Ls_less_than_32
125
126/* We got here without jumps by assuming that the most common requirement
127 is for small integers */
128/* Shift by [32..63] bits */
129 subb $32,%cl
130 movl (%esi),%eax /* lsl */
131 movl 4(%esi),%edx /* msl */
132 xorl %ebx,%ebx
133 shrd %cl,%eax,%ebx
134 shrd %cl,%edx,%eax
135 shr %cl,%edx
136 orl %ebx,%ebx /* test these 32 bits */
137 setne %bl
138 test $0x7fffffff,%eax /* and 31 bits here */
139 setne %bh
140 orw %bx,%bx /* Any of the 63 bit set ? */
141 setne %al
142 movl %edx,(%esi)
143 movl $0,4(%esi)
144 popl %ebx
145 popl %esi
146 leave
147 ret
148
149/* Shift by [0..31] bits */
150Ls_less_than_32:
151 movl (%esi),%ebx /* lsl */
152 movl 4(%esi),%edx /* msl */
153 xorl %eax,%eax /* extension */
154 shrd %cl,%ebx,%eax
155 shrd %cl,%edx,%ebx
156 shr %cl,%edx
157 test $0x7fffffff,%eax /* only need to look at eax here */
158 setne %al
159 movl %ebx,(%esi)
160 movl %edx,4(%esi)
161 popl %ebx
162 popl %esi
163 leave
164 ret
165
166/* Shift by [64..95] bits */
167Ls_more_than_63:
168 cmpl $96,%ecx
169 jnc Ls_more_than_95
170
171 subb $64,%cl
172 movl (%esi),%ebx /* lsl */
173 movl 4(%esi),%eax /* msl */
174 xorl %edx,%edx /* extension */
175 shrd %cl,%ebx,%edx
176 shrd %cl,%eax,%ebx
177 shr %cl,%eax
178 orl %ebx,%edx
179 setne %bl
180 test $0x7fffffff,%eax /* only need to look at eax here */
181 setne %bh
182 orw %bx,%bx
183 setne %al
184 xorl %edx,%edx
185 movl %edx,(%esi) /* set to zero */
186 movl %edx,4(%esi) /* set to zero */
187 popl %ebx
188 popl %esi
189 leave
190 ret
191
192Ls_more_than_95:
193/* Shift by [96..inf) bits */
194 xorl %eax,%eax
195 movl (%esi),%ebx
196 orl 4(%esi),%ebx
197 setne %al
198 xorl %ebx,%ebx
199 movl %ebx,(%esi)
200 movl %ebx,4(%esi)
201 popl %ebx
202 popl %esi
203 leave
204 ret
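
FPU_shrxs implements a "sticky" right shift: the shifted-out bits are returned in a 32-bit extension whose lowest byte is replaced by a flag saying whether any bit other than the leading shifted-out bit was lost. A hedged C sketch of that idea (function name invented; shifts of 64 bits or more are collapsed into the flag here, whereas the assembly still preserves the leading bits for 64..95):

#include <stdint.h>
#include <stdio.h>

static uint32_t shrxs_sketch(uint64_t *arg, unsigned nr)
{
    uint64_t v = *arg;
    uint32_t ext;

    if (nr == 0)
        return 0;

    if (nr < 64) {
        uint64_t out = v << (64 - nr);            /* shifted-out bits, msb-aligned */
        *arg = v >> nr;
        ext = (uint32_t)(out >> 32);
        if ((ext & 0x7fffffffu) || (uint32_t)out)
            ext = (ext & 0xffffff00u) | 1;        /* sticky flag in the low byte */
        else
            ext &= 0xffffff00u;
    } else {
        *arg = 0;
        ext = (v != 0);                           /* everything went into the flag */
    }
    return ext;
}

int main(void)
{
    uint64_t x = 0x8000000000000003ULL;
    uint32_t e = shrxs_sketch(&x, 4);
    printf("arg=%016llx ext=%08x\n", (unsigned long long)x, e);
    return 0;
}

Keeping the leading shifted-out bit intact while summarizing the rest in one flag is exactly what the later rounding step needs to decide between round-down, round-up, and the halfway case.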
diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S
new file mode 100644
index 000000000000..d258f59564e1
--- /dev/null
+++ b/arch/x86/math-emu/wm_sqrt.S
@@ -0,0 +1,470 @@
1 .file "wm_sqrt.S"
2/*---------------------------------------------------------------------------+
3 | wm_sqrt.S |
4 | |
5 | Fixed point arithmetic square root evaluation. |
6 | |
7 | Copyright (C) 1992,1993,1995,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@suburbia.net |
10 | |
11 | Call from C as: |
12 | int wm_sqrt(FPU_REG *n, unsigned int control_word) |
13 | |
14 +---------------------------------------------------------------------------*/
15
16/*---------------------------------------------------------------------------+
17 | wm_sqrt(FPU_REG *n, unsigned int control_word) |
18 | returns the square root of n in n. |
19 | |
20 | Use Newton's method to compute the square root of a number, which must |
21 | be in the range [1.0 .. 4.0), to 64 bits accuracy. |
22 | Does not check the sign or tag of the argument. |
23 | Sets the exponent, but not the sign or tag of the result. |
24 | |
25 | The guess is kept in %esi:%edi |
26 +---------------------------------------------------------------------------*/
27
28#include "exception.h"
29#include "fpu_emu.h"
30
31
32#ifndef NON_REENTRANT_FPU
33/* Local storage on the stack: */
34#define FPU_accum_3 -4(%ebp) /* ms word */
35#define FPU_accum_2 -8(%ebp)
36#define FPU_accum_1 -12(%ebp)
37#define FPU_accum_0 -16(%ebp)
38
39/*
40 * The de-normalised argument:
41 * sq_2 sq_1 sq_0
42 * b b b b b b b ... b b b b b b .... b b b b 0 0 0 ... 0
43 * ^ binary point here
44 */
45#define FPU_fsqrt_arg_2 -20(%ebp) /* ms word */
46#define FPU_fsqrt_arg_1 -24(%ebp)
47#define FPU_fsqrt_arg_0 -28(%ebp) /* ls word, at most the ms bit is set */
48
49#else
50/* Local storage in a static area: */
51.data
52 .align 4,0
53FPU_accum_3:
54 .long 0 /* ms word */
55FPU_accum_2:
56 .long 0
57FPU_accum_1:
58 .long 0
59FPU_accum_0:
60 .long 0
61
62/* The de-normalised argument:
63 sq_2 sq_1 sq_0
64 b b b b b b b ... b b b b b b .... b b b b 0 0 0 ... 0
65 ^ binary point here
66 */
67FPU_fsqrt_arg_2:
68 .long 0 /* ms word */
69FPU_fsqrt_arg_1:
70 .long 0
71FPU_fsqrt_arg_0:
72 .long 0 /* ls word, at most the ms bit is set */
73#endif /* NON_REENTRANT_FPU */
74
75
76.text
77ENTRY(wm_sqrt)
78 pushl %ebp
79 movl %esp,%ebp
80#ifndef NON_REENTRANT_FPU
81 subl $28,%esp
82#endif /* NON_REENTRANT_FPU */
83 pushl %esi
84 pushl %edi
85 pushl %ebx
86
87 movl PARAM1,%esi
88
89 movl SIGH(%esi),%eax
90 movl SIGL(%esi),%ecx
91 xorl %edx,%edx
92
93/* We use a rough linear estimate for the first guess.. */
94
95 cmpw EXP_BIAS,EXP(%esi)
96 jnz sqrt_arg_ge_2
97
98 shrl $1,%eax /* arg is in the range [1.0 .. 2.0) */
99 rcrl $1,%ecx
100 rcrl $1,%edx
101
102sqrt_arg_ge_2:
103/* From here on, n is never accessed directly again until it is
104 replaced by the answer. */
105
106 movl %eax,FPU_fsqrt_arg_2 /* ms word of n */
107 movl %ecx,FPU_fsqrt_arg_1
108 movl %edx,FPU_fsqrt_arg_0
109
110/* Make a linear first estimate */
111 shrl $1,%eax
112 addl $0x40000000,%eax
113 movl $0xaaaaaaaa,%ecx
114 mull %ecx
115 shll %edx /* max result was 7fff... */
116 testl $0x80000000,%edx /* but min was 3fff... */
117 jnz sqrt_prelim_no_adjust
118
119 movl $0x80000000,%edx /* round up */
120
121sqrt_prelim_no_adjust:
122 movl %edx,%esi /* Our first guess */
123
124/* We have now computed (approx) (2 + x) / 3, which forms the basis
125 for a few iterations of Newton's method */
126
127 movl FPU_fsqrt_arg_2,%ecx /* ms word */
128
129/*
130 * From our initial estimate, three iterations are enough to get us
131 * to 30 bits or so. This will then allow two iterations at better
132 * precision to complete the process.
133 */
134
135/* Compute (g + n/g)/2 at each iteration (g is the guess). */
136 shrl %ecx /* Doing this first will prevent a divide */
137 /* overflow later. */
138
139 movl %ecx,%edx /* msw of the arg / 2 */
140 divl %esi /* current estimate */
141 shrl %esi /* divide by 2 */
142 addl %eax,%esi /* the new estimate */
143
144 movl %ecx,%edx
145 divl %esi
146 shrl %esi
147 addl %eax,%esi
148
149 movl %ecx,%edx
150 divl %esi
151 shrl %esi
152 addl %eax,%esi
153
154/*
155 * Now that an estimate accurate to about 30 bits has been obtained (in %esi),
156 * we improve it to 60 bits or so.
157 *
158 * The strategy from now on is to compute new estimates from
159 * guess := guess + (n - guess^2) / (2 * guess)
160 */
161
162/* First, find the square of the guess */
163 movl %esi,%eax
164 mull %esi
165/* guess^2 now in %edx:%eax */
166
167 movl FPU_fsqrt_arg_1,%ecx
168 subl %ecx,%eax
169 movl FPU_fsqrt_arg_2,%ecx /* ms word of normalized n */
170 sbbl %ecx,%edx
171 jnc sqrt_stage_2_positive
172
173/* Subtraction gives a negative result,
174 negate the result before division. */
175 notl %edx
176 notl %eax
177 addl $1,%eax
178 adcl $0,%edx
179
180 divl %esi
181 movl %eax,%ecx
182
183 movl %edx,%eax
184 divl %esi
185 jmp sqrt_stage_2_finish
186
187sqrt_stage_2_positive:
188 divl %esi
189 movl %eax,%ecx
190
191 movl %edx,%eax
192 divl %esi
193
194 notl %ecx
195 notl %eax
196 addl $1,%eax
197 adcl $0,%ecx
198
199sqrt_stage_2_finish:
200 sarl $1,%ecx /* divide by 2 */
201 rcrl $1,%eax
202
203 /* Form the new estimate in %esi:%edi */
204 movl %eax,%edi
205 addl %ecx,%esi
206
207 jnz sqrt_stage_2_done /* result should be [1..2) */
208
209#ifdef PARANOID
210/* It should be possible to get here only if the arg is ffff....ffff */
211 cmp $0xffffffff,FPU_fsqrt_arg_1
212 jnz sqrt_stage_2_error
213#endif /* PARANOID */
214
215/* The best rounded result. */
216 xorl %eax,%eax
217 decl %eax
218 movl %eax,%edi
219 movl %eax,%esi
220 movl $0x7fffffff,%eax
221 jmp sqrt_round_result
222
223#ifdef PARANOID
224sqrt_stage_2_error:
225 pushl EX_INTERNAL|0x213
226 call EXCEPTION
227#endif /* PARANOID */
228
229sqrt_stage_2_done:
230
231/* Now the square root has been computed to better than 60 bits. */
232
233/* Find the square of the guess. */
234 movl %edi,%eax /* ls word of guess */
235 mull %edi
236 movl %edx,FPU_accum_1
237
238 movl %esi,%eax
239 mull %esi
240 movl %edx,FPU_accum_3
241 movl %eax,FPU_accum_2
242
243 movl %edi,%eax
244 mull %esi
245 addl %eax,FPU_accum_1
246 adcl %edx,FPU_accum_2
247 adcl $0,FPU_accum_3
248
249/* movl %esi,%eax */
250/* mull %edi */
251 addl %eax,FPU_accum_1
252 adcl %edx,FPU_accum_2
253 adcl $0,FPU_accum_3
254
255/* guess^2 now in FPU_accum_3:FPU_accum_2:FPU_accum_1 */
256
257 movl FPU_fsqrt_arg_0,%eax /* get normalized n */
258 subl %eax,FPU_accum_1
259 movl FPU_fsqrt_arg_1,%eax
260 sbbl %eax,FPU_accum_2
261 movl FPU_fsqrt_arg_2,%eax /* ms word of normalized n */
262 sbbl %eax,FPU_accum_3
263 jnc sqrt_stage_3_positive
264
265/* Subtraction gives a negative result,
266 negate the result before division */
267 notl FPU_accum_1
268 notl FPU_accum_2
269 notl FPU_accum_3
270 addl $1,FPU_accum_1
271 adcl $0,FPU_accum_2
272
273#ifdef PARANOID
274 adcl $0,FPU_accum_3 /* This must be zero */
275 jz sqrt_stage_3_no_error
276
277sqrt_stage_3_error:
278 pushl EX_INTERNAL|0x207
279 call EXCEPTION
280
281sqrt_stage_3_no_error:
282#endif /* PARANOID */
283
284 movl FPU_accum_2,%edx
285 movl FPU_accum_1,%eax
286 divl %esi
287 movl %eax,%ecx
288
289 movl %edx,%eax
290 divl %esi
291
292 sarl $1,%ecx /* divide by 2 */
293 rcrl $1,%eax
294
295 /* prepare to round the result */
296
297 addl %ecx,%edi
298 adcl $0,%esi
299
300 jmp sqrt_stage_3_finished
301
302sqrt_stage_3_positive:
303 movl FPU_accum_2,%edx
304 movl FPU_accum_1,%eax
305 divl %esi
306 movl %eax,%ecx
307
308 movl %edx,%eax
309 divl %esi
310
311 sarl $1,%ecx /* divide by 2 */
312 rcrl $1,%eax
313
314 /* prepare to round the result */
315
316 notl %eax /* Negate the correction term */
317 notl %ecx
318 addl $1,%eax
319 adcl $0,%ecx /* carry here ==> correction == 0 */
320 adcl $0xffffffff,%esi
321
322 addl %ecx,%edi
323 adcl $0,%esi
324
325sqrt_stage_3_finished:
326
327/*
328 * The result in %esi:%edi:%eax should be good to about 90 bits here,
329 * and the rounding information here does not have sufficient accuracy
330 * in a few rare cases.
331 */
332 cmpl $0xffffffe0,%eax
333 ja sqrt_near_exact_x
334
335 cmpl $0x00000020,%eax
336 jb sqrt_near_exact
337
338 cmpl $0x7fffffe0,%eax
339 jb sqrt_round_result
340
341 cmpl $0x80000020,%eax
342 jb sqrt_get_more_precision
343
344sqrt_round_result:
345/* Set up for rounding operations */
346 movl %eax,%edx
347 movl %esi,%eax
348 movl %edi,%ebx
349 movl PARAM1,%edi
350 movw EXP_BIAS,EXP(%edi) /* Result is in [1.0 .. 2.0) */
351 jmp fpu_reg_round
352
353
354sqrt_near_exact_x:
355/* First, the estimate must be rounded up. */
356 addl $1,%edi
357 adcl $0,%esi
358
359sqrt_near_exact:
360/*
361 * This is an easy case because x^1/2 is monotonic.
362 * We just need to find the square of our estimate, compare it
363 * with the argument, and deduce whether our estimate is
364 * above, below, or exact. We use the fact that the estimate
365 * is known to be accurate to about 90 bits.
366 */
367 movl %edi,%eax /* ls word of guess */
368 mull %edi
369 movl %edx,%ebx /* 2nd ls word of square */
370 movl %eax,%ecx /* ls word of square */
371
372 movl %edi,%eax
373 mull %esi
374 addl %eax,%ebx
375 addl %eax,%ebx
376
377#ifdef PARANOID
378 cmp $0xffffffb0,%ebx
379 jb sqrt_near_exact_ok
380
381 cmp $0x00000050,%ebx
382 ja sqrt_near_exact_ok
383
384 pushl EX_INTERNAL|0x214
385 call EXCEPTION
386
387sqrt_near_exact_ok:
388#endif /* PARANOID */
389
390 or %ebx,%ebx
391 js sqrt_near_exact_small
392
393 jnz sqrt_near_exact_large
394
395 or %ebx,%edx
396 jnz sqrt_near_exact_large
397
398/* Our estimate is exactly the right answer */
399 xorl %eax,%eax
400 jmp sqrt_round_result
401
402sqrt_near_exact_small:
403/* Our estimate is too small */
404 movl $0x000000ff,%eax
405 jmp sqrt_round_result
406
407sqrt_near_exact_large:
408/* Our estimate is too large, we need to decrement it */
409 subl $1,%edi
410 sbbl $0,%esi
411 movl $0xffffff00,%eax
412 jmp sqrt_round_result
413
414
415sqrt_get_more_precision:
416/* This case is almost the same as the above, except we start
417 with an extra bit of precision in the estimate. */
418 stc /* The extra bit. */
419 rcll $1,%edi /* Shift the estimate left one bit */
420 rcll $1,%esi
421
422 movl %edi,%eax /* ls word of guess */
423 mull %edi
424 movl %edx,%ebx /* 2nd ls word of square */
425 movl %eax,%ecx /* ls word of square */
426
427 movl %edi,%eax
428 mull %esi
429 addl %eax,%ebx
430 addl %eax,%ebx
431
432/* Put our estimate back to its original value */
433 stc /* The ms bit. */
434	rcrl	$1,%esi		/* Shift the estimate right one bit */
435 rcrl $1,%edi
436
437#ifdef PARANOID
438 cmp $0xffffff60,%ebx
439 jb sqrt_more_prec_ok
440
441 cmp $0x000000a0,%ebx
442 ja sqrt_more_prec_ok
443
444 pushl EX_INTERNAL|0x215
445 call EXCEPTION
446
447sqrt_more_prec_ok:
448#endif /* PARANOID */
449
450 or %ebx,%ebx
451 js sqrt_more_prec_small
452
453 jnz sqrt_more_prec_large
454
455 or %ebx,%ecx
456 jnz sqrt_more_prec_large
457
458/* Our estimate is exactly the right answer */
459 movl $0x80000000,%eax
460 jmp sqrt_round_result
461
462sqrt_more_prec_small:
463/* Our estimate is too small */
464 movl $0x800000ff,%eax
465 jmp sqrt_round_result
466
467sqrt_more_prec_large:
468/* Our estimate is too large */
469 movl $0x7fffff00,%eax
470 jmp sqrt_round_result
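
Stripped of the fixed-point bookkeeping, wm_sqrt is the classic Newton iteration: start from the linear estimate (2 + n) / 3 for n in [1, 4) and refine with g <- g + (n - g*g) / (2*g), which is the same as (g + n/g)/2. The double-precision sketch below only illustrates the numerics (it is not the emulator code, which carries 64 and then ~90 bits in integer registers):

#include <stdio.h>

static double wm_sqrt_sketch(double n)    /* assumes 1.0 <= n < 4.0 */
{
    double g = (2.0 + n) / 3.0;           /* rough linear first estimate */
    int i;

    for (i = 0; i < 5; i++)               /* the assembly does 3 coarse +  */
        g = g + (n - g * g) / (2.0 * g);  /* 2 refined steps; 5 is plenty  */
    return g;
}

int main(void)
{
    printf("sqrt(2)      ~ %.17g\n", wm_sqrt_sketch(2.0));
    printf("sqrt(3.9999) ~ %.17g\n", wm_sqrt_sketch(3.9999));
    return 0;
}

Because Newton's method roughly doubles the number of correct bits per step, three iterations from the chord estimate reach ~30 bits and two more reach the ~60 and ~90 bit stages the assembly comments describe.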
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
new file mode 100644
index 000000000000..983291096848
--- /dev/null
+++ b/arch/x86/mm/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/mm/Makefile_32
3else
4include ${srctree}/arch/x86/mm/Makefile_64
5endif
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
new file mode 100644
index 000000000000..362b4ad082de
--- /dev/null
+++ b/arch/x86/mm/Makefile_32
@@ -0,0 +1,10 @@
1#
2# Makefile for the linux i386-specific parts of the memory manager.
3#
4
5obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o
6
7obj-$(CONFIG_NUMA) += discontig_32.o
8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64
new file mode 100644
index 000000000000..6bcb47945b87
--- /dev/null
+++ b/arch/x86/mm/Makefile_64
@@ -0,0 +1,10 @@
1#
2# Makefile for the linux x86_64-specific parts of the memory manager.
3#
4
5obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
7obj-$(CONFIG_NUMA) += numa_64.o
8obj-$(CONFIG_K8_NUMA) += k8topology_64.o
9obj-$(CONFIG_ACPI_NUMA) += srat_64.o
10
diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c
new file mode 100644
index 000000000000..4de95a17a7d4
--- /dev/null
+++ b/arch/x86/mm/boot_ioremap_32.c
@@ -0,0 +1,100 @@
1/*
2 * arch/i386/mm/boot_ioremap.c
3 *
4 * Re-map functions for early boot-time before paging_init() when the
5 * boot-time pagetables are still in use
6 *
7 * Written by Dave Hansen <haveblue@us.ibm.com>
8 */
9
10
11/*
12 * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
13 * keeps that from happening. If anyone has a better way, I'm listening.
14 *
15 * boot_pte_t is defined only if this all works correctly
16 */
17
18#undef CONFIG_X86_PAE
19#undef CONFIG_PARAVIRT
20#include <asm/page.h>
21#include <asm/pgtable.h>
22#include <asm/tlbflush.h>
23#include <linux/init.h>
24#include <linux/stddef.h>
25
26/*
27 * I'm cheating here. It is known that the two boot PTE pages are
28 * allocated next to each other. I'm pretending that they're just
29 * one big array.
30 */
31
32#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
33
34static unsigned long boot_pte_index(unsigned long vaddr)
35{
36 return __pa(vaddr) >> PAGE_SHIFT;
37}
38
39static inline boot_pte_t* boot_vaddr_to_pte(void *address)
40{
41 boot_pte_t* boot_pg = (boot_pte_t*)pg0;
42 return &boot_pg[boot_pte_index((unsigned long)address)];
43}
44
45/*
46 * This is only for a caller who is clever enough to page-align
47 * phys_addr and virtual_source, and who also has a preference
48 * about which virtual address to steal ptes from
49 */
50static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages,
51 void* virtual_source)
52{
53 boot_pte_t* pte;
54 int i;
55 char *vaddr = virtual_source;
56
57 pte = boot_vaddr_to_pte(virtual_source);
58 for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
59 set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
60 __flush_tlb_one(&vaddr[i*PAGE_SIZE]);
61 }
62}
63
64/* the virtual space we're going to remap comes from this array */
65#define BOOT_IOREMAP_PAGES 4
66#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
67static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
68 __attribute__ ((aligned (PAGE_SIZE)));
69
70/*
71 * This only applies to things which need to ioremap before paging_init();
72 * bt_ioremap() and plain ioremap() are both useless at this point.
73 *
74 * When used, we're still using the boot-time pagetables, which only
75 * have 2 PTE pages mapping the first 8MB
76 *
77 * There is no unmap. The boot-time PTE pages aren't used after boot.
78 * If you really want the space back, just remap it yourself.
79 * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
80 */
81__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
82{
83 unsigned long last_addr, offset;
84 unsigned int nrpages;
85
86 last_addr = phys_addr + size - 1;
87
88 /* page align the requested address */
89 offset = phys_addr & ~PAGE_MASK;
90 phys_addr &= PAGE_MASK;
91 size = PAGE_ALIGN(last_addr) - phys_addr;
92
93 nrpages = size >> PAGE_SHIFT;
94 if (nrpages > BOOT_IOREMAP_PAGES)
95 return NULL;
96
97 __boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
98
99 return &boot_ioremap_space[offset];
100}
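
Before touching any PTEs, boot_ioremap() splits the request into a page-aligned physical base, a page count, and an offset into the first page that is added back to the returned virtual address. A small userspace illustration of just that address arithmetic (assuming 4 KiB pages; the example address and sizes are made up):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(a) (((a) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
    unsigned long phys_addr = 0x000fe123;    /* example request */
    unsigned long size = 0x300;
    unsigned long last_addr = phys_addr + size - 1;
    unsigned long offset = phys_addr & ~PAGE_MASK;

    phys_addr &= PAGE_MASK;                  /* page align the base */
    size = PAGE_ALIGN(last_addr) - phys_addr;

    printf("base=%#lx pages=%lu offset=%#lx\n",
           phys_addr, size >> PAGE_SHIFT, offset);
    return 0;
}

If the resulting page count exceeds BOOT_IOREMAP_PAGES (4), boot_ioremap() simply returns NULL; there is no unmap, since the boot-time PTEs are thrown away after paging_init().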
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
new file mode 100644
index 000000000000..860e912a3fbb
--- /dev/null
+++ b/arch/x86/mm/discontig_32.c
@@ -0,0 +1,431 @@
1/*
2 * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
3 * August 2002: added remote node KVA remap - Martin J. Bligh
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/mm.h>
26#include <linux/bootmem.h>
27#include <linux/mmzone.h>
28#include <linux/highmem.h>
29#include <linux/initrd.h>
30#include <linux/nodemask.h>
31#include <linux/module.h>
32#include <linux/kexec.h>
33#include <linux/pfn.h>
34#include <linux/swap.h>
35
36#include <asm/e820.h>
37#include <asm/setup.h>
38#include <asm/mmzone.h>
39#include <bios_ebda.h>
40
41struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
42EXPORT_SYMBOL(node_data);
43bootmem_data_t node0_bdata;
44
45/*
46 * numa interface - we expect the numa architecture specific code to have
47 * populated the following during initialisation:
48 *
49 * 1) node_online_map - the map of all nodes configured (online) in the system
50 * 2) node_start_pfn - the starting page frame number for a node
51 * 3) node_end_pfn - the ending page frame number for a node
52 */
53unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
54unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
55
56
57#ifdef CONFIG_DISCONTIGMEM
58/*
59 * 4) physnode_map - the mapping between a pfn and owning node
60 * physnode_map keeps track of the physical memory layout of a generic
61 * numa node on a 256Mb break (each element of the array will
62 * represent 256Mb of memory and will be marked by the node id. so,
63 * if the first gig is on node 0, and the second gig is on node 1
64 * physnode_map will contain:
65 *
66 * physnode_map[0-3] = 0;
67 * physnode_map[4-7] = 1;
68 * physnode_map[8- ] = -1;
69 */
70s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
71EXPORT_SYMBOL(physnode_map);
72
73void memory_present(int nid, unsigned long start, unsigned long end)
74{
75 unsigned long pfn;
76
77 printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
78 nid, start, end);
79 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
80 printk(KERN_DEBUG " ");
81 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
82 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
83 printk("%ld ", pfn);
84 }
85 printk("\n");
86}
87
88unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
89 unsigned long end_pfn)
90{
91 unsigned long nr_pages = end_pfn - start_pfn;
92
93 if (!nr_pages)
94 return 0;
95
96 return (nr_pages + 1) * sizeof(struct page);
97}
98#endif
99
100extern unsigned long find_max_low_pfn(void);
101extern void add_one_highpage_init(struct page *, int, int);
102extern unsigned long highend_pfn, highstart_pfn;
103
104#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
105
106unsigned long node_remap_start_pfn[MAX_NUMNODES];
107unsigned long node_remap_size[MAX_NUMNODES];
108unsigned long node_remap_offset[MAX_NUMNODES];
109void *node_remap_start_vaddr[MAX_NUMNODES];
110void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
111
112void *node_remap_end_vaddr[MAX_NUMNODES];
113void *node_remap_alloc_vaddr[MAX_NUMNODES];
114static unsigned long kva_start_pfn;
115static unsigned long kva_pages;
116/*
117 * FLAT - support for basic PC memory model with discontig enabled, essentially
118 * a single node with all available processors in it with a flat
119 * memory map.
120 */
121int __init get_memcfg_numa_flat(void)
122{
123 printk("NUMA - single node, flat memory mode\n");
124
125 /* Run the memory configuration and find the top of memory. */
126 find_max_pfn();
127 node_start_pfn[0] = 0;
128 node_end_pfn[0] = max_pfn;
129 memory_present(0, 0, max_pfn);
130
131 /* Indicate there is one node available. */
132 nodes_clear(node_online_map);
133 node_set_online(0);
134 return 1;
135}
136
137/*
138 * Find the highest page frame number we have available for the node
139 */
140static void __init find_max_pfn_node(int nid)
141{
142 if (node_end_pfn[nid] > max_pfn)
143 node_end_pfn[nid] = max_pfn;
144 /*
145 * if a user has given mem=XXXX, then we need to make sure
146 * that the node _starts_ before that, too, not just ends
147 */
148 if (node_start_pfn[nid] > max_pfn)
149 node_start_pfn[nid] = max_pfn;
150 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
151}
152
153/*
154 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
155 * method. For node zero take this from the bottom of memory, for
156 * subsequent nodes place them at node_remap_start_vaddr which contains
157 * node local data in physically node local memory. See setup_memory()
158 * for details.
159 */
160static void __init allocate_pgdat(int nid)
161{
162 if (nid && node_has_online_mem(nid))
163 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
164 else {
165 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
166 min_low_pfn += PFN_UP(sizeof(pg_data_t));
167 }
168}
169
170void *alloc_remap(int nid, unsigned long size)
171{
172 void *allocation = node_remap_alloc_vaddr[nid];
173
174 size = ALIGN(size, L1_CACHE_BYTES);
175
176 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
177 return 0;
178
179 node_remap_alloc_vaddr[nid] += size;
180 memset(allocation, 0, size);
181
182 return allocation;
183}
184
185void __init remap_numa_kva(void)
186{
187 void *vaddr;
188 unsigned long pfn;
189 int node;
190
191 for_each_online_node(node) {
192 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
193 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
194 set_pmd_pfn((ulong) vaddr,
195 node_remap_start_pfn[node] + pfn,
196 PAGE_KERNEL_LARGE);
197 }
198 }
199}
200
201static unsigned long calculate_numa_remap_pages(void)
202{
203 int nid;
204 unsigned long size, reserve_pages = 0;
205 unsigned long pfn;
206
207 for_each_online_node(nid) {
208 unsigned old_end_pfn = node_end_pfn[nid];
209
210 /*
211		 * The acpi/srat node info can show hot-add memory zones
212 * where memory could be added but not currently present.
213 */
214 if (node_start_pfn[nid] > max_pfn)
215 continue;
216 if (node_end_pfn[nid] > max_pfn)
217 node_end_pfn[nid] = max_pfn;
218
219 /* ensure the remap includes space for the pgdat. */
220 size = node_remap_size[nid] + sizeof(pg_data_t);
221
222 /* convert size to large (pmd size) pages, rounding up */
223 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
224 /* now the roundup is correct, convert to PAGE_SIZE pages */
225 size = size * PTRS_PER_PTE;
226
227 /*
228 * Validate the region we are allocating only contains valid
229 * pages.
230 */
231 for (pfn = node_end_pfn[nid] - size;
232 pfn < node_end_pfn[nid]; pfn++)
233 if (!page_is_ram(pfn))
234 break;
235
236 if (pfn != node_end_pfn[nid])
237 size = 0;
238
239 printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
240 size, nid);
241 node_remap_size[nid] = size;
242 node_remap_offset[nid] = reserve_pages;
243 reserve_pages += size;
244 printk("Shrinking node %d from %ld pages to %ld pages\n",
245 nid, node_end_pfn[nid], node_end_pfn[nid] - size);
246
247 if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
248 /*
249 * Align node_end_pfn[] and node_remap_start_pfn[] to
250 * pmd boundary. remap_numa_kva will barf otherwise.
251 */
252 printk("Shrinking node %d further by %ld pages for proper alignment\n",
253 nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
254 size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
255 }
256
257 node_end_pfn[nid] -= size;
258 node_remap_start_pfn[nid] = node_end_pfn[nid];
259 shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
260 }
261 printk("Reserving total of %ld pages for numa KVA remap\n",
262 reserve_pages);
263 return reserve_pages;
264}
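The two-step conversion in the middle of the loop above (round the per-node byte count up to whole pmd-sized mappings, then express the result in 4KB pages) is worth spelling out. A small stand-alone sketch, taking LARGE_PAGE_BYTES and PTRS_PER_PTE at their non-PAE values (4MB and 1024) purely for illustration:

#include <stdio.h>

/* Illustrative non-PAE values; the kernel derives these from PTRS_PER_PTE. */
#define PAGE_SIZE		4096UL
#define PTRS_PER_PTE		1024UL
#define LARGE_PAGE_BYTES	(PTRS_PER_PTE * PAGE_SIZE)	/* 4MB */

/* Round a byte count up to whole pmd-sized mappings, result in 4KB pages. */
static unsigned long remap_pages(unsigned long bytes)
{
	unsigned long large = (bytes + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
	return large * PTRS_PER_PTE;
}

int main(void)
{
	/* 5MB of node-local data still consumes two 4MB mappings = 2048 pages */
	printf("%lu\n", remap_pages(5UL << 20));
	return 0;
}

So 5MB of node-local data still reserves two 4MB mappings, i.e. 2048 small pages.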
265
266extern void setup_bootmem_allocator(void);
267unsigned long __init setup_memory(void)
268{
269 int nid;
270 unsigned long system_start_pfn, system_max_low_pfn;
271
272 /*
273 * When mapping a NUMA machine we allocate the node_mem_map arrays
274 * from node local memory. They are then mapped directly into KVA
275 * between zone normal and vmalloc space. Calculate the size of
276	 * this space and use it to adjust the boundary between ZONE_NORMAL
277 * and ZONE_HIGHMEM.
278 */
279 find_max_pfn();
280 get_memcfg_numa();
281
282 kva_pages = calculate_numa_remap_pages();
283
284 /* partially used pages are not usable - thus round upwards */
285 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
286
287 kva_start_pfn = find_max_low_pfn() - kva_pages;
288
289#ifdef CONFIG_BLK_DEV_INITRD
290 /* Numa kva area is below the initrd */
291 if (LOADER_TYPE && INITRD_START)
292 kva_start_pfn = PFN_DOWN(INITRD_START) - kva_pages;
293#endif
294 kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
295
296 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
297 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
298 kva_start_pfn, max_low_pfn);
299 printk("max_pfn = %ld\n", max_pfn);
300#ifdef CONFIG_HIGHMEM
301 highstart_pfn = highend_pfn = max_pfn;
302 if (max_pfn > system_max_low_pfn)
303 highstart_pfn = system_max_low_pfn;
304 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
305 pages_to_mb(highend_pfn - highstart_pfn));
306 num_physpages = highend_pfn;
307 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
308#else
309 num_physpages = system_max_low_pfn;
310 high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
311#endif
312 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
313 pages_to_mb(system_max_low_pfn));
314 printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
315 min_low_pfn, max_low_pfn, highstart_pfn);
316
317 printk("Low memory ends at vaddr %08lx\n",
318 (ulong) pfn_to_kaddr(max_low_pfn));
319 for_each_online_node(nid) {
320 node_remap_start_vaddr[nid] = pfn_to_kaddr(
321 kva_start_pfn + node_remap_offset[nid]);
322 /* Init the node remap allocator */
323 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
324 (node_remap_size[nid] * PAGE_SIZE);
325 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
326 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
327
328 allocate_pgdat(nid);
329 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
330 (ulong) node_remap_start_vaddr[nid],
331 (ulong) pfn_to_kaddr(highstart_pfn
332 + node_remap_offset[nid] + node_remap_size[nid]));
333 }
334 printk("High memory starts at vaddr %08lx\n",
335 (ulong) pfn_to_kaddr(highstart_pfn));
336 for_each_online_node(nid)
337 find_max_pfn_node(nid);
338
339 memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
340 NODE_DATA(0)->bdata = &node0_bdata;
341 setup_bootmem_allocator();
342 return max_low_pfn;
343}
344
345void __init numa_kva_reserve(void)
346{
347 reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
348}
349
350void __init zone_sizes_init(void)
351{
352 int nid;
353 unsigned long max_zone_pfns[MAX_NR_ZONES];
354 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
355 max_zone_pfns[ZONE_DMA] =
356 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
357 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
358#ifdef CONFIG_HIGHMEM
359 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
360#endif
361
362 /* If SRAT has not registered memory, register it now */
363 if (find_max_pfn_with_active_regions() == 0) {
364 for_each_online_node(nid) {
365 if (node_has_online_mem(nid))
366 add_active_range(nid, node_start_pfn[nid],
367 node_end_pfn[nid]);
368 }
369 }
370
371 free_area_init_nodes(max_zone_pfns);
372 return;
373}
374
375void __init set_highmem_pages_init(int bad_ppro)
376{
377#ifdef CONFIG_HIGHMEM
378 struct zone *zone;
379 struct page *page;
380
381 for_each_zone(zone) {
382 unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
383
384 if (!is_highmem(zone))
385 continue;
386
387 zone_start_pfn = zone->zone_start_pfn;
388 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
389
390 printk("Initializing %s for node %d (%08lx:%08lx)\n",
391 zone->name, zone_to_nid(zone),
392 zone_start_pfn, zone_end_pfn);
393
394 for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
395 if (!pfn_valid(node_pfn))
396 continue;
397 page = pfn_to_page(node_pfn);
398 add_one_highpage_init(page, node_pfn, bad_ppro);
399 }
400 }
401 totalram_pages += totalhigh_pages;
402#endif
403}
404
405#ifdef CONFIG_MEMORY_HOTPLUG
406int paddr_to_nid(u64 addr)
407{
408 int nid;
409 unsigned long pfn = PFN_DOWN(addr);
410
411 for_each_node(nid)
412 if (node_start_pfn[nid] <= pfn &&
413 pfn < node_end_pfn[nid])
414 return nid;
415
416 return -1;
417}
418
419/*
420 * This function is used to ask node id BEFORE memmap and mem_section's
421 * initialization (pfn_to_nid() can't be used yet).
422 * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
423 */
424int memory_add_physaddr_to_nid(u64 addr)
425{
426 int nid = paddr_to_nid(addr);
427 return (nid >= 0) ? nid : 0;
428}
429
430EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
431#endif
diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c
new file mode 100644
index 000000000000..0ce4f22a2635
--- /dev/null
+++ b/arch/x86/mm/extable_32.c
@@ -0,0 +1,35 @@
1/*
2 * linux/arch/i386/mm/extable.c
3 */
4
5#include <linux/module.h>
6#include <linux/spinlock.h>
7#include <asm/uaccess.h>
8
9int fixup_exception(struct pt_regs *regs)
10{
11 const struct exception_table_entry *fixup;
12
13#ifdef CONFIG_PNPBIOS
14 if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
15 {
16 extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
17 extern u32 pnp_bios_is_utter_crap;
18 pnp_bios_is_utter_crap = 1;
19 printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
20 __asm__ volatile(
21 "movl %0, %%esp\n\t"
22 "jmp *%1\n\t"
23 : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
24 panic("do_trap: can't hit this");
25 }
26#endif
27
28 fixup = search_exception_tables(regs->eip);
29 if (fixup) {
30 regs->eip = fixup->fixup;
31 return 1;
32 }
33
34 return 0;
35}
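fixup_exception() leans on the __ex_table entries that pair each potentially faulting instruction address with a recovery address; on a match the handler simply rewrites EIP and resumes. A rough sketch of that lookup outside the kernel (the two-field entry mirrors what this file and search_extable() below use; the linear scan is only a stand-in for the kernel's search over a sorted table):

#include <stddef.h>

struct exception_table_entry {
	unsigned long insn;	/* address of the instruction that may fault */
	unsigned long fixup;	/* address to resume at if it does */
};

/* Linear stand-in for the kernel's search over a sorted __ex_table. */
static const struct exception_table_entry *
find_fixup(const struct exception_table_entry *tbl, size_t n, unsigned long ip)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (tbl[i].insn == ip)
			return &tbl[i];
	return NULL;		/* no entry: the fault is a real bug */
}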
diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c
new file mode 100644
index 000000000000..79ac6e7100af
--- /dev/null
+++ b/arch/x86/mm/extable_64.c
@@ -0,0 +1,34 @@
1/*
2 * linux/arch/x86_64/mm/extable.c
3 */
4
5#include <linux/module.h>
6#include <linux/spinlock.h>
7#include <linux/init.h>
8#include <asm/uaccess.h>
9
10/* Simple binary search */
11const struct exception_table_entry *
12search_extable(const struct exception_table_entry *first,
13 const struct exception_table_entry *last,
14 unsigned long value)
15{
16 /* Work around a B stepping K8 bug */
17 if ((value >> 32) == 0)
18 value |= 0xffffffffUL << 32;
19
20 while (first <= last) {
21 const struct exception_table_entry *mid;
22 long diff;
23
24 mid = (last - first) / 2 + first;
25 diff = mid->insn - value;
26 if (diff == 0)
27 return mid;
28 else if (diff < 0)
29 first = mid+1;
30 else
31 last = mid-1;
32 }
33 return NULL;
34}
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
new file mode 100644
index 000000000000..fcb38e7f3543
--- /dev/null
+++ b/arch/x86/mm/fault_32.c
@@ -0,0 +1,657 @@
1/*
2 * linux/arch/i386/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 */
6
7#include <linux/signal.h>
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/string.h>
12#include <linux/types.h>
13#include <linux/ptrace.h>
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/interrupt.h>
18#include <linux/init.h>
19#include <linux/tty.h>
20#include <linux/vt_kern.h> /* For unblank_screen() */
21#include <linux/highmem.h>
22#include <linux/bootmem.h> /* for max_low_pfn */
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28
29#include <asm/system.h>
30#include <asm/desc.h>
31#include <asm/segment.h>
32
33extern void die(const char *,struct pt_regs *,long);
34
35static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
36
37int register_page_fault_notifier(struct notifier_block *nb)
38{
39 vmalloc_sync_all();
40 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
41}
42EXPORT_SYMBOL_GPL(register_page_fault_notifier);
43
44int unregister_page_fault_notifier(struct notifier_block *nb)
45{
46 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
47}
48EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
49
50static inline int notify_page_fault(struct pt_regs *regs, long err)
51{
52 struct die_args args = {
53 .regs = regs,
54 .str = "page fault",
55 .err = err,
56 .trapnr = 14,
57 .signr = SIGSEGV
58 };
59 return atomic_notifier_call_chain(&notify_page_fault_chain,
60 DIE_PAGE_FAULT, &args);
61}
62
63/*
64 * Return EIP plus the CS segment base. The segment limit is also
65 * adjusted, clamped to the kernel/user address space (whichever is
66 * appropriate), and returned in *eip_limit.
67 *
68 * The segment is checked, because it might have been changed by another
69 * task between the original faulting instruction and here.
70 *
71 * If CS is no longer a valid code segment, or if EIP is beyond the
72 * limit, or if it is a kernel address when CS is not a kernel segment,
73 * then the returned value will be greater than *eip_limit.
74 *
75 * This is slow, but is very rarely executed.
76 */
77static inline unsigned long get_segment_eip(struct pt_regs *regs,
78 unsigned long *eip_limit)
79{
80 unsigned long eip = regs->eip;
81 unsigned seg = regs->xcs & 0xffff;
82 u32 seg_ar, seg_limit, base, *desc;
83
84 /* Unlikely, but must come before segment checks. */
85 if (unlikely(regs->eflags & VM_MASK)) {
86 base = seg << 4;
87 *eip_limit = base + 0xffff;
88 return base + (eip & 0xffff);
89 }
90
91 /* The standard kernel/user address space limit. */
92 *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
93
94 /* By far the most common cases. */
95 if (likely(SEGMENT_IS_FLAT_CODE(seg)))
96 return eip;
97
98 /* Check the segment exists, is within the current LDT/GDT size,
99 that kernel/user (ring 0..3) has the appropriate privilege,
100 that it's a code segment, and get the limit. */
101 __asm__ ("larl %3,%0; lsll %3,%1"
102 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
103 if ((~seg_ar & 0x9800) || eip > seg_limit) {
104 *eip_limit = 0;
105 return 1; /* So that returned eip > *eip_limit. */
106 }
107
108 /* Get the GDT/LDT descriptor base.
109 When you look for races in this code remember that
110 LDT and other horrors are only used in user space. */
111 if (seg & (1<<2)) {
112 /* Must lock the LDT while reading it. */
113 down(&current->mm->context.sem);
114 desc = current->mm->context.ldt;
115 desc = (void *)desc + (seg & ~7);
116 } else {
117 /* Must disable preemption while reading the GDT. */
118 desc = (u32 *)get_cpu_gdt_table(get_cpu());
119 desc = (void *)desc + (seg & ~7);
120 }
121
122 /* Decode the code segment base from the descriptor */
123 base = get_desc_base((unsigned long *)desc);
124
125 if (seg & (1<<2)) {
126 up(&current->mm->context.sem);
127 } else
128 put_cpu();
129
130 /* Adjust EIP and segment limit, and clamp at the kernel limit.
131 It's legitimate for segments to wrap at 0xffffffff. */
132 seg_limit += base;
133 if (seg_limit < *eip_limit && seg_limit >= base)
134 *eip_limit = seg_limit;
135 return eip + base;
136}
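For the vm86 path at the top of get_segment_eip(), the linear address is the classic real-mode calculation: with seg = 0x1000 and eip = 0x0234, the base is 0x1000 << 4 = 0x10000, *eip_limit becomes 0x10000 + 0xffff = 0x1ffff, and the function returns 0x10234.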
137
138/*
139 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
140 * Check that here and ignore it.
141 */
142static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
143{
144 unsigned long limit;
145 unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
146 int scan_more = 1;
147 int prefetch = 0;
148 int i;
149
150 for (i = 0; scan_more && i < 15; i++) {
151 unsigned char opcode;
152 unsigned char instr_hi;
153 unsigned char instr_lo;
154
155 if (instr > (unsigned char *)limit)
156 break;
157 if (probe_kernel_address(instr, opcode))
158 break;
159
160 instr_hi = opcode & 0xf0;
161 instr_lo = opcode & 0x0f;
162 instr++;
163
164 switch (instr_hi) {
165 case 0x20:
166 case 0x30:
167 /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
168 scan_more = ((instr_lo & 7) == 0x6);
169 break;
170
171 case 0x60:
172 /* 0x64 thru 0x67 are valid prefixes in all modes. */
173 scan_more = (instr_lo & 0xC) == 0x4;
174 break;
175 case 0xF0:
176 /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
177 scan_more = !instr_lo || (instr_lo>>1) == 1;
178 break;
179 case 0x00:
180 /* Prefetch instruction is 0x0F0D or 0x0F18 */
181 scan_more = 0;
182 if (instr > (unsigned char *)limit)
183 break;
184 if (probe_kernel_address(instr, opcode))
185 break;
186 prefetch = (instr_lo == 0xF) &&
187 (opcode == 0x0D || opcode == 0x18);
188 break;
189 default:
190 scan_more = 0;
191 break;
192 }
193 }
194 return prefetch;
195}
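As a worked example of the scan above, take the byte sequence 66 0f 18 06 (prefetchnta with an operand-size prefix) at the faulting EIP: 0x66 falls in the 0x60 case and keeps scanning, 0x0f lands in the 0x00 case with instr_lo == 0xF, and the following opcode byte 0x18 satisfies the prefetch test, so the spurious fault is ignored.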
196
197static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
198 unsigned long error_code)
199{
200 if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
201 boot_cpu_data.x86 >= 6)) {
202 /* Catch an obscure case of prefetch inside an NX page. */
203 if (nx_enabled && (error_code & 16))
204 return 0;
205 return __is_prefetch(regs, addr);
206 }
207 return 0;
208}
209
210static noinline void force_sig_info_fault(int si_signo, int si_code,
211 unsigned long address, struct task_struct *tsk)
212{
213 siginfo_t info;
214
215 info.si_signo = si_signo;
216 info.si_errno = 0;
217 info.si_code = si_code;
218 info.si_addr = (void __user *)address;
219 force_sig_info(si_signo, &info, tsk);
220}
221
222fastcall void do_invalid_op(struct pt_regs *, unsigned long);
223
224static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
225{
226 unsigned index = pgd_index(address);
227 pgd_t *pgd_k;
228 pud_t *pud, *pud_k;
229 pmd_t *pmd, *pmd_k;
230
231 pgd += index;
232 pgd_k = init_mm.pgd + index;
233
234 if (!pgd_present(*pgd_k))
235 return NULL;
236
237 /*
238 * set_pgd(pgd, *pgd_k); here would be useless on PAE
239 * and redundant with the set_pmd() on non-PAE. As would
240 * set_pud.
241 */
242
243 pud = pud_offset(pgd, address);
244 pud_k = pud_offset(pgd_k, address);
245 if (!pud_present(*pud_k))
246 return NULL;
247
248 pmd = pmd_offset(pud, address);
249 pmd_k = pmd_offset(pud_k, address);
250 if (!pmd_present(*pmd_k))
251 return NULL;
252 if (!pmd_present(*pmd)) {
253 set_pmd(pmd, *pmd_k);
254 arch_flush_lazy_mmu_mode();
255 } else
256 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
257 return pmd_k;
258}
259
260/*
261 * Handle a fault on the vmalloc or module mapping area
262 *
263 * This assumes no large pages in there.
264 */
265static inline int vmalloc_fault(unsigned long address)
266{
267 unsigned long pgd_paddr;
268 pmd_t *pmd_k;
269 pte_t *pte_k;
270 /*
271 * Synchronize this task's top level page-table
272 * with the 'reference' page table.
273 *
274 * Do _not_ use "current" here. We might be inside
275 * an interrupt in the middle of a task switch..
276 */
277 pgd_paddr = read_cr3();
278 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
279 if (!pmd_k)
280 return -1;
281 pte_k = pte_offset_kernel(pmd_k, address);
282 if (!pte_present(*pte_k))
283 return -1;
284 return 0;
285}
286
287int show_unhandled_signals = 1;
288
289/*
290 * This routine handles page faults. It determines the address,
291 * and the problem, and then passes it off to one of the appropriate
292 * routines.
293 *
294 * error_code:
295 * bit 0 == 0 means no page found, 1 means protection fault
296 * bit 1 == 0 means read, 1 means write
297 * bit 2 == 0 means kernel, 1 means user-mode
298 * bit 3 == 1 means use of reserved bit detected
299 * bit 4 == 1 means fault was an instruction fetch
300 */
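A compact way to read those bits is with the names the 64-bit handler later in this patch gives them (PF_PROT, PF_WRITE, PF_USER, PF_RSVD, PF_INSTR); the decoder below is only an illustrative user-space sketch, not code from this file:

#include <stdio.h>

/* Illustrative decoder for the error_code layout described above. */
#define PF_PROT  (1 << 0)	/* 0: no page found, 1: protection fault */
#define PF_WRITE (1 << 1)	/* 0: read,          1: write */
#define PF_USER  (1 << 2)	/* 0: kernel,        1: user mode */
#define PF_RSVD  (1 << 3)	/* reserved bit detected in a paging entry */
#define PF_INSTR (1 << 4)	/* fault was an instruction fetch */

static void describe_fault(unsigned long error_code)
{
	printf("%s-mode %s, %s%s%s\n",
	       (error_code & PF_USER)  ? "user"  : "kernel",
	       (error_code & PF_WRITE) ? "write" : "read",
	       (error_code & PF_PROT)  ? "protection fault" : "page not present",
	       (error_code & PF_RSVD)  ? ", reserved bit set" : "",
	       (error_code & PF_INSTR) ? ", instruction fetch" : "");
}

int main(void)
{
	describe_fault(0x6);	/* user-mode write to a not-present page */
	return 0;
}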
301fastcall void __kprobes do_page_fault(struct pt_regs *regs,
302 unsigned long error_code)
303{
304 struct task_struct *tsk;
305 struct mm_struct *mm;
306 struct vm_area_struct * vma;
307 unsigned long address;
308 int write, si_code;
309 int fault;
310
311 /* get the address */
312 address = read_cr2();
313
314 tsk = current;
315
316 si_code = SEGV_MAPERR;
317
318 /*
319 * We fault-in kernel-space virtual memory on-demand. The
320 * 'reference' page table is init_mm.pgd.
321 *
322 * NOTE! We MUST NOT take any locks for this case. We may
323 * be in an interrupt or a critical region, and should
324 * only copy the information from the master page table,
325 * nothing more.
326 *
327 * This verifies that the fault happens in kernel space
328 * (error_code & 4) == 0, and that the fault was not a
329 * protection error (error_code & 9) == 0.
330 */
331 if (unlikely(address >= TASK_SIZE)) {
332 if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
333 return;
334 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
335 return;
336 /*
337 * Don't take the mm semaphore here. If we fixup a prefetch
338 * fault we could otherwise deadlock.
339 */
340 goto bad_area_nosemaphore;
341 }
342
343 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
344 return;
345
346 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
347 fault has been handled. */
348 if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
349 local_irq_enable();
350
351 mm = tsk->mm;
352
353 /*
354 * If we're in an interrupt, have no user context or are running in an
355 * atomic region then we must not take the fault..
356 */
357 if (in_atomic() || !mm)
358 goto bad_area_nosemaphore;
359
360 /* When running in the kernel we expect faults to occur only to
361 * addresses in user space. All other faults represent errors in the
362	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
363 * erroneous fault occurring in a code path which already holds mmap_sem
364 * we will deadlock attempting to validate the fault against the
365 * address space. Luckily the kernel only validly references user
366 * space from well defined areas of code, which are listed in the
367 * exceptions table.
368 *
369 * As the vast majority of faults will be valid we will only perform
370	 * the source reference check when there is a possibility of a deadlock.
371 * Attempt to lock the address space, if we cannot we then validate the
372 * source. If this is invalid we can skip the address space check,
373 * thus avoiding the deadlock.
374 */
375 if (!down_read_trylock(&mm->mmap_sem)) {
376 if ((error_code & 4) == 0 &&
377 !search_exception_tables(regs->eip))
378 goto bad_area_nosemaphore;
379 down_read(&mm->mmap_sem);
380 }
381
382 vma = find_vma(mm, address);
383 if (!vma)
384 goto bad_area;
385 if (vma->vm_start <= address)
386 goto good_area;
387 if (!(vma->vm_flags & VM_GROWSDOWN))
388 goto bad_area;
389 if (error_code & 4) {
390 /*
391 * Accessing the stack below %esp is always a bug.
392 * The large cushion allows instructions like enter
393 * and pusha to work. ("enter $65535,$31" pushes
394 * 32 pointers and then decrements %esp by 65535.)
395 */
396 if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
397 goto bad_area;
398 }
399 if (expand_stack(vma, address))
400 goto bad_area;
401/*
402 * Ok, we have a good vm_area for this memory access, so
403 * we can handle it..
404 */
405good_area:
406 si_code = SEGV_ACCERR;
407 write = 0;
408 switch (error_code & 3) {
409 default: /* 3: write, present */
410 /* fall through */
411 case 2: /* write, not present */
412 if (!(vma->vm_flags & VM_WRITE))
413 goto bad_area;
414 write++;
415 break;
416 case 1: /* read, present */
417 goto bad_area;
418 case 0: /* read, not present */
419 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
420 goto bad_area;
421 }
422
423 survive:
424 /*
425 * If for any reason at all we couldn't handle the fault,
426 * make sure we exit gracefully rather than endlessly redo
427 * the fault.
428 */
429 fault = handle_mm_fault(mm, vma, address, write);
430 if (unlikely(fault & VM_FAULT_ERROR)) {
431 if (fault & VM_FAULT_OOM)
432 goto out_of_memory;
433 else if (fault & VM_FAULT_SIGBUS)
434 goto do_sigbus;
435 BUG();
436 }
437 if (fault & VM_FAULT_MAJOR)
438 tsk->maj_flt++;
439 else
440 tsk->min_flt++;
441
442 /*
443 * Did it hit the DOS screen memory VA from vm86 mode?
444 */
445 if (regs->eflags & VM_MASK) {
446 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
447 if (bit < 32)
448 tsk->thread.screen_bitmap |= 1 << bit;
449 }
450 up_read(&mm->mmap_sem);
451 return;
452
453/*
454 * Something tried to access memory that isn't in our memory map..
455 * Fix it, but check if it's kernel or user first..
456 */
457bad_area:
458 up_read(&mm->mmap_sem);
459
460bad_area_nosemaphore:
461 /* User mode accesses just cause a SIGSEGV */
462 if (error_code & 4) {
463 /*
464 * It's possible to have interrupts off here.
465 */
466 local_irq_enable();
467
468 /*
469 * Valid to do another page fault here because this one came
470 * from user space.
471 */
472 if (is_prefetch(regs, address, error_code))
473 return;
474
475 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
476 printk_ratelimit()) {
477 printk("%s%s[%d]: segfault at %08lx eip %08lx "
478 "esp %08lx error %lx\n",
479 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
480 tsk->comm, tsk->pid, address, regs->eip,
481 regs->esp, error_code);
482 }
483 tsk->thread.cr2 = address;
484 /* Kernel addresses are always protection faults */
485 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
486 tsk->thread.trap_no = 14;
487 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
488 return;
489 }
490
491#ifdef CONFIG_X86_F00F_BUG
492 /*
493 * Pentium F0 0F C7 C8 bug workaround.
494 */
495 if (boot_cpu_data.f00f_bug) {
496 unsigned long nr;
497
498 nr = (address - idt_descr.address) >> 3;
499
500 if (nr == 6) {
501 do_invalid_op(regs, 0);
502 return;
503 }
504 }
505#endif
506
507no_context:
508 /* Are we prepared to handle this kernel fault? */
509 if (fixup_exception(regs))
510 return;
511
512 /*
513 * Valid to do another page fault here, because if this fault
514 * had been triggered by is_prefetch fixup_exception would have
515 * handled it.
516 */
517 if (is_prefetch(regs, address, error_code))
518 return;
519
520/*
521 * Oops. The kernel tried to access some bad page. We'll have to
522 * terminate things with extreme prejudice.
523 */
524
525 bust_spinlocks(1);
526
527 if (oops_may_print()) {
528 __typeof__(pte_val(__pte(0))) page;
529
530#ifdef CONFIG_X86_PAE
531 if (error_code & 16) {
532 pte_t *pte = lookup_address(address);
533
534 if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
535 printk(KERN_CRIT "kernel tried to execute "
536 "NX-protected page - exploit attempt? "
537 "(uid: %d)\n", current->uid);
538 }
539#endif
540 if (address < PAGE_SIZE)
541 printk(KERN_ALERT "BUG: unable to handle kernel NULL "
542 "pointer dereference");
543 else
544 printk(KERN_ALERT "BUG: unable to handle kernel paging"
545 " request");
546 printk(" at virtual address %08lx\n",address);
547 printk(KERN_ALERT " printing eip:\n");
548 printk("%08lx\n", regs->eip);
549
550 page = read_cr3();
551 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
552#ifdef CONFIG_X86_PAE
553 printk(KERN_ALERT "*pdpt = %016Lx\n", page);
554 if ((page >> PAGE_SHIFT) < max_low_pfn
555 && page & _PAGE_PRESENT) {
556 page &= PAGE_MASK;
557 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
558 & (PTRS_PER_PMD - 1)];
559 printk(KERN_ALERT "*pde = %016Lx\n", page);
560 page &= ~_PAGE_NX;
561 }
562#else
563 printk(KERN_ALERT "*pde = %08lx\n", page);
564#endif
565
566 /*
567 * We must not directly access the pte in the highpte
568 * case if the page table is located in highmem.
569 * And let's rather not kmap-atomic the pte, just in case
570 * it's allocated already.
571 */
572 if ((page >> PAGE_SHIFT) < max_low_pfn
573 && (page & _PAGE_PRESENT)) {
574 page &= PAGE_MASK;
575 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
576 & (PTRS_PER_PTE - 1)];
577 printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
578 }
579 }
580
581 tsk->thread.cr2 = address;
582 tsk->thread.trap_no = 14;
583 tsk->thread.error_code = error_code;
584 die("Oops", regs, error_code);
585 bust_spinlocks(0);
586 do_exit(SIGKILL);
587
588/*
589 * We ran out of memory, or some other thing happened to us that made
590 * us unable to handle the page fault gracefully.
591 */
592out_of_memory:
593 up_read(&mm->mmap_sem);
594 if (is_init(tsk)) {
595 yield();
596 down_read(&mm->mmap_sem);
597 goto survive;
598 }
599 printk("VM: killing process %s\n", tsk->comm);
600 if (error_code & 4)
601 do_exit(SIGKILL);
602 goto no_context;
603
604do_sigbus:
605 up_read(&mm->mmap_sem);
606
607 /* Kernel mode? Handle exceptions or die */
608 if (!(error_code & 4))
609 goto no_context;
610
611 /* User space => ok to do another page fault */
612 if (is_prefetch(regs, address, error_code))
613 return;
614
615 tsk->thread.cr2 = address;
616 tsk->thread.error_code = error_code;
617 tsk->thread.trap_no = 14;
618 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
619}
620
621void vmalloc_sync_all(void)
622{
623 /*
624 * Note that races in the updates of insync and start aren't
625 * problematic: insync can only get set bits added, and updates to
626 * start are only improving performance (without affecting correctness
627 * if undone).
628 */
629 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
630 static unsigned long start = TASK_SIZE;
631 unsigned long address;
632
633 if (SHARED_KERNEL_PMD)
634 return;
635
636 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
637 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
638 if (!test_bit(pgd_index(address), insync)) {
639 unsigned long flags;
640 struct page *page;
641
642 spin_lock_irqsave(&pgd_lock, flags);
643 for (page = pgd_list; page; page =
644 (struct page *)page->index)
645 if (!vmalloc_sync_one(page_address(page),
646 address)) {
647 BUG_ON(page != pgd_list);
648 break;
649 }
650 spin_unlock_irqrestore(&pgd_lock, flags);
651 if (!page)
652 set_bit(pgd_index(address), insync);
653 }
654 if (address == start && test_bit(pgd_index(address), insync))
655 start = address + PGDIR_SIZE;
656 }
657}
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
new file mode 100644
index 000000000000..54816adb8e93
--- /dev/null
+++ b/arch/x86/mm/fault_64.c
@@ -0,0 +1,636 @@
1/*
2 * linux/arch/x86-64/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
6 */
7
8#include <linux/signal.h>
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/errno.h>
12#include <linux/string.h>
13#include <linux/types.h>
14#include <linux/ptrace.h>
15#include <linux/mman.h>
16#include <linux/mm.h>
17#include <linux/smp.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/tty.h>
21#include <linux/vt_kern.h> /* For unblank_screen() */
22#include <linux/compiler.h>
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28
29#include <asm/system.h>
30#include <asm/pgalloc.h>
31#include <asm/smp.h>
32#include <asm/tlbflush.h>
33#include <asm/proto.h>
34#include <asm-generic/sections.h>
35
36/* Page fault error code bits */
37#define PF_PROT (1<<0) /* or no page found */
38#define PF_WRITE (1<<1)
39#define PF_USER (1<<2)
40#define PF_RSVD (1<<3)
41#define PF_INSTR (1<<4)
42
43static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
44
45/* Hook to register for page fault notifications */
46int register_page_fault_notifier(struct notifier_block *nb)
47{
48 vmalloc_sync_all();
49 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
50}
51EXPORT_SYMBOL_GPL(register_page_fault_notifier);
52
53int unregister_page_fault_notifier(struct notifier_block *nb)
54{
55 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
56}
57EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
58
59static inline int notify_page_fault(struct pt_regs *regs, long err)
60{
61 struct die_args args = {
62 .regs = regs,
63 .str = "page fault",
64 .err = err,
65 .trapnr = 14,
66 .signr = SIGSEGV
67 };
68 return atomic_notifier_call_chain(&notify_page_fault_chain,
69 DIE_PAGE_FAULT, &args);
70}
71
72/* Sometimes the CPU reports invalid exceptions on prefetch.
73 Check that here and ignore.
74 Opcode checker based on code by Richard Brunner */
75static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
76 unsigned long error_code)
77{
78 unsigned char *instr;
79 int scan_more = 1;
80 int prefetch = 0;
81 unsigned char *max_instr;
82
83	/* If it was an exec fault, ignore it */
84 if (error_code & PF_INSTR)
85 return 0;
86
87 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
88 max_instr = instr + 15;
89
90 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
91 return 0;
92
93 while (scan_more && instr < max_instr) {
94 unsigned char opcode;
95 unsigned char instr_hi;
96 unsigned char instr_lo;
97
98 if (probe_kernel_address(instr, opcode))
99 break;
100
101 instr_hi = opcode & 0xf0;
102 instr_lo = opcode & 0x0f;
103 instr++;
104
105 switch (instr_hi) {
106 case 0x20:
107 case 0x30:
108 /* Values 0x26,0x2E,0x36,0x3E are valid x86
109 prefixes. In long mode, the CPU will signal
110 invalid opcode if some of these prefixes are
111 present so we will never get here anyway */
112 scan_more = ((instr_lo & 7) == 0x6);
113 break;
114
115 case 0x40:
116 /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
117 Need to figure out under what instruction mode the
118 instruction was issued ... */
119 /* Could check the LDT for lm, but for now it's good
120 enough to assume that long mode only uses well known
121 segments or kernel. */
122 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
123 break;
124
125 case 0x60:
126 /* 0x64 thru 0x67 are valid prefixes in all modes. */
127 scan_more = (instr_lo & 0xC) == 0x4;
128 break;
129 case 0xF0:
130 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
131 scan_more = !instr_lo || (instr_lo>>1) == 1;
132 break;
133 case 0x00:
134 /* Prefetch instruction is 0x0F0D or 0x0F18 */
135 scan_more = 0;
136 if (probe_kernel_address(instr, opcode))
137 break;
138 prefetch = (instr_lo == 0xF) &&
139 (opcode == 0x0D || opcode == 0x18);
140 break;
141 default:
142 scan_more = 0;
143 break;
144 }
145 }
146 return prefetch;
147}
148
149static int bad_address(void *p)
150{
151 unsigned long dummy;
152 return probe_kernel_address((unsigned long *)p, dummy);
153}
154
155void dump_pagetable(unsigned long address)
156{
157 pgd_t *pgd;
158 pud_t *pud;
159 pmd_t *pmd;
160 pte_t *pte;
161
162 pgd = (pgd_t *)read_cr3();
163
164 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
165 pgd += pgd_index(address);
166 if (bad_address(pgd)) goto bad;
167 printk("PGD %lx ", pgd_val(*pgd));
168 if (!pgd_present(*pgd)) goto ret;
169
170 pud = pud_offset(pgd, address);
171 if (bad_address(pud)) goto bad;
172 printk("PUD %lx ", pud_val(*pud));
173 if (!pud_present(*pud)) goto ret;
174
175 pmd = pmd_offset(pud, address);
176 if (bad_address(pmd)) goto bad;
177 printk("PMD %lx ", pmd_val(*pmd));
178 if (!pmd_present(*pmd)) goto ret;
179
180 pte = pte_offset_kernel(pmd, address);
181 if (bad_address(pte)) goto bad;
182 printk("PTE %lx", pte_val(*pte));
183ret:
184 printk("\n");
185 return;
186bad:
187 printk("BAD\n");
188}
189
190static const char errata93_warning[] =
191KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
192KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
193KERN_ERR "******* Please consider a BIOS update.\n"
194KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
195
196/* Workaround for K8 erratum #93 & buggy BIOS.
197 BIOS SMM functions are required to use a specific workaround
198 to avoid corruption of the 64bit RIP register on C stepping K8.
199 A lot of BIOS that didn't get tested properly miss this.
200 The OS sees this as a page fault with the upper 32bits of RIP cleared.
201 Try to work around it here.
202 Note we only handle faults in kernel here. */
203
204static int is_errata93(struct pt_regs *regs, unsigned long address)
205{
206 static int warned;
207 if (address != regs->rip)
208 return 0;
209 if ((address >> 32) != 0)
210 return 0;
211 address |= 0xffffffffUL << 32;
212 if ((address >= (u64)_stext && address <= (u64)_etext) ||
213 (address >= MODULES_VADDR && address <= MODULES_END)) {
214 if (!warned) {
215 printk(errata93_warning);
216 warned = 1;
217 }
218 regs->rip = address;
219 return 1;
220 }
221 return 0;
222}
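The fix-up in is_errata93() is a plain sign-extension of the truncated RIP: a kernel-text address such as 0xffffffff80201234 arrives with its upper half cleared as 0x0000000080201234, and ORing 0xffffffff00000000 back in restores the original value before the range check against _stext/_etext and the module area.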
223
224static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
225 unsigned long error_code)
226{
227 unsigned long flags = oops_begin();
228 struct task_struct *tsk;
229
230 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
231 current->comm, address);
232 dump_pagetable(address);
233 tsk = current;
234 tsk->thread.cr2 = address;
235 tsk->thread.trap_no = 14;
236 tsk->thread.error_code = error_code;
237 __die("Bad pagetable", regs, error_code);
238 oops_end(flags);
239 do_exit(SIGKILL);
240}
241
242/*
243 * Handle a fault on the vmalloc area
244 *
245 * This assumes no large pages in there.
246 */
247static int vmalloc_fault(unsigned long address)
248{
249 pgd_t *pgd, *pgd_ref;
250 pud_t *pud, *pud_ref;
251 pmd_t *pmd, *pmd_ref;
252 pte_t *pte, *pte_ref;
253
254 /* Copy kernel mappings over when needed. This can also
255	   happen within a race in page table update. In the latter
256 case just flush. */
257
258 pgd = pgd_offset(current->mm ?: &init_mm, address);
259 pgd_ref = pgd_offset_k(address);
260 if (pgd_none(*pgd_ref))
261 return -1;
262 if (pgd_none(*pgd))
263 set_pgd(pgd, *pgd_ref);
264 else
265 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
266
267 /* Below here mismatches are bugs because these lower tables
268 are shared */
269
270 pud = pud_offset(pgd, address);
271 pud_ref = pud_offset(pgd_ref, address);
272 if (pud_none(*pud_ref))
273 return -1;
274 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
275 BUG();
276 pmd = pmd_offset(pud, address);
277 pmd_ref = pmd_offset(pud_ref, address);
278 if (pmd_none(*pmd_ref))
279 return -1;
280 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
281 BUG();
282 pte_ref = pte_offset_kernel(pmd_ref, address);
283 if (!pte_present(*pte_ref))
284 return -1;
285 pte = pte_offset_kernel(pmd, address);
286 /* Don't use pte_page here, because the mappings can point
287 outside mem_map, and the NUMA hash lookup cannot handle
288 that. */
289 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
290 BUG();
291 return 0;
292}
293
294static int page_fault_trace;
295int show_unhandled_signals = 1;
296
297/*
298 * This routine handles page faults. It determines the address,
299 * and the problem, and then passes it off to one of the appropriate
300 * routines.
301 */
302asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
303 unsigned long error_code)
304{
305 struct task_struct *tsk;
306 struct mm_struct *mm;
307 struct vm_area_struct * vma;
308 unsigned long address;
309 const struct exception_table_entry *fixup;
310 int write, fault;
311 unsigned long flags;
312 siginfo_t info;
313
314 tsk = current;
315 mm = tsk->mm;
316 prefetchw(&mm->mmap_sem);
317
318 /* get the address */
319 address = read_cr2();
320
321 info.si_code = SEGV_MAPERR;
322
323
324 /*
325 * We fault-in kernel-space virtual memory on-demand. The
326 * 'reference' page table is init_mm.pgd.
327 *
328 * NOTE! We MUST NOT take any locks for this case. We may
329 * be in an interrupt or a critical region, and should
330 * only copy the information from the master page table,
331 * nothing more.
332 *
333 * This verifies that the fault happens in kernel space
334 * (error_code & 4) == 0, and that the fault was not a
335 * protection error (error_code & 9) == 0.
336 */
337 if (unlikely(address >= TASK_SIZE64)) {
338 /*
339 * Don't check for the module range here: its PML4
340 * is always initialized because it's shared with the main
341 * kernel text. Only vmalloc may need PML4 syncups.
342 */
343 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
344 ((address >= VMALLOC_START && address < VMALLOC_END))) {
345 if (vmalloc_fault(address) >= 0)
346 return;
347 }
348 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
349 return;
350 /*
351 * Don't take the mm semaphore here. If we fixup a prefetch
352 * fault we could otherwise deadlock.
353 */
354 goto bad_area_nosemaphore;
355 }
356
357 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
358 return;
359
360 if (likely(regs->eflags & X86_EFLAGS_IF))
361 local_irq_enable();
362
363 if (unlikely(page_fault_trace))
364 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
365 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
366
367 if (unlikely(error_code & PF_RSVD))
368 pgtable_bad(address, regs, error_code);
369
370 /*
371 * If we're in an interrupt or have no user
372 * context, we must not take the fault..
373 */
374 if (unlikely(in_atomic() || !mm))
375 goto bad_area_nosemaphore;
376
377 /*
378 * User-mode registers count as a user access even for any
379 * potential system fault or CPU buglet.
380 */
381 if (user_mode_vm(regs))
382 error_code |= PF_USER;
383
384 again:
385 /* When running in the kernel we expect faults to occur only to
386 * addresses in user space. All other faults represent errors in the
387	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
388 * erroneous fault occurring in a code path which already holds mmap_sem
389 * we will deadlock attempting to validate the fault against the
390 * address space. Luckily the kernel only validly references user
391 * space from well defined areas of code, which are listed in the
392 * exceptions table.
393 *
394 * As the vast majority of faults will be valid we will only perform
395	 * the source reference check when there is a possibility of a deadlock.
396 * Attempt to lock the address space, if we cannot we then validate the
397 * source. If this is invalid we can skip the address space check,
398 * thus avoiding the deadlock.
399 */
400 if (!down_read_trylock(&mm->mmap_sem)) {
401 if ((error_code & PF_USER) == 0 &&
402 !search_exception_tables(regs->rip))
403 goto bad_area_nosemaphore;
404 down_read(&mm->mmap_sem);
405 }
406
407 vma = find_vma(mm, address);
408 if (!vma)
409 goto bad_area;
410 if (likely(vma->vm_start <= address))
411 goto good_area;
412 if (!(vma->vm_flags & VM_GROWSDOWN))
413 goto bad_area;
414 if (error_code & 4) {
415 /* Allow userspace just enough access below the stack pointer
416 * to let the 'enter' instruction work.
417 */
418 if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
419 goto bad_area;
420 }
421 if (expand_stack(vma, address))
422 goto bad_area;
423/*
424 * Ok, we have a good vm_area for this memory access, so
425 * we can handle it..
426 */
427good_area:
428 info.si_code = SEGV_ACCERR;
429 write = 0;
430 switch (error_code & (PF_PROT|PF_WRITE)) {
431 default: /* 3: write, present */
432 /* fall through */
433 case PF_WRITE: /* write, not present */
434 if (!(vma->vm_flags & VM_WRITE))
435 goto bad_area;
436 write++;
437 break;
438 case PF_PROT: /* read, present */
439 goto bad_area;
440 case 0: /* read, not present */
441 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
442 goto bad_area;
443 }
444
445 /*
446 * If for any reason at all we couldn't handle the fault,
447 * make sure we exit gracefully rather than endlessly redo
448 * the fault.
449 */
450 fault = handle_mm_fault(mm, vma, address, write);
451 if (unlikely(fault & VM_FAULT_ERROR)) {
452 if (fault & VM_FAULT_OOM)
453 goto out_of_memory;
454 else if (fault & VM_FAULT_SIGBUS)
455 goto do_sigbus;
456 BUG();
457 }
458 if (fault & VM_FAULT_MAJOR)
459 tsk->maj_flt++;
460 else
461 tsk->min_flt++;
462 up_read(&mm->mmap_sem);
463 return;
464
465/*
466 * Something tried to access memory that isn't in our memory map..
467 * Fix it, but check if it's kernel or user first..
468 */
469bad_area:
470 up_read(&mm->mmap_sem);
471
472bad_area_nosemaphore:
473 /* User mode accesses just cause a SIGSEGV */
474 if (error_code & PF_USER) {
475
476 /*
477 * It's possible to have interrupts off here.
478 */
479 local_irq_enable();
480
481 if (is_prefetch(regs, address, error_code))
482 return;
483
484		/* Work around K8 erratum #100: the K8 in compat mode
485 occasionally jumps to illegal addresses >4GB. We
486 catch this here in the page fault handler because
487 these addresses are not reachable. Just detect this
488 case and return. Any code segment in LDT is
489 compatibility mode. */
490 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
491 (address >> 32))
492 return;
493
494 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
495 printk_ratelimit()) {
496 printk(
497 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
498 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
499 tsk->comm, tsk->pid, address, regs->rip,
500 regs->rsp, error_code);
501 }
502
503 tsk->thread.cr2 = address;
504 /* Kernel addresses are always protection faults */
505 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
506 tsk->thread.trap_no = 14;
507 info.si_signo = SIGSEGV;
508 info.si_errno = 0;
509 /* info.si_code has been set above */
510 info.si_addr = (void __user *)address;
511 force_sig_info(SIGSEGV, &info, tsk);
512 return;
513 }
514
515no_context:
516
517 /* Are we prepared to handle this kernel fault? */
518 fixup = search_exception_tables(regs->rip);
519 if (fixup) {
520 regs->rip = fixup->fixup;
521 return;
522 }
523
524 /*
525 * Hall of shame of CPU/BIOS bugs.
526 */
527
528 if (is_prefetch(regs, address, error_code))
529 return;
530
531 if (is_errata93(regs, address))
532 return;
533
534/*
535 * Oops. The kernel tried to access some bad page. We'll have to
536 * terminate things with extreme prejudice.
537 */
538
539 flags = oops_begin();
540
541 if (address < PAGE_SIZE)
542 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
543 else
544 printk(KERN_ALERT "Unable to handle kernel paging request");
545 printk(" at %016lx RIP: \n" KERN_ALERT,address);
546 printk_address(regs->rip);
547 dump_pagetable(address);
548 tsk->thread.cr2 = address;
549 tsk->thread.trap_no = 14;
550 tsk->thread.error_code = error_code;
551 __die("Oops", regs, error_code);
552 /* Executive summary in case the body of the oops scrolled away */
553 printk(KERN_EMERG "CR2: %016lx\n", address);
554 oops_end(flags);
555 do_exit(SIGKILL);
556
557/*
558 * We ran out of memory, or some other thing happened to us that made
559 * us unable to handle the page fault gracefully.
560 */
561out_of_memory:
562 up_read(&mm->mmap_sem);
563 if (is_init(current)) {
564 yield();
565 goto again;
566 }
567 printk("VM: killing process %s\n", tsk->comm);
568 if (error_code & 4)
569 do_group_exit(SIGKILL);
570 goto no_context;
571
572do_sigbus:
573 up_read(&mm->mmap_sem);
574
575 /* Kernel mode? Handle exceptions or die */
576 if (!(error_code & PF_USER))
577 goto no_context;
578
579 tsk->thread.cr2 = address;
580 tsk->thread.error_code = error_code;
581 tsk->thread.trap_no = 14;
582 info.si_signo = SIGBUS;
583 info.si_errno = 0;
584 info.si_code = BUS_ADRERR;
585 info.si_addr = (void __user *)address;
586 force_sig_info(SIGBUS, &info, tsk);
587 return;
588}
589
590DEFINE_SPINLOCK(pgd_lock);
591LIST_HEAD(pgd_list);
592
593void vmalloc_sync_all(void)
594{
595 /* Note that races in the updates of insync and start aren't
596 problematic:
597 insync can only get set bits added, and updates to start are only
598 improving performance (without affecting correctness if undone). */
599 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
600 static unsigned long start = VMALLOC_START & PGDIR_MASK;
601 unsigned long address;
602
603 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
604 if (!test_bit(pgd_index(address), insync)) {
605 const pgd_t *pgd_ref = pgd_offset_k(address);
606 struct page *page;
607
608 if (pgd_none(*pgd_ref))
609 continue;
610 spin_lock(&pgd_lock);
611 list_for_each_entry(page, &pgd_list, lru) {
612 pgd_t *pgd;
613 pgd = (pgd_t *)page_address(page) + pgd_index(address);
614 if (pgd_none(*pgd))
615 set_pgd(pgd, *pgd_ref);
616 else
617 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
618 }
619 spin_unlock(&pgd_lock);
620 set_bit(pgd_index(address), insync);
621 }
622 if (address == start)
623 start = address + PGDIR_SIZE;
624 }
625 /* Check that there is no need to do the same for the modules area. */
626 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
627 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
628 (__START_KERNEL & PGDIR_MASK)));
629}
630
631static int __init enable_pagefaulttrace(char *str)
632{
633 page_fault_trace = 1;
634 return 1;
635}
636__setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
new file mode 100644
index 000000000000..1c3bf95f7356
--- /dev/null
+++ b/arch/x86/mm/highmem_32.c
@@ -0,0 +1,113 @@
1#include <linux/highmem.h>
2#include <linux/module.h>
3
4void *kmap(struct page *page)
5{
6 might_sleep();
7 if (!PageHighMem(page))
8 return page_address(page);
9 return kmap_high(page);
10}
11
12void kunmap(struct page *page)
13{
14 if (in_interrupt())
15 BUG();
16 if (!PageHighMem(page))
17 return;
18 kunmap_high(page);
19}
20
21/*
22 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
23 * no global lock is needed and because the kmap code must perform a global TLB
24 * invalidation when the kmap pool wraps.
25 *
26 * However, when holding an atomic kmap it is not legal to sleep, so atomic
27 * kmaps are appropriate for short, tight code paths only.
28 */
29void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
30{
31 enum fixed_addresses idx;
32 unsigned long vaddr;
33
34 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
35 pagefault_disable();
36
37 if (!PageHighMem(page))
38 return page_address(page);
39
40 idx = type + KM_TYPE_NR*smp_processor_id();
41 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
42 BUG_ON(!pte_none(*(kmap_pte-idx)));
43 set_pte(kmap_pte-idx, mk_pte(page, prot));
44 arch_flush_lazy_mmu_mode();
45
46 return (void *)vaddr;
47}
48
49void *kmap_atomic(struct page *page, enum km_type type)
50{
51 return kmap_atomic_prot(page, type, kmap_prot);
52}
53
54void kunmap_atomic(void *kvaddr, enum km_type type)
55{
56 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
57 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
58
59 /*
60 * Force other mappings to Oops if they'll try to access this pte
61	 * without first remapping it. Keeping stale mappings around is also
62	 * a bad idea, in case the page changes cacheability attributes or becomes
63 * a protected page in a hypervisor.
64 */
65 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
66 kpte_clear_flush(kmap_pte-idx, vaddr);
67 else {
68#ifdef CONFIG_DEBUG_HIGHMEM
69 BUG_ON(vaddr < PAGE_OFFSET);
70 BUG_ON(vaddr >= (unsigned long)high_memory);
71#endif
72 }
73
74 arch_flush_lazy_mmu_mode();
75 pagefault_enable();
76}
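A typical caller pairs the two under the constraint spelled out in the comment above: the slot is per-CPU, so nothing between the map and the unmap may sleep. A minimal sketch of copying into a possibly-highmem destination page through the KM_USER0 slot (assumes linux/highmem.h and linux/string.h; the helper name is illustrative, not part of this file):

#include <linux/highmem.h>
#include <linux/string.h>

/* Sketch only: copy 'len' bytes into a page that may live in highmem. */
static void copy_to_page(struct page *page, const void *src, unsigned int len)
{
	void *dst = kmap_atomic(page, KM_USER0);

	memcpy(dst, src, len);
	kunmap_atomic(dst, KM_USER0);	/* no sleeping between map and unmap */
}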
77
78/* This is the same as kmap_atomic() but can map memory that doesn't
79 * have a struct page associated with it.
80 */
81void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
82{
83 enum fixed_addresses idx;
84 unsigned long vaddr;
85
86 pagefault_disable();
87
88 idx = type + KM_TYPE_NR*smp_processor_id();
89 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
90 set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
91 arch_flush_lazy_mmu_mode();
92
93 return (void*) vaddr;
94}
95
96struct page *kmap_atomic_to_page(void *ptr)
97{
98 unsigned long idx, vaddr = (unsigned long)ptr;
99 pte_t *pte;
100
101 if (vaddr < FIXADDR_START)
102 return virt_to_page(ptr);
103
104 idx = virt_to_fix(vaddr);
105 pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
106 return pte_page(*pte);
107}
108
109EXPORT_SYMBOL(kmap);
110EXPORT_SYMBOL(kunmap);
111EXPORT_SYMBOL(kmap_atomic);
112EXPORT_SYMBOL(kunmap_atomic);
113EXPORT_SYMBOL(kmap_atomic_to_page);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
new file mode 100644
index 000000000000..6c06d9c0488e
--- /dev/null
+++ b/arch/x86/mm/hugetlbpage.c
@@ -0,0 +1,391 @@
1/*
2 * IA-32 Huge TLB Page Support for Kernel.
3 *
4 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
5 */
6
7#include <linux/init.h>
8#include <linux/fs.h>
9#include <linux/mm.h>
10#include <linux/hugetlb.h>
11#include <linux/pagemap.h>
12#include <linux/slab.h>
13#include <linux/err.h>
14#include <linux/sysctl.h>
15#include <asm/mman.h>
16#include <asm/tlb.h>
17#include <asm/tlbflush.h>
18
19static unsigned long page_table_shareable(struct vm_area_struct *svma,
20 struct vm_area_struct *vma,
21 unsigned long addr, pgoff_t idx)
22{
23 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
24 svma->vm_start;
25 unsigned long sbase = saddr & PUD_MASK;
26 unsigned long s_end = sbase + PUD_SIZE;
27
28 /*
29	 * match the virtual addresses, permissions and the alignment of the
30 * page table page.
31 */
32 if (pmd_index(addr) != pmd_index(saddr) ||
33 vma->vm_flags != svma->vm_flags ||
34 sbase < svma->vm_start || svma->vm_end < s_end)
35 return 0;
36
37 return saddr;
38}
39
40static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
41{
42 unsigned long base = addr & PUD_MASK;
43 unsigned long end = base + PUD_SIZE;
44
45 /*
46 * check on proper vm_flags and page table alignment
47 */
48 if (vma->vm_flags & VM_MAYSHARE &&
49 vma->vm_start <= base && end <= vma->vm_end)
50 return 1;
51 return 0;
52}
53
54/*
55 * search for a shareable pmd page for hugetlb.
56 */
57static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
58{
59 struct vm_area_struct *vma = find_vma(mm, addr);
60 struct address_space *mapping = vma->vm_file->f_mapping;
61 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
62 vma->vm_pgoff;
63 struct prio_tree_iter iter;
64 struct vm_area_struct *svma;
65 unsigned long saddr;
66 pte_t *spte = NULL;
67
68 if (!vma_shareable(vma, addr))
69 return;
70
71 spin_lock(&mapping->i_mmap_lock);
72 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
73 if (svma == vma)
74 continue;
75
76 saddr = page_table_shareable(svma, vma, addr, idx);
77 if (saddr) {
78 spte = huge_pte_offset(svma->vm_mm, saddr);
79 if (spte) {
80 get_page(virt_to_page(spte));
81 break;
82 }
83 }
84 }
85
86 if (!spte)
87 goto out;
88
89 spin_lock(&mm->page_table_lock);
90 if (pud_none(*pud))
91 pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
92 else
93 put_page(virt_to_page(spte));
94 spin_unlock(&mm->page_table_lock);
95out:
96 spin_unlock(&mapping->i_mmap_lock);
97}
98
99/*
100 * unmap huge page backed by shared pte.
101 *
102 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
103 * indicated by page_count > 1, unmap is achieved by clearing pud and
104 * decrementing the ref count. If count == 1, the pte page is not shared.
105 *
106 * called with vma->vm_mm->page_table_lock held.
107 *
108 * returns: 1 successfully unmapped a shared pte page
109 * 0 the underlying pte page is not shared, or it is the last user
110 */
111int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
112{
113 pgd_t *pgd = pgd_offset(mm, *addr);
114 pud_t *pud = pud_offset(pgd, *addr);
115
116 BUG_ON(page_count(virt_to_page(ptep)) == 0);
117 if (page_count(virt_to_page(ptep)) == 1)
118 return 0;
119
120 pud_clear(pud);
121 put_page(virt_to_page(ptep));
122 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
123 return 1;
124}
125
126pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
127{
128 pgd_t *pgd;
129 pud_t *pud;
130 pte_t *pte = NULL;
131
132 pgd = pgd_offset(mm, addr);
133 pud = pud_alloc(mm, pgd, addr);
134 if (pud) {
135 if (pud_none(*pud))
136 huge_pmd_share(mm, addr, pud);
137 pte = (pte_t *) pmd_alloc(mm, pud, addr);
138 }
139 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
140
141 return pte;
142}
143
144pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
145{
146 pgd_t *pgd;
147 pud_t *pud;
148 pmd_t *pmd = NULL;
149
150 pgd = pgd_offset(mm, addr);
151 if (pgd_present(*pgd)) {
152 pud = pud_offset(pgd, addr);
153 if (pud_present(*pud))
154 pmd = pmd_offset(pud, addr);
155 }
156 return (pte_t *) pmd;
157}
158
159#if 0 /* This is just for testing */
160struct page *
161follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
162{
163 unsigned long vpfn = address / PAGE_SIZE;
164 struct page *page;
165 struct vm_area_struct *vma;
166 pte_t *pte;
167
168 /* note: compiled out (#if 0), kept for testing only */
169 vma = find_vma(mm, address);
170 if (!vma || !is_vm_hugetlb_page(vma))
171 return ERR_PTR(-EINVAL);
172
173 pte = huge_pte_offset(mm, address);
174
175 /* hugetlb should be locked, and hence, prefaulted */
176 WARN_ON(!pte || pte_none(*pte));
177
178 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
179
180 WARN_ON(!PageCompound(page));
181
182 return page;
183}
184
185int pmd_huge(pmd_t pmd)
186{
187 return 0;
188}
189
190struct page *
191follow_huge_pmd(struct mm_struct *mm, unsigned long address,
192 pmd_t *pmd, int write)
193{
194 return NULL;
195}
196
197#else
198
199struct page *
200follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
201{
202 return ERR_PTR(-EINVAL);
203}
204
205int pmd_huge(pmd_t pmd)
206{
207 return !!(pmd_val(pmd) & _PAGE_PSE);
208}
209
210struct page *
211follow_huge_pmd(struct mm_struct *mm, unsigned long address,
212 pmd_t *pmd, int write)
213{
214 struct page *page;
215
216 page = pte_page(*(pte_t *)pmd);
217 if (page)
218 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
219 return page;
220}
221#endif
222
223/* x86_64 also uses this file */
224
225#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
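/*
 * Bottom-up search for a free, HPAGE_SIZE-aligned range of length @len,
 * starting from the cached free_area_cache hint and restarting from
 * TASK_UNMAPPED_BASE before giving up with -ENOMEM.
 */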
226static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
227 unsigned long addr, unsigned long len,
228 unsigned long pgoff, unsigned long flags)
229{
230 struct mm_struct *mm = current->mm;
231 struct vm_area_struct *vma;
232 unsigned long start_addr;
233
234 if (len > mm->cached_hole_size) {
235 start_addr = mm->free_area_cache;
236 } else {
237 start_addr = TASK_UNMAPPED_BASE;
238 mm->cached_hole_size = 0;
239 }
240
241full_search:
242 addr = ALIGN(start_addr, HPAGE_SIZE);
243
244 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
245 /* At this point: (!vma || addr < vma->vm_end). */
246 if (TASK_SIZE - len < addr) {
247 /*
248 * Start a new search - just in case we missed
249 * some holes.
250 */
251 if (start_addr != TASK_UNMAPPED_BASE) {
252 start_addr = TASK_UNMAPPED_BASE;
253 mm->cached_hole_size = 0;
254 goto full_search;
255 }
256 return -ENOMEM;
257 }
258 if (!vma || addr + len <= vma->vm_start) {
259 mm->free_area_cache = addr + len;
260 return addr;
261 }
262 if (addr + mm->cached_hole_size < vma->vm_start)
263 mm->cached_hole_size = vma->vm_start - addr;
264 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
265 }
266}
267
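/*
 * Top-down variant: look for an HPAGE_SIZE-aligned hole below mm->mmap_base,
 * retry once from the base, and finally fall back to the bottom-up allocator
 * if nothing fits.
 */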
268static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
269 unsigned long addr0, unsigned long len,
270 unsigned long pgoff, unsigned long flags)
271{
272 struct mm_struct *mm = current->mm;
273 struct vm_area_struct *vma, *prev_vma;
274 unsigned long base = mm->mmap_base, addr = addr0;
275 unsigned long largest_hole = mm->cached_hole_size;
276 int first_time = 1;
277
278 /* don't allow allocations above current base */
279 if (mm->free_area_cache > base)
280 mm->free_area_cache = base;
281
282 if (len <= largest_hole) {
283 largest_hole = 0;
284 mm->free_area_cache = base;
285 }
286try_again:
287 /* make sure it can fit in the remaining address space */
288 if (mm->free_area_cache < len)
289 goto fail;
290
291 /* either no address requested or can't fit in requested address hole */
292 addr = (mm->free_area_cache - len) & HPAGE_MASK;
293 do {
294 /*
295 * Lookup failure means no vma is above this address,
296 * i.e. return with success:
297 */
298 if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
299 return addr;
300
301 /*
302 * new region fits between prev_vma->vm_end and
303 * vma->vm_start, use it:
304 */
305 if (addr + len <= vma->vm_start &&
306 (!prev_vma || (addr >= prev_vma->vm_end))) {
307 /* remember the address as a hint for next time */
308 mm->cached_hole_size = largest_hole;
309 return (mm->free_area_cache = addr);
310 } else {
311 /* pull free_area_cache down to the first hole */
312 if (mm->free_area_cache == vma->vm_end) {
313 mm->free_area_cache = vma->vm_start;
314 mm->cached_hole_size = largest_hole;
315 }
316 }
317
318 /* remember the largest hole we saw so far */
319 if (addr + largest_hole < vma->vm_start)
320 largest_hole = vma->vm_start - addr;
321
322 /* try just below the current vma->vm_start */
323 addr = (vma->vm_start - len) & HPAGE_MASK;
324 } while (len <= vma->vm_start);
325
326fail:
327 /*
328 * if hint left us with no space for the requested
329 * mapping then try again:
330 */
331 if (first_time) {
332 mm->free_area_cache = base;
333 largest_hole = 0;
334 first_time = 0;
335 goto try_again;
336 }
337 /*
338 * A failed mmap() very likely causes application failure,
339 * so fall back to the bottom-up function here. This scenario
340 * can happen with large stack limits and large mmap()
341 * allocations.
342 */
343 mm->free_area_cache = TASK_UNMAPPED_BASE;
344 mm->cached_hole_size = ~0UL;
345 addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
346 len, pgoff, flags);
347
348 /*
349 * Restore the topdown base:
350 */
351 mm->free_area_cache = base;
352 mm->cached_hole_size = ~0UL;
353
354 return addr;
355}
356
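/*
 * Arch hook for placing hugetlb mappings: check size and alignment, honour
 * MAP_FIXED and any address hint, then use the bottom-up or top-down helper
 * to match the mm's normal mmap layout.
 */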
357unsigned long
358hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
359 unsigned long len, unsigned long pgoff, unsigned long flags)
360{
361 struct mm_struct *mm = current->mm;
362 struct vm_area_struct *vma;
363
364 if (len & ~HPAGE_MASK)
365 return -EINVAL;
366 if (len > TASK_SIZE)
367 return -ENOMEM;
368
369 if (flags & MAP_FIXED) {
370 if (prepare_hugepage_range(addr, len))
371 return -EINVAL;
372 return addr;
373 }
374
375 if (addr) {
376 addr = ALIGN(addr, HPAGE_SIZE);
377 vma = find_vma(mm, addr);
378 if (TASK_SIZE - len >= addr &&
379 (!vma || addr + len <= vma->vm_start))
380 return addr;
381 }
382 if (mm->get_unmapped_area == arch_get_unmapped_area)
383 return hugetlb_get_unmapped_area_bottomup(file, addr, len,
384 pgoff, flags);
385 else
386 return hugetlb_get_unmapped_area_topdown(file, addr, len,
387 pgoff, flags);
388}
389
390#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
391
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
new file mode 100644
index 000000000000..730a5b177b1f
--- /dev/null
+++ b/arch/x86/mm/init_32.c
@@ -0,0 +1,858 @@
1/*
2 * linux/arch/i386/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
7 */
8
9#include <linux/module.h>
10#include <linux/signal.h>
11#include <linux/sched.h>
12#include <linux/kernel.h>
13#include <linux/errno.h>
14#include <linux/string.h>
15#include <linux/types.h>
16#include <linux/ptrace.h>
17#include <linux/mman.h>
18#include <linux/mm.h>
19#include <linux/hugetlb.h>
20#include <linux/swap.h>
21#include <linux/smp.h>
22#include <linux/init.h>
23#include <linux/highmem.h>
24#include <linux/pagemap.h>
25#include <linux/pfn.h>
26#include <linux/poison.h>
27#include <linux/bootmem.h>
28#include <linux/slab.h>
29#include <linux/proc_fs.h>
30#include <linux/efi.h>
31#include <linux/memory_hotplug.h>
32#include <linux/initrd.h>
33#include <linux/cpumask.h>
34
35#include <asm/processor.h>
36#include <asm/system.h>
37#include <asm/uaccess.h>
38#include <asm/pgtable.h>
39#include <asm/dma.h>
40#include <asm/fixmap.h>
41#include <asm/e820.h>
42#include <asm/apic.h>
43#include <asm/tlb.h>
44#include <asm/tlbflush.h>
45#include <asm/sections.h>
46#include <asm/paravirt.h>
47
48unsigned int __VMALLOC_RESERVE = 128 << 20;
49
50DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
51unsigned long highstart_pfn, highend_pfn;
52
53static int noinline do_test_wp_bit(void);
54
55/*
56 * Creates a middle page table and puts a pointer to it in the
57 * given global directory (pgd) entry. In non-PAE builds this simply
58 * returns the pgd entry, since the middle layer is folded into it.
59 */
60static pmd_t * __init one_md_table_init(pgd_t *pgd)
61{
62 pud_t *pud;
63 pmd_t *pmd_table;
64
65#ifdef CONFIG_X86_PAE
66 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
67 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
68
69 paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
70 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
71 pud = pud_offset(pgd, 0);
72 if (pmd_table != pmd_offset(pud, 0))
73 BUG();
74 }
75#endif
76 pud = pud_offset(pgd, 0);
77 pmd_table = pmd_offset(pud, 0);
78 return pmd_table;
79}
80
81/*
82 * Create a page table and place a pointer to it in a middle page
83 * directory entry.
84 */
85static pte_t * __init one_page_table_init(pmd_t *pmd)
86{
87 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
88 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
89
90 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
91 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
92 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
93 }
94
95 return pte_offset_kernel(pmd, 0);
96}
97
98/*
99 * This function initializes a certain range of kernel virtual memory
100 * with new bootmem page tables, everywhere page tables are missing in
101 * the given range.
102 */
103
104/*
105 * NOTE: The pagetables are allocated contiguous on the physical space
106 * so we can cache the place of the first one and move around without
107 * checking the pgd every time.
108 */
109static void __init page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
110{
111 pgd_t *pgd;
112 pmd_t *pmd;
113 int pgd_idx, pmd_idx;
114 unsigned long vaddr;
115
116 vaddr = start;
117 pgd_idx = pgd_index(vaddr);
118 pmd_idx = pmd_index(vaddr);
119 pgd = pgd_base + pgd_idx;
120
121 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
122 pmd = one_md_table_init(pgd);
123 pmd = pmd + pmd_index(vaddr);
124 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
125 one_page_table_init(pmd);
126
127 vaddr += PMD_SIZE;
128 }
129 pmd_idx = 0;
130 }
131}
132
133static inline int is_kernel_text(unsigned long addr)
134{
135 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
136 return 1;
137 return 0;
138}
139
140/*
141 * This maps the physical memory to kernel virtual address space, a total
142 * of max_low_pfn pages, by creating page tables starting from address
143 * PAGE_OFFSET.
144 */
145static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
146{
147 unsigned long pfn;
148 pgd_t *pgd;
149 pmd_t *pmd;
150 pte_t *pte;
151 int pgd_idx, pmd_idx, pte_ofs;
152
153 pgd_idx = pgd_index(PAGE_OFFSET);
154 pgd = pgd_base + pgd_idx;
155 pfn = 0;
156
157 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
158 pmd = one_md_table_init(pgd);
159 if (pfn >= max_low_pfn)
160 continue;
161 for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
162 unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
163
164 /* Map with big pages if possible, otherwise create normal page tables. */
165 if (cpu_has_pse) {
166 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
167 if (is_kernel_text(address) || is_kernel_text(address2))
168 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
169 else
170 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
171
172 pfn += PTRS_PER_PTE;
173 } else {
174 pte = one_page_table_init(pmd);
175
176 for (pte_ofs = 0;
177 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
178 pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
179 if (is_kernel_text(address))
180 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
181 else
182 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
183 }
184 }
185 }
186 }
187}
188
189static inline int page_kills_ppro(unsigned long pagenr)
190{
191 if (pagenr >= 0x70000 && pagenr <= 0x7003F)
192 return 1;
193 return 0;
194}
195
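/*
 * Return 1 if @pagenr lies in usable RAM according to the EFI memory map
 * (when EFI is enabled) or the e820 map, 0 otherwise.
 */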
196int page_is_ram(unsigned long pagenr)
197{
198 int i;
199 unsigned long addr, end;
200
201 if (efi_enabled) {
202 efi_memory_desc_t *md;
203 void *p;
204
205 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
206 md = p;
207 if (!is_available_memory(md))
208 continue;
209 addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
210 end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
211
212 if ((pagenr >= addr) && (pagenr < end))
213 return 1;
214 }
215 return 0;
216 }
217
218 for (i = 0; i < e820.nr_map; i++) {
219
220 if (e820.map[i].type != E820_RAM) /* not usable memory */
221 continue;
222 /*
223 * !!!FIXME!!! Some BIOSen report areas as RAM that
224 * are not. Notably the 640K->1MB area. We need a sanity
225 * check here.
226 */
227 addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
228 end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
229 if ((pagenr >= addr) && (pagenr < end))
230 return 1;
231 }
232 return 0;
233}
234
235#ifdef CONFIG_HIGHMEM
236pte_t *kmap_pte;
237pgprot_t kmap_prot;
238
239#define kmap_get_fixmap_pte(vaddr) \
240 pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
241
242static void __init kmap_init(void)
243{
244 unsigned long kmap_vstart;
245
246 /* cache the first kmap pte */
247 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
248 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
249
250 kmap_prot = PAGE_KERNEL;
251}
252
253static void __init permanent_kmaps_init(pgd_t *pgd_base)
254{
255 pgd_t *pgd;
256 pud_t *pud;
257 pmd_t *pmd;
258 pte_t *pte;
259 unsigned long vaddr;
260
261 vaddr = PKMAP_BASE;
262 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
263
264 pgd = swapper_pg_dir + pgd_index(vaddr);
265 pud = pud_offset(pgd, vaddr);
266 pmd = pmd_offset(pud, vaddr);
267 pte = pte_offset_kernel(pmd, vaddr);
268 pkmap_page_table = pte;
269}
270
271static void __meminit free_new_highpage(struct page *page)
272{
273 init_page_count(page);
274 __free_page(page);
275 totalhigh_pages++;
276}
277
278void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
279{
280 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
281 ClearPageReserved(page);
282 free_new_highpage(page);
283 } else
284 SetPageReserved(page);
285}
286
287static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
288{
289 free_new_highpage(page);
290 totalram_pages++;
291#ifdef CONFIG_FLATMEM
292 max_mapnr = max(pfn, max_mapnr);
293#endif
294 num_physpages++;
295 return 0;
296}
297
298/*
299 * Not currently handling the NUMA case.
300 * Assuming a single node, and that all memory
301 * added dynamically and onlined here
302 * ends up in HIGHMEM.
303 */
304void __meminit online_page(struct page *page)
305{
306 ClearPageReserved(page);
307 add_one_highpage_hotplug(page, page_to_pfn(page));
308}
309
310
311#ifdef CONFIG_NUMA
312extern void set_highmem_pages_init(int);
313#else
314static void __init set_highmem_pages_init(int bad_ppro)
315{
316 int pfn;
317 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
318 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
319 totalram_pages += totalhigh_pages;
320}
321#endif /* CONFIG_NUMA */
322
323#else
324#define kmap_init() do { } while (0)
325#define permanent_kmaps_init(pgd_base) do { } while (0)
326#define set_highmem_pages_init(bad_ppro) do { } while (0)
327#endif /* CONFIG_HIGHMEM */
328
329unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
330EXPORT_SYMBOL(__PAGE_KERNEL);
331unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
332
333#ifdef CONFIG_NUMA
334extern void __init remap_numa_kva(void);
335#else
336#define remap_numa_kva() do {} while (0)
337#endif
338
339void __init native_pagetable_setup_start(pgd_t *base)
340{
341#ifdef CONFIG_X86_PAE
342 int i;
343
344 /*
345 * Init entries of the first-level page table to the
346 * zero page, if they haven't already been set up.
347 *
348 * In a normal native boot, we'll be running on a
349 * pagetable rooted in swapper_pg_dir, but not in PAE
350 * mode, so this will end up clobbering the mappings
351 * for the lower 24Mbytes of the address space,
352 * without affecting the kernel address space.
353 */
354 for (i = 0; i < USER_PTRS_PER_PGD; i++)
355 set_pgd(&base[i],
356 __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
357
358 /* Make sure kernel address space is empty so that a pagetable
359 will be allocated for it. */
360 memset(&base[USER_PTRS_PER_PGD], 0,
361 KERNEL_PGD_PTRS * sizeof(pgd_t));
362#else
363 paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
364#endif
365}
366
367void __init native_pagetable_setup_done(pgd_t *base)
368{
369#ifdef CONFIG_X86_PAE
370 /*
371 * Add low memory identity-mappings - SMP needs it when
372 * starting up on an AP from real-mode. In the non-PAE
373 * case we already have these mappings through head.S.
374 * All user-space mappings are explicitly cleared after
375 * SMP startup.
376 */
377 set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
378#endif
379}
380
381/*
382 * Build a proper pagetable for the kernel mappings. Up until this
383 * point, we've been running on some set of pagetables constructed by
384 * the boot process.
385 *
386 * If we're booting on native hardware, this will be a pagetable
387 * constructed in arch/i386/kernel/head.S, and not running in PAE mode
388 * (even if we'll end up running in PAE). The root of the pagetable
389 * will be swapper_pg_dir.
390 *
391 * If we're booting paravirtualized under a hypervisor, then there are
392 * more options: we may already be running PAE, and the pagetable may
393 * or may not be based in swapper_pg_dir. In any case,
394 * paravirt_pagetable_setup_start() will set up swapper_pg_dir
395 * appropriately for the rest of the initialization to work.
396 *
397 * In general, pagetable_init() assumes that the pagetable may already
398 * be partially populated, and so it avoids stomping on any existing
399 * mappings.
400 */
401static void __init pagetable_init(void)
402{
403 unsigned long vaddr, end;
404 pgd_t *pgd_base = swapper_pg_dir;
405
406 paravirt_pagetable_setup_start(pgd_base);
407
408 /* Enable PSE if available */
409 if (cpu_has_pse)
410 set_in_cr4(X86_CR4_PSE);
411
412 /* Enable PGE if available */
413 if (cpu_has_pge) {
414 set_in_cr4(X86_CR4_PGE);
415 __PAGE_KERNEL |= _PAGE_GLOBAL;
416 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
417 }
418
419 kernel_physical_mapping_init(pgd_base);
420 remap_numa_kva();
421
422 /*
423 * Fixed mappings, only the page table structure has to be
424 * created - mappings will be set by set_fixmap():
425 */
426 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
427 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
428 page_table_range_init(vaddr, end, pgd_base);
429
430 permanent_kmaps_init(pgd_base);
431
432 paravirt_pagetable_setup_done(pgd_base);
433}
434
435#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
436/*
437 * Swap suspend & friends need this for resume because things like the intel-agp
438 * driver might have split up a kernel 4MB mapping.
439 */
440char __nosavedata swsusp_pg_dir[PAGE_SIZE]
441 __attribute__ ((aligned (PAGE_SIZE)));
442
443static inline void save_pg_dir(void)
444{
445 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
446}
447#else
448static inline void save_pg_dir(void)
449{
450}
451#endif
452
453void zap_low_mappings(void)
454{
455 int i;
456
457 save_pg_dir();
458
459 /*
460 * Zap initial low-memory mappings.
461 *
462 * Note that "pgd_clear()" doesn't do it for
463 * us, because pgd_clear() is a no-op on i386.
464 */
465 for (i = 0; i < USER_PTRS_PER_PGD; i++)
466#ifdef CONFIG_X86_PAE
467 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
468#else
469 set_pgd(swapper_pg_dir+i, __pgd(0));
470#endif
471 flush_tlb_all();
472}
473
474int nx_enabled = 0;
475
476#ifdef CONFIG_X86_PAE
477
478static int disable_nx __initdata = 0;
479u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
480EXPORT_SYMBOL_GPL(__supported_pte_mask);
481
482/*
483 * noexec = on|off
484 *
485 * Control non executable mappings.
486 *
487 * on Enable
488 * off Disable
489 */
490static int __init noexec_setup(char *str)
491{
492 if (!str || !strcmp(str, "on")) {
493 if (cpu_has_nx) {
494 __supported_pte_mask |= _PAGE_NX;
495 disable_nx = 0;
496 }
497 } else if (!strcmp(str,"off")) {
498 disable_nx = 1;
499 __supported_pte_mask &= ~_PAGE_NX;
500 } else
501 return -EINVAL;
502
503 return 0;
504}
505early_param("noexec", noexec_setup);
506
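/*
 * Probe CPUID 0x80000001 for the NX feature and, unless "noexec=off" was
 * given, enable it in EFER and in the supported pte mask.
 */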
507static void __init set_nx(void)
508{
509 unsigned int v[4], l, h;
510
511 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
512 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
513 if ((v[3] & (1 << 20)) && !disable_nx) {
514 rdmsr(MSR_EFER, l, h);
515 l |= EFER_NX;
516 wrmsr(MSR_EFER, l, h);
517 nx_enabled = 1;
518 __supported_pte_mask |= _PAGE_NX;
519 }
520 }
521}
522
523/*
524 * Enables/disables executability of a given kernel page and
525 * returns the previous setting.
526 */
527int __init set_kernel_exec(unsigned long vaddr, int enable)
528{
529 pte_t *pte;
530 int ret = 1;
531
532 if (!nx_enabled)
533 goto out;
534
535 pte = lookup_address(vaddr);
536 BUG_ON(!pte);
537
538 if (!pte_exec_kernel(*pte))
539 ret = 0;
540
541 if (enable)
542 pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
543 else
544 pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
545 pte_update_defer(&init_mm, vaddr, pte);
546 __flush_tlb_all();
547out:
548 return ret;
549}
550
551#endif
552
553/*
554 * paging_init() sets up the page tables - note that the first 8MB are
555 * already mapped by head.S.
556 *
557 * This routine also unmaps the page at virtual kernel address 0, so
558 * that we can trap those pesky NULL-reference errors in the kernel.
559 */
560void __init paging_init(void)
561{
562#ifdef CONFIG_X86_PAE
563 set_nx();
564 if (nx_enabled)
565 printk("NX (Execute Disable) protection: active\n");
566#endif
567
568 pagetable_init();
569
570 load_cr3(swapper_pg_dir);
571
572#ifdef CONFIG_X86_PAE
573 /*
574 * We will bail out later - printk doesn't work right now so
575 * the user would just see a hanging kernel.
576 */
577 if (cpu_has_pae)
578 set_in_cr4(X86_CR4_PAE);
579#endif
580 __flush_tlb_all();
581
582 kmap_init();
583}
584
585/*
586 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
587 * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
588 * used to involve black magic jumps to work around some nasty CPU bugs,
589 * but fortunately the switch to using exceptions got rid of all that.
590 */
591
592static void __init test_wp_bit(void)
593{
594 printk("Checking if this processor honours the WP bit even in supervisor mode... ");
595
596 /* Any page-aligned address will do, the test is non-destructive */
597 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
598 boot_cpu_data.wp_works_ok = do_test_wp_bit();
599 clear_fixmap(FIX_WP_TEST);
600
601 if (!boot_cpu_data.wp_works_ok) {
602 printk("No.\n");
603#ifdef CONFIG_X86_WP_WORKS_OK
604 panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
605#endif
606 } else {
607 printk("Ok.\n");
608 }
609}
610
611static struct kcore_list kcore_mem, kcore_vmalloc;
612
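/*
 * Final boot-time memory setup: release low memory from bootmem to the page
 * allocator, initialize highmem, register /proc/kcore regions and print the
 * virtual memory layout.
 */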
613void __init mem_init(void)
614{
615 extern int ppro_with_ram_bug(void);
616 int codesize, reservedpages, datasize, initsize;
617 int tmp;
618 int bad_ppro;
619
620#ifdef CONFIG_FLATMEM
621 BUG_ON(!mem_map);
622#endif
623
624 bad_ppro = ppro_with_ram_bug();
625
626#ifdef CONFIG_HIGHMEM
627 /* check that fixmap and pkmap do not overlap */
628 if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
629 printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
630 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
631 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
632 BUG();
633 }
634#endif
635
636 /* this will put all low memory onto the freelists */
637 totalram_pages += free_all_bootmem();
638
639 reservedpages = 0;
640 for (tmp = 0; tmp < max_low_pfn; tmp++)
641 /*
642 * Only count reserved RAM pages
643 */
644 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
645 reservedpages++;
646
647 set_highmem_pages_init(bad_ppro);
648
649 codesize = (unsigned long) &_etext - (unsigned long) &_text;
650 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
651 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
652
653 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
654 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
655 VMALLOC_END-VMALLOC_START);
656
657 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
658 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
659 num_physpages << (PAGE_SHIFT-10),
660 codesize >> 10,
661 reservedpages << (PAGE_SHIFT-10),
662 datasize >> 10,
663 initsize >> 10,
664 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
665 );
666
667#if 1 /* double-sanity-check paranoia */
668 printk("virtual kernel memory layout:\n"
669 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
670#ifdef CONFIG_HIGHMEM
671 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
672#endif
673 " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
674 " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
675 " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
676 " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
677 " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
678 FIXADDR_START, FIXADDR_TOP,
679 (FIXADDR_TOP - FIXADDR_START) >> 10,
680
681#ifdef CONFIG_HIGHMEM
682 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
683 (LAST_PKMAP*PAGE_SIZE) >> 10,
684#endif
685
686 VMALLOC_START, VMALLOC_END,
687 (VMALLOC_END - VMALLOC_START) >> 20,
688
689 (unsigned long)__va(0), (unsigned long)high_memory,
690 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
691
692 (unsigned long)&__init_begin, (unsigned long)&__init_end,
693 ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
694
695 (unsigned long)&_etext, (unsigned long)&_edata,
696 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
697
698 (unsigned long)&_text, (unsigned long)&_etext,
699 ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
700
701#ifdef CONFIG_HIGHMEM
702 BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
703 BUG_ON(VMALLOC_END > PKMAP_BASE);
704#endif
705 BUG_ON(VMALLOC_START > VMALLOC_END);
706 BUG_ON((unsigned long)high_memory > VMALLOC_START);
707#endif /* double-sanity-check paranoia */
708
709#ifdef CONFIG_X86_PAE
710 if (!cpu_has_pae)
711 panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
712#endif
713 if (boot_cpu_data.wp_works_ok < 0)
714 test_wp_bit();
715
716 /*
717 * Subtle. SMP is doing its boot stuff late (because it has to
718 * fork idle threads) - but it also needs low mappings for the
719 * protected-mode entry to work. We zap these entries only after
720 * the WP-bit has been tested.
721 */
722#ifndef CONFIG_SMP
723 zap_low_mappings();
724#endif
725}
726
727#ifdef CONFIG_MEMORY_HOTPLUG
728int arch_add_memory(int nid, u64 start, u64 size)
729{
730 struct pglist_data *pgdata = NODE_DATA(nid);
731 struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
732 unsigned long start_pfn = start >> PAGE_SHIFT;
733 unsigned long nr_pages = size >> PAGE_SHIFT;
734
735 return __add_pages(zone, start_pfn, nr_pages);
736}
737
738int remove_memory(u64 start, u64 size)
739{
740 return -EINVAL;
741}
742EXPORT_SYMBOL_GPL(remove_memory);
743#endif
744
745struct kmem_cache *pmd_cache;
746
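/*
 * Set up the slab cache used for PAE pmd pages; with a non-shared kernel
 * pmd the pgd additionally has to be a full page.
 */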
747void __init pgtable_cache_init(void)
748{
749 size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
750
751 if (PTRS_PER_PMD > 1) {
752 pmd_cache = kmem_cache_create("pmd",
753 PTRS_PER_PMD*sizeof(pmd_t),
754 PTRS_PER_PMD*sizeof(pmd_t),
755 SLAB_PANIC,
756 pmd_ctor);
757 if (!SHARED_KERNEL_PMD) {
758 /* If we're in PAE mode and have a non-shared
759 kernel pmd, then the pgd size must be a
760 page size. This is because the pgd_list
761 links through the page structure, so there
762 can only be one pgd per page for this to
763 work. */
764 pgd_size = PAGE_SIZE;
765 }
766 }
767}
768
769/*
770 * This function cannot be __init, since exceptions don't work in that
771 * section. Put this after the callers, so that it cannot be inlined.
772 */
773static int noinline do_test_wp_bit(void)
774{
775 char tmp_reg;
776 int flag;
777
778 __asm__ __volatile__(
779 " movb %0,%1 \n"
780 "1: movb %1,%0 \n"
781 " xorl %2,%2 \n"
782 "2: \n"
783 ".section __ex_table,\"a\"\n"
784 " .align 4 \n"
785 " .long 1b,2b \n"
786 ".previous \n"
787 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
788 "=q" (tmp_reg),
789 "=r" (flag)
790 :"2" (1)
791 :"memory");
792
793 return flag;
794}
795
796#ifdef CONFIG_DEBUG_RODATA
797
798void mark_rodata_ro(void)
799{
800 unsigned long start = PFN_ALIGN(_text);
801 unsigned long size = PFN_ALIGN(_etext) - start;
802
803#ifndef CONFIG_KPROBES
804#ifdef CONFIG_HOTPLUG_CPU
805 /* It must still be possible to apply SMP alternatives. */
806 if (num_possible_cpus() <= 1)
807#endif
808 {
809 change_page_attr(virt_to_page(start),
810 size >> PAGE_SHIFT, PAGE_KERNEL_RX);
811 printk("Write protecting the kernel text: %luk\n", size >> 10);
812 }
813#endif
814 start += size;
815 size = (unsigned long)__end_rodata - start;
816 change_page_attr(virt_to_page(start),
817 size >> PAGE_SHIFT, PAGE_KERNEL_RO);
818 printk("Write protecting the kernel read-only data: %luk\n",
819 size >> 10);
820
821 /*
822 * change_page_attr() requires a global_flush_tlb() call after it.
823 * We do this after the printk so that if something went wrong in the
824 * change, the printk gets out at least to give a better debug hint
825 * of who is the culprit.
826 */
827 global_flush_tlb();
828}
829#endif
830
831void free_init_pages(char *what, unsigned long begin, unsigned long end)
832{
833 unsigned long addr;
834
835 for (addr = begin; addr < end; addr += PAGE_SIZE) {
836 ClearPageReserved(virt_to_page(addr));
837 init_page_count(virt_to_page(addr));
838 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
839 free_page(addr);
840 totalram_pages++;
841 }
842 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
843}
844
845void free_initmem(void)
846{
847 free_init_pages("unused kernel memory",
848 (unsigned long)(&__init_begin),
849 (unsigned long)(&__init_end));
850}
851
852#ifdef CONFIG_BLK_DEV_INITRD
853void free_initrd_mem(unsigned long start, unsigned long end)
854{
855 free_init_pages("initrd memory", start, end);
856}
857#endif
858
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
new file mode 100644
index 000000000000..458893b376f8
--- /dev/null
+++ b/arch/x86/mm/init_64.c
@@ -0,0 +1,750 @@
1/*
2 * linux/arch/x86_64/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 */
8
9#include <linux/signal.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/types.h>
15#include <linux/ptrace.h>
16#include <linux/mman.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/smp.h>
20#include <linux/init.h>
21#include <linux/pagemap.h>
22#include <linux/bootmem.h>
23#include <linux/proc_fs.h>
24#include <linux/pci.h>
25#include <linux/pfn.h>
26#include <linux/poison.h>
27#include <linux/dma-mapping.h>
28#include <linux/module.h>
29#include <linux/memory_hotplug.h>
30#include <linux/nmi.h>
31
32#include <asm/processor.h>
33#include <asm/system.h>
34#include <asm/uaccess.h>
35#include <asm/pgtable.h>
36#include <asm/pgalloc.h>
37#include <asm/dma.h>
38#include <asm/fixmap.h>
39#include <asm/e820.h>
40#include <asm/apic.h>
41#include <asm/tlb.h>
42#include <asm/mmu_context.h>
43#include <asm/proto.h>
44#include <asm/smp.h>
45#include <asm/sections.h>
46
47#ifndef Dprintk
48#define Dprintk(x...)
49#endif
50
51const struct dma_mapping_ops *dma_ops;
52EXPORT_SYMBOL(dma_ops);
53
54static unsigned long dma_reserve __initdata;
55
56DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
57
58/*
59 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
60 * in physical memory, so we can cache the location of the first one and
61 * move around without checking the pgd every time.
62 */
63
64void show_mem(void)
65{
66 long i, total = 0, reserved = 0;
67 long shared = 0, cached = 0;
68 pg_data_t *pgdat;
69 struct page *page;
70
71 printk(KERN_INFO "Mem-info:\n");
72 show_free_areas();
73 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
74
75 for_each_online_pgdat(pgdat) {
76 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
77 /* this loop can take a while with 256 GB and 4k pages
78 so update the NMI watchdog */
79 if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
80 touch_nmi_watchdog();
81 }
82 if (!pfn_valid(pgdat->node_start_pfn + i))
83 continue;
84 page = pfn_to_page(pgdat->node_start_pfn + i);
85 total++;
86 if (PageReserved(page))
87 reserved++;
88 else if (PageSwapCache(page))
89 cached++;
90 else if (page_count(page))
91 shared += page_count(page) - 1;
92 }
93 }
94 printk(KERN_INFO "%lu pages of RAM\n", total);
95 printk(KERN_INFO "%lu reserved pages\n",reserved);
96 printk(KERN_INFO "%lu pages shared\n",shared);
97 printk(KERN_INFO "%lu pages swap cached\n",cached);
98}
99
100int after_bootmem;
101
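/*
 * Get a zeroed page for a kernel page table: from the bootmem allocator
 * during early boot, from the page allocator afterwards.
 */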
102static __init void *spp_getpage(void)
103{
104 void *ptr;
105 if (after_bootmem)
106 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
107 else
108 ptr = alloc_bootmem_pages(PAGE_SIZE);
109 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
110 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
111
112 Dprintk("spp_getpage %p\n", ptr);
113 return ptr;
114}
115
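/*
 * Install a single kernel pte mapping @vaddr to @phys with protection @prot,
 * allocating missing pmd/pte pages; the pgd entry must already have been set
 * up by head.S.
 */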
116static __init void set_pte_phys(unsigned long vaddr,
117 unsigned long phys, pgprot_t prot)
118{
119 pgd_t *pgd;
120 pud_t *pud;
121 pmd_t *pmd;
122 pte_t *pte, new_pte;
123
124 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
125
126 pgd = pgd_offset_k(vaddr);
127 if (pgd_none(*pgd)) {
128 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
129 return;
130 }
131 pud = pud_offset(pgd, vaddr);
132 if (pud_none(*pud)) {
133 pmd = (pmd_t *) spp_getpage();
134 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
135 if (pmd != pmd_offset(pud, 0)) {
136 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
137 return;
138 }
139 }
140 pmd = pmd_offset(pud, vaddr);
141 if (pmd_none(*pmd)) {
142 pte = (pte_t *) spp_getpage();
143 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
144 if (pte != pte_offset_kernel(pmd, 0)) {
145 printk("PAGETABLE BUG #02!\n");
146 return;
147 }
148 }
149 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
150
151 pte = pte_offset_kernel(pmd, vaddr);
152 if (!pte_none(*pte) &&
153 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
154 pte_ERROR(*pte);
155 set_pte(pte, new_pte);
156
157 /*
158 * It's enough to flush this one mapping.
159 * (PGE mappings get flushed as well)
160 */
161 __flush_tlb_one(vaddr);
162}
163
164/* NOTE: this is meant to be run only at boot */
165void __init
166__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
167{
168 unsigned long address = __fix_to_virt(idx);
169
170 if (idx >= __end_of_fixed_addresses) {
171 printk("Invalid __set_fixmap\n");
172 return;
173 }
174 set_pte_phys(address, phys, prot);
175}
176
177unsigned long __meminitdata table_start, table_end;
178
179static __meminit void *alloc_low_page(unsigned long *phys)
180{
181 unsigned long pfn = table_end++;
182 void *adr;
183
184 if (after_bootmem) {
185 adr = (void *)get_zeroed_page(GFP_ATOMIC);
186 *phys = __pa(adr);
187 return adr;
188 }
189
190 if (pfn >= end_pfn)
191 panic("alloc_low_page: ran out of memory");
192
193 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
194 memset(adr, 0, PAGE_SIZE);
195 *phys = pfn * PAGE_SIZE;
196 return adr;
197}
198
199static __meminit void unmap_low_page(void *adr)
200{
201
202 if (after_bootmem)
203 return;
204
205 early_iounmap(adr, PAGE_SIZE);
206}
207
208/* Must run before zap_low_mappings */
209__meminit void *early_ioremap(unsigned long addr, unsigned long size)
210{
211 unsigned long vaddr;
212 pmd_t *pmd, *last_pmd;
213 int i, pmds;
214
215 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
216 vaddr = __START_KERNEL_map;
217 pmd = level2_kernel_pgt;
218 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
219 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
220 for (i = 0; i < pmds; i++) {
221 if (pmd_present(pmd[i]))
222 goto next;
223 }
224 vaddr += addr & ~PMD_MASK;
225 addr &= PMD_MASK;
226 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
227 set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
228 __flush_tlb();
229 return (void *)vaddr;
230 next:
231 ;
232 }
233 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
234 return NULL;
235}
236
237/* To avoid virtual aliases later */
238__meminit void early_iounmap(void *addr, unsigned long size)
239{
240 unsigned long vaddr;
241 pmd_t *pmd;
242 int i, pmds;
243
244 vaddr = (unsigned long)addr;
245 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
246 pmd = level2_kernel_pgt + pmd_index(vaddr);
247 for (i = 0; i < pmds; i++)
248 pmd_clear(pmd + i);
249 __flush_tlb();
250}
251
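/*
 * Fill a pmd page with 2MB (PSE) entries for the kernel direct mapping of
 * [address, end); during early boot, entries beyond 'end' are cleared.
 */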
252static void __meminit
253phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
254{
255 int i = pmd_index(address);
256
257 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
258 unsigned long entry;
259 pmd_t *pmd = pmd_page + pmd_index(address);
260
261 if (address >= end) {
262 if (!after_bootmem)
263 for (; i < PTRS_PER_PMD; i++, pmd++)
264 set_pmd(pmd, __pmd(0));
265 break;
266 }
267
268 if (pmd_val(*pmd))
269 continue;
270
271 entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
272 entry &= __supported_pte_mask;
273 set_pmd(pmd, __pmd(entry));
274 }
275}
276
277static void __meminit
278phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
279{
280 pmd_t *pmd = pmd_offset(pud,0);
281 spin_lock(&init_mm.page_table_lock);
282 phys_pmd_init(pmd, address, end);
283 spin_unlock(&init_mm.page_table_lock);
284 __flush_tlb_all();
285}
286
287static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
288{
289 int i = pud_index(addr);
290
291
292 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
293 unsigned long pmd_phys;
294 pud_t *pud = pud_page + pud_index(addr);
295 pmd_t *pmd;
296
297 if (addr >= end)
298 break;
299
300 if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
301 set_pud(pud, __pud(0));
302 continue;
303 }
304
305 if (pud_val(*pud)) {
306 phys_pmd_update(pud, addr, end);
307 continue;
308 }
309
310 pmd = alloc_low_page(&pmd_phys);
311 spin_lock(&init_mm.page_table_lock);
312 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
313 phys_pmd_init(pmd, addr, end);
314 spin_unlock(&init_mm.page_table_lock);
315 unmap_low_page(pmd);
316 }
317 __flush_tlb();
318}
319
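/*
 * Estimate how much room the direct-mapping page tables need for memory up
 * to 'end' and find a free e820 area to hold them.
 */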
320static void __init find_early_table_space(unsigned long end)
321{
322 unsigned long puds, pmds, tables, start;
323
324 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
325 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
326 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
327 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
328
329 /* RED-PEN putting page tables only on node 0 could
330 cause a hotspot and fill up ZONE_DMA. The page tables
331 need roughly 0.5KB per GB. */
332 start = 0x8000;
333 table_start = find_e820_area(start, end, tables);
334 if (table_start == -1UL)
335 panic("Cannot find space for the kernel page tables");
336
337 table_start >>= PAGE_SHIFT;
338 table_end = table_start;
339
340 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
341 end, table_start << PAGE_SHIFT,
342 (table_start << PAGE_SHIFT) + tables);
343}
344
345/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
346 This runs before bootmem is initialized and gets pages directly from the
347 physical memory. To access them they are temporarily mapped. */
348void __meminit init_memory_mapping(unsigned long start, unsigned long end)
349{
350 unsigned long next;
351
352 Dprintk("init_memory_mapping\n");
353
354 /*
355 * Find space for the kernel direct mapping tables.
356 * Later we should allocate these tables in the local node of the memory
357 * mapped. Unfortunately this is done currently before the nodes are
358 * discovered.
359 */
360 if (!after_bootmem)
361 find_early_table_space(end);
362
363 start = (unsigned long)__va(start);
364 end = (unsigned long)__va(end);
365
366 for (; start < end; start = next) {
367 unsigned long pud_phys;
368 pgd_t *pgd = pgd_offset_k(start);
369 pud_t *pud;
370
371 if (after_bootmem)
372 pud = pud_offset(pgd, start & PGDIR_MASK);
373 else
374 pud = alloc_low_page(&pud_phys);
375
376 next = start + PGDIR_SIZE;
377 if (next > end)
378 next = end;
379 phys_pud_init(pud, __pa(start), __pa(next));
380 if (!after_bootmem)
381 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
382 unmap_low_page(pud);
383 }
384
385 if (!after_bootmem)
386 mmu_cr4_features = read_cr4();
387 __flush_tlb_all();
388}
389
390#ifndef CONFIG_NUMA
391void __init paging_init(void)
392{
393 unsigned long max_zone_pfns[MAX_NR_ZONES];
394 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
395 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
396 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
397 max_zone_pfns[ZONE_NORMAL] = end_pfn;
398
399 memory_present(0, 0, end_pfn);
400 sparse_init();
401 free_area_init_nodes(max_zone_pfns);
402}
403#endif
404
405/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
406 from the CPU leading to inconsistent cache lines. address and size
407 must be aligned to 2MB boundaries.
408 Does nothing when the mapping doesn't exist. */
409void __init clear_kernel_mapping(unsigned long address, unsigned long size)
410{
411 unsigned long end = address + size;
412
413 BUG_ON(address & ~LARGE_PAGE_MASK);
414 BUG_ON(size & ~LARGE_PAGE_MASK);
415
416 for (; address < end; address += LARGE_PAGE_SIZE) {
417 pgd_t *pgd = pgd_offset_k(address);
418 pud_t *pud;
419 pmd_t *pmd;
420 if (pgd_none(*pgd))
421 continue;
422 pud = pud_offset(pgd, address);
423 if (pud_none(*pud))
424 continue;
425 pmd = pmd_offset(pud, address);
426 if (!pmd || pmd_none(*pmd))
427 continue;
428 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
429 /* Could handle this, but it should not happen currently. */
430 printk(KERN_ERR
431 "clear_kernel_mapping: mapping has been split. will leak memory\n");
432 pmd_ERROR(*pmd);
433 }
434 set_pmd(pmd, __pmd(0));
435 }
436 __flush_tlb_all();
437}
438
439/*
440 * Memory hotplug specific functions
441 */
442void online_page(struct page *page)
443{
444 ClearPageReserved(page);
445 init_page_count(page);
446 __free_page(page);
447 totalram_pages++;
448 num_physpages++;
449}
450
451#ifdef CONFIG_MEMORY_HOTPLUG
452/*
453 * Memory is always added to the NORMAL zone. This means you will never get
454 * additional DMA/DMA32 memory.
455 */
456int arch_add_memory(int nid, u64 start, u64 size)
457{
458 struct pglist_data *pgdat = NODE_DATA(nid);
459 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
460 unsigned long start_pfn = start >> PAGE_SHIFT;
461 unsigned long nr_pages = size >> PAGE_SHIFT;
462 int ret;
463
464 init_memory_mapping(start, (start + size -1));
465
466 ret = __add_pages(zone, start_pfn, nr_pages);
467 if (ret)
468 goto error;
469
470 return ret;
471error:
472 printk("%s: Problem encountered in __add_pages!\n", __func__);
473 return ret;
474}
475EXPORT_SYMBOL_GPL(arch_add_memory);
476
477int remove_memory(u64 start, u64 size)
478{
479 return -EINVAL;
480}
481EXPORT_SYMBOL_GPL(remove_memory);
482
483#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
484int memory_add_physaddr_to_nid(u64 start)
485{
486 return 0;
487}
488EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
489#endif
490
491#endif /* CONFIG_MEMORY_HOTPLUG */
492
493#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
494/*
495 * Memory hot-add without sparsemem. The mem_maps have been allocated in advance;
496 * just online the pages.
497 */
498int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
499{
500 int err = -EIO;
501 unsigned long pfn;
502 unsigned long total = 0, mem = 0;
503 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
504 if (pfn_valid(pfn)) {
505 online_page(pfn_to_page(pfn));
506 err = 0;
507 mem++;
508 }
509 total++;
510 }
511 if (!err) {
512 z->spanned_pages += total;
513 z->present_pages += mem;
514 z->zone_pgdat->node_spanned_pages += total;
515 z->zone_pgdat->node_present_pages += mem;
516 }
517 return err;
518}
519#endif
520
521static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
522 kcore_vsyscall;
523
524void __init mem_init(void)
525{
526 long codesize, reservedpages, datasize, initsize;
527
528 pci_iommu_alloc();
529
530 /* clear the zero-page */
531 memset(empty_zero_page, 0, PAGE_SIZE);
532
533 reservedpages = 0;
534
535 /* this will put all low memory onto the freelists */
536#ifdef CONFIG_NUMA
537 totalram_pages = numa_free_all_bootmem();
538#else
539 totalram_pages = free_all_bootmem();
540#endif
541 reservedpages = end_pfn - totalram_pages -
542 absent_pages_in_range(0, end_pfn);
543
544 after_bootmem = 1;
545
546 codesize = (unsigned long) &_etext - (unsigned long) &_text;
547 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
548 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
549
550 /* Register memory areas for /proc/kcore */
551 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
552 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
553 VMALLOC_END-VMALLOC_START);
554 kclist_add(&kcore_kernel, &_stext, _end - _stext);
555 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
556 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
557 VSYSCALL_END - VSYSCALL_START);
558
559 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
560 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
561 end_pfn << (PAGE_SHIFT-10),
562 codesize >> 10,
563 reservedpages << (PAGE_SHIFT-10),
564 datasize >> 10,
565 initsize >> 10);
566}
567
568void free_init_pages(char *what, unsigned long begin, unsigned long end)
569{
570 unsigned long addr;
571
572 if (begin >= end)
573 return;
574
575 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
576 for (addr = begin; addr < end; addr += PAGE_SIZE) {
577 ClearPageReserved(virt_to_page(addr));
578 init_page_count(virt_to_page(addr));
579 memset((void *)(addr & ~(PAGE_SIZE-1)),
580 POISON_FREE_INITMEM, PAGE_SIZE);
581 if (addr >= __START_KERNEL_map)
582 change_page_attr_addr(addr, 1, __pgprot(0));
583 free_page(addr);
584 totalram_pages++;
585 }
586 if (addr > __START_KERNEL_map)
587 global_flush_tlb();
588}
589
590void free_initmem(void)
591{
592 free_init_pages("unused kernel memory",
593 (unsigned long)(&__init_begin),
594 (unsigned long)(&__init_end));
595}
596
597#ifdef CONFIG_DEBUG_RODATA
598
599void mark_rodata_ro(void)
600{
601 unsigned long start = (unsigned long)_stext, end;
602
603#ifdef CONFIG_HOTPLUG_CPU
604 /* It must still be possible to apply SMP alternatives. */
605 if (num_possible_cpus() > 1)
606 start = (unsigned long)_etext;
607#endif
608
609#ifdef CONFIG_KPROBES
610 start = (unsigned long)__start_rodata;
611#endif
612
613 end = (unsigned long)__end_rodata;
614 start = (start + PAGE_SIZE - 1) & PAGE_MASK;
615 end &= PAGE_MASK;
616 if (end <= start)
617 return;
618
619 change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
620
621 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
622 (end - start) >> 10);
623
624 /*
625 * change_page_attr_addr() requires a global_flush_tlb() call after it.
626 * We do this after the printk so that if something went wrong in the
627 * change, the printk gets out at least to give a better debug hint
628 * of who is the culprit.
629 */
630 global_flush_tlb();
631}
632#endif
633
634#ifdef CONFIG_BLK_DEV_INITRD
635void free_initrd_mem(unsigned long start, unsigned long end)
636{
637 free_init_pages("initrd memory", start, end);
638}
639#endif
640
641void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
642{
643#ifdef CONFIG_NUMA
644 int nid = phys_to_nid(phys);
645#endif
646 unsigned long pfn = phys >> PAGE_SHIFT;
647 if (pfn >= end_pfn) {
648 /* This can happen with kdump kernels when accessing firmware
649 tables. */
650 if (pfn < end_pfn_map)
651 return;
652 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
653 phys, len);
654 return;
655 }
656
657 /* Should check here against the e820 map to avoid double free */
658#ifdef CONFIG_NUMA
659 reserve_bootmem_node(NODE_DATA(nid), phys, len);
660#else
661 reserve_bootmem(phys, len);
662#endif
663 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
664 dma_reserve += len / PAGE_SIZE;
665 set_dma_reserve(dma_reserve);
666 }
667}
668
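/*
 * Check whether a (canonical) kernel virtual address is backed by a present
 * mapping, walking all page-table levels and handling 2MB large pages.
 */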
669int kern_addr_valid(unsigned long addr)
670{
671 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
672 pgd_t *pgd;
673 pud_t *pud;
674 pmd_t *pmd;
675 pte_t *pte;
676
677 if (above != 0 && above != -1UL)
678 return 0;
679
680 pgd = pgd_offset_k(addr);
681 if (pgd_none(*pgd))
682 return 0;
683
684 pud = pud_offset(pgd, addr);
685 if (pud_none(*pud))
686 return 0;
687
688 pmd = pmd_offset(pud, addr);
689 if (pmd_none(*pmd))
690 return 0;
691 if (pmd_large(*pmd))
692 return pfn_valid(pmd_pfn(*pmd));
693
694 pte = pte_offset_kernel(pmd, addr);
695 if (pte_none(*pte))
696 return 0;
697 return pfn_valid(pte_pfn(*pte));
698}
699
700/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
701 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
702 not need special handling anymore. */
703
704static struct vm_area_struct gate_vma = {
705 .vm_start = VSYSCALL_START,
706 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
707 .vm_page_prot = PAGE_READONLY_EXEC,
708 .vm_flags = VM_READ | VM_EXEC
709};
710
711struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
712{
713#ifdef CONFIG_IA32_EMULATION
714 if (test_tsk_thread_flag(tsk, TIF_IA32))
715 return NULL;
716#endif
717 return &gate_vma;
718}
719
720int in_gate_area(struct task_struct *task, unsigned long addr)
721{
722 struct vm_area_struct *vma = get_gate_vma(task);
723 if (!vma)
724 return 0;
725 return (addr >= vma->vm_start) && (addr < vma->vm_end);
726}
727
728/* Use this when you have no reliable task/vma, typically from interrupt
729 * context. It is less reliable than using the task's vma and may give
730 * false positives.
731 */
732int in_gate_area_no_task(unsigned long addr)
733{
734 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
735}
736
737void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
738{
739 return __alloc_bootmem_core(pgdat->bdata, size,
740 SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
741}
742
743const char *arch_vma_name(struct vm_area_struct *vma)
744{
745 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
746 return "[vdso]";
747 if (vma == &gate_vma)
748 return "[vsyscall]";
749 return NULL;
750}
diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c
new file mode 100644
index 000000000000..0b278315d737
--- /dev/null
+++ b/arch/x86/mm/ioremap_32.c
@@ -0,0 +1,274 @@
1/*
2 * arch/i386/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PC's
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/io.h>
16#include <asm/fixmap.h>
17#include <asm/cacheflush.h>
18#include <asm/tlbflush.h>
19#include <asm/pgtable.h>
20
21#define ISA_START_ADDRESS 0xa0000
22#define ISA_END_ADDRESS 0x100000
23
24/*
25 * Generic mapping function (not visible outside):
26 */
27
28/*
29 * Remap an arbitrary physical address space into the kernel virtual
30 * address space. Needed when the kernel wants to access high addresses
31 * directly.
32 *
33 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
34 * have to convert them into an offset in a page-aligned mapping, but the
35 * caller shouldn't need to know that small detail.
36 */
37void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
38{
39 void __iomem * addr;
40 struct vm_struct * area;
41 unsigned long offset, last_addr;
42 pgprot_t prot;
43
44 /* Don't allow wraparound or zero size */
45 last_addr = phys_addr + size - 1;
46 if (!size || last_addr < phys_addr)
47 return NULL;
48
49 /*
50 * Don't remap the low PCI/ISA area, it's always mapped..
51 */
52 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
53 return (void __iomem *) phys_to_virt(phys_addr);
54
55 /*
56 * Don't allow anybody to remap normal RAM that we're using..
57 */
58 if (phys_addr <= virt_to_phys(high_memory - 1)) {
59 char *t_addr, *t_end;
60 struct page *page;
61
62 t_addr = __va(phys_addr);
63 t_end = t_addr + (size - 1);
64
65 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
66 if(!PageReserved(page))
67 return NULL;
68 }
69
70 prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
71 | _PAGE_ACCESSED | flags);
72
73 /*
74 * Mappings have to be page-aligned
75 */
76 offset = phys_addr & ~PAGE_MASK;
77 phys_addr &= PAGE_MASK;
78 size = PAGE_ALIGN(last_addr+1) - phys_addr;
79
80 /*
81 * Ok, go for it..
82 */
83 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
84 if (!area)
85 return NULL;
86 area->phys_addr = phys_addr;
87 addr = (void __iomem *) area->addr;
88 if (ioremap_page_range((unsigned long) addr,
89 (unsigned long) addr + size, phys_addr, prot)) {
90 vunmap((void __force *) addr);
91 return NULL;
92 }
93 return (void __iomem *) (offset + (char __iomem *)addr);
94}
95EXPORT_SYMBOL(__ioremap);
96
97/**
98 * ioremap_nocache - map bus memory into CPU space
99 * @offset: bus address of the memory
100 * @size: size of the resource to map
101 *
102 * ioremap_nocache performs a platform specific sequence of operations to
103 * make bus memory CPU accessible via the readb/readw/readl/writeb/
104 * writew/writel functions and the other mmio helpers. The returned
105 * address is not guaranteed to be usable directly as a virtual
106 * address.
107 *
108 * This version of ioremap ensures that the memory is marked uncachable
109 * on the CPU as well as honouring existing caching rules from things like
110 * the PCI bus. Note that there are other caches and buffers on many
111 * busses. In particular, driver authors should read up on PCI writes.
112 *
113 * It's useful if some control registers are in such an area and
114 * write combining or read caching is not desirable:
115 *
116 * Must be freed with iounmap.
117 */
118
119void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
120{
121 unsigned long last_addr;
122 void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
123 if (!p)
124 return p;
125
126 /* Guaranteed to be > phys_addr, as per __ioremap() */
127 last_addr = phys_addr + size - 1;
128
129 if (last_addr < virt_to_phys(high_memory) - 1) {
130 struct page *ppage = virt_to_page(__va(phys_addr));
131 unsigned long npages;
132
133 phys_addr &= PAGE_MASK;
134
135 /* This might overflow and become zero.. */
136 last_addr = PAGE_ALIGN(last_addr);
137
138 /* .. but that's ok, because modulo-2**n arithmetic will make
139 * the page-aligned "last - first" come out right.
140 */
141 npages = (last_addr - phys_addr) >> PAGE_SHIFT;
142
143 if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
144 iounmap(p);
145 p = NULL;
146 }
147 global_flush_tlb();
148 }
149
150 return p;
151}
152EXPORT_SYMBOL(ioremap_nocache);
153
154/**
155 * iounmap - Free a IO remapping
156 * @addr: virtual address from ioremap_*
157 *
158 * Caller must ensure there is only one unmapping for the same pointer.
159 */
160void iounmap(volatile void __iomem *addr)
161{
162 struct vm_struct *p, *o;
163
164 if ((void __force *)addr <= high_memory)
165 return;
166
167 /*
168 * __ioremap special-cases the PCI/ISA range by not instantiating a
169 * vm_area and by simply returning an address into the kernel mapping
170 * of ISA space. So handle that here.
171 */
172 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
173 addr < phys_to_virt(ISA_END_ADDRESS))
174 return;
175
176 addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
177
178 /* Use the vm area unlocked, assuming the caller
179 ensures there isn't another iounmap for the same address
180 in parallel. Reuse of the virtual address is prevented by
181 leaving it in the global lists until we're done with it.
182 cpa takes care of the direct mappings. */
183 read_lock(&vmlist_lock);
184 for (p = vmlist; p; p = p->next) {
185 if (p->addr == addr)
186 break;
187 }
188 read_unlock(&vmlist_lock);
189
190 if (!p) {
191 printk("iounmap: bad address %p\n", addr);
192 dump_stack();
193 return;
194 }
195
196 /* Reset the direct mapping. Can block */
197 if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
198 change_page_attr(virt_to_page(__va(p->phys_addr)),
199 get_vm_area_size(p) >> PAGE_SHIFT,
200 PAGE_KERNEL);
201 global_flush_tlb();
202 }
203
204 /* Finally remove it */
205 o = remove_vm_area((void *)addr);
206 BUG_ON(p != o || o == NULL);
207 kfree(p);
208}
209EXPORT_SYMBOL(iounmap);
210
211void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
212{
213 unsigned long offset, last_addr;
214 unsigned int nrpages;
215 enum fixed_addresses idx;
216
217 /* Don't allow wraparound or zero size */
218 last_addr = phys_addr + size - 1;
219 if (!size || last_addr < phys_addr)
220 return NULL;
221
222 /*
223 * Don't remap the low PCI/ISA area, it's always mapped..
224 */
225 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
226 return phys_to_virt(phys_addr);
227
228 /*
229 * Mappings have to be page-aligned
230 */
231 offset = phys_addr & ~PAGE_MASK;
232 phys_addr &= PAGE_MASK;
233 size = PAGE_ALIGN(last_addr) - phys_addr;
234
235 /*
236 * Mappings have to fit in the FIX_BTMAP area.
237 */
238 nrpages = size >> PAGE_SHIFT;
239 if (nrpages > NR_FIX_BTMAPS)
240 return NULL;
241
242 /*
243 * Ok, go for it..
244 */
245 idx = FIX_BTMAP_BEGIN;
246 while (nrpages > 0) {
247 set_fixmap(idx, phys_addr);
248 phys_addr += PAGE_SIZE;
249 --idx;
250 --nrpages;
251 }
252 return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
253}
254
255void __init bt_iounmap(void *addr, unsigned long size)
256{
257 unsigned long virt_addr;
258 unsigned long offset;
259 unsigned int nrpages;
260 enum fixed_addresses idx;
261
262 virt_addr = (unsigned long)addr;
263 if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
264 return;
265 offset = virt_addr & ~PAGE_MASK;
266 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
267
268 idx = FIX_BTMAP_BEGIN;
269 while (nrpages > 0) {
270 clear_fixmap(idx);
271 --idx;
272 --nrpages;
273 }
274}
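/*
 * Illustrative sketch only: bt_ioremap()/bt_iounmap() are for very early
 * boot code that needs a temporary mapping (via the FIX_BTMAP fixmap
 * slots) before the normal ioremap machinery is available, e.g. to peek
 * at a firmware table.  "table_phys" and "table_len" are made-up names.
 */
#if 0
	char *p = bt_ioremap(table_phys, table_len);
	if (p) {
		u8 revision = p[8];	/* read a field from the table */
		bt_iounmap(p, table_len);
	}
#endif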
diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c
new file mode 100644
index 000000000000..6cac90aa5032
--- /dev/null
+++ b/arch/x86/mm/ioremap_64.c
@@ -0,0 +1,210 @@
1/*
2 * arch/x86_64/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PCs
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17#include <asm/pgalloc.h>
18#include <asm/fixmap.h>
19#include <asm/tlbflush.h>
20#include <asm/cacheflush.h>
21#include <asm/proto.h>
22
23unsigned long __phys_addr(unsigned long x)
24{
25 if (x >= __START_KERNEL_map)
26 return x - __START_KERNEL_map + phys_base;
27 return x - PAGE_OFFSET;
28}
29EXPORT_SYMBOL(__phys_addr);
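/*
 * Worked example (illustrative; exact constants depend on the kernel
 * layout of this era): with __START_KERNEL_map at 0xffffffff80000000 and
 * PAGE_OFFSET at 0xffff810000000000, a kernel-text address such as
 * 0xffffffff80200000 yields phys_base + 0x200000, while a direct-map
 * address such as 0xffff810001000000 yields 0x1000000 (16 MB).
 */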
30
31#define ISA_START_ADDRESS 0xa0000
32#define ISA_END_ADDRESS 0x100000
33
34/*
35 * Fix up the linear direct mapping of the kernel to avoid cache attribute
36 * conflicts.
37 */
38static int
39ioremap_change_attr(unsigned long phys_addr, unsigned long size,
40 unsigned long flags)
41{
42 int err = 0;
43 if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
44 unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
45 unsigned long vaddr = (unsigned long) __va(phys_addr);
46
47 /*
48	 * Must use an address here and not a struct page because the phys addr
49	 * can be in a hole between nodes and not have a memmap entry.
50 */
51 err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
52 if (!err)
53 global_flush_tlb();
54 }
55 return err;
56}
57
58/*
59 * Generic mapping function
60 */
61
62/*
63 * Remap an arbitrary physical address space into the kernel virtual
64 * address space. Needed when the kernel wants to access high addresses
65 * directly.
66 *
67 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
68 * have to convert them into an offset in a page-aligned mapping, but the
69 * caller shouldn't need to know that small detail.
70 */
71void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
72{
73 void * addr;
74 struct vm_struct * area;
75 unsigned long offset, last_addr;
76 pgprot_t pgprot;
77
78 /* Don't allow wraparound or zero size */
79 last_addr = phys_addr + size - 1;
80 if (!size || last_addr < phys_addr)
81 return NULL;
82
83 /*
84 * Don't remap the low PCI/ISA area, it's always mapped..
85 */
86 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
87 return (__force void __iomem *)phys_to_virt(phys_addr);
88
89#ifdef CONFIG_FLATMEM
90 /*
91 * Don't allow anybody to remap normal RAM that we're using..
92 */
93 if (last_addr < virt_to_phys(high_memory)) {
94 char *t_addr, *t_end;
95 struct page *page;
96
97 t_addr = __va(phys_addr);
98 t_end = t_addr + (size - 1);
99
100 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
101 if(!PageReserved(page))
102 return NULL;
103 }
104#endif
105
106 pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
107 | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
108 /*
109 * Mappings have to be page-aligned
110 */
111 offset = phys_addr & ~PAGE_MASK;
112 phys_addr &= PAGE_MASK;
113 size = PAGE_ALIGN(last_addr+1) - phys_addr;
114
115 /*
116 * Ok, go for it..
117 */
118 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
119 if (!area)
120 return NULL;
121 area->phys_addr = phys_addr;
122 addr = area->addr;
123 if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
124 phys_addr, pgprot)) {
125 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
126 return NULL;
127 }
128 if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
129 area->flags &= 0xffffff;
130 vunmap(addr);
131 return NULL;
132 }
133 return (__force void __iomem *) (offset + (char *)addr);
134}
135EXPORT_SYMBOL(__ioremap);
136
137/**
138 * ioremap_nocache - map bus memory into CPU space
139 * @offset: bus address of the memory
140 * @size: size of the resource to map
141 *
142 * ioremap_nocache performs a platform specific sequence of operations to
143 * make bus memory CPU accessible via the readb/readw/readl/writeb/
144 * writew/writel functions and the other mmio helpers. The returned
145 * address is not guaranteed to be usable directly as a virtual
146 * address.
147 *
148 * This version of ioremap ensures that the memory is marked uncachable
149 * on the CPU as well as honouring existing caching rules from things like
150 * the PCI bus. Note that there are other caches and buffers on many
151 * busses. In particular, driver authors should read up on PCI writes.
152 *
153 * It's useful if some control registers are in such an area and
154 * write combining or read caching is not desirable.
155 *
156 * Must be freed with iounmap.
157 */
158
159void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
160{
161 return __ioremap(phys_addr, size, _PAGE_PCD);
162}
163EXPORT_SYMBOL(ioremap_nocache);
164
165/**
166 * iounmap - Free an IO remapping
167 * @addr: virtual address from ioremap_*
168 *
169 * Caller must ensure there is only one unmapping for the same pointer.
170 */
171void iounmap(volatile void __iomem *addr)
172{
173 struct vm_struct *p, *o;
174
175 if (addr <= high_memory)
176 return;
177 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
178 addr < phys_to_virt(ISA_END_ADDRESS))
179 return;
180
181 addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
182 /* Use the vm area unlocked, assuming the caller
183 ensures there isn't another iounmap for the same address
184 in parallel. Reuse of the virtual address is prevented by
185 leaving it in the global lists until we're done with it.
186 cpa takes care of the direct mappings. */
187 read_lock(&vmlist_lock);
188 for (p = vmlist; p; p = p->next) {
189 if (p->addr == addr)
190 break;
191 }
192 read_unlock(&vmlist_lock);
193
194 if (!p) {
195 printk("iounmap: bad address %p\n", addr);
196 dump_stack();
197 return;
198 }
199
200 /* Reset the direct mapping. Can block */
201 if (p->flags >> 20)
202 ioremap_change_attr(p->phys_addr, p->size, 0);
203
204 /* Finally remove it */
205 o = remove_vm_area((void *)addr);
206 BUG_ON(p != o || o == NULL);
207 kfree(p);
208}
209EXPORT_SYMBOL(iounmap);
210
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
new file mode 100644
index 000000000000..a96006f7ae0c
--- /dev/null
+++ b/arch/x86/mm/k8topology_64.c
@@ -0,0 +1,182 @@
1/*
2 * AMD K8 NUMA support.
3 * Discover the memory map and associated nodes.
4 *
5 * This version reads it directly from the K8 northbridge.
6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/string.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14#include <asm/io.h>
15#include <linux/pci_ids.h>
16#include <asm/types.h>
17#include <asm/mmzone.h>
18#include <asm/proto.h>
19#include <asm/e820.h>
20#include <asm/pci-direct.h>
21#include <asm/numa.h>
22
23static __init int find_northbridge(void)
24{
25 int num;
26
27 for (num = 0; num < 32; num++) {
28 u32 header;
29
30 header = read_pci_config(0, num, 0, 0x00);
31 if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
32 continue;
33
34 header = read_pci_config(0, num, 1, 0x00);
35 if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
36 continue;
37 return num;
38 }
39
40 return -1;
41}
42
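/*
 * Worked example (illustrative numbers): the DRAM base/limit registers
 * read at 0x40/0x44 below carry address bits [39:24] in register bits
 * [31:16].  A base register of 0x00000003 (enabled, base 0) together
 * with a limit register of 0x003f0000 decodes, after the shifts in
 * k8_scan_nodes(), to the range 0 - 0x40000000 (the first 1 GB) owned
 * by node (limit & 7) = 0.
 */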
43int __init k8_scan_nodes(unsigned long start, unsigned long end)
44{
45 unsigned long prevbase;
46 struct bootnode nodes[8];
47 int nodeid, i, j, nb;
48 unsigned char nodeids[8];
49 int found = 0;
50 u32 reg;
51 unsigned numnodes;
52 unsigned num_cores;
53
54 if (!early_pci_allowed())
55 return -1;
56
57 nb = find_northbridge();
58 if (nb < 0)
59 return nb;
60
61 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
62
63 num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
64 printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
65
66 reg = read_pci_config(0, nb, 0, 0x60);
67 numnodes = ((reg >> 4) & 0xF) + 1;
68 if (numnodes <= 1)
69 return -1;
70
71 printk(KERN_INFO "Number of nodes %d\n", numnodes);
72
73 memset(&nodes,0,sizeof(nodes));
74 prevbase = 0;
75 for (i = 0; i < 8; i++) {
76 unsigned long base,limit;
77 u32 nodeid;
78
79 base = read_pci_config(0, nb, 1, 0x40 + i*8);
80 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
81
82 nodeid = limit & 7;
83 nodeids[i] = nodeid;
84 if ((base & 3) == 0) {
85 if (i < numnodes)
86 printk("Skipping disabled node %d\n", i);
87 continue;
88 }
89 if (nodeid >= numnodes) {
90 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
91 base, limit);
92 continue;
93 }
94
95 if (!limit) {
96 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
97 base);
98 continue;
99 }
100 if ((base >> 8) & 3 || (limit >> 8) & 3) {
101 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
102 nodeid, (base>>8)&3, (limit>>8) & 3);
103 return -1;
104 }
105 if (node_isset(nodeid, node_possible_map)) {
106 printk(KERN_INFO "Node %d already present. Skipping\n",
107 nodeid);
108 continue;
109 }
110
111 limit >>= 16;
112 limit <<= 24;
113 limit |= (1<<24)-1;
114 limit++;
115
116 if (limit > end_pfn << PAGE_SHIFT)
117 limit = end_pfn << PAGE_SHIFT;
118 if (limit <= base)
119 continue;
120
121 base >>= 16;
122 base <<= 24;
123
124 if (base < start)
125 base = start;
126 if (limit > end)
127 limit = end;
128 if (limit == base) {
129 printk(KERN_ERR "Empty node %d\n", nodeid);
130 continue;
131 }
132 if (limit < base) {
133 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
134 nodeid, base, limit);
135 continue;
136 }
137
138		/* Could sort here, but punt for now. Should not happen anyway. */
139 if (prevbase > base) {
140 printk(KERN_ERR "Node map not sorted %lx,%lx\n",
141 prevbase,base);
142 return -1;
143 }
144
145 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
146 nodeid, base, limit);
147
148 found++;
149
150 nodes[nodeid].start = base;
151 nodes[nodeid].end = limit;
152 e820_register_active_regions(nodeid,
153 nodes[nodeid].start >> PAGE_SHIFT,
154 nodes[nodeid].end >> PAGE_SHIFT);
155
156 prevbase = base;
157
158 node_set(nodeid, node_possible_map);
159 }
160
161 if (!found)
162 return -1;
163
164 memnode_shift = compute_hash_shift(nodes, 8);
165 if (memnode_shift < 0) {
166 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
167 return -1;
168 }
169 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
170
171 for (i = 0; i < 8; i++) {
172 if (nodes[i].start != nodes[i].end) {
173 nodeid = nodeids[i];
174 for (j = 0; j < num_cores; j++)
175 apicid_to_node[(nodeid * num_cores) + j] = i;
176 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
177 }
178 }
179
180 numa_init_array();
181 return 0;
182}
diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap_32.c
new file mode 100644
index 000000000000..552e08473755
--- /dev/null
+++ b/arch/x86/mm/mmap_32.c
@@ -0,0 +1,77 @@
1/*
2 * linux/arch/i386/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
7 * All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 *
24 * Started by Ingo Molnar <mingo@elte.hu>
25 */
26
27#include <linux/personality.h>
28#include <linux/mm.h>
29#include <linux/random.h>
30#include <linux/sched.h>
31
32/*
33 * Top of mmap area (just below the process stack).
34 *
35 * Leave at least a ~128 MB hole.
36 */
37#define MIN_GAP (128*1024*1024)
38#define MAX_GAP (TASK_SIZE/6*5)
39
40static inline unsigned long mmap_base(struct mm_struct *mm)
41{
42 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
43 unsigned long random_factor = 0;
44
45 if (current->flags & PF_RANDOMIZE)
46 random_factor = get_random_int() % (1024*1024);
47
48 if (gap < MIN_GAP)
49 gap = MIN_GAP;
50 else if (gap > MAX_GAP)
51 gap = MAX_GAP;
52
53 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
54}
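/*
 * Worked example (illustrative): with the default 3 GB TASK_SIZE and an
 * 8 MB stack rlimit, the gap is clamped up to MIN_GAP (128 MB), so the
 * mmap base lands just below 0xb8000000, lowered by up to ~1 MB more
 * when PF_RANDOMIZE is set.
 */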
55
56/*
57 * This function, called very early during the creation of a new
58 * process VM image, sets up which VM layout function to use:
59 */
60void arch_pick_mmap_layout(struct mm_struct *mm)
61{
62 /*
63 * Fall back to the standard layout if the personality
64 * bit is set, or if the expected stack growth is unlimited:
65 */
66 if (sysctl_legacy_va_layout ||
67 (current->personality & ADDR_COMPAT_LAYOUT) ||
68 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
69 mm->mmap_base = TASK_UNMAPPED_BASE;
70 mm->get_unmapped_area = arch_get_unmapped_area;
71 mm->unmap_area = arch_unmap_area;
72 } else {
73 mm->mmap_base = mmap_base(mm);
74 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
75 mm->unmap_area = arch_unmap_area_topdown;
76 }
77}
diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c
new file mode 100644
index 000000000000..80bba0dc000e
--- /dev/null
+++ b/arch/x86/mm/mmap_64.c
@@ -0,0 +1,29 @@
1/* Copyright 2005 Andi Kleen, SuSE Labs.
2 * Licensed under GPL, v.2
3 */
4#include <linux/mm.h>
5#include <linux/sched.h>
6#include <linux/random.h>
7#include <asm/ia32.h>
8
9/* Notebook: move the mmap code from sys_x86_64.c over here. */
10
11void arch_pick_mmap_layout(struct mm_struct *mm)
12{
13#ifdef CONFIG_IA32_EMULATION
14 if (current_thread_info()->flags & _TIF_IA32)
15 return ia32_pick_mmap_layout(mm);
16#endif
17 mm->mmap_base = TASK_UNMAPPED_BASE;
18 if (current->flags & PF_RANDOMIZE) {
19		/* Add 28 bits of randomness, which covers about 40 bits of
20		   address space because the mmap base has to be page aligned,
21		   or roughly 1/128 of the total user VM
22		   (the total user address space is 47 bits). */
23 unsigned rnd = get_random_int() & 0xfffffff;
24 mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
25 }
26 mm->get_unmapped_area = arch_get_unmapped_area;
27 mm->unmap_area = arch_unmap_area;
28}
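/*
 * Worked example (illustrative): rnd is at most 2^28 - 1 and is shifted
 * by PAGE_SHIFT (12), so the base moves by up to about 2^40 bytes
 * (~1 TB), i.e. roughly 1/128 of the 47-bit (128 TB) user address space,
 * as the comment above says.
 */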
29
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
new file mode 100644
index 000000000000..6da235522269
--- /dev/null
+++ b/arch/x86/mm/numa_64.c
@@ -0,0 +1,648 @@
1/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
25struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
26bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28struct memnode memnode;
29
30unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE
32};
33unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
35};
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
37
38int numa_off __initdata;
39unsigned long __initdata nodemap_addr;
40unsigned long __initdata nodemap_size;
41
42
43/*
44 * Given a shift value, try to populate memnodemap[]
45 * Returns:
46 * 1 if OK
47 * 0 if memnodemap[] too small (or shift too small)
48 * -1 if node overlap or lost RAM (shift too big)
49 */
50static int __init
51populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
52{
53 int i;
54 int res = -1;
55 unsigned long addr, end;
56
57 memset(memnodemap, 0xff, memnodemapsize);
58 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start;
60 end = nodes[i].end;
61 if (addr >= end)
62 continue;
63 if ((end >> shift) >= memnodemapsize)
64 return 0;
65 do {
66 if (memnodemap[addr >> shift] != 0xff)
67 return -1;
68 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift);
70 } while (addr < end);
71 res = 1;
72 }
73 return res;
74}
75
76static int __init allocate_cachealigned_memnodemap(void)
77{
78 unsigned long pad, pad_addr;
79
80 memnodemap = memnode.embedded_map;
81 if (memnodemapsize <= 48)
82 return 0;
83
84 pad = L1_CACHE_BYTES - 1;
85 pad_addr = 0x8000;
86 nodemap_size = pad + memnodemapsize;
87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
88 nodemap_size);
89 if (nodemap_addr == -1UL) {
90 printk(KERN_ERR
91 "NUMA: Unable to allocate Memory to Node hash map\n");
92 nodemap_addr = nodemap_size = 0;
93 return -1;
94 }
95 pad_addr = (nodemap_addr + pad) & ~pad;
96 memnodemap = phys_to_virt(pad_addr);
97
98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
99 nodemap_addr, nodemap_addr + nodemap_size);
100 return 0;
101}
102
103/*
104 * The LSB of all start and end addresses in the node map is the value of the
105 * maximum possible shift.
106 */
107static int __init
108extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
109{
110 int i, nodes_used = 0;
111 unsigned long start, end;
112 unsigned long bitfield = 0, memtop = 0;
113
114 for (i = 0; i < numnodes; i++) {
115 start = nodes[i].start;
116 end = nodes[i].end;
117 if (start >= end)
118 continue;
119 bitfield |= start;
120 nodes_used++;
121 if (end > memtop)
122 memtop = end;
123 }
124 if (nodes_used <= 1)
125 i = 63;
126 else
127 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
128 memnodemapsize = (memtop >> i)+1;
129 return i;
130}
131
132int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
133{
134 int shift;
135
136 shift = extract_lsb_from_nodes(nodes, numnodes);
137 if (allocate_cachealigned_memnodemap())
138 return -1;
139 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
140 shift);
141
142 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
143 printk(KERN_INFO
144 "Your memory is not aligned you need to rebuild your kernel "
145 "with a bigger NODEMAPSIZE shift=%d\n",
146 shift);
147 return -1;
148 }
149 return shift;
150}
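/*
 * Worked example (illustrative): once the shift and memnodemap[] are set
 * up, phys_to_nid() is just memnodemap[addr >> memnode_shift].  With two
 * nodes covering 0-2 GB and 2-4 GB the derived shift is 31, so an
 * address such as 0x80000000 indexes slot 1 and resolves to node 1.
 */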
151
152#ifdef CONFIG_SPARSEMEM
153int early_pfn_to_nid(unsigned long pfn)
154{
155 return phys_to_nid(pfn << PAGE_SHIFT);
156}
157#endif
158
159static void * __init
160early_node_mem(int nodeid, unsigned long start, unsigned long end,
161 unsigned long size)
162{
163 unsigned long mem = find_e820_area(start, end, size);
164 void *ptr;
165 if (mem != -1L)
166 return __va(mem);
167 ptr = __alloc_bootmem_nopanic(size,
168 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
169 if (ptr == 0) {
170 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
171 size, nodeid);
172 return NULL;
173 }
174 return ptr;
175}
176
177/* Initialize bootmem allocator for a node */
178void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
179{
180 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
181 unsigned long nodedata_phys;
182 void *bootmap;
183 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
184
185 start = round_up(start, ZONE_ALIGN);
186
187 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
188
189 start_pfn = start >> PAGE_SHIFT;
190 end_pfn = end >> PAGE_SHIFT;
191
192 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
193 if (node_data[nodeid] == NULL)
194 return;
195 nodedata_phys = __pa(node_data[nodeid]);
196
197 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
198 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
199 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
200 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
201
202 /* Find a place for the bootmem map */
203 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
204 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
205 bootmap = early_node_mem(nodeid, bootmap_start, end,
206 bootmap_pages<<PAGE_SHIFT);
207 if (bootmap == NULL) {
208 if (nodedata_phys < start || nodedata_phys >= end)
209 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
210 node_data[nodeid] = NULL;
211 return;
212 }
213 bootmap_start = __pa(bootmap);
214 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
215
216 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
217 bootmap_start >> PAGE_SHIFT,
218 start_pfn, end_pfn);
219
220 free_bootmem_with_active_regions(nodeid, end);
221
222 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
223 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
224#ifdef CONFIG_ACPI_NUMA
225 srat_reserve_add_area(nodeid);
226#endif
227 node_set_online(nodeid);
228}
229
230/* Initialize final allocator for a zone */
231void __init setup_node_zones(int nodeid)
232{
233 unsigned long start_pfn, end_pfn, memmapsize, limit;
234
235 start_pfn = node_start_pfn(nodeid);
236 end_pfn = node_end_pfn(nodeid);
237
238 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
239 nodeid, start_pfn, end_pfn);
240
241 /* Try to allocate mem_map at end to not fill up precious <4GB
242 memory. */
243 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
244 limit = end_pfn << PAGE_SHIFT;
245#ifdef CONFIG_FLAT_NODE_MEM_MAP
246 NODE_DATA(nodeid)->node_mem_map =
247 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
248 memmapsize, SMP_CACHE_BYTES,
249 round_down(limit - memmapsize, PAGE_SIZE),
250 limit);
251#endif
252}
253
254void __init numa_init_array(void)
255{
256 int rr, i;
257 /* There are unfortunately some poorly designed mainboards around
258 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
259	   mapping. To avoid this, fill in the mapping for all possible
260	   CPUs, as the number of CPUs is not known yet.
261	   We round-robin the existing nodes. */
262 rr = first_node(node_online_map);
263 for (i = 0; i < NR_CPUS; i++) {
264 if (cpu_to_node[i] != NUMA_NO_NODE)
265 continue;
266 numa_set_node(i, rr);
267 rr = next_node(rr, node_online_map);
268 if (rr == MAX_NUMNODES)
269 rr = first_node(node_online_map);
270 }
271
272}
273
274#ifdef CONFIG_NUMA_EMU
275/* Numa emulation */
276char *cmdline __initdata;
277
278/*
279 * Sets up nid to range from addr to addr + size. If the end boundary is
280 * greater than max_addr, then max_addr is used instead. The return value is 0
281 * if there is additional memory left for allocation past addr and -1 otherwise.
282 * addr is adjusted to be at the end of the node.
283 */
284static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
285 u64 size, u64 max_addr)
286{
287 int ret = 0;
288 nodes[nid].start = *addr;
289 *addr += size;
290 if (*addr >= max_addr) {
291 *addr = max_addr;
292 ret = -1;
293 }
294 nodes[nid].end = *addr;
295 node_set(nid, node_possible_map);
296 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
297 nodes[nid].start, nodes[nid].end,
298 (nodes[nid].end - nodes[nid].start) >> 20);
299 return ret;
300}
301
302/*
303 * Splits num_nodes nodes up equally starting at node_start. The return value
304 * is the number of nodes split up and addr is adjusted to be at the end of the
305 * last node allocated.
306 */
307static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
308 u64 max_addr, int node_start,
309 int num_nodes)
310{
311 unsigned int big;
312 u64 size;
313 int i;
314
315 if (num_nodes <= 0)
316 return -1;
317 if (num_nodes > MAX_NUMNODES)
318 num_nodes = MAX_NUMNODES;
319 size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
320 num_nodes;
321 /*
322 * Calculate the number of big nodes that can be allocated as a result
323 * of consolidating the leftovers.
324 */
325 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
326 FAKE_NODE_MIN_SIZE;
327
328 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
329 size &= FAKE_NODE_MIN_HASH_MASK;
330 if (!size) {
331 printk(KERN_ERR "Not enough memory for each node. "
332 "NUMA emulation disabled.\n");
333 return -1;
334 }
335
336 for (i = node_start; i < num_nodes + node_start; i++) {
337 u64 end = *addr + size;
338 if (i < big)
339 end += FAKE_NODE_MIN_SIZE;
340 /*
341 * The final node can have the remaining system RAM. Other
342 * nodes receive roughly the same amount of available pages.
343 */
344 if (i == num_nodes + node_start - 1)
345 end = max_addr;
346 else
347 while (end - *addr - e820_hole_size(*addr, end) <
348 size) {
349 end += FAKE_NODE_MIN_SIZE;
350 if (end > max_addr) {
351 end = max_addr;
352 break;
353 }
354 }
355 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
356 break;
357 }
358 return i - node_start + 1;
359}
360
361/*
362 * Splits the remaining system RAM into chunks of size. The remaining memory is
363 * always assigned to a final node and can be asymmetric. Returns the number of
364 * nodes split.
365 */
366static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
367 u64 max_addr, int node_start, u64 size)
368{
369 int i = node_start;
370 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
371 while (!setup_node_range(i++, nodes, addr, size, max_addr))
372 ;
373 return i - node_start;
374}
375
376/*
377 * Sets up the system RAM area from start_pfn to end_pfn according to the
378 * numa=fake command-line option.
379 */
380static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
381{
382 struct bootnode nodes[MAX_NUMNODES];
383 u64 addr = start_pfn << PAGE_SHIFT;
384 u64 max_addr = end_pfn << PAGE_SHIFT;
385 int num_nodes = 0;
386 int coeff_flag;
387 int coeff = -1;
388 int num = 0;
389 u64 size;
390 int i;
391
392 memset(&nodes, 0, sizeof(nodes));
393 /*
394 * If the numa=fake command-line is just a single number N, split the
395 * system RAM into N fake nodes.
396 */
397 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
398 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
399 simple_strtol(cmdline, NULL, 0));
400 if (num_nodes < 0)
401 return num_nodes;
402 goto out;
403 }
404
405 /* Parse the command line. */
406 for (coeff_flag = 0; ; cmdline++) {
407 if (*cmdline && isdigit(*cmdline)) {
408 num = num * 10 + *cmdline - '0';
409 continue;
410 }
411 if (*cmdline == '*') {
412 if (num > 0)
413 coeff = num;
414 coeff_flag = 1;
415 }
416 if (!*cmdline || *cmdline == ',') {
417 if (!coeff_flag)
418 coeff = 1;
419 /*
420 * Round down to the nearest FAKE_NODE_MIN_SIZE.
421 * Command-line coefficients are in megabytes.
422 */
423 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
424 if (size)
425 for (i = 0; i < coeff; i++, num_nodes++)
426 if (setup_node_range(num_nodes, nodes,
427 &addr, size, max_addr) < 0)
428 goto done;
429 if (!*cmdline)
430 break;
431 coeff_flag = 0;
432 coeff = -1;
433 }
434 num = 0;
435 }
436done:
437 if (!num_nodes)
438 return -1;
439 /* Fill remainder of system RAM, if appropriate. */
440 if (addr < max_addr) {
441 if (coeff_flag && coeff < 0) {
442 /* Split remaining nodes into num-sized chunks */
443 num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
444 num_nodes, num);
445 goto out;
446 }
447 switch (*(cmdline - 1)) {
448 case '*':
449 /* Split remaining nodes into coeff chunks */
450 if (coeff <= 0)
451 break;
452 num_nodes += split_nodes_equally(nodes, &addr, max_addr,
453 num_nodes, coeff);
454 break;
455 case ',':
456 /* Do not allocate remaining system RAM */
457 break;
458 default:
459 /* Give one final node */
460 setup_node_range(num_nodes, nodes, &addr,
461 max_addr - addr, max_addr);
462 num_nodes++;
463 }
464 }
465out:
466 memnode_shift = compute_hash_shift(nodes, num_nodes);
467 if (memnode_shift < 0) {
468 memnode_shift = 0;
469 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
470 "disabled.\n");
471 return -1;
472 }
473
474 /*
475 * We need to vacate all active ranges that may have been registered by
476 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
477 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
478 */
479 remove_all_active_ranges();
480#ifdef CONFIG_ACPI_NUMA
481 acpi_numa = -1;
482#endif
483 for_each_node_mask(i, node_possible_map) {
484 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
485 nodes[i].end >> PAGE_SHIFT);
486 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
487 }
488 acpi_fake_nodes(nodes, num_nodes);
489 numa_init_array();
490 return 0;
491}
492#endif /* CONFIG_NUMA_EMU */
493
494void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
495{
496 int i;
497
498 nodes_clear(node_possible_map);
499
500#ifdef CONFIG_NUMA_EMU
501 if (cmdline && !numa_emulation(start_pfn, end_pfn))
502 return;
503 nodes_clear(node_possible_map);
504#endif
505
506#ifdef CONFIG_ACPI_NUMA
507 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
508 end_pfn << PAGE_SHIFT))
509 return;
510 nodes_clear(node_possible_map);
511#endif
512
513#ifdef CONFIG_K8_NUMA
514 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
515 return;
516 nodes_clear(node_possible_map);
517#endif
518 printk(KERN_INFO "%s\n",
519 numa_off ? "NUMA turned off" : "No NUMA configuration found");
520
521 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
522 start_pfn << PAGE_SHIFT,
523 end_pfn << PAGE_SHIFT);
524 /* setup dummy node covering all memory */
525 memnode_shift = 63;
526 memnodemap = memnode.embedded_map;
527 memnodemap[0] = 0;
528 nodes_clear(node_online_map);
529 node_set_online(0);
530 node_set(0, node_possible_map);
531 for (i = 0; i < NR_CPUS; i++)
532 numa_set_node(i, 0);
533 node_to_cpumask[0] = cpumask_of_cpu(0);
534 e820_register_active_regions(0, start_pfn, end_pfn);
535 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
536}
537
538__cpuinit void numa_add_cpu(int cpu)
539{
540 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
541}
542
543void __cpuinit numa_set_node(int cpu, int node)
544{
545 cpu_pda(cpu)->nodenumber = node;
546 cpu_to_node[cpu] = node;
547}
548
549unsigned long __init numa_free_all_bootmem(void)
550{
551 int i;
552 unsigned long pages = 0;
553 for_each_online_node(i) {
554 pages += free_all_bootmem_node(NODE_DATA(i));
555 }
556 return pages;
557}
558
559void __init paging_init(void)
560{
561 int i;
562 unsigned long max_zone_pfns[MAX_NR_ZONES];
563 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
564 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
565 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
566 max_zone_pfns[ZONE_NORMAL] = end_pfn;
567
568 sparse_memory_present_with_active_regions(MAX_NUMNODES);
569 sparse_init();
570
571 for_each_online_node(i) {
572 setup_node_zones(i);
573 }
574
575 free_area_init_nodes(max_zone_pfns);
576}
577
578static __init int numa_setup(char *opt)
579{
580 if (!opt)
581 return -EINVAL;
582 if (!strncmp(opt,"off",3))
583 numa_off = 1;
584#ifdef CONFIG_NUMA_EMU
585 if (!strncmp(opt, "fake=", 5))
586 cmdline = opt + 5;
587#endif
588#ifdef CONFIG_ACPI_NUMA
589 if (!strncmp(opt,"noacpi",6))
590 acpi_numa = -1;
591 if (!strncmp(opt,"hotadd=", 7))
592 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
593#endif
594 return 0;
595}
596
597early_param("numa", numa_setup);
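/*
 * Boot-option summary derived from numa_setup() and numa_emulation()
 * above (example command lines, not an exhaustive list):
 *
 *   numa=off              disable NUMA detection
 *   numa=fake=4           split RAM into four equally sized emulated nodes
 *   numa=fake=2*512,1024  two 512 MB nodes, one 1024 MB node, with any
 *                         remaining RAM placed in a final node
 *   numa=noacpi           ignore the ACPI SRAT table
 *   numa=hotadd=10        set the SRAT hotadd percentage to 10
 */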
598
599/*
600 * Set up early cpu_to_node.
601 *
602 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[]
603 * and apicid_to_node[] tables have valid entries for a CPU.
604 * This means we skip cpu_to_node[] initialisation for the NUMA
605 * emulation and fake-node cases (when running a kernel compiled
606 * for NUMA on a non-NUMA box), which is OK because cpu_to_node[]
607 * was already initialized in a round-robin manner by numa_init_array(),
608 * prior to this call, and that initialization is good enough
609 * for the fake NUMA cases.
610 */
611void __init init_cpu_to_node(void)
612{
613 int i;
614 for (i = 0; i < NR_CPUS; i++) {
615 u8 apicid = x86_cpu_to_apicid[i];
616 if (apicid == BAD_APICID)
617 continue;
618 if (apicid_to_node[apicid] == NUMA_NO_NODE)
619 continue;
620 numa_set_node(i,apicid_to_node[apicid]);
621 }
622}
623
624EXPORT_SYMBOL(cpu_to_node);
625EXPORT_SYMBOL(node_to_cpumask);
626EXPORT_SYMBOL(memnode);
627EXPORT_SYMBOL(node_data);
628
629#ifdef CONFIG_DISCONTIGMEM
630/*
631 * Functions to convert PFNs from/to per node page addresses.
632 * These are out of line because they are quite big.
633 * They could be all tuned by pre caching more state.
634 * Should do that.
635 */
636
637int pfn_valid(unsigned long pfn)
638{
639 unsigned nid;
640 if (pfn >= num_physpages)
641 return 0;
642 nid = pfn_to_nid(pfn);
643 if (nid == 0xff)
644 return 0;
645 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
646}
647EXPORT_SYMBOL(pfn_valid);
648#endif
diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c
new file mode 100644
index 000000000000..4241a74d16c8
--- /dev/null
+++ b/arch/x86/mm/pageattr_32.c
@@ -0,0 +1,278 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/highmem.h>
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <asm/uaccess.h>
12#include <asm/processor.h>
13#include <asm/tlbflush.h>
14#include <asm/pgalloc.h>
15#include <asm/sections.h>
16
17static DEFINE_SPINLOCK(cpa_lock);
18static struct list_head df_list = LIST_HEAD_INIT(df_list);
19
20
21pte_t *lookup_address(unsigned long address)
22{
23 pgd_t *pgd = pgd_offset_k(address);
24 pud_t *pud;
25 pmd_t *pmd;
26 if (pgd_none(*pgd))
27 return NULL;
28 pud = pud_offset(pgd, address);
29 if (pud_none(*pud))
30 return NULL;
31 pmd = pmd_offset(pud, address);
32 if (pmd_none(*pmd))
33 return NULL;
34 if (pmd_large(*pmd))
35 return (pte_t *)pmd;
36 return pte_offset_kernel(pmd, address);
37}
38
39static struct page *split_large_page(unsigned long address, pgprot_t prot,
40 pgprot_t ref_prot)
41{
42 int i;
43 unsigned long addr;
44 struct page *base;
45 pte_t *pbase;
46
47 spin_unlock_irq(&cpa_lock);
48 base = alloc_pages(GFP_KERNEL, 0);
49 spin_lock_irq(&cpa_lock);
50 if (!base)
51 return NULL;
52
53 /*
54 * page_private is used to track the number of entries in
55	 * the page table page that have non-standard attributes.
56 */
57 SetPagePrivate(base);
58 page_private(base) = 0;
59
60 address = __pa(address);
61 addr = address & LARGE_PAGE_MASK;
62 pbase = (pte_t *)page_address(base);
63 paravirt_alloc_pt(&init_mm, page_to_pfn(base));
64 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
65 set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
66 addr == address ? prot : ref_prot));
67 }
68 return base;
69}
70
71static void cache_flush_page(struct page *p)
72{
73 unsigned long adr = (unsigned long)page_address(p);
74 int i;
75 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
76 asm volatile("clflush (%0)" :: "r" (adr + i));
77}
78
79static void flush_kernel_map(void *arg)
80{
81 struct list_head *lh = (struct list_head *)arg;
82 struct page *p;
83
84 /* High level code is not ready for clflush yet */
85 if (0 && cpu_has_clflush) {
86 list_for_each_entry (p, lh, lru)
87 cache_flush_page(p);
88 } else if (boot_cpu_data.x86_model >= 4)
89 wbinvd();
90
91	/* Flush all to work around errata in early Athlons regarding
92 * large page flushing.
93 */
94 __flush_tlb_all();
95}
96
97static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
98{
99 struct page *page;
100 unsigned long flags;
101
102 set_pte_atomic(kpte, pte); /* change init_mm */
103 if (SHARED_KERNEL_PMD)
104 return;
105
106 spin_lock_irqsave(&pgd_lock, flags);
107 for (page = pgd_list; page; page = (struct page *)page->index) {
108 pgd_t *pgd;
109 pud_t *pud;
110 pmd_t *pmd;
111 pgd = (pgd_t *)page_address(page) + pgd_index(address);
112 pud = pud_offset(pgd, address);
113 pmd = pmd_offset(pud, address);
114 set_pte_atomic((pte_t *)pmd, pte);
115 }
116 spin_unlock_irqrestore(&pgd_lock, flags);
117}
118
119/*
120 * No more special protections in this 2/4MB area - revert to a
121 * large page again.
122 */
123static inline void revert_page(struct page *kpte_page, unsigned long address)
124{
125 pgprot_t ref_prot;
126 pte_t *linear;
127
128 ref_prot =
129 ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
130 ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
131
132 linear = (pte_t *)
133 pmd_offset(pud_offset(pgd_offset_k(address), address), address);
134 set_pmd_pte(linear, address,
135 pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
136 ref_prot));
137}
138
139static inline void save_page(struct page *kpte_page)
140{
141 if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
142 list_add(&kpte_page->lru, &df_list);
143}
144
145static int
146__change_page_attr(struct page *page, pgprot_t prot)
147{
148 pte_t *kpte;
149 unsigned long address;
150 struct page *kpte_page;
151
152 BUG_ON(PageHighMem(page));
153 address = (unsigned long)page_address(page);
154
155 kpte = lookup_address(address);
156 if (!kpte)
157 return -EINVAL;
158 kpte_page = virt_to_page(kpte);
159 BUG_ON(PageLRU(kpte_page));
160 BUG_ON(PageCompound(kpte_page));
161
162 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
163 if (!pte_huge(*kpte)) {
164 set_pte_atomic(kpte, mk_pte(page, prot));
165 } else {
166 pgprot_t ref_prot;
167 struct page *split;
168
169 ref_prot =
170 ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
171 ? PAGE_KERNEL_EXEC : PAGE_KERNEL;
172 split = split_large_page(address, prot, ref_prot);
173 if (!split)
174 return -ENOMEM;
175 set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
176 kpte_page = split;
177 }
178 page_private(kpte_page)++;
179 } else if (!pte_huge(*kpte)) {
180 set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
181 BUG_ON(page_private(kpte_page) == 0);
182 page_private(kpte_page)--;
183 } else
184 BUG();
185
186 /*
187 * If the pte was reserved, it means it was created at boot
188 * time (not via split_large_page) and in turn we must not
189 * replace it with a largepage.
190 */
191
192 save_page(kpte_page);
193 if (!PageReserved(kpte_page)) {
194 if (cpu_has_pse && (page_private(kpte_page) == 0)) {
195 paravirt_release_pt(page_to_pfn(kpte_page));
196 revert_page(kpte_page, address);
197 }
198 }
199 return 0;
200}
201
202static inline void flush_map(struct list_head *l)
203{
204 on_each_cpu(flush_kernel_map, l, 1, 1);
205}
206
207/*
208 * Change the page attributes of a page in the linear mapping.
209 *
210 * This should be used when a page is mapped with a different caching policy
211 * than write-back somewhere - some CPUs do not like it when mappings with
212 * different caching policies exist. This changes the page attributes of the
213 * in kernel linear mapping too.
214 *
215 * The caller needs to ensure that there are no conflicting mappings elsewhere.
216 * This function only deals with the kernel linear map.
217 *
218 * Caller must call global_flush_tlb() after this.
219 */
220int change_page_attr(struct page *page, int numpages, pgprot_t prot)
221{
222 int err = 0;
223 int i;
224 unsigned long flags;
225
226 spin_lock_irqsave(&cpa_lock, flags);
227 for (i = 0; i < numpages; i++, page++) {
228 err = __change_page_attr(page, prot);
229 if (err)
230 break;
231 }
232 spin_unlock_irqrestore(&cpa_lock, flags);
233 return err;
234}
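/*
 * A minimal usage sketch (hypothetical): mark one kernel page uncached
 * and later restore it; ioremap_nocache() in ioremap_32.c follows the
 * same pattern.  "page" stands for any lowmem struct page.
 */
#if 0
	if (change_page_attr(page, 1, PAGE_KERNEL_NOCACHE) == 0)
		global_flush_tlb();	/* required after change_page_attr() */

	/* later, revert to the default write-back mapping */
	change_page_attr(page, 1, PAGE_KERNEL);
	global_flush_tlb();
#endif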
235
236void global_flush_tlb(void)
237{
238 struct list_head l;
239 struct page *pg, *next;
240
241 BUG_ON(irqs_disabled());
242
243 spin_lock_irq(&cpa_lock);
244 list_replace_init(&df_list, &l);
245 spin_unlock_irq(&cpa_lock);
246 flush_map(&l);
247 list_for_each_entry_safe(pg, next, &l, lru) {
248 list_del(&pg->lru);
249 clear_bit(PG_arch_1, &pg->flags);
250 if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
251 continue;
252 ClearPagePrivate(pg);
253 __free_page(pg);
254 }
255}
256
257#ifdef CONFIG_DEBUG_PAGEALLOC
258void kernel_map_pages(struct page *page, int numpages, int enable)
259{
260 if (PageHighMem(page))
261 return;
262 if (!enable)
263 debug_check_no_locks_freed(page_address(page),
264 numpages * PAGE_SIZE);
265
266 /* the return value is ignored - the calls cannot fail,
267 * large pages are disabled at boot time.
268 */
269 change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
270	/* We should perform an IPI and flush all TLBs,
271	 * but that can deadlock, so flush only the current CPU.
272 */
273 __flush_tlb_all();
274}
275#endif
276
277EXPORT_SYMBOL(change_page_attr);
278EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
new file mode 100644
index 000000000000..10b9809ce821
--- /dev/null
+++ b/arch/x86/mm/pageattr_64.c
@@ -0,0 +1,249 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/highmem.h>
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <asm/uaccess.h>
12#include <asm/processor.h>
13#include <asm/tlbflush.h>
14#include <asm/io.h>
15
16pte_t *lookup_address(unsigned long address)
17{
18 pgd_t *pgd = pgd_offset_k(address);
19 pud_t *pud;
20 pmd_t *pmd;
21 pte_t *pte;
22 if (pgd_none(*pgd))
23 return NULL;
24 pud = pud_offset(pgd, address);
25 if (!pud_present(*pud))
26 return NULL;
27 pmd = pmd_offset(pud, address);
28 if (!pmd_present(*pmd))
29 return NULL;
30 if (pmd_large(*pmd))
31 return (pte_t *)pmd;
32 pte = pte_offset_kernel(pmd, address);
33 if (pte && !pte_present(*pte))
34 pte = NULL;
35 return pte;
36}
37
38static struct page *split_large_page(unsigned long address, pgprot_t prot,
39 pgprot_t ref_prot)
40{
41 int i;
42 unsigned long addr;
43 struct page *base = alloc_pages(GFP_KERNEL, 0);
44 pte_t *pbase;
45 if (!base)
46 return NULL;
47 /*
48 * page_private is used to track the number of entries in
49	 * the page table page that have non-standard attributes.
50 */
51 SetPagePrivate(base);
52 page_private(base) = 0;
53
54 address = __pa(address);
55 addr = address & LARGE_PAGE_MASK;
56 pbase = (pte_t *)page_address(base);
57 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
58 pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
59 addr == address ? prot : ref_prot);
60 }
61 return base;
62}
63
64static void cache_flush_page(void *adr)
65{
66 int i;
67 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
68 asm volatile("clflush (%0)" :: "r" (adr + i));
69}
70
71static void flush_kernel_map(void *arg)
72{
73 struct list_head *l = (struct list_head *)arg;
74 struct page *pg;
75
76	/* When clflush is available, always use it because it is
77 much cheaper than WBINVD. */
78 /* clflush is still broken. Disable for now. */
79 if (1 || !cpu_has_clflush)
80 asm volatile("wbinvd" ::: "memory");
81 else list_for_each_entry(pg, l, lru) {
82 void *adr = page_address(pg);
83 cache_flush_page(adr);
84 }
85 __flush_tlb_all();
86}
87
88static inline void flush_map(struct list_head *l)
89{
90 on_each_cpu(flush_kernel_map, l, 1, 1);
91}
92
93static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
94
95static inline void save_page(struct page *fpage)
96{
97 if (!test_and_set_bit(PG_arch_1, &fpage->flags))
98 list_add(&fpage->lru, &deferred_pages);
99}
100
101/*
102 * No more special protections in this 2/4MB area - revert to a
103 * large page again.
104 */
105static void revert_page(unsigned long address, pgprot_t ref_prot)
106{
107 pgd_t *pgd;
108 pud_t *pud;
109 pmd_t *pmd;
110 pte_t large_pte;
111 unsigned long pfn;
112
113 pgd = pgd_offset_k(address);
114 BUG_ON(pgd_none(*pgd));
115 pud = pud_offset(pgd,address);
116 BUG_ON(pud_none(*pud));
117 pmd = pmd_offset(pud, address);
118 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
119 pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
120 large_pte = pfn_pte(pfn, ref_prot);
121 large_pte = pte_mkhuge(large_pte);
122 set_pte((pte_t *)pmd, large_pte);
123}
124
125static int
126__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
127 pgprot_t ref_prot)
128{
129 pte_t *kpte;
130 struct page *kpte_page;
131 pgprot_t ref_prot2;
132
133 kpte = lookup_address(address);
134 if (!kpte) return 0;
135 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
136 BUG_ON(PageLRU(kpte_page));
137 BUG_ON(PageCompound(kpte_page));
138 if (pgprot_val(prot) != pgprot_val(ref_prot)) {
139 if (!pte_huge(*kpte)) {
140 set_pte(kpte, pfn_pte(pfn, prot));
141 } else {
142 /*
143 * split_large_page will take the reference for this
144 * change_page_attr on the split page.
145 */
146 struct page *split;
147 ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
148 split = split_large_page(address, prot, ref_prot2);
149 if (!split)
150 return -ENOMEM;
151 set_pte(kpte, mk_pte(split, ref_prot2));
152 kpte_page = split;
153 }
154 page_private(kpte_page)++;
155 } else if (!pte_huge(*kpte)) {
156 set_pte(kpte, pfn_pte(pfn, ref_prot));
157 BUG_ON(page_private(kpte_page) == 0);
158 page_private(kpte_page)--;
159 } else
160 BUG();
161
162 /* on x86-64 the direct mapping set at boot is not using 4k pages */
163 BUG_ON(PageReserved(kpte_page));
164
165 save_page(kpte_page);
166 if (page_private(kpte_page) == 0)
167 revert_page(address, ref_prot);
168 return 0;
169}
170
171/*
172 * Change the page attributes of a page in the linear mapping.
173 *
174 * This should be used when a page is mapped with a different caching policy
175 * than write-back somewhere - some CPUs do not like it when mappings with
176 * different caching policies exist. This changes the page attributes of the
177 * in kernel linear mapping too.
178 *
179 * The caller needs to ensure that there are no conflicting mappings elsewhere.
180 * This function only deals with the kernel linear map.
181 *
182 * Caller must call global_flush_tlb() after this.
183 */
184int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
185{
186 int err = 0, kernel_map = 0;
187 int i;
188
189 if (address >= __START_KERNEL_map
190 && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
191 address = (unsigned long)__va(__pa(address));
192 kernel_map = 1;
193 }
194
195 down_write(&init_mm.mmap_sem);
196 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
197 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
198
199 if (!kernel_map || pte_present(pfn_pte(0, prot))) {
200 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
201 if (err)
202 break;
203 }
204		/* Handle the kernel mapping too, which aliases part of the
205 * lowmem */
206 if (__pa(address) < KERNEL_TEXT_SIZE) {
207 unsigned long addr2;
208 pgprot_t prot2;
209 addr2 = __START_KERNEL_map + __pa(address);
210 /* Make sure the kernel mappings stay executable */
211 prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
212 err = __change_page_attr(addr2, pfn, prot2,
213 PAGE_KERNEL_EXEC);
214 }
215 }
216 up_write(&init_mm.mmap_sem);
217 return err;
218}
219
220/* Don't call this for MMIO areas that may not have a mem_map entry */
221int change_page_attr(struct page *page, int numpages, pgprot_t prot)
222{
223 unsigned long addr = (unsigned long)page_address(page);
224 return change_page_attr_addr(addr, numpages, prot);
225}
226
227void global_flush_tlb(void)
228{
229 struct page *pg, *next;
230 struct list_head l;
231
232 down_read(&init_mm.mmap_sem);
233 list_replace_init(&deferred_pages, &l);
234 up_read(&init_mm.mmap_sem);
235
236 flush_map(&l);
237
238 list_for_each_entry_safe(pg, next, &l, lru) {
239 list_del(&pg->lru);
240 clear_bit(PG_arch_1, &pg->flags);
241 if (page_private(pg) != 0)
242 continue;
243 ClearPagePrivate(pg);
244 __free_page(pg);
245 }
246}
247
248EXPORT_SYMBOL(change_page_attr);
249EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
new file mode 100644
index 000000000000..01437c46baae
--- /dev/null
+++ b/arch/x86/mm/pgtable_32.c
@@ -0,0 +1,373 @@
1/*
2 * linux/arch/i386/mm/pgtable.c
3 */
4
5#include <linux/sched.h>
6#include <linux/kernel.h>
7#include <linux/errno.h>
8#include <linux/mm.h>
9#include <linux/swap.h>
10#include <linux/smp.h>
11#include <linux/highmem.h>
12#include <linux/slab.h>
13#include <linux/pagemap.h>
14#include <linux/spinlock.h>
15#include <linux/module.h>
16#include <linux/quicklist.h>
17
18#include <asm/system.h>
19#include <asm/pgtable.h>
20#include <asm/pgalloc.h>
21#include <asm/fixmap.h>
22#include <asm/e820.h>
23#include <asm/tlb.h>
24#include <asm/tlbflush.h>
25
26void show_mem(void)
27{
28 int total = 0, reserved = 0;
29 int shared = 0, cached = 0;
30 int highmem = 0;
31 struct page *page;
32 pg_data_t *pgdat;
33 unsigned long i;
34 unsigned long flags;
35
36 printk(KERN_INFO "Mem-info:\n");
37 show_free_areas();
38 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
39 for_each_online_pgdat(pgdat) {
40 pgdat_resize_lock(pgdat, &flags);
41 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
42 page = pgdat_page_nr(pgdat, i);
43 total++;
44 if (PageHighMem(page))
45 highmem++;
46 if (PageReserved(page))
47 reserved++;
48 else if (PageSwapCache(page))
49 cached++;
50 else if (page_count(page))
51 shared += page_count(page) - 1;
52 }
53 pgdat_resize_unlock(pgdat, &flags);
54 }
55 printk(KERN_INFO "%d pages of RAM\n", total);
56 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
57 printk(KERN_INFO "%d reserved pages\n", reserved);
58 printk(KERN_INFO "%d pages shared\n", shared);
59 printk(KERN_INFO "%d pages swap cached\n", cached);
60
61 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
62 printk(KERN_INFO "%lu pages writeback\n",
63 global_page_state(NR_WRITEBACK));
64 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
65 printk(KERN_INFO "%lu pages slab\n",
66 global_page_state(NR_SLAB_RECLAIMABLE) +
67 global_page_state(NR_SLAB_UNRECLAIMABLE));
68 printk(KERN_INFO "%lu pages pagetables\n",
69 global_page_state(NR_PAGETABLE));
70}
71
72/*
73 * Associate a virtual page frame with a given physical page frame
74 * and protection flags for that frame.
75 */
76static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
77{
78 pgd_t *pgd;
79 pud_t *pud;
80 pmd_t *pmd;
81 pte_t *pte;
82
83 pgd = swapper_pg_dir + pgd_index(vaddr);
84 if (pgd_none(*pgd)) {
85 BUG();
86 return;
87 }
88 pud = pud_offset(pgd, vaddr);
89 if (pud_none(*pud)) {
90 BUG();
91 return;
92 }
93 pmd = pmd_offset(pud, vaddr);
94 if (pmd_none(*pmd)) {
95 BUG();
96 return;
97 }
98 pte = pte_offset_kernel(pmd, vaddr);
99 if (pgprot_val(flags))
100 /* <pfn,flags> stored as-is, to permit clearing entries */
101 set_pte(pte, pfn_pte(pfn, flags));
102 else
103 pte_clear(&init_mm, vaddr, pte);
104
105 /*
106 * It's enough to flush this one mapping.
107 * (PGE mappings get flushed as well)
108 */
109 __flush_tlb_one(vaddr);
110}
111
112/*
113 * Associate a large virtual page frame with a given physical page frame
114 * and protection flags for that frame. pfn is for the base of the page,
115 * vaddr is what the page gets mapped to - both must be properly aligned.
116 * The pmd must already be instantiated. Assumes PAE mode.
117 */
118void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
119{
120 pgd_t *pgd;
121 pud_t *pud;
122 pmd_t *pmd;
123
124 if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
125 printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
126 return; /* BUG(); */
127 }
128 if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
129 printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
130 return; /* BUG(); */
131 }
132 pgd = swapper_pg_dir + pgd_index(vaddr);
133 if (pgd_none(*pgd)) {
134 printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
135 return; /* BUG(); */
136 }
137 pud = pud_offset(pgd, vaddr);
138 pmd = pmd_offset(pud, vaddr);
139 set_pmd(pmd, pfn_pmd(pfn, flags));
140 /*
141 * It's enough to flush this one mapping.
142 * (PGE mappings get flushed as well)
143 */
144 __flush_tlb_one(vaddr);
145}
146
147static int fixmaps;
148unsigned long __FIXADDR_TOP = 0xfffff000;
149EXPORT_SYMBOL(__FIXADDR_TOP);
150
151void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
152{
153 unsigned long address = __fix_to_virt(idx);
154
155 if (idx >= __end_of_fixed_addresses) {
156 BUG();
157 return;
158 }
159 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
160 fixmaps++;
161}
162
163/**
164 * reserve_top_address - reserves a hole in the top of kernel address space
165 * @reserve: size of hole to reserve
166 *
167 * Can be used to relocate the fixmap area and poke a hole in the top
168 * of kernel address space to make room for a hypervisor.
169 */
170void reserve_top_address(unsigned long reserve)
171{
172 BUG_ON(fixmaps > 0);
173 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
174 (int)-reserve);
175 __FIXADDR_TOP = -reserve - PAGE_SIZE;
176 __VMALLOC_RESERVE += reserve;
177}
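/*
 * Illustrative call (made-up size): a paravirtualized guest's early setup
 * code could invoke reserve_top_address(64 * 1024 * 1024) to keep the top
 * 64 MB of virtual address space free for its hypervisor.
 */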
178
179pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
180{
181 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
182}
183
184struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
185{
186 struct page *pte;
187
188#ifdef CONFIG_HIGHPTE
189 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
190#else
191 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
192#endif
193 return pte;
194}
195
196void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
197{
198 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
199}
200
201/*
202 * List of all pgd's needed for non-PAE so it can invalidate entries
203 * in both cached and uncached pgd's; not needed for PAE since the
204 * kernel pmd is shared. If PAE were not to share the pmd a similar
205 * tactic would be needed. This is essentially codepath-based locking
206 * against pageattr.c; it is the unique case in which a valid change
207 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
208 * vmalloc faults work because attached pagetables are never freed.
209 * -- wli
210 */
211DEFINE_SPINLOCK(pgd_lock);
212struct page *pgd_list;
213
214static inline void pgd_list_add(pgd_t *pgd)
215{
216 struct page *page = virt_to_page(pgd);
217 page->index = (unsigned long)pgd_list;
218 if (pgd_list)
219 set_page_private(pgd_list, (unsigned long)&page->index);
220 pgd_list = page;
221 set_page_private(page, (unsigned long)&pgd_list);
222}
223
224static inline void pgd_list_del(pgd_t *pgd)
225{
226 struct page *next, **pprev, *page = virt_to_page(pgd);
227 next = (struct page *)page->index;
228 pprev = (struct page **)page_private(page);
229 *pprev = next;
230 if (next)
231 set_page_private(next, (unsigned long)pprev);
232}
233
234
235
236#if (PTRS_PER_PMD == 1)
237/* Non-PAE pgd constructor */
238static void pgd_ctor(void *pgd)
239{
240 unsigned long flags;
241
242 /* !PAE, no pagetable sharing */
243 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
244
245 spin_lock_irqsave(&pgd_lock, flags);
246
247 /* must happen under lock */
248 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
249 swapper_pg_dir + USER_PTRS_PER_PGD,
250 KERNEL_PGD_PTRS);
251 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
252 __pa(swapper_pg_dir) >> PAGE_SHIFT,
253 USER_PTRS_PER_PGD,
254 KERNEL_PGD_PTRS);
255 pgd_list_add(pgd);
256 spin_unlock_irqrestore(&pgd_lock, flags);
257}
258#else /* PTRS_PER_PMD > 1 */
259/* PAE pgd constructor */
260static void pgd_ctor(void *pgd)
261{
262 /* PAE, kernel PMD may be shared */
263
264 if (SHARED_KERNEL_PMD) {
265 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
266 swapper_pg_dir + USER_PTRS_PER_PGD,
267 KERNEL_PGD_PTRS);
268 } else {
269 unsigned long flags;
270
271 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
272 spin_lock_irqsave(&pgd_lock, flags);
273 pgd_list_add(pgd);
274 spin_unlock_irqrestore(&pgd_lock, flags);
275 }
276}
277#endif /* PTRS_PER_PMD */
278
279static void pgd_dtor(void *pgd)
280{
281 unsigned long flags; /* can be called from interrupt context */
282
283 if (SHARED_KERNEL_PMD)
284 return;
285
286 paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
287 spin_lock_irqsave(&pgd_lock, flags);
288 pgd_list_del(pgd);
289 spin_unlock_irqrestore(&pgd_lock, flags);
290}
291
292#define UNSHARED_PTRS_PER_PGD \
293 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
294
295/* If we allocate a pmd for part of the kernel address space, then
296 make sure it's initialized with the appropriate kernel mappings.
297 Otherwise use a cached zeroed pmd. */
298static pmd_t *pmd_cache_alloc(int idx)
299{
300 pmd_t *pmd;
301
302 if (idx >= USER_PTRS_PER_PGD) {
303 pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
304
305 if (pmd)
306 memcpy(pmd,
307 (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
308 sizeof(pmd_t) * PTRS_PER_PMD);
309 } else
310 pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
311
312 return pmd;
313}
314
315static void pmd_cache_free(pmd_t *pmd, int idx)
316{
317 if (idx >= USER_PTRS_PER_PGD)
318 free_page((unsigned long)pmd);
319 else
320 kmem_cache_free(pmd_cache, pmd);
321}
322
323pgd_t *pgd_alloc(struct mm_struct *mm)
324{
325 int i;
326 pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
327
328 if (PTRS_PER_PMD == 1 || !pgd)
329 return pgd;
330
331 for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
332 pmd_t *pmd = pmd_cache_alloc(i);
333
334 if (!pmd)
335 goto out_oom;
336
337 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
338 set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
339 }
340 return pgd;
341
342out_oom:
343 for (i--; i >= 0; i--) {
344 pgd_t pgdent = pgd[i];
345 void* pmd = (void *)__va(pgd_val(pgdent)-1);
346 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
347 pmd_cache_free(pmd, i);
348 }
349 quicklist_free(0, pgd_dtor, pgd);
350 return NULL;
351}
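/*
 * Note on the "+1"/"-1" arithmetic above: __pgd(1 + __pa(pmd)) stores the
 * pmd's physical address with bit 0 (_PAGE_PRESENT) set, and the
 * __va(pgd_val(pgdent) - 1) in the out_oom path (and in pgd_free() below)
 * strips that bit again to recover the pmd's virtual address.
 */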
352
353void pgd_free(pgd_t *pgd)
354{
355 int i;
356
357 /* in the PAE case user pgd entries are overwritten before usage */
358 if (PTRS_PER_PMD > 1)
359 for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
360 pgd_t pgdent = pgd[i];
361 void* pmd = (void *)__va(pgd_val(pgdent)-1);
362 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
363 pmd_cache_free(pmd, i);
364 }
365 /* in the non-PAE case, free_pgtables() clears user pgd entries */
366 quicklist_free(0, pgd_dtor, pgd);
367}
368
369void check_pgt_cache(void)
370{
371 quicklist_trim(0, pgd_dtor, 25, 16);
372}
373
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
new file mode 100644
index 000000000000..acdf03e19146
--- /dev/null
+++ b/arch/x86/mm/srat_64.c
@@ -0,0 +1,566 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/mm.h>
20#include <asm/proto.h>
21#include <asm/numa.h>
22#include <asm/e820.h>
23
24int acpi_numa __initdata;
25
26static struct acpi_table_slit *acpi_slit;
27
28static nodemask_t nodes_parsed __initdata;
29static struct bootnode nodes[MAX_NUMNODES] __initdata;
30static struct bootnode nodes_add[MAX_NUMNODES];
31static int found_add_area __initdata;
32int hotadd_percent __initdata = 0;
33
34/* Too small nodes confuse the VM badly. Usually they result
35 from BIOS bugs. */
36#define NODE_MIN_SIZE (4*1024*1024)
37
38static __init int setup_node(int pxm)
39{
40 return acpi_map_pxm_to_node(pxm);
41}
42
43static __init int conflicting_nodes(unsigned long start, unsigned long end)
44{
45 int i;
46 for_each_node_mask(i, nodes_parsed) {
47 struct bootnode *nd = &nodes[i];
48 if (nd->start == nd->end)
49 continue;
50 if (nd->end > start && nd->start < end)
51 return i;
52 if (nd->end == end && nd->start == start)
53 return i;
54 }
55 return -1;
56}
57
58static __init void cutoff_node(int i, unsigned long start, unsigned long end)
59{
60 struct bootnode *nd = &nodes[i];
61
62 if (found_add_area)
63 return;
64
65 if (nd->start < start) {
66 nd->start = start;
67 if (nd->end < nd->start)
68 nd->start = nd->end;
69 }
70 if (nd->end > end) {
71 nd->end = end;
72 if (nd->start > nd->end)
73 nd->start = nd->end;
74 }
75}
76
77static __init void bad_srat(void)
78{
79 int i;
80 printk(KERN_ERR "SRAT: SRAT not used.\n");
81 acpi_numa = -1;
82 found_add_area = 0;
83 for (i = 0; i < MAX_LOCAL_APIC; i++)
84 apicid_to_node[i] = NUMA_NO_NODE;
85 for (i = 0; i < MAX_NUMNODES; i++)
86 nodes_add[i].start = nodes[i].end = 0;
87 remove_all_active_ranges();
88}
89
90static __init inline int srat_disabled(void)
91{
92 return numa_off || acpi_numa < 0;
93}
94
95/*
96 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
97 * up the NUMA heuristics, which want the local node to have a smaller
98 * distance than the others.
99 * Do some quick checks here and only use the SLIT if it passes.
100 */
101static __init int slit_valid(struct acpi_table_slit *slit)
102{
103 int i, j;
104 int d = slit->locality_count;
105 for (i = 0; i < d; i++) {
106 for (j = 0; j < d; j++) {
107 u8 val = slit->entry[d*i + j];
108 if (i == j) {
109 if (val != LOCAL_DISTANCE)
110 return 0;
111 } else if (val <= LOCAL_DISTANCE)
112 return 0;
113 }
114 }
115 return 1;
116}
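/*
 * Example of what this check accepts and rejects (the 2x2 matrices are
 * illustrative, not from a real table), with locality_count = 2:
 *
 *	{ 10, 20,	accepted: diagonal entries equal LOCAL_DISTANCE (10)
 *	  20, 10 }	and off-diagonal entries are larger
 *
 *	{ 10, 10,	rejected: a BIOS that fills in 10 everywhere claims
 *	  10, 10 }	remote nodes are as close as the local one
 */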
117
118/* Callback for SLIT parsing */
119void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
120{
121 if (!slit_valid(slit)) {
122 printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
123 return;
124 }
125 acpi_slit = slit;
126}
127
128/* Callback for Proximity Domain -> LAPIC mapping */
129void __init
130acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
131{
132 int pxm, node;
133 if (srat_disabled())
134 return;
135 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
136 bad_srat();
137 return;
138 }
139 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
140 return;
141 pxm = pa->proximity_domain_lo;
142 node = setup_node(pxm);
143 if (node < 0) {
144 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
145 bad_srat();
146 return;
147 }
148 apicid_to_node[pa->apic_id] = node;
149 acpi_numa = 1;
150 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
151 pxm, pa->apic_id, node);
152}
153
154#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
155/*
156 * Protect against too large hotadd areas that would fill up memory.
157 */
158static int hotadd_enough_memory(struct bootnode *nd)
159{
160 static unsigned long allocated;
161 static unsigned long last_area_end;
162 unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
163 long mem = pages * sizeof(struct page);
164 unsigned long addr;
165 unsigned long allowed;
166 unsigned long oldpages = pages;
167
168 if (mem < 0)
169 return 0;
170 allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
171 allowed = (allowed / 100) * hotadd_percent;
172 if (allocated + mem > allowed) {
173 unsigned long range;
174 /* Give them at least part of their hotadd memory, up to hotadd_percent.
175 It would be better to spread the limit out
176 over multiple hotplug areas, but that is too complicated
177 right now */
178 if (allocated >= allowed)
179 return 0;
180 range = allowed - allocated;
181 pages = (range / PAGE_SIZE);
182 mem = pages * sizeof(struct page);
183 nd->end = nd->start + range;
184 }
185 /* Not completely foolproof, but a good sanity check */
186 addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
187 if (addr == -1UL)
188 return 0;
189 if (pages != oldpages)
190 printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
191 pages << PAGE_SHIFT);
192 last_area_end = addr + mem;
193 allocated += mem;
194 return 1;
195}
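/*
 * Rough numbers for the check above (4 KB pages and a 56-byte struct page
 * are assumed purely for illustration): with hotadd_percent=10 on a 4 GB
 * machine, "allowed" is roughly 400 MB.  A 1 GB hot-add area costs
 * 262144 pages * 56 bytes ~= 14 MB of mem_map, well under that budget, so
 * it is accepted in full; once the running "allocated" total would exceed
 * "allowed", the area is clipped to whatever budget remains.
 */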
196
197static int update_end_of_memory(unsigned long end)
198{
199 found_add_area = 1;
200 if ((end >> PAGE_SHIFT) > end_pfn)
201 end_pfn = end >> PAGE_SHIFT;
202 return 1;
203}
204
205static inline int save_add_info(void)
206{
207 return hotadd_percent > 0;
208}
209#else
210int update_end_of_memory(unsigned long end) {return -1;}
211static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
212#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
213static inline int save_add_info(void) {return 1;}
214#else
215static inline int save_add_info(void) {return 0;}
216#endif
217#endif
218/*
219 * Update nodes_add and decide whether to include the add area in the zone.
220 * Both SPARSE and RESERVE need the nodes_add information.
221 * This code supports one contiguous hot add area per node.
222 */
223static int reserve_hotadd(int node, unsigned long start, unsigned long end)
224{
225 unsigned long s_pfn = start >> PAGE_SHIFT;
226 unsigned long e_pfn = end >> PAGE_SHIFT;
227 int ret = 0, changed = 0;
228 struct bootnode *nd = &nodes_add[node];
229
230 /* I had some trouble with strange memory hotadd regions breaking
231 the boot. Be very strict here and reject anything unexpected.
232 If you want working memory hotadd, write correct SRATs.
233
234 The node size check is a basic sanity check to guard against
235 mistakes */
236 if ((signed long)(end - start) < NODE_MIN_SIZE) {
237 printk(KERN_ERR "SRAT: Hotplug area too small\n");
238 return -1;
239 }
240
241 /* This check might be a bit too strict, but I'm keeping it for now. */
242 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
243 printk(KERN_ERR
244 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
245 s_pfn, e_pfn);
246 return -1;
247 }
248
249 if (!hotadd_enough_memory(&nodes_add[node])) {
250 printk(KERN_ERR "SRAT: Hotplug area too large\n");
251 return -1;
252 }
253
254 /* Looks good */
255
256 if (nd->start == nd->end) {
257 nd->start = start;
258 nd->end = end;
259 changed = 1;
260 } else {
261 if (nd->start == end) {
262 nd->start = start;
263 changed = 1;
264 }
265 if (nd->end == start) {
266 nd->end = end;
267 changed = 1;
268 }
269 if (!changed)
270 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
271 }
272
273 ret = update_end_of_memory(nd->end);
274
275 if (changed)
276 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
277 return ret;
278}
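/*
 * Coalescing example (addresses made up): if nodes_add[1] already records
 * [4G, 6G), a new hot-pluggable SRAT range [6G, 8G) extends it to [4G, 8G)
 * through the "nd->end == start" case above; a non-adjacent range such as
 * [10G, 12G) triggers the "Hotplug zone not continuous" message and leaves
 * the recorded range unchanged.
 */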
279
280/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
281void __init
282acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
283{
284 struct bootnode *nd, oldnode;
285 unsigned long start, end;
286 int node, pxm;
287 int i;
288
289 if (srat_disabled())
290 return;
291 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
292 bad_srat();
293 return;
294 }
295 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
296 return;
297
298 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
299 return;
300 start = ma->base_address;
301 end = start + ma->length;
302 pxm = ma->proximity_domain;
303 node = setup_node(pxm);
304 if (node < 0) {
305 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
306 bad_srat();
307 return;
308 }
309 i = conflicting_nodes(start, end);
310 if (i == node) {
311 printk(KERN_WARNING
312 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
313 pxm, start, end, nodes[i].start, nodes[i].end);
314 } else if (i >= 0) {
315 printk(KERN_ERR
316 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
317 pxm, start, end, node_to_pxm(i),
318 nodes[i].start, nodes[i].end);
319 bad_srat();
320 return;
321 }
322 nd = &nodes[node];
323 oldnode = *nd;
324 if (!node_test_and_set(node, nodes_parsed)) {
325 nd->start = start;
326 nd->end = end;
327 } else {
328 if (start < nd->start)
329 nd->start = start;
330 if (nd->end < end)
331 nd->end = end;
332 }
333
334 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
335 nd->start, nd->end);
336 e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
337 nd->end >> PAGE_SHIFT);
338 push_node_boundaries(node, nd->start >> PAGE_SHIFT,
339 nd->end >> PAGE_SHIFT);
340
341 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
342 (reserve_hotadd(node, start, end) < 0)) {
343 /* Ignore hotadd region. Undo damage */
344 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
345 *nd = oldnode;
346 if ((nd->start | nd->end) == 0)
347 node_clear(node, nodes_parsed);
348 }
349}
350
351/* Sanity check to catch more bad SRATs (they are amazingly common).
352 Make sure the PXMs cover all memory. */
353static int __init nodes_cover_memory(const struct bootnode *nodes)
354{
355 int i;
356 unsigned long pxmram, e820ram;
357
358 pxmram = 0;
359 for_each_node_mask(i, nodes_parsed) {
360 unsigned long s = nodes[i].start >> PAGE_SHIFT;
361 unsigned long e = nodes[i].end >> PAGE_SHIFT;
362 pxmram += e - s;
363 pxmram -= absent_pages_in_range(s, e);
364 if ((long)pxmram < 0)
365 pxmram = 0;
366 }
367
368 e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
369 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
370 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
371 printk(KERN_ERR
372 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
373 (pxmram << PAGE_SHIFT) >> 20,
374 (e820ram << PAGE_SHIFT) >> 20);
375 return 0;
376 }
377 return 1;
378}
379
380static void unparse_node(int node)
381{
382 int i;
383 node_clear(node, nodes_parsed);
384 for (i = 0; i < MAX_LOCAL_APIC; i++) {
385 if (apicid_to_node[i] == node)
386 apicid_to_node[i] = NUMA_NO_NODE;
387 }
388}
389
390void __init acpi_numa_arch_fixup(void) {}
391
392/* Use the information discovered above to actually set up the nodes. */
393int __init acpi_scan_nodes(unsigned long start, unsigned long end)
394{
395 int i;
396
397 if (acpi_numa <= 0)
398 return -1;
399
400 /* First clean up the node list */
401 for (i = 0; i < MAX_NUMNODES; i++) {
402 cutoff_node(i, start, end);
403 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
404 unparse_node(i);
405 node_set_offline(i);
406 }
407 }
408
409 if (!nodes_cover_memory(nodes)) {
410 bad_srat();
411 return -1;
412 }
413
414 memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
415 if (memnode_shift < 0) {
416 printk(KERN_ERR
417 "SRAT: No NUMA node hash function found. Contact maintainer\n");
418 bad_srat();
419 return -1;
420 }
421
422 node_possible_map = nodes_parsed;
423
424 /* Finally register nodes */
425 for_each_node_mask(i, node_possible_map)
426 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
427 /* Try again in case setup_node_bootmem missed one due
428 to missing bootmem */
429 for_each_node_mask(i, node_possible_map)
430 if (!node_online(i))
431 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
432
433 for (i = 0; i < NR_CPUS; i++) {
434 if (cpu_to_node[i] == NUMA_NO_NODE)
435 continue;
436 if (!node_isset(cpu_to_node[i], node_possible_map))
437 numa_set_node(i, NUMA_NO_NODE);
438 }
439 numa_init_array();
440 return 0;
441}
442
443#ifdef CONFIG_NUMA_EMU
444static int __init find_node_by_addr(unsigned long addr)
445{
446 int ret = NUMA_NO_NODE;
447 int i;
448
449 for_each_node_mask(i, nodes_parsed) {
450 /*
451 * Find the real node that this emulated node appears on. For
452 * the sake of simplicity, we only use a real node's starting
453 * address to determine which emulated node it appears on.
454 */
455 if (addr >= nodes[i].start && addr < nodes[i].end) {
456 ret = i;
457 break;
458 }
459 }
460 return ret;
461}
462
463/*
464 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
465 * mappings that respect the real ACPI topology but reflect our emulated
466 * environment. For each emulated node, we find which real node it appears on
467 * and create PXM to NID mappings for those fake nodes which mirror that
468 * locality. SLIT will now represent the correct distances between emulated
469 * nodes as a result of the real topology.
470 */
471void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
472{
473 int i, j;
474 int fake_node_to_pxm_map[MAX_NUMNODES] = {
475 [0 ... MAX_NUMNODES-1] = PXM_INVAL
476 };
477 unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
478 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
479 };
480
481 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
482 "topology.\n");
483 for (i = 0; i < num_nodes; i++) {
484 int nid, pxm;
485
486 nid = find_node_by_addr(fake_nodes[i].start);
487 if (nid == NUMA_NO_NODE)
488 continue;
489 pxm = node_to_pxm(nid);
490 if (pxm == PXM_INVAL)
491 continue;
492 fake_node_to_pxm_map[i] = pxm;
493 /*
494 * For each apicid_to_node mapping that exists for this real
495 * node, it must now point to the fake node ID.
496 */
497 for (j = 0; j < MAX_LOCAL_APIC; j++)
498 if (apicid_to_node[j] == nid)
499 fake_apicid_to_node[j] = i;
500 }
501 for (i = 0; i < num_nodes; i++)
502 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
503 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
504
505 nodes_clear(nodes_parsed);
506 for (i = 0; i < num_nodes; i++)
507 if (fake_nodes[i].start != fake_nodes[i].end)
508 node_set(i, nodes_parsed);
509 WARN_ON(!nodes_cover_memory(fake_nodes));
510}
511
512static int null_slit_node_compare(int a, int b)
513{
514 return node_to_pxm(a) == node_to_pxm(b);
515}
516#else
517static int null_slit_node_compare(int a, int b)
518{
519 return a == b;
520}
521#endif /* CONFIG_NUMA_EMU */
522
523void __init srat_reserve_add_area(int nodeid)
524{
525 if (found_add_area && nodes_add[nodeid].end) {
526 u64 total_mb;
527
528 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
529 "for node %d at %Lx-%Lx\n",
530 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
531 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
532 >> PAGE_SHIFT;
533 total_mb *= sizeof(struct page);
534 total_mb >>= 20;
535 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
536 "pre-allocated memory.\n", (unsigned long long)total_mb);
537 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
538 nodes_add[nodeid].end - nodes_add[nodeid].start);
539 }
540}
541
542int __node_distance(int a, int b)
543{
544 int index;
545
546 if (!acpi_slit)
547 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
548 REMOTE_DISTANCE;
549 index = acpi_slit->locality_count * node_to_pxm(a);
550 return acpi_slit->entry[index + node_to_pxm(b)];
551}
552
553EXPORT_SYMBOL(__node_distance);
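/*
 * Lookup example (values made up): with a SLIT whose locality_count is 4
 * and with nodes 1 and 2 mapped to PXMs 1 and 2, __node_distance(1, 2)
 * returns acpi_slit->entry[4 * 1 + 2].  Without a SLIT, any two distinct
 * nodes (distinct PXMs under NUMA emulation) report REMOTE_DISTANCE.
 */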
554
555int memory_add_physaddr_to_nid(u64 start)
556{
557 int i, ret = 0;
558
559 for_each_node(i)
560 if (nodes_add[i].start <= start && nodes_add[i].end > start)
561 ret = i;
562
563 return ret;
564}
565EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
566
diff --git a/arch/x86/oprofile/Kconfig b/arch/x86/oprofile/Kconfig
new file mode 100644
index 000000000000..d8a84088471a
--- /dev/null
+++ b/arch/x86/oprofile/Kconfig
@@ -0,0 +1,17 @@
1config PROFILING
2 bool "Profiling support (EXPERIMENTAL)"
3 help
4 Say Y here to enable the extended profiling support mechanisms used
5 by profilers such as OProfile.
6
7
8config OPROFILE
9 tristate "OProfile system profiling (EXPERIMENTAL)"
10 depends on PROFILING
11 help
12 OProfile is a profiling system capable of profiling the
13 whole system, including the kernel, kernel modules, libraries,
14 and applications.
15
16 If unsure, say N.
17
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
new file mode 100644
index 000000000000..30f3eb366667
--- /dev/null
+++ b/arch/x86/oprofile/Makefile
@@ -0,0 +1,12 @@
1obj-$(CONFIG_OPROFILE) += oprofile.o
2
3DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
4 oprof.o cpu_buffer.o buffer_sync.o \
5 event_buffer.o oprofile_files.o \
6 oprofilefs.o oprofile_stats.o \
7 timer_int.o )
8
9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \
11 op_model_ppro.o op_model_p4.o
12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
new file mode 100644
index 000000000000..c049ce414f01
--- /dev/null
+++ b/arch/x86/oprofile/backtrace.c
@@ -0,0 +1,127 @@
1/**
2 * @file backtrace.c
3 *
4 * @remark Copyright 2002 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author John Levon
8 * @author David Smith
9 */
10
11#include <linux/oprofile.h>
12#include <linux/sched.h>
13#include <linux/mm.h>
14#include <asm/ptrace.h>
15#include <asm/uaccess.h>
16
17struct frame_head {
18 struct frame_head * ebp;
19 unsigned long ret;
20} __attribute__((packed));
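/*
 * This mirrors the standard i386 frame layout set up by the usual
 * "pushl %ebp; movl %esp, %ebp" prologue (assuming frame pointers are
 * compiled in): at (%ebp) sits the saved caller %ebp (-> ebp above) and at
 * 4(%ebp) the return address (-> ret above), so following ->ebp hops one
 * call frame up the stack each time.
 */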
21
22static struct frame_head *
23dump_kernel_backtrace(struct frame_head * head)
24{
25 oprofile_add_trace(head->ret);
26
27 /* frame pointers should strictly progress back up the stack
28 * (towards higher addresses) */
29 if (head >= head->ebp)
30 return NULL;
31
32 return head->ebp;
33}
34
35static struct frame_head *
36dump_user_backtrace(struct frame_head * head)
37{
38 struct frame_head bufhead[2];
39
40 /* Also check accessibility of one struct frame_head beyond */
41 if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
42 return NULL;
43 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
44 return NULL;
45
46 oprofile_add_trace(bufhead[0].ret);
47
48 /* frame pointers should strictly progress back up the stack
49 * (towards higher addresses) */
50 if (head >= bufhead[0].ebp)
51 return NULL;
52
53 return bufhead[0].ebp;
54}
55
56/*
57 * | | /\ Higher addresses
58 * | |
59 * --------------- stack base (address of current_thread_info)
60 * | thread info |
61 * . .
62 * | stack |
63 * --------------- saved regs->ebp value if valid (frame_head address)
64 * . .
65 * --------------- saved regs->rsp value if x86_64
66 * | |
67 * --------------- struct pt_regs * stored on stack if 32-bit
68 * | |
69 * . .
70 * | |
71 * --------------- %esp
72 * | |
73 * | | \/ Lower addresses
74 *
75 * Thus, regs (or regs->rsp for x86_64) <-> stack base restricts the
76 * valid(ish) ebp values. Note: (1) for x86_64, NMI and several other
77 * exceptions use special stacks, maintained by the interrupt stack table
78 * (IST). These stacks are set up in trap_init() in
79 * arch/x86_64/kernel/traps.c. Thus, for x86_64, regs now does not point
80 * to the kernel stack; instead, it points to some location on the NMI
81 * stack. On the other hand, regs->rsp is the stack pointer saved when the
82 * NMI occurred. (2) For 32-bit, regs->esp is not valid because the
83 * processor does not save %esp on the kernel stack when interrupts occur
84 * in kernel mode.
85 */
86#ifdef CONFIG_FRAME_POINTER
87static int valid_kernel_stack(struct frame_head * head, struct pt_regs * regs)
88{
89 unsigned long headaddr = (unsigned long)head;
90#ifdef CONFIG_X86_64
91 unsigned long stack = (unsigned long)regs->rsp;
92#else
93 unsigned long stack = (unsigned long)regs;
94#endif
95 unsigned long stack_base = (stack & ~(THREAD_SIZE - 1)) + THREAD_SIZE;
96
97 return headaddr > stack && headaddr < stack_base;
98}
99#else
100/* without fp, it's just junk */
101static int valid_kernel_stack(struct frame_head * head, struct pt_regs * regs)
102{
103 return 0;
104}
105#endif
106
107
108void
109x86_backtrace(struct pt_regs * const regs, unsigned int depth)
110{
111 struct frame_head *head;
112
113#ifdef CONFIG_X86_64
114 head = (struct frame_head *)regs->rbp;
115#else
116 head = (struct frame_head *)regs->ebp;
117#endif
118
119 if (!user_mode_vm(regs)) {
120 while (depth-- && valid_kernel_stack(head, regs))
121 head = dump_kernel_backtrace(head);
122 return;
123 }
124
125 while (depth-- && head)
126 head = dump_user_backtrace(head);
127}
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
new file mode 100644
index 000000000000..5341d481d92f
--- /dev/null
+++ b/arch/x86/oprofile/init.c
@@ -0,0 +1,48 @@
1/**
2 * @file init.c
3 *
4 * @remark Copyright 2002 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author John Levon <levon@movementarian.org>
8 */
9
10#include <linux/oprofile.h>
11#include <linux/init.h>
12#include <linux/errno.h>
13
14/* With the NMI mode driver we support CPUs that have performance
15 * counters, like the Pentium Pro.
16 */
17
18extern int op_nmi_init(struct oprofile_operations * ops);
19extern int op_nmi_timer_init(struct oprofile_operations * ops);
20extern void op_nmi_exit(void);
21extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
22
23
24int __init oprofile_arch_init(struct oprofile_operations * ops)
25{
26 int ret;
27
28 ret = -ENODEV;
29
30#ifdef CONFIG_X86_LOCAL_APIC
31 ret = op_nmi_init(ops);
32#endif
33#ifdef CONFIG_X86_IO_APIC
34 if (ret < 0)
35 ret = op_nmi_timer_init(ops);
36#endif
37 ops->backtrace = x86_backtrace;
38
39 return ret;
40}
41
42
43void oprofile_arch_exit(void)
44{
45#ifdef CONFIG_X86_LOCAL_APIC
46 op_nmi_exit();
47#endif
48}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
new file mode 100644
index 000000000000..11b7a51566a8
--- /dev/null
+++ b/arch/x86/oprofile/nmi_int.c
@@ -0,0 +1,477 @@
1/**
2 * @file nmi_int.c
3 *
4 * @remark Copyright 2002 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author John Levon <levon@movementarian.org>
8 */
9
10#include <linux/init.h>
11#include <linux/notifier.h>
12#include <linux/smp.h>
13#include <linux/oprofile.h>
14#include <linux/sysdev.h>
15#include <linux/slab.h>
16#include <linux/moduleparam.h>
17#include <linux/kdebug.h>
18#include <asm/nmi.h>
19#include <asm/msr.h>
20#include <asm/apic.h>
21
22#include "op_counter.h"
23#include "op_x86_model.h"
24
25static struct op_x86_model_spec const * model;
26static struct op_msrs cpu_msrs[NR_CPUS];
27static unsigned long saved_lvtpc[NR_CPUS];
28
29static int nmi_start(void);
30static void nmi_stop(void);
31
32/* 0 == registered but off, 1 == registered and on */
33static int nmi_enabled = 0;
34
35#ifdef CONFIG_PM
36
37static int nmi_suspend(struct sys_device *dev, pm_message_t state)
38{
39 if (nmi_enabled == 1)
40 nmi_stop();
41 return 0;
42}
43
44
45static int nmi_resume(struct sys_device *dev)
46{
47 if (nmi_enabled == 1)
48 nmi_start();
49 return 0;
50}
51
52
53static struct sysdev_class oprofile_sysclass = {
54 set_kset_name("oprofile"),
55 .resume = nmi_resume,
56 .suspend = nmi_suspend,
57};
58
59
60static struct sys_device device_oprofile = {
61 .id = 0,
62 .cls = &oprofile_sysclass,
63};
64
65
66static int __init init_sysfs(void)
67{
68 int error;
69 if (!(error = sysdev_class_register(&oprofile_sysclass)))
70 error = sysdev_register(&device_oprofile);
71 return error;
72}
73
74
75static void exit_sysfs(void)
76{
77 sysdev_unregister(&device_oprofile);
78 sysdev_class_unregister(&oprofile_sysclass);
79}
80
81#else
82#define init_sysfs() do { } while (0)
83#define exit_sysfs() do { } while (0)
84#endif /* CONFIG_PM */
85
86static int profile_exceptions_notify(struct notifier_block *self,
87 unsigned long val, void *data)
88{
89 struct die_args *args = (struct die_args *)data;
90 int ret = NOTIFY_DONE;
91 int cpu = smp_processor_id();
92
93 switch(val) {
94 case DIE_NMI:
95 if (model->check_ctrs(args->regs, &cpu_msrs[cpu]))
96 ret = NOTIFY_STOP;
97 break;
98 default:
99 break;
100 }
101 return ret;
102}
103
104static void nmi_cpu_save_registers(struct op_msrs * msrs)
105{
106 unsigned int const nr_ctrs = model->num_counters;
107 unsigned int const nr_ctrls = model->num_controls;
108 struct op_msr * counters = msrs->counters;
109 struct op_msr * controls = msrs->controls;
110 unsigned int i;
111
112 for (i = 0; i < nr_ctrs; ++i) {
113 if (counters[i].addr){
114 rdmsr(counters[i].addr,
115 counters[i].saved.low,
116 counters[i].saved.high);
117 }
118 }
119
120 for (i = 0; i < nr_ctrls; ++i) {
121 if (controls[i].addr){
122 rdmsr(controls[i].addr,
123 controls[i].saved.low,
124 controls[i].saved.high);
125 }
126 }
127}
128
129
130static void nmi_save_registers(void * dummy)
131{
132 int cpu = smp_processor_id();
133 struct op_msrs * msrs = &cpu_msrs[cpu];
134 nmi_cpu_save_registers(msrs);
135}
136
137
138static void free_msrs(void)
139{
140 int i;
141 for_each_possible_cpu(i) {
142 kfree(cpu_msrs[i].counters);
143 cpu_msrs[i].counters = NULL;
144 kfree(cpu_msrs[i].controls);
145 cpu_msrs[i].controls = NULL;
146 }
147}
148
149
150static int allocate_msrs(void)
151{
152 int success = 1;
153 size_t controls_size = sizeof(struct op_msr) * model->num_controls;
154 size_t counters_size = sizeof(struct op_msr) * model->num_counters;
155
156 int i;
157 for_each_possible_cpu(i) {
158 cpu_msrs[i].counters = kmalloc(counters_size, GFP_KERNEL);
159 if (!cpu_msrs[i].counters) {
160 success = 0;
161 break;
162 }
163 cpu_msrs[i].controls = kmalloc(controls_size, GFP_KERNEL);
164 if (!cpu_msrs[i].controls) {
165 success = 0;
166 break;
167 }
168 }
169
170 if (!success)
171 free_msrs();
172
173 return success;
174}
175
176
177static void nmi_cpu_setup(void * dummy)
178{
179 int cpu = smp_processor_id();
180 struct op_msrs * msrs = &cpu_msrs[cpu];
181 spin_lock(&oprofilefs_lock);
182 model->setup_ctrs(msrs);
183 spin_unlock(&oprofilefs_lock);
184 saved_lvtpc[cpu] = apic_read(APIC_LVTPC);
185 apic_write(APIC_LVTPC, APIC_DM_NMI);
186}
187
188static struct notifier_block profile_exceptions_nb = {
189 .notifier_call = profile_exceptions_notify,
190 .next = NULL,
191 .priority = 0
192};
193
194static int nmi_setup(void)
195{
196 int err=0;
197 int cpu;
198
199 if (!allocate_msrs())
200 return -ENOMEM;
201
202 if ((err = register_die_notifier(&profile_exceptions_nb))){
203 free_msrs();
204 return err;
205 }
206
207 /* We need to serialize save and setup for HT because the subsets
208 * of MSRs are distinct for the save and setup operations
209 */
210
211 /* Assume saved/restored counters are the same on all CPUs */
212 model->fill_in_addresses(&cpu_msrs[0]);
213 for_each_possible_cpu (cpu) {
214 if (cpu != 0) {
215 memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters,
216 sizeof(struct op_msr) * model->num_counters);
217
218 memcpy(cpu_msrs[cpu].controls, cpu_msrs[0].controls,
219 sizeof(struct op_msr) * model->num_controls);
220 }
221
222 }
223 on_each_cpu(nmi_save_registers, NULL, 0, 1);
224 on_each_cpu(nmi_cpu_setup, NULL, 0, 1);
225 nmi_enabled = 1;
226 return 0;
227}
228
229
230static void nmi_restore_registers(struct op_msrs * msrs)
231{
232 unsigned int const nr_ctrs = model->num_counters;
233 unsigned int const nr_ctrls = model->num_controls;
234 struct op_msr * counters = msrs->counters;
235 struct op_msr * controls = msrs->controls;
236 unsigned int i;
237
238 for (i = 0; i < nr_ctrls; ++i) {
239 if (controls[i].addr){
240 wrmsr(controls[i].addr,
241 controls[i].saved.low,
242 controls[i].saved.high);
243 }
244 }
245
246 for (i = 0; i < nr_ctrs; ++i) {
247 if (counters[i].addr){
248 wrmsr(counters[i].addr,
249 counters[i].saved.low,
250 counters[i].saved.high);
251 }
252 }
253}
254
255
256static void nmi_cpu_shutdown(void * dummy)
257{
258 unsigned int v;
259 int cpu = smp_processor_id();
260 struct op_msrs * msrs = &cpu_msrs[cpu];
261
262 /* Restoring APIC_LVTPC can trigger an APIC error because the delivery
263 * mode and vector number combination can be illegal. That's by design: at
264 * power-on the APIC LVTs contain a zero vector number, which is legal only
265 * for NMI delivery mode. So inhibit APIC errors before restoring LVTPC.
266 */
267 v = apic_read(APIC_LVTERR);
268 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
269 apic_write(APIC_LVTPC, saved_lvtpc[cpu]);
270 apic_write(APIC_LVTERR, v);
271 nmi_restore_registers(msrs);
272 model->shutdown(msrs);
273}
274
275
276static void nmi_shutdown(void)
277{
278 nmi_enabled = 0;
279 on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1);
280 unregister_die_notifier(&profile_exceptions_nb);
281 free_msrs();
282}
283
284
285static void nmi_cpu_start(void * dummy)
286{
287 struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()];
288 model->start(msrs);
289}
290
291
292static int nmi_start(void)
293{
294 on_each_cpu(nmi_cpu_start, NULL, 0, 1);
295 return 0;
296}
297
298
299static void nmi_cpu_stop(void * dummy)
300{
301 struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()];
302 model->stop(msrs);
303}
304
305
306static void nmi_stop(void)
307{
308 on_each_cpu(nmi_cpu_stop, NULL, 0, 1);
309}
310
311
312struct op_counter_config counter_config[OP_MAX_COUNTER];
313
314static int nmi_create_files(struct super_block * sb, struct dentry * root)
315{
316 unsigned int i;
317
318 for (i = 0; i < model->num_counters; ++i) {
319 struct dentry * dir;
320 char buf[4];
321
322 /* quick little hack to _not_ expose a counter if it is not
323 * available for use. This should protect userspace apps.
324 * NOTE: assumes 1:1 mapping here (that counters are organized
325 * sequentially in their struct assignment).
326 */
327 if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i)))
328 continue;
329
330 snprintf(buf, sizeof(buf), "%d", i);
331 dir = oprofilefs_mkdir(sb, root, buf);
332 oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
333 oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
334 oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
335 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
336 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
337 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
338 }
339
340 return 0;
341}
342
343static int p4force;
344module_param(p4force, int, 0);
345
346static int __init p4_init(char ** cpu_type)
347{
348 __u8 cpu_model = boot_cpu_data.x86_model;
349
350 if (!p4force && (cpu_model > 6 || cpu_model == 5))
351 return 0;
352
353#ifndef CONFIG_SMP
354 *cpu_type = "i386/p4";
355 model = &op_p4_spec;
356 return 1;
357#else
358 switch (smp_num_siblings) {
359 case 1:
360 *cpu_type = "i386/p4";
361 model = &op_p4_spec;
362 return 1;
363
364 case 2:
365 *cpu_type = "i386/p4-ht";
366 model = &op_p4_ht2_spec;
367 return 1;
368 }
369#endif
370
371 printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n");
372 printk(KERN_INFO "oprofile: Reverting to timer mode.\n");
373 return 0;
374}
375
376
377static int __init ppro_init(char ** cpu_type)
378{
379 __u8 cpu_model = boot_cpu_data.x86_model;
380
381 if (cpu_model == 14)
382 *cpu_type = "i386/core";
383 else if (cpu_model == 15)
384 *cpu_type = "i386/core_2";
385 else if (cpu_model > 0xd)
386 return 0;
387 else if (cpu_model == 9) {
388 *cpu_type = "i386/p6_mobile";
389 } else if (cpu_model > 5) {
390 *cpu_type = "i386/piii";
391 } else if (cpu_model > 2) {
392 *cpu_type = "i386/pii";
393 } else {
394 *cpu_type = "i386/ppro";
395 }
396
397 model = &op_ppro_spec;
398 return 1;
399}
400
401/* in order to get sysfs right */
402static int using_nmi;
403
404int __init op_nmi_init(struct oprofile_operations *ops)
405{
406 __u8 vendor = boot_cpu_data.x86_vendor;
407 __u8 family = boot_cpu_data.x86;
408 char *cpu_type;
409
410 if (!cpu_has_apic)
411 return -ENODEV;
412
413 switch (vendor) {
414 case X86_VENDOR_AMD:
415 /* Needs to be at least an Athlon (or hammer in 32bit mode) */
416
417 switch (family) {
418 default:
419 return -ENODEV;
420 case 6:
421 model = &op_athlon_spec;
422 cpu_type = "i386/athlon";
423 break;
424 case 0xf:
425 model = &op_athlon_spec;
426 /* Actually it could be i386/hammer too, but give
427 user space a consistent name. */
428 cpu_type = "x86-64/hammer";
429 break;
430 case 0x10:
431 model = &op_athlon_spec;
432 cpu_type = "x86-64/family10";
433 break;
434 }
435 break;
436
437 case X86_VENDOR_INTEL:
438 switch (family) {
439 /* Pentium IV */
440 case 0xf:
441 if (!p4_init(&cpu_type))
442 return -ENODEV;
443 break;
444
445 /* A P6-class processor */
446 case 6:
447 if (!ppro_init(&cpu_type))
448 return -ENODEV;
449 break;
450
451 default:
452 return -ENODEV;
453 }
454 break;
455
456 default:
457 return -ENODEV;
458 }
459
460 init_sysfs();
461 using_nmi = 1;
462 ops->create_files = nmi_create_files;
463 ops->setup = nmi_setup;
464 ops->shutdown = nmi_shutdown;
465 ops->start = nmi_start;
466 ops->stop = nmi_stop;
467 ops->cpu_type = cpu_type;
468 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
469 return 0;
470}
471
472
473void op_nmi_exit(void)
474{
475 if (using_nmi)
476 exit_sysfs();
477}
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
new file mode 100644
index 000000000000..1418e36ae7ab
--- /dev/null
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -0,0 +1,69 @@
1/**
2 * @file nmi_timer_int.c
3 *
4 * @remark Copyright 2003 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author Zwane Mwaikambo <zwane@linuxpower.ca>
8 */
9
10#include <linux/init.h>
11#include <linux/smp.h>
12#include <linux/errno.h>
13#include <linux/oprofile.h>
14#include <linux/rcupdate.h>
15#include <linux/kdebug.h>
16
17#include <asm/nmi.h>
18#include <asm/apic.h>
19#include <asm/ptrace.h>
20
21static int profile_timer_exceptions_notify(struct notifier_block *self,
22 unsigned long val, void *data)
23{
24 struct die_args *args = (struct die_args *)data;
25 int ret = NOTIFY_DONE;
26
27 switch(val) {
28 case DIE_NMI:
29 oprofile_add_sample(args->regs, 0);
30 ret = NOTIFY_STOP;
31 break;
32 default:
33 break;
34 }
35 return ret;
36}
37
38static struct notifier_block profile_timer_exceptions_nb = {
39 .notifier_call = profile_timer_exceptions_notify,
40 .next = NULL,
41 .priority = 0
42};
43
44static int timer_start(void)
45{
46 if (register_die_notifier(&profile_timer_exceptions_nb))
47 return 1;
48 return 0;
49}
50
51
52static void timer_stop(void)
53{
54 unregister_die_notifier(&profile_timer_exceptions_nb);
55 synchronize_sched(); /* Allow already-started NMIs to complete. */
56}
57
58
59int __init op_nmi_timer_init(struct oprofile_operations * ops)
60{
61 if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
62 return -ENODEV;
63
64 ops->start = timer_start;
65 ops->stop = timer_stop;
66 ops->cpu_type = "timer";
67 printk(KERN_INFO "oprofile: using NMI timer interrupt.\n");
68 return 0;
69}
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
new file mode 100644
index 000000000000..2880b15c4675
--- /dev/null
+++ b/arch/x86/oprofile/op_counter.h
@@ -0,0 +1,29 @@
1/**
2 * @file op_counter.h
3 *
4 * @remark Copyright 2002 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author John Levon
8 */
9
10#ifndef OP_COUNTER_H
11#define OP_COUNTER_H
12
13#define OP_MAX_COUNTER 8
14
15/* Per-perfctr configuration as set via
16 * oprofilefs.
17 */
18struct op_counter_config {
19 unsigned long count;
20 unsigned long enabled;
21 unsigned long event;
22 unsigned long kernel;
23 unsigned long user;
24 unsigned long unit_mask;
25};
26
27extern struct op_counter_config counter_config[];
28
29#endif /* OP_COUNTER_H */
diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
new file mode 100644
index 000000000000..3057a19e4641
--- /dev/null
+++ b/arch/x86/oprofile/op_model_athlon.c
@@ -0,0 +1,180 @@
1/**
2 * @file op_model_athlon.c
3 * athlon / K7 model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 */
12
13#include <linux/oprofile.h>
14#include <asm/ptrace.h>
15#include <asm/msr.h>
16#include <asm/nmi.h>
17
18#include "op_x86_model.h"
19#include "op_counter.h"
20
21#define NUM_COUNTERS 4
22#define NUM_CONTROLS 4
23
24#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
25#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0)
26#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1);} while (0)
27#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
28
29#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
30#define CTRL_READ(l,h,msrs,c) do {rdmsr(msrs->controls[(c)].addr, (l), (h));} while (0)
31#define CTRL_WRITE(l,h,msrs,c) do {wrmsr(msrs->controls[(c)].addr, (l), (h));} while (0)
32#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
33#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
34#define CTRL_CLEAR(x) (x &= (1<<21))
35#define CTRL_SET_ENABLE(val) (val |= 1<<20)
36#define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16))
37#define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
38#define CTRL_SET_UM(val, m) (val |= (m << 8))
39#define CTRL_SET_EVENT(val, e) (val |= e)
40
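/*
 * Worked example of the EVNTSEL value these macros build (event 0x76 is
 * just an illustrative number): for user=1, kernel=1, unit_mask=0 and
 * event=0x76, athlon_setup_ctrs() ends up writing
 *
 *	low = (1 << 20) | (1 << 17) | (1 << 16) | 0x76 = 0x00130076
 *
 * and athlon_start() later ORs in the active bit (1 << 22), giving
 * 0x00530076.
 */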
41static unsigned long reset_value[NUM_COUNTERS];
42
43static void athlon_fill_in_addresses(struct op_msrs * const msrs)
44{
45 int i;
46
47 for (i=0; i < NUM_COUNTERS; i++) {
48 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
49 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
50 else
51 msrs->counters[i].addr = 0;
52 }
53
54 for (i=0; i < NUM_CONTROLS; i++) {
55 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
56 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
57 else
58 msrs->controls[i].addr = 0;
59 }
60}
61
62
63static void athlon_setup_ctrs(struct op_msrs const * const msrs)
64{
65 unsigned int low, high;
66 int i;
67
68 /* clear all counters */
69 for (i = 0 ; i < NUM_CONTROLS; ++i) {
70 if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
71 continue;
72 CTRL_READ(low, high, msrs, i);
73 CTRL_CLEAR(low);
74 CTRL_WRITE(low, high, msrs, i);
75 }
76
77 /* avoid a false detection of ctr overflows in NMI handler */
78 for (i = 0; i < NUM_COUNTERS; ++i) {
79 if (unlikely(!CTR_IS_RESERVED(msrs,i)))
80 continue;
81 CTR_WRITE(1, msrs, i);
82 }
83
84 /* enable active counters */
85 for (i = 0; i < NUM_COUNTERS; ++i) {
86 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) {
87 reset_value[i] = counter_config[i].count;
88
89 CTR_WRITE(counter_config[i].count, msrs, i);
90
91 CTRL_READ(low, high, msrs, i);
92 CTRL_CLEAR(low);
93 CTRL_SET_ENABLE(low);
94 CTRL_SET_USR(low, counter_config[i].user);
95 CTRL_SET_KERN(low, counter_config[i].kernel);
96 CTRL_SET_UM(low, counter_config[i].unit_mask);
97 CTRL_SET_EVENT(low, counter_config[i].event);
98 CTRL_WRITE(low, high, msrs, i);
99 } else {
100 reset_value[i] = 0;
101 }
102 }
103}
104
105
106static int athlon_check_ctrs(struct pt_regs * const regs,
107 struct op_msrs const * const msrs)
108{
109 unsigned int low, high;
110 int i;
111
112 for (i = 0 ; i < NUM_COUNTERS; ++i) {
113 if (!reset_value[i])
114 continue;
115 CTR_READ(low, high, msrs, i);
116 if (CTR_OVERFLOWED(low)) {
117 oprofile_add_sample(regs, i);
118 CTR_WRITE(reset_value[i], msrs, i);
119 }
120 }
121
122 /* See op_model_ppro.c */
123 return 1;
124}
125
126
127static void athlon_start(struct op_msrs const * const msrs)
128{
129 unsigned int low, high;
130 int i;
131 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
132 if (reset_value[i]) {
133 CTRL_READ(low, high, msrs, i);
134 CTRL_SET_ACTIVE(low);
135 CTRL_WRITE(low, high, msrs, i);
136 }
137 }
138}
139
140
141static void athlon_stop(struct op_msrs const * const msrs)
142{
143 unsigned int low,high;
144 int i;
145
146 /* Subtle: stop on all counters to avoid race with
147 * setting our pm callback */
148 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
149 if (!reset_value[i])
150 continue;
151 CTRL_READ(low, high, msrs, i);
152 CTRL_SET_INACTIVE(low);
153 CTRL_WRITE(low, high, msrs, i);
154 }
155}
156
157static void athlon_shutdown(struct op_msrs const * const msrs)
158{
159 int i;
160
161 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
162 if (CTR_IS_RESERVED(msrs,i))
163 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
164 }
165 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
166 if (CTRL_IS_RESERVED(msrs,i))
167 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
168 }
169}
170
171struct op_x86_model_spec const op_athlon_spec = {
172 .num_counters = NUM_COUNTERS,
173 .num_controls = NUM_CONTROLS,
174 .fill_in_addresses = &athlon_fill_in_addresses,
175 .setup_ctrs = &athlon_setup_ctrs,
176 .check_ctrs = &athlon_check_ctrs,
177 .start = &athlon_start,
178 .stop = &athlon_stop,
179 .shutdown = &athlon_shutdown
180};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
new file mode 100644
index 000000000000..47925927b12f
--- /dev/null
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -0,0 +1,722 @@
1/**
2 * @file op_model_p4.c
3 * P4 model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author Graydon Hoare
9 */
10
11#include <linux/oprofile.h>
12#include <linux/smp.h>
13#include <asm/msr.h>
14#include <asm/ptrace.h>
15#include <asm/fixmap.h>
16#include <asm/apic.h>
17#include <asm/nmi.h>
18
19#include "op_x86_model.h"
20#include "op_counter.h"
21
22#define NUM_EVENTS 39
23
24#define NUM_COUNTERS_NON_HT 8
25#define NUM_ESCRS_NON_HT 45
26#define NUM_CCCRS_NON_HT 18
27#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT)
28
29#define NUM_COUNTERS_HT2 4
30#define NUM_ESCRS_HT2 23
31#define NUM_CCCRS_HT2 9
32#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2)
33
34static unsigned int num_counters = NUM_COUNTERS_NON_HT;
35static unsigned int num_controls = NUM_CONTROLS_NON_HT;
36
37/* this has to be checked dynamically since the
38 hyper-threadedness of a chip is discovered at
39 kernel boot-time. */
40static inline void setup_num_counters(void)
41{
42#ifdef CONFIG_SMP
43 if (smp_num_siblings == 2){
44 num_counters = NUM_COUNTERS_HT2;
45 num_controls = NUM_CONTROLS_HT2;
46 }
47#endif
48}
49
50static inline int addr_increment(void)
51{
52#ifdef CONFIG_SMP
53 return smp_num_siblings == 2 ? 2 : 1;
54#else
55 return 1;
56#endif
57}
58
59
60/* tables to simulate simplified hardware view of p4 registers */
61struct p4_counter_binding {
62 int virt_counter;
63 int counter_address;
64 int cccr_address;
65};
66
67struct p4_event_binding {
68 int escr_select; /* value to put in CCCR */
69 int event_select; /* value to put in ESCR */
70 struct {
71 int virt_counter; /* for this counter... */
72 int escr_address; /* use this ESCR */
73 } bindings[2];
74};
75
76/* nb: these CTR_* defines are a duplicate of defines in
77 event/i386.p4*events. */
78
79
80#define CTR_BPU_0 (1 << 0)
81#define CTR_MS_0 (1 << 1)
82#define CTR_FLAME_0 (1 << 2)
83#define CTR_IQ_4 (1 << 3)
84#define CTR_BPU_2 (1 << 4)
85#define CTR_MS_2 (1 << 5)
86#define CTR_FLAME_2 (1 << 6)
87#define CTR_IQ_5 (1 << 7)
88
89static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
90 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
91 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
92 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
93 { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 },
94 { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 },
95 { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 },
96 { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 },
97 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
98};
99
100#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
101
102/* p4 event codes in libop/op_event.h are indices into this table. */
103
104static struct p4_event_binding p4_events[NUM_EVENTS] = {
105
106 { /* BRANCH_RETIRED */
107 0x05, 0x06,
108 { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
109 {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
110 },
111
112 { /* MISPRED_BRANCH_RETIRED */
113 0x04, 0x03,
114 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
115 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
116 },
117
118 { /* TC_DELIVER_MODE */
119 0x01, 0x01,
120 { { CTR_MS_0, MSR_P4_TC_ESCR0},
121 { CTR_MS_2, MSR_P4_TC_ESCR1} }
122 },
123
124 { /* BPU_FETCH_REQUEST */
125 0x00, 0x03,
126 { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
127 { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
128 },
129
130 { /* ITLB_REFERENCE */
131 0x03, 0x18,
132 { { CTR_BPU_0, MSR_P4_ITLB_ESCR0},
133 { CTR_BPU_2, MSR_P4_ITLB_ESCR1} }
134 },
135
136 { /* MEMORY_CANCEL */
137 0x05, 0x02,
138 { { CTR_FLAME_0, MSR_P4_DAC_ESCR0},
139 { CTR_FLAME_2, MSR_P4_DAC_ESCR1} }
140 },
141
142 { /* MEMORY_COMPLETE */
143 0x02, 0x08,
144 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
145 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
146 },
147
148 { /* LOAD_PORT_REPLAY */
149 0x02, 0x04,
150 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
151 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
152 },
153
154 { /* STORE_PORT_REPLAY */
155 0x02, 0x05,
156 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
157 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
158 },
159
160 { /* MOB_LOAD_REPLAY */
161 0x02, 0x03,
162 { { CTR_BPU_0, MSR_P4_MOB_ESCR0},
163 { CTR_BPU_2, MSR_P4_MOB_ESCR1} }
164 },
165
166 { /* PAGE_WALK_TYPE */
167 0x04, 0x01,
168 { { CTR_BPU_0, MSR_P4_PMH_ESCR0},
169 { CTR_BPU_2, MSR_P4_PMH_ESCR1} }
170 },
171
172 { /* BSQ_CACHE_REFERENCE */
173 0x07, 0x0c,
174 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
175 { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
176 },
177
178 { /* IOQ_ALLOCATION */
179 0x06, 0x03,
180 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
181 { 0, 0 } }
182 },
183
184 { /* IOQ_ACTIVE_ENTRIES */
185 0x06, 0x1a,
186 { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
187 { 0, 0 } }
188 },
189
190 { /* FSB_DATA_ACTIVITY */
191 0x06, 0x17,
192 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
193 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
194 },
195
196 { /* BSQ_ALLOCATION */
197 0x07, 0x05,
198 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
199 { 0, 0 } }
200 },
201
202 { /* BSQ_ACTIVE_ENTRIES */
203 0x07, 0x06,
204 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
205 { 0, 0 } }
206 },
207
208 { /* X87_ASSIST */
209 0x05, 0x03,
210 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
211 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
212 },
213
214 { /* SSE_INPUT_ASSIST */
215 0x01, 0x34,
216 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
217 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
218 },
219
220 { /* PACKED_SP_UOP */
221 0x01, 0x08,
222 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
223 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
224 },
225
226 { /* PACKED_DP_UOP */
227 0x01, 0x0c,
228 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
229 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
230 },
231
232 { /* SCALAR_SP_UOP */
233 0x01, 0x0a,
234 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
235 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
236 },
237
238 { /* SCALAR_DP_UOP */
239 0x01, 0x0e,
240 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
241 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
242 },
243
244 { /* 64BIT_MMX_UOP */
245 0x01, 0x02,
246 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
247 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
248 },
249
250 { /* 128BIT_MMX_UOP */
251 0x01, 0x1a,
252 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
253 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
254 },
255
256 { /* X87_FP_UOP */
257 0x01, 0x04,
258 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
259 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
260 },
261
262 { /* X87_SIMD_MOVES_UOP */
263 0x01, 0x2e,
264 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
265 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
266 },
267
268 { /* MACHINE_CLEAR */
269 0x05, 0x02,
270 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
271 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
272 },
273
274 { /* GLOBAL_POWER_EVENTS */
275 0x06, 0x13 /* older manual says 0x05, newer 0x13 */,
276 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
277 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
278 },
279
280 { /* TC_MS_XFER */
281 0x00, 0x05,
282 { { CTR_MS_0, MSR_P4_MS_ESCR0},
283 { CTR_MS_2, MSR_P4_MS_ESCR1} }
284 },
285
286 { /* UOP_QUEUE_WRITES */
287 0x00, 0x09,
288 { { CTR_MS_0, MSR_P4_MS_ESCR0},
289 { CTR_MS_2, MSR_P4_MS_ESCR1} }
290 },
291
292 { /* FRONT_END_EVENT */
293 0x05, 0x08,
294 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
295 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
296 },
297
298 { /* EXECUTION_EVENT */
299 0x05, 0x0c,
300 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
301 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
302 },
303
304 { /* REPLAY_EVENT */
305 0x05, 0x09,
306 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
307 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
308 },
309
310 { /* INSTR_RETIRED */
311 0x04, 0x02,
312 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
313 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
314 },
315
316 { /* UOPS_RETIRED */
317 0x04, 0x01,
318 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
319 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
320 },
321
322 { /* UOP_TYPE */
323 0x02, 0x02,
324 { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
325 { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
326 },
327
328 { /* RETIRED_MISPRED_BRANCH_TYPE */
329 0x02, 0x05,
330 { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
331 { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
332 },
333
334 { /* RETIRED_BRANCH_TYPE */
335 0x02, 0x04,
336 { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
337 { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
338 }
339};
340
341
342#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7)
343
344#define ESCR_RESERVED_BITS 0x80000003
345#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS)
346#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2))
347#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3))
348#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1)))
349#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
350#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
351#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
352#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
353#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
354
355#define CCCR_RESERVED_BITS 0x38030FFF
356#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
357#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000)
358#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13))
359#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26))
360#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
361#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
362#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
363#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
364#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
365#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
366#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
367
368#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
369#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
370#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0)
371#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0)
372#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
373
374
375/* this assigns a "stagger" to the current CPU, which is used throughout
376 the code in this module as an extra array offset, to select the "even"
377 or "odd" part of all the divided resources. */
378static unsigned int get_stagger(void)
379{
380#ifdef CONFIG_SMP
381 int cpu = smp_processor_id();
382 return (cpu != first_cpu(cpu_sibling_map[cpu]));
383#endif
384 return 0;
385}
386
387
388/* finally, mediate access to a real hardware counter
389 by passing a "virtual" counter number to this macro,
390 along with your stagger setting. */
391#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger)))
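/*
 * Mapping example for the HT case (num_counters == 4): the even sibling
 * (stagger 0) sees virtual counters 0-3 as p4_counters[] entries 0-3
 * (BPU_0, MS_0, FLAME_0, IQ_4), while the odd sibling (stagger 1) sees the
 * same virtual counters 0-3 as entries 4-7 (BPU_2, MS_2, FLAME_2, IQ_5).
 */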
392
393static unsigned long reset_value[NUM_COUNTERS_NON_HT];
394
395
396static void p4_fill_in_addresses(struct op_msrs * const msrs)
397{
398 unsigned int i;
399 unsigned int addr, cccraddr, stag;
400
401 setup_num_counters();
402 stag = get_stagger();
403
404 /* initialize some registers */
405 for (i = 0; i < num_counters; ++i) {
406 msrs->counters[i].addr = 0;
407 }
408 for (i = 0; i < num_controls; ++i) {
409 msrs->controls[i].addr = 0;
410 }
411
412 /* the counter & cccr registers we pay attention to */
413 for (i = 0; i < num_counters; ++i) {
414 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
415 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
416 if (reserve_perfctr_nmi(addr)){
417 msrs->counters[i].addr = addr;
418 msrs->controls[i].addr = cccraddr;
419 }
420 }
421
422 /* 43 ESCR registers in three or four discontiguous groups */
423 for (addr = MSR_P4_BSU_ESCR0 + stag;
424 addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) {
425 if (reserve_evntsel_nmi(addr))
426 msrs->controls[i].addr = addr;
427 }
428
429 /* no IQ_ESCR0/1 on some models; we save BSU_ESCR0/1 a second time
430 * to avoid a special case in nmi_{save|restore}_registers() */
431 if (boot_cpu_data.x86_model >= 0x3) {
432 for (addr = MSR_P4_BSU_ESCR0 + stag;
433 addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) {
434 if (reserve_evntsel_nmi(addr))
435 msrs->controls[i].addr = addr;
436 }
437 } else {
438 for (addr = MSR_P4_IQ_ESCR0 + stag;
439 addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) {
440 if (reserve_evntsel_nmi(addr))
441 msrs->controls[i].addr = addr;
442 }
443 }
444
445 for (addr = MSR_P4_RAT_ESCR0 + stag;
446 addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) {
447 if (reserve_evntsel_nmi(addr))
448 msrs->controls[i].addr = addr;
449 }
450
451 for (addr = MSR_P4_MS_ESCR0 + stag;
452 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
453 if (reserve_evntsel_nmi(addr))
454 msrs->controls[i].addr = addr;
455 }
456
457 for (addr = MSR_P4_IX_ESCR0 + stag;
458 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
459 if (reserve_evntsel_nmi(addr))
460 msrs->controls[i].addr = addr;
461 }
462
463 /* there are 2 remaining non-contiguously located ESCRs */
464
465 if (num_counters == NUM_COUNTERS_NON_HT) {
466 /* standard non-HT CPUs handle both remaining ESCRs*/
467 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
468 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
469 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
470 msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
471
472 } else if (stag == 0) {
473 /* HT CPUs give the first remainder to the even thread, as
474 the 32nd control register */
475 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
476 msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
477
478 } else {
479 /* and two copies of the second to the odd thread,
480		   for the 22nd and 23rd control registers */
481 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) {
482 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
483 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
484 }
485 }
486}
487
488
489static void pmc_setup_one_p4_counter(unsigned int ctr)
490{
491 int i;
492 int const maxbind = 2;
493 unsigned int cccr = 0;
494 unsigned int escr = 0;
495 unsigned int high = 0;
496 unsigned int counter_bit;
497 struct p4_event_binding *ev = NULL;
498 unsigned int stag;
499
500 stag = get_stagger();
501
502 /* convert from counter *number* to counter *bit* */
503 counter_bit = 1 << VIRT_CTR(stag, ctr);
504
505 /* find our event binding structure. */
506 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
507 printk(KERN_ERR
508 "oprofile: P4 event code 0x%lx out of range\n",
509 counter_config[ctr].event);
510 return;
511 }
512
513 ev = &(p4_events[counter_config[ctr].event - 1]);
514
515 for (i = 0; i < maxbind; i++) {
516 if (ev->bindings[i].virt_counter & counter_bit) {
517
518 /* modify ESCR */
519 ESCR_READ(escr, high, ev, i);
520 ESCR_CLEAR(escr);
521 if (stag == 0) {
522 ESCR_SET_USR_0(escr, counter_config[ctr].user);
523 ESCR_SET_OS_0(escr, counter_config[ctr].kernel);
524 } else {
525 ESCR_SET_USR_1(escr, counter_config[ctr].user);
526 ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
527 }
528 ESCR_SET_EVENT_SELECT(escr, ev->event_select);
529 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
530 ESCR_WRITE(escr, high, ev, i);
531
532 /* modify CCCR */
533 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
534 CCCR_CLEAR(cccr);
535 CCCR_SET_REQUIRED_BITS(cccr);
536 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
537 if (stag == 0) {
538 CCCR_SET_PMI_OVF_0(cccr);
539 } else {
540 CCCR_SET_PMI_OVF_1(cccr);
541 }
542 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
543 return;
544 }
545 }
546
547 printk(KERN_ERR
548 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
549 counter_config[ctr].event, stag, ctr);
550}
551
552
553static void p4_setup_ctrs(struct op_msrs const * const msrs)
554{
555 unsigned int i;
556 unsigned int low, high;
557 unsigned int stag;
558
559 stag = get_stagger();
560
561 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
562 if (! MISC_PMC_ENABLED_P(low)) {
563 printk(KERN_ERR "oprofile: P4 PMC not available\n");
564 return;
565 }
566
567 /* clear the cccrs we will use */
568 for (i = 0 ; i < num_counters ; i++) {
569 if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
570 continue;
571 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
572 CCCR_CLEAR(low);
573 CCCR_SET_REQUIRED_BITS(low);
574 wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
575 }
576
577 /* clear all escrs (including those outside our concern) */
578 for (i = num_counters; i < num_controls; i++) {
579 if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
580 continue;
581 wrmsr(msrs->controls[i].addr, 0, 0);
582 }
583
584 /* setup all counters */
585 for (i = 0 ; i < num_counters ; ++i) {
586 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) {
587 reset_value[i] = counter_config[i].count;
588 pmc_setup_one_p4_counter(i);
589 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
590 } else {
591 reset_value[i] = 0;
592 }
593 }
594}
595
596
597static int p4_check_ctrs(struct pt_regs * const regs,
598 struct op_msrs const * const msrs)
599{
600 unsigned long ctr, low, high, stag, real;
601 int i;
602
603 stag = get_stagger();
604
605 for (i = 0; i < num_counters; ++i) {
606
607 if (!reset_value[i])
608 continue;
609
610 /*
611 * there is some eccentricity in the hardware which
612 * requires that we perform 2 extra corrections:
613 *
614 * - check both the CCCR:OVF flag for overflow and the
615 * counter high bit for un-flagged overflows.
616 *
617 * - write the counter back twice to ensure it gets
618 * updated properly.
619 *
620 * the former seems to be related to extra NMIs happening
621 * during the current NMI; the latter is reported as errata
622 * N15 in intel doc 249199-029, pentium 4 specification
623 * update, though their suggested work-around does not
624 * appear to solve the problem.
625 */
626
627 real = VIRT_CTR(stag, i);
628
629 CCCR_READ(low, high, real);
630 CTR_READ(ctr, high, real);
631 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
632 oprofile_add_sample(regs, i);
633 CTR_WRITE(reset_value[i], real);
634 CCCR_CLEAR_OVF(low);
635 CCCR_WRITE(low, high, real);
636 CTR_WRITE(reset_value[i], real);
637 }
638 }
639
640 /* P4 quirk: you have to re-unmask the apic vector */
641 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
642
643 /* See op_model_ppro.c */
644 return 1;
645}
646
647
648static void p4_start(struct op_msrs const * const msrs)
649{
650 unsigned int low, high, stag;
651 int i;
652
653 stag = get_stagger();
654
655 for (i = 0; i < num_counters; ++i) {
656 if (!reset_value[i])
657 continue;
658 CCCR_READ(low, high, VIRT_CTR(stag, i));
659 CCCR_SET_ENABLE(low);
660 CCCR_WRITE(low, high, VIRT_CTR(stag, i));
661 }
662}
663
664
665static void p4_stop(struct op_msrs const * const msrs)
666{
667 unsigned int low, high, stag;
668 int i;
669
670 stag = get_stagger();
671
672 for (i = 0; i < num_counters; ++i) {
673 if (!reset_value[i])
674 continue;
675 CCCR_READ(low, high, VIRT_CTR(stag, i));
676 CCCR_SET_DISABLE(low);
677 CCCR_WRITE(low, high, VIRT_CTR(stag, i));
678 }
679}
680
681static void p4_shutdown(struct op_msrs const * const msrs)
682{
683 int i;
684
685 for (i = 0 ; i < num_counters ; ++i) {
686 if (CTR_IS_RESERVED(msrs,i))
687 release_perfctr_nmi(msrs->counters[i].addr);
688 }
689	/* the first num_counters control registers hold the CCCR addresses,
690	 * which were reserved together with the counters by reserve_perfctr_nmi()
691	 * (hence the starting offset below); only the remaining event-select
692	 * MSRs need to be released here. */
693 for (i = num_counters ; i < num_controls ; ++i) {
694 if (CTRL_IS_RESERVED(msrs,i))
695 release_evntsel_nmi(msrs->controls[i].addr);
696 }
697}
698
699
700#ifdef CONFIG_SMP
701struct op_x86_model_spec const op_p4_ht2_spec = {
702 .num_counters = NUM_COUNTERS_HT2,
703 .num_controls = NUM_CONTROLS_HT2,
704 .fill_in_addresses = &p4_fill_in_addresses,
705 .setup_ctrs = &p4_setup_ctrs,
706 .check_ctrs = &p4_check_ctrs,
707 .start = &p4_start,
708 .stop = &p4_stop,
709 .shutdown = &p4_shutdown
710};
711#endif
712
713struct op_x86_model_spec const op_p4_spec = {
714 .num_counters = NUM_COUNTERS_NON_HT,
715 .num_controls = NUM_CONTROLS_NON_HT,
716 .fill_in_addresses = &p4_fill_in_addresses,
717 .setup_ctrs = &p4_setup_ctrs,
718 .check_ctrs = &p4_check_ctrs,
719 .start = &p4_start,
720 .stop = &p4_stop,
721 .shutdown = &p4_shutdown
722};
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
new file mode 100644
index 000000000000..c554f52cb808
--- /dev/null
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -0,0 +1,192 @@
1/**
2 * @file op_model_ppro.c
3 * pentium pro / P6 model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 */
12
13#include <linux/oprofile.h>
14#include <asm/ptrace.h>
15#include <asm/msr.h>
16#include <asm/apic.h>
17#include <asm/nmi.h>
18
19#include "op_x86_model.h"
20#include "op_counter.h"
21
22#define NUM_COUNTERS 2
23#define NUM_CONTROLS 2
24
25#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
26#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0)
27#define CTR_32BIT_WRITE(l,msrs,c) \
28 do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0);} while (0)
29#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
30
31#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
32#define CTRL_READ(l,h,msrs,c) do {rdmsr((msrs->controls[(c)].addr), (l), (h));} while (0)
33#define CTRL_WRITE(l,h,msrs,c) do {wrmsr((msrs->controls[(c)].addr), (l), (h));} while (0)
34#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
35#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
36#define CTRL_CLEAR(x) (x &= (1<<21))
37#define CTRL_SET_ENABLE(val) (val |= 1<<20)
38#define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16))
39#define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
40#define CTRL_SET_UM(val, m) (val |= (m << 8))
41#define CTRL_SET_EVENT(val, e) (val |= e)
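/* Note on the bit positions above (assuming the standard P6 PERFEVTSEL
 * layout): bits 0-7 select the event, bits 8-15 the unit mask, bit 16
 * counts user mode, bit 17 counts kernel mode, bit 20 enables the PMI on
 * overflow, and bit 22 actually starts/stops counting (the ACTIVE/INACTIVE
 * macros). */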
42
43static unsigned long reset_value[NUM_COUNTERS];
44
45static void ppro_fill_in_addresses(struct op_msrs * const msrs)
46{
47 int i;
48
49 for (i=0; i < NUM_COUNTERS; i++) {
50 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
51 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
52 else
53 msrs->counters[i].addr = 0;
54 }
55
56 for (i=0; i < NUM_CONTROLS; i++) {
57 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
58 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
59 else
60 msrs->controls[i].addr = 0;
61 }
62}
63
64
65static void ppro_setup_ctrs(struct op_msrs const * const msrs)
66{
67 unsigned int low, high;
68 int i;
69
70	/* clear the control (event select) registers */
71 for (i = 0 ; i < NUM_CONTROLS; ++i) {
72 if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
73 continue;
74 CTRL_READ(low, high, msrs, i);
75 CTRL_CLEAR(low);
76 CTRL_WRITE(low, high, msrs, i);
77 }
78
79 /* avoid a false detection of ctr overflows in NMI handler */
80 for (i = 0; i < NUM_COUNTERS; ++i) {
81 if (unlikely(!CTR_IS_RESERVED(msrs,i)))
82 continue;
83 CTR_32BIT_WRITE(1, msrs, i);
84 }
85
86 /* enable active counters */
87 for (i = 0; i < NUM_COUNTERS; ++i) {
88 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) {
89 reset_value[i] = counter_config[i].count;
90
91 CTR_32BIT_WRITE(counter_config[i].count, msrs, i);
92
93 CTRL_READ(low, high, msrs, i);
94 CTRL_CLEAR(low);
95 CTRL_SET_ENABLE(low);
96 CTRL_SET_USR(low, counter_config[i].user);
97 CTRL_SET_KERN(low, counter_config[i].kernel);
98 CTRL_SET_UM(low, counter_config[i].unit_mask);
99 CTRL_SET_EVENT(low, counter_config[i].event);
100 CTRL_WRITE(low, high, msrs, i);
101 } else {
102 reset_value[i] = 0;
103 }
104 }
105}
106
107
108static int ppro_check_ctrs(struct pt_regs * const regs,
109 struct op_msrs const * const msrs)
110{
111 unsigned int low, high;
112 int i;
113
114 for (i = 0 ; i < NUM_COUNTERS; ++i) {
115 if (!reset_value[i])
116 continue;
117 CTR_READ(low, high, msrs, i);
118 if (CTR_OVERFLOWED(low)) {
119 oprofile_add_sample(regs, i);
120 CTR_32BIT_WRITE(reset_value[i], msrs, i);
121 }
122 }
123
124	/* Only the P6-based Pentium M needs to re-unmask the apic vector, but
125	 * it doesn't hurt the other P6 variants */
126 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
127
128	/* We can't work out if we really handled an interrupt. We
129	 * might have caught a *second* counter just after it overflowed;
130	 * the interrupt for this counter then arrives, we don't find a
131	 * counter that has overflowed, and we would return 0 and get
132	 * dazed + confused. Instead we always assume we found an
133	 * overflow. This sucks.
134	 */
135 return 1;
136}
137
138
139static void ppro_start(struct op_msrs const * const msrs)
140{
141 unsigned int low,high;
142 int i;
143
144 for (i = 0; i < NUM_COUNTERS; ++i) {
145 if (reset_value[i]) {
146 CTRL_READ(low, high, msrs, i);
147 CTRL_SET_ACTIVE(low);
148 CTRL_WRITE(low, high, msrs, i);
149 }
150 }
151}
152
153
154static void ppro_stop(struct op_msrs const * const msrs)
155{
156 unsigned int low,high;
157 int i;
158
159 for (i = 0; i < NUM_COUNTERS; ++i) {
160 if (!reset_value[i])
161 continue;
162 CTRL_READ(low, high, msrs, i);
163 CTRL_SET_INACTIVE(low);
164 CTRL_WRITE(low, high, msrs, i);
165 }
166}
167
168static void ppro_shutdown(struct op_msrs const * const msrs)
169{
170 int i;
171
172 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
173 if (CTR_IS_RESERVED(msrs,i))
174 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
175 }
176 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
177 if (CTRL_IS_RESERVED(msrs,i))
178 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
179 }
180}
181
182
183struct op_x86_model_spec const op_ppro_spec = {
184 .num_counters = NUM_COUNTERS,
185 .num_controls = NUM_CONTROLS,
186 .fill_in_addresses = &ppro_fill_in_addresses,
187 .setup_ctrs = &ppro_setup_ctrs,
188 .check_ctrs = &ppro_check_ctrs,
189 .start = &ppro_start,
190 .stop = &ppro_stop,
191 .shutdown = &ppro_shutdown
192};
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
new file mode 100644
index 000000000000..abb1aa95b979
--- /dev/null
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -0,0 +1,51 @@
1/**
2 * @file op_x86_model.h
3 * interface to x86 model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author Graydon Hoare
9 */
10
11#ifndef OP_X86_MODEL_H
12#define OP_X86_MODEL_H
13
14struct op_saved_msr {
15 unsigned int high;
16 unsigned int low;
17};
18
19struct op_msr {
20 unsigned long addr;
21 struct op_saved_msr saved;
22};
23
24struct op_msrs {
25 struct op_msr * counters;
26 struct op_msr * controls;
27};
28
29struct pt_regs;
30
31/* The model vtable abstracts the differences between
32 * various x86 CPU models' perfctr support.
33 */
34struct op_x86_model_spec {
35 unsigned int const num_counters;
36 unsigned int const num_controls;
37 void (*fill_in_addresses)(struct op_msrs * const msrs);
38 void (*setup_ctrs)(struct op_msrs const * const msrs);
39 int (*check_ctrs)(struct pt_regs * const regs,
40 struct op_msrs const * const msrs);
41 void (*start)(struct op_msrs const * const msrs);
42 void (*stop)(struct op_msrs const * const msrs);
43 void (*shutdown)(struct op_msrs const * const msrs);
44};
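/* Typical call sequence (a sketch, assuming the nmi_int.c driver):
 * fill_in_addresses() reserves and records the MSRs, setup_ctrs()
 * programs them, start()/stop() toggle counting, the NMI handler calls
 * check_ctrs() on each performance-counter interrupt, and shutdown()
 * releases the reserved MSRs. */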
45
46extern struct op_x86_model_spec const op_ppro_spec;
47extern struct op_x86_model_spec const op_p4_spec;
48extern struct op_x86_model_spec const op_p4_ht2_spec;
49extern struct op_x86_model_spec const op_athlon_spec;
50
51#endif /* OP_X86_MODEL_H */
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
new file mode 100644
index 000000000000..c5c8e485fc44
--- /dev/null
+++ b/arch/x86/pci/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/pci/Makefile_32
3else
4include ${srctree}/arch/x86/pci/Makefile_64
5endif
diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32
new file mode 100644
index 000000000000..cdd6828b5abb
--- /dev/null
+++ b/arch/x86/pci/Makefile_32
@@ -0,0 +1,14 @@
1obj-y := i386.o init.o
2
3obj-$(CONFIG_PCI_BIOS) += pcbios.o
4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_32.o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o
6
7pci-y := fixup.o
8pci-$(CONFIG_ACPI) += acpi.o
9pci-y += legacy.o irq.o
10
11pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
12pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o
13
14obj-y += $(pci-y) common.o early.o
diff --git a/arch/x86/pci/Makefile_64 b/arch/x86/pci/Makefile_64
new file mode 100644
index 000000000000..7d8c467bf143
--- /dev/null
+++ b/arch/x86/pci/Makefile_64
@@ -0,0 +1,17 @@
1#
2# Makefile for X86_64 specific PCI routines
3#
4# Reuse the i386 PCI subsystem
5#
6EXTRA_CFLAGS += -Iarch/x86/pci
7
8obj-y := i386.o
9obj-$(CONFIG_PCI_DIRECT)+= direct.o
10obj-y += fixup.o init.o
11obj-$(CONFIG_ACPI) += acpi.o
12obj-y += legacy.o irq.o common.o early.o
13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_64.o direct.o mmconfig-shared.o
15
16obj-$(CONFIG_NUMA) += k8-bus_64.o
17
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
new file mode 100644
index 000000000000..bc8a44bddaa7
--- /dev/null
+++ b/arch/x86/pci/acpi.c
@@ -0,0 +1,90 @@
1#include <linux/pci.h>
2#include <linux/acpi.h>
3#include <linux/init.h>
4#include <linux/irq.h>
5#include <asm/numa.h>
6#include "pci.h"
7
8struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum)
9{
10 struct pci_bus *bus;
11 struct pci_sysdata *sd;
12 int pxm;
13
14 /* Allocate per-root-bus (not per bus) arch-specific data.
15 * TODO: leak; this memory is never freed.
16 * It's arguable whether it's worth the trouble to care.
17 */
18 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
19 if (!sd) {
20 printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum);
21 return NULL;
22 }
23
24 if (domain != 0) {
25 printk(KERN_WARNING "PCI: Multiple domains not supported\n");
26 kfree(sd);
27 return NULL;
28 }
29
30 sd->node = -1;
31
32 pxm = acpi_get_pxm(device->handle);
33#ifdef CONFIG_ACPI_NUMA
34 if (pxm >= 0)
35 sd->node = pxm_to_node(pxm);
36#endif
37
38 bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
39 if (!bus)
40 kfree(sd);
41
42#ifdef CONFIG_ACPI_NUMA
43 if (bus != NULL) {
44 if (pxm >= 0) {
45 printk("bus %d -> pxm %d -> node %d\n",
46 busnum, pxm, sd->node);
47 }
48 }
49#endif
50
51 return bus;
52}
53
54extern int pci_routeirq;
55static int __init pci_acpi_init(void)
56{
57 struct pci_dev *dev = NULL;
58
59 if (pcibios_scanned)
60 return 0;
61
62 if (acpi_noirq)
63 return 0;
64
65 printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n");
66 acpi_irq_penalty_init();
67 pcibios_scanned++;
68 pcibios_enable_irq = acpi_pci_irq_enable;
69 pcibios_disable_irq = acpi_pci_irq_disable;
70
71 if (pci_routeirq) {
72 /*
73 * PCI IRQ routing is set up by pci_enable_device(), but we
74 * also do it here in case there are still broken drivers that
75 * don't use pci_enable_device().
76 */
77 printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
78 for_each_pci_dev(dev)
79 acpi_pci_irq_enable(dev);
80 } else
81 printk(KERN_INFO "PCI: If a device doesn't work, try \"pci=routeirq\". If it helps, post a report\n");
82
83#ifdef CONFIG_X86_IO_APIC
84 if (acpi_ioapic)
85 print_IO_APIC();
86#endif
87
88 return 0;
89}
90subsys_initcall(pci_acpi_init);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
new file mode 100644
index 000000000000..07d5223442bf
--- /dev/null
+++ b/arch/x86/pci/common.c
@@ -0,0 +1,480 @@
1/*
2 * Low-Level PCI Support for PC
3 *
4 * (c) 1999--2000 Martin Mares <mj@ucw.cz>
5 */
6
7#include <linux/sched.h>
8#include <linux/pci.h>
9#include <linux/ioport.h>
10#include <linux/init.h>
11#include <linux/dmi.h>
12
13#include <asm/acpi.h>
14#include <asm/segment.h>
15#include <asm/io.h>
16#include <asm/smp.h>
17
18#include "pci.h"
19
20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
21 PCI_PROBE_MMCONF;
22
23static int pci_bf_sort;
24int pci_routeirq;
25int pcibios_last_bus = -1;
26unsigned long pirq_table_addr;
27struct pci_bus *pci_root_bus;
28struct pci_raw_ops *raw_pci_ops;
29
30static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value)
31{
32 return raw_pci_ops->read(0, bus->number, devfn, where, size, value);
33}
34
35static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 value)
36{
37 return raw_pci_ops->write(0, bus->number, devfn, where, size, value);
38}
39
40struct pci_ops pci_root_ops = {
41 .read = pci_read,
42 .write = pci_write,
43};
44
45/*
46 * legacy, numa, and acpi all want to call pcibios_scan_root
47 * from their initcalls. This flag ensures that only the first one to run actually does.
48 */
49int pcibios_scanned;
50
51/*
52 * This interrupt-safe spinlock protects all accesses to PCI
53 * configuration space.
54 */
55DEFINE_SPINLOCK(pci_config_lock);
56
57/*
58 * Several buggy motherboards address only 16 devices and mirror
59 * them to the next 16 IDs. We try to detect this `feature' on all
60 * primary buses (those containing host bridges as they are
61 * expected to be unique) and remove the ghost devices.
62 */
63
64static void __devinit pcibios_fixup_ghosts(struct pci_bus *b)
65{
66 struct list_head *ln, *mn;
67 struct pci_dev *d, *e;
68 int mirror = PCI_DEVFN(16,0);
69 int seen_host_bridge = 0;
70 int i;
71
72 DBG("PCI: Scanning for ghost devices on bus %d\n", b->number);
73 list_for_each(ln, &b->devices) {
74 d = pci_dev_b(ln);
75 if ((d->class >> 8) == PCI_CLASS_BRIDGE_HOST)
76 seen_host_bridge++;
77 for (mn=ln->next; mn != &b->devices; mn=mn->next) {
78 e = pci_dev_b(mn);
79 if (e->devfn != d->devfn + mirror ||
80 e->vendor != d->vendor ||
81 e->device != d->device ||
82 e->class != d->class)
83 continue;
84 for(i=0; i<PCI_NUM_RESOURCES; i++)
85 if (e->resource[i].start != d->resource[i].start ||
86 e->resource[i].end != d->resource[i].end ||
87 e->resource[i].flags != d->resource[i].flags)
88 continue;
89 break;
90 }
91 if (mn == &b->devices)
92 return;
93 }
94 if (!seen_host_bridge)
95 return;
96 printk(KERN_WARNING "PCI: Ignoring ghost devices on bus %02x\n", b->number);
97
98 ln = &b->devices;
99 while (ln->next != &b->devices) {
100 d = pci_dev_b(ln->next);
101 if (d->devfn >= mirror) {
102 list_del(&d->global_list);
103 list_del(&d->bus_list);
104 kfree(d);
105 } else
106 ln = ln->next;
107 }
108}
109
110/*
111 * Called after each bus is probed, but before its children
112 * are examined.
113 */
114
115void __devinit pcibios_fixup_bus(struct pci_bus *b)
116{
117 pcibios_fixup_ghosts(b);
118 pci_read_bridge_bases(b);
119}
120
121/*
122 * Only use DMI information to set this if nothing was passed
123 * on the kernel command line (which was parsed earlier).
124 */
125
126static int __devinit set_bf_sort(const struct dmi_system_id *d)
127{
128 if (pci_bf_sort == pci_bf_sort_default) {
129 pci_bf_sort = pci_dmi_bf;
130 printk(KERN_INFO "PCI: %s detected, enabling pci=bfsort.\n", d->ident);
131 }
132 return 0;
133}
134
135/*
136 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
137 */
138#ifdef __i386__
139static int __devinit assign_all_busses(const struct dmi_system_id *d)
140{
141 pci_probe |= PCI_ASSIGN_ALL_BUSSES;
142 printk(KERN_INFO "%s detected: enabling PCI bus# renumbering"
143 " (pci=assign-busses)\n", d->ident);
144 return 0;
145}
146#endif
147
148static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
149#ifdef __i386__
150/*
151 * Laptops which need pci=assign-busses to see Cardbus cards
152 */
153 {
154 .callback = assign_all_busses,
155 .ident = "Samsung X20 Laptop",
156 .matches = {
157 DMI_MATCH(DMI_SYS_VENDOR, "Samsung Electronics"),
158 DMI_MATCH(DMI_PRODUCT_NAME, "SX20S"),
159 },
160 },
161#endif /* __i386__ */
162 {
163 .callback = set_bf_sort,
164 .ident = "Dell PowerEdge 1950",
165 .matches = {
166 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
167 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1950"),
168 },
169 },
170 {
171 .callback = set_bf_sort,
172 .ident = "Dell PowerEdge 1955",
173 .matches = {
174 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
175 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1955"),
176 },
177 },
178 {
179 .callback = set_bf_sort,
180 .ident = "Dell PowerEdge 2900",
181 .matches = {
182 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
183 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2900"),
184 },
185 },
186 {
187 .callback = set_bf_sort,
188 .ident = "Dell PowerEdge 2950",
189 .matches = {
190 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
191 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2950"),
192 },
193 },
194 {
195 .callback = set_bf_sort,
196 .ident = "Dell PowerEdge R900",
197 .matches = {
198 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
199 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R900"),
200 },
201 },
202 {
203 .callback = set_bf_sort,
204 .ident = "HP ProLiant BL20p G3",
205 .matches = {
206 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
207 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL20p G3"),
208 },
209 },
210 {
211 .callback = set_bf_sort,
212 .ident = "HP ProLiant BL20p G4",
213 .matches = {
214 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
215 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL20p G4"),
216 },
217 },
218 {
219 .callback = set_bf_sort,
220 .ident = "HP ProLiant BL30p G1",
221 .matches = {
222 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
223 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL30p G1"),
224 },
225 },
226 {
227 .callback = set_bf_sort,
228 .ident = "HP ProLiant BL25p G1",
229 .matches = {
230 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
231 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL25p G1"),
232 },
233 },
234 {
235 .callback = set_bf_sort,
236 .ident = "HP ProLiant BL35p G1",
237 .matches = {
238 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
239 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL35p G1"),
240 },
241 },
242 {
243 .callback = set_bf_sort,
244 .ident = "HP ProLiant BL45p G1",
245 .matches = {
246 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
247 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL45p G1"),
248 },
249 },
250 {
251 .callback = set_bf_sort,
252 .ident = "HP ProLiant BL45p G2",
253 .matches = {
254 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
255 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL45p G2"),
256 },
257 },
258 {
259 .callback = set_bf_sort,
260 .ident = "HP ProLiant BL460c G1",
261 .matches = {
262 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
263 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL460c G1"),
264 },
265 },
266 {
267 .callback = set_bf_sort,
268 .ident = "HP ProLiant BL465c G1",
269 .matches = {
270 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
271 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL465c G1"),
272 },
273 },
274 {
275 .callback = set_bf_sort,
276 .ident = "HP ProLiant BL480c G1",
277 .matches = {
278 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
279 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL480c G1"),
280 },
281 },
282 {
283 .callback = set_bf_sort,
284 .ident = "HP ProLiant BL685c G1",
285 .matches = {
286 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
287 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL685c G1"),
288 },
289 },
290 {}
291};
292
293struct pci_bus * __devinit pcibios_scan_root(int busnum)
294{
295 struct pci_bus *bus = NULL;
296 struct pci_sysdata *sd;
297
298 dmi_check_system(pciprobe_dmi_table);
299
300 while ((bus = pci_find_next_bus(bus)) != NULL) {
301 if (bus->number == busnum) {
302 /* Already scanned */
303 return bus;
304 }
305 }
306
307 /* Allocate per-root-bus (not per bus) arch-specific data.
308 * TODO: leak; this memory is never freed.
309 * It's arguable whether it's worth the trouble to care.
310 */
311 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
312 if (!sd) {
313 printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum);
314 return NULL;
315 }
316
317 printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
318
319 return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
320}
321
322extern u8 pci_cache_line_size;
323
324static int __init pcibios_init(void)
325{
326 struct cpuinfo_x86 *c = &boot_cpu_data;
327
328 if (!raw_pci_ops) {
329 printk(KERN_WARNING "PCI: System does not support PCI\n");
330 return 0;
331 }
332
333 /*
334 * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8
335 * and P4. It's also good for 386/486s (which actually have 16)
336 * as quite a few PCI devices do not support smaller values.
337 */
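	/* the value is in units of 32-bit dwords, as stored in the PCI
	 * cache line size config register -- hence the ">> 2" below */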
338 pci_cache_line_size = 32 >> 2;
339 if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
340 pci_cache_line_size = 64 >> 2; /* K7 & K8 */
341 else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
342 pci_cache_line_size = 128 >> 2; /* P4 */
343
344 pcibios_resource_survey();
345
346 if (pci_bf_sort >= pci_force_bf)
347 pci_sort_breadthfirst();
348#ifdef CONFIG_PCI_BIOS
349 if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT))
350 pcibios_sort();
351#endif
352 return 0;
353}
354
355subsys_initcall(pcibios_init);
356
357char * __devinit pcibios_setup(char *str)
358{
359 if (!strcmp(str, "off")) {
360 pci_probe = 0;
361 return NULL;
362 } else if (!strcmp(str, "bfsort")) {
363 pci_bf_sort = pci_force_bf;
364 return NULL;
365 } else if (!strcmp(str, "nobfsort")) {
366 pci_bf_sort = pci_force_nobf;
367 return NULL;
368 }
369#ifdef CONFIG_PCI_BIOS
370 else if (!strcmp(str, "bios")) {
371 pci_probe = PCI_PROBE_BIOS;
372 return NULL;
373 } else if (!strcmp(str, "nobios")) {
374 pci_probe &= ~PCI_PROBE_BIOS;
375 return NULL;
376 } else if (!strcmp(str, "nosort")) {
377 pci_probe |= PCI_NO_SORT;
378 return NULL;
379 } else if (!strcmp(str, "biosirq")) {
380 pci_probe |= PCI_BIOS_IRQ_SCAN;
381 return NULL;
382 } else if (!strncmp(str, "pirqaddr=", 9)) {
383 pirq_table_addr = simple_strtoul(str+9, NULL, 0);
384 return NULL;
385 }
386#endif
387#ifdef CONFIG_PCI_DIRECT
388 else if (!strcmp(str, "conf1")) {
389 pci_probe = PCI_PROBE_CONF1 | PCI_NO_CHECKS;
390 return NULL;
391 }
392 else if (!strcmp(str, "conf2")) {
393 pci_probe = PCI_PROBE_CONF2 | PCI_NO_CHECKS;
394 return NULL;
395 }
396#endif
397#ifdef CONFIG_PCI_MMCONFIG
398 else if (!strcmp(str, "nommconf")) {
399 pci_probe &= ~PCI_PROBE_MMCONF;
400 return NULL;
401 }
402#endif
403 else if (!strcmp(str, "noacpi")) {
404 acpi_noirq_set();
405 return NULL;
406 }
407 else if (!strcmp(str, "noearly")) {
408 pci_probe |= PCI_PROBE_NOEARLY;
409 return NULL;
410 }
411#ifndef CONFIG_X86_VISWS
412 else if (!strcmp(str, "usepirqmask")) {
413 pci_probe |= PCI_USE_PIRQ_MASK;
414 return NULL;
415 } else if (!strncmp(str, "irqmask=", 8)) {
416 pcibios_irq_mask = simple_strtol(str+8, NULL, 0);
417 return NULL;
418 } else if (!strncmp(str, "lastbus=", 8)) {
419 pcibios_last_bus = simple_strtol(str+8, NULL, 0);
420 return NULL;
421 }
422#endif
423 else if (!strcmp(str, "rom")) {
424 pci_probe |= PCI_ASSIGN_ROMS;
425 return NULL;
426 } else if (!strcmp(str, "assign-busses")) {
427 pci_probe |= PCI_ASSIGN_ALL_BUSSES;
428 return NULL;
429 } else if (!strcmp(str, "routeirq")) {
430 pci_routeirq = 1;
431 return NULL;
432 }
433 return str;
434}
435
436unsigned int pcibios_assign_all_busses(void)
437{
438 return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0;
439}
440
441int pcibios_enable_device(struct pci_dev *dev, int mask)
442{
443 int err;
444
445 if ((err = pcibios_enable_resources(dev, mask)) < 0)
446 return err;
447
448 if (!dev->msi_enabled)
449 return pcibios_enable_irq(dev);
450 return 0;
451}
452
453void pcibios_disable_device (struct pci_dev *dev)
454{
455 if (!dev->msi_enabled && pcibios_disable_irq)
456 pcibios_disable_irq(dev);
457}
458
459struct pci_bus *pci_scan_bus_with_sysdata(int busno)
460{
461 struct pci_bus *bus = NULL;
462 struct pci_sysdata *sd;
463
464 /*
465 * Allocate per-root-bus (not per bus) arch-specific data.
466 * TODO: leak; this memory is never freed.
467 * It's arguable whether it's worth the trouble to care.
468 */
469 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
470 if (!sd) {
471 printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busno);
472 return NULL;
473 }
474 sd->node = -1;
475 bus = pci_scan_bus(busno, &pci_root_ops, sd);
476 if (!bus)
477 kfree(sd);
478
479 return bus;
480}
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
new file mode 100644
index 000000000000..431c9a51b157
--- /dev/null
+++ b/arch/x86/pci/direct.c
@@ -0,0 +1,302 @@
1/*
2 * direct.c - Low-level direct PCI config space access
3 */
4
5#include <linux/pci.h>
6#include <linux/init.h>
7#include <linux/dmi.h>
8#include "pci.h"
9
10/*
11 * Functions for accessing PCI configuration space with type 1 accesses
12 */
13
14#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
15 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
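/* Example encoding: bus 0, device 1, function 0 (devfn 0x08), register 0x10
 * yields 0x80000000 | (0 << 16) | (0x08 << 8) | 0x10 == 0x80000810, which is
 * written to port 0xCF8 before the data access on 0xCFC. */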
16
17int pci_conf1_read(unsigned int seg, unsigned int bus,
18 unsigned int devfn, int reg, int len, u32 *value)
19{
20 unsigned long flags;
21
22 if ((bus > 255) || (devfn > 255) || (reg > 255)) {
23 *value = -1;
24 return -EINVAL;
25 }
26
27 spin_lock_irqsave(&pci_config_lock, flags);
28
29 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
30
31 switch (len) {
32 case 1:
33 *value = inb(0xCFC + (reg & 3));
34 break;
35 case 2:
36 *value = inw(0xCFC + (reg & 2));
37 break;
38 case 4:
39 *value = inl(0xCFC);
40 break;
41 }
42
43 spin_unlock_irqrestore(&pci_config_lock, flags);
44
45 return 0;
46}
47
48int pci_conf1_write(unsigned int seg, unsigned int bus,
49 unsigned int devfn, int reg, int len, u32 value)
50{
51 unsigned long flags;
52
53 if ((bus > 255) || (devfn > 255) || (reg > 255))
54 return -EINVAL;
55
56 spin_lock_irqsave(&pci_config_lock, flags);
57
58 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
59
60 switch (len) {
61 case 1:
62 outb((u8)value, 0xCFC + (reg & 3));
63 break;
64 case 2:
65 outw((u16)value, 0xCFC + (reg & 2));
66 break;
67 case 4:
68 outl((u32)value, 0xCFC);
69 break;
70 }
71
72 spin_unlock_irqrestore(&pci_config_lock, flags);
73
74 return 0;
75}
76
77#undef PCI_CONF1_ADDRESS
78
79struct pci_raw_ops pci_direct_conf1 = {
80 .read = pci_conf1_read,
81 .write = pci_conf1_write,
82};
83
84
85/*
86 * Functions for accessing PCI configuration space with type 2 accesses
87 */
88
89#define PCI_CONF2_ADDRESS(dev, reg) (u16)(0xC000 | (dev << 8) | reg)
90
91static int pci_conf2_read(unsigned int seg, unsigned int bus,
92 unsigned int devfn, int reg, int len, u32 *value)
93{
94 unsigned long flags;
95 int dev, fn;
96
97 if ((bus > 255) || (devfn > 255) || (reg > 255)) {
98 *value = -1;
99 return -EINVAL;
100 }
101
102 dev = PCI_SLOT(devfn);
103 fn = PCI_FUNC(devfn);
104
105 if (dev & 0x10)
106 return PCIBIOS_DEVICE_NOT_FOUND;
107
108 spin_lock_irqsave(&pci_config_lock, flags);
109
110 outb((u8)(0xF0 | (fn << 1)), 0xCF8);
111 outb((u8)bus, 0xCFA);
112
113 switch (len) {
114 case 1:
115 *value = inb(PCI_CONF2_ADDRESS(dev, reg));
116 break;
117 case 2:
118 *value = inw(PCI_CONF2_ADDRESS(dev, reg));
119 break;
120 case 4:
121 *value = inl(PCI_CONF2_ADDRESS(dev, reg));
122 break;
123 }
124
125 outb(0, 0xCF8);
126
127 spin_unlock_irqrestore(&pci_config_lock, flags);
128
129 return 0;
130}
131
132static int pci_conf2_write(unsigned int seg, unsigned int bus,
133 unsigned int devfn, int reg, int len, u32 value)
134{
135 unsigned long flags;
136 int dev, fn;
137
138 if ((bus > 255) || (devfn > 255) || (reg > 255))
139 return -EINVAL;
140
141 dev = PCI_SLOT(devfn);
142 fn = PCI_FUNC(devfn);
143
144 if (dev & 0x10)
145 return PCIBIOS_DEVICE_NOT_FOUND;
146
147 spin_lock_irqsave(&pci_config_lock, flags);
148
149 outb((u8)(0xF0 | (fn << 1)), 0xCF8);
150 outb((u8)bus, 0xCFA);
151
152 switch (len) {
153 case 1:
154 outb((u8)value, PCI_CONF2_ADDRESS(dev, reg));
155 break;
156 case 2:
157 outw((u16)value, PCI_CONF2_ADDRESS(dev, reg));
158 break;
159 case 4:
160 outl((u32)value, PCI_CONF2_ADDRESS(dev, reg));
161 break;
162 }
163
164 outb(0, 0xCF8);
165
166 spin_unlock_irqrestore(&pci_config_lock, flags);
167
168 return 0;
169}
170
171#undef PCI_CONF2_ADDRESS
172
173static struct pci_raw_ops pci_direct_conf2 = {
174 .read = pci_conf2_read,
175 .write = pci_conf2_write,
176};
177
178
179/*
180 * Before we decide to use direct hardware access mechanisms, we try to do some
181 * trivial checks to ensure it at least _seems_ to be working -- we just test
182 * whether bus 00 contains a host bridge (this is similar to checking
183 * techniques used in XFree86, but ours should be more reliable since we
184 * attempt to make use of direct access hints provided by the PCI BIOS).
185 *
186 * This should be close to trivial, but it isn't, because there are buggy
187 * chipsets (yes, you guessed it, by Intel and Compaq) that have no class ID.
188 */
189static int __init pci_sanity_check(struct pci_raw_ops *o)
190{
191 u32 x = 0;
192 int devfn;
193
194 if (pci_probe & PCI_NO_CHECKS)
195 return 1;
196 /* Assume Type 1 works for newer systems.
197 This handles machines that don't have anything on PCI Bus 0. */
198 if (dmi_get_year(DMI_BIOS_DATE) >= 2001)
199 return 1;
200
201 for (devfn = 0; devfn < 0x100; devfn++) {
202 if (o->read(0, 0, devfn, PCI_CLASS_DEVICE, 2, &x))
203 continue;
204 if (x == PCI_CLASS_BRIDGE_HOST || x == PCI_CLASS_DISPLAY_VGA)
205 return 1;
206
207 if (o->read(0, 0, devfn, PCI_VENDOR_ID, 2, &x))
208 continue;
209 if (x == PCI_VENDOR_ID_INTEL || x == PCI_VENDOR_ID_COMPAQ)
210 return 1;
211 }
212
213 DBG(KERN_WARNING "PCI: Sanity check failed\n");
214 return 0;
215}
216
217static int __init pci_check_type1(void)
218{
219 unsigned long flags;
220 unsigned int tmp;
221 int works = 0;
222
223 local_irq_save(flags);
224
225 outb(0x01, 0xCFB);
226 tmp = inl(0xCF8);
227 outl(0x80000000, 0xCF8);
228 if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) {
229 works = 1;
230 }
231 outl(tmp, 0xCF8);
232 local_irq_restore(flags);
233
234 return works;
235}
236
237static int __init pci_check_type2(void)
238{
239 unsigned long flags;
240 int works = 0;
241
242 local_irq_save(flags);
243
244 outb(0x00, 0xCFB);
245 outb(0x00, 0xCF8);
246 outb(0x00, 0xCFA);
247 if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 &&
248 pci_sanity_check(&pci_direct_conf2)) {
249 works = 1;
250 }
251
252 local_irq_restore(flags);
253
254 return works;
255}
256
257void __init pci_direct_init(int type)
258{
259 if (type == 0)
260 return;
261 printk(KERN_INFO "PCI: Using configuration type %d\n", type);
262 if (type == 1)
263 raw_pci_ops = &pci_direct_conf1;
264 else
265 raw_pci_ops = &pci_direct_conf2;
266}
267
268int __init pci_direct_probe(void)
269{
270 struct resource *region, *region2;
271
272 if ((pci_probe & PCI_PROBE_CONF1) == 0)
273 goto type2;
274 region = request_region(0xCF8, 8, "PCI conf1");
275 if (!region)
276 goto type2;
277
278 if (pci_check_type1())
279 return 1;
280 release_resource(region);
281
282 type2:
283 if ((pci_probe & PCI_PROBE_CONF2) == 0)
284 return 0;
285 region = request_region(0xCF8, 4, "PCI conf2");
286 if (!region)
287 return 0;
288 region2 = request_region(0xC000, 0x1000, "PCI conf2");
289 if (!region2)
290 goto fail2;
291
292 if (pci_check_type2()) {
293 printk(KERN_INFO "PCI: Using configuration type 2\n");
294 raw_pci_ops = &pci_direct_conf2;
295 return 2;
296 }
297
298 release_resource(region2);
299 fail2:
300 release_resource(region);
301 return 0;
302}
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
new file mode 100644
index 000000000000..42df4b6606df
--- /dev/null
+++ b/arch/x86/pci/early.c
@@ -0,0 +1,59 @@
1#include <linux/kernel.h>
2#include <linux/pci.h>
3#include <asm/pci-direct.h>
4#include <asm/io.h>
5#include "pci.h"
6
7/* Direct PCI access. This is used for PCI accesses in early boot before
8 the PCI subsystem works. */
9
10#define PDprintk(x...)
11
12u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
13{
14 u32 v;
15 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
16 v = inl(0xcfc);
17 if (v != 0xffffffff)
18 PDprintk("%x reading 4 from %x: %x\n", slot, offset, v);
19 return v;
20}
21
22u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
23{
24 u8 v;
25 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
26 v = inb(0xcfc + (offset&3));
27 PDprintk("%x reading 1 from %x: %x\n", slot, offset, v);
28 return v;
29}
30
31u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
32{
33 u16 v;
34 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
35 v = inw(0xcfc + (offset&2));
36 PDprintk("%x reading 2 from %x: %x\n", slot, offset, v);
37 return v;
38}
39
40void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
41 u32 val)
42{
43 PDprintk("%x writing to %x: %x\n", slot, offset, val);
44 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
45 outl(val, 0xcfc);
46}
47
48void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc);
53}
54
55int early_pci_allowed(void)
56{
57 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
58 PCI_PROBE_CONF1;
59}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
new file mode 100644
index 000000000000..c82cbf4c7226
--- /dev/null
+++ b/arch/x86/pci/fixup.c
@@ -0,0 +1,446 @@
1/*
2 * Exceptions for specific devices. Usually work-arounds for fatal design flaws.
3 */
4
5#include <linux/delay.h>
6#include <linux/dmi.h>
7#include <linux/pci.h>
8#include <linux/init.h>
9#include "pci.h"
10
11
12static void __devinit pci_fixup_i450nx(struct pci_dev *d)
13{
14 /*
15 * i450NX -- Find and scan all secondary buses on all PXB's.
16 */
17 int pxb, reg;
18 u8 busno, suba, subb;
19
20 printk(KERN_WARNING "PCI: Searching for i450NX host bridges on %s\n", pci_name(d));
21 reg = 0xd0;
22 for(pxb=0; pxb<2; pxb++) {
23 pci_read_config_byte(d, reg++, &busno);
24 pci_read_config_byte(d, reg++, &suba);
25 pci_read_config_byte(d, reg++, &subb);
26 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
27 if (busno)
28 pci_scan_bus_with_sysdata(busno); /* Bus A */
29 if (suba < subb)
30 pci_scan_bus_with_sysdata(suba+1); /* Bus B */
31 }
32 pcibios_last_bus = -1;
33}
34DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
35
36static void __devinit pci_fixup_i450gx(struct pci_dev *d)
37{
38 /*
39 * i450GX and i450KX -- Find and scan all secondary buses.
40 * (called separately for each PCI bridge found)
41 */
42 u8 busno;
43 pci_read_config_byte(d, 0x4a, &busno);
44 printk(KERN_INFO "PCI: i440KX/GX host bridge %s: secondary bus %02x\n", pci_name(d), busno);
45 pci_scan_bus_with_sysdata(busno);
46 pcibios_last_bus = -1;
47}
48DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx);
49
50static void __devinit pci_fixup_umc_ide(struct pci_dev *d)
51{
52 /*
53 * UM8886BF IDE controller sets region type bits incorrectly,
54	 * so they look like memory despite being I/O.
55 */
56 int i;
57
58 printk(KERN_WARNING "PCI: Fixing base address flags for device %s\n", pci_name(d));
59 for(i=0; i<4; i++)
60 d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO;
61}
62DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide);
63
64static void __devinit pci_fixup_ncr53c810(struct pci_dev *d)
65{
66 /*
67 * NCR 53C810 returns class code 0 (at least on some systems).
68 * Fix class to be PCI_CLASS_STORAGE_SCSI
69 */
70 if (!d->class) {
71 printk(KERN_WARNING "PCI: fixing NCR 53C810 class code for %s\n", pci_name(d));
72 d->class = PCI_CLASS_STORAGE_SCSI << 8;
73 }
74}
75DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810);
76
77static void __devinit pci_fixup_latency(struct pci_dev *d)
78{
79 /*
80 * SiS 5597 and 5598 chipsets require latency timer set to
81 * at most 32 to avoid lockups.
82 */
83 DBG("PCI: Setting max latency to 32\n");
84 pcibios_max_latency = 32;
85}
86DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5597, pci_fixup_latency);
87DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5598, pci_fixup_latency);
88
89static void __devinit pci_fixup_piix4_acpi(struct pci_dev *d)
90{
91 /*
92 * PIIX4 ACPI device: hardwired IRQ9
93 */
94 d->irq = 9;
95}
96DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, pci_fixup_piix4_acpi);
97
98/*
99 * Addresses problems with the memory write queue timer in certain VIA
100 * Northbridges. This bugfix is per VIA's specifications, except for the
101 * KL133/KM133: clearing bit 5 on those Northbridges seems to trigger a
102 * bug in their integrated ProSavage video card, which causes screen
103 * corruption. We only clear bits 6 and 7 for that chipset, until VIA
104 * can provide us with definitive information on why the screen
105 * corruption occurs and what exactly those bits do.
106 *
107 * VIA 8363,8622,8361 Northbridges:
108 * - bits 5, 6, 7 at offset 0x55 need to be turned off
109 * VIA 8367 (KT266x) Northbridges:
110 * - bits 5, 6, 7 at offset 0x95 need to be turned off
111 * VIA 8363 rev 0x81/0x84 (KL133/KM133) Northbridges:
112 * - bits 6, 7 at offset 0x55 need to be turned off
113 */
114
115#define VIA_8363_KL133_REVISION_ID 0x81
116#define VIA_8363_KM133_REVISION_ID 0x84
117
118static void pci_fixup_via_northbridge_bug(struct pci_dev *d)
119{
120 u8 v;
121 int where = 0x55;
122 int mask = 0x1f; /* clear bits 5, 6, 7 by default */
123
124 if (d->device == PCI_DEVICE_ID_VIA_8367_0) {
125		/* fix PCI bus latency issues caused by an NB BIOS error:
126		   it appears the bug free^Wreduced KT266x BIOS forces the
127		   NB latency to zero */
128 pci_write_config_byte(d, PCI_LATENCY_TIMER, 0);
129
130 where = 0x95; /* the memory write queue timer register is
131 different for the KT266x's: 0x95 not 0x55 */
132 } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 &&
133 (d->revision == VIA_8363_KL133_REVISION_ID ||
134 d->revision == VIA_8363_KM133_REVISION_ID)) {
135 mask = 0x3f; /* clear only bits 6 and 7; clearing bit 5
136 causes screen corruption on the KL133/KM133 */
137 }
138
139 pci_read_config_byte(d, where, &v);
140 if (v & ~mask) {
141 printk(KERN_WARNING "Disabling VIA memory write queue (PCI ID %04x, rev %02x): [%02x] %02x & %02x -> %02x\n", \
142 d->device, d->revision, where, v, mask, v & mask);
143 v &= mask;
144 pci_write_config_byte(d, where, v);
145 }
146}
147DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8363_0, pci_fixup_via_northbridge_bug);
148DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8622, pci_fixup_via_northbridge_bug);
149DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8361, pci_fixup_via_northbridge_bug);
150DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_via_northbridge_bug);
151DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8363_0, pci_fixup_via_northbridge_bug);
152DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8622, pci_fixup_via_northbridge_bug);
153DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8361, pci_fixup_via_northbridge_bug);
154DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_via_northbridge_bug);
155
156/*
157 * For some reason, Intel decided that certain parts of their
158 * 815, 845 and some other chipsets must look like PCI-to-PCI bridges
159 * while they are obviously not. The 82801 family (AA, AB, BAM/CAM,
160 * BA/CA/DB and E) PCI bridges are actually HUB-to-PCI ones, according
161 * to Intel terminology. These devices do forward all addresses from
162 * the system to the PCI bus no matter what their window settings are, so they
163 * are "transparent" (or subtractive decoding) from the programmer's point of view.
164 */
165static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev)
166{
167 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI &&
168 (dev->device & 0xff00) == 0x2400)
169 dev->transparent = 1;
170}
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixup_transparent_bridge);
172
173/*
174 * Fixup for C1 Halt Disconnect problem on nForce2 systems.
175 *
176 * From information provided by "Allen Martin" <AMartin@nvidia.com>:
177 *
178 * A hang is caused when the CPU generates a very fast CONNECT/HALT cycle
179 * sequence. Workaround is to set the SYSTEM_IDLE_TIMEOUT to 80 ns.
180 * This allows the state-machine and timer to return to a proper state within
181 * 80 ns of the CONNECT and probe appearing together. Since the CPU will not
182 * issue another HALT within 80 ns of the initial HALT, the failure condition
183 * is avoided.
184 */
185static void pci_fixup_nforce2(struct pci_dev *dev)
186{
187 u32 val;
188
189 /*
190 * Chip Old value New value
191 * C17 0x1F0FFF01 0x1F01FF01
192 * C18D 0x9F0FFF01 0x9F01FF01
193 *
194 * Northbridge chip version may be determined by
195 * reading the PCI revision ID (0xC1 or greater is C18D).
196 */
197 pci_read_config_dword(dev, 0x6c, &val);
198
199 /*
200 * Apply fixup if needed, but don't touch disconnect state
201 */
202 if ((val & 0x00FF0000) != 0x00010000) {
203 printk(KERN_WARNING "PCI: nForce2 C1 Halt Disconnect fixup\n");
204 pci_write_config_dword(dev, 0x6c, (val & 0xFF00FFFF) | 0x00010000);
205 }
206}
207DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, pci_fixup_nforce2);
208DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, pci_fixup_nforce2);
209
210/* Max PCI Express root ports */
211#define MAX_PCIEROOT 6
212static int quirk_aspm_offset[MAX_PCIEROOT << 3];
213
214#define GET_INDEX(a, b) ((((a) - PCI_DEVICE_ID_INTEL_MCH_PA) << 3) + ((b) & 7))
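/* GET_INDEX gives each of the MAX_PCIEROOT root ports eight slots: e.g. for
 * the root port with device ID PCI_DEVICE_ID_INTEL_MCH_PA + 1 and a child at
 * function 2, the index is (1 << 3) + 2 == 10. */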
215
216static int quirk_pcie_aspm_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value)
217{
218 return raw_pci_ops->read(0, bus->number, devfn, where, size, value);
219}
220
221/*
222 * Replace the original pci bus ops for write with a new one that will filter
223 * the request to ensure ASPM cannot be enabled.
224 */
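/* A sketch of the intent, assuming the standard PCI Express capability
 * layout: the saved offset points at the Link Control register, whose two
 * low bits form the ASPM control field, so masking the written value with
 * 0xfffffffc keeps L0s/L1 disabled. */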
225static int quirk_pcie_aspm_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 value)
226{
227 u8 offset;
228
229 offset = quirk_aspm_offset[GET_INDEX(bus->self->device, devfn)];
230
231 if ((offset) && (where == offset))
232 value = value & 0xfffffffc;
233
234 return raw_pci_ops->write(0, bus->number, devfn, where, size, value);
235}
236
237static struct pci_ops quirk_pcie_aspm_ops = {
238 .read = quirk_pcie_aspm_read,
239 .write = quirk_pcie_aspm_write,
240};
241
242/*
243 * Prevents PCI Express ASPM (Active State Power Management) being enabled.
244 *
245 * Save the register offset, where the ASPM control bits are located,
246 * for each PCI Express device that is in the device list of
247 * the root port in an array for fast indexing. Replace the bus ops
248 * with the modified one.
249 */
250static void pcie_rootport_aspm_quirk(struct pci_dev *pdev)
251{
252 int cap_base, i;
253 struct pci_bus *pbus;
254 struct pci_dev *dev;
255
256 if ((pbus = pdev->subordinate) == NULL)
257 return;
258
259 /*
260 * Check if the DID of pdev matches one of the six root ports. This
261 * check is needed in the case this function is called directly by the
262 * hot-plug driver.
263 */
264 if ((pdev->device < PCI_DEVICE_ID_INTEL_MCH_PA) ||
265 (pdev->device > PCI_DEVICE_ID_INTEL_MCH_PC1))
266 return;
267
268 if (list_empty(&pbus->devices)) {
269 /*
270 * If no device is attached to the root port at power-up or
271 * after hot-remove, the pbus->devices is empty and this code
272 * will set the offsets to zero and the bus ops to parent's bus
273 * ops, which is unmodified.
274 */
275 for (i= GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i)
276 quirk_aspm_offset[i] = 0;
277
278 pbus->ops = pbus->parent->ops;
279 } else {
280 /*
281 * If devices are attached to the root port at power-up or
282 * after hot-add, the code loops through the device list of
283 * each root port to save the register offsets and replace the
284 * bus ops.
285 */
286 list_for_each_entry(dev, &pbus->devices, bus_list) {
287 /* There are 0 to 8 devices attached to this bus */
288 cap_base = pci_find_capability(dev, PCI_CAP_ID_EXP);
289 quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)]= cap_base + 0x10;
290 }
291 pbus->ops = &quirk_pcie_aspm_ops;
292 }
293}
294DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk );
295DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk );
296DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk );
297DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk );
298DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk );
299DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk );
300
301/*
302 * Fixup to mark boot BIOS video selected by BIOS before it changes
303 *
304 * From information provided by "Jon Smirl" <jonsmirl@gmail.com>
305 *
306 * The standard boot ROM sequence for an x86 machine uses the BIOS
307 * to select an initial video card for boot display. This boot video
308 * card will have its BIOS copied to C0000 in system RAM.
309 * IORESOURCE_ROM_SHADOW is used to associate the boot video
310 * card with this copy. On laptops this copy has to be used since
311 * the main ROM may be compressed or combined with another image.
312 * See pci_map_rom() for use of this flag. IORESOURCE_ROM_SHADOW
313 * is marked here since the boot video device will be the only enabled
314 * video device at this point.
315 */
316
317static void __devinit pci_fixup_video(struct pci_dev *pdev)
318{
319 struct pci_dev *bridge;
320 struct pci_bus *bus;
321 u16 config;
322
323 if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
324 return;
325
326 /* Is VGA routed to us? */
327 bus = pdev->bus;
328 while (bus) {
329 bridge = bus->self;
330
331 /*
332 * From information provided by
333 * "David Miller" <davem@davemloft.net>
334 * The bridge control register is valid for PCI header
335 * type BRIDGE, or CARDBUS. Host to PCI controllers use
336 * PCI header type NORMAL.
337 */
338 if (bridge
339 &&((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
340 ||(bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
341 pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
342 &config);
343 if (!(config & PCI_BRIDGE_CTL_VGA))
344 return;
345 }
346 bus = bus->parent;
347 }
348 pci_read_config_word(pdev, PCI_COMMAND, &config);
349 if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
350 pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
351 printk(KERN_DEBUG "Boot video device is %s\n", pci_name(pdev));
352 }
353}
354DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
355
356/*
357 * Some Toshiba laptops need extra code to enable their TI TSB43AB22/A.
358 *
359 * We pretend to bring them out of full D3 state, and restore the proper
360 * IRQ, PCI cache line size, and BARs, otherwise the device won't function
361 * properly. In some cases, the device will generate an interrupt on
362 * the wrong IRQ line, causing any devices sharing the line it's
363 * *supposed* to use to be disabled by the kernel's IRQ debug code.
364 */
365static u16 toshiba_line_size;
366
367static struct dmi_system_id __devinitdata toshiba_ohci1394_dmi_table[] = {
368 {
369 .ident = "Toshiba PS5 based laptop",
370 .matches = {
371 DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
372 DMI_MATCH(DMI_PRODUCT_VERSION, "PS5"),
373 },
374 },
375 {
376 .ident = "Toshiba PSM4 based laptop",
377 .matches = {
378 DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
379 DMI_MATCH(DMI_PRODUCT_VERSION, "PSM4"),
380 },
381 },
382 {
383 .ident = "Toshiba A40 based laptop",
384 .matches = {
385 DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
386 DMI_MATCH(DMI_PRODUCT_VERSION, "PSA40U"),
387 },
388 },
389 { }
390};
391
392static void __devinit pci_pre_fixup_toshiba_ohci1394(struct pci_dev *dev)
393{
394 if (!dmi_check_system(toshiba_ohci1394_dmi_table))
395 return; /* only applies to certain Toshibas (so far) */
396
397 dev->current_state = PCI_D3cold;
398 pci_read_config_word(dev, PCI_CACHE_LINE_SIZE, &toshiba_line_size);
399}
400DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TI, 0x8032,
401 pci_pre_fixup_toshiba_ohci1394);
402
403static void __devinit pci_post_fixup_toshiba_ohci1394(struct pci_dev *dev)
404{
405 if (!dmi_check_system(toshiba_ohci1394_dmi_table))
406 return; /* only applies to certain Toshibas (so far) */
407
408 /* Restore config space on Toshiba laptops */
409 pci_write_config_word(dev, PCI_CACHE_LINE_SIZE, toshiba_line_size);
410 pci_read_config_byte(dev, PCI_INTERRUPT_LINE, (u8 *)&dev->irq);
411 pci_write_config_dword(dev, PCI_BASE_ADDRESS_0,
412 pci_resource_start(dev, 0));
413 pci_write_config_dword(dev, PCI_BASE_ADDRESS_1,
414 pci_resource_start(dev, 1));
415}
416DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_TI, 0x8032,
417 pci_post_fixup_toshiba_ohci1394);
418
419
420/*
421 * Prevent the BIOS from trapping accesses to the Cyrix CS5530A video device
422 * configuration space.
423 */
424static void pci_early_fixup_cyrix_5530(struct pci_dev *dev)
425{
426 u8 r;
427 /* clear 'F4 Video Configuration Trap' bit */
428 pci_read_config_byte(dev, 0x42, &r);
429 r &= 0xfd;
430 pci_write_config_byte(dev, 0x42, r);
431}
432DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
433 pci_early_fixup_cyrix_5530);
434DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
435 pci_early_fixup_cyrix_5530);
436
437/*
438 * Siemens Nixdorf AG FSC Multiprocessor Interrupt Controller:
439 * prevent update of the BAR0, which doesn't look like a normal BAR.
440 */
441static void __devinit pci_siemens_interrupt_controller(struct pci_dev *dev)
442{
443 dev->resource[0].flags |= IORESOURCE_PCI_FIXED;
444}
445DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015,
446 pci_siemens_interrupt_controller);
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
new file mode 100644
index 000000000000..bcd2f94b732c
--- /dev/null
+++ b/arch/x86/pci/i386.c
@@ -0,0 +1,315 @@
1/*
2 * Low-Level PCI Access for i386 machines
3 *
4 * Copyright 1993, 1994 Drew Eckhardt
5 * Visionary Computing
6 * (Unix and Linux consulting and custom programming)
7 * Drew@Colorado.EDU
8 * +1 (303) 786-7975
9 *
10 * Drew's work was sponsored by:
11 * iX Multiuser Multitasking Magazine
12 * Hannover, Germany
13 * hm@ix.de
14 *
15 * Copyright 1997--2000 Martin Mares <mj@ucw.cz>
16 *
17 * For more information, please consult the following manuals (look at
18 * http://www.pcisig.com/ for how to get them):
19 *
20 * PCI BIOS Specification
21 * PCI Local Bus Specification
22 * PCI to PCI Bridge Specification
23 * PCI System Design Guide
24 *
25 */
26
27#include <linux/types.h>
28#include <linux/kernel.h>
29#include <linux/pci.h>
30#include <linux/init.h>
31#include <linux/ioport.h>
32#include <linux/errno.h>
33
34#include "pci.h"
35
36/*
37 * We need to avoid collisions with `mirrored' VGA ports
38 * and other strange ISA hardware, so we always want the
39 * addresses to be allocated in the 0x000-0x0ff region
40 * modulo 0x400.
41 *
42 * Why? Because some silly external IO cards only decode
43 * the low 10 bits of the IO address. The 0x00-0xff region
44 * is reserved for motherboard devices that decode all 16
45 * bits, so it's ok to allocate at, say, 0x2800-0x28ff,
46 * but we want to try to avoid allocating at 0x2900-0x2bff
 47 * which might be mirrored at 0x0100-0x03ff.
48 */
49void
50pcibios_align_resource(void *data, struct resource *res,
51 resource_size_t size, resource_size_t align)
52{
53 if (res->flags & IORESOURCE_IO) {
54 resource_size_t start = res->start;
55
56 if (start & 0x300) {
57 start = (start + 0x3ff) & ~0x3ff;
58 res->start = start;
59 }
60 }
61}
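An illustrative sketch (editorial, not part of this patch): the same rounding as pcibios_align_resource() applied to hypothetical I/O starts, to make the modulo-0x400 rule above concrete. The helper name is made up for illustration.

/*
 * Sketch only: 0x2910 overlaps the mirrored 0x100-0x3ff band
 * (0x2910 & 0x300 != 0) and is rounded up to 0x2c00, while 0x2800
 * already sits in the safe 0x000-0x0ff band and is left alone.
 */
static inline resource_size_t io_align_sketch(resource_size_t start)
{
	if (start & 0x300)
		start = (start + 0x3ff) & ~0x3ff;
	return start;
}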
62
63
64/*
65 * Handle resources of PCI devices. If the world were perfect, we could
66 * just allocate all the resource regions and do nothing more. It isn't.
67 * On the other hand, we cannot just re-allocate all devices, as it would
68 * require us to know lots of host bridge internals. So we attempt to
69 * keep as much of the original configuration as possible, but tweak it
70 * when it's found to be wrong.
71 *
72 * Known BIOS problems we have to work around:
73 * - I/O or memory regions not configured
74 * - regions configured, but not enabled in the command register
75 * - bogus I/O addresses above 64K used
76 * - expansion ROMs left enabled (this may sound harmless, but given
77 * the fact the PCI specs explicitly allow address decoders to be
78 * shared between expansion ROMs and other resource regions, it's
79 * at least dangerous)
80 *
81 * Our solution:
82 * (1) Allocate resources for all buses behind PCI-to-PCI bridges.
83 * This gives us fixed barriers on where we can allocate.
84 * (2) Allocate resources for all enabled devices. If there is
85 * a collision, just mark the resource as unallocated. Also
86 * disable expansion ROMs during this step.
87 * (3) Try to allocate resources for disabled devices. If the
88 * resources were assigned correctly, everything goes well,
89 * if they weren't, they won't disturb allocation of other
90 * resources.
91 * (4) Assign new addresses to resources which were either
92 * not configured at all or misconfigured. If explicitly
93 * requested by the user, configure expansion ROM address
94 * as well.
95 */
96
97static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
98{
99 struct pci_bus *bus;
100 struct pci_dev *dev;
101 int idx;
102 struct resource *r, *pr;
103
104 /* Depth-First Search on bus tree */
105 list_for_each_entry(bus, bus_list, node) {
106 if ((dev = bus->self)) {
107 for (idx = PCI_BRIDGE_RESOURCES;
108 idx < PCI_NUM_RESOURCES; idx++) {
109 r = &dev->resource[idx];
110 if (!r->flags)
111 continue;
112 pr = pci_find_parent_resource(dev, r);
113 if (!r->start || !pr ||
114 request_resource(pr, r) < 0) {
115 printk(KERN_ERR "PCI: Cannot allocate "
116 "resource region %d "
117 "of bridge %s\n",
118 idx, pci_name(dev));
119 /*
120 * Something is wrong with the region.
121 * Invalidate the resource to prevent
122 * child resource allocations in this
123 * range.
124 */
125 r->flags = 0;
126 }
127 }
128 }
129 pcibios_allocate_bus_resources(&bus->children);
130 }
131}
132
133static void __init pcibios_allocate_resources(int pass)
134{
135 struct pci_dev *dev = NULL;
136 int idx, disabled;
137 u16 command;
138 struct resource *r, *pr;
139
140 for_each_pci_dev(dev) {
141 pci_read_config_word(dev, PCI_COMMAND, &command);
142 for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) {
143 r = &dev->resource[idx];
144 if (r->parent) /* Already allocated */
145 continue;
146 if (!r->start) /* Address not assigned at all */
147 continue;
148 if (r->flags & IORESOURCE_IO)
149 disabled = !(command & PCI_COMMAND_IO);
150 else
151 disabled = !(command & PCI_COMMAND_MEMORY);
152 if (pass == disabled) {
153 DBG("PCI: Resource %08lx-%08lx "
154 "(f=%lx, d=%d, p=%d)\n",
155 r->start, r->end, r->flags, disabled, pass);
156 pr = pci_find_parent_resource(dev, r);
157 if (!pr || request_resource(pr, r) < 0) {
158 printk(KERN_ERR "PCI: Cannot allocate "
159 "resource region %d "
160 "of device %s\n",
161 idx, pci_name(dev));
162 /* We'll assign a new address later */
163 r->end -= r->start;
164 r->start = 0;
165 }
166 }
167 }
168 if (!pass) {
169 r = &dev->resource[PCI_ROM_RESOURCE];
170 if (r->flags & IORESOURCE_ROM_ENABLE) {
171 /* Turn the ROM off, leave the resource region,
172 * but keep it unregistered. */
173 u32 reg;
174 DBG("PCI: Switching off ROM of %s\n",
175 pci_name(dev));
176 r->flags &= ~IORESOURCE_ROM_ENABLE;
177 pci_read_config_dword(dev,
178 dev->rom_base_reg, &reg);
179 pci_write_config_dword(dev, dev->rom_base_reg,
180 reg & ~PCI_ROM_ADDRESS_ENABLE);
181 }
182 }
183 }
184}
185
186static int __init pcibios_assign_resources(void)
187{
188 struct pci_dev *dev = NULL;
189 struct resource *r, *pr;
190
191 if (!(pci_probe & PCI_ASSIGN_ROMS)) {
192 /*
193 * Try to use BIOS settings for ROMs, otherwise let
194 * pci_assign_unassigned_resources() allocate the new
195 * addresses.
196 */
197 for_each_pci_dev(dev) {
198 r = &dev->resource[PCI_ROM_RESOURCE];
199 if (!r->flags || !r->start)
200 continue;
201 pr = pci_find_parent_resource(dev, r);
202 if (!pr || request_resource(pr, r) < 0) {
203 r->end -= r->start;
204 r->start = 0;
205 }
206 }
207 }
208
209 pci_assign_unassigned_resources();
210
211 return 0;
212}
213
214void __init pcibios_resource_survey(void)
215{
216 DBG("PCI: Allocating resources\n");
217 pcibios_allocate_bus_resources(&pci_root_buses);
218 pcibios_allocate_resources(0);
219 pcibios_allocate_resources(1);
220}
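A clarifying aside (editorial, not part of the patch): the pass argument compared against the device's disabled state inside pcibios_allocate_resources() is what realizes steps (2) and (3) of the strategy comment above.

/*
 * pass == disabled, so the two calls above roughly mean:
 *   pcibios_allocate_resources(0): claim regions of devices whose
 *       PCI_COMMAND_IO/PCI_COMMAND_MEMORY bit is already set (step 2);
 *   pcibios_allocate_resources(1): then try regions of devices that
 *       are still disabled (step 3).
 */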
221
222/**
223 * called in fs_initcall (one below subsys_initcall),
224 * give a chance for motherboard reserve resources
225 */
226fs_initcall(pcibios_assign_resources);
227
228int pcibios_enable_resources(struct pci_dev *dev, int mask)
229{
230 u16 cmd, old_cmd;
231 int idx;
232 struct resource *r;
233
234 pci_read_config_word(dev, PCI_COMMAND, &cmd);
235 old_cmd = cmd;
236 for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) {
237 /* Only set up the requested stuff */
238 if (!(mask & (1 << idx)))
239 continue;
240
241 r = &dev->resource[idx];
242 if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
243 continue;
244 if ((idx == PCI_ROM_RESOURCE) &&
245 (!(r->flags & IORESOURCE_ROM_ENABLE)))
246 continue;
247 if (!r->start && r->end) {
248 printk(KERN_ERR "PCI: Device %s not available "
249 "because of resource %d collisions\n",
250 pci_name(dev), idx);
251 return -EINVAL;
252 }
253 if (r->flags & IORESOURCE_IO)
254 cmd |= PCI_COMMAND_IO;
255 if (r->flags & IORESOURCE_MEM)
256 cmd |= PCI_COMMAND_MEMORY;
257 }
258 if (cmd != old_cmd) {
259 printk("PCI: Enabling device %s (%04x -> %04x)\n",
260 pci_name(dev), old_cmd, cmd);
261 pci_write_config_word(dev, PCI_COMMAND, cmd);
262 }
263 return 0;
264}
265
266/*
267 * If we set up a device for bus mastering, we need to check the latency
268 * timer as certain crappy BIOSes forget to set it properly.
269 */
270unsigned int pcibios_max_latency = 255;
271
272void pcibios_set_master(struct pci_dev *dev)
273{
274 u8 lat;
275 pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
276 if (lat < 16)
277 lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
278 else if (lat > pcibios_max_latency)
279 lat = pcibios_max_latency;
280 else
281 return;
282 printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n",
283 pci_name(dev), lat);
284 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
285}
286
287int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
288 enum pci_mmap_state mmap_state, int write_combine)
289{
290 unsigned long prot;
291
292 /* I/O space cannot be accessed via normal processor loads and
293 * stores on this platform.
294 */
295 if (mmap_state == pci_mmap_io)
296 return -EINVAL;
297
298 /* Leave vm_pgoff as-is, the PCI space address is the physical
299 * address on this platform.
300 */
301 prot = pgprot_val(vma->vm_page_prot);
302 if (boot_cpu_data.x86 > 3)
303 prot |= _PAGE_PCD | _PAGE_PWT;
304 vma->vm_page_prot = __pgprot(prot);
305
306 /* Write-combine setting is ignored, it is changed via the mtrr
307 * interfaces on this platform.
308 */
309 if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
310 vma->vm_end - vma->vm_start,
311 vma->vm_page_prot))
312 return -EAGAIN;
313
314 return 0;
315}
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
new file mode 100644
index 000000000000..3de9f9ba2da6
--- /dev/null
+++ b/arch/x86/pci/init.c
@@ -0,0 +1,37 @@
1#include <linux/pci.h>
2#include <linux/init.h>
3#include "pci.h"
4
  5/* arch_initcall ordering is too unpredictable, so call the
  6   initializers in the right sequence from here. */
7static __init int pci_access_init(void)
8{
9 int type __maybe_unused = 0;
10
11#ifdef CONFIG_PCI_DIRECT
12 type = pci_direct_probe();
13#endif
14#ifdef CONFIG_PCI_MMCONFIG
15 pci_mmcfg_init(type);
16#endif
17 if (raw_pci_ops)
18 return 0;
19#ifdef CONFIG_PCI_BIOS
20 pci_pcbios_init();
21#endif
22 /*
 23	 * Don't check raw_pci_ops here: we want the PCI BIOS as the last
 24	 * fallback, but it has to run first so it can set pcibios_last_bus
 25	 * in case legacy PCI probing is used; otherwise detecting peer
 26	 * buses fails.
27 */
28#ifdef CONFIG_PCI_DIRECT
29 pci_direct_init(type);
30#endif
31 if (!raw_pci_ops)
32 printk(KERN_ERR
33 "PCI: Fatal: No config space access function found\n");
34
35 return 0;
36}
37arch_initcall(pci_access_init);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
new file mode 100644
index 000000000000..d98c6b096f8e
--- /dev/null
+++ b/arch/x86/pci/irq.c
@@ -0,0 +1,1173 @@
1/*
2 * Low-Level PCI Support for PC -- Routing of Interrupts
3 *
4 * (c) 1999--2000 Martin Mares <mj@ucw.cz>
5 */
6
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/pci.h>
10#include <linux/init.h>
11#include <linux/slab.h>
12#include <linux/interrupt.h>
13#include <linux/dmi.h>
14#include <asm/io.h>
15#include <asm/smp.h>
16#include <asm/io_apic.h>
17#include <linux/irq.h>
18#include <linux/acpi.h>
19
20#include "pci.h"
21
22#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
23#define PIRQ_VERSION 0x0100
24
25static int broken_hp_bios_irq9;
26static int acer_tm360_irqrouting;
27
28static struct irq_routing_table *pirq_table;
29
30static int pirq_enable_irq(struct pci_dev *dev);
31
32/*
33 * Never use: 0, 1, 2 (timer, keyboard, and cascade)
34 * Avoid using: 13, 14 and 15 (FP error and IDE).
35 * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
36 */
37unsigned int pcibios_irq_mask = 0xfff8;
38
39static int pirq_penalty[16] = {
40 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
41 0, 0, 0, 0, 1000, 100000, 100000, 100000
42};
43
44struct irq_router {
45 char *name;
46 u16 vendor, device;
47 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
48 int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
49};
50
51struct irq_router_handler {
52 u16 vendor;
53 int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
54};
55
56int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
57void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
58
59/*
60 * Check passed address for the PCI IRQ Routing Table signature
61 * and perform checksum verification.
62 */
63
64static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
65{
66 struct irq_routing_table *rt;
67 int i;
68 u8 sum;
69
70 rt = (struct irq_routing_table *) addr;
71 if (rt->signature != PIRQ_SIGNATURE ||
72 rt->version != PIRQ_VERSION ||
73 rt->size % 16 ||
74 rt->size < sizeof(struct irq_routing_table))
75 return NULL;
76 sum = 0;
77 for (i=0; i < rt->size; i++)
78 sum += addr[i];
79 if (!sum) {
80 DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
81 return rt;
82 }
83 return NULL;
84}
85
86
87
88/*
89 * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
90 */
91
92static struct irq_routing_table * __init pirq_find_routing_table(void)
93{
94 u8 *addr;
95 struct irq_routing_table *rt;
96
97 if (pirq_table_addr) {
98 rt = pirq_check_routing_table((u8 *) __va(pirq_table_addr));
99 if (rt)
100 return rt;
101 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
102 }
103 for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
104 rt = pirq_check_routing_table(addr);
105 if (rt)
106 return rt;
107 }
108 return NULL;
109}
110
111/*
 112 * If we have an IRQ routing table, use it to search for peer host
 113 * bridges. It's a gross hack, but since there are no other known
 114 * ways to get a list of buses, we have to go this way.
115 */
116
117static void __init pirq_peer_trick(void)
118{
119 struct irq_routing_table *rt = pirq_table;
120 u8 busmap[256];
121 int i;
122 struct irq_info *e;
123
124 memset(busmap, 0, sizeof(busmap));
125 for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
126 e = &rt->slots[i];
127#ifdef DEBUG
128 {
129 int j;
130 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
131 for(j=0; j<4; j++)
132 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
133 DBG("\n");
134 }
135#endif
136 busmap[e->bus] = 1;
137 }
138 for(i = 1; i < 256; i++) {
139 if (!busmap[i] || pci_find_bus(0, i))
140 continue;
141 if (pci_scan_bus_with_sysdata(i))
142 printk(KERN_INFO "PCI: Discovered primary peer "
143 "bus %02x [IRQ]\n", i);
144 }
145 pcibios_last_bus = -1;
146}
147
148/*
149 * Code for querying and setting of IRQ routes on various interrupt routers.
150 */
151
152void eisa_set_level_irq(unsigned int irq)
153{
154 unsigned char mask = 1 << (irq & 7);
155 unsigned int port = 0x4d0 + (irq >> 3);
156 unsigned char val;
157 static u16 eisa_irq_mask;
158
159 if (irq >= 16 || (1 << irq) & eisa_irq_mask)
160 return;
161
162 eisa_irq_mask |= (1 << irq);
163 printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
164 val = inb(port);
165 if (!(val & mask)) {
166 DBG(KERN_DEBUG " -> edge");
167 outb(val | mask, port);
168 }
169}
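A worked example (editorial sketch, assuming the standard ELCR layout): how an IRQ number maps onto the ports used above.

/*
 * IRQ 10: port = 0x4d0 + (10 >> 3) = 0x4d1, mask = 1 << (10 & 7) = 0x04.
 * Setting that bit marks IRQ 10 as level-triggered; a clear bit means
 * edge-triggered.
 */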
170
171/*
172 * Common IRQ routing practice: nybbles in config space,
173 * offset by some magic constant.
174 */
175static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
176{
177 u8 x;
178 unsigned reg = offset + (nr >> 1);
179
180 pci_read_config_byte(router, reg, &x);
181 return (nr & 1) ? (x >> 4) : (x & 0xf);
182}
183
184static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
185{
186 u8 x;
187 unsigned reg = offset + (nr >> 1);
188
189 pci_read_config_byte(router, reg, &x);
190 x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
191 pci_write_config_byte(router, reg, x);
192}
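A worked example (editorial, hypothetical values): how the nybble helpers above address a router register, using the 0x48 offset that the ALI code below passes in.

/*
 * For offset 0x48 and pirq 3 (nr = pirq - 1 = 2):
 *   reg = 0x48 + (2 >> 1) = 0x49, and since nr is even the value lives
 *   in the low nibble of register 0x49; nr = 3 would select the high
 *   nibble of the same byte.
 */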
193
194/*
195 * ALI pirq entries are damn ugly, and completely undocumented.
196 * This has been figured out from pirq tables, and it's not a pretty
197 * picture.
198 */
199static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
200{
201 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
202
203 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
204}
205
206static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
207{
208 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
209 unsigned int val = irqmap[irq];
210
211 if (val) {
212 write_config_nybble(router, 0x48, pirq-1, val);
213 return 1;
214 }
215 return 0;
216}
217
218/*
219 * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
220 * just a pointer to the config space.
221 */
222static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
223{
224 u8 x;
225
226 pci_read_config_byte(router, pirq, &x);
227 return (x < 16) ? x : 0;
228}
229
230static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
231{
232 pci_write_config_byte(router, pirq, irq);
233 return 1;
234}
235
236/*
237 * The VIA pirq rules are nibble-based, like ALI,
238 * but without the ugly irq number munging.
239 * However, PIRQD is in the upper instead of lower 4 bits.
240 */
241static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
242{
243 return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
244}
245
246static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
247{
248 write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
249 return 1;
250}
251
252/*
253 * The VIA pirq rules are nibble-based, like ALI,
254 * but without the ugly irq number munging.
 255 * However, for the 82C586, the nibble map is different.
256 */
257static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
258{
259 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
260 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
261}
262
263static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
264{
265 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
266 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
267 return 1;
268}
269
270/*
271 * ITE 8330G pirq rules are nibble-based
272 * FIXME: pirqmap may be { 1, 0, 3, 2 },
273 * 2+3 are both mapped to irq 9 on my system
274 */
275static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
276{
277 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
278 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
279}
280
281static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
282{
283 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
284 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
285 return 1;
286}
287
288/*
289 * OPTI: high four bits are nibble pointer..
290 * I wonder what the low bits do?
291 */
292static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
293{
294 return read_config_nybble(router, 0xb8, pirq >> 4);
295}
296
297static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
298{
299 write_config_nybble(router, 0xb8, pirq >> 4, irq);
300 return 1;
301}
302
303/*
304 * Cyrix: nibble offset 0x5C
305 * 0x5C bits 7:4 is INTB bits 3:0 is INTA
306 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
307 */
308static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
309{
310 return read_config_nybble(router, 0x5C, (pirq-1)^1);
311}
312
313static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
314{
315 write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
316 return 1;
317}
318
319/*
320 * PIRQ routing for SiS 85C503 router used in several SiS chipsets.
321 * We have to deal with the following issues here:
322 * - vendors have different ideas about the meaning of link values
323 * - some onboard devices (integrated in the chipset) have special
324 * links and are thus routed differently (i.e. not via PCI INTA-INTD)
325 * - different revision of the router have a different layout for
326 * the routing registers, particularly for the onchip devices
327 *
 328 * Common to all routing registers is one byte per routable link,
 329 * defined as:
330 * bit 7 IRQ mapping enabled (0) or disabled (1)
331 * bits [6:4] reserved (sometimes used for onchip devices)
332 * bits [3:0] IRQ to map to
333 * allowed: 3-7, 9-12, 14-15
334 * reserved: 0, 1, 2, 8, 13
335 *
336 * The config-space registers located at 0x41/0x42/0x43/0x44 are
337 * always used to route the normal PCI INT A/B/C/D respectively.
338 * Apparently there are systems implementing PCI routing table using
339 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
340 * We try our best to handle both link mappings.
341 *
342 * Currently (2003-05-21) it appears most SiS chipsets follow the
343 * definition of routing registers from the SiS-5595 southbridge.
344 * According to the SiS 5595 datasheets the revision id's of the
345 * router (ISA-bridge) should be 0x01 or 0xb0.
346 *
347 * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
348 * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
349 * They seem to work with the current routing code. However there is
350 * some concern because of the two USB-OHCI HCs (original SiS 5595
351 * had only one). YMMV.
352 *
353 * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
354 *
355 * 0x61: IDEIRQ:
356 * bits [6:5] must be written 01
357 * bit 4 channel-select primary (0), secondary (1)
358 *
359 * 0x62: USBIRQ:
360 * bit 6 OHCI function disabled (0), enabled (1)
361 *
362 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
363 *
364 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
365 *
366 * We support USBIRQ (in addition to INTA-INTD) and keep the
367 * IDE, ACPI and DAQ routing untouched as set by the BIOS.
368 *
369 * Currently the only reported exception is the new SiS 65x chipset
370 * which includes the SiS 69x southbridge. Here we have the 85C503
371 * router revision 0x04 and there are changes in the register layout
372 * mostly related to the different USB HCs with USB 2.0 support.
373 *
 374 * Onchip routing for router rev-id 0x04 (trial-and-error observation)
375 *
376 * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs
377 * bit 6-4 are probably unused, not like 5595
378 */
379
380#define PIRQ_SIS_IRQ_MASK 0x0f
381#define PIRQ_SIS_IRQ_DISABLE 0x80
382#define PIRQ_SIS_USB_ENABLE 0x40
383
384static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
385{
386 u8 x;
387 int reg;
388
389 reg = pirq;
390 if (reg >= 0x01 && reg <= 0x04)
391 reg += 0x40;
392 pci_read_config_byte(router, reg, &x);
393 return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
394}
395
396static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
397{
398 u8 x;
399 int reg;
400
401 reg = pirq;
402 if (reg >= 0x01 && reg <= 0x04)
403 reg += 0x40;
404 pci_read_config_byte(router, reg, &x);
405 x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
406 x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
407 pci_write_config_byte(router, reg, x);
408 return 1;
409}
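A worked example (editorial, hypothetical register contents) of the SiS layout documented above:

/*
 * 0x8a -> bit 7 set, routing disabled, pirq_sis_get() returns 0;
 * 0x0a -> routing enabled, low nibble 0xa, i.e. IRQ 10.
 * A link value of 0x02 is first remapped to config register 0x42.
 */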
410
411
412/*
413 * VLSI: nibble offset 0x74 - educated guess due to routing table and
414 * config space of VLSI 82C534 PCI-bridge/router (1004:0102)
415 * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
416 * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
417 * for the busbridge to the docking station.
418 */
419
420static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
421{
422 if (pirq > 8) {
423 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
424 return 0;
425 }
426 return read_config_nybble(router, 0x74, pirq-1);
427}
428
429static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
430{
431 if (pirq > 8) {
432 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
433 return 0;
434 }
435 write_config_nybble(router, 0x74, pirq-1, irq);
436 return 1;
437}
438
439/*
440 * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
441 * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register
442 * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect
443 * register is a straight binary coding of desired PIC IRQ (low nibble).
444 *
445 * The 'link' value in the PIRQ table is already in the correct format
446 * for the Index register. There are some special index values:
447 * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
448 * and 0x03 for SMBus.
449 */
450static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
451{
452 outb_p(pirq, 0xc00);
453 return inb(0xc01) & 0xf;
454}
455
456static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
457{
458 outb_p(pirq, 0xc00);
459 outb_p(irq, 0xc01);
460 return 1;
461}
462
463/* Support for AMD756 PCI IRQ Routing
464 * Jhon H. Caicedo <jhcaiced@osso.org.co>
465 * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
466 * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
467 * The AMD756 pirq rules are nibble-based
468 * offset 0x56 0-3 PIRQA 4-7 PIRQB
469 * offset 0x57 0-3 PIRQC 4-7 PIRQD
470 */
471static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
472{
473 u8 irq;
474 irq = 0;
475 if (pirq <= 4)
476 {
477 irq = read_config_nybble(router, 0x56, pirq - 1);
478 }
479 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
480 dev->vendor, dev->device, pirq, irq);
481 return irq;
482}
483
484static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
485{
486 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
487 dev->vendor, dev->device, pirq, irq);
488 if (pirq <= 4)
489 {
490 write_config_nybble(router, 0x56, pirq - 1, irq);
491 }
492 return 1;
493}
494
495#ifdef CONFIG_PCI_BIOS
496
497static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
498{
499 struct pci_dev *bridge;
500 int pin = pci_get_interrupt_pin(dev, &bridge);
501 return pcibios_set_irq_routing(bridge, pin, irq);
502}
503
504#endif
505
506static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
507{
508 static struct pci_device_id __initdata pirq_440gx[] = {
509 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
510 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
511 { },
512 };
513
514 /* 440GX has a proprietary PIRQ router -- don't use it */
515 if (pci_dev_present(pirq_440gx))
516 return 0;
517
518 switch(device)
519 {
520 case PCI_DEVICE_ID_INTEL_82371FB_0:
521 case PCI_DEVICE_ID_INTEL_82371SB_0:
522 case PCI_DEVICE_ID_INTEL_82371AB_0:
523 case PCI_DEVICE_ID_INTEL_82371MX:
524 case PCI_DEVICE_ID_INTEL_82443MX_0:
525 case PCI_DEVICE_ID_INTEL_82801AA_0:
526 case PCI_DEVICE_ID_INTEL_82801AB_0:
527 case PCI_DEVICE_ID_INTEL_82801BA_0:
528 case PCI_DEVICE_ID_INTEL_82801BA_10:
529 case PCI_DEVICE_ID_INTEL_82801CA_0:
530 case PCI_DEVICE_ID_INTEL_82801CA_12:
531 case PCI_DEVICE_ID_INTEL_82801DB_0:
532 case PCI_DEVICE_ID_INTEL_82801E_0:
533 case PCI_DEVICE_ID_INTEL_82801EB_0:
534 case PCI_DEVICE_ID_INTEL_ESB_1:
535 case PCI_DEVICE_ID_INTEL_ICH6_0:
536 case PCI_DEVICE_ID_INTEL_ICH6_1:
537 case PCI_DEVICE_ID_INTEL_ICH7_0:
538 case PCI_DEVICE_ID_INTEL_ICH7_1:
539 case PCI_DEVICE_ID_INTEL_ICH7_30:
540 case PCI_DEVICE_ID_INTEL_ICH7_31:
541 case PCI_DEVICE_ID_INTEL_ESB2_0:
542 case PCI_DEVICE_ID_INTEL_ICH8_0:
543 case PCI_DEVICE_ID_INTEL_ICH8_1:
544 case PCI_DEVICE_ID_INTEL_ICH8_2:
545 case PCI_DEVICE_ID_INTEL_ICH8_3:
546 case PCI_DEVICE_ID_INTEL_ICH8_4:
547 case PCI_DEVICE_ID_INTEL_ICH9_0:
548 case PCI_DEVICE_ID_INTEL_ICH9_1:
549 case PCI_DEVICE_ID_INTEL_ICH9_2:
550 case PCI_DEVICE_ID_INTEL_ICH9_3:
551 case PCI_DEVICE_ID_INTEL_ICH9_4:
552 case PCI_DEVICE_ID_INTEL_ICH9_5:
553 case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
554 r->name = "PIIX/ICH";
555 r->get = pirq_piix_get;
556 r->set = pirq_piix_set;
557 return 1;
558 }
559 return 0;
560}
561
562static __init int via_router_probe(struct irq_router *r,
563 struct pci_dev *router, u16 device)
564{
565 /* FIXME: We should move some of the quirk fixup stuff here */
566
567 /*
568 * work arounds for some buggy BIOSes
569 */
570 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
571 switch(router->device) {
572 case PCI_DEVICE_ID_VIA_82C686:
573 /*
574 * Asus k7m bios wrongly reports 82C686A
575 * as 586-compatible
576 */
577 device = PCI_DEVICE_ID_VIA_82C686;
578 break;
579 case PCI_DEVICE_ID_VIA_8235:
 580			/*
581 * Asus a7v-x bios wrongly reports 8235
582 * as 586-compatible
583 */
584 device = PCI_DEVICE_ID_VIA_8235;
585 break;
586 }
587 }
588
589 switch(device) {
590 case PCI_DEVICE_ID_VIA_82C586_0:
591 r->name = "VIA";
592 r->get = pirq_via586_get;
593 r->set = pirq_via586_set;
594 return 1;
595 case PCI_DEVICE_ID_VIA_82C596:
596 case PCI_DEVICE_ID_VIA_82C686:
597 case PCI_DEVICE_ID_VIA_8231:
598 case PCI_DEVICE_ID_VIA_8233A:
599 case PCI_DEVICE_ID_VIA_8235:
600 case PCI_DEVICE_ID_VIA_8237:
601 /* FIXME: add new ones for 8233/5 */
602 r->name = "VIA";
603 r->get = pirq_via_get;
604 r->set = pirq_via_set;
605 return 1;
606 }
607 return 0;
608}
609
610static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
611{
612 switch(device)
613 {
614 case PCI_DEVICE_ID_VLSI_82C534:
615 r->name = "VLSI 82C534";
616 r->get = pirq_vlsi_get;
617 r->set = pirq_vlsi_set;
618 return 1;
619 }
620 return 0;
621}
622
623
624static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
625{
626 switch(device)
627 {
628 case PCI_DEVICE_ID_SERVERWORKS_OSB4:
629 case PCI_DEVICE_ID_SERVERWORKS_CSB5:
630 r->name = "ServerWorks";
631 r->get = pirq_serverworks_get;
632 r->set = pirq_serverworks_set;
633 return 1;
634 }
635 return 0;
636}
637
638static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
639{
640 if (device != PCI_DEVICE_ID_SI_503)
641 return 0;
642
643 r->name = "SIS";
644 r->get = pirq_sis_get;
645 r->set = pirq_sis_set;
646 return 1;
647}
648
649static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
650{
651 switch(device)
652 {
653 case PCI_DEVICE_ID_CYRIX_5520:
654 r->name = "NatSemi";
655 r->get = pirq_cyrix_get;
656 r->set = pirq_cyrix_set;
657 return 1;
658 }
659 return 0;
660}
661
662static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
663{
664 switch(device)
665 {
666 case PCI_DEVICE_ID_OPTI_82C700:
667 r->name = "OPTI";
668 r->get = pirq_opti_get;
669 r->set = pirq_opti_set;
670 return 1;
671 }
672 return 0;
673}
674
675static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
676{
677 switch(device)
678 {
679 case PCI_DEVICE_ID_ITE_IT8330G_0:
680 r->name = "ITE";
681 r->get = pirq_ite_get;
682 r->set = pirq_ite_set;
683 return 1;
684 }
685 return 0;
686}
687
688static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
689{
690 switch(device)
691 {
692 case PCI_DEVICE_ID_AL_M1533:
693 case PCI_DEVICE_ID_AL_M1563:
694 printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
695 r->name = "ALI";
696 r->get = pirq_ali_get;
697 r->set = pirq_ali_set;
698 return 1;
699 }
700 return 0;
701}
702
703static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
704{
705 switch(device)
706 {
707 case PCI_DEVICE_ID_AMD_VIPER_740B:
708 r->name = "AMD756";
709 break;
710 case PCI_DEVICE_ID_AMD_VIPER_7413:
711 r->name = "AMD766";
712 break;
713 case PCI_DEVICE_ID_AMD_VIPER_7443:
714 r->name = "AMD768";
715 break;
716 default:
717 return 0;
718 }
719 r->get = pirq_amd756_get;
720 r->set = pirq_amd756_set;
721 return 1;
722}
723
724static __initdata struct irq_router_handler pirq_routers[] = {
725 { PCI_VENDOR_ID_INTEL, intel_router_probe },
726 { PCI_VENDOR_ID_AL, ali_router_probe },
727 { PCI_VENDOR_ID_ITE, ite_router_probe },
728 { PCI_VENDOR_ID_VIA, via_router_probe },
729 { PCI_VENDOR_ID_OPTI, opti_router_probe },
730 { PCI_VENDOR_ID_SI, sis_router_probe },
731 { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
732 { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
733 { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
734 { PCI_VENDOR_ID_AMD, amd_router_probe },
735 /* Someone with docs needs to add the ATI Radeon IGP */
736 { 0, NULL }
737};
738static struct irq_router pirq_router;
739static struct pci_dev *pirq_router_dev;
740
741
742/*
743 * FIXME: should we have an option to say "generic for
744 * chipset" ?
745 */
746
747static void __init pirq_find_router(struct irq_router *r)
748{
749 struct irq_routing_table *rt = pirq_table;
750 struct irq_router_handler *h;
751
752#ifdef CONFIG_PCI_BIOS
753 if (!rt->signature) {
754 printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
755 r->set = pirq_bios_set;
756 r->name = "BIOS";
757 return;
758 }
759#endif
760
761 /* Default unless a driver reloads it */
762 r->name = "default";
763 r->get = NULL;
764 r->set = NULL;
765
766 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
767 rt->rtr_vendor, rt->rtr_device);
768
769 pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
770 if (!pirq_router_dev) {
771 DBG(KERN_DEBUG "PCI: Interrupt router not found at "
772 "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
773 return;
774 }
775
776 for( h = pirq_routers; h->vendor; h++) {
777 /* First look for a router match */
778 if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
779 break;
780 /* Fall back to a device match */
781 if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
782 break;
783 }
784 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
785 pirq_router.name,
786 pirq_router_dev->vendor,
787 pirq_router_dev->device,
788 pci_name(pirq_router_dev));
789
790 /* The device remains referenced for the kernel lifetime */
791}
792
793static struct irq_info *pirq_get_info(struct pci_dev *dev)
794{
795 struct irq_routing_table *rt = pirq_table;
796 int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
797 struct irq_info *info;
798
799 for (info = rt->slots; entries--; info++)
800 if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
801 return info;
802 return NULL;
803}
804
805static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
806{
807 u8 pin;
808 struct irq_info *info;
809 int i, pirq, newirq;
810 int irq = 0;
811 u32 mask;
812 struct irq_router *r = &pirq_router;
813 struct pci_dev *dev2 = NULL;
814 char *msg = NULL;
815
816 /* Find IRQ pin */
817 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
818 if (!pin) {
819 DBG(KERN_DEBUG " -> no interrupt pin\n");
820 return 0;
821 }
822 pin = pin - 1;
823
824 /* Find IRQ routing entry */
825
826 if (!pirq_table)
827 return 0;
828
829 DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
830 info = pirq_get_info(dev);
831 if (!info) {
832 DBG(" -> not found in routing table\n" KERN_DEBUG);
833 return 0;
834 }
835 pirq = info->irq[pin].link;
836 mask = info->irq[pin].bitmap;
837 if (!pirq) {
838 DBG(" -> not routed\n" KERN_DEBUG);
839 return 0;
840 }
841 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
842 mask &= pcibios_irq_mask;
843
844 /* Work around broken HP Pavilion Notebooks which assign USB to
845 IRQ 9 even though it is actually wired to IRQ 11 */
846
847 if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
848 dev->irq = 11;
849 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
850 r->set(pirq_router_dev, dev, pirq, 11);
851 }
852
853 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
854 if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
855 pirq = 0x68;
856 mask = 0x400;
857 dev->irq = r->get(pirq_router_dev, dev, pirq);
858 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
859 }
860
861 /*
862 * Find the best IRQ to assign: use the one
863 * reported by the device if possible.
864 */
865 newirq = dev->irq;
866 if (newirq && !((1 << newirq) & mask)) {
867 if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
868 else printk("\n" KERN_WARNING
869 "PCI: IRQ %i for device %s doesn't match PIRQ mask "
870 "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
871 pci_name(dev));
872 }
873 if (!newirq && assign) {
874 for (i = 0; i < 16; i++) {
875 if (!(mask & (1 << i)))
876 continue;
877 if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
878 newirq = i;
879 }
880 }
881 DBG(" -> newirq=%d", newirq);
882
883 /* Check if it is hardcoded */
884 if ((pirq & 0xf0) == 0xf0) {
885 irq = pirq & 0xf;
886 DBG(" -> hardcoded IRQ %d\n", irq);
887 msg = "Hardcoded";
888 } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
889 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
890 DBG(" -> got IRQ %d\n", irq);
891 msg = "Found";
892 eisa_set_level_irq(irq);
893 } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
894 DBG(" -> assigning IRQ %d", newirq);
895 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
896 eisa_set_level_irq(newirq);
897 DBG(" ... OK\n");
898 msg = "Assigned";
899 irq = newirq;
900 }
901 }
902
903 if (!irq) {
904 DBG(" ... failed\n");
905 if (newirq && mask == (1 << newirq)) {
906 msg = "Guessed";
907 irq = newirq;
908 } else
909 return 0;
910 }
911 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
912
913 /* Update IRQ for all devices with the same pirq value */
914 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
915 pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
916 if (!pin)
917 continue;
918 pin--;
919 info = pirq_get_info(dev2);
920 if (!info)
921 continue;
922 if (info->irq[pin].link == pirq) {
923 /* We refuse to override the dev->irq information. Give a warning! */
924 if ( dev2->irq && dev2->irq != irq && \
925 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
926 ((1 << dev2->irq) & mask)) ) {
927#ifndef CONFIG_PCI_MSI
928 printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
929 pci_name(dev2), dev2->irq, irq);
930#endif
931 continue;
932 }
933 dev2->irq = irq;
934 pirq_penalty[irq]++;
935 if (dev != dev2)
936 printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
937 }
938 }
939 return 1;
940}
941
942static void __init pcibios_fixup_irqs(void)
943{
944 struct pci_dev *dev = NULL;
945 u8 pin;
946
947 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
948 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
949 /*
 950		 * If the BIOS has set an out-of-range IRQ number, just ignore it.
 951		 * Also keep track of which IRQs are already in use.
952 */
953 if (dev->irq >= 16) {
954 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
955 dev->irq = 0;
956 }
957 /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
958 if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
959 pirq_penalty[dev->irq] = 0;
960 pirq_penalty[dev->irq]++;
961 }
962
963 dev = NULL;
964 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
965 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
966#ifdef CONFIG_X86_IO_APIC
967 /*
968 * Recalculate IRQ numbers if we use the I/O APIC.
969 */
970 if (io_apic_assign_pci_irqs)
971 {
972 int irq;
973
974 if (pin) {
975 pin--; /* interrupt pins are numbered starting from 1 */
976 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
977 /*
978 * Busses behind bridges are typically not listed in the MP-table.
979 * In this case we have to look up the IRQ based on the parent bus,
980 * parent slot, and pin number. The SMP code detects such bridged
981 * busses itself so we should get into this branch reliably.
982 */
983 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
984 struct pci_dev * bridge = dev->bus->self;
985
986 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
987 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
988 PCI_SLOT(bridge->devfn), pin);
989 if (irq >= 0)
990 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
991 pci_name(bridge), 'A' + pin, irq);
992 }
993 if (irq >= 0) {
994 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
995 pci_name(dev), 'A' + pin, irq);
996 dev->irq = irq;
997 }
998 }
999 }
1000#endif
1001 /*
1002 * Still no IRQ? Try to lookup one...
1003 */
1004 if (pin && !dev->irq)
1005 pcibios_lookup_irq(dev, 0);
1006 }
1007}
1008
1009/*
1010 * Work around broken HP Pavilion Notebooks which assign USB to
1011 * IRQ 9 even though it is actually wired to IRQ 11
1012 */
1013static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d)
1014{
1015 if (!broken_hp_bios_irq9) {
1016 broken_hp_bios_irq9 = 1;
1017 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
1018 }
1019 return 0;
1020}
1021
1022/*
1023 * Work around broken Acer TravelMate 360 Notebooks which assign
1024 * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
1025 */
1026static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d)
1027{
1028 if (!acer_tm360_irqrouting) {
1029 acer_tm360_irqrouting = 1;
1030 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
1031 }
1032 return 0;
1033}
1034
1035static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1036 {
1037 .callback = fix_broken_hp_bios_irq9,
1038 .ident = "HP Pavilion N5400 Series Laptop",
1039 .matches = {
1040 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1041 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
1042 DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
1043 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
1044 },
1045 },
1046 {
1047 .callback = fix_acer_tm360_irqrouting,
1048 .ident = "Acer TravelMate 36x Laptop",
1049 .matches = {
1050 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
1051 DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1052 },
1053 },
1054 { }
1055};
1056
1057static int __init pcibios_irq_init(void)
1058{
1059 DBG(KERN_DEBUG "PCI: IRQ init\n");
1060
1061 if (pcibios_enable_irq || raw_pci_ops == NULL)
1062 return 0;
1063
1064 dmi_check_system(pciirq_dmi_table);
1065
1066 pirq_table = pirq_find_routing_table();
1067
1068#ifdef CONFIG_PCI_BIOS
1069 if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
1070 pirq_table = pcibios_get_irq_routing_table();
1071#endif
1072 if (pirq_table) {
1073 pirq_peer_trick();
1074 pirq_find_router(&pirq_router);
1075 if (pirq_table->exclusive_irqs) {
1076 int i;
1077 for (i=0; i<16; i++)
1078 if (!(pirq_table->exclusive_irqs & (1 << i)))
1079 pirq_penalty[i] += 100;
1080 }
1081 /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
1082 if (io_apic_assign_pci_irqs)
1083 pirq_table = NULL;
1084 }
1085
1086 pcibios_enable_irq = pirq_enable_irq;
1087
1088 pcibios_fixup_irqs();
1089 return 0;
1090}
1091
1092subsys_initcall(pcibios_irq_init);
1093
1094
1095static void pirq_penalize_isa_irq(int irq, int active)
1096{
1097 /*
1098 * If any ISAPnP device reports an IRQ in its list of possible
 1099	 * IRQs, we try to avoid assigning it to PCI devices.
1100 */
1101 if (irq < 16) {
1102 if (active)
1103 pirq_penalty[irq] += 1000;
1104 else
1105 pirq_penalty[irq] += 100;
1106 }
1107}
1108
1109void pcibios_penalize_isa_irq(int irq, int active)
1110{
1111#ifdef CONFIG_ACPI
1112 if (!acpi_noirq)
1113 acpi_penalize_isa_irq(irq, active);
1114 else
1115#endif
1116 pirq_penalize_isa_irq(irq, active);
1117}
1118
1119static int pirq_enable_irq(struct pci_dev *dev)
1120{
1121 u8 pin;
1122 struct pci_dev *temp_dev;
1123
1124 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1125 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
1126 char *msg = "";
1127
1128 pin--; /* interrupt pins are numbered starting from 1 */
1129
1130 if (io_apic_assign_pci_irqs) {
1131 int irq;
1132
1133 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
1134 /*
1135 * Busses behind bridges are typically not listed in the MP-table.
1136 * In this case we have to look up the IRQ based on the parent bus,
1137 * parent slot, and pin number. The SMP code detects such bridged
1138 * busses itself so we should get into this branch reliably.
1139 */
1140 temp_dev = dev;
1141 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
1142 struct pci_dev * bridge = dev->bus->self;
1143
1144 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
1145 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1146 PCI_SLOT(bridge->devfn), pin);
1147 if (irq >= 0)
1148 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
1149 pci_name(bridge), 'A' + pin, irq);
1150 dev = bridge;
1151 }
1152 dev = temp_dev;
1153 if (irq >= 0) {
1154 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
1155 pci_name(dev), 'A' + pin, irq);
1156 dev->irq = irq;
1157 return 0;
1158 } else
1159 msg = " Probably buggy MP table.";
1160 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
1161 msg = "";
1162 else
1163 msg = " Please try using pci=biosirq.";
1164
1165 /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
1166 if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
1167 return 0;
1168
1169 printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
1170 'A' + pin, pci_name(dev), msg);
1171 }
1172 return 0;
1173}
diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c
new file mode 100644
index 000000000000..9cc813e29706
--- /dev/null
+++ b/arch/x86/pci/k8-bus_64.c
@@ -0,0 +1,83 @@
1#include <linux/init.h>
2#include <linux/pci.h>
3#include <asm/mpspec.h>
4#include <linux/cpumask.h>
5
6/*
7 * This discovers the pcibus <-> node mapping on AMD K8.
8 *
9 * RED-PEN need to call this again on PCI hotplug
10 * RED-PEN empty cpus get reported wrong
11 */
12
13#define NODE_ID_REGISTER 0x60
14#define NODE_ID(dword) (dword & 0x07)
15#define LDT_BUS_NUMBER_REGISTER_0 0x94
16#define LDT_BUS_NUMBER_REGISTER_1 0xB4
17#define LDT_BUS_NUMBER_REGISTER_2 0xD4
18#define NR_LDT_BUS_NUMBER_REGISTERS 3
19#define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF)
20#define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF)
21#define PCI_DEVICE_ID_K8HTCONFIG 0x1100
22
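A worked example (editorial, hypothetical value) of decoding one LDT bus number register with the macros above:

/*
 * ldtbus == 0x00070400:
 *   SECONDARY_LDT_BUS_NUMBER(ldtbus)   == 0x04,
 *   SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0x07,
 * i.e. buses 4..7 hang off that HT link; an all-zero value means no
 * buses are behind the link and the entry is skipped below.
 */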
23/**
24 * fill_mp_bus_to_cpumask()
 25 * fills the mp_bus_to_cpumask array according to the LDT Bus Number
26 * Registers found in the K8 northbridge
27 */
28__init static int
29fill_mp_bus_to_cpumask(void)
30{
31 struct pci_dev *nb_dev = NULL;
32 int i, j;
33 u32 ldtbus, nid;
34 static int lbnr[3] = {
35 LDT_BUS_NUMBER_REGISTER_0,
36 LDT_BUS_NUMBER_REGISTER_1,
37 LDT_BUS_NUMBER_REGISTER_2
38 };
39
40 while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD,
41 PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) {
42 pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid);
43
44 for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) {
45 pci_read_config_dword(nb_dev, lbnr[i], &ldtbus);
46 /*
47 * if there are no busses hanging off of the current
48 * ldt link then both the secondary and subordinate
49 * bus number fields are set to 0.
50 *
51 * RED-PEN
52 * This is slightly broken because it assumes
53 * HT node IDs == Linux node ids, which is not always
54 * true. However it is probably mostly true.
55 */
56 if (!(SECONDARY_LDT_BUS_NUMBER(ldtbus) == 0
57 && SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0)) {
58 for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus);
59 j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus);
60 j++) {
61 struct pci_bus *bus;
62 struct pci_sysdata *sd;
63
64 long node = NODE_ID(nid);
 65				/* The algorithm is a bit dumb, but
 66				   it shouldn't matter here */
67 bus = pci_find_bus(0, j);
68 if (!bus)
69 continue;
70 if (!node_online(node))
71 node = 0;
72
73 sd = bus->sysdata;
74 sd->node = node;
75 }
76 }
77 }
78 }
79
80 return 0;
81}
82
83fs_initcall(fill_mp_bus_to_cpumask);
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
new file mode 100644
index 000000000000..5565d7016b75
--- /dev/null
+++ b/arch/x86/pci/legacy.c
@@ -0,0 +1,56 @@
1/*
2 * legacy.c - traditional, old school PCI bus probing
3 */
4#include <linux/init.h>
5#include <linux/pci.h>
6#include "pci.h"
7
8/*
9 * Discover remaining PCI buses in case there are peer host bridges.
 10 * We use the number of the last PCI bus provided by the PCI BIOS.
11 */
12static void __devinit pcibios_fixup_peer_bridges(void)
13{
14 int n, devfn;
15
16 if (pcibios_last_bus <= 0 || pcibios_last_bus >= 0xff)
17 return;
18 DBG("PCI: Peer bridge fixup\n");
19
20 for (n=0; n <= pcibios_last_bus; n++) {
21 u32 l;
22 if (pci_find_bus(0, n))
23 continue;
24 for (devfn = 0; devfn < 256; devfn += 8) {
25 if (!raw_pci_ops->read(0, n, devfn, PCI_VENDOR_ID, 2, &l) &&
26 l != 0x0000 && l != 0xffff) {
27 DBG("Found device at %02x:%02x [%04x]\n", n, devfn, l);
28 printk(KERN_INFO "PCI: Discovered peer bus %02x\n", n);
29 pci_scan_bus_with_sysdata(n);
30 break;
31 }
32 }
33 }
34}
35
36static int __init pci_legacy_init(void)
37{
38 if (!raw_pci_ops) {
39 printk("PCI: System does not support PCI\n");
40 return 0;
41 }
42
43 if (pcibios_scanned++)
44 return 0;
45
46 printk("PCI: Probing PCI hardware\n");
47 pci_root_bus = pcibios_scan_root(0);
48 if (pci_root_bus)
49 pci_bus_add_devices(pci_root_bus);
50
51 pcibios_fixup_peer_bridges();
52
53 return 0;
54}
55
56subsys_initcall(pci_legacy_init);
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
new file mode 100644
index 000000000000..4df637e34f81
--- /dev/null
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -0,0 +1,315 @@
1/*
2 * mmconfig-shared.c - Low-level direct PCI config space access via
3 * MMCONFIG - common code between i386 and x86-64.
4 *
5 * This code does:
6 * - known chipset handling
7 * - ACPI decoding and validation
8 *
9 * Per-architecture code takes care of the mappings and accesses
10 * themselves.
11 */
12
13#include <linux/pci.h>
14#include <linux/init.h>
15#include <linux/acpi.h>
16#include <linux/bitmap.h>
17#include <asm/e820.h>
18
19#include "pci.h"
20
21/* aperture is up to 256MB but BIOS may reserve less */
22#define MMCONFIG_APER_MIN (2 * 1024*1024)
23#define MMCONFIG_APER_MAX (256 * 1024*1024)
24
25DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
26
27/* Indicate if the mmcfg resources have been placed into the resource table. */
28static int __initdata pci_mmcfg_resources_inserted;
29
 30/* K8 systems have some devices (typically in the built-in northbridge)
 31   that are only accessible using type 1 config cycles.
 32   Normally this can be expressed in the MCFG by not listing them
 33   and assigning suitable _SEGs, but some BIOSes don't implement it.
 34   Instead, try to discover all devices on bus 0 that are unreachable
 35   using MMCONFIG and fall back to type 1 for them. */
36static void __init unreachable_devices(void)
37{
38 int i, bus;
39 /* Use the max bus number from ACPI here? */
40 for (bus = 0; bus < PCI_MMCFG_MAX_CHECK_BUS; bus++) {
41 for (i = 0; i < 32; i++) {
42 unsigned int devfn = PCI_DEVFN(i, 0);
43 u32 val1, val2;
44
45 pci_conf1_read(0, bus, devfn, 0, 4, &val1);
46 if (val1 == 0xffffffff)
47 continue;
48
49 if (pci_mmcfg_arch_reachable(0, bus, devfn)) {
50 raw_pci_ops->read(0, bus, devfn, 0, 4, &val2);
51 if (val1 == val2)
52 continue;
53 }
54 set_bit(i + 32 * bus, pci_mmcfg_fallback_slots);
55 printk(KERN_NOTICE "PCI: No mmconfig possible on device"
56 " %02x:%02x\n", bus, i);
57 }
58 }
59}
60
61static const char __init *pci_mmcfg_e7520(void)
62{
63 u32 win;
64 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win);
65
66 win = win & 0xf000;
67 if(win == 0x0000 || win == 0xf000)
68 pci_mmcfg_config_num = 0;
69 else {
70 pci_mmcfg_config_num = 1;
71 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
72 if (!pci_mmcfg_config)
73 return NULL;
74 pci_mmcfg_config[0].address = win << 16;
75 pci_mmcfg_config[0].pci_segment = 0;
76 pci_mmcfg_config[0].start_bus_number = 0;
77 pci_mmcfg_config[0].end_bus_number = 255;
78 }
79
80 return "Intel Corporation E7520 Memory Controller Hub";
81}
82
83static const char __init *pci_mmcfg_intel_945(void)
84{
85 u32 pciexbar, mask = 0, len = 0;
86
87 pci_mmcfg_config_num = 1;
88
89 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar);
90
91 /* Enable bit */
92 if (!(pciexbar & 1))
93 pci_mmcfg_config_num = 0;
94
95 /* Size bits */
96 switch ((pciexbar >> 1) & 3) {
97 case 0:
98 mask = 0xf0000000U;
99 len = 0x10000000U;
100 break;
101 case 1:
102 mask = 0xf8000000U;
103 len = 0x08000000U;
104 break;
105 case 2:
106 mask = 0xfc000000U;
107 len = 0x04000000U;
108 break;
109 default:
110 pci_mmcfg_config_num = 0;
111 }
112
113 /* Errata #2, things break when not aligned on a 256Mb boundary */
114 /* Can only happen in 64M/128M mode */
115
116 if ((pciexbar & mask) & 0x0fffffffU)
117 pci_mmcfg_config_num = 0;
118
119 /* Don't hit the APIC registers and their friends */
120 if ((pciexbar & mask) >= 0xf0000000U)
121 pci_mmcfg_config_num = 0;
122
123 if (pci_mmcfg_config_num) {
124 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
125 if (!pci_mmcfg_config)
126 return NULL;
127 pci_mmcfg_config[0].address = pciexbar & mask;
128 pci_mmcfg_config[0].pci_segment = 0;
129 pci_mmcfg_config[0].start_bus_number = 0;
130 pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1;
131 }
132
133 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
134}
135
136struct pci_mmcfg_hostbridge_probe {
137 u32 vendor;
138 u32 device;
139 const char *(*probe)(void);
140};
141
142static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
143 { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 },
144 { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 },
145};
146
147static int __init pci_mmcfg_check_hostbridge(void)
148{
149 u32 l;
150 u16 vendor, device;
151 int i;
152 const char *name;
153
154 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0, 4, &l);
155 vendor = l & 0xffff;
156 device = (l >> 16) & 0xffff;
157
158 pci_mmcfg_config_num = 0;
159 pci_mmcfg_config = NULL;
160 name = NULL;
161
162 for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
163 if (pci_mmcfg_probes[i].vendor == vendor &&
164 pci_mmcfg_probes[i].device == device)
165 name = pci_mmcfg_probes[i].probe();
166 }
167
168 if (name) {
169 printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n",
170 name, pci_mmcfg_config_num ? "with" : "without");
171 }
172
173 return name != NULL;
174}
175
176static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
177{
178#define PCI_MMCFG_RESOURCE_NAME_LEN 19
179 int i;
180 struct resource *res;
181 char *names;
182 unsigned num_buses;
183
184 res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
185 pci_mmcfg_config_num, GFP_KERNEL);
186 if (!res) {
187 printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
188 return;
189 }
190
191 names = (void *)&res[pci_mmcfg_config_num];
192 for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
193 struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
194 num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
195 res->name = names;
196 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
197 cfg->pci_segment);
198 res->start = cfg->address;
199 res->end = res->start + (num_buses << 20) - 1;
200 res->flags = IORESOURCE_MEM | resource_flags;
201 insert_resource(&iomem_resource, res);
202 names += PCI_MMCFG_RESOURCE_NAME_LEN;
203 }
204
205 /* Mark that the resources have been inserted. */
206 pci_mmcfg_resources_inserted = 1;
207}
208
209static void __init pci_mmcfg_reject_broken(int type)
210{
211 typeof(pci_mmcfg_config[0]) *cfg;
212
213 if ((pci_mmcfg_config_num == 0) ||
214 (pci_mmcfg_config == NULL) ||
215 (pci_mmcfg_config[0].address == 0))
216 return;
217
218 cfg = &pci_mmcfg_config[0];
219
220 /*
221 * Handle more broken MCFG tables on Asus etc.
222 * They only contain a single entry for bus 0-0.
223 */
224 if (pci_mmcfg_config_num == 1 &&
225 cfg->pci_segment == 0 &&
226 (cfg->start_bus_number | cfg->end_bus_number) == 0) {
227 printk(KERN_ERR "PCI: start and end of bus number is 0. "
228 "Rejected as broken MCFG.\n");
229 goto reject;
230 }
231
232 /*
 233	 * Only do this check when type 1 works. If it doesn't work,
 234	 * assume we run on a Mac and always use MCFG.
235 */
236 if (type == 1 && !e820_all_mapped(cfg->address,
237 cfg->address + MMCONFIG_APER_MIN,
238 E820_RESERVED)) {
239 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not"
240 " E820-reserved\n", cfg->address);
241 goto reject;
242 }
243 return;
244
245reject:
246 printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
247 kfree(pci_mmcfg_config);
248 pci_mmcfg_config = NULL;
249 pci_mmcfg_config_num = 0;
250}
251
252void __init pci_mmcfg_init(int type)
253{
254 int known_bridge = 0;
255
256 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
257 return;
258
259 if (type == 1 && pci_mmcfg_check_hostbridge())
260 known_bridge = 1;
261
262 if (!known_bridge) {
263 acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
264 pci_mmcfg_reject_broken(type);
265 }
266
267 if ((pci_mmcfg_config_num == 0) ||
268 (pci_mmcfg_config == NULL) ||
269 (pci_mmcfg_config[0].address == 0))
270 return;
271
272 if (pci_mmcfg_arch_init()) {
273 if (type == 1)
274 unreachable_devices();
275 if (known_bridge)
276 pci_mmcfg_insert_resources(IORESOURCE_BUSY);
277 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
278 } else {
279 /*
280 * Signal not to attempt to insert mmcfg resources because
281 * the architecture mmcfg setup could not initialize.
282 */
283 pci_mmcfg_resources_inserted = 1;
284 }
285}
286
287static int __init pci_mmcfg_late_insert_resources(void)
288{
289 /*
290 * If resources are already inserted or we are not using MMCONFIG,
291 * don't insert the resources.
292 */
293 if ((pci_mmcfg_resources_inserted == 1) ||
294 (pci_probe & PCI_PROBE_MMCONF) == 0 ||
295 (pci_mmcfg_config_num == 0) ||
296 (pci_mmcfg_config == NULL) ||
297 (pci_mmcfg_config[0].address == 0))
298 return 1;
299
300 /*
301 * Attempt to insert the mmcfg resources but not with the busy flag
302 * marked so it won't cause request errors when __request_region is
303 * called.
304 */
305 pci_mmcfg_insert_resources(0);
306
307 return 0;
308}
309
310/*
311 * Perform MMCONFIG resource insertion after PCI initialization to allow for
312 * misprogrammed MCFG tables that state larger sizes but actually conflict
313 * with other system resources.
314 */
315late_initcall(pci_mmcfg_late_insert_resources);
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
new file mode 100644
index 000000000000..1bf5816d34c8
--- /dev/null
+++ b/arch/x86/pci/mmconfig_32.c
@@ -0,0 +1,148 @@
1/*
2 * Copyright (C) 2004 Matthew Wilcox <matthew@wil.cx>
3 * Copyright (C) 2004 Intel Corp.
4 *
5 * This code is released under the GNU General Public License version 2.
6 */
7
8/*
9 * mmconfig.c - Low-level direct PCI config space access via MMCONFIG
10 */
11
12#include <linux/pci.h>
13#include <linux/init.h>
14#include <linux/acpi.h>
15#include <asm/e820.h>
16#include "pci.h"
17
18/* Assume systems with more busses have correct MCFG */
19#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
20
21/* The base address of the last MMCONFIG device accessed */
22static u32 mmcfg_last_accessed_device;
23static int mmcfg_last_accessed_cpu;
24
25/*
26 * Functions for accessing PCI configuration space with MMCONFIG accesses
27 */
28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
29{
30 struct acpi_mcfg_allocation *cfg;
31 int cfg_num;
32
33 if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
34 test_bit(PCI_SLOT(devfn) + 32*bus, pci_mmcfg_fallback_slots))
35 return 0;
36
37 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
38 cfg = &pci_mmcfg_config[cfg_num];
39 if (cfg->pci_segment == seg &&
40 (cfg->start_bus_number <= bus) &&
41 (cfg->end_bus_number >= bus))
42 return cfg->address;
43 }
44
45 /* Fall back to type 0 */
46 return 0;
47}
48
49/*
50 * This is always called under pci_config_lock
51 */
52static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn)
53{
54 u32 dev_base = base | (bus << 20) | (devfn << 12);
55 int cpu = smp_processor_id();
56 if (dev_base != mmcfg_last_accessed_device ||
57 cpu != mmcfg_last_accessed_cpu) {
58 mmcfg_last_accessed_device = dev_base;
59 mmcfg_last_accessed_cpu = cpu;
60 set_fixmap_nocache(FIX_PCIE_MCFG, dev_base);
61 }
62}
63
64static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
65 unsigned int devfn, int reg, int len, u32 *value)
66{
67 unsigned long flags;
68 u32 base;
69
70 if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
71 *value = -1;
72 return -EINVAL;
73 }
74
75 base = get_base_addr(seg, bus, devfn);
76 if (!base)
77 return pci_conf1_read(seg,bus,devfn,reg,len,value);
78
79 spin_lock_irqsave(&pci_config_lock, flags);
80
81 pci_exp_set_dev_base(base, bus, devfn);
82
83 switch (len) {
84 case 1:
85 *value = mmio_config_readb(mmcfg_virt_addr + reg);
86 break;
87 case 2:
88 *value = mmio_config_readw(mmcfg_virt_addr + reg);
89 break;
90 case 4:
91 *value = mmio_config_readl(mmcfg_virt_addr + reg);
92 break;
93 }
94 spin_unlock_irqrestore(&pci_config_lock, flags);
95
96 return 0;
97}
98
99static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
100 unsigned int devfn, int reg, int len, u32 value)
101{
102 unsigned long flags;
103 u32 base;
104
105 if ((bus > 255) || (devfn > 255) || (reg > 4095))
106 return -EINVAL;
107
108 base = get_base_addr(seg, bus, devfn);
109 if (!base)
110 return pci_conf1_write(seg,bus,devfn,reg,len,value);
111
112 spin_lock_irqsave(&pci_config_lock, flags);
113
114 pci_exp_set_dev_base(base, bus, devfn);
115
116 switch (len) {
117 case 1:
118 mmio_config_writeb(mmcfg_virt_addr + reg, value);
119 break;
120 case 2:
121 mmio_config_writew(mmcfg_virt_addr + reg, value);
122 break;
123 case 4:
124 mmio_config_writel(mmcfg_virt_addr + reg, value);
125 break;
126 }
127 spin_unlock_irqrestore(&pci_config_lock, flags);
128
129 return 0;
130}
131
132static struct pci_raw_ops pci_mmcfg = {
133 .read = pci_mmcfg_read,
134 .write = pci_mmcfg_write,
135};
136
137int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
138 unsigned int devfn)
139{
140 return get_base_addr(seg, bus, devfn) != 0;
141}
142
143int __init pci_mmcfg_arch_init(void)
144{
145 printk(KERN_INFO "PCI: Using MMCONFIG\n");
146 raw_pci_ops = &pci_mmcfg;
147 return 1;
148}
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
new file mode 100644
index 000000000000..4095e4d66a1d
--- /dev/null
+++ b/arch/x86/pci/mmconfig_64.c
@@ -0,0 +1,157 @@
1/*
2 * mmconfig.c - Low-level direct PCI config space access via MMCONFIG
3 *
4 * This is a 64-bit optimized version that always keeps the full mmconfig
5 * space mapped. This allows lockless config space operation.
6 */
7
8#include <linux/pci.h>
9#include <linux/init.h>
10#include <linux/acpi.h>
11#include <linux/bitmap.h>
12#include <asm/e820.h>
13
14#include "pci.h"
15
16/* Static virtual mapping of the MMCONFIG aperture */
17struct mmcfg_virt {
18 struct acpi_mcfg_allocation *cfg;
19 char __iomem *virt;
20};
21static struct mmcfg_virt *pci_mmcfg_virt;
22
23static char __iomem *get_virt(unsigned int seg, unsigned bus)
24{
25 struct acpi_mcfg_allocation *cfg;
26 int cfg_num;
27
28 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
29 cfg = pci_mmcfg_virt[cfg_num].cfg;
30 if (cfg->pci_segment == seg &&
31 (cfg->start_bus_number <= bus) &&
32 (cfg->end_bus_number >= bus))
33 return pci_mmcfg_virt[cfg_num].virt;
34 }
35
36 /* Fall back to type 0 */
37 return NULL;
38}
39
40static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
41{
42 char __iomem *addr;
43 if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
44 test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots))
45 return NULL;
46 addr = get_virt(seg, bus);
47 if (!addr)
48 return NULL;
49 return addr + ((bus << 20) | (devfn << 12));
50}
51
52static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
53 unsigned int devfn, int reg, int len, u32 *value)
54{
55 char __iomem *addr;
56
57 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
58 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
59 *value = -1;
60 return -EINVAL;
61 }
62
63 addr = pci_dev_base(seg, bus, devfn);
64 if (!addr)
65 return pci_conf1_read(seg,bus,devfn,reg,len,value);
66
67 switch (len) {
68 case 1:
69 *value = mmio_config_readb(addr + reg);
70 break;
71 case 2:
72 *value = mmio_config_readw(addr + reg);
73 break;
74 case 4:
75 *value = mmio_config_readl(addr + reg);
76 break;
77 }
78
79 return 0;
80}
81
82static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
83 unsigned int devfn, int reg, int len, u32 value)
84{
85 char __iomem *addr;
86
87 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
88 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
89 return -EINVAL;
90
91 addr = pci_dev_base(seg, bus, devfn);
92 if (!addr)
93 return pci_conf1_write(seg,bus,devfn,reg,len,value);
94
95 switch (len) {
96 case 1:
97 mmio_config_writeb(addr + reg, value);
98 break;
99 case 2:
100 mmio_config_writew(addr + reg, value);
101 break;
102 case 4:
103 mmio_config_writel(addr + reg, value);
104 break;
105 }
106
107 return 0;
108}
109
110static struct pci_raw_ops pci_mmcfg = {
111 .read = pci_mmcfg_read,
112 .write = pci_mmcfg_write,
113};
114
115static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
116{
117 void __iomem *addr;
118 u32 size;
119
120 size = (cfg->end_bus_number + 1) << 20;
121 addr = ioremap_nocache(cfg->address, size);
122 if (addr) {
123 printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
124 cfg->address, cfg->address + size - 1);
125 }
126 return addr;
127}
128
129int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
130 unsigned int devfn)
131{
132 return pci_dev_base(seg, bus, devfn) != NULL;
133}
134
135int __init pci_mmcfg_arch_init(void)
136{
137 int i;
138 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) *
139 pci_mmcfg_config_num, GFP_KERNEL);
140 if (pci_mmcfg_virt == NULL) {
141 printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
142 return 0;
143 }
144
145 for (i = 0; i < pci_mmcfg_config_num; ++i) {
146 pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
147 pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]);
148 if (!pci_mmcfg_virt[i].virt) {
149 printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
150 "segment %d\n",
151 pci_mmcfg_config[i].pci_segment);
152 return 0;
153 }
154 }
155 raw_pci_ops = &pci_mmcfg;
156 return 1;
157}
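
Both the 32-bit and the 64-bit variant locate a function's registers at a fixed offset inside the MMCONFIG aperture: 1 MiB per bus and 4 KiB per device/function, which is exactly the shift-and-or arithmetic in pci_exp_set_dev_base() and pci_dev_base() above. A minimal, self-contained sketch of that offset calculation (illustrative only, not part of the patch; the helper name is made up):

#include <stdint.h>
#include <stdio.h>

/* ECAM-style offset: bus in bits 27:20, devfn in bits 19:12, register in bits 11:0. */
static uint64_t ecam_offset(uint8_t bus, uint8_t devfn, uint16_t reg)
{
        return ((uint64_t)bus << 20) | ((uint64_t)devfn << 12) | (reg & 0xfff);
}

int main(void)
{
        /* Bus 3, device 2, function 1 (devfn 0x11), register 0x10: prints 0x311010. */
        printf("0x%llx\n", (unsigned long long)ecam_offset(3, (2 << 3) | 1, 0x10));
        return 0;
}
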
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c
new file mode 100644
index 000000000000..f5f165f69e0c
--- /dev/null
+++ b/arch/x86/pci/numa.c
@@ -0,0 +1,135 @@
1/*
2 * numa.c - Low-level PCI access for NUMA-Q machines
3 */
4
5#include <linux/pci.h>
6#include <linux/init.h>
7#include <linux/nodemask.h>
8#include "pci.h"
9
10#define BUS2QUAD(global) (mp_bus_id_to_node[global])
11#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
12#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
13
14#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
15 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
16
17static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
18 unsigned int devfn, int reg, int len, u32 *value)
19{
20 unsigned long flags;
21
22 if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
23 return -EINVAL;
24
25 spin_lock_irqsave(&pci_config_lock, flags);
26
27 outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus));
28
29 switch (len) {
30 case 1:
31 *value = inb_quad(0xCFC + (reg & 3), BUS2QUAD(bus));
32 break;
33 case 2:
34 *value = inw_quad(0xCFC + (reg & 2), BUS2QUAD(bus));
35 break;
36 case 4:
37 *value = inl_quad(0xCFC, BUS2QUAD(bus));
38 break;
39 }
40
41 spin_unlock_irqrestore(&pci_config_lock, flags);
42
43 return 0;
44}
45
46static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
47 unsigned int devfn, int reg, int len, u32 value)
48{
49 unsigned long flags;
50
51 if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
52 return -EINVAL;
53
54 spin_lock_irqsave(&pci_config_lock, flags);
55
56 outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus));
57
58 switch (len) {
59 case 1:
60 outb_quad((u8)value, 0xCFC + (reg & 3), BUS2QUAD(bus));
61 break;
62 case 2:
63 outw_quad((u16)value, 0xCFC + (reg & 2), BUS2QUAD(bus));
64 break;
65 case 4:
66 outl_quad((u32)value, 0xCFC, BUS2QUAD(bus));
67 break;
68 }
69
70 spin_unlock_irqrestore(&pci_config_lock, flags);
71
72 return 0;
73}
74
75#undef PCI_CONF1_MQ_ADDRESS
76
77static struct pci_raw_ops pci_direct_conf1_mq = {
78 .read = pci_conf1_mq_read,
79 .write = pci_conf1_mq_write
80};
81
82
83static void __devinit pci_fixup_i450nx(struct pci_dev *d)
84{
85 /*
86 * i450NX -- Find and scan all secondary buses on all PXB's.
87 */
88 int pxb, reg;
89 u8 busno, suba, subb;
90 int quad = BUS2QUAD(d->bus->number);
91
92 printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d));
93 reg = 0xd0;
94 for(pxb=0; pxb<2; pxb++) {
95 pci_read_config_byte(d, reg++, &busno);
96 pci_read_config_byte(d, reg++, &suba);
97 pci_read_config_byte(d, reg++, &subb);
98 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
99 if (busno) {
100 /* Bus A */
101 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
102 }
103 if (suba < subb) {
104 /* Bus B */
105 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, suba+1));
106 }
107 }
108 pcibios_last_bus = -1;
109}
110DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
111
112static int __init pci_numa_init(void)
113{
114 int quad;
115
116 raw_pci_ops = &pci_direct_conf1_mq;
117
118 if (pcibios_scanned++)
119 return 0;
120
121 pci_root_bus = pcibios_scan_root(0);
122 if (pci_root_bus)
123 pci_bus_add_devices(pci_root_bus);
124 if (num_online_nodes() > 1)
125 for_each_online_node(quad) {
126 if (quad == 0)
127 continue;
128 printk("Scanning PCI bus %d for quad %d\n",
129 QUADLOCAL2BUS(quad,0), quad);
130 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, 0));
131 }
132 return 0;
133}
134
135subsys_initcall(pci_numa_init);
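
PCI_CONF1_MQ_ADDRESS above builds the standard type 1 configuration address that is written to port 0xCF8, with the quad-local bus number substituted via BUS2LOCAL(). A small sketch of the same encoding, using the bus number directly and made-up example values (illustration only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Type 1 config address: bit 31 enable, bits 23:16 bus, bits 15:8 devfn, bits 7:2 dword register. */
static uint32_t conf1_addr(uint8_t bus, uint8_t devfn, uint8_t reg)
{
        return 0x80000000u | ((uint32_t)bus << 16) | ((uint32_t)devfn << 8) | (reg & ~3u);
}

int main(void)
{
        /* Bus 0, device 2, function 0, register 0x04: prints 0x80001004. */
        printf("0x%08x\n", conf1_addr(0, (2 << 3) | 0, 0x04));
        return 0;
}
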
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
new file mode 100644
index 000000000000..10ac8c316c46
--- /dev/null
+++ b/arch/x86/pci/pcbios.c
@@ -0,0 +1,492 @@
1/*
2 * BIOS32 and PCI BIOS handling.
3 */
4
5#include <linux/pci.h>
6#include <linux/init.h>
7#include <linux/module.h>
8#include <linux/uaccess.h>
9#include "pci.h"
10#include "pci-functions.h"
11
12
13/* BIOS32 signature: "_32_" */
14#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
15
16/* PCI signature: "PCI " */
17#define PCI_SIGNATURE (('P' << 0) + ('C' << 8) + ('I' << 16) + (' ' << 24))
18
19/* PCI service signature: "$PCI" */
20#define PCI_SERVICE (('$' << 0) + ('P' << 8) + ('C' << 16) + ('I' << 24))
21
22/* PCI BIOS hardware mechanism flags */
23#define PCIBIOS_HW_TYPE1 0x01
24#define PCIBIOS_HW_TYPE2 0x02
25#define PCIBIOS_HW_TYPE1_SPEC 0x10
26#define PCIBIOS_HW_TYPE2_SPEC 0x20
27
28/*
29 * This is the standard structure used to identify the entry point
30 * to the BIOS32 Service Directory, as documented in
31 * Standard BIOS 32-bit Service Directory Proposal
32 * Revision 0.4 May 24, 1993
33 * Phoenix Technologies Ltd.
34 * Norwood, MA
35 * and the PCI BIOS specification.
36 */
37
38union bios32 {
39 struct {
40 unsigned long signature; /* _32_ */
41 unsigned long entry; /* 32 bit physical address */
42 unsigned char revision; /* Revision level, 0 */
43 unsigned char length; /* Length in paragraphs should be 01 */
44 unsigned char checksum; /* All bytes must add up to zero */
45 unsigned char reserved[5]; /* Must be zero */
46 } fields;
47 char chars[16];
48};
49
50/*
51 * Physical address of the service directory. I don't know if we're
52 * allowed to have more than one of these or not, so just in case
53 * we'll make pcibios_present() take a memory start parameter and store
54 * the array there.
55 */
56
57static struct {
58 unsigned long address;
59 unsigned short segment;
60} bios32_indirect = { 0, __KERNEL_CS };
61
62/*
63 * Returns the entry point for the given service, NULL on error
64 */
65
66static unsigned long bios32_service(unsigned long service)
67{
68 unsigned char return_code; /* %al */
69 unsigned long address; /* %ebx */
70 unsigned long length; /* %ecx */
71 unsigned long entry; /* %edx */
72 unsigned long flags;
73
74 local_irq_save(flags);
75 __asm__("lcall *(%%edi); cld"
76 : "=a" (return_code),
77 "=b" (address),
78 "=c" (length),
79 "=d" (entry)
80 : "0" (service),
81 "1" (0),
82 "D" (&bios32_indirect));
83 local_irq_restore(flags);
84
85 switch (return_code) {
86 case 0:
87 return address + entry;
88 case 0x80: /* Not present */
89 printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service);
90 return 0;
91 default: /* Shouldn't happen */
92 printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n",
93 service, return_code);
94 return 0;
95 }
96}
97
98static struct {
99 unsigned long address;
100 unsigned short segment;
101} pci_indirect = { 0, __KERNEL_CS };
102
103static int pci_bios_present;
104
105static int __devinit check_pcibios(void)
106{
107 u32 signature, eax, ebx, ecx;
108 u8 status, major_ver, minor_ver, hw_mech;
109 unsigned long flags, pcibios_entry;
110
111 if ((pcibios_entry = bios32_service(PCI_SERVICE))) {
112 pci_indirect.address = pcibios_entry + PAGE_OFFSET;
113
114 local_irq_save(flags);
115 __asm__(
116 "lcall *(%%edi); cld\n\t"
117 "jc 1f\n\t"
118 "xor %%ah, %%ah\n"
119 "1:"
120 : "=d" (signature),
121 "=a" (eax),
122 "=b" (ebx),
123 "=c" (ecx)
124 : "1" (PCIBIOS_PCI_BIOS_PRESENT),
125 "D" (&pci_indirect)
126 : "memory");
127 local_irq_restore(flags);
128
129 status = (eax >> 8) & 0xff;
130 hw_mech = eax & 0xff;
131 major_ver = (ebx >> 8) & 0xff;
132 minor_ver = ebx & 0xff;
133 if (pcibios_last_bus < 0)
134 pcibios_last_bus = ecx & 0xff;
135 DBG("PCI: BIOS probe returned s=%02x hw=%02x ver=%02x.%02x l=%02x\n",
136 status, hw_mech, major_ver, minor_ver, pcibios_last_bus);
137 if (status || signature != PCI_SIGNATURE) {
138 printk (KERN_ERR "PCI: BIOS BUG #%x[%08x] found\n",
139 status, signature);
140 return 0;
141 }
142 printk(KERN_INFO "PCI: PCI BIOS revision %x.%02x entry at 0x%lx, last bus=%d\n",
143 major_ver, minor_ver, pcibios_entry, pcibios_last_bus);
144#ifdef CONFIG_PCI_DIRECT
145 if (!(hw_mech & PCIBIOS_HW_TYPE1))
146 pci_probe &= ~PCI_PROBE_CONF1;
147 if (!(hw_mech & PCIBIOS_HW_TYPE2))
148 pci_probe &= ~PCI_PROBE_CONF2;
149#endif
150 return 1;
151 }
152 return 0;
153}
154
155static int __devinit pci_bios_find_device (unsigned short vendor, unsigned short device_id,
156 unsigned short index, unsigned char *bus, unsigned char *device_fn)
157{
158 unsigned short bx;
159 unsigned short ret;
160
161 __asm__("lcall *(%%edi); cld\n\t"
162 "jc 1f\n\t"
163 "xor %%ah, %%ah\n"
164 "1:"
165 : "=b" (bx),
166 "=a" (ret)
167 : "1" (PCIBIOS_FIND_PCI_DEVICE),
168 "c" (device_id),
169 "d" (vendor),
170 "S" ((int) index),
171 "D" (&pci_indirect));
172 *bus = (bx >> 8) & 0xff;
173 *device_fn = bx & 0xff;
174 return (int) (ret & 0xff00) >> 8;
175}
176
177static int pci_bios_read(unsigned int seg, unsigned int bus,
178 unsigned int devfn, int reg, int len, u32 *value)
179{
180 unsigned long result = 0;
181 unsigned long flags;
182 unsigned long bx = (bus << 8) | devfn;
183
184 if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
185 return -EINVAL;
186
187 spin_lock_irqsave(&pci_config_lock, flags);
188
189 switch (len) {
190 case 1:
191 __asm__("lcall *(%%esi); cld\n\t"
192 "jc 1f\n\t"
193 "xor %%ah, %%ah\n"
194 "1:"
195 : "=c" (*value),
196 "=a" (result)
197 : "1" (PCIBIOS_READ_CONFIG_BYTE),
198 "b" (bx),
199 "D" ((long)reg),
200 "S" (&pci_indirect));
201 break;
202 case 2:
203 __asm__("lcall *(%%esi); cld\n\t"
204 "jc 1f\n\t"
205 "xor %%ah, %%ah\n"
206 "1:"
207 : "=c" (*value),
208 "=a" (result)
209 : "1" (PCIBIOS_READ_CONFIG_WORD),
210 "b" (bx),
211 "D" ((long)reg),
212 "S" (&pci_indirect));
213 break;
214 case 4:
215 __asm__("lcall *(%%esi); cld\n\t"
216 "jc 1f\n\t"
217 "xor %%ah, %%ah\n"
218 "1:"
219 : "=c" (*value),
220 "=a" (result)
221 : "1" (PCIBIOS_READ_CONFIG_DWORD),
222 "b" (bx),
223 "D" ((long)reg),
224 "S" (&pci_indirect));
225 break;
226 }
227
228 spin_unlock_irqrestore(&pci_config_lock, flags);
229
230 return (int)((result & 0xff00) >> 8);
231}
232
233static int pci_bios_write(unsigned int seg, unsigned int bus,
234 unsigned int devfn, int reg, int len, u32 value)
235{
236 unsigned long result = 0;
237 unsigned long flags;
238 unsigned long bx = (bus << 8) | devfn;
239
240 if ((bus > 255) || (devfn > 255) || (reg > 255))
241 return -EINVAL;
242
243 spin_lock_irqsave(&pci_config_lock, flags);
244
245 switch (len) {
246 case 1:
247 __asm__("lcall *(%%esi); cld\n\t"
248 "jc 1f\n\t"
249 "xor %%ah, %%ah\n"
250 "1:"
251 : "=a" (result)
252 : "0" (PCIBIOS_WRITE_CONFIG_BYTE),
253 "c" (value),
254 "b" (bx),
255 "D" ((long)reg),
256 "S" (&pci_indirect));
257 break;
258 case 2:
259 __asm__("lcall *(%%esi); cld\n\t"
260 "jc 1f\n\t"
261 "xor %%ah, %%ah\n"
262 "1:"
263 : "=a" (result)
264 : "0" (PCIBIOS_WRITE_CONFIG_WORD),
265 "c" (value),
266 "b" (bx),
267 "D" ((long)reg),
268 "S" (&pci_indirect));
269 break;
270 case 4:
271 __asm__("lcall *(%%esi); cld\n\t"
272 "jc 1f\n\t"
273 "xor %%ah, %%ah\n"
274 "1:"
275 : "=a" (result)
276 : "0" (PCIBIOS_WRITE_CONFIG_DWORD),
277 "c" (value),
278 "b" (bx),
279 "D" ((long)reg),
280 "S" (&pci_indirect));
281 break;
282 }
283
284 spin_unlock_irqrestore(&pci_config_lock, flags);
285
286 return (int)((result & 0xff00) >> 8);
287}
288
289
290/*
291 * Function table for BIOS32 access
292 */
293
294static struct pci_raw_ops pci_bios_access = {
295 .read = pci_bios_read,
296 .write = pci_bios_write
297};
298
299/*
300 * Try to find PCI BIOS.
301 */
302
303static struct pci_raw_ops * __devinit pci_find_bios(void)
304{
305 union bios32 *check;
306 unsigned char sum;
307 int i, length;
308
309 /*
310 * Follow the standard procedure for locating the BIOS32 Service
311 * directory by scanning the permissible address range from
312 * 0xe0000 through 0xfffff for a valid BIOS32 structure.
313 */
314
315 for (check = (union bios32 *) __va(0xe0000);
316 check <= (union bios32 *) __va(0xffff0);
317 ++check) {
318 long sig;
319 if (probe_kernel_address(&check->fields.signature, sig))
320 continue;
321
322 if (check->fields.signature != BIOS32_SIGNATURE)
323 continue;
324 length = check->fields.length * 16;
325 if (!length)
326 continue;
327 sum = 0;
328 for (i = 0; i < length ; ++i)
329 sum += check->chars[i];
330 if (sum != 0)
331 continue;
332 if (check->fields.revision != 0) {
333 printk("PCI: unsupported BIOS32 revision %d at 0x%p\n",
334 check->fields.revision, check);
335 continue;
336 }
337 DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check);
338 if (check->fields.entry >= 0x100000) {
339 printk("PCI: BIOS32 entry (0x%p) in high memory, "
340 "cannot use.\n", check);
341 return NULL;
342 } else {
343 unsigned long bios32_entry = check->fields.entry;
344 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
345 bios32_entry);
346 bios32_indirect.address = bios32_entry + PAGE_OFFSET;
347 if (check_pcibios())
348 return &pci_bios_access;
349 }
350 break; /* Hopefully more than one BIOS32 cannot happen... */
351 }
352
353 return NULL;
354}
355
356/*
357 * Sort the device list according to PCI BIOS. Nasty hack, but since some
358 * fool forgot to define the `correct' device order in the PCI BIOS specs
359 * and we want to be (possibly bug-to-bug ;-]) compatible with older kernels
360 * which used BIOS ordering, we are bound to do this...
361 */
362
363void __devinit pcibios_sort(void)
364{
365 LIST_HEAD(sorted_devices);
366 struct list_head *ln;
367 struct pci_dev *dev, *d;
368 int idx, found;
369 unsigned char bus, devfn;
370
371 DBG("PCI: Sorting device list...\n");
372 while (!list_empty(&pci_devices)) {
373 ln = pci_devices.next;
374 dev = pci_dev_g(ln);
375 idx = found = 0;
376 while (pci_bios_find_device(dev->vendor, dev->device, idx, &bus, &devfn) == PCIBIOS_SUCCESSFUL) {
377 idx++;
378 list_for_each(ln, &pci_devices) {
379 d = pci_dev_g(ln);
380 if (d->bus->number == bus && d->devfn == devfn) {
381 list_move_tail(&d->global_list, &sorted_devices);
382 if (d == dev)
383 found = 1;
384 break;
385 }
386 }
387 if (ln == &pci_devices) {
388 printk(KERN_WARNING "PCI: BIOS reporting unknown device %02x:%02x\n", bus, devfn);
389 /*
390 * We must not continue scanning as several buggy BIOSes
391 * return garbage after the last device. Grr.
392 */
393 break;
394 }
395 }
396 if (!found) {
397 printk(KERN_WARNING "PCI: Device %s not found by BIOS\n",
398 pci_name(dev));
399 list_move_tail(&dev->global_list, &sorted_devices);
400 }
401 }
402 list_splice(&sorted_devices, &pci_devices);
403}
404
405/*
406 * BIOS Functions for IRQ Routing
407 */
408
409struct irq_routing_options {
410 u16 size;
411 struct irq_info *table;
412 u16 segment;
413} __attribute__((packed));
414
415struct irq_routing_table * pcibios_get_irq_routing_table(void)
416{
417 struct irq_routing_options opt;
418 struct irq_routing_table *rt = NULL;
419 int ret, map;
420 unsigned long page;
421
422 if (!pci_bios_present)
423 return NULL;
424 page = __get_free_page(GFP_KERNEL);
425 if (!page)
426 return NULL;
427 opt.table = (struct irq_info *) page;
428 opt.size = PAGE_SIZE;
429 opt.segment = __KERNEL_DS;
430
431 DBG("PCI: Fetching IRQ routing table... ");
432 __asm__("push %%es\n\t"
433 "push %%ds\n\t"
434 "pop %%es\n\t"
435 "lcall *(%%esi); cld\n\t"
436 "pop %%es\n\t"
437 "jc 1f\n\t"
438 "xor %%ah, %%ah\n"
439 "1:"
440 : "=a" (ret),
441 "=b" (map),
442 "=m" (opt)
443 : "0" (PCIBIOS_GET_ROUTING_OPTIONS),
444 "1" (0),
445 "D" ((long) &opt),
446 "S" (&pci_indirect),
447 "m" (opt)
448 : "memory");
449 DBG("OK ret=%d, size=%d, map=%x\n", ret, opt.size, map);
450 if (ret & 0xff00)
451 printk(KERN_ERR "PCI: Error %02x when fetching IRQ routing table.\n", (ret >> 8) & 0xff);
452 else if (opt.size) {
453 rt = kmalloc(sizeof(struct irq_routing_table) + opt.size, GFP_KERNEL);
454 if (rt) {
455 memset(rt, 0, sizeof(struct irq_routing_table));
456 rt->size = opt.size + sizeof(struct irq_routing_table);
457 rt->exclusive_irqs = map;
458 memcpy(rt->slots, (void *) page, opt.size);
459 printk(KERN_INFO "PCI: Using BIOS Interrupt Routing Table\n");
460 }
461 }
462 free_page(page);
463 return rt;
464}
465EXPORT_SYMBOL(pcibios_get_irq_routing_table);
466
467int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq)
468{
469 int ret;
470
471 __asm__("lcall *(%%esi); cld\n\t"
472 "jc 1f\n\t"
473 "xor %%ah, %%ah\n"
474 "1:"
475 : "=a" (ret)
476 : "0" (PCIBIOS_SET_PCI_HW_INT),
477 "b" ((dev->bus->number << 8) | dev->devfn),
478 "c" ((irq << 8) | (pin + 10)),
479 "S" (&pci_indirect));
480 return !(ret & 0xff00);
481}
482EXPORT_SYMBOL(pcibios_set_irq_routing);
483
484void __init pci_pcbios_init(void)
485{
486 if ((pci_probe & PCI_PROBE_BIOS)
487 && ((raw_pci_ops = pci_find_bios()))) {
488 pci_probe |= PCI_BIOS_SORT;
489 pci_bios_present = 1;
490 }
491}
492
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
new file mode 100644
index 000000000000..8c66f275756f
--- /dev/null
+++ b/arch/x86/pci/pci.h
@@ -0,0 +1,149 @@
1/*
2 * Low-Level PCI Access for i386 machines.
3 *
4 * (c) 1999 Martin Mares <mj@ucw.cz>
5 */
6
7#undef DEBUG
8
9#ifdef DEBUG
10#define DBG(x...) printk(x)
11#else
12#define DBG(x...)
13#endif
14
15#define PCI_PROBE_BIOS 0x0001
16#define PCI_PROBE_CONF1 0x0002
17#define PCI_PROBE_CONF2 0x0004
18#define PCI_PROBE_MMCONF 0x0008
19#define PCI_PROBE_MASK 0x000f
20#define PCI_PROBE_NOEARLY 0x0010
21
22#define PCI_NO_SORT 0x0100
23#define PCI_BIOS_SORT 0x0200
24#define PCI_NO_CHECKS 0x0400
25#define PCI_USE_PIRQ_MASK 0x0800
26#define PCI_ASSIGN_ROMS 0x1000
27#define PCI_BIOS_IRQ_SCAN 0x2000
28#define PCI_ASSIGN_ALL_BUSSES 0x4000
29
30extern unsigned int pci_probe;
31extern unsigned long pirq_table_addr;
32
33enum pci_bf_sort_state {
34 pci_bf_sort_default,
35 pci_force_nobf,
36 pci_force_bf,
37 pci_dmi_bf,
38};
39
40/* pci-i386.c */
41
42extern unsigned int pcibios_max_latency;
43
44void pcibios_resource_survey(void);
45int pcibios_enable_resources(struct pci_dev *, int);
46
47/* pci-pc.c */
48
49extern int pcibios_last_bus;
50extern struct pci_bus *pci_root_bus;
51extern struct pci_ops pci_root_ops;
52
53/* pci-irq.c */
54
55struct irq_info {
56 u8 bus, devfn; /* Bus, device and function */
57 struct {
58 u8 link; /* IRQ line ID, chipset dependent, 0=not routed */
59 u16 bitmap; /* Available IRQs */
60 } __attribute__((packed)) irq[4];
61 u8 slot; /* Slot number, 0=onboard */
62 u8 rfu;
63} __attribute__((packed));
64
65struct irq_routing_table {
66 u32 signature; /* PIRQ_SIGNATURE should be here */
67 u16 version; /* PIRQ_VERSION */
68 u16 size; /* Table size in bytes */
69 u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */
70 u16 exclusive_irqs; /* IRQs devoted exclusively to PCI usage */
71 u16 rtr_vendor, rtr_device; /* Vendor and device ID of interrupt router */
72 u32 miniport_data; /* Crap */
73 u8 rfu[11];
74 u8 checksum; /* Modulo 256 checksum must give zero */
75 struct irq_info slots[0];
76} __attribute__((packed));
77
78extern unsigned int pcibios_irq_mask;
79
80extern int pcibios_scanned;
81extern spinlock_t pci_config_lock;
82
83extern int (*pcibios_enable_irq)(struct pci_dev *dev);
84extern void (*pcibios_disable_irq)(struct pci_dev *dev);
85
86extern int pci_conf1_write(unsigned int seg, unsigned int bus,
87 unsigned int devfn, int reg, int len, u32 value);
88extern int pci_conf1_read(unsigned int seg, unsigned int bus,
89 unsigned int devfn, int reg, int len, u32 *value);
90
91extern int pci_direct_probe(void);
92extern void pci_direct_init(int type);
93extern void pci_pcbios_init(void);
94extern void pci_mmcfg_init(int type);
95extern void pcibios_sort(void);
96
97/* pci-mmconfig.c */
98
99/* Verify the first 16 busses. We assume that systems with more busses
100 get MCFG right. */
101#define PCI_MMCFG_MAX_CHECK_BUS 16
102extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
103
104extern int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
105 unsigned int devfn);
106extern int __init pci_mmcfg_arch_init(void);
107
108/*
109 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
110 * on their northbridge except through the %eax register. As such, you MUST
111 * NOT use normal IOMEM accesses; you need to use only the magic mmio-config
112 * accessor functions.
113 * In fact just use pci_config_*, nothing else please.
114 */
115static inline unsigned char mmio_config_readb(void __iomem *pos)
116{
117 u8 val;
118 asm volatile("movb (%1),%%al" : "=a" (val) : "r" (pos));
119 return val;
120}
121
122static inline unsigned short mmio_config_readw(void __iomem *pos)
123{
124 u16 val;
125 asm volatile("movw (%1),%%ax" : "=a" (val) : "r" (pos));
126 return val;
127}
128
129static inline unsigned int mmio_config_readl(void __iomem *pos)
130{
131 u32 val;
132 asm volatile("movl (%1),%%eax" : "=a" (val) : "r" (pos));
133 return val;
134}
135
136static inline void mmio_config_writeb(void __iomem *pos, u8 val)
137{
138 asm volatile("movb %%al,(%1)" :: "a" (val), "r" (pos) : "memory");
139}
140
141static inline void mmio_config_writew(void __iomem *pos, u16 val)
142{
143 asm volatile("movw %%ax,(%1)" :: "a" (val), "r" (pos) : "memory");
144}
145
146static inline void mmio_config_writel(void __iomem *pos, u32 val)
147{
148 asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory");
149}
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
new file mode 100644
index 000000000000..8ecb1c722594
--- /dev/null
+++ b/arch/x86/pci/visws.c
@@ -0,0 +1,111 @@
1/*
2 * Low-Level PCI Support for SGI Visual Workstation
3 *
4 * (c) 1999--2000 Martin Mares <mj@ucw.cz>
5 */
6
7#include <linux/kernel.h>
8#include <linux/pci.h>
9#include <linux/init.h>
10
11#include "cobalt.h"
12#include "lithium.h"
13
14#include "pci.h"
15
16
17extern struct pci_raw_ops pci_direct_conf1;
18
19static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
20static void pci_visws_disable_irq(struct pci_dev *dev) { }
21
22int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq;
23void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq;
24
25void __init pcibios_penalize_isa_irq(int irq, int active) {}
26
27
28unsigned int pci_bus0, pci_bus1;
29
30static inline u8 bridge_swizzle(u8 pin, u8 slot)
31{
32 return (((pin - 1) + slot) % 4) + 1;
33}
34
35static u8 __init visws_swizzle(struct pci_dev *dev, u8 *pinp)
36{
37 u8 pin = *pinp;
38
39 while (dev->bus->self) { /* Move up the chain of bridges. */
40 pin = bridge_swizzle(pin, PCI_SLOT(dev->devfn));
41 dev = dev->bus->self;
42 }
43 *pinp = pin;
44
45 return PCI_SLOT(dev->devfn);
46}
47
48static int __init visws_map_irq(struct pci_dev *dev, u8 slot, u8 pin)
49{
50 int irq, bus = dev->bus->number;
51
52 pin--;
53
54 /* Nothing useful at PIIX4 pin 1 */
55 if (bus == pci_bus0 && slot == 4 && pin == 0)
56 return -1;
57
58 /* PIIX4 USB is on Bus 0, Slot 4, Line 3 */
59 if (bus == pci_bus0 && slot == 4 && pin == 3) {
60 irq = CO_IRQ(CO_APIC_PIIX4_USB);
61 goto out;
62 }
63
64 /* First pin spread down 1 APIC entry per slot */
65 if (pin == 0) {
66 irq = CO_IRQ((bus == pci_bus0 ? CO_APIC_PCIB_BASE0 :
67 CO_APIC_PCIA_BASE0) + slot);
68 goto out;
69 }
70
71 /* lines 1,2,3 from any slot are shared in this twirly pattern */
72 if (bus == pci_bus1) {
73 /* lines 1-3 from devices 0-1 rotate over 2 apic entries */
74 irq = CO_IRQ(CO_APIC_PCIA_BASE123 + ((slot + (pin - 1)) % 2));
75 } else { /* bus == pci_bus0 */
76 /* lines 1-3 from devices 0-3 rotate over 3 apic entries */
77 if (slot == 0)
78 slot = 3; /* same pattern */
79 irq = CO_IRQ(CO_APIC_PCIA_BASE123 + ((3 - slot) + (pin - 1) % 3));
80 }
81out:
82 printk(KERN_DEBUG "PCI: Bus %d Slot %d Line %d -> IRQ %d\n", bus, slot, pin, irq);
83 return irq;
84}
85
86void __init pcibios_update_irq(struct pci_dev *dev, int irq)
87{
88 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
89}
90
91static int __init pcibios_init(void)
92{
93 /* The VISWS supports configuration access type 1 only */
94 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
95 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
96
97 pci_bus0 = li_pcib_read16(LI_PCI_BUSNUM) & 0xff;
98 pci_bus1 = li_pcia_read16(LI_PCI_BUSNUM) & 0xff;
99
100 printk(KERN_INFO "PCI: Lithium bridge A bus: %u, "
101 "bridge B (PIIX4) bus: %u\n", pci_bus1, pci_bus0);
102
103 raw_pci_ops = &pci_direct_conf1;
104 pci_scan_bus_with_sysdata(pci_bus0);
105 pci_scan_bus_with_sysdata(pci_bus1);
106 pci_fixup_irqs(visws_swizzle, visws_map_irq);
107 pcibios_resource_survey();
108 return 0;
109}
110
111subsys_initcall(pcibios_init);
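
bridge_swizzle() above is the usual PCI-to-PCI bridge interrupt rotation: each bridge level rotates the INTx pin by the downstream device's slot number before the pin reaches the interrupt router. A standalone sketch of how a pin propagates through bridges (hypothetical slot numbers, illustration only):

#include <stdio.h>

/* Same rotation as bridge_swizzle(): pins are numbered 1..4 for INTA..INTD. */
static unsigned char swizzle(unsigned char pin, unsigned char slot)
{
        return (((pin - 1) + slot) % 4) + 1;
}

int main(void)
{
        /* INTA (1) of a device in slot 3 maps to INTD (4) after one bridge,
           and to INTC (3) after a second slot-3 bridge: prints "4 3". */
        printf("%u %u\n", swizzle(1, 3), swizzle(swizzle(1, 3), 3));
        return 0;
}
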
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
new file mode 100644
index 000000000000..d764ec950065
--- /dev/null
+++ b/arch/x86/power/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_PM) += cpu.o
2obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
new file mode 100644
index 000000000000..998fd3ec0d68
--- /dev/null
+++ b/arch/x86/power/cpu.c
@@ -0,0 +1,133 @@
1/*
2 * Suspend support specific for i386.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/module.h>
11#include <linux/suspend.h>
12#include <asm/mtrr.h>
13#include <asm/mce.h>
14
15static struct saved_context saved_context;
16
17unsigned long saved_context_ebx;
18unsigned long saved_context_esp, saved_context_ebp;
19unsigned long saved_context_esi, saved_context_edi;
20unsigned long saved_context_eflags;
21
22void __save_processor_state(struct saved_context *ctxt)
23{
24 mtrr_save_fixed_ranges(NULL);
25 kernel_fpu_begin();
26
27 /*
28 * descriptor tables
29 */
30 store_gdt(&ctxt->gdt);
31 store_idt(&ctxt->idt);
32 store_tr(ctxt->tr);
33
34 /*
35 * segment registers
36 */
37 savesegment(es, ctxt->es);
38 savesegment(fs, ctxt->fs);
39 savesegment(gs, ctxt->gs);
40 savesegment(ss, ctxt->ss);
41
42 /*
43 * control registers
44 */
45 ctxt->cr0 = read_cr0();
46 ctxt->cr2 = read_cr2();
47 ctxt->cr3 = read_cr3();
48 ctxt->cr4 = read_cr4();
49}
50
51void save_processor_state(void)
52{
53 __save_processor_state(&saved_context);
54}
55
56static void do_fpu_end(void)
57{
58 /*
59 * Restore FPU regs if necessary.
60 */
61 kernel_fpu_end();
62}
63
64static void fix_processor_context(void)
65{
66 int cpu = smp_processor_id();
67 struct tss_struct * t = &per_cpu(init_tss, cpu);
68
69 set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */
70
71 load_TR_desc(); /* This does ltr */
72 load_LDT(&current->active_mm->context); /* This does lldt */
73
74 /*
75 * Now maybe reload the debug registers
76 */
77 if (current->thread.debugreg[7]){
78 set_debugreg(current->thread.debugreg[0], 0);
79 set_debugreg(current->thread.debugreg[1], 1);
80 set_debugreg(current->thread.debugreg[2], 2);
81 set_debugreg(current->thread.debugreg[3], 3);
82 /* no 4 and 5 */
83 set_debugreg(current->thread.debugreg[6], 6);
84 set_debugreg(current->thread.debugreg[7], 7);
85 }
86
87}
88
89void __restore_processor_state(struct saved_context *ctxt)
90{
91 /*
92 * control registers
93 */
94 write_cr4(ctxt->cr4);
95 write_cr3(ctxt->cr3);
96 write_cr2(ctxt->cr2);
97 write_cr0(ctxt->cr0);
98
99 /*
100 * now restore the descriptor tables to their proper values
101 * ltr is done in fix_processor_context().
102 */
103 load_gdt(&ctxt->gdt);
104 load_idt(&ctxt->idt);
105
106 /*
107 * segment registers
108 */
109 loadsegment(es, ctxt->es);
110 loadsegment(fs, ctxt->fs);
111 loadsegment(gs, ctxt->gs);
112 loadsegment(ss, ctxt->ss);
113
114 /*
115 * sysenter MSRs
116 */
117 if (boot_cpu_has(X86_FEATURE_SEP))
118 enable_sep_cpu();
119
120 fix_processor_context();
121 do_fpu_end();
122 mtrr_ap_init();
123 mcheck_init(&boot_cpu_data);
124}
125
126void restore_processor_state(void)
127{
128 __restore_processor_state(&saved_context);
129}
130
131/* Needed by apm.c */
132EXPORT_SYMBOL(save_processor_state);
133EXPORT_SYMBOL(restore_processor_state);
diff --git a/arch/x86/power/suspend.c b/arch/x86/power/suspend.c
new file mode 100644
index 000000000000..a0020b913f31
--- /dev/null
+++ b/arch/x86/power/suspend.c
@@ -0,0 +1,172 @@
1/*
2 * Suspend support specific for i386 - temporary page tables
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 */
8
9#include <linux/suspend.h>
10#include <linux/bootmem.h>
11
12#include <asm/system.h>
13#include <asm/page.h>
14#include <asm/pgtable.h>
15
16/* Defined in arch/i386/power/swsusp.S */
17extern int restore_image(void);
18
19/* References to section boundaries */
20extern const void __nosave_begin, __nosave_end;
21
22/* Pointer to the temporary resume page tables */
23pgd_t *resume_pg_dir;
24
25/* The following three functions are based on the analogous code in
26 * arch/i386/mm/init.c
27 */
28
29/*
30 * Create a middle page table on a resume-safe page and put a pointer to it in
31 * the given global directory entry. This only returns the gd entry
32 * in non-PAE compilation mode, since the middle layer is folded.
33 */
34static pmd_t *resume_one_md_table_init(pgd_t *pgd)
35{
36 pud_t *pud;
37 pmd_t *pmd_table;
38
39#ifdef CONFIG_X86_PAE
40 pmd_table = (pmd_t *)get_safe_page(GFP_ATOMIC);
41 if (!pmd_table)
42 return NULL;
43
44 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
45 pud = pud_offset(pgd, 0);
46
47 BUG_ON(pmd_table != pmd_offset(pud, 0));
48#else
49 pud = pud_offset(pgd, 0);
50 pmd_table = pmd_offset(pud, 0);
51#endif
52
53 return pmd_table;
54}
55
56/*
57 * Create a page table on a resume-safe page and place a pointer to it in
58 * a middle page directory entry.
59 */
60static pte_t *resume_one_page_table_init(pmd_t *pmd)
61{
62 if (pmd_none(*pmd)) {
63 pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC);
64 if (!page_table)
65 return NULL;
66
67 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
68
69 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
70
71 return page_table;
72 }
73
74 return pte_offset_kernel(pmd, 0);
75}
76
77/*
78 * This maps the physical memory to kernel virtual address space, a total
79 * of max_low_pfn pages, by creating page tables starting from address
80 * PAGE_OFFSET. The page tables are allocated out of resume-safe pages.
81 */
82static int resume_physical_mapping_init(pgd_t *pgd_base)
83{
84 unsigned long pfn;
85 pgd_t *pgd;
86 pmd_t *pmd;
87 pte_t *pte;
88 int pgd_idx, pmd_idx;
89
90 pgd_idx = pgd_index(PAGE_OFFSET);
91 pgd = pgd_base + pgd_idx;
92 pfn = 0;
93
94 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
95 pmd = resume_one_md_table_init(pgd);
96 if (!pmd)
97 return -ENOMEM;
98
99 if (pfn >= max_low_pfn)
100 continue;
101
102 for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) {
103 if (pfn >= max_low_pfn)
104 break;
105
106 /* Map with big pages if possible, otherwise create
107 * normal page tables.
108 * NOTE: We can mark everything as executable here
109 */
110 if (cpu_has_pse) {
111 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
112 pfn += PTRS_PER_PTE;
113 } else {
114 pte_t *max_pte;
115
116 pte = resume_one_page_table_init(pmd);
117 if (!pte)
118 return -ENOMEM;
119
120 max_pte = pte + PTRS_PER_PTE;
121 for (; pte < max_pte; pte++, pfn++) {
122 if (pfn >= max_low_pfn)
123 break;
124
125 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
126 }
127 }
128 }
129 }
130 return 0;
131}
132
133static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
134{
135#ifdef CONFIG_X86_PAE
136 int i;
137
138 /* Init entries of the first-level page table to the zero page */
139 for (i = 0; i < PTRS_PER_PGD; i++)
140 set_pgd(pg_dir + i,
141 __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
142#endif
143}
144
145int swsusp_arch_resume(void)
146{
147 int error;
148
149 resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
150 if (!resume_pg_dir)
151 return -ENOMEM;
152
153 resume_init_first_level_page_table(resume_pg_dir);
154 error = resume_physical_mapping_init(resume_pg_dir);
155 if (error)
156 return error;
157
158 /* We have got enough memory and from now on we cannot recover */
159 restore_image();
160 return 0;
161}
162
163/*
164 * pfn_is_nosave - check if given pfn is in the 'nosave' section
165 */
166
167int pfn_is_nosave(unsigned long pfn)
168{
169 unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
170 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
171 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
172}
diff --git a/arch/x86/power/swsusp.S b/arch/x86/power/swsusp.S
new file mode 100644
index 000000000000..53662e05b393
--- /dev/null
+++ b/arch/x86/power/swsusp.S
@@ -0,0 +1,78 @@
1.text
2
3/* Originally gcc generated, modified by hand
4 *
5 * This may not use any stack, nor any variable that is not "NoSave":
6 *
7 * It's rewriting one kernel image with another. What is the stack in the "old"
8 * image could very well be a data page in the "new" image, and overwriting
9 * your own stack under you is a bad idea.
10 */
11
12#include <linux/linkage.h>
13#include <asm/segment.h>
14#include <asm/page.h>
15#include <asm/asm-offsets.h>
16
17 .text
18
19ENTRY(swsusp_arch_suspend)
20
21 movl %esp, saved_context_esp
22 movl %ebx, saved_context_ebx
23 movl %ebp, saved_context_ebp
24 movl %esi, saved_context_esi
25 movl %edi, saved_context_edi
26 pushfl ; popl saved_context_eflags
27
28 call swsusp_save
29 ret
30
31ENTRY(restore_image)
32 movl resume_pg_dir, %ecx
33 subl $__PAGE_OFFSET, %ecx
34 movl %ecx, %cr3
35
36 movl restore_pblist, %edx
37 .p2align 4,,7
38
39copy_loop:
40 testl %edx, %edx
41 jz done
42
43 movl pbe_address(%edx), %esi
44 movl pbe_orig_address(%edx), %edi
45
46 movl $1024, %ecx
47 rep
48 movsl
49
50 movl pbe_next(%edx), %edx
51 jmp copy_loop
52 .p2align 4,,7
53
54done:
55 /* go back to the original page tables */
56 movl $swapper_pg_dir, %ecx
57 subl $__PAGE_OFFSET, %ecx
58 movl %ecx, %cr3
59 /* Flush TLB, including "global" things (vmalloc) */
60 movl mmu_cr4_features, %eax
61 movl %eax, %edx
62 andl $~(1<<7), %edx; # PGE
63 movl %edx, %cr4; # turn off PGE
64 movl %cr3, %ecx; # flush TLB
65 movl %ecx, %cr3
66 movl %eax, %cr4; # turn PGE back on
67
68 movl saved_context_esp, %esp
69 movl saved_context_ebp, %ebp
70 movl saved_context_ebx, %ebx
71 movl saved_context_esi, %esi
72 movl saved_context_edi, %edi
73
74 pushl saved_context_eflags ; popfl
75
76 xorl %eax, %eax
77
78 ret
diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore
new file mode 100644
index 000000000000..f8b69d84238e
--- /dev/null
+++ b/arch/x86/vdso/.gitignore
@@ -0,0 +1 @@
vdso.lds
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
new file mode 100644
index 000000000000..8d03de029d9b
--- /dev/null
+++ b/arch/x86/vdso/Makefile
@@ -0,0 +1,49 @@
1#
2# x86-64 vDSO.
3#
4
5# files to link into the vdso
6# vdso-start.o has to be first
7vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
8
9# files to link into kernel
10obj-y := vma.o vdso.o vdso-syms.o
11
12vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
13
14$(obj)/vdso.o: $(obj)/vdso.so
15
16targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o
17
18# The DSO images are built using a special linker script.
19quiet_cmd_syscall = SYSCALL $@
20 cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \
21 -Wl,-T,$(filter-out FORCE,$^) -o $@
22
23export CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
24
25vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \
26 $(call ld-option, -Wl$(comma)--hash-style=sysv) \
27 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
28SYSCFLAGS_vdso.so = $(vdso-flags)
29
30$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
31
32$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE
33 $(call if_changed,syscall)
34
35CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64
36
37$(obj)/vclock_gettime.o: CFLAGS = $(CFL)
38$(obj)/vgetcpu.o: CFLAGS = $(CFL)
39
40# We also create a special relocatable object that should mirror the symbol
41# table and layout of the linked DSO. With ld -R we can then refer to
42# these symbols in the kernel code rather than hand-coded addresses.
43extra-y += vdso-syms.o
44$(obj)/built-in.o: $(obj)/vdso-syms.o
45$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
46
47SYSCFLAGS_vdso-syms.o = -r -d
48$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE
49 $(call if_changed,syscall)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
new file mode 100644
index 000000000000..5b54cdfb2b07
--- /dev/null
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -0,0 +1,121 @@
1/*
2 * Copyright 2006 Andi Kleen, SUSE Labs.
3 * Subject to the GNU Public License, v.2
4 *
5 * Fast user context implementation of clock_gettime and gettimeofday.
6 *
7 * The code should have no internal unresolved relocations.
8 * Check with readelf after changing.
9 * Also alternative() doesn't work.
10 */
11
12#include <linux/kernel.h>
13#include <linux/posix-timers.h>
14#include <linux/time.h>
15#include <linux/string.h>
16#include <asm/vsyscall.h>
17#include <asm/vgtod.h>
18#include <asm/timex.h>
19#include <asm/hpet.h>
20#include <asm/unistd.h>
21#include <asm/io.h>
22#include <asm/vgtod.h>
23#include "vextern.h"
24
25#define gtod vdso_vsyscall_gtod_data
26
27static long vdso_fallback_gettime(long clock, struct timespec *ts)
28{
29 long ret;
30 asm("syscall" : "=a" (ret) :
31 "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
32 return ret;
33}
34
35static inline long vgetns(void)
36{
37 long v;
38 cycles_t (*vread)(void);
39 vread = gtod->clock.vread;
40 v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask;
41 return (v * gtod->clock.mult) >> gtod->clock.shift;
42}
43
44static noinline int do_realtime(struct timespec *ts)
45{
46 unsigned long seq, ns;
47 do {
48 seq = read_seqbegin(&gtod->lock);
49 ts->tv_sec = gtod->wall_time_sec;
50 ts->tv_nsec = gtod->wall_time_nsec;
51 ns = vgetns();
52 } while (unlikely(read_seqretry(&gtod->lock, seq)));
53 timespec_add_ns(ts, ns);
54 return 0;
55}
56
57/* Copy of the version in kernel/time.c which we cannot directly access */
58static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
59{
60 while (nsec >= NSEC_PER_SEC) {
61 nsec -= NSEC_PER_SEC;
62 ++sec;
63 }
64 while (nsec < 0) {
65 nsec += NSEC_PER_SEC;
66 --sec;
67 }
68 ts->tv_sec = sec;
69 ts->tv_nsec = nsec;
70}
71
72static noinline int do_monotonic(struct timespec *ts)
73{
74 unsigned long seq, ns, secs;
75 do {
76 seq = read_seqbegin(&gtod->lock);
77 secs = gtod->wall_time_sec;
78 ns = gtod->wall_time_nsec + vgetns();
79 secs += gtod->wall_to_monotonic.tv_sec;
80 ns += gtod->wall_to_monotonic.tv_nsec;
81 } while (unlikely(read_seqretry(&gtod->lock, seq)));
82 vset_normalized_timespec(ts, secs, ns);
83 return 0;
84}
85
86int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
87{
88 if (likely(gtod->sysctl_enabled && gtod->clock.vread))
89 switch (clock) {
90 case CLOCK_REALTIME:
91 return do_realtime(ts);
92 case CLOCK_MONOTONIC:
93 return do_monotonic(ts);
94 }
95 return vdso_fallback_gettime(clock, ts);
96}
97int clock_gettime(clockid_t, struct timespec *)
98 __attribute__((weak, alias("__vdso_clock_gettime")));
99
100int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
101{
102 long ret;
103 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
104 BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
105 offsetof(struct timespec, tv_nsec) ||
106 sizeof(*tv) != sizeof(struct timespec));
107 do_realtime((struct timespec *)tv);
108 tv->tv_usec /= 1000;
109 if (unlikely(tz != NULL)) {
110 /* This relies on gcc inlining the memcpy. We'll notice
111 if it ever fails to do so. */
112 memcpy(tz, &gtod->sys_tz, sizeof(struct timezone));
113 }
114 return 0;
115 }
116 asm("syscall" : "=a" (ret) :
117 "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
118 return ret;
119}
120int gettimeofday(struct timeval *, struct timezone *)
121 __attribute__((weak, alias("__vdso_gettimeofday")));
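
Because clock_gettime and gettimeofday are exported from the vDSO as weak aliases (the version script in vdso.lds.S later in this patch makes them visible), a program whose C library resolves these calls through the vDSO gets the do_realtime()/do_monotonic() fast path with no syscall at all. A minimal caller, assuming an x86-64 toolchain that routes the call through the vDSO (older glibc may also need -lrt):

#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        /* Nothing vDSO-specific is needed here; the fast path is used transparently when available. */
        if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
                printf("monotonic: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}
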
diff --git a/arch/x86/vdso/vdso-note.S b/arch/x86/vdso/vdso-note.S
new file mode 100644
index 000000000000..79a071e4357e
--- /dev/null
+++ b/arch/x86/vdso/vdso-note.S
@@ -0,0 +1,12 @@
1/*
2 * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
3 * Here we can supply some information useful to userland.
4 */
5
6#include <linux/uts.h>
7#include <linux/version.h>
8#include <linux/elfnote.h>
9
10ELFNOTE_START(Linux, 0, "a")
11 .long LINUX_VERSION_CODE
12ELFNOTE_END
diff --git a/arch/x86/vdso/vdso-start.S b/arch/x86/vdso/vdso-start.S
new file mode 100644
index 000000000000..2dc2cdb84d67
--- /dev/null
+++ b/arch/x86/vdso/vdso-start.S
@@ -0,0 +1,2 @@
1 .globl vdso_kernel_start
2vdso_kernel_start:
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
new file mode 100644
index 000000000000..4b1620a1529e
--- /dev/null
+++ b/arch/x86/vdso/vdso.S
@@ -0,0 +1,2 @@
1 .section ".vdso","a"
2 .incbin "arch/x86/vdso/vdso.so"
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
new file mode 100644
index 000000000000..b9a60e665d08
--- /dev/null
+++ b/arch/x86/vdso/vdso.lds.S
@@ -0,0 +1,77 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address, and with only one read-only
4 * segment (that fits in one page). This script controls its layout.
5 */
6#include <asm/asm-offsets.h>
7#include "voffset.h"
8
9#define VDSO_PRELINK 0xffffffffff700000
10
11SECTIONS
12{
13 . = VDSO_PRELINK + SIZEOF_HEADERS;
14
15 .hash : { *(.hash) } :text
16 .gnu.hash : { *(.gnu.hash) }
17 .dynsym : { *(.dynsym) }
18 .dynstr : { *(.dynstr) }
19 .gnu.version : { *(.gnu.version) }
20 .gnu.version_d : { *(.gnu.version_d) }
21 .gnu.version_r : { *(.gnu.version_r) }
22
23 /* This linker script is used both with -r and with -shared.
24 For the layouts to match, we need to skip more than enough
25 space for the dynamic symbol table et al. If this amount
26 is insufficient, ld -shared will barf. Just increase it here. */
27 . = VDSO_PRELINK + VDSO_TEXT_OFFSET;
28
29 .text : { *(.text) } :text
30 .text.ptr : { *(.text.ptr) } :text
31 . = VDSO_PRELINK + 0x900;
32 .data : { *(.data) } :text
33 .bss : { *(.bss) } :text
34
35 .altinstructions : { *(.altinstructions) } :text
36 .altinstr_replacement : { *(.altinstr_replacement) } :text
37
38 .note : { *(.note.*) } :text :note
39 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
40 .eh_frame : { KEEP (*(.eh_frame)) } :text
41 .dynamic : { *(.dynamic) } :text :dynamic
42 .useless : {
43 *(.got.plt) *(.got)
44 *(.gnu.linkonce.d.*)
45 *(.dynbss)
46 *(.gnu.linkonce.b.*)
47 } :text
48}
49
50/*
51 * We must supply the ELF program headers explicitly to get just one
52 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
53 */
54PHDRS
55{
56 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
57 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
58 note PT_NOTE FLAGS(4); /* PF_R */
59 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
60}
61
62/*
63 * This controls what symbols we export from the DSO.
64 */
65VERSION
66{
67 LINUX_2.6 {
68 global:
69 clock_gettime;
70 __vdso_clock_gettime;
71 gettimeofday;
72 __vdso_gettimeofday;
73 getcpu;
74 __vdso_getcpu;
75 local: *;
76 };
77}
diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h
new file mode 100644
index 000000000000..1683ba2ae3e8
--- /dev/null
+++ b/arch/x86/vdso/vextern.h
@@ -0,0 +1,16 @@
1#ifndef VEXTERN
2#include <asm/vsyscall.h>
3#define VEXTERN(x) \
4 extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
5#endif
6
7#define VMAGIC 0xfeedbabeabcdefabUL
8
9/* Any kernel variables used in the vDSO must be exported in the main
10 kernel's vmlinux.lds.S/vsyscall.h/proper __section, added to
11 vextern.h, and referenced as a pointer with the vdso prefix.
12 The main kernel later fills in the values. */
13
14VEXTERN(jiffies)
15VEXTERN(vgetcpu_mode)
16VEXTERN(vsyscall_gtod_data)
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
new file mode 100644
index 000000000000..91f6e85d0fc2
--- /dev/null
+++ b/arch/x86/vdso/vgetcpu.c
@@ -0,0 +1,50 @@
1/*
2 * Copyright 2006 Andi Kleen, SUSE Labs.
3 * Subject to the GNU Public License, v.2
4 *
5 * Fast user context implementation of getcpu()
6 */
7
8#include <linux/kernel.h>
9#include <linux/getcpu.h>
10#include <linux/jiffies.h>
11#include <linux/time.h>
12#include <asm/vsyscall.h>
13#include <asm/vgtod.h>
14#include "vextern.h"
15
16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
17{
18 unsigned int dummy, p;
19 unsigned long j = 0;
20
21 /* Fast cache - only recompute the value once per jiffy and avoid
22 relatively costly rdtscp/cpuid otherwise.
23 This works because the scheduler usually keeps the process
24 on the same CPU and this syscall doesn't guarantee its
25 results anyway.
26 We do this here because otherwise user space would do it on
27 its own in a likely inferior way (no access to jiffies).
28 If you don't like it, pass NULL. */
29 if (tcache && tcache->blob[0] == (j = *vdso_jiffies)) {
30 p = tcache->blob[1];
31 } else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
32 /* Load per CPU data from RDTSCP */
33 rdtscp(dummy, dummy, p);
34 } else {
35 /* Load per CPU data from GDT */
36 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
37 }
38 if (tcache) {
39 tcache->blob[0] = j;
40 tcache->blob[1] = p;
41 }
42 if (cpu)
43 *cpu = p & 0xfff;
44 if (node)
45 *node = p >> 12;
46 return 0;
47}
48
49long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
50 __attribute__((weak, alias("__vdso_getcpu")));
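
The per-CPU value obtained from RDTSCP or from the lsl-based GDT probe packs the CPU number into the low 12 bits and the node number into the bits above, which is why __vdso_getcpu() masks with 0xfff and shifts by 12. A tiny sketch of that packing with made-up CPU and node numbers (illustration only, not part of the patch):

#include <stdio.h>

/* Same layout __vdso_getcpu() decodes: cpu in bits 11:0, node in the bits above. */
static unsigned int pack_cpu_node(unsigned int cpu, unsigned int node)
{
        return (node << 12) | (cpu & 0xfff);
}

int main(void)
{
        unsigned int p = pack_cpu_node(5, 1);
        printf("cpu=%u node=%u\n", p & 0xfff, p >> 12);   /* prints cpu=5 node=1 */
        return 0;
}
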
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
new file mode 100644
index 000000000000..ff9333e5fb08
--- /dev/null
+++ b/arch/x86/vdso/vma.c
@@ -0,0 +1,140 @@
1/*
2 * Set up the VMAs to tell the VM about the vDSO.
3 * Copyright 2007 Andi Kleen, SUSE Labs.
4 * Subject to the GPL, v.2
5 */
6#include <linux/mm.h>
7#include <linux/err.h>
8#include <linux/sched.h>
9#include <linux/init.h>
10#include <linux/random.h>
11#include <asm/vsyscall.h>
12#include <asm/vgtod.h>
13#include <asm/proto.h>
14#include "voffset.h"
15
16int vdso_enabled = 1;
17
18#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x;
19#include "vextern.h"
20#undef VEXTERN
21
22extern char vdso_kernel_start[], vdso_start[], vdso_end[];
23extern unsigned short vdso_sync_cpuid;
24
25struct page **vdso_pages;
26
27static inline void *var_ref(void *vbase, char *var, char *name)
28{
29 unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET;
30 void *p = vbase + offset;
31 if (*(void **)p != (void *)VMAGIC) {
32 printk("VDSO: variable %s broken\n", name);
33 vdso_enabled = 0;
34 }
35 return p;
36}
37
38static int __init init_vdso_vars(void)
39{
40 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
41 int i;
42 char *vbase;
43
44 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
45 if (!vdso_pages)
46 goto oom;
47 for (i = 0; i < npages; i++) {
48 struct page *p;
49 p = alloc_page(GFP_KERNEL);
50 if (!p)
51 goto oom;
52 vdso_pages[i] = p;
53 copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
54 }
55
56 vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
57 if (!vbase)
58 goto oom;
59
60 if (memcmp(vbase, "\177ELF", 4)) {
61 printk("VDSO: I'm broken; not ELF\n");
62 vdso_enabled = 0;
63 }
64
65#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x)
66#define VEXTERN(x) \
67 V(vdso_ ## x) = &__ ## x;
68#include "vextern.h"
69#undef VEXTERN
70 return 0;
71
72 oom:
73 printk("Cannot allocate vdso\n");
74 vdso_enabled = 0;
75 return -ENOMEM;
76}
77__initcall(init_vdso_vars);
78
79struct linux_binprm;
80
81/* Put the vdso above the (randomized) stack with another randomized offset.
82 This way there is no hole in the middle of the address space.
83 To save memory, make sure it is still in the same PTE as the stack top.
84 This doesn't give that many random bits. */
85static unsigned long vdso_addr(unsigned long start, unsigned len)
86{
87 unsigned long addr, end;
88 unsigned offset;
89 end = (start + PMD_SIZE - 1) & PMD_MASK;
90 if (end >= TASK_SIZE64)
91 end = TASK_SIZE64;
92 end -= len;
93 /* This loses some more bits than a modulo, but is cheaper */
94 offset = get_random_int() & (PTRS_PER_PTE - 1);
95 addr = start + (offset << PAGE_SHIFT);
96 if (addr >= end)
97 addr = end;
98 return addr;
99}
100
101/* Set up a VMA at program startup for the vsyscall page.
102   Not called for compat tasks. */
103int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
104{
105 struct mm_struct *mm = current->mm;
106 unsigned long addr;
107 int ret;
108 unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
109
110 if (!vdso_enabled)
111 return 0;
112
113 down_write(&mm->mmap_sem);
114 addr = vdso_addr(mm->start_stack, len);
115 addr = get_unmapped_area(NULL, addr, len, 0, 0);
116 if (IS_ERR_VALUE(addr)) {
117 ret = addr;
118 goto up_fail;
119 }
120
121 ret = install_special_mapping(mm, addr, len,
122 VM_READ|VM_EXEC|
123 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
124 VM_ALWAYSDUMP,
125 vdso_pages);
126 if (ret)
127 goto up_fail;
128
129 current->mm->context.vdso = (void *)addr;
130up_fail:
131 up_write(&mm->mmap_sem);
132 return ret;
133}
134
135static __init int vdso_setup(char *s)
136{
137 vdso_enabled = simple_strtoul(s, NULL, 0);
138 return 0;
139}
140__setup("vdso=", vdso_setup);
diff --git a/arch/x86/vdso/voffset.h b/arch/x86/vdso/voffset.h
new file mode 100644
index 000000000000..4af67c79085f
--- /dev/null
+++ b/arch/x86/vdso/voffset.h
@@ -0,0 +1 @@
#define VDSO_TEXT_OFFSET 0x600
diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c
new file mode 100644
index 000000000000..6fc22219a472
--- /dev/null
+++ b/arch/x86/vdso/vvar.c
@@ -0,0 +1,12 @@
1/* Define pointers to external vDSO variables.
2   These are part of the vDSO. The kernel fills in the real addresses
3   at boot time. This is done because, when the vDSO is linked, the
4   kernel isn't linked yet and the final addresses are not known. */
5#include <linux/kernel.h>
6#include <linux/time.h>
7#include <asm/vsyscall.h>
8#include <asm/timex.h>
9#include <asm/vgtod.h>
10
11#define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC;
12#include "vextern.h"
diff --git a/arch/x86/video/Makefile b/arch/x86/video/Makefile
new file mode 100644
index 000000000000..2c447c94adcc
--- /dev/null
+++ b/arch/x86/video/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_FB) += fbdev.o
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
new file mode 100644
index 000000000000..48fb38d7d2c0
--- /dev/null
+++ b/arch/x86/video/fbdev.c
@@ -0,0 +1,32 @@
1/*
2 * arch/i386/video/fbdev.c - i386 Framebuffer
3 *
4 * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com>
5 *
6 * This file is subject to the terms and conditions of the GNU General Public
7 * License. See the file COPYING in the main directory of this archive
8 * for more details.
9 *
10 */
11#include <linux/fb.h>
12#include <linux/pci.h>
13
14int fb_is_primary_device(struct fb_info *info)
15{
16 struct device *device = info->device;
17 struct pci_dev *pci_dev = NULL;
18 struct resource *res = NULL;
19 int retval = 0;
20
21 if (device)
22 pci_dev = to_pci_dev(device);
23
24 if (pci_dev)
25 res = &pci_dev->resource[PCI_ROM_RESOURCE];
26
27 if (res && res->flags & IORESOURCE_ROM_SHADOW)
28 retval = 1;
29
30 return retval;
31}
32EXPORT_SYMBOL(fb_is_primary_device);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
new file mode 100644
index 000000000000..9df99e1885a4
--- /dev/null
+++ b/arch/x86/xen/Kconfig
@@ -0,0 +1,11 @@
1#
2# This Kconfig describes xen options
3#
4
5config XEN
6 bool "Enable support for Xen hypervisor"
7 depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
8 help
9 This is the Linux Xen port. Enabling this will allow the
10 kernel to boot in a paravirtualized environment under the
11 Xen hypervisor.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
new file mode 100644
index 000000000000..343df246bd3e
--- /dev/null
+++ b/arch/x86/xen/Makefile
@@ -0,0 +1,4 @@
1obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
2 events.o time.o manage.o xen-asm.o
3
4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
new file mode 100644
index 000000000000..f01bfcd4bdee
--- /dev/null
+++ b/arch/x86/xen/enlighten.c
@@ -0,0 +1,1146 @@
1/*
2 * Core of Xen paravirt_ops implementation.
3 *
4 * This file contains the xen_paravirt_ops structure itself, and the
5 * implementations for:
6 * - privileged instructions
7 * - interrupt flags
8 * - segment operations
9 * - booting and setup
10 *
11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12 */
13
14#include <linux/kernel.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17#include <linux/preempt.h>
18#include <linux/hardirq.h>
19#include <linux/percpu.h>
20#include <linux/delay.h>
21#include <linux/start_kernel.h>
22#include <linux/sched.h>
23#include <linux/bootmem.h>
24#include <linux/module.h>
25#include <linux/mm.h>
26#include <linux/page-flags.h>
27#include <linux/highmem.h>
28#include <linux/smp.h>
29
30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h>
35#include <xen/page.h>
36
37#include <asm/paravirt.h>
38#include <asm/page.h>
39#include <asm/xen/hypercall.h>
40#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h>
42#include <asm/processor.h>
43#include <asm/setup.h>
44#include <asm/desc.h>
45#include <asm/pgtable.h>
46#include <asm/tlbflush.h>
47#include <asm/reboot.h>
48
49#include "xen-ops.h"
50#include "mmu.h"
51#include "multicalls.h"
52
53EXPORT_SYMBOL_GPL(hypercall_page);
54
55DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
56
57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59DEFINE_PER_CPU(unsigned long, xen_cr3);
60
61struct start_info *xen_start_info;
62EXPORT_SYMBOL_GPL(xen_start_info);
63
64static /* __initdata */ struct shared_info dummy_shared_info;
65
66/*
67 * Point at some empty memory to start with. We map the real shared_info
68 * page as soon as fixmap is up and running.
69 */
70struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
71
72/*
73 * Flag to determine whether vcpu info placement is available on all
74 * VCPUs. We assume it is to start with, and then set it to zero on
75 * the first failure. This is because it can succeed on some VCPUs
76 * and not others, since it can involve hypervisor memory allocation,
77 * or because the guest failed to guarantee all the appropriate
78 * constraints on all VCPUs (i.e. the buffer can't cross a page boundary).
79 *
80 * Note that any particular CPU may be using a placed vcpu structure,
81 * but we can only optimise if they all are.
82 *
83 * 0: not available, 1: available
84 */
85static int have_vcpu_info_placement = 1;
86
87static void __init xen_vcpu_setup(int cpu)
88{
89 struct vcpu_register_vcpu_info info;
90 int err;
91 struct vcpu_info *vcpup;
92
93 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
94
95 if (!have_vcpu_info_placement)
96 return; /* already tested, not available */
97
98 vcpup = &per_cpu(xen_vcpu_info, cpu);
99
100 info.mfn = virt_to_mfn(vcpup);
101 info.offset = offset_in_page(vcpup);
102
103 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
104 cpu, vcpup, info.mfn, info.offset);
105
106 /* Check to see if the hypervisor will put the vcpu_info
107 structure where we want it, which allows direct access via
108 a percpu-variable. */
109 err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
110
111 if (err) {
112 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
113 have_vcpu_info_placement = 0;
114 } else {
115 /* This cpu is using the registered vcpu info, even if
116 later ones fail to. */
117 per_cpu(xen_vcpu, cpu) = vcpup;
118
119 printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
120 cpu, vcpup);
121 }
122}
123
124static void __init xen_banner(void)
125{
126 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
127 paravirt_ops.name);
128 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
129}
130
131static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
132 unsigned int *ecx, unsigned int *edx)
133{
134 unsigned maskedx = ~0;
135
136 /*
137 * Mask out inconvenient features, to try and disable as many
138 * unsupported kernel subsystems as possible.
139 */
140 if (*eax == 1)
141 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
142 (1 << X86_FEATURE_ACPI) | /* disable ACPI */
143 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
144
145 asm(XEN_EMULATE_PREFIX "cpuid"
146 : "=a" (*eax),
147 "=b" (*ebx),
148 "=c" (*ecx),
149 "=d" (*edx)
150 : "0" (*eax), "2" (*ecx));
151 *edx &= maskedx;
152}
153
154static void xen_set_debugreg(int reg, unsigned long val)
155{
156 HYPERVISOR_set_debugreg(reg, val);
157}
158
159static unsigned long xen_get_debugreg(int reg)
160{
161 return HYPERVISOR_get_debugreg(reg);
162}
163
164static unsigned long xen_save_fl(void)
165{
166 struct vcpu_info *vcpu;
167 unsigned long flags;
168
169 vcpu = x86_read_percpu(xen_vcpu);
170
171 /* flag has opposite sense of mask */
172 flags = !vcpu->evtchn_upcall_mask;
173
174 /* convert to IF type flag
175 -0 -> 0x00000000
176 -1 -> 0xffffffff
177 */
178 return (-flags) & X86_EFLAGS_IF;
179}
180
181static void xen_restore_fl(unsigned long flags)
182{
183 struct vcpu_info *vcpu;
184
185 /* convert from IF type flag */
186 flags = !(flags & X86_EFLAGS_IF);
187
188 /* There's a one instruction preempt window here. We need to
189	   make sure we don't switch CPUs between getting the vcpu
190 pointer and updating the mask. */
191 preempt_disable();
192 vcpu = x86_read_percpu(xen_vcpu);
193 vcpu->evtchn_upcall_mask = flags;
194 preempt_enable_no_resched();
195
196 /* Doesn't matter if we get preempted here, because any
197 pending event will get dealt with anyway. */
198
199 if (flags == 0) {
200 preempt_check_resched();
201 barrier(); /* unmask then check (avoid races) */
202 if (unlikely(vcpu->evtchn_upcall_pending))
203 force_evtchn_callback();
204 }
205}
206
207static void xen_irq_disable(void)
208{
209 /* There's a one instruction preempt window here. We need to
210	   make sure we don't switch CPUs between getting the vcpu
211 pointer and updating the mask. */
212 preempt_disable();
213 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
214 preempt_enable_no_resched();
215}
216
217static void xen_irq_enable(void)
218{
219 struct vcpu_info *vcpu;
220
221 /* There's a one instruction preempt window here. We need to
222	   make sure we don't switch CPUs between getting the vcpu
223 pointer and updating the mask. */
224 preempt_disable();
225 vcpu = x86_read_percpu(xen_vcpu);
226 vcpu->evtchn_upcall_mask = 0;
227 preempt_enable_no_resched();
228
229 /* Doesn't matter if we get preempted here, because any
230 pending event will get dealt with anyway. */
231
232 barrier(); /* unmask then check (avoid races) */
233 if (unlikely(vcpu->evtchn_upcall_pending))
234 force_evtchn_callback();
235}
236
237static void xen_safe_halt(void)
238{
239 /* Blocking includes an implicit local_irq_enable(). */
240 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
241 BUG();
242}
243
244static void xen_halt(void)
245{
246 if (irqs_disabled())
247 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
248 else
249 xen_safe_halt();
250}
251
252static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
253{
254 BUG_ON(preemptible());
255
256 switch (mode) {
257 case PARAVIRT_LAZY_NONE:
258 BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
259 break;
260
261 case PARAVIRT_LAZY_MMU:
262 case PARAVIRT_LAZY_CPU:
263 BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
264 break;
265
266 case PARAVIRT_LAZY_FLUSH:
267 /* flush if necessary, but don't change state */
268 if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
269 xen_mc_flush();
270 return;
271 }
272
273 xen_mc_flush();
274 x86_write_percpu(xen_lazy_mode, mode);
275}
276
277static unsigned long xen_store_tr(void)
278{
279 return 0;
280}
281
282static void xen_set_ldt(const void *addr, unsigned entries)
283{
284 unsigned long linear_addr = (unsigned long)addr;
285 struct mmuext_op *op;
286 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
287
288 op = mcs.args;
289 op->cmd = MMUEXT_SET_LDT;
290 if (linear_addr) {
291		/* ldt may be vmalloced, use arbitrary_virt_to_machine */
292 xmaddr_t maddr;
293 maddr = arbitrary_virt_to_machine((unsigned long)addr);
294 linear_addr = (unsigned long)maddr.maddr;
295 }
296 op->arg1.linear_addr = linear_addr;
297 op->arg2.nr_ents = entries;
298
299 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
300
301 xen_mc_issue(PARAVIRT_LAZY_CPU);
302}
303
304static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
305{
306 unsigned long *frames;
307 unsigned long va = dtr->address;
308 unsigned int size = dtr->size + 1;
309 unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
310 int f;
311 struct multicall_space mcs;
312
313 /* A GDT can be up to 64k in size, which corresponds to 8192
314	   8-byte entries, or 16 4k pages. */
315
316 BUG_ON(size > 65536);
317 BUG_ON(va & ~PAGE_MASK);
318
319 mcs = xen_mc_entry(sizeof(*frames) * pages);
320 frames = mcs.args;
321
322 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
323 frames[f] = virt_to_mfn(va);
324 make_lowmem_page_readonly((void *)va);
325 }
326
327 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
328
329 xen_mc_issue(PARAVIRT_LAZY_CPU);
330}
331
332static void load_TLS_descriptor(struct thread_struct *t,
333 unsigned int cpu, unsigned int i)
334{
335 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
336 xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
337 struct multicall_space mc = __xen_mc_entry(0);
338
339 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
340}
341
342static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
343{
344 xen_mc_batch();
345
346 load_TLS_descriptor(t, cpu, 0);
347 load_TLS_descriptor(t, cpu, 1);
348 load_TLS_descriptor(t, cpu, 2);
349
350 xen_mc_issue(PARAVIRT_LAZY_CPU);
351
352 /*
353 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
354 * it means we're in a context switch, and %gs has just been
355 * saved. This means we can zero it out to prevent faults on
356 * exit from the hypervisor if the next process has no %gs.
357 * Either way, it has been saved, and the new value will get
358 * loaded properly. This will go away as soon as Xen has been
359 * modified to not save/restore %gs for normal hypercalls.
360 */
361 if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
362 loadsegment(gs, 0);
363}
364
365static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
366 u32 low, u32 high)
367{
368 unsigned long lp = (unsigned long)&dt[entrynum];
369 xmaddr_t mach_lp = virt_to_machine(lp);
370 u64 entry = (u64)high << 32 | low;
371
372 preempt_disable();
373
374 xen_mc_flush();
375 if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
376 BUG();
377
378 preempt_enable();
379}
380
381static int cvt_gate_to_trap(int vector, u32 low, u32 high,
382 struct trap_info *info)
383{
384 u8 type, dpl;
385
386 type = (high >> 8) & 0x1f;
387 dpl = (high >> 13) & 3;
388
389 if (type != 0xf && type != 0xe)
390 return 0;
391
392 info->vector = vector;
393 info->address = (high & 0xffff0000) | (low & 0x0000ffff);
394 info->cs = low >> 16;
395 info->flags = dpl;
396 /* interrupt gates clear IF */
397 if (type == 0xe)
398 info->flags |= 4;
399
400 return 1;
401}
402
403/* Locations of each CPU's IDT */
404static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
405
406/* Set an IDT entry. If the entry is part of the current IDT, then
407 also update Xen. */
408static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
409 u32 low, u32 high)
410{
411 unsigned long p = (unsigned long)&dt[entrynum];
412 unsigned long start, end;
413
414 preempt_disable();
415
416 start = __get_cpu_var(idt_desc).address;
417 end = start + __get_cpu_var(idt_desc).size + 1;
418
419 xen_mc_flush();
420
421 write_dt_entry(dt, entrynum, low, high);
422
423 if (p >= start && (p + 8) <= end) {
424 struct trap_info info[2];
425
426 info[1].address = 0;
427
428 if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
429 if (HYPERVISOR_set_trap_table(info))
430 BUG();
431 }
432
433 preempt_enable();
434}
435
436static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
437 struct trap_info *traps)
438{
439 unsigned in, out, count;
440
441 count = (desc->size+1) / 8;
442 BUG_ON(count > 256);
443
444 for (in = out = 0; in < count; in++) {
445 const u32 *entry = (u32 *)(desc->address + in * 8);
446
447 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
448 out++;
449 }
450 traps[out].address = 0;
451}
452
453void xen_copy_trap_info(struct trap_info *traps)
454{
455 const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
456
457 xen_convert_trap_info(desc, traps);
458}
459
460/* Load a new IDT into Xen. In principle this can be per-CPU, so we
461 hold a spinlock to protect the static traps[] array (static because
462 it avoids allocation, and saves stack space). */
463static void xen_load_idt(const struct Xgt_desc_struct *desc)
464{
465 static DEFINE_SPINLOCK(lock);
466 static struct trap_info traps[257];
467
468 spin_lock(&lock);
469
470 __get_cpu_var(idt_desc) = *desc;
471
472 xen_convert_trap_info(desc, traps);
473
474 xen_mc_flush();
475 if (HYPERVISOR_set_trap_table(traps))
476 BUG();
477
478 spin_unlock(&lock);
479}
480
481/* Write a GDT descriptor entry. Ignore LDT descriptors, since
482 they're handled differently. */
483static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
484 u32 low, u32 high)
485{
486 preempt_disable();
487
488 switch ((high >> 8) & 0xff) {
489 case DESCTYPE_LDT:
490 case DESCTYPE_TSS:
491 /* ignore */
492 break;
493
494 default: {
495 xmaddr_t maddr = virt_to_machine(&dt[entry]);
496 u64 desc = (u64)high << 32 | low;
497
498 xen_mc_flush();
499 if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
500 BUG();
501 }
502
503 }
504
505 preempt_enable();
506}
507
508static void xen_load_esp0(struct tss_struct *tss,
509 struct thread_struct *thread)
510{
511 struct multicall_space mcs = xen_mc_entry(0);
512 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
513 xen_mc_issue(PARAVIRT_LAZY_CPU);
514}
515
516static void xen_set_iopl_mask(unsigned mask)
517{
518 struct physdev_set_iopl set_iopl;
519
520 /* Force the change at ring 0. */
521 set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
522 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
523}
524
525static void xen_io_delay(void)
526{
527}
528
529#ifdef CONFIG_X86_LOCAL_APIC
530static unsigned long xen_apic_read(unsigned long reg)
531{
532 return 0;
533}
534
535static void xen_apic_write(unsigned long reg, unsigned long val)
536{
537 /* Warn to see if there's any stray references */
538 WARN_ON(1);
539}
540#endif
541
542static void xen_flush_tlb(void)
543{
544 struct mmuext_op *op;
545 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
546
547 op = mcs.args;
548 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
549 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
550
551 xen_mc_issue(PARAVIRT_LAZY_MMU);
552}
553
554static void xen_flush_tlb_single(unsigned long addr)
555{
556 struct mmuext_op *op;
557 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
558
559 op = mcs.args;
560 op->cmd = MMUEXT_INVLPG_LOCAL;
561 op->arg1.linear_addr = addr & PAGE_MASK;
562 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
563
564 xen_mc_issue(PARAVIRT_LAZY_MMU);
565}
566
567static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
568 unsigned long va)
569{
570 struct {
571 struct mmuext_op op;
572 cpumask_t mask;
573 } *args;
574 cpumask_t cpumask = *cpus;
575 struct multicall_space mcs;
576
577 /*
578 * A couple of (to be removed) sanity checks:
579 *
580 * - current CPU must not be in mask
581 * - mask must exist :)
582 */
583 BUG_ON(cpus_empty(cpumask));
584 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
585 BUG_ON(!mm);
586
587 /* If a CPU which we ran on has gone down, OK. */
588 cpus_and(cpumask, cpumask, cpu_online_map);
589 if (cpus_empty(cpumask))
590 return;
591
592 mcs = xen_mc_entry(sizeof(*args));
593 args = mcs.args;
594 args->mask = cpumask;
595 args->op.arg2.vcpumask = &args->mask;
596
597 if (va == TLB_FLUSH_ALL) {
598 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
599 } else {
600 args->op.cmd = MMUEXT_INVLPG_MULTI;
601 args->op.arg1.linear_addr = va;
602 }
603
604 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
605
606 xen_mc_issue(PARAVIRT_LAZY_MMU);
607}
608
609static void xen_write_cr2(unsigned long cr2)
610{
611 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
612}
613
614static unsigned long xen_read_cr2(void)
615{
616 return x86_read_percpu(xen_vcpu)->arch.cr2;
617}
618
619static unsigned long xen_read_cr2_direct(void)
620{
621 return x86_read_percpu(xen_vcpu_info.arch.cr2);
622}
623
624static void xen_write_cr4(unsigned long cr4)
625{
626 /* Just ignore cr4 changes; Xen doesn't allow us to do
627 anything anyway. */
628}
629
630static unsigned long xen_read_cr3(void)
631{
632 return x86_read_percpu(xen_cr3);
633}
634
635static void xen_write_cr3(unsigned long cr3)
636{
637 BUG_ON(preemptible());
638
639 if (cr3 == x86_read_percpu(xen_cr3)) {
640 /* just a simple tlb flush */
641 xen_flush_tlb();
642 return;
643 }
644
645 x86_write_percpu(xen_cr3, cr3);
646
647
648 {
649 struct mmuext_op *op;
650 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
651 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
652
653 op = mcs.args;
654 op->cmd = MMUEXT_NEW_BASEPTR;
655 op->arg1.mfn = mfn;
656
657 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
658
659 xen_mc_issue(PARAVIRT_LAZY_CPU);
660 }
661}
662
663/* Early in boot, while setting up the initial pagetable, assume
664 everything is pinned. */
665static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
666{
667 BUG_ON(mem_map); /* should only be used early */
668 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
669}
670
671/* This needs to make sure the new pte page is pinned iff it's being
672 attached to a pinned pagetable. */
673static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
674{
675 struct page *page = pfn_to_page(pfn);
676
677 if (PagePinned(virt_to_page(mm->pgd))) {
678 SetPagePinned(page);
679
680 if (!PageHighMem(page))
681 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
682 else
683 /* make sure there are no stray mappings of
684 this page */
685 kmap_flush_unused();
686 }
687}
688
689/* This should never happen until we're OK to use struct page */
690static void xen_release_pt(u32 pfn)
691{
692 struct page *page = pfn_to_page(pfn);
693
694 if (PagePinned(page)) {
695 if (!PageHighMem(page))
696 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
697 }
698}
699
700#ifdef CONFIG_HIGHPTE
701static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
702{
703 pgprot_t prot = PAGE_KERNEL;
704
705 if (PagePinned(page))
706 prot = PAGE_KERNEL_RO;
707
708 if (0 && PageHighMem(page))
709 printk("mapping highpte %lx type %d prot %s\n",
710 page_to_pfn(page), type,
711 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
712
713 return kmap_atomic_prot(page, type, prot);
714}
715#endif
716
717static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
718{
719 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
720 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
721 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
722 pte_val_ma(pte));
723
724 return pte;
725}
726
727/* Init-time set_pte while constructing initial pagetables, which
728 doesn't allow RO pagetable pages to be remapped RW */
729static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
730{
731 pte = mask_rw_pte(ptep, pte);
732
733 xen_set_pte(ptep, pte);
734}
735
736static __init void xen_pagetable_setup_start(pgd_t *base)
737{
738 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
739
740 /* special set_pte for pagetable initialization */
741 paravirt_ops.set_pte = xen_set_pte_init;
742
743 init_mm.pgd = base;
744 /*
745 * copy top-level of Xen-supplied pagetable into place. For
746 * !PAE we can use this as-is, but for PAE it is a stand-in
747 * while we copy the pmd pages.
748 */
749 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
750
751 if (PTRS_PER_PMD > 1) {
752 int i;
753 /*
754 * For PAE, need to allocate new pmds, rather than
755 * share Xen's, since Xen doesn't like pmd's being
756 * shared between address spaces.
757 */
758 for (i = 0; i < PTRS_PER_PGD; i++) {
759 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
760 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
761
762 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
763 PAGE_SIZE);
764
765 make_lowmem_page_readonly(pmd);
766
767 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
768 } else
769 pgd_clear(&base[i]);
770 }
771 }
772
773 /* make sure zero_page is mapped RO so we can use it in pagetables */
774 make_lowmem_page_readonly(empty_zero_page);
775 make_lowmem_page_readonly(base);
776 /*
777 * Switch to new pagetable. This is done before
778 * pagetable_init has done anything so that the new pages
779 * added to the table can be prepared properly for Xen.
780 */
781 xen_write_cr3(__pa(base));
782}
783
784static __init void xen_pagetable_setup_done(pgd_t *base)
785{
786 /* This will work as long as patching hasn't happened yet
787 (which it hasn't) */
788 paravirt_ops.alloc_pt = xen_alloc_pt;
789 paravirt_ops.set_pte = xen_set_pte;
790
791 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
792 /*
793 * Create a mapping for the shared info page.
794 * Should be set_fixmap(), but shared_info is a machine
795 * address with no corresponding pseudo-phys address.
796 */
797 set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
798 PFN_DOWN(xen_start_info->shared_info),
799 PAGE_KERNEL);
800
801 HYPERVISOR_shared_info =
802 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
803
804 } else
805 HYPERVISOR_shared_info =
806 (struct shared_info *)__va(xen_start_info->shared_info);
807
808 /* Actually pin the pagetable down, but we can't set PG_pinned
809 yet because the page structures don't exist yet. */
810 {
811 struct mmuext_op op;
812#ifdef CONFIG_X86_PAE
813 op.cmd = MMUEXT_PIN_L3_TABLE;
814#else
815		op.cmd = MMUEXT_PIN_L2_TABLE;	/* non-PAE: two-level pagetable */
816#endif
817 op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
818 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
819 BUG();
820 }
821}
822
823/* This is called once we have the cpu_possible_map */
824void __init xen_setup_vcpu_info_placement(void)
825{
826 int cpu;
827
828 for_each_possible_cpu(cpu)
829 xen_vcpu_setup(cpu);
830
831 /* xen_vcpu_setup managed to place the vcpu_info within the
832 percpu area for all cpus, so make use of it */
833 if (have_vcpu_info_placement) {
834 printk(KERN_INFO "Xen: using vcpu_info placement\n");
835
836 paravirt_ops.save_fl = xen_save_fl_direct;
837 paravirt_ops.restore_fl = xen_restore_fl_direct;
838 paravirt_ops.irq_disable = xen_irq_disable_direct;
839 paravirt_ops.irq_enable = xen_irq_enable_direct;
840 paravirt_ops.read_cr2 = xen_read_cr2_direct;
841 paravirt_ops.iret = xen_iret_direct;
842 }
843}
844
845static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
846 unsigned long addr, unsigned len)
847{
848 char *start, *end, *reloc;
849 unsigned ret;
850
851 start = end = reloc = NULL;
852
853#define SITE(x) \
854 case PARAVIRT_PATCH(x): \
855 if (have_vcpu_info_placement) { \
856 start = (char *)xen_##x##_direct; \
857 end = xen_##x##_direct_end; \
858 reloc = xen_##x##_direct_reloc; \
859 } \
860 goto patch_site
861
862 switch (type) {
863 SITE(irq_enable);
864 SITE(irq_disable);
865 SITE(save_fl);
866 SITE(restore_fl);
867#undef SITE
868
869 patch_site:
870 if (start == NULL || (end-start) > len)
871 goto default_patch;
872
873 ret = paravirt_patch_insns(insnbuf, len, start, end);
874
875 /* Note: because reloc is assigned from something that
876 appears to be an array, gcc assumes it's non-null,
877 but doesn't know its relationship with start and
878 end. */
879 if (reloc > start && reloc < end) {
880 int reloc_off = reloc - start;
881 long *relocp = (long *)(insnbuf + reloc_off);
882 long delta = start - (char *)addr;
883
884 *relocp += delta;
885 }
886 break;
887
888 default_patch:
889 default:
890 ret = paravirt_patch_default(type, clobbers, insnbuf,
891 addr, len);
892 break;
893 }
894
895 return ret;
896}
897
898static const struct paravirt_ops xen_paravirt_ops __initdata = {
899 .paravirt_enabled = 1,
900 .shared_kernel_pmd = 0,
901
902 .name = "Xen",
903 .banner = xen_banner,
904
905 .patch = xen_patch,
906
907 .memory_setup = xen_memory_setup,
908 .arch_setup = xen_arch_setup,
909 .init_IRQ = xen_init_IRQ,
910 .post_allocator_init = xen_mark_init_mm_pinned,
911
912 .time_init = xen_time_init,
913 .set_wallclock = xen_set_wallclock,
914 .get_wallclock = xen_get_wallclock,
915 .get_cpu_khz = xen_cpu_khz,
916 .sched_clock = xen_sched_clock,
917
918 .cpuid = xen_cpuid,
919
920 .set_debugreg = xen_set_debugreg,
921 .get_debugreg = xen_get_debugreg,
922
923 .clts = native_clts,
924
925 .read_cr0 = native_read_cr0,
926 .write_cr0 = native_write_cr0,
927
928 .read_cr2 = xen_read_cr2,
929 .write_cr2 = xen_write_cr2,
930
931 .read_cr3 = xen_read_cr3,
932 .write_cr3 = xen_write_cr3,
933
934 .read_cr4 = native_read_cr4,
935 .read_cr4_safe = native_read_cr4_safe,
936 .write_cr4 = xen_write_cr4,
937
938 .save_fl = xen_save_fl,
939 .restore_fl = xen_restore_fl,
940 .irq_disable = xen_irq_disable,
941 .irq_enable = xen_irq_enable,
942 .safe_halt = xen_safe_halt,
943 .halt = xen_halt,
944 .wbinvd = native_wbinvd,
945
946 .read_msr = native_read_msr_safe,
947 .write_msr = native_write_msr_safe,
948 .read_tsc = native_read_tsc,
949 .read_pmc = native_read_pmc,
950
951 .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
952 .irq_enable_sysexit = NULL, /* never called */
953
954 .load_tr_desc = paravirt_nop,
955 .set_ldt = xen_set_ldt,
956 .load_gdt = xen_load_gdt,
957 .load_idt = xen_load_idt,
958 .load_tls = xen_load_tls,
959
960 .store_gdt = native_store_gdt,
961 .store_idt = native_store_idt,
962 .store_tr = xen_store_tr,
963
964 .write_ldt_entry = xen_write_ldt_entry,
965 .write_gdt_entry = xen_write_gdt_entry,
966 .write_idt_entry = xen_write_idt_entry,
967 .load_esp0 = xen_load_esp0,
968
969 .set_iopl_mask = xen_set_iopl_mask,
970 .io_delay = xen_io_delay,
971
972#ifdef CONFIG_X86_LOCAL_APIC
973 .apic_write = xen_apic_write,
974 .apic_write_atomic = xen_apic_write,
975 .apic_read = xen_apic_read,
976 .setup_boot_clock = paravirt_nop,
977 .setup_secondary_clock = paravirt_nop,
978 .startup_ipi_hook = paravirt_nop,
979#endif
980
981 .flush_tlb_user = xen_flush_tlb,
982 .flush_tlb_kernel = xen_flush_tlb,
983 .flush_tlb_single = xen_flush_tlb_single,
984 .flush_tlb_others = xen_flush_tlb_others,
985
986 .pte_update = paravirt_nop,
987 .pte_update_defer = paravirt_nop,
988
989 .pagetable_setup_start = xen_pagetable_setup_start,
990 .pagetable_setup_done = xen_pagetable_setup_done,
991
992 .alloc_pt = xen_alloc_pt_init,
993 .release_pt = xen_release_pt,
994 .alloc_pd = paravirt_nop,
995 .alloc_pd_clone = paravirt_nop,
996 .release_pd = paravirt_nop,
997
998#ifdef CONFIG_HIGHPTE
999 .kmap_atomic_pte = xen_kmap_atomic_pte,
1000#endif
1001
1002 .set_pte = NULL, /* see xen_pagetable_setup_* */
1003 .set_pte_at = xen_set_pte_at,
1004 .set_pmd = xen_set_pmd,
1005
1006 .pte_val = xen_pte_val,
1007 .pgd_val = xen_pgd_val,
1008
1009 .make_pte = xen_make_pte,
1010 .make_pgd = xen_make_pgd,
1011
1012#ifdef CONFIG_X86_PAE
1013 .set_pte_atomic = xen_set_pte_atomic,
1014 .set_pte_present = xen_set_pte_at,
1015 .set_pud = xen_set_pud,
1016 .pte_clear = xen_pte_clear,
1017 .pmd_clear = xen_pmd_clear,
1018
1019 .make_pmd = xen_make_pmd,
1020 .pmd_val = xen_pmd_val,
1021#endif /* PAE */
1022
1023 .activate_mm = xen_activate_mm,
1024 .dup_mmap = xen_dup_mmap,
1025 .exit_mmap = xen_exit_mmap,
1026
1027 .set_lazy_mode = xen_set_lazy_mode,
1028};
1029
1030#ifdef CONFIG_SMP
1031static const struct smp_ops xen_smp_ops __initdata = {
1032 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1033 .smp_prepare_cpus = xen_smp_prepare_cpus,
1034 .cpu_up = xen_cpu_up,
1035 .smp_cpus_done = xen_smp_cpus_done,
1036
1037 .smp_send_stop = xen_smp_send_stop,
1038 .smp_send_reschedule = xen_smp_send_reschedule,
1039 .smp_call_function_mask = xen_smp_call_function_mask,
1040};
1041#endif /* CONFIG_SMP */
1042
1043static void xen_reboot(int reason)
1044{
1045#ifdef CONFIG_SMP
1046 smp_send_stop();
1047#endif
1048
1049 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
1050 BUG();
1051}
1052
1053static void xen_restart(char *msg)
1054{
1055 xen_reboot(SHUTDOWN_reboot);
1056}
1057
1058static void xen_emergency_restart(void)
1059{
1060 xen_reboot(SHUTDOWN_reboot);
1061}
1062
1063static void xen_machine_halt(void)
1064{
1065 xen_reboot(SHUTDOWN_poweroff);
1066}
1067
1068static void xen_crash_shutdown(struct pt_regs *regs)
1069{
1070 xen_reboot(SHUTDOWN_crash);
1071}
1072
1073static const struct machine_ops __initdata xen_machine_ops = {
1074 .restart = xen_restart,
1075 .halt = xen_machine_halt,
1076 .power_off = xen_machine_halt,
1077 .shutdown = xen_machine_halt,
1078 .crash_shutdown = xen_crash_shutdown,
1079 .emergency_restart = xen_emergency_restart,
1080};
1081
1082
1083/* First C function to be called on Xen boot */
1084asmlinkage void __init xen_start_kernel(void)
1085{
1086 pgd_t *pgd;
1087
1088 if (!xen_start_info)
1089 return;
1090
1091 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
1092
1093 /* Install Xen paravirt ops */
1094 paravirt_ops = xen_paravirt_ops;
1095 machine_ops = xen_machine_ops;
1096
1097#ifdef CONFIG_SMP
1098 smp_ops = xen_smp_ops;
1099#endif
1100
1101 xen_setup_features();
1102
1103 /* Get mfn list */
1104 if (!xen_feature(XENFEAT_auto_translated_physmap))
1105 phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
1106
1107 pgd = (pgd_t *)xen_start_info->pt_base;
1108
1109 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1110
1111 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1112
1113 /* keep using Xen gdt for now; no urgent need to change it */
1114
1115 x86_write_percpu(xen_cr3, __pa(pgd));
1116
1117#ifdef CONFIG_SMP
1118 /* Don't do the full vcpu_info placement stuff until we have a
1119 possible map. */
1120 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1121#else
1122 /* May as well do it now, since there's no good time to call
1123 it later on UP. */
1124 xen_setup_vcpu_info_placement();
1125#endif
1126
1127 paravirt_ops.kernel_rpl = 1;
1128 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1129 paravirt_ops.kernel_rpl = 0;
1130
1131 /* set the limit of our address space */
1132 reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
1133
1134 /* set up basic CPUID stuff */
1135 cpu_detect(&new_cpu_data);
1136 new_cpu_data.hard_math = 1;
1137 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1138
1139 /* Poke various useful things into boot_params */
1140 LOADER_TYPE = (9 << 4) | 0;
1141 INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
1142 INITRD_SIZE = xen_start_info->mod_len;
1143
1144 /* Start the world */
1145 start_kernel();
1146}
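
The xen_save_fl()/xen_restore_fl() helpers above convert between the vcpu's evtchn_upcall_mask and an eflags-style IF bit with a small arithmetic trick. A standalone sketch of just that conversion, assuming X86_EFLAGS_IF == 0x200 as on x86:

#include <stdio.h>

#define X86_EFLAGS_IF 0x200

static unsigned long mask_to_flags(unsigned char evtchn_upcall_mask)
{
	unsigned long flags = !evtchn_upcall_mask;	/* 1 = interrupts enabled */
	return (-flags) & X86_EFLAGS_IF;		/* 0x200 or 0 */
}

static unsigned char flags_to_mask(unsigned long flags)
{
	return !(flags & X86_EFLAGS_IF);		/* 1 = events masked */
}

int main(void)
{
	printf("%lx %lx\n", mask_to_flags(0), mask_to_flags(1));	/* 200 0 */
	printf("%x %x\n", flags_to_mask(0x200), flags_to_mask(0));	/* 0 1 */
	return 0;
}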
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
new file mode 100644
index 000000000000..da1b173547a1
--- /dev/null
+++ b/arch/x86/xen/events.c
@@ -0,0 +1,591 @@
1/*
2 * Xen event channels
3 *
4 * Xen models interrupts with abstract event channels. Because each
5 * domain gets 1024 event channels, but NR_IRQS is not that large, we
6 * must dynamically map irqs<->event channels. The event channels
7 * interface with the rest of the kernel by defining a xen interrupt
8 * chip. When an event is received, it is mapped to an irq and sent
9 * through the normal interrupt processing path.
10 *
11 * There are four kinds of events which can be mapped to an event
12 * channel:
13 *
14 * 1. Inter-domain notifications. This includes all the virtual
15 * device events, since they're driven by front-ends in another domain
16 * (typically dom0).
17 * 2. VIRQs, typically used for timers. These are per-cpu events.
18 * 3. IPIs.
19 * 4. Hardware interrupts. Not supported at present.
20 *
21 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
22 */
23
24#include <linux/linkage.h>
25#include <linux/interrupt.h>
26#include <linux/irq.h>
27#include <linux/module.h>
28#include <linux/string.h>
29
30#include <asm/ptrace.h>
31#include <asm/irq.h>
32#include <asm/sync_bitops.h>
33#include <asm/xen/hypercall.h>
34#include <asm/xen/hypervisor.h>
35
36#include <xen/events.h>
37#include <xen/interface/xen.h>
38#include <xen/interface/event_channel.h>
39
40#include "xen-ops.h"
41
42/*
43 * This lock protects updates to the following mapping and reference-count
44 * arrays. The lock does not need to be acquired to read the mapping tables.
45 */
46static DEFINE_SPINLOCK(irq_mapping_update_lock);
47
48/* IRQ <-> VIRQ mapping. */
49static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
50
51/* IRQ <-> IPI mapping */
52static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
53
54/* Packed IRQ information: binding type, sub-type index, and event channel. */
55struct packed_irq
56{
57 unsigned short evtchn;
58 unsigned char index;
59 unsigned char type;
60};
61
62static struct packed_irq irq_info[NR_IRQS];
63
64/* Binding types. */
65enum {
66 IRQT_UNBOUND,
67 IRQT_PIRQ,
68 IRQT_VIRQ,
69 IRQT_IPI,
70 IRQT_EVTCHN
71};
72
73/* Convenient shorthand for packed representation of an unbound IRQ. */
74#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
75
76static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
77 [0 ... NR_EVENT_CHANNELS-1] = -1
78};
79static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
80static u8 cpu_evtchn[NR_EVENT_CHANNELS];
81
82/* Reference counts for bindings to IRQs. */
83static int irq_bindcount[NR_IRQS];
84
85/* Xen will never allocate port zero for any purpose. */
86#define VALID_EVTCHN(chn) ((chn) != 0)
87
88/*
89 * Force a proper event-channel callback from Xen after clearing the
90 * callback mask. We do this in a very simple manner, by making a call
91 * down into Xen. The pending flag will be checked by Xen on return.
92 */
93void force_evtchn_callback(void)
94{
95 (void)HYPERVISOR_xen_version(0, NULL);
96}
97EXPORT_SYMBOL_GPL(force_evtchn_callback);
98
99static struct irq_chip xen_dynamic_chip;
100
101/* Constructor for packed IRQ information. */
102static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
103{
104 return (struct packed_irq) { evtchn, index, type };
105}
106
107/*
108 * Accessors for packed IRQ information.
109 */
110static inline unsigned int evtchn_from_irq(int irq)
111{
112 return irq_info[irq].evtchn;
113}
114
115static inline unsigned int index_from_irq(int irq)
116{
117 return irq_info[irq].index;
118}
119
120static inline unsigned int type_from_irq(int irq)
121{
122 return irq_info[irq].type;
123}
124
125static inline unsigned long active_evtchns(unsigned int cpu,
126 struct shared_info *sh,
127 unsigned int idx)
128{
129 return (sh->evtchn_pending[idx] &
130 cpu_evtchn_mask[cpu][idx] &
131 ~sh->evtchn_mask[idx]);
132}
133
134static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
135{
136 int irq = evtchn_to_irq[chn];
137
138 BUG_ON(irq == -1);
139#ifdef CONFIG_SMP
140 irq_desc[irq].affinity = cpumask_of_cpu(cpu);
141#endif
142
143 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
144 __set_bit(chn, cpu_evtchn_mask[cpu]);
145
146 cpu_evtchn[chn] = cpu;
147}
148
149static void init_evtchn_cpu_bindings(void)
150{
151#ifdef CONFIG_SMP
152 int i;
153 /* By default all event channels notify CPU#0. */
154 for (i = 0; i < NR_IRQS; i++)
155 irq_desc[i].affinity = cpumask_of_cpu(0);
156#endif
157
158 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
159 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
160}
161
162static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
163{
164 return cpu_evtchn[evtchn];
165}
166
167static inline void clear_evtchn(int port)
168{
169 struct shared_info *s = HYPERVISOR_shared_info;
170 sync_clear_bit(port, &s->evtchn_pending[0]);
171}
172
173static inline void set_evtchn(int port)
174{
175 struct shared_info *s = HYPERVISOR_shared_info;
176 sync_set_bit(port, &s->evtchn_pending[0]);
177}
178
179
180/**
181 * notify_remote_via_irq - send event to remote end of event channel via irq
182 * @irq: irq of event channel to send event to
183 *
184 * Unlike notify_remote_via_evtchn(), this is safe to use across
185 * save/restore. Notifications on a broken connection are silently
186 * dropped.
187 */
188void notify_remote_via_irq(int irq)
189{
190 int evtchn = evtchn_from_irq(irq);
191
192 if (VALID_EVTCHN(evtchn))
193 notify_remote_via_evtchn(evtchn);
194}
195EXPORT_SYMBOL_GPL(notify_remote_via_irq);
196
197static void mask_evtchn(int port)
198{
199 struct shared_info *s = HYPERVISOR_shared_info;
200 sync_set_bit(port, &s->evtchn_mask[0]);
201}
202
203static void unmask_evtchn(int port)
204{
205 struct shared_info *s = HYPERVISOR_shared_info;
206 unsigned int cpu = get_cpu();
207
208 BUG_ON(!irqs_disabled());
209
210 /* Slow path (hypercall) if this is a non-local port. */
211 if (unlikely(cpu != cpu_from_evtchn(port))) {
212 struct evtchn_unmask unmask = { .port = port };
213 (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
214 } else {
215 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
216
217 sync_clear_bit(port, &s->evtchn_mask[0]);
218
219 /*
220 * The following is basically the equivalent of
221 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
222 * the interrupt edge' if the channel is masked.
223 */
224 if (sync_test_bit(port, &s->evtchn_pending[0]) &&
225 !sync_test_and_set_bit(port / BITS_PER_LONG,
226 &vcpu_info->evtchn_pending_sel))
227 vcpu_info->evtchn_upcall_pending = 1;
228 }
229
230 put_cpu();
231}
232
233static int find_unbound_irq(void)
234{
235 int irq;
236
237 /* Only allocate from dynirq range */
238 for (irq = 0; irq < NR_IRQS; irq++)
239 if (irq_bindcount[irq] == 0)
240 break;
241
242 if (irq == NR_IRQS)
243 panic("No available IRQ to bind to: increase NR_IRQS!\n");
244
245 return irq;
246}
247
248int bind_evtchn_to_irq(unsigned int evtchn)
249{
250 int irq;
251
252 spin_lock(&irq_mapping_update_lock);
253
254 irq = evtchn_to_irq[evtchn];
255
256 if (irq == -1) {
257 irq = find_unbound_irq();
258
259 dynamic_irq_init(irq);
260 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
261 handle_level_irq, "event");
262
263 evtchn_to_irq[evtchn] = irq;
264 irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
265 }
266
267 irq_bindcount[irq]++;
268
269 spin_unlock(&irq_mapping_update_lock);
270
271 return irq;
272}
273EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
274
275static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
276{
277 struct evtchn_bind_ipi bind_ipi;
278 int evtchn, irq;
279
280 spin_lock(&irq_mapping_update_lock);
281
282 irq = per_cpu(ipi_to_irq, cpu)[ipi];
283 if (irq == -1) {
284 irq = find_unbound_irq();
285 if (irq < 0)
286 goto out;
287
288 dynamic_irq_init(irq);
289 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
290 handle_level_irq, "ipi");
291
292 bind_ipi.vcpu = cpu;
293 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
294 &bind_ipi) != 0)
295 BUG();
296 evtchn = bind_ipi.port;
297
298 evtchn_to_irq[evtchn] = irq;
299 irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
300
301 per_cpu(ipi_to_irq, cpu)[ipi] = irq;
302
303 bind_evtchn_to_cpu(evtchn, cpu);
304 }
305
306 irq_bindcount[irq]++;
307
308 out:
309 spin_unlock(&irq_mapping_update_lock);
310 return irq;
311}
312
313
314static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
315{
316 struct evtchn_bind_virq bind_virq;
317 int evtchn, irq;
318
319 spin_lock(&irq_mapping_update_lock);
320
321 irq = per_cpu(virq_to_irq, cpu)[virq];
322
323 if (irq == -1) {
324 bind_virq.virq = virq;
325 bind_virq.vcpu = cpu;
326 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
327 &bind_virq) != 0)
328 BUG();
329 evtchn = bind_virq.port;
330
331 irq = find_unbound_irq();
332
333 dynamic_irq_init(irq);
334 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
335 handle_level_irq, "virq");
336
337 evtchn_to_irq[evtchn] = irq;
338 irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
339
340 per_cpu(virq_to_irq, cpu)[virq] = irq;
341
342 bind_evtchn_to_cpu(evtchn, cpu);
343 }
344
345 irq_bindcount[irq]++;
346
347 spin_unlock(&irq_mapping_update_lock);
348
349 return irq;
350}
351
352static void unbind_from_irq(unsigned int irq)
353{
354 struct evtchn_close close;
355 int evtchn = evtchn_from_irq(irq);
356
357 spin_lock(&irq_mapping_update_lock);
358
359 if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
360 close.port = evtchn;
361 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
362 BUG();
363
364 switch (type_from_irq(irq)) {
365 case IRQT_VIRQ:
366 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
367 [index_from_irq(irq)] = -1;
368 break;
369 default:
370 break;
371 }
372
373 /* Closed ports are implicitly re-bound to VCPU0. */
374 bind_evtchn_to_cpu(evtchn, 0);
375
376 evtchn_to_irq[evtchn] = -1;
377 irq_info[irq] = IRQ_UNBOUND;
378
379 dynamic_irq_init(irq);
380 }
381
382 spin_unlock(&irq_mapping_update_lock);
383}
384
385int bind_evtchn_to_irqhandler(unsigned int evtchn,
386 irqreturn_t (*handler)(int, void *),
387 unsigned long irqflags,
388 const char *devname, void *dev_id)
389{
390 unsigned int irq;
391 int retval;
392
393 irq = bind_evtchn_to_irq(evtchn);
394 retval = request_irq(irq, handler, irqflags, devname, dev_id);
395 if (retval != 0) {
396 unbind_from_irq(irq);
397 return retval;
398 }
399
400 return irq;
401}
402EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
403
404int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
405 irqreturn_t (*handler)(int, void *),
406 unsigned long irqflags, const char *devname, void *dev_id)
407{
408 unsigned int irq;
409 int retval;
410
411 irq = bind_virq_to_irq(virq, cpu);
412 retval = request_irq(irq, handler, irqflags, devname, dev_id);
413 if (retval != 0) {
414 unbind_from_irq(irq);
415 return retval;
416 }
417
418 return irq;
419}
420EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
421
422int bind_ipi_to_irqhandler(enum ipi_vector ipi,
423 unsigned int cpu,
424 irq_handler_t handler,
425 unsigned long irqflags,
426 const char *devname,
427 void *dev_id)
428{
429 int irq, retval;
430
431 irq = bind_ipi_to_irq(ipi, cpu);
432 if (irq < 0)
433 return irq;
434
435 retval = request_irq(irq, handler, irqflags, devname, dev_id);
436 if (retval != 0) {
437 unbind_from_irq(irq);
438 return retval;
439 }
440
441 return irq;
442}
443
444void unbind_from_irqhandler(unsigned int irq, void *dev_id)
445{
446 free_irq(irq, dev_id);
447 unbind_from_irq(irq);
448}
449EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
450
451void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
452{
453 int irq = per_cpu(ipi_to_irq, cpu)[vector];
454 BUG_ON(irq < 0);
455 notify_remote_via_irq(irq);
456}
457
458
459/*
460 * Search the CPU's pending event bitmasks. For each one found, map
461 * the event number to an irq, and feed it into do_IRQ() for
462 * handling.
463 *
464 * Xen uses a two-level bitmap to speed searching. The first level is
465 * a bitset of words which contain pending event bits. The second
466 * level is a bitset of pending events themselves.
467 */
468fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
469{
470 int cpu = get_cpu();
471 struct shared_info *s = HYPERVISOR_shared_info;
472 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
473 unsigned long pending_words;
474
475 vcpu_info->evtchn_upcall_pending = 0;
476
477 /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
478 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
479 while (pending_words != 0) {
480 unsigned long pending_bits;
481 int word_idx = __ffs(pending_words);
482 pending_words &= ~(1UL << word_idx);
483
484 while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
485 int bit_idx = __ffs(pending_bits);
486 int port = (word_idx * BITS_PER_LONG) + bit_idx;
487 int irq = evtchn_to_irq[port];
488
489 if (irq != -1) {
490 regs->orig_eax = ~irq;
491 do_IRQ(regs);
492 }
493 }
494 }
495
496 put_cpu();
497}
498
499/* Rebind an evtchn so that it gets delivered to a specific cpu */
500static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
501{
502 struct evtchn_bind_vcpu bind_vcpu;
503 int evtchn = evtchn_from_irq(irq);
504
505 if (!VALID_EVTCHN(evtchn))
506 return;
507
508 /* Send future instances of this interrupt to other vcpu. */
509 bind_vcpu.port = evtchn;
510 bind_vcpu.vcpu = tcpu;
511
512 /*
513 * If this fails, it usually just indicates that we're dealing with a
514 * virq or IPI channel, which don't actually need to be rebound. Ignore
515 * it, but don't do the xenlinux-level rebind in that case.
516 */
517 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
518 bind_evtchn_to_cpu(evtchn, tcpu);
519}
520
521
522static void set_affinity_irq(unsigned irq, cpumask_t dest)
523{
524 unsigned tcpu = first_cpu(dest);
525 rebind_irq_to_cpu(irq, tcpu);
526}
527
528static void enable_dynirq(unsigned int irq)
529{
530 int evtchn = evtchn_from_irq(irq);
531
532 if (VALID_EVTCHN(evtchn))
533 unmask_evtchn(evtchn);
534}
535
536static void disable_dynirq(unsigned int irq)
537{
538 int evtchn = evtchn_from_irq(irq);
539
540 if (VALID_EVTCHN(evtchn))
541 mask_evtchn(evtchn);
542}
543
544static void ack_dynirq(unsigned int irq)
545{
546 int evtchn = evtchn_from_irq(irq);
547
548 move_native_irq(irq);
549
550 if (VALID_EVTCHN(evtchn))
551 clear_evtchn(evtchn);
552}
553
554static int retrigger_dynirq(unsigned int irq)
555{
556 int evtchn = evtchn_from_irq(irq);
557 int ret = 0;
558
559 if (VALID_EVTCHN(evtchn)) {
560 set_evtchn(evtchn);
561 ret = 1;
562 }
563
564 return ret;
565}
566
567static struct irq_chip xen_dynamic_chip __read_mostly = {
568 .name = "xen-dyn",
569 .mask = disable_dynirq,
570 .unmask = enable_dynirq,
571 .ack = ack_dynirq,
572 .set_affinity = set_affinity_irq,
573 .retrigger = retrigger_dynirq,
574};
575
576void __init xen_init_IRQ(void)
577{
578 int i;
579
580 init_evtchn_cpu_bindings();
581
582 /* No event channels are 'live' right now. */
583 for (i = 0; i < NR_EVENT_CHANNELS; i++)
584 mask_evtchn(i);
585
586 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
587 for (i = 0; i < NR_IRQS; i++)
588 irq_bindcount[i] = 0;
589
590 irq_ctx_init(smp_processor_id());
591}
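
A rough sketch of how a frontend driver would consume the binding API exported above; the event-channel number, handler and device cookie here are hypothetical, and error handling is minimal:

#include <linux/interrupt.h>
#include <xen/events.h>

static irqreturn_t my_frontend_interrupt(int irq, void *dev_id)
{
	/* service the ring/queue associated with dev_id here */
	return IRQ_HANDLED;
}

static int my_frontend_connect(unsigned int evtchn, void *dev)
{
	int irq = bind_evtchn_to_irqhandler(evtchn, my_frontend_interrupt,
					    0, "my-frontend", dev);
	if (irq < 0)
		return irq;	/* request_irq() failure is propagated */

	/* later teardown: unbind_from_irqhandler(irq, dev); */
	return 0;
}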
diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c
new file mode 100644
index 000000000000..0707714e40d6
--- /dev/null
+++ b/arch/x86/xen/features.c
@@ -0,0 +1,29 @@
1/******************************************************************************
2 * features.c
3 *
4 * Xen feature flags.
5 *
6 * Copyright (c) 2006, Ian Campbell, XenSource Inc.
7 */
8#include <linux/types.h>
9#include <linux/cache.h>
10#include <linux/module.h>
11#include <asm/xen/hypervisor.h>
12#include <xen/features.h>
13
14u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
15EXPORT_SYMBOL_GPL(xen_features);
16
17void xen_setup_features(void)
18{
19 struct xen_feature_info fi;
20 int i, j;
21
22 for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
23 fi.submap_idx = i;
24 if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
25 break;
26 for (j = 0; j < 32; j++)
27 xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
28 }
29}
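
The submaps filled in here are consumed through a trivial accessor declared in <xen/features.h>; it is assumed to look roughly like this (shown only for context, not part of this file):

static inline int xen_feature(int flag)
{
	return xen_features[flag];
}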
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
new file mode 100644
index 000000000000..aa7af9e6abc0
--- /dev/null
+++ b/arch/x86/xen/manage.c
@@ -0,0 +1,143 @@
1/*
2 * Handle external requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
new file mode 100644
index 000000000000..874db0cd1d2a
--- /dev/null
+++ b/arch/x86/xen/mmu.c
@@ -0,0 +1,567 @@
1/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
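/*
 * As a rough sketch of the pfn<->mfn handling described above (using the
 * pfn_to_mfn/mfn_to_pfn and pte helpers from <xen/page.h>), building a
 * kernel pte and reading it back looks something like:
 *
 *	pte_t pte = mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL);
 *	...
 *	unsigned long back = mfn_to_pfn(pte_mfn(pte));	(== pfn again)
 */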
41#include <linux/sched.h>
42#include <linux/highmem.h>
43#include <linux/bug.h>
44#include <linux/sched.h>
45
46#include <asm/pgtable.h>
47#include <asm/tlbflush.h>
48#include <asm/mmu_context.h>
49#include <asm/paravirt.h>
50
51#include <asm/xen/hypercall.h>
52#include <asm/xen/hypervisor.h>
53
54#include <xen/page.h>
55#include <xen/interface/xen.h>
56
57#include "multicalls.h"
58#include "mmu.h"
59
60xmaddr_t arbitrary_virt_to_machine(unsigned long address)
61{
62 pte_t *pte = lookup_address(address);
63	unsigned offset = address & ~PAGE_MASK;
64
65 BUG_ON(pte == NULL);
66
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
68}
69
70void make_lowmem_page_readonly(void *vaddr)
71{
72 pte_t *pte, ptev;
73 unsigned long address = (unsigned long)vaddr;
74
75 pte = lookup_address(address);
76 BUG_ON(pte == NULL);
77
78 ptev = pte_wrprotect(*pte);
79
80 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
81 BUG();
82}
83
84void make_lowmem_page_readwrite(void *vaddr)
85{
86 pte_t *pte, ptev;
87 unsigned long address = (unsigned long)vaddr;
88
89 pte = lookup_address(address);
90 BUG_ON(pte == NULL);
91
92 ptev = pte_mkwrite(*pte);
93
94 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
95 BUG();
96}
97
98
99void xen_set_pmd(pmd_t *ptr, pmd_t val)
100{
101 struct multicall_space mcs;
102 struct mmu_update *u;
103
104 preempt_disable();
105
106 mcs = xen_mc_entry(sizeof(*u));
107 u = mcs.args;
108 u->ptr = virt_to_machine(ptr).maddr;
109 u->val = pmd_val_ma(val);
110 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
111
112 xen_mc_issue(PARAVIRT_LAZY_MMU);
113
114 preempt_enable();
115}
116
117/*
118 * Associate a virtual page frame with a given physical page frame
119 * and protection flags for that frame.
120 */
121void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = swapper_pg_dir + pgd_index(vaddr);
129 if (pgd_none(*pgd)) {
130 BUG();
131 return;
132 }
133 pud = pud_offset(pgd, vaddr);
134 if (pud_none(*pud)) {
135 BUG();
136 return;
137 }
138 pmd = pmd_offset(pud, vaddr);
139 if (pmd_none(*pmd)) {
140 BUG();
141 return;
142 }
143 pte = pte_offset_kernel(pmd, vaddr);
144 /* <mfn,flags> stored as-is, to permit clearing entries */
145 xen_set_pte(pte, mfn_pte(mfn, flags));
146
147 /*
148 * It's enough to flush this one mapping.
149 * (PGE mappings get flushed as well)
150 */
151 __flush_tlb_one(vaddr);
152}
153
154void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
155 pte_t *ptep, pte_t pteval)
156{
157 if (mm == current->mm || mm == &init_mm) {
158 if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
159 struct multicall_space mcs;
160 mcs = xen_mc_entry(0);
161
162 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
163 xen_mc_issue(PARAVIRT_LAZY_MMU);
164 return;
165 } else
166 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
167 return;
168 }
169 xen_set_pte(ptep, pteval);
170}
171
172#ifdef CONFIG_X86_PAE
173void xen_set_pud(pud_t *ptr, pud_t val)
174{
175 struct multicall_space mcs;
176 struct mmu_update *u;
177
178 preempt_disable();
179
180 mcs = xen_mc_entry(sizeof(*u));
181 u = mcs.args;
182 u->ptr = virt_to_machine(ptr).maddr;
183 u->val = pud_val_ma(val);
184 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
185
186 xen_mc_issue(PARAVIRT_LAZY_MMU);
187
188 preempt_enable();
189}
190
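/*
 * Ordering note (assuming set_pte is only used on entries that are not
 * currently live): the present bit lives in pte_low, so writing pte_high
 * first means the entry only becomes valid once both halves are in place.
 */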
191void xen_set_pte(pte_t *ptep, pte_t pte)
192{
193 ptep->pte_high = pte.pte_high;
194 smp_wmb();
195 ptep->pte_low = pte.pte_low;
196}
197
198void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
199{
200 set_64bit((u64 *)ptep, pte_val_ma(pte));
201}
202
203void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
204{
205 ptep->pte_low = 0;
206 smp_wmb(); /* make sure low gets written first */
207 ptep->pte_high = 0;
208}
209
210void xen_pmd_clear(pmd_t *pmdp)
211{
212 xen_set_pmd(pmdp, __pmd(0));
213}
214
215unsigned long long xen_pte_val(pte_t pte)
216{
217 unsigned long long ret = 0;
218
219 if (pte.pte_low) {
220 ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
221 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
222 }
223
224 return ret;
225}
226
227unsigned long long xen_pmd_val(pmd_t pmd)
228{
229 unsigned long long ret = pmd.pmd;
230 if (ret)
231 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
232 return ret;
233}
234
235unsigned long long xen_pgd_val(pgd_t pgd)
236{
237 unsigned long long ret = pgd.pgd;
238 if (ret)
239 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
240 return ret;
241}
242
243pte_t xen_make_pte(unsigned long long pte)
244{
245 if (pte & 1)
246 pte = phys_to_machine(XPADDR(pte)).maddr;
247
248 return (pte_t){ pte, pte >> 32 };
249}
250
251pmd_t xen_make_pmd(unsigned long long pmd)
252{
253 if (pmd & 1)
254 pmd = phys_to_machine(XPADDR(pmd)).maddr;
255
256 return (pmd_t){ pmd };
257}
258
259pgd_t xen_make_pgd(unsigned long long pgd)
260{
261 if (pgd & _PAGE_PRESENT)
262 pgd = phys_to_machine(XPADDR(pgd)).maddr;
263
264 return (pgd_t){ pgd };
265}
266#else /* !PAE */
267void xen_set_pte(pte_t *ptep, pte_t pte)
268{
269 *ptep = pte;
270}
271
272unsigned long xen_pte_val(pte_t pte)
273{
274 unsigned long ret = pte.pte_low;
275
276 if (ret & _PAGE_PRESENT)
277 ret = machine_to_phys(XMADDR(ret)).paddr;
278
279 return ret;
280}
281
282unsigned long xen_pgd_val(pgd_t pgd)
283{
284 unsigned long ret = pgd.pgd;
285 if (ret)
286 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
287 return ret;
288}
289
290pte_t xen_make_pte(unsigned long pte)
291{
292 if (pte & _PAGE_PRESENT)
293 pte = phys_to_machine(XPADDR(pte)).maddr;
294
295 return (pte_t){ pte };
296}
297
298pgd_t xen_make_pgd(unsigned long pgd)
299{
300 if (pgd & _PAGE_PRESENT)
301 pgd = phys_to_machine(XPADDR(pgd)).maddr;
302
303 return (pgd_t){ pgd };
304}
305#endif /* CONFIG_X86_PAE */
306
307
308
309/*
310 (Yet another) pagetable walker. This one is intended for pinning a
311 pagetable. This means that it walks a pagetable and calls the
312 callback function on each page it finds making up the page table,
313 at every level. It walks the entire pagetable, but it only bothers
314 pinning pte pages which are below pte_limit. In the normal case
315 this will be TASK_SIZE, but at boot we need to pin up to
316 FIXADDR_TOP. But the important bit is that we don't pin beyond
317 there, because then we start getting into Xen's ptes.
318*/
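/*
  For example, pinning a freshly-created pagetable (xen_pgd_pin below) is
  just
	pgd_walk(pgd, pin_page, TASK_SIZE);
  which queues a read-only remapping for each lowmem pagetable page it
  visits.
*/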
319static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
320 unsigned long limit)
321{
322 pgd_t *pgd = pgd_base;
323 int flush = 0;
324 unsigned long addr = 0;
325 unsigned long pgd_next;
326
327 BUG_ON(limit > FIXADDR_TOP);
328
329 if (xen_feature(XENFEAT_auto_translated_physmap))
330 return 0;
331
332 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
333 pud_t *pud;
334 unsigned long pud_limit, pud_next;
335
336 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
337
338 if (!pgd_val(*pgd))
339 continue;
340
341 pud = pud_offset(pgd, 0);
342
343 if (PTRS_PER_PUD > 1) /* not folded */
344 flush |= (*func)(virt_to_page(pud), 0);
345
346 for (; addr != pud_limit; pud++, addr = pud_next) {
347 pmd_t *pmd;
348 unsigned long pmd_limit;
349
350 pud_next = pud_addr_end(addr, pud_limit);
351
352 if (pud_next < limit)
353 pmd_limit = pud_next;
354 else
355 pmd_limit = limit;
356
357 if (pud_none(*pud))
358 continue;
359
360 pmd = pmd_offset(pud, 0);
361
362 if (PTRS_PER_PMD > 1) /* not folded */
363 flush |= (*func)(virt_to_page(pmd), 0);
364
365 for (; addr != pmd_limit; pmd++) {
366 addr += (PAGE_SIZE * PTRS_PER_PTE);
367 if ((pmd_limit-1) < (addr-1)) {
368 addr = pmd_limit;
369 break;
370 }
371
372 if (pmd_none(*pmd))
373 continue;
374
375 flush |= (*func)(pmd_page(*pmd), 0);
376 }
377 }
378 }
379
380 flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
381
382 return flush;
383}
384
385static int pin_page(struct page *page, unsigned flags)
386{
387 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
388 int flush;
389
390 if (pgfl)
391 flush = 0; /* already pinned */
392 else if (PageHighMem(page))
393 /* kmaps need flushing if we found an unpinned
394 highpage */
395 flush = 1;
396 else {
397 void *pt = lowmem_page_address(page);
398 unsigned long pfn = page_to_pfn(page);
399 struct multicall_space mcs = __xen_mc_entry(0);
400
401 flush = 0;
402
403 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
404 pfn_pte(pfn, PAGE_KERNEL_RO),
405 flags);
406 }
407
408 return flush;
409}
410
411/* This is called just after a mm has been created, but it has not
412 been used yet. We need to make sure that its pagetable is all
413 read-only, and can be pinned. */
414void xen_pgd_pin(pgd_t *pgd)
415{
416 struct multicall_space mcs;
417 struct mmuext_op *op;
418
419 xen_mc_batch();
420
421 if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
422 /* re-enable interrupts for kmap_flush_unused */
423 xen_mc_issue(0);
424 kmap_flush_unused();
425 xen_mc_batch();
426 }
427
428 mcs = __xen_mc_entry(sizeof(*op));
429 op = mcs.args;
430
431#ifdef CONFIG_X86_PAE
432 op->cmd = MMUEXT_PIN_L3_TABLE;
433#else
434 op->cmd = MMUEXT_PIN_L2_TABLE;
435#endif
436 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
437 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
438
439 xen_mc_issue(0);
440}
441
442/* The init_mm pagetable is really pinned as soon as it's created, but
443 that's before we have page structures to store the bits. So do all
444 the book-keeping now. */
445static __init int mark_pinned(struct page *page, unsigned flags)
446{
447 SetPagePinned(page);
448 return 0;
449}
450
451void __init xen_mark_init_mm_pinned(void)
452{
453 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
454}
455
456static int unpin_page(struct page *page, unsigned flags)
457{
458 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
459
460 if (pgfl && !PageHighMem(page)) {
461 void *pt = lowmem_page_address(page);
462 unsigned long pfn = page_to_pfn(page);
463 struct multicall_space mcs = __xen_mc_entry(0);
464
465 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
466 pfn_pte(pfn, PAGE_KERNEL),
467 flags);
468 }
469
470 return 0; /* never need to flush on unpin */
471}
472
473/* Release a pagetable's pages back as normal RW */
474static void xen_pgd_unpin(pgd_t *pgd)
475{
476 struct mmuext_op *op;
477 struct multicall_space mcs;
478
479 xen_mc_batch();
480
481 mcs = __xen_mc_entry(sizeof(*op));
482
483 op = mcs.args;
484 op->cmd = MMUEXT_UNPIN_TABLE;
485 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
486
487 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
488
489 pgd_walk(pgd, unpin_page, TASK_SIZE);
490
491 xen_mc_issue(0);
492}
493
494void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
495{
496 spin_lock(&next->page_table_lock);
497 xen_pgd_pin(next->pgd);
498 spin_unlock(&next->page_table_lock);
499}
500
501void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
502{
503 spin_lock(&mm->page_table_lock);
504 xen_pgd_pin(mm->pgd);
505 spin_unlock(&mm->page_table_lock);
506}
507
508
509#ifdef CONFIG_SMP
510/* Another cpu may still have its %cr3 pointing at the pagetable, so
511 we need to repoint it somewhere else before we can unpin it. */
512static void drop_other_mm_ref(void *info)
513{
514 struct mm_struct *mm = info;
515
516 if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
517 leave_mm(smp_processor_id());
518}
519
520static void drop_mm_ref(struct mm_struct *mm)
521{
522 if (current->active_mm == mm) {
523 if (current->mm == mm)
524 load_cr3(swapper_pg_dir);
525 else
526 leave_mm(smp_processor_id());
527 }
528
529 if (!cpus_empty(mm->cpu_vm_mask))
530 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
531 mm, 1);
532}
533#else
534static void drop_mm_ref(struct mm_struct *mm)
535{
536 if (current->active_mm == mm)
537 load_cr3(swapper_pg_dir);
538}
539#endif
540
541/*
542 * While a process runs, Xen pins its pagetables, which means that the
543 * hypervisor forces it to be read-only, and it controls all updates
544 * to it. This means that all pagetable updates have to go via the
545 * hypervisor, which is moderately expensive.
546 *
547 * Since we're pulling the pagetable down, we switch to use init_mm,
548 * unpin the old process's pagetable and mark it all read-write, which
549 * allows further operations on it to be simple memory accesses.
550 *
551 * The only subtle point is that another CPU may be still using the
552 * pagetable because of lazy tlb flushing. This means we need to
553 * switch all CPUs off this pagetable before we can unpin it.
554 */
555void xen_exit_mmap(struct mm_struct *mm)
556{
557 get_cpu(); /* make sure we don't move around */
558 drop_mm_ref(mm);
559 put_cpu();
560
561 spin_lock(&mm->page_table_lock);
562
563 /* pgd may not be pinned in the error exit path of execve */
564 if (PagePinned(virt_to_page(mm->pgd)))
565 xen_pgd_unpin(mm->pgd);
566 spin_unlock(&mm->page_table_lock);
567}
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
new file mode 100644
index 000000000000..c9ff27f3ac3a
--- /dev/null
+++ b/arch/x86/xen/mmu.h
@@ -0,0 +1,60 @@
1#ifndef _XEN_MMU_H
#define _XEN_MMU_H
2
3#include <linux/linkage.h>
4#include <asm/page.h>
5
6/*
7 * Page-directory addresses above 4GB do not fit into architectural %cr3.
8 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
9 * must use the following accessor macros to pack/unpack valid MFNs.
10 *
11 * Note that Xen is using the fact that the pagetable base is always
12 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
13 * of cr3.
14 */
15#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
16#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
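/*
 * Worked example: a pagetable base at mfn 0x123456 packs to
 *	xen_pfn_to_cr3(0x123456) == 0x23456000 | 0x001 == 0x23456001
 * and xen_cr3_to_pfn(0x23456001) == 0x23456 | 0x100000 == 0x123456.
 */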
17
18
19void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
20
21void xen_set_pte(pte_t *ptep, pte_t pteval);
22void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23 pte_t *ptep, pte_t pteval);
24void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
25
26void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
27void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
28void xen_exit_mmap(struct mm_struct *mm);
29
30void xen_pgd_pin(pgd_t *pgd);
31//void xen_pgd_unpin(pgd_t *pgd);
32
33#ifdef CONFIG_X86_PAE
34unsigned long long xen_pte_val(pte_t);
35unsigned long long xen_pmd_val(pmd_t);
36unsigned long long xen_pgd_val(pgd_t);
37
38pte_t xen_make_pte(unsigned long long);
39pmd_t xen_make_pmd(unsigned long long);
40pgd_t xen_make_pgd(unsigned long long);
41
42void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
43 pte_t *ptep, pte_t pteval);
44void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
45void xen_set_pud(pud_t *ptr, pud_t val);
46void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
47void xen_pmd_clear(pmd_t *pmdp);
48
49
50#else
51unsigned long xen_pte_val(pte_t);
52unsigned long xen_pmd_val(pmd_t);
53unsigned long xen_pgd_val(pgd_t);
54
55pte_t xen_make_pte(unsigned long);
56pmd_t xen_make_pmd(unsigned long);
57pgd_t xen_make_pgd(unsigned long);
58#endif
59
60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
new file mode 100644
index 000000000000..c837e8e463db
--- /dev/null
+++ b/arch/x86/xen/multicalls.c
@@ -0,0 +1,90 @@
1/*
2 * Xen hypercall batching.
3 *
4 * Xen allows multiple hypercalls to be issued at once, using the
5 * multicall interface. This allows the cost of trapping into the
6 * hypervisor to be amortized over several calls.
7 *
8 * This file implements a simple interface for multicalls. There's a
9 * per-cpu buffer of outstanding multicalls. When you want to queue a
10 * multicall for issuing, you can allocate a multicall slot for the
11 * call and its arguments, along with storage for any data which is
12 * pointed to by the arguments (for passing pointers to structures,
13 * etc). When the multicall is actually issued, all the space for the
14 * commands and allocated memory is freed for reuse.
15 *
16 * Multicalls are flushed whenever any of the buffers get full, or
17 * when explicitly requested. There's no way to get per-multicall
18 * return results back. It will BUG if any of the multicalls fail.
19 *
20 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
21 */
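/*
 * The usual calling pattern (see xen_set_pmd in mmu.c, for instance,
 * which runs with preemption disabled) is roughly:
 *
 *	mcs = xen_mc_entry(sizeof(*u));		(implies xen_mc_batch())
 *	u = mcs.args;
 *	u->ptr = ...; u->val = ...;
 *	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
 *	xen_mc_issue(PARAVIRT_LAZY_MMU);	(flushes unless in lazy mode)
 */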
22#include <linux/percpu.h>
23#include <linux/hardirq.h>
24
25#include <asm/xen/hypercall.h>
26
27#include "multicalls.h"
28
29#define MC_BATCH 32
30#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
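/* i.e. 32 calls * 16 bytes / sizeof(u64) == 64 u64 slots of argument space */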
31
32struct mc_buffer {
33 struct multicall_entry entries[MC_BATCH];
34 u64 args[MC_ARGS];
35 unsigned mcidx, argidx;
36};
37
38static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
39DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
40
41void xen_mc_flush(void)
42{
43 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
44 int ret = 0;
45 unsigned long flags;
46
47 BUG_ON(preemptible());
48
49 /* Disable interrupts in case someone comes in and queues
50 something in the middle */
51 local_irq_save(flags);
52
53 if (b->mcidx) {
54 int i;
55
56 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
57 BUG();
58 for (i = 0; i < b->mcidx; i++)
59 if (b->entries[i].result < 0)
60 ret++;
61 b->mcidx = 0;
62 b->argidx = 0;
63 } else
64 BUG_ON(b->argidx != 0);
65
66 local_irq_restore(flags);
67
68 BUG_ON(ret);
69}
70
71struct multicall_space __xen_mc_entry(size_t args)
72{
73 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
74 struct multicall_space ret;
75 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
76
77 BUG_ON(preemptible());
78 BUG_ON(argspace > MC_ARGS);
79
80 if (b->mcidx == MC_BATCH ||
81 (b->argidx + argspace) > MC_ARGS)
82 xen_mc_flush();
83
84 ret.mc = &b->entries[b->mcidx];
85 b->mcidx++;
86 ret.args = &b->args[b->argidx];
87 b->argidx += argspace;
88
89 return ret;
90}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
new file mode 100644
index 000000000000..e6f7530b156c
--- /dev/null
+++ b/arch/x86/xen/multicalls.h
@@ -0,0 +1,45 @@
1#ifndef _XEN_MULTICALLS_H
2#define _XEN_MULTICALLS_H
3
4#include "xen-ops.h"
5
6/* Multicalls */
7struct multicall_space
8{
9 struct multicall_entry *mc;
10 void *args;
11};
12
13/* Allocate room for a multicall and its args */
14struct multicall_space __xen_mc_entry(size_t args);
15
16DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
17
18/* Call to start a batch of multiple __xen_mc_entry()s. Must be
19 paired with xen_mc_issue() */
20static inline void xen_mc_batch(void)
21{
22 /* need to disable interrupts until this entry is complete */
23 local_irq_save(__get_cpu_var(xen_mc_irq_flags));
24}
25
26static inline struct multicall_space xen_mc_entry(size_t args)
27{
28 xen_mc_batch();
29 return __xen_mc_entry(args);
30}
31
32/* Flush all pending multicalls */
33void xen_mc_flush(void);
34
35/* Issue a multicall if we're not in a lazy mode */
36static inline void xen_mc_issue(unsigned mode)
37{
38 if ((xen_get_lazy_mode() & mode) == 0)
39 xen_mc_flush();
40
41 /* restore flags saved in xen_mc_batch */
42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
43}
44
45#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
new file mode 100644
index 000000000000..f84e77226646
--- /dev/null
+++ b/arch/x86/xen/setup.c
@@ -0,0 +1,111 @@
1/*
2 * Machine specific setup for xen
3 *
4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
5 */
6
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/pm.h>
11
12#include <asm/elf.h>
13#include <asm/e820.h>
14#include <asm/setup.h>
15#include <asm/xen/hypervisor.h>
16#include <asm/xen/hypercall.h>
17
18#include <xen/interface/physdev.h>
19#include <xen/features.h>
20
21#include "xen-ops.h"
22#include "vdso.h"
23
24/* These are code, but not functions. Defined in entry.S */
25extern const char xen_hypervisor_callback[];
26extern const char xen_failsafe_callback[];
27
28unsigned long *phys_to_machine_mapping;
29EXPORT_SYMBOL(phys_to_machine_mapping);
30
31/**
32 * machine_specific_memory_setup - Hook for machine specific memory setup.
33 **/
34
35char * __init xen_memory_setup(void)
36{
37 unsigned long max_pfn = xen_start_info->nr_pages;
38
39 e820.nr_map = 0;
40 add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
41
42 return "Xen";
43}
44
45static void xen_idle(void)
46{
47 local_irq_disable();
48
49 if (need_resched())
50 local_irq_enable();
51 else {
52 current_thread_info()->status &= ~TS_POLLING;
53 smp_mb__after_clear_bit();
54 safe_halt();
55 current_thread_info()->status |= TS_POLLING;
56 }
57}
58
59/*
60 * Set the bit indicating "nosegneg" library variants should be used.
61 */
62static void fiddle_vdso(void)
63{
64 extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */
65 extern char vsyscall_int80_start;
66 u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
67 &vsyscall_int80_start);
68 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
69}
70
71void __init xen_arch_setup(void)
72{
73 struct physdev_set_iopl set_iopl;
74 int rc;
75
76 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
77 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
78
79 if (!xen_feature(XENFEAT_auto_translated_physmap))
80 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
81
82 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
83 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
84
85 set_iopl.iopl = 1;
86 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
87 if (rc != 0)
88 printk(KERN_INFO "physdev_op failed %d\n", rc);
89
90#ifdef CONFIG_ACPI
91 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
92 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
93 disable_acpi();
94 }
95#endif
96
97 memcpy(boot_command_line, xen_start_info->cmd_line,
98 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
99 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
100
101 pm_idle = xen_idle;
102
103#ifdef CONFIG_SMP
104 /* fill cpus_possible with all available cpus */
105 xen_fill_possible_map();
106#endif
107
108 paravirt_disable_iospace();
109
110 fiddle_vdso();
111}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
new file mode 100644
index 000000000000..557b8e24706a
--- /dev/null
+++ b/arch/x86/xen/smp.c
@@ -0,0 +1,404 @@
1/*
2 * Xen SMP support
3 *
4 * This file implements the Xen versions of smp_ops. SMP under Xen is
5 * very straightforward. Bringing a CPU up is simply a matter of
6 * loading its initial context and setting it running.
7 *
8 * IPIs are handled through the Xen event mechanism.
9 *
10 * Because virtual CPUs can be scheduled onto any real CPU, there's no
11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */
17#include <linux/sched.h>
18#include <linux/err.h>
19#include <linux/smp.h>
20
21#include <asm/paravirt.h>
22#include <asm/desc.h>
23#include <asm/pgtable.h>
24#include <asm/cpu.h>
25
26#include <xen/interface/xen.h>
27#include <xen/interface/vcpu.h>
28
29#include <asm/xen/interface.h>
30#include <asm/xen/hypercall.h>
31
32#include <xen/page.h>
33#include <xen/events.h>
34
35#include "xen-ops.h"
36#include "mmu.h"
37
38static cpumask_t cpu_initialized_map;
39static DEFINE_PER_CPU(int, resched_irq);
40static DEFINE_PER_CPU(int, callfunc_irq);
41
42/*
43 * Structure and data for smp_call_function(). This is designed to minimise
44 * static memory requirements. It also looks cleaner.
45 */
46static DEFINE_SPINLOCK(call_lock);
47
48struct call_data_struct {
49 void (*func) (void *info);
50 void *info;
51 atomic_t started;
52 atomic_t finished;
53 int wait;
54};
55
56static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
57
58static struct call_data_struct *call_data;
59
60/*
61 * Reschedule call back. Nothing to do,
62 * all the work is done automatically when
63 * we return from the interrupt.
64 */
65static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
66{
67 return IRQ_HANDLED;
68}
69
70static __cpuinit void cpu_bringup_and_idle(void)
71{
72 int cpu = smp_processor_id();
73
74 cpu_init();
75
76 preempt_disable();
77 per_cpu(cpu_state, cpu) = CPU_ONLINE;
78
79 xen_setup_cpu_clockevents();
80
81 /* We can take interrupts now: we're officially "up". */
82 local_irq_enable();
83
84 wmb(); /* make sure everything is out */
85 cpu_idle();
86}
87
88static int xen_smp_intr_init(unsigned int cpu)
89{
90 int rc;
91 const char *resched_name, *callfunc_name;
92
93 per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
94
95 resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
96 rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
97 cpu,
98 xen_reschedule_interrupt,
99 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
100 resched_name,
101 NULL);
102 if (rc < 0)
103 goto fail;
104 per_cpu(resched_irq, cpu) = rc;
105
106 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
107 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
108 cpu,
109 xen_call_function_interrupt,
110 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
111 callfunc_name,
112 NULL);
113 if (rc < 0)
114 goto fail;
115 per_cpu(callfunc_irq, cpu) = rc;
116
117 return 0;
118
119 fail:
120 if (per_cpu(resched_irq, cpu) >= 0)
121 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
122 if (per_cpu(callfunc_irq, cpu) >= 0)
123 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
124 return rc;
125}
126
127void __init xen_fill_possible_map(void)
128{
129 int i, rc;
130
131 for (i = 0; i < NR_CPUS; i++) {
132 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
133 if (rc >= 0)
134 cpu_set(i, cpu_possible_map);
135 }
136}
137
138void __init xen_smp_prepare_boot_cpu(void)
139{
140 int cpu;
141
142 BUG_ON(smp_processor_id() != 0);
143 native_smp_prepare_boot_cpu();
144
145 /* We've switched to the "real" per-cpu gdt, so make sure the
146 old memory can be recycled */
147 make_lowmem_page_readwrite(&per_cpu__gdt_page);
148
149 for (cpu = 0; cpu < NR_CPUS; cpu++) {
150 cpus_clear(cpu_sibling_map[cpu]);
151 cpus_clear(cpu_core_map[cpu]);
152 }
153
154 xen_setup_vcpu_info_placement();
155}
156
157void __init xen_smp_prepare_cpus(unsigned int max_cpus)
158{
159 unsigned cpu;
160
161 for (cpu = 0; cpu < NR_CPUS; cpu++) {
162 cpus_clear(cpu_sibling_map[cpu]);
163 cpus_clear(cpu_core_map[cpu]);
164 }
165
166 smp_store_cpu_info(0);
167 set_cpu_sibling_map(0);
168
169 if (xen_smp_intr_init(0))
170 BUG();
171
172 cpu_initialized_map = cpumask_of_cpu(0);
173
174 /* Restrict the possible_map according to max_cpus. */
175 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
176 for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
177 continue;
178 cpu_clear(cpu, cpu_possible_map);
179 }
180
181 for_each_possible_cpu (cpu) {
182 struct task_struct *idle;
183
184 if (cpu == 0)
185 continue;
186
187 idle = fork_idle(cpu);
188 if (IS_ERR(idle))
189 panic("failed fork for CPU %d", cpu);
190
191 cpu_set(cpu, cpu_present_map);
192 }
193
194 //init_xenbus_allowed_cpumask();
195}
196
197static __cpuinit int
198cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
199{
200 struct vcpu_guest_context *ctxt;
201 struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
202
203 if (cpu_test_and_set(cpu, cpu_initialized_map))
204 return 0;
205
206 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
207 if (ctxt == NULL)
208 return -ENOMEM;
209
210 ctxt->flags = VGCF_IN_KERNEL;
211 ctxt->user_regs.ds = __USER_DS;
212 ctxt->user_regs.es = __USER_DS;
213 ctxt->user_regs.fs = __KERNEL_PERCPU;
214 ctxt->user_regs.gs = 0;
215 ctxt->user_regs.ss = __KERNEL_DS;
216 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
217 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
218
219 memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
220
221 xen_copy_trap_info(ctxt->trap_ctxt);
222
223 ctxt->ldt_ents = 0;
224
225 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
226 make_lowmem_page_readonly(gdt->gdt);
227
228 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
229 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
230
231 ctxt->user_regs.cs = __KERNEL_CS;
232 ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
233
234 ctxt->kernel_ss = __KERNEL_DS;
235 ctxt->kernel_sp = idle->thread.esp0;
236
237 ctxt->event_callback_cs = __KERNEL_CS;
238 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
239 ctxt->failsafe_callback_cs = __KERNEL_CS;
240 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
241
242 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
243 ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
244
245 if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
246 BUG();
247
248 kfree(ctxt);
249 return 0;
250}
251
252int __cpuinit xen_cpu_up(unsigned int cpu)
253{
254 struct task_struct *idle = idle_task(cpu);
255 int rc;
256
257#if 0
258 rc = cpu_up_check(cpu);
259 if (rc)
260 return rc;
261#endif
262
263 init_gdt(cpu);
264 per_cpu(current_task, cpu) = idle;
265 irq_ctx_init(cpu);
266 xen_setup_timer(cpu);
267
268 /* make sure interrupts start blocked */
269 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
270
271 rc = cpu_initialize_context(cpu, idle);
272 if (rc)
273 return rc;
274
275 if (num_online_cpus() == 1)
276 alternatives_smp_switch(1);
277
278 rc = xen_smp_intr_init(cpu);
279 if (rc)
280 return rc;
281
282 smp_store_cpu_info(cpu);
283 set_cpu_sibling_map(cpu);
284 /* This must be done before setting cpu_online_map */
285 wmb();
286
287 cpu_set(cpu, cpu_online_map);
288
289 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
290 BUG_ON(rc);
291
292 return 0;
293}
294
295void xen_smp_cpus_done(unsigned int max_cpus)
296{
297}
298
299static void stop_self(void *v)
300{
301 int cpu = smp_processor_id();
302
303 /* make sure we're not pinning something down */
304 load_cr3(swapper_pg_dir);
305 /* should set up a minimal gdt */
306
307 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
308 BUG();
309}
310
311void xen_smp_send_stop(void)
312{
313 smp_call_function(stop_self, NULL, 0, 0);
314}
315
316void xen_smp_send_reschedule(int cpu)
317{
318 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
319}
320
321
322static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
323{
324 unsigned cpu;
325
326 cpus_and(mask, mask, cpu_online_map);
327
328 for_each_cpu_mask(cpu, mask)
329 xen_send_IPI_one(cpu, vector);
330}
331
332static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
333{
334 void (*func) (void *info) = call_data->func;
335 void *info = call_data->info;
336 int wait = call_data->wait;
337
338 /*
339 * Notify initiating CPU that I've grabbed the data and am
340 * about to execute the function
341 */
342 mb();
343 atomic_inc(&call_data->started);
344 /*
345 * At this point the info structure may be out of scope unless wait==1
346 */
347 irq_enter();
348 (*func)(info);
349 irq_exit();
350
351 if (wait) {
352 mb(); /* commit everything before setting finished */
353 atomic_inc(&call_data->finished);
354 }
355
356 return IRQ_HANDLED;
357}
358
359int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
360 void *info, int wait)
361{
362 struct call_data_struct data;
363 int cpus;
364
365 /* Holding any lock stops cpus from going down. */
366 spin_lock(&call_lock);
367
368 cpu_clear(smp_processor_id(), mask);
369
370 cpus = cpus_weight(mask);
371 if (!cpus) {
372 spin_unlock(&call_lock);
373 return 0;
374 }
375
376 /* Can deadlock when called with interrupts disabled */
377 WARN_ON(irqs_disabled());
378
379 data.func = func;
380 data.info = info;
381 atomic_set(&data.started, 0);
382 data.wait = wait;
383 if (wait)
384 atomic_set(&data.finished, 0);
385
386 call_data = &data;
387 mb(); /* write everything before IPI */
388
389 /* Send a message to other CPUs and wait for them to respond */
390 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
391
392 /* Make sure other vcpus get a chance to run.
393 XXX too severe? Maybe we should check the other CPU's states? */
394 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
395
396 /* Wait for response */
397 while (atomic_read(&data.started) != cpus ||
398 (wait && atomic_read(&data.finished) != cpus))
399 cpu_relax();
400
401 spin_unlock(&call_lock);
402
403 return 0;
404}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
new file mode 100644
index 000000000000..dfd6db69ead5
--- /dev/null
+++ b/arch/x86/xen/time.c
@@ -0,0 +1,593 @@
1/*
2 * Xen time implementation.
3 *
4 * This is implemented in terms of a clocksource driver which uses
5 * the hypervisor clock as a nanosecond timebase, and a clockevent
6 * driver which uses the hypervisor's timer mechanism.
7 *
8 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
9 */
10#include <linux/kernel.h>
11#include <linux/interrupt.h>
12#include <linux/clocksource.h>
13#include <linux/clockchips.h>
14#include <linux/kernel_stat.h>
15
16#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h>
18
19#include <xen/events.h>
20#include <xen/interface/xen.h>
21#include <xen/interface/vcpu.h>
22
23#include "xen-ops.h"
24
25#define XEN_SHIFT 22
26
27/* Xen may fire a timer up to this many ns early */
28#define TIMER_SLOP 100000
29#define NS_PER_TICK (1000000000LL / HZ)
30
31static cycle_t xen_clocksource_read(void);
32
33/* These are periodically updated in shared_info, and then copied here. */
34struct shadow_time_info {
35 u64 tsc_timestamp; /* TSC at last update of time vals. */
36 u64 system_timestamp; /* Time, in nanosecs, since boot. */
37 u32 tsc_to_nsec_mul;
38 int tsc_shift;
39 u32 version;
40};
41
42static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
43
44/* runstate info updated by Xen */
45static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
46
47/* snapshots of runstate info */
48static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
49
50/* unused ns of stolen and blocked time */
51static DEFINE_PER_CPU(u64, residual_stolen);
52static DEFINE_PER_CPU(u64, residual_blocked);
53
54/* return a consistent snapshot of 64-bit time/counter value */
55static u64 get64(const u64 *p)
56{
57 u64 ret;
58
59 if (BITS_PER_LONG < 64) {
60 u32 *p32 = (u32 *)p;
61 u32 h, l;
62
63 /*
64 * Read high then low, and then make sure high is
65 * still the same; this will only loop if low wraps
66 * and carries into high.
67 * XXX some clean way to make this endian-proof?
68 */
69 do {
70 h = p32[1];
71 barrier();
72 l = p32[0];
73 barrier();
74 } while (p32[1] != h);
75
76 ret = (((u64)h) << 32) | l;
77 } else
78 ret = *p;
79
80 return ret;
81}
82
83/*
84 * Runstate accounting
85 */
86static void get_runstate_snapshot(struct vcpu_runstate_info *res)
87{
88 u64 state_time;
89 struct vcpu_runstate_info *state;
90
91 BUG_ON(preemptible());
92
93 state = &__get_cpu_var(runstate);
94
95 /*
96 * The runstate info is always updated by the hypervisor on
97 * the current CPU, so there's no need to use anything
98 * stronger than a compiler barrier when fetching it.
99 */
100 do {
101 state_time = get64(&state->state_entry_time);
102 barrier();
103 *res = *state;
104 barrier();
105 } while (get64(&state->state_entry_time) != state_time);
106}
107
108static void setup_runstate_info(int cpu)
109{
110 struct vcpu_register_runstate_memory_area area;
111
112 area.addr.v = &per_cpu(runstate, cpu);
113
114 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
115 cpu, &area))
116 BUG();
117}
118
119static void do_stolen_accounting(void)
120{
121 struct vcpu_runstate_info state;
122 struct vcpu_runstate_info *snap;
123 s64 blocked, runnable, offline, stolen;
124 cputime_t ticks;
125
126 get_runstate_snapshot(&state);
127
128 WARN_ON(state.state != RUNSTATE_running);
129
130 snap = &__get_cpu_var(runstate_snapshot);
131
132 /* work out how much time the VCPU has not been runn*ing* */
133 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
134 runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
135 offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
136
137 *snap = state;
138
139 /* Add the appropriate number of ticks of stolen time,
140 including any left-overs from last time. Passing NULL to
141 account_steal_time accounts the time as stolen. */
142 stolen = runnable + offline + __get_cpu_var(residual_stolen);
143
144 if (stolen < 0)
145 stolen = 0;
146
147 ticks = 0;
148 while (stolen >= NS_PER_TICK) {
149 ticks++;
150 stolen -= NS_PER_TICK;
151 }
152 __get_cpu_var(residual_stolen) = stolen;
153 account_steal_time(NULL, ticks);
154
155 /* Add the appropriate number of ticks of blocked time,
156 including any left-overs from last time. Passing idle to
157 account_steal_time accounts the time as idle/wait. */
158 blocked += __get_cpu_var(residual_blocked);
159
160 if (blocked < 0)
161 blocked = 0;
162
163 ticks = 0;
164 while (blocked >= NS_PER_TICK) {
165 ticks++;
166 blocked -= NS_PER_TICK;
167 }
168 __get_cpu_var(residual_blocked) = blocked;
169 account_steal_time(idle_task(smp_processor_id()), ticks);
170}
171
172/*
173 * Xen sched_clock implementation. Returns the number of unstolen
174 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
175 * states.
176 */
177unsigned long long xen_sched_clock(void)
178{
179 struct vcpu_runstate_info state;
180 cycle_t now;
181 u64 ret;
182 s64 offset;
183
184 /*
185 * Ideally sched_clock should be called on a per-cpu basis
186 * anyway, so preempt should already be disabled, but that's
187 * not current practice at the moment.
188 */
189 preempt_disable();
190
191 now = xen_clocksource_read();
192
193 get_runstate_snapshot(&state);
194
195 WARN_ON(state.state != RUNSTATE_running);
196
197 offset = now - state.state_entry_time;
198 if (offset < 0)
199 offset = 0;
200
201 ret = state.time[RUNSTATE_blocked] +
202 state.time[RUNSTATE_running] +
203 offset;
204
205 preempt_enable();
206
207 return ret;
208}
209
210
211/* Get the CPU speed from Xen */
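/*
 * Xen publishes the TSC->ns conversion as a 32-bit fraction plus a shift,
 * i.e. ns = tsc * tsc_to_system_mul * 2^tsc_shift >> 32, so the CPU rate
 * in kHz comes out (roughly) as 10^6 * 2^32 / tsc_to_system_mul, corrected
 * by tsc_shift below.
 */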
212unsigned long xen_cpu_khz(void)
213{
214 u64 cpu_khz = 1000000ULL << 32;
215 const struct vcpu_time_info *info =
216 &HYPERVISOR_shared_info->vcpu_info[0].time;
217
218 do_div(cpu_khz, info->tsc_to_system_mul);
219 if (info->tsc_shift < 0)
220 cpu_khz <<= -info->tsc_shift;
221 else
222 cpu_khz >>= info->tsc_shift;
223
224 return cpu_khz;
225}
226
227/*
228 * Reads a consistent set of time-base values from Xen, into a shadow data
229 * area.
230 */
231static unsigned get_time_values_from_xen(void)
232{
233 struct vcpu_time_info *src;
234 struct shadow_time_info *dst;
235
236 /* src is shared memory with the hypervisor, so we need to
237 make sure we get a consistent snapshot, even in the face of
238 being preempted. */
239 src = &__get_cpu_var(xen_vcpu)->time;
240 dst = &__get_cpu_var(shadow_time);
241
242 do {
243 dst->version = src->version;
244 rmb(); /* fetch version before data */
245 dst->tsc_timestamp = src->tsc_timestamp;
246 dst->system_timestamp = src->system_time;
247 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
248 dst->tsc_shift = src->tsc_shift;
249 rmb(); /* test version after fetching data */
250 } while ((src->version & 1) | (dst->version ^ src->version));
251
252 return dst->version;
253}
254
255/*
256 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
257 * yielding a 64-bit result.
258 */
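/*
 * Conceptually (ignoring the 32-bit overflow which the asm below avoids):
 *
 *	delta = (shift < 0) ? delta >> -shift : delta << shift;
 *	product = (delta * (u64)mul_frac) >> 32;
 *
 * i.e. mul_frac is a 0.32 fixed-point multiplier applied to the shifted
 * delta.
 */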
259static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
260{
261 u64 product;
262#ifdef __i386__
263 u32 tmp1, tmp2;
264#endif
265
266 if (shift < 0)
267 delta >>= -shift;
268 else
269 delta <<= shift;
270
271#ifdef __i386__
272 __asm__ (
273 "mul %5 ; "
274 "mov %4,%%eax ; "
275 "mov %%edx,%4 ; "
276 "mul %5 ; "
277 "xor %5,%5 ; "
278 "add %4,%%eax ; "
279 "adc %5,%%edx ; "
280 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
281 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
282#elif __x86_64__
283 __asm__ (
284 "mul %%rdx ; shrd $32,%%rdx,%%rax"
285 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
286#else
287#error implement me!
288#endif
289
290 return product;
291}
292
293static u64 get_nsec_offset(struct shadow_time_info *shadow)
294{
295 u64 now, delta;
296 now = native_read_tsc();
297 delta = now - shadow->tsc_timestamp;
298 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
299}
300
301static cycle_t xen_clocksource_read(void)
302{
303 struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
304 cycle_t ret;
305 unsigned version;
306
307 do {
308 version = get_time_values_from_xen();
309 barrier();
310 ret = shadow->system_timestamp + get_nsec_offset(shadow);
311 barrier();
312 } while (version != __get_cpu_var(xen_vcpu)->time.version);
313
314 put_cpu_var(shadow_time);
315
316 return ret;
317}
318
319static void xen_read_wallclock(struct timespec *ts)
320{
321 const struct shared_info *s = HYPERVISOR_shared_info;
322 u32 version;
323 u64 delta;
324 struct timespec now;
325
326 /* get wallclock at system boot */
327 do {
328 version = s->wc_version;
329 rmb(); /* fetch version before time */
330 now.tv_sec = s->wc_sec;
331 now.tv_nsec = s->wc_nsec;
332 rmb(); /* fetch time before checking version */
333 } while ((s->wc_version & 1) | (version ^ s->wc_version));
334
335 delta = xen_clocksource_read(); /* time since system boot */
336 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
337
338 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
339 now.tv_sec = delta;
340
341 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
342}
343
344unsigned long xen_get_wallclock(void)
345{
346 struct timespec ts;
347
348 xen_read_wallclock(&ts);
349
350 return ts.tv_sec;
351}
352
353int xen_set_wallclock(unsigned long now)
354{
355 /* do nothing for domU */
356 return -1;
357}
358
359static struct clocksource xen_clocksource __read_mostly = {
360 .name = "xen",
361 .rating = 400,
362 .read = xen_clocksource_read,
363 .mask = ~0,
364 .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
365 .shift = XEN_SHIFT,
366 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
367};
368
369/*
370 Xen clockevent implementation
371
372 Xen has two clockevent implementations:
373
374 The old timer_op one works with all released versions of Xen prior
375 to version 3.0.4. This version of the hypervisor provides a
376   single-shot timer with nanosecond resolution. However, a 100Hz tick
377   sharing the same event channel is also delivered while the vcpu is
378   running. We don't care about or use this tick, but it will
379 cause the core time code to think the timer fired too soon, and
380 will end up resetting it each time. It could be filtered, but
381 doing so has complications when the ktime clocksource is not yet
382 the xen clocksource (ie, at boot time).
383
384 The new vcpu_op-based timer interface allows the tick timer period
385 to be changed or turned off. The tick timer is not useful as a
386 periodic timer because events are only delivered to running vcpus.
387 The one-shot timer can report when a timeout is in the past, so
388 set_next_event is capable of returning -ETIME when appropriate.
389 This interface is used when available.
390*/
391
392
393/*
394 Get a hypervisor absolute time. In theory we could maintain an
395 offset between the kernel's time and the hypervisor's time, and
396 apply that to a kernel's absolute timeout. Unfortunately the
397 hypervisor and kernel times can drift even if the kernel is using
398 the Xen clocksource, because ntp can warp the kernel's clocksource.
399*/
400static s64 get_abs_timeout(unsigned long delta)
401{
402 return xen_clocksource_read() + delta;
403}
404
405static void xen_timerop_set_mode(enum clock_event_mode mode,
406 struct clock_event_device *evt)
407{
408 switch (mode) {
409 case CLOCK_EVT_MODE_PERIODIC:
410 /* unsupported */
411 WARN_ON(1);
412 break;
413
414 case CLOCK_EVT_MODE_ONESHOT:
415 case CLOCK_EVT_MODE_RESUME:
416 break;
417
418 case CLOCK_EVT_MODE_UNUSED:
419 case CLOCK_EVT_MODE_SHUTDOWN:
420 HYPERVISOR_set_timer_op(0); /* cancel timeout */
421 break;
422 }
423}
424
425static int xen_timerop_set_next_event(unsigned long delta,
426 struct clock_event_device *evt)
427{
428 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
429
430 if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
431 BUG();
432
433 /* We may have missed the deadline, but there's no real way of
434 knowing for sure. If the event was in the past, then we'll
435 get an immediate interrupt. */
436
437 return 0;
438}
439
440static const struct clock_event_device xen_timerop_clockevent = {
441 .name = "xen",
442 .features = CLOCK_EVT_FEAT_ONESHOT,
443
444 .max_delta_ns = 0xffffffff,
445 .min_delta_ns = TIMER_SLOP,
446
447 .mult = 1,
448 .shift = 0,
449 .rating = 500,
450
451 .set_mode = xen_timerop_set_mode,
452 .set_next_event = xen_timerop_set_next_event,
453};
454
455
456
457static void xen_vcpuop_set_mode(enum clock_event_mode mode,
458 struct clock_event_device *evt)
459{
460 int cpu = smp_processor_id();
461
462 switch (mode) {
463 case CLOCK_EVT_MODE_PERIODIC:
464 WARN_ON(1); /* unsupported */
465 break;
466
467 case CLOCK_EVT_MODE_ONESHOT:
468 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
469 BUG();
470 break;
471
472 case CLOCK_EVT_MODE_UNUSED:
473 case CLOCK_EVT_MODE_SHUTDOWN:
474 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
475 HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
476 BUG();
477 break;
478 case CLOCK_EVT_MODE_RESUME:
479 break;
480 }
481}
482
483static int xen_vcpuop_set_next_event(unsigned long delta,
484 struct clock_event_device *evt)
485{
486 int cpu = smp_processor_id();
487 struct vcpu_set_singleshot_timer single;
488 int ret;
489
490 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
491
492 single.timeout_abs_ns = get_abs_timeout(delta);
493 single.flags = VCPU_SSHOTTMR_future;
494
495 ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
496
497 BUG_ON(ret != 0 && ret != -ETIME);
498
499 return ret;
500}
501
502static const struct clock_event_device xen_vcpuop_clockevent = {
503 .name = "xen",
504 .features = CLOCK_EVT_FEAT_ONESHOT,
505
506 .max_delta_ns = 0xffffffff,
507 .min_delta_ns = TIMER_SLOP,
508
509 .mult = 1,
510 .shift = 0,
511 .rating = 500,
512
513 .set_mode = xen_vcpuop_set_mode,
514 .set_next_event = xen_vcpuop_set_next_event,
515};
516
517static const struct clock_event_device *xen_clockevent =
518 &xen_timerop_clockevent;
519static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
520
521static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
522{
523 struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
524 irqreturn_t ret;
525
526 ret = IRQ_NONE;
527 if (evt->event_handler) {
528 evt->event_handler(evt);
529 ret = IRQ_HANDLED;
530 }
531
532 do_stolen_accounting();
533
534 return ret;
535}
536
537void xen_setup_timer(int cpu)
538{
539 const char *name;
540 struct clock_event_device *evt;
541 int irq;
542
543 printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
544
545 name = kasprintf(GFP_KERNEL, "timer%d", cpu);
546 if (!name)
547 name = "<timer kasprintf failed>";
548
549 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
550 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
551 name, NULL);
552
553 evt = &per_cpu(xen_clock_events, cpu);
554 memcpy(evt, xen_clockevent, sizeof(*evt));
555
556 evt->cpumask = cpumask_of_cpu(cpu);
557 evt->irq = irq;
558
559 setup_runstate_info(cpu);
560}
561
562void xen_setup_cpu_clockevents(void)
563{
564 BUG_ON(preemptible());
565
566 clockevents_register_device(&__get_cpu_var(xen_clock_events));
567}
568
569__init void xen_time_init(void)
570{
571 int cpu = smp_processor_id();
572
573 get_time_values_from_xen();
574
575 clocksource_register(&xen_clocksource);
576
577 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
578 /* Successfully turned off 100Hz tick, so we have the
579 vcpuop-based timer interface */
580 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
581 xen_clockevent = &xen_vcpuop_clockevent;
582 }
583
584 /* Set initial system time with full resolution */
585 xen_read_wallclock(&xtime);
586 set_normalized_timespec(&wall_to_monotonic,
587 -xtime.tv_sec, -xtime.tv_nsec);
588
589 tsc_disable = 0;
590
591 xen_setup_timer(cpu);
592 xen_setup_cpu_clockevents();
593}
diff --git a/arch/x86/xen/vdso.h b/arch/x86/xen/vdso.h
new file mode 100644
index 000000000000..861fedfe5230
--- /dev/null
+++ b/arch/x86/xen/vdso.h
@@ -0,0 +1,4 @@
1/* Bit used for the pseudo-hwcap for non-negative segments. We use
2 bit 1 to avoid bugs in some versions of glibc when bit 0 is
3 used; the choice is otherwise arbitrary. */
4#define VDSO_NOTE_NONEGSEG_BIT 1
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000000..1a43b60c0c62
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,291 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h>
20#include <asm/segment.h>
21
22#include <xen/interface/xen.h>
23
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32   event status with a single 'and' operation. If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36 /* Clear mask and test pending */
37 andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
38 /* Preempt here doesn't matter because that will deal with
39 any pending interrupts. The pending check may end up being
40 run on the wrong CPU, but that doesn't hurt. */
41 jz 1f
422: call check_events
431:
44ENDPATCH(xen_irq_enable_direct)
45 ret
46 ENDPROC(xen_irq_enable_direct)
47 RELOC(xen_irq_enable_direct, 2b+1)
48
49
50/*
51 Disabling events is simply a matter of making the event mask
52 non-zero.
53 */
54ENTRY(xen_irq_disable_direct)
55 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
56ENDPATCH(xen_irq_disable_direct)
57 ret
58 ENDPROC(xen_irq_disable_direct)
59 RELOC(xen_irq_disable_direct, 0)
60
61/*
62 (xen_)save_fl is used to get the current interrupt enable status.
63 Callers expect the status to be in X86_EFLAGS_IF, and other bits
64 may be set in the return value. We take advantage of this by
65 making sure that X86_EFLAGS_IF has the right value (and other bits
66 in that byte are 0), but other bits in the return value are
67 undefined. We need to toggle the state of the bit, because
68 Xen and x86 use opposite senses (mask vs enable).
69 */
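/*
   Worked example: with events enabled the mask byte is 0, so testb sets
   ZF, setz makes %ah = 1 and addb %ah,%ah doubles it to 2 -- bit 9 of
   %eax, ie X86_EFLAGS_IF. With events masked, %ah ends up 0.
 */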
70ENTRY(xen_save_fl_direct)
71 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
72 setz %ah
73 addb %ah,%ah
74ENDPATCH(xen_save_fl_direct)
75 ret
76 ENDPROC(xen_save_fl_direct)
77 RELOC(xen_save_fl_direct, 0)
78
79
80/*
81 In principle the caller should be passing us a value return
82 from xen_save_fl_direct, but for robustness sake we test only
83 the X86_EFLAGS_IF flag rather than the whole byte. After
84 setting the interrupt mask state, it checks for unmasked
85 pending events and enters the hypervisor to get them delivered
86 if so.
87 */
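/*
   Note (relying on the layout of struct vcpu_info): the pending byte is
   the low byte and the mask byte the high byte of the word tested below,
   so cmpw $0x0001 matches exactly "pending and no longer masked".
 */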
88ENTRY(xen_restore_fl_direct)
89 testb $X86_EFLAGS_IF>>8, %ah
90 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
91 /* Preempt here doesn't matter because that will deal with
92 any pending interrupts. The pending check may end up being
93 run on the wrong CPU, but that doesn't hurt. */
94
95 /* check for unmasked and pending */
96 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
97 jz 1f
982: call check_events
991:
100ENDPATCH(xen_restore_fl_direct)
101 ret
102 ENDPROC(xen_restore_fl_direct)
103 RELOC(xen_restore_fl_direct, 2b+1)
104
105/*
106 This is run where a normal iret would be run, with the same stack setup:
107 8: eflags
108 4: cs
109 esp-> 0: eip
110
111 This attempts to make sure that any pending events are dealt
112 with on return to usermode, but there is a small window in
113 which an event can happen just before entering usermode. If
114 the nested interrupt ends up setting one of the TIF_WORK_MASK
115 pending work flags, they will not be tested again before
116 returning to usermode. This means that a process can end up
117 with pending work, which will be unprocessed until the process
118 enters and leaves the kernel again, which could be an
119 unbounded amount of time. This means that a pending signal or
120 reschedule event could be indefinitely delayed.
121
122 The fix is to notice a nested interrupt in the critical
123 window, and if one occurs, then fold the nested interrupt into
124 the current interrupt stack frame, and re-process it
125 iteratively rather than recursively. This means that it will
126 exit via the normal path, and all pending work will be dealt
127 with appropriately.
128
129 Because the nested interrupt handler needs to deal with the
130   current stack state in whatever form it's in, we keep things
131 simple by only using a single register which is pushed/popped
132 on the stack.
133
134 Non-direct iret could be done in the same way, but it would
135 require an annoying amount of code duplication. We'll assume
136 that direct mode will be the common case once the hypervisor
137 support becomes commonplace.
138 */
139ENTRY(xen_iret_direct)
140 /* test eflags for special cases */
141 testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
142 jnz hyper_iret
143
144 push %eax
145 ESP_OFFSET=4 # bytes pushed onto stack
146
147 /* Store vcpu_info pointer for easy access. Do it this
148 way to avoid having to reload %fs */
149#ifdef CONFIG_SMP
150 GET_THREAD_INFO(%eax)
151 movl TI_cpu(%eax),%eax
152 movl __per_cpu_offset(,%eax,4),%eax
153 lea per_cpu__xen_vcpu_info(%eax),%eax
154#else
155 movl $per_cpu__xen_vcpu_info, %eax
156#endif
157
158 /* check IF state we're restoring */
159 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
160
161 /* Maybe enable events. Once this happens we could get a
162 recursive event, so the critical region starts immediately
163 afterwards. However, if that happens we don't end up
164 resuming the code, so we don't have to be worried about
165 being preempted to another CPU. */
166 setz XEN_vcpu_info_mask(%eax)
167xen_iret_start_crit:
168
169 /* check for unmasked and pending */
170 cmpw $0x0001, XEN_vcpu_info_pending(%eax)
171
172 /* If there's something pending, mask events again so we
173 can jump back into xen_hypervisor_callback */
174 sete XEN_vcpu_info_mask(%eax)
175
176 popl %eax
177
178 /* From this point on the registers are restored and the stack
179 updated, so we don't need to worry about it if we're preempted */
180iret_restore_end:
181
182 /* Jump to hypervisor_callback after fixing up the stack.
183 Events are masked, so jumping out of the critical
184 region is OK. */
185 je xen_hypervisor_callback
186
187 iret
188xen_iret_end_crit:
189
190hyper_iret:
191	/* put this out of line since it's very rarely used */
192 jmp hypercall_page + __HYPERVISOR_iret * 32
193
194 .globl xen_iret_start_crit, xen_iret_end_crit
195
196/*
197 This is called by xen_hypervisor_callback in entry.S when it sees
198 that the EIP at the time of interrupt was between xen_iret_start_crit
199 and xen_iret_end_crit. We're passed the EIP in %eax so we can do
200 a more refined determination of what to do.
201
202 The stack format at this point is:
203 ----------------
204 ss : (ss/esp may be present if we came from usermode)
205 esp :
206 eflags } outer exception info
207 cs }
208 eip }
209 ---------------- <- edi (copy dest)
210 eax : outer eax if it hasn't been restored
211 ----------------
212 eflags } nested exception info
213 cs } (no ss/esp because we're nested
214 eip } from the same ring)
215 orig_eax }<- esi (copy src)
216 - - - - - - - -
217 fs }
218 es }
219 ds } SAVE_ALL state
220 eax }
221 : :
222 ebx }
223 ----------------
224 return addr <- esp
225 ----------------
226
227 In order to deliver the nested exception properly, we need to shift
228 everything from the return addr up to the error code so it
229 sits just under the outer exception info. This means that when we
230 handle the exception, we do it in the context of the outer exception
231 rather than starting a new one.
232
233 The only caveat is that if the outer eax hasn't been
234 restored yet (i.e., it's still on the stack), we need to insert
235 its value into the SAVE_ALL state before going on, since
236 it's usermode state which we eventually need to restore.
237 */
238ENTRY(xen_iret_crit_fixup)
239 /* offsets +4 for return address */
240
241 /*
242 Paranoia: Make sure we're really coming from kernel space.
243 One could imagine a case where userspace jumps into the
244 critical range address, but just before the CPU delivers a GP,
245 it decides to deliver an interrupt instead. Unlikely?
246 Definitely. Easy to avoid? Yes. The Intel documents
247 explicitly say that the reported EIP for a bad jump is the
248 jump instruction itself, not the destination, but some virtual
249 environments get this wrong.
250 */
251 movl PT_CS+4(%esp), %ecx
252 andl $SEGMENT_RPL_MASK, %ecx
253 cmpl $USER_RPL, %ecx
254 je 2f
255
256 lea PT_ORIG_EAX+4(%esp), %esi
257 lea PT_EFLAGS+4(%esp), %edi
258
259 /* If eip is before iret_restore_end then the stack
260 hasn't been restored yet. */
261 cmp $iret_restore_end, %eax
262 jae 1f
263
264 movl 0+4(%edi),%eax /* copy EAX */
265 movl %eax, PT_EAX+4(%esp)
266
267 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
268
269 /* set up the copy */
2701: std
271 mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
272 rep movsl
273 cld
274
275 lea 4(%edi),%esp /* point esp to new frame */
2762: ret
277
278
279/*
280 Force an event check by making a hypercall,
281 but preserve regs before making the call.
282 */
283check_events:
284 push %eax
285 push %ecx
286 push %edx
287 call force_evtchn_callback
288 pop %edx
289 pop %ecx
290 pop %eax
291 ret
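
As a rough orientation for the assembly above, here is a C sketch of the
decision xen_iret_direct makes around its critical window; the struct, the
helper name and the spelled-out EFLAGS.IF constant below are illustrative
assumptions, not the kernel's actual definitions:

    /* Illustrative sketch only: approximates the logic of xen_iret_direct
     * around its critical region.  The field names mirror the Xen
     * vcpu_info ABI, but this struct and helper are simplified assumptions. */
    struct vcpu_info_sketch {
    	unsigned char evtchn_upcall_pending;	/* an event is waiting */
    	unsigned char evtchn_upcall_mask;	/* 1 = event delivery disabled */
    };

    static void xen_iret_sketch(struct vcpu_info_sketch *vcpu,
    			    unsigned long restored_eflags)
    {
    	/* Re-enable event delivery only if the frame being returned to
    	 * had interrupts enabled (EFLAGS.IF set). */
    	vcpu->evtchn_upcall_mask = !(restored_eflags & (1UL << 9) /* IF */);

    	/* ---- critical region: an event arriving between here and the
    	 * completed iret must not be handled recursively. */

    	if (!vcpu->evtchn_upcall_mask && vcpu->evtchn_upcall_pending) {
    		/* Pending and unmasked: mask events again and divert to
    		 * xen_hypervisor_callback instead of completing the iret. */
    		vcpu->evtchn_upcall_mask = 1;
    	} else {
    		/* Nothing pending: plain iret back to the interrupted code. */
    	}
    }

The real routine does the mask update and the pending check with single
setz/sete instructions so that the window between enabling events and the
iret stays as small as possible.
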
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
new file mode 100644
index 000000000000..f8d6937db2ec
--- /dev/null
+++ b/arch/x86/xen/xen-head.S
@@ -0,0 +1,38 @@
1/* Xen-specific pieces of head.S, intended to be included in the right
2 place in head.S */
3
4#ifdef CONFIG_XEN
5
6#include <linux/elfnote.h>
7#include <asm/boot.h>
8#include <xen/interface/elfnote.h>
9
10.pushsection .init.text
11ENTRY(startup_xen)
12 movl %esi,xen_start_info
13 cld
14 movl $(init_thread_union+THREAD_SIZE),%esp
15 jmp xen_start_kernel
16.popsection
17
18.pushsection .bss.page_aligned
19 .align PAGE_SIZE_asm
20ENTRY(hypercall_page)
21 .skip 0x1000
22.popsection
23
24 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
25 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
26 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
27 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
28 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
29 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
30 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
31#ifdef CONFIG_X86_PAE
32 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
33#else
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
35#endif
36 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
37
38#endif /*CONFIG_XEN */
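
The ELFNOTE() entries above become standard ELF note records in the kernel
image; the Xen domain builder reads them to learn the guest's virtual base,
entry point, hypercall page and feature flags.  As a hedged illustration of
that on-disk format, the sketch below walks a note buffer using the standard
Elf32_Nhdr layout from <elf.h>; the buffer handling is an assumption for
illustration, not code from this patch:

    /* Illustrative sketch: walk ELF note records such as those emitted by
     * the ELFNOTE() macros above.  Each record is an Elf32_Nhdr followed by
     * the 4-byte-padded name ("Xen") and the 4-byte-padded descriptor. */
    #include <elf.h>
    #include <stdio.h>
    #include <string.h>

    #define ALIGN4(x) (((x) + 3) & ~3u)

    static void walk_notes(const unsigned char *buf, size_t len)
    {
    	size_t off = 0;

    	while (off + sizeof(Elf32_Nhdr) <= len) {
    		const Elf32_Nhdr *nh = (const Elf32_Nhdr *)(buf + off);
    		const char *name = (const char *)(nh + 1);

    		if (nh->n_namesz == 4 && memcmp(name, "Xen", 4) == 0)
    			printf("Xen note type %u, %u byte payload\n",
    			       nh->n_type, nh->n_descsz);

    		off += sizeof(*nh) + ALIGN4(nh->n_namesz) + ALIGN4(nh->n_descsz);
    	}
    }

Running something like readelf -n on the built vmlinux should list the same
notes in human-readable form.
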
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
new file mode 100644
index 000000000000..b9aaea45f07f
--- /dev/null
+++ b/arch/x86/xen/xen-ops.h
@@ -0,0 +1,71 @@
1#ifndef XEN_OPS_H
2#define XEN_OPS_H
3
4#include <linux/init.h>
5
6/* These are code, but not functions. Defined in entry.S */
7extern const char xen_hypervisor_callback[];
8extern const char xen_failsafe_callback[];
9
10void xen_copy_trap_info(struct trap_info *traps);
11
12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
13DECLARE_PER_CPU(unsigned long, xen_cr3);
14
15extern struct start_info *xen_start_info;
16extern struct shared_info *HYPERVISOR_shared_info;
17
18char * __init xen_memory_setup(void);
19void __init xen_arch_setup(void);
20void __init xen_init_IRQ(void);
21
22void xen_setup_timer(int cpu);
23void xen_setup_cpu_clockevents(void);
24unsigned long xen_cpu_khz(void);
25void __init xen_time_init(void);
26unsigned long xen_get_wallclock(void);
27int xen_set_wallclock(unsigned long time);
28unsigned long long xen_sched_clock(void);
29
30void xen_mark_init_mm_pinned(void);
31
32DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
33
34static inline unsigned xen_get_lazy_mode(void)
35{
36 return x86_read_percpu(xen_lazy_mode);
37}
38
39void __init xen_fill_possible_map(void);
40
41void __init xen_setup_vcpu_info_placement(void);
42void xen_smp_prepare_boot_cpu(void);
43void xen_smp_prepare_cpus(unsigned int max_cpus);
44int xen_cpu_up(unsigned int cpu);
45void xen_smp_cpus_done(unsigned int max_cpus);
46
47void xen_smp_send_stop(void);
48void xen_smp_send_reschedule(int cpu);
49int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
50 int wait);
51int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
52 int nonatomic, int wait);
53
54int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
55 void *info, int wait);
56
57
58/* Declare an asm function, along with symbols needed to make it
59 inlineable */
60#define DECL_ASM(ret, name, ...) \
61 ret name(__VA_ARGS__); \
62 extern char name##_end[]; \
63 extern char name##_reloc[] \
64
65DECL_ASM(void, xen_irq_enable_direct, void);
66DECL_ASM(void, xen_irq_disable_direct, void);
67DECL_ASM(unsigned long, xen_save_fl_direct, void);
68DECL_ASM(void, xen_restore_fl_direct, unsigned long);
69
70void xen_iret_direct(void);
71#endif /* XEN_OPS_H */
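
For reference, DECL_ASM() above simply emits a prototype plus two marker
symbols.  A use such as DECL_ASM(void, xen_irq_enable_direct, void); expands
to roughly the following; the comments on how the _end/_reloc symbols are
used are an interpretation of the "make it inlineable" remark, not something
stated in this header:

    /* Approximate preprocessor expansion of
     * DECL_ASM(void, xen_irq_enable_direct, void); */
    void xen_irq_enable_direct(void);
    extern char xen_irq_enable_direct_end[];	/* end of the asm routine (interpretation) */
    extern char xen_irq_enable_direct_reloc[];	/* relocation point inside it (interpretation) */

The _end and _reloc symbols are defined by the assembly implementation, which
lets C code measure the routine and fix up any embedded reference when the
body is copied inline.
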